Back to index

wims  3.65+svn20090927
huffman.c
Go to the documentation of this file.
00001 /*    Copyright (C) 2002-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
00002  *
00003  *  This program is free software; you can redistribute it and/or modify
00004  *  it under the terms of the GNU General Public License as published by
00005  *  the Free Software Foundation; either version 2 of the License, or
00006  *  (at your option) any later version.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program; if not, write to the Free Software
00015  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00016  */
00017 
00018        /* This program computes an optimal coding of variable lengths
00019         * on a given distribution of probabilities,
00020         * using Huffman algorithm */
00021 
00022 /* Input data: via two environment variables.
00023  * wims_exec_parm is a comma-separated list of probability distributions.
00024  *     Limited to MAX_ITEMS.
00025  *     The input data will be scaled to unit sum.
00026  * w_huffman_radix is the encoding radix, between 2 and MAX_RADIX.
00027  * 
00028  * Output: two lines.
00029  * Line 1: Entropy and Average code length, comma-separated.
00030  * Line 2: comma-separated list of codes.
00031  */
00032 
00033 #define MAX_ITEMS    2048
00034 #define MAX_CODELEN  100
00035 
00036 const char *codechar="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
00037 #define MAX_RADIX    strlen(codechar)
00038 
00039 #include "../Lib/libwims.h"
00040 
00041 struct {
00042     double prob;
00043     int ind;
00044     unsigned char code[MAX_CODELEN];
00045     int codelen;
00046 } maintab[MAX_ITEMS*2];
00047 int itemcnt, origcnt;
00048 
00049 int indtab[MAX_ITEMS];
00050 int indcnt;
00051 int radix;
00052 double entropy, avelen;
00053 
00054 void error(char *msg)
00055 {
00056     fprintf(stderr,"%s\n",msg);
00057     printf("ERROR\n");
00058     exit(1);
00059 }
00060 
00061 int indcmp(const void *p1, const void *p2)
00062 {
00063     const int *i1, *i2;
00064     double d1, d2;
00065     
00066     i1=p1; i2=p2;
00067     d1=maintab[*i1].prob; d2=maintab[*i2].prob;
00068     if(d1==d2) return 0;
00069     if(d1>d2) return -1;
00070     else return 1;
00071 }
00072 
00073 void huffman(void)
00074 {
00075     int t, i, j, l;
00076     double d;
00077     
00078     while(indcnt>radix) {
00079        qsort(indtab,indcnt,sizeof(indtab[0]),indcmp);
00080        if(radix>2) t=(indcnt+radix-3)%(radix-1)+2;
00081        else t=2;
00082        d=0;
00083        for(i=indcnt-t; i<indcnt; i++) {
00084            d+=maintab[indtab[i]].prob;
00085            maintab[indtab[i]].ind=itemcnt;
00086        }
00087        maintab[itemcnt].prob=d;
00088        maintab[itemcnt].ind=-1;
00089        maintab[itemcnt].codelen=-1;
00090        indtab[indcnt-t]=itemcnt;
00091        indcnt-=t-1; itemcnt++;
00092     }
00093     for(i=0;i<indcnt;i++) {
00094        maintab[indtab[i]].codelen=1;
00095        maintab[indtab[i]].code[0]=i;
00096        maintab[indtab[i]].ind=0;
00097     }
00098     for(i=itemcnt-1;i>=0;i--) {
00099        if(maintab[i].codelen>0) continue;
00100        j=maintab[i].ind; l=maintab[j].codelen;
00101        if(l>=MAX_CODELEN) error("Code too long.");
00102        memmove(maintab[i].code,maintab[j].code,l);
00103        maintab[i].code[l]=maintab[j].ind++;
00104        maintab[i].codelen=l+1;
00105        maintab[i].ind=0;
00106     }
00107 }
00108 
00109 void output(void)
00110 {
00111     int i, j;
00112     double d;
00113 
00114     d=0;
00115     for(i=0;i<origcnt;i++) d+=maintab[i].prob*maintab[i].codelen;
00116     printf("%f,%f\n",entropy,d);
00117     for(i=0;i<origcnt;i++) {
00118        for(j=0;j<maintab[i].codelen;j++) printf("%c",codechar[(int) maintab[i].code[j]]);
00119        if(i<origcnt-1) printf(",");
00120        else printf("\n");
00121     }
00122 }
00123 
00124 void getparm(char *p)
00125 {
00126     int i;
00127     char *p1, *p2;
00128     double d1, dt;
00129 
00130     origcnt=0; dt=0;
00131     for(p1=find_word_start(p);
00132        *p1; p1=find_word_start(p2)) {
00133        for(p2=p1; *p2 && strchr(",;",*p2)==NULL; p2++);
00134        if(*p2) *p2++=0;
00135        d1=strevalue(p1);
00136        if(!finite(d1) || d1<0) {
00137            char buf[256];
00138            snprintf(buf,sizeof(buf),"Bad data: %s",p1);
00139            error(buf);
00140        }
00141        maintab[origcnt++].prob=d1;
00142        dt+=d1;
00143     }
00144     if(dt*1000000<1) error("Empty data sum.");
00145     if(origcnt<2) error("Insufficient data for encoding.");
00146     itemcnt=origcnt;
00147     if(dt!=1) for(i=0; i<origcnt; i++) maintab[i].prob/=dt;
00148     
00149     entropy=0;
00150     for(i=0;i<origcnt;i++) {
00151        maintab[i].codelen=-1; maintab[i].ind=-1;
00152        indtab[i]=i;
00153        d1=maintab[i].prob;
00154        if(d1>0) entropy-=d1*log(d1);
00155     }
00156     entropy=entropy/log(radix);
00157     indcnt=origcnt;
00158 }
00159 
00160 int main()
00161 {
00162     char *p;
00163     int r;
00164     
00165     error1=error; error2=error; error3=error;
00166     p=getenv("w_huffman_radix");
00167     if(p==NULL || *p==0) p=getenv("huffman_radix");
00168     if(p==NULL || *p==0) radix=2;
00169     else {
00170        r=atoi(p); if(r!=0) radix=r;
00171     }
00172     if(radix<2 || radix>MAX_RADIX) error("Bad radix.");
00173     
00174     p=getenv("wims_exec_parm");
00175     if(p==NULL || *p==0) error("No input data.");
00176     getparm(p);
00177     
00178     huffman();
00179     output();
00180     
00181     return 0;
00182 }
00183