Back to index

wims  3.65+svn20090927
dicsort.c
Go to the documentation of this file.
00001 /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
00002  *
00003  *  This program is free software; you can redistribute it and/or modify
00004  *  it under the terms of the GNU General Public License as published by
00005  *  the Free Software Foundation; either version 2 of the License, or
00006  *  (at your option) any later version.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program; if not, write to the Free Software
00015  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00016  */
00017 
00018        /* Sort dictionary */
00019 
00020 /*************** Customization: change values hereafter ****************/
00021 
00022        /* limit of dictionary entries */
#define entrylim (512*1024)
       /* limit of dictionary length, in bytes.
        * Both limits are parenthesized so that they expand as a single
        * value inside any expression (e.g. x/entrylim). */
#define diclim       (32*1024*1024)
       /* sepchar: character separating an entry from its replacement.
        * grpchar: grouping character used on output; 0 means no grouping. */
char sepchar=':', grpchar=0;
00028 
00029 /***************** Nothing should need change hereafter *****************/
00030 
00031 #include "../wims.h"
00032 
00033 char inpbuf[MAX_LINELEN+1], outbuf[2*MAX_LINELEN+2];
00034 char *dicbuf;
00035 char dicname[1024], suffixname[1024];
00036 
00037 struct entry {
00038     char *original;
00039     char *replace;
00040 } entry[entrylim];
00041 int entrycount;
00042 
00043 int nocase=0, hassuffix=0, leaveline=0;
00044 int entrycount, ocount;
00045 
/* malloc() wrapper that never returns NULL: when the allocation fails the
 * process is terminated with exit status 1. */
void *xmalloc(size_t n)
{
    void *mem = malloc(n);

    if(mem != NULL) return mem;
    exit(1);
}
00053 
00054        /* Points to the end of the word */
00055 char *find_word_end(char *p)
00056 {
00057     int i;
00058     for(i=0;!isspace(*p) && *p!=0 && i<MAX_LINELEN; p++,i++);
00059     return p;
00060 }
00061 
00062        /* Strips leading spaces */
00063 char *find_word_start(char *p)
00064 {
00065     int i;
00066     for(i=0; isspace(*p) && i<MAX_LINELEN; p++,i++);
00067     return p;
00068 }
00069 
00070        /* strip trailing spaces; return string end. */
/* Remove trailing whitespace from p in place by overwriting it with NULs.
 *
 * Returns a pointer to the last character that remains in the string.
 * When nothing remains (p was empty or consisted only of whitespace) the
 * return value is p itself, which then points at the terminating NUL.
 * The scan works on a "one past the end" pointer so that no pointer
 * before the start of the buffer is ever formed. */
char *strip_trailing_spaces(char *p)
{
    char *pp;
    if(*p==0) return p;
    pp=p+strlen(p);    /* one past the last character */
    /* <ctype.h> classifiers need an unsigned char value, hence the cast. */
    while(pp>p && isspace((unsigned char)pp[-1])) *--pp=0;
    if(pp>p) pp--;     /* step back onto the last remaining character */
    return pp;
}
00078 
00079 int compare(const void *s1, const void *s2)
00080 {
00081     const struct entry *p1, *p2;
00082     p1=s1; p2=s2;
00083     if(nocase) return strcasecmp(p1->original,p2->original);
00084     else return strcmp(p1->original,p2->original);
00085 }
00086 
00087 void sortdic(void)
00088 {
00089     qsort(entry,entrycount,sizeof(entry[0]),compare);
00090 }
00091 
00092        /* modify a string. Buffer length must be at least MAX_LINELEN */
00093 void string_modify(char *start, char *bad_beg, char *bad_end, char *good,...)
00094 {
00095     char buf[MAX_LINELEN+1];
00096     va_list vp;
00097     
00098     va_start(vp,good);
00099     vsnprintf(buf,sizeof(buf),good,vp); va_end(vp);
00100     if(strlen(start)-(bad_end-bad_beg)+strlen(buf)>=MAX_LINELEN)
00101       return; /* this is an error situation. */
00102     strcat(buf,bad_end);
00103     strcpy(bad_beg,buf);
00104 }
00105 
00106        /* change all spaces into ' ', and collapse multiple occurrences */
00107 void singlespace(char *p)
00108 {
00109     char *pp, *p2;
00110     for(pp=p;*pp;pp++) {
00111        if(!isspace(*pp)) continue;
00112        if(leaveline) {
00113            if(*pp==13) strcpy(pp,pp+1);
00114            if(*pp=='\n') {
00115               pp++;
00116               gopt: for(p2=pp; isspace(*p2) && *p2!='\n'; p2++);
00117               if(p2>pp) strcpy(pp,p2); pp--;
00118            }
00119            else {
00120               pp++; if(!isspace(*pp) || *pp=='\n') continue;
00121               goto gopt;
00122            }
00123        }
00124        else {
00125            if(*pp!=' ') *pp=' ';
00126            pp++; if(!isspace(*pp)) continue;
00127            for(p2=pp;isspace(*p2);p2++);
00128            strcpy(pp,p2); pp--;
00129        }
00130     }
00131 }
00132 
00133        /* Prepare dictionary */
00134 void prepare_dic(void)
00135 {
00136     int i;
00137     FILE *dicf;
00138     char *p1, *p2, *pp;
00139     long int flen;
00140 
00141     entrycount=0;
00142     dicf=fopen(dicname,"r"); if(dicf==NULL) return;
00143     fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET);
00144     if(flen>diclim) return;
00145     dicbuf=xmalloc(2*flen+1024);flen=fread(dicbuf,1,flen,dicf);
00146     fclose(dicf);
00147     if(flen>0 && flen<diclim) dicbuf[flen]=0;
00148     else return;
00149     for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) {
00150        p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
00151        pp=strchr(p1,sepchar); if(pp==NULL) continue;
00152        *pp++=0;
00153        strip_trailing_spaces(p1); strip_trailing_spaces(pp);
00154        singlespace(p1);
00155        p1=find_word_start(p1); pp=find_word_start(pp);
00156        if(*p1==0) continue;
00157        entry[i].original=p1; entry[i].replace=pp; i++;
00158     }
00159     entrycount=i;
00160 }
00161 
00162 #include "suffix.c"
00163 
00164 void output(void)
00165 {
00166     int i;
00167     FILE *f;
00168 
00169     ocount=0;
00170     strcat(dicname,".sorted");
00171     f=fopen(dicname,"w"); if(f==NULL) return;
00172     for(i=0;i<entrycount;i++) {
00173        if(i>0 && strcmp(entry[i].original,entry[i-1].original)==0
00174           && strcmp(entry[i].replace,entry[i-1].replace)==0)
00175          continue;
00176        if(grpchar!=0) {
00177            if(i>0 && strcmp(entry[i].original,entry[i-1].original)==0)
00178              fprintf(f,"%c%s",grpchar, entry[i].replace);
00179            else {
00180               if(i>0) fprintf(f,"\n");
00181               fprintf(f,"%s%c%s",entry[i].original,sepchar,entry[i].replace);
00182               ocount++;
00183            }
00184            
00185        }
00186        else {
00187            fprintf(f,"%s%c%s\n",entry[i].original,sepchar,entry[i].replace);
00188            ocount++;
00189        }
00190     }
00191     if(grpchar!=0) fprintf(f,"\n");
00192     fclose(f);
00193 }
00194 
00195 int main(int argc, char *argv[])
00196 {
00197     char *ss, *gr;
00198     if(argc<2) return -1;
00199     
00200     ss=getenv("dicsort_separator");
00201     if(ss!=NULL && *ss!=0) sepchar=*ss;
00202     gr=getenv("dicsort_grouping");
00203     if(gr!=NULL && *gr!=0) grpchar=*gr;
00204     snprintf(dicname,sizeof(dicname)-128,"%s",argv[1]); prepare_dic();
00205     if(argc>2) {
00206        snprintf(suffixname,sizeof(suffixname),"%s",argv[2]);
00207        suffix_dic(suffixname); hassuffix=1;
00208     }
00209     else suffixname[0]=hassuffix=0;
00210     sortdic(); output();
00211     printf("%s: sorted %d entries.\n",dicname, ocount);
00212     return 0;
00213 }
00214