Back to index

wims  3.65+svn20090927
translator_.c
Go to the documentation of this file.
00001 /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
00002  *
00003  *  This program is free software; you can redistribute it and/or modify
00004  *  it under the terms of the GNU General Public License as published by
00005  *  the Free Software Foundation; either version 2 of the License, or
00006  *  (at your option) any later version.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program; if not, write to the Free Software
00015  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00016  */
00017 
00018        /* Versatile translation according to a dictionary */
00019 
00020 /*************** Customization: change values hereafter ****************/
00021 
/* Maximum number of dictionary entries (size of the entry[] table). */
#define entrylim 32768
/* Maximum accepted length of the dictionary file, in bytes.
 * Parenthesized so the macro expands safely inside any expression
 * (e.g. x / diclim or x % diclim). */
#define diclim (1024*1024)
00026 
00027 /***************** Nothing should need change hereafter *****************/
00028 
00029 char inpbuf[MAX_LINELEN+1], outbuf[2*MAX_LINELEN+2];
00030 char *dicbuf;
00031 struct entry {
00032     unsigned char *original, *replace;
00033     int olen,earlier;
00034 } entry[entrylim];
00035 int entrycount;
00036 
00037 enum {
00038     unk_delete, unk_leave, unk_replace
00039 };
00040 
00041 int has_digits=0;
00042 int unknown_type=unk_delete;
00043 int nocase=0,leaveline=0;
00044 char *unknown, unkbuf[1024];
00045 
00046 int compare(int i1, const char *s2)
00047 {
00048     int k;
00049     if(nocase) k=strncasecmp(entry[i1].original,s2,entry[i1].olen);
00050     else k=strncmp(entry[i1].original,s2,entry[i1].olen);
00051     if(k==0 && (isalnum(*(s2+entry[i1].olen)) || (*(s2+entry[i1].olen)&128)!=0)) return -1;
00052     else return k;
00053 }
00054 
00055        /* searches a list. Returns index if found, -1 if nomatch. 
00056         * Uses binary search, list must be sorted. */
00057 int search_list(struct entry *list, int items, size_t item_size, const char *str)
00058 {
00059     int i1,i2,j,k,t,t1;
00060     unsigned char c;
00061 
00062     if(items<=0) return -1;
00063     j=0; c=str[0];
00064     k=list[0].original[0]-c; if(k==0) k=compare(0,str);
00065     if(k==0) goto more; if(k>0) return -1;
00066     j=items-1; k=list[j].original[0]-c; if(k==0) k=compare(j,str);
00067     if(k==0) return j;
00068     if(k>0) for(i1=0,i2=j;i2>i1+1;) {
00069        j=i1+(i2-i1)/2;
00070        k=list[j].original[0]-c; if(k==0) k=compare(j,str);
00071        if(k==0) goto more;
00072        if(k>0) {i2=j; continue;}
00073        if(k<0) {i1=j; continue;}   
00074     }
00075     if(k>0) {j--;k=compare(j,str);}
00076     more:
00077     if((t=list[j].earlier)<0) {
00078        if(k==0) return j; else return -1;
00079     }
00080     if(compare(t,str)!=0) return -1;
00081     for(j=t1=t,k=0;j<items && list[j].earlier==t1 && (k=compare(j,str))<=0; j++)
00082       if(k==0) t=j;
00083     return t;
00084 }
00085 
00086        /* change all spaces into ' ', and collapse multiple occurences */
00087 void singlespace(char *p)
00088 {
00089     char *pp, *p2;
00090     for(pp=p;*pp;pp++) {
00091        if(!isspace(*pp)) continue;
00092        if(leaveline) {
00093            if(*pp==13) strcpy(pp,pp+1);
00094            if(*pp=='\n') {
00095               pp++;
00096               gopt: for(p2=pp; isspace(*p2) && *p2!='\n'; p2++);
00097               if(p2>pp) strcpy(pp,p2); pp--;
00098            }
00099            else {
00100               pp++; if(!isspace(*pp) || *pp=='\n') continue;
00101               goto gopt;
00102            }
00103        }
00104        else {
00105            if(*pp!=' ') *pp=' ';
00106            if(!isspace(*(pp+1))) continue;
00107            for(pp++,p2=pp;isspace(*p2);p2++);
00108            strcpy(pp,p2); pp--;
00109        }
00110     }
00111 }
00112 
00113 #include "suffix.c"
00114 
00115        /* Prepare dictionary */
00116 void prepare_dic(char *fname)
00117 {
00118     int i,l;
00119     FILE *dicf;
00120     char *p1, *p2, *pp;
00121     long int flen;
00122     
00123     entrycount=0;
00124     dicf=fopen(fname,"r"); if(dicf==NULL) return;
00125     fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET);
00126     if(flen>diclim) return;
00127     dicbuf=xmalloc(flen+16);flen=fread(dicbuf,1,flen,dicf);
00128     fclose(dicf);
00129     if(flen>0 && flen<diclim) dicbuf[flen]=0;
00130     else return;
00131     for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) {
00132        p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
00133        pp=strchr(p1,':'); if(pp==NULL) continue;
00134        *pp++=0;
00135        strip_trailing_spaces(p1); strip_trailing_spaces(pp);
00136        singlespace(p1);
00137        p1=find_word_start(p1); pp=find_word_start(pp);
00138        if(*p1==0) continue;
00139        if(has_digits==0) {
00140            char *p;
00141            for(p=p1;*p!=0 && p<pp && !isdigit(*p);p++);
00142            if(isdigit(*p)) has_digits=1;
00143        }
00144        entry[i].original=p1; entry[i].replace=pp; 
00145        entry[i].olen=l=strlen(p1); entry[i].earlier=-1;
00146        if(i>0) {
00147            int l1,l2;
00148            l1=entry[i-1].earlier; if(l1>=0) l2=entry[l1].olen;
00149            else {l2=entry[i-1].olen;l1=i-1;}
00150            if(l>l2 && isspace(p1[l2])
00151               && strncmp(entry[l1].original,p1,l2)==0) 
00152              entry[i].earlier=entry[i-1].earlier=l1;
00153        }
00154        i++;
00155     }
00156     entrycount=i;
00157 }
00158 
00159        /* now make the translation. */
00160 void translate(char *p)
00161 {
00162     char *p1, *p2, *pp;
00163     int t;
00164 
00165     if(entrycount<=0 && suffixcnt<=0) return;
00166     snprintf(outbuf,sizeof(outbuf),"%s",p);
00167     for(p1=find_word_start(outbuf);
00168        p1!=NULL && p1-outbuf<MAX_LINELEN && *p1!=0;
00169        p1=p2) {
00170        p2=find_word_end(p1);
00171        for(pp=p1;pp<p2 && 
00172            ((!has_digits && isalpha(*pp)) ||
00173             (has_digits && isalnum(*pp)) || (*pp&128)!=0 ||
00174             strchr("_",*pp)!=NULL);pp++);
00175        p2=find_word_start(p2);
00176        if(pp==p1 || 
00177           (has_digits==0 && isdigit(*pp)) || 
00178           (*pp!=0 && !isspace(*pp) && strchr(",.?!/;",*pp)==NULL)) continue;
00179        t=search_list(entry,entrycount,sizeof(entry[0]),p1);
00180        if(t<0) {
00181            switch(unknown_type) {
00182               case unk_leave: break;
00183               case unk_delete: {
00184                   strcpy(p1,find_word_start(pp)); p2=p1;
00185                   break;
00186               }
00187               case unk_replace: {
00188                   string_modify(outbuf,p1,pp,unkbuf);
00189                   p2=find_word_start(p1+strlen(unkbuf));
00190               }
00191            }
00192            continue;
00193        }
00194        string_modify(outbuf,p1,p1+strlen(entry[t].original),
00195                     entry[t].replace);
00196        p2=find_word_start(p1+strlen(entry[t].replace));
00197     }
00198     snprintf(p,MAX_LINELEN,"%s",outbuf);
00199 }
00200