Back to index

wims  3.65+svn20090927
translate.c
Go to the documentation of this file.
00001 /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
00002  *
00003  *  This program is free software; you can redistribute it and/or modify
00004  *  it under the terms of the GNU General Public License as published by
00005  *  the Free Software Foundation; either version 2 of the License, or
00006  *  (at your option) any later version.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program; if not, write to the Free Software
00015  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00016  */
00017 
00018        /* Versatile translation according to a dictionary */
00019 
00020 char inpbuf[MAX_LINELEN+1], troutbuf[2*MAX_LINELEN+2];
00021 struct entry {
00022     unsigned char *original, *replace;
00023     int olen,earlier;
00024 } entry[MAX_DICENTRIES];
00025 int entrycount=0;
00026 
00027 struct dic {
00028     char name[MAX_FNAME+1];
00029     char unknown[256];
00030     char *buf;
00031     int unknown_type;
00032     int start;
00033     int len;
00034 } dic[MAX_DICS];
00035 int diccnt;
00036 int transdic, macrodic;
00037 
/* What _translate() does with a word that is not in the dictionary. */
enum {
    unk_delete,     /* remove the word from the text */
    unk_leave,      /* keep the word unchanged */
    unk_replace     /* substitute the dictionary's `unknown' text */
};
00041 
00042 int compare(struct entry *e, const char *s2)
00043 {
00044     int k;
00045     k=strncmp(e->original,s2,e->olen);
00046     if(k==0 && isalnum(*(s2+e->olen))) return -1;
00047     else return k;
00048 }
00049 
00050        /* searches a list. Returns index if found, -1 if nomatch. 
00051         * Uses binary search, list must be sorted. */
00052 int search_dic(struct entry *list, int items, size_t item_size, const char *str)
00053 {
00054     int i1,i2,j,k,t,t1;
00055     unsigned char c;
00056 
00057     if(items<=0) return -1;
00058     j=0; c=str[0];
00059     k=list[0].original[0]-c; if(k==0) k=compare(list,str);
00060     if(k==0) goto more; if(k>0) return -1;
00061     j=items-1; k=list[j].original[0]-c; if(k==0) k=compare(list+j,str);
00062     if(k==0) return j;
00063     if(k>0) for(i1=0,i2=j;i2>i1+1;) {
00064        j=i1+(i2-i1)/2;
00065        k=list[j].original[0]-c; if(k==0) k=compare(list+j,str);
00066        if(k==0) goto more;
00067        if(k>0) {i2=j; continue;}
00068        if(k<0) {i1=j; continue;}   
00069     }
00070     if(k>0) {j--;k=compare(list+j,str);}
00071     more:
00072     if((t=list[j].earlier)<0) {
00073        if(k==0) return j; else return -1;
00074     }
00075     if(compare(entry+t,str)!=0) return -1;
00076     for(j=t1=t,k=0;j<items+(list-entry) && entry[j].earlier==t1 && (k=compare(entry+j,str))<=0; j++) 
00077       if(k==0) t=j;
00078     return t-(list-entry);
00079 }
00080 
00081 #include "suffix.c"
00082 
00083        /* Prepare dictionary */
00084 struct dic *prepare_dic(char *fname)
00085 {
00086     int i,l;
00087     struct dic *thisdic;
00088     FILE *dicf;
00089     char *p1, *p2, *pp;
00090     char tbuf[256], buf[MAX_LINELEN+1];
00091     long int flen;
00092     
00093     if(diccnt>=MAX_DICS) error("too_many_dictionaries");
00094     thisdic=dic+diccnt; diccnt++;
00095     thisdic->len=0;
00096     thisdic->start=entrycount;
00097     snprintf(thisdic->name,sizeof(thisdic->name),"%s",fname);
00098     dicf=fopen(mkfname(NULL,"%s/%s",styledir,fname),"r"); if(dicf==NULL) return NULL;
00099     fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET);
00100     if(flen>=MAX_DICSIZE) return NULL;
00101     thisdic->buf=xmalloc(flen+16);flen=fread(thisdic->buf,1,flen,dicf);
00102     fclose(dicf);
00103     if(flen>0 && flen<MAX_DICSIZE) thisdic->buf[flen]=0;
00104     else return NULL;
00105     for(i=entrycount,p1=thisdic->buf;p1!=NULL && *p1!=0 && i<MAX_DICENTRIES;p1=p2) {
00106        p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
00107        pp=strchr(p1,':'); if(pp==NULL) continue;
00108        *pp++=0;
00109        strip_trailing_spaces(p1); strip_trailing_spaces(pp);
00110        singlespace(p1);
00111        p1=find_word_start(p1); pp=find_word_start(pp);
00112        if(*p1==0) continue;
00113        if(i>entrycount && compare(entry+i-1,p1)>0)
00114          error("unsorted_dictionary %s: %s > %s.\n",
00115               fname,entry[i-1].original,p1);
00116        if(i>entrycount && strcmp(entry[i-1].original,p1)==0)
00117          error("duplication_in_dictionary %s: %s.\n",
00118               fname,p1);
00119        entry[i].original=p1; entry[i].replace=pp; 
00120        entry[i].olen=l=strlen(p1); entry[i].earlier=-1;
00121        if(i>0) {
00122            int l1,l2;
00123            l1=entry[i-1].earlier; if(l1>=0) l2=entry[l1].olen;
00124            else {l2=entry[i-1].olen;l1=i-1;}
00125            if(l>l2 && isspace(p1[l2])
00126               && strncmp(entry[l1].original,p1,l2)==0) 
00127              entry[i].earlier=entry[i-1].earlier=l1;
00128        }
00129        i++;
00130     }
00131     thisdic->len=i-entrycount;
00132     pp=strrchr("fname",'/'); if(pp==NULL) pp=fname;
00133     snprintf(tbuf,sizeof(tbuf),"unknown_%s",pp);
00134     _getdef(defbuf,tbuf,buf);
00135     p1=find_word_start(buf); *find_word_end(p1)=0;
00136     for(pp=p1; *pp; pp++) *pp=tolower(*pp);
00137     thisdic->unknown_type=unk_delete;
00138     if(strcmp(p1,"leave")==0) thisdic->unknown_type=unk_leave;
00139     else if(strcmp(p1,"delete")!=0) {
00140        thisdic->unknown_type=unk_replace;
00141        snprintf(thisdic->unknown,sizeof(thisdic->unknown),"%s",p1);
00142     }
00143     entrycount=i;
00144     if(debug) fprintf(stderr,"Dictionary %d: %s, %d entries.\n",
00145                     diccnt,fname,thisdic->len);
00146     return thisdic;
00147 }
00148 
00149        /* make the translation. */
00150 void _translate(char *p, int i)
00151 {
00152     char *p1, *p2, *pp;
00153     int t;
00154 
00155     if(i<0 || i>=diccnt) return;
00156     if(dic[i].len<=0) return;
00157     snprintf(troutbuf,sizeof(troutbuf),"%s",p);
00158     for(p1=find_word_start(troutbuf);
00159        p1!=NULL && p1-troutbuf<MAX_LINELEN && *p1!=0;
00160        p1=p2) {
00161        p2=find_word_end(p1);
00162        for(pp=p1;pp<p2 && (isalnum(*pp) || strchr("_",*pp)!=NULL);pp++);
00163        p2=find_word_start(p2);
00164        if(pp==p1 || (*pp!=0 && strchr(" ,.?!",*pp)==NULL)) continue;
00165        t=search_dic(entry+dic[i].start,dic[i].len,sizeof(entry[0]),p1);
00166        if(t<0) {
00167            switch(dic[i].unknown_type) {
00168               case unk_leave: break;
00169               case unk_delete: {
00170                   strcpy(p1,find_word_start(pp)); p2=p1;
00171                   break;
00172               }
00173               case unk_replace: {
00174                   string_modify(troutbuf,p1,pp,dic[i].unknown);
00175                   p2=find_word_start(p1+strlen(dic[i].unknown));
00176               }
00177            }
00178            continue;
00179        }
00180        t+=dic[i].start;
00181        string_modify(troutbuf,p1,p1+strlen(entry[t].original),
00182                     entry[t].replace);
00183        p2=find_word_start(p1+strlen(entry[t].replace));
00184     }
00185     snprintf(p,MAX_LINELEN,"%s",troutbuf);
00186 }
00187 
00188        /* make translation using file name */
00189 void translate(char *p, char *dicname)
00190 {
00191     int i;
00192     for(i=0;i<diccnt && strcmp(dicname,dic[i].name)!=0;i++);
00193     if(i<diccnt) _translate(p,i);
00194 }
00195 
00196        /* Returns dictionary index, or -1 if not found */
00197 int getdic(char *dicname)
00198 {
00199     int i;
00200     char *p1, *p2, buf[MAX_LINELEN+1];
00201     for(i=0;i<diccnt && strcmp(dicname,dic[i].name)!=0;i++);
00202     if(i<diccnt) return i;
00203     _getdef(defbuf,"dictionaries",buf);
00204     p1=wordchr(buf,dicname); if(p1==NULL) return -1;
00205     for(p2=p1; myisalnum(*p2) || *p2=='.'; p2++);
00206     if(p2-p1 >= MAX_NAMELEN) return -1;
00207     *p2=0; i=diccnt;
00208     prepare_dic(dicname); return i;
00209 }
00210