Back to index

wims  3.65+svn20090927
suffix.c
Go to the documentation of this file.
00001 /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
00002  *
00003  *  This program is free software; you can redistribute it and/or modify
00004  *  it under the terms of the GNU General Public License as published by
00005  *  the Free Software Foundation; either version 2 of the License, or
00006  *  (at your option) any later version.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program; if not, write to the Free Software
00015  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00016  */
00017 
/* Maximum number of dictionary entries kept in suf[]. */
#define suflim       256
/* Dictionary files of this many bytes or more are not loaded. */
#define sufbuflim 102400

/* Number of valid entries in suf[]. */
int suffixcnt;

/* Length of the word currently being looked up. */
int sufwordlen;

/* Length of the shortest entry in suf[]. */
int sufminlen;

/* Buffer holding the dictionary file; the strings in suf[] point into it. */
char *sufbuf;

/* The suffix table. */
struct {
    /* Key; its k-th character is matched against the k-th character
     * counted from the end of the word. */
    unsigned char *original;
    /* strlen() of original. */
    int olen;
    /* Text substituted for the matched word ending. */
    unsigned char *replace;
} suf[suflim];
00030 
00031        /* Suffix translation, to be used within translator. */
00032 
00033 int sufcomp(int t, const unsigned char *s2)
00034 {
00035     int k;
00036     
00037     for(k=0;k<suf[t].olen && k<sufwordlen
00038        && suf[t].original[k]==s2[sufwordlen-k-1];k++);
00039     if(k>=suf[t].olen) {
00040        if(sufwordlen>k) return -1; else return 0;
00041     }
00042     else return suf[t].original[k]-s2[sufwordlen-k-1];
00043 }
00044 
00045        /* searches a list. Returns index if found, -1 if nomatch. 
00046         * This routine is faster than naive one by one comparisons, 
00047         * and is especially suited for large lists. */
00048 int suffix_list(void *list, int items, size_t item_size, const char *str)
00049 {
00050     int i1,i2,j,k,t,v;
00051     char c,d;
00052     
00053     if(items<=0) return -1;
00054     k=sufcomp(0,str);
00055     if(k==0) return 0; if(k>0) return -1;
00056     j=items-1; k=sufcomp(j,str);
00057     if(k==0) return j;
00058     if(k>0) for(i1=0,i2=j;i2>i1+1;) {
00059        j=i1+(i2-i1)/2;      k=sufcomp(j,str);
00060        if(k==0) return j;
00061        if(k>0) {i2=j; continue;}
00062        if(k<0) {i1=j; continue;}   
00063     }
00064     if(k>0 && j>0) j--;
00065     backcheck:
00066     v=j;for(t=0;t<suf[j].olen && t<sufwordlen
00067        && suf[j].original[t]==str[sufwordlen-t-1];t++);
00068     if(t<sufminlen) return -1; if(t>=suf[j].olen) return j;
00069     for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t];
00070        j>=0 && suf[j].original[0]==c && suf[j].olen>t
00071        && suf[j].original[t-1]==d;j--);
00072     if(j>=0 && suf[j].original[0]==c && 
00073        strncmp(suf[j].original,suf[v].original,suf[j].olen)==0)
00074       return j;
00075     else goto backcheck;
00076 }
00077 
00078        /* Prepare dictionary.  */
00079 void suffix_dic(char *sdicname)
00080 {
00081     int i,k,l;
00082     FILE *suff;
00083     char *p1, *p2, *pp;
00084     long int flen;
00085 
00086     suffixcnt=0; sufminlen=100000;
00087     suff=fopen(sdicname,"r"); if(suff==NULL) return;
00088     fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET);
00089     if(flen>sufbuflim) return;
00090     sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff);
00091     fclose(suff);
00092     if(flen>0 && flen<sufbuflim) sufbuf[flen]=0;
00093     else return;
00094     for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) {
00095        p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
00096        pp=strchr(p1,':'); if(pp==NULL) continue;
00097        *pp++=0;
00098        strip_trailing_spaces(p1); strip_trailing_spaces(pp);
00099        p1=find_word_start(p1); pp=find_word_start(pp);
00100        if(*p1==0) continue;
00101        if(i>0) {
00102            k=strcmp(suf[i-1].original,p1);
00103            if(k>0) {
00104               pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
00105               error("unsorted_dictionary %s: %s > %s.\n",
00106                     pp,suf[i-1].original,p1);
00107            }
00108            if(k==0) {
00109               pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
00110               error("duplication_in_dictionary %s: %s.\n",pp,p1);
00111            }
00112        }
00113        suf[i].original=p1; suf[i].olen=l=strlen(p1);
00114        if(l<sufminlen) sufminlen=l;
00115        suf[i].replace=pp; i++;
00116     }
00117     suffixcnt=i;
00118 }
00119 
00120        /* Suffix translation. */
00121 void suffix_translate(char *p)
00122 {
00123     char *p1, *p2;
00124     int t;
00125 
00126     for(p1=find_word_start(p);
00127        p1!=NULL && p1-p<MAX_LINELEN && *p1!=0;
00128        p1=p2) {
00129        if(!isalpha(*p1)) {p2=p1+1; continue;}
00130        for(p2=p1;isalpha(*p2);p2++);
00131        if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue;
00132        sufwordlen=p2-p1;
00133        t=suffix_list(suf,suffixcnt,sizeof(suf[0]),p1);
00134        if(t<0) continue;
00135        string_modify(p,p2-suf[t].olen,p2,suf[t].replace);
00136        p2=p2-suf[t].olen+strlen(suf[t].replace);
00137     }
00138     p[MAX_LINELEN]=0;
00139 }
00140 
00141 void suffix(char *p, char *sdicname)
00142 {
00143     suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p);
00144 }
00145