Back to index

lightning-sunbird  0.9+nobinonly
suggestmgr.cpp
Go to the documentation of this file.
00001 #include "license.readme"
00002 #include <stdlib.h>
00003 #include <ctype.h>
00004 #include <string.h>
00005 #include <stdio.h>
00006 
00007 #include "suggestmgr.hxx"
00008 
00009 // using namespace std;
00010 
00011 extern char * mystrdup(const char *);
00012 
00013 
00014 SuggestMgr::SuggestMgr(const char * tryme, int maxn, 
00015                        AffixMgr * aptr)
00016 {
00017 
00018   // register affix manager and check in string of chars to 
00019   // try when building candidate suggestions
00020   pAMgr = aptr;
00021   ctry = mystrdup(tryme);
00022   ctryl = 0;
00023   if (ctry)
00024     ctryl = strlen(ctry);
00025   maxSug = maxn;
00026   nosplitsugs=(0==1);
00027   if (pAMgr) pAMgr->get_nosplitsugs();
00028 }
00029 
00030 
00031 SuggestMgr::~SuggestMgr()
00032 {
00033   pAMgr = NULL;
00034   if (ctry) free(ctry);
00035   ctry = NULL;
00036   ctryl = 0;
00037   maxSug = 0;
00038 }
00039 
00040 
00041 
00042 // generate suggestions for a mispelled word
00043 //    pass in address of array of char * pointers
00044 
00045 int SuggestMgr::suggest(char** wlst, int ns, const char * word)
00046 {
00047     
00048     int nsug = ns;
00049 
00050     // did we swap the order of chars by mistake
00051     if ((nsug < maxSug) && (nsug > -1))
00052       nsug = swapchar(wlst, word, nsug);
00053 
00054     // perhaps we made chose the wrong char from a related set
00055     if ((nsug < maxSug) && (nsug > -1))
00056       nsug = mapchars(wlst, word, nsug);
00057 
00058     // perhaps we made a typical fault of spelling
00059     if ((nsug < maxSug) && (nsug > -1))
00060       nsug = replchars(wlst, word, nsug);
00061 
00062     // did we forget to add a char
00063     if ((nsug < maxSug) && (nsug > -1))
00064       nsug = forgotchar(wlst, word, nsug);
00065 
00066     // did we add a char that should not be there
00067     if ((nsug < maxSug) && (nsug > -1))
00068       nsug = extrachar(wlst, word, nsug);
00069    
00070     // did we just hit the wrong key in place of a good char
00071     if ((nsug < maxSug) && (nsug > -1))
00072       nsug = badchar(wlst, word, nsug);
00073 
00074     // perhaps we forgot to hit space and two words ran together
00075     if (!nosplitsugs) {
00076         if ((nsug < maxSug) && (nsug > -1))
00077            nsug = twowords(wlst, word, nsug);
00078     }
00079     return nsug;
00080 }
00081 
00082 
00083 
00084 // suggestions for when chose the wrong char out of a related set
00085 int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
00086 {
00087   int wl = strlen(word);
00088   if (wl < 2 || ! pAMgr) return ns;
00089 
00090   int nummap = pAMgr->get_nummap();
00091   struct mapentry* maptable = pAMgr->get_maptable();
00092   if (maptable==NULL) return ns;
00093   ns = map_related(word, 0, wlst, ns, maptable, nummap);
00094   return ns;
00095 }
00096 
00097 
00098 int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap) 
00099 {
00100   char c = *(word + i);
00101   if (c == 0) {
00102       int cwrd = 1;
00103       for (int m=0; m < ns; m++)
00104          if (strcmp(word,wlst[m]) == 0) cwrd = 0;
00105       if ((cwrd) && check(word,strlen(word))) {
00106          if (ns < maxSug) {
00107              wlst[ns] = mystrdup(word);
00108               // fprintf(stderr,"map_related %d adding %s\n",ns, wlst[ns]); fflush(stderr);
00109              if (wlst[ns] == NULL) return -1;
00110              ns++;
00111          }
00112       }
00113       return ns;
00114   } 
00115   int in_map = 0;
00116   for (int j = 0; j < nummap; j++) {
00117     if (strchr(maptable[j].set,c) != 0) {
00118       in_map = 1;
00119 #ifdef __SUNPRO_CC // for SunONE Studio compiler
00120       char * newword = mystrdup(word);
00121 #else
00122       char * newword = strdup(word);
00123 #endif
00124       for (int k = 0; k < maptable[j].len; k++) {
00125        *(newword + i) = *(maptable[j].set + k);
00126        ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
00127       }
00128       free(newword);
00129     }
00130   }
00131   if (!in_map) {
00132      i++;
00133      ns = map_related(word, i, wlst, ns, maptable, nummap);
00134   }
00135   return ns;
00136 }
00137 
00138 
00139 
00140 // suggestions for a typical fault of spelling, that
00141 // differs with more, than 1 letter from the right form.
00142 int SuggestMgr::replchars(char** wlst, const char * word, int ns)
00143 {
00144   char candidate[MAXSWL];
00145   const char * r;
00146   int lenr, lenp;
00147   int cwrd;
00148 
00149   int wl = strlen(word);
00150   if (wl < 2 || ! pAMgr) return ns;
00151 
00152   int numrep = pAMgr->get_numrep();
00153   struct replentry* reptable = pAMgr->get_reptable();
00154   if (reptable==NULL) return ns;
00155 
00156   for (int i=0; i < numrep; i++ ) {
00157       r = word;
00158       lenr = strlen(reptable[i].replacement);
00159       lenp = strlen(reptable[i].pattern);
00160       // search every occurence of the pattern in the word
00161       while ((r=strstr(r, reptable[i].pattern)) != NULL) {
00162          strcpy(candidate, word);
00163          if (r-word + lenr + strlen(r+lenp) >= MAXSWL) break;
00164          strcpy(candidate+(r-word),reptable[i].replacement);
00165          strcpy(candidate+(r-word)+lenr, r+lenp);
00166           cwrd = 1;
00167           for (int k=0; k < ns; k++)
00168              if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00169           if ((cwrd) && check(candidate,strlen(candidate))) {
00170              if (ns < maxSug) {
00171                 wlst[ns] = mystrdup(candidate);
00172                   // fprintf(stderr,"replchars %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00173                 if (wlst[ns] == NULL) return -1;
00174                 ns++;
00175              } else return ns;
00176          }
00177           r++; // search for the next letter
00178       }
00179    }
00180    return ns;
00181 }
00182 
00183 
00184 // error is wrong char in place of correct one
00185 int SuggestMgr::badchar(char ** wlst, const char * word, int ns)
00186 {
00187   char tmpc;
00188   char candidate[MAXSWL];
00189 
00190   int wl = strlen(word);
00191   int cwrd;
00192   strcpy (candidate, word);
00193 
00194   // swap out each char one by one and try all the tryme
00195   // chars in its place to see if that makes a good word
00196   for (int i=0; i < wl; i++) {
00197     tmpc = candidate[i];
00198     for (int j=0; j < ctryl; j++) {
00199        if (ctry[j] == tmpc) continue;
00200        candidate[i] = ctry[j];
00201        cwrd = 1;
00202        for (int k=0; k < ns; k++)
00203         if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00204        if ((cwrd) && check(candidate,wl)) {
00205         if (ns < maxSug) {
00206             wlst[ns] = mystrdup(candidate);
00207             // fprintf(stderr,"bad_char %d adding %s\n",ns, wlst[ns]); fflush(stderr);
00208             if (wlst[ns] == NULL) return -1;
00209             ns++;
00210          } else return ns;
00211        }
00212        candidate[i] = tmpc;
00213     }
00214   }
00215   return ns;
00216 }
00217 
00218 
00219 // error is word has an extra letter it does not need 
00220 int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
00221 {
00222    char          candidate[MAXSWL];
00223    const char *  p;
00224    char *  r;
00225    int cwrd;
00226 
00227    int wl = strlen(word);
00228    if (wl < 2) return ns;
00229 
00230    // try omitting one char of word at a time
00231    strcpy (candidate, word + 1);
00232    for (p = word, r = candidate;  *p != 0;  ) {
00233        cwrd = 1;
00234        for (int k=0; k < ns; k++)
00235         if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00236        if ((cwrd) && check(candidate,wl-1)) {
00237         if (ns < maxSug) {
00238             wlst[ns] = mystrdup(candidate);
00239             // fprintf(stderr,"extra_char %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00240             if (wlst[ns] == NULL) return -1;
00241             ns++;
00242          } else return ns; 
00243        }
00244        *r++ = *p++;
00245    }
00246    return ns;
00247 }
00248 
00249 
00250 // error is mising a letter it needs
00251 int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
00252 {
00253    char       candidate[MAXSWL];
00254    const char *      p;
00255    char *     q;
00256    int cwrd;
00257 
00258    int wl = strlen(word);
00259 
00260    // try inserting a tryme character before every letter
00261    strcpy(candidate + 1, word);
00262    for (p = word, q = candidate;  *p != 0;  )  {
00263       for (int i = 0;  i < ctryl;  i++) {
00264         *q = ctry[i];
00265          cwrd = 1;
00266          for (int k=0; k < ns; k++)
00267           if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00268          if ((cwrd) && check(candidate,wl+1)) {
00269            if (ns < maxSug) {
00270                 wlst[ns] = mystrdup(candidate);
00271                 // fprintf(stderr,"forgotchar %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00272                 if (wlst[ns] == NULL) return -1;
00273                 ns++;
00274             } else return ns; 
00275          }
00276       }
00277       *q++ = *p++;
00278    }
00279 
00280    // now try adding one to end */
00281    for (int i = 0;  i < ctryl;  i++) {
00282       *q = ctry[i];
00283       cwrd = 1;
00284       for (int k=0; k < ns; k++)
00285        if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00286       if ((cwrd) && check(candidate,wl+1)) {
00287         if (ns < maxSug) {
00288              wlst[ns] = mystrdup(candidate);
00289             // fprintf(stderr,"forgot_char %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00290             if (wlst[ns] == NULL) return -1;
00291              ns++;
00292          } else return ns;
00293       }
00294    }
00295    return ns;
00296 }
00297 
00298 
00299 /* error is should have been two words */
00300 int SuggestMgr::twowords(char ** wlst, const char * word, int ns)
00301 {
00302     char candidate[MAXSWL];
00303     char * p;
00304 
00305     int wl=strlen(word);
00306     if (wl < 3) return ns;
00307     strcpy(candidate + 1, word);
00308 
00309     // split the string into two pieces after every char
00310     // if both pieces are good words make them a suggestion
00311     for (p = candidate + 1;  p[1] != '\0';  p++) {
00312        p[-1] = *p;
00313        *p = '\0';
00314        if (check(candidate,strlen(candidate))) {
00315         if (check((p+1),strlen(p+1))) {
00316            *p = ' ';
00317            if (ns < maxSug) {
00318                 wlst[ns] = mystrdup(candidate);
00319                // fprintf(stderr,"two_words %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00320                 if (wlst[ns] == NULL) return -1;
00321                 ns++;
00322             } else return ns;
00323          }
00324        }
00325     }
00326     return ns;
00327 }
00328 
00329 
00330 // error is adjacent letter were swapped
00331 int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
00332 {
00333    char       candidate[MAXSWL];
00334    char * p;
00335    char       tmpc;
00336    int cwrd;
00337 
00338    int wl = strlen(word);
00339 
00340    // try swapping adjacent chars one by one
00341    strcpy(candidate, word);
00342    for (p = candidate;  p[1] != 0;  p++) {
00343       tmpc = *p;
00344       *p = p[1];
00345       p[1] = tmpc;
00346       cwrd = 1;
00347       for (int k=0; k < ns; k++)
00348        if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
00349       if ((cwrd) && check(candidate,wl)) {
00350         if (ns < maxSug) {
00351              wlst[ns] = mystrdup(candidate);
00352             // fprintf(stderr,"swap_char %d adding %s\n",ns,wlst[ns]); fflush(stderr);
00353              if (wlst[ns] == NULL) return -1;
00354              ns++;
00355          } else return ns;
00356       }
00357       tmpc = *p;
00358       *p = p[1];
00359       p[1] = tmpc;
00360    }
00361    return ns;
00362 }
00363 
00364 
00365 // generate a set of suggestions for very poorly spelled words
00366 int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
00367 {
00368 
00369   int i, j;
00370   int lval;
00371   int sc;
00372   int lp;
00373 
00374   if (! pHMgr) return 0;
00375 
00376   // exhaustively search through all root words
00377   // keeping track of the MAX_ROOTS most similar root words
00378   struct hentry * roots[MAX_ROOTS];
00379   int scores[MAX_ROOTS];
00380   for (i = 0; i < MAX_ROOTS; i++) {
00381     roots[i] = NULL;
00382     scores[i] = -100 * i;
00383   }
00384   lp = MAX_ROOTS - 1;
00385 
00386   int n = strlen(word);
00387 
00388   struct hentry* hp = NULL;
00389   int col = -1;
00390   while ((hp = pHMgr->walk_hashtable(col, hp))) {
00391     sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
00392     if (sc > scores[lp]) {
00393       scores[lp] = sc;
00394       roots[lp] = hp;
00395       int lval = sc;
00396       for (j=0; j < MAX_ROOTS; j++)
00397        if (scores[j] < lval) {
00398          lp = j;
00399           lval = scores[j];
00400        }
00401     }  
00402   }
00403 
00404   // find minimum threshhold for a passable suggestion
00405   // mangle original word three differnt ways
00406   // and score them to generate a minimum acceptable score
00407   int thresh = 0;
00408   char * mw = NULL;
00409   for (int sp = 1; sp < 4; sp++) {
00410 #ifdef __SUNPRO_CC // for SunONE Studio compiler
00411      mw = mystrdup(word);
00412 #else
00413      mw = strdup(word);
00414 #endif
00415      for (int k=sp; k < n; k+=4) *(mw + k) = '*';
00416      thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
00417      free(mw);
00418   }
00419   mw = NULL;
00420   thresh = thresh / 3;
00421   thresh--;
00422 
00423   // now expand affixes on each of these root words and
00424   // and use length adjusted ngram scores to select
00425   // possible suggestions
00426   char * guess[MAX_GUESS];
00427   int gscore[MAX_GUESS];
00428   for(i=0;i<MAX_GUESS;i++) {
00429      guess[i] = NULL;
00430      gscore[i] = -100 * i;
00431   }
00432 
00433   lp = MAX_GUESS - 1;
00434 
00435   struct guessword * glst;
00436   glst = (struct guessword *) calloc(MAX_WORDS,sizeof(struct guessword));
00437   if (! glst) return 0;
00438 
00439   for (i = 0; i < MAX_ROOTS; i++) {
00440 
00441       if (roots[i]) {
00442         struct hentry * rp = roots[i];
00443        int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
00444                                         rp->astr, rp->alen);
00445         for (int k = 0; k < nw; k++) {
00446            sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
00447           if (sc > thresh)
00448           {
00449               if (sc > gscore[lp])
00450               {
00451                      if (guess[lp]) free(guess[lp]);
00452                      gscore[lp] = sc;
00453                      guess[lp] = glst[k].word;
00454                      glst[k].word = NULL;
00455                      lval = sc;
00456                      for (j=0; j < MAX_GUESS; j++)
00457                      {
00458                             if (gscore[j] < lval)
00459                             {
00460                                    lp = j;
00461                                    lval = gscore[j];
00462                             }
00463                      }
00464               }
00465           }
00466           free (glst[k].word);
00467           glst[k].word = NULL;
00468           glst[k].allow = 0;
00469        }
00470       }
00471   }
00472   if (glst) free(glst);
00473 
00474   // now we are done generating guesses
00475   // sort in order of decreasing score and copy over
00476   
00477   bubblesort(&guess[0], &gscore[0], MAX_GUESS);
00478   int ns = 0;
00479   for (i=0; i < MAX_GUESS; i++) {
00480     if (guess[i]) {
00481       int unique = 1;
00482       for (j=i+1; j < MAX_GUESS; j++)
00483        if (guess[j]) 
00484            if (!strcmp(guess[i], guess[j])) unique = 0;
00485       if (unique) {
00486          wlst[ns++] = guess[i];
00487       } else {
00488         free(guess[i]);
00489       }
00490     }
00491   }
00492   return ns;
00493 }
00494 
00495 
00496 
00497 
00498 // see if a candidate suggestion is spelled correctly
00499 // needs to check both root words and words with affixes
00500 int SuggestMgr::check(const char * word, int len)
00501 {
00502   struct hentry * rv=NULL;
00503   if (pAMgr) { 
00504     rv = pAMgr->lookup(word);
00505     if (rv == NULL) rv = pAMgr->affix_check(word,len);
00506   }
00507   if (rv) return 1;
00508   return 0;
00509 }
00510 
00511 
00512 
00513 // generate an n-gram score comparing s1 and s2
00514 int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
00515 {
00516   int nscore = 0;
00517   int l1 = strlen(s1);
00518   int l2 = strlen(s2);
00519   int ns;
00520   for (int j=1;j<=n;j++) {
00521     ns = 0;
00522     for (int i=0;i<=(l1-j);i++) {
00523       char c = *(s1 + i + j);
00524       *(s1 + i + j) = '\0';
00525       if (strstr(s2,(s1+i))) ns++;
00526       *(s1 + i + j ) = c;
00527     }
00528     nscore = nscore + ns;
00529     if (ns < 2) break;
00530   }
00531   ns = 0;
00532   if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
00533   if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
00534   return (nscore - ((ns > 0) ? ns : 0));
00535 }
00536 
00537 
00538 // sort in decreasing order of score
00539 void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
00540 {
00541       int m = 1;
00542       while (m < n) {
00543          int j = m;
00544          while (j > 0) {
00545            if (rsc[j-1] < rsc[j]) {
00546                int sctmp = rsc[j-1];
00547                 char * wdtmp = rword[j-1];
00548                rsc[j-1] = rsc[j];
00549                 rword[j-1] = rword[j];
00550                 rsc[j] = sctmp;
00551                 rword[j] = wdtmp;
00552                j--;
00553            } else break;
00554          }
00555           m++;
00556       }
00557       return;
00558 }
00559