Back to index

lightning-sunbird  0.9+nobinonly
myspell.cpp
Go to the documentation of this file.
00001 #include "license.readme"
00002 #include <string.h>
00003 #include <stdlib.h>
00004 #include <stdio.h>
00005 
00006 #include "myspell.hxx"
00007 
00008 // using namespace std;
00009 
00010 
00011 MySpell::MySpell(const char * affpath, const char * dpath)
00012 {
00013     encoding = NULL;
00014     csconv = NULL;
00015 
00016     /* first set up the hash manager */
00017     pHMgr = new HashMgr(dpath);
00018 
00019     /* next set up the affix manager */
00020     /* it needs access to the hash manager lookup methods */
00021     pAMgr = new AffixMgr(affpath,pHMgr);
00022 
00023     /* get the preferred try string and the dictionary */
00024     /* encoding from the Affix Manager for that dictionary */
00025     char * try_string = pAMgr->get_try_string();
00026     encoding = pAMgr->get_encoding();
00027     csconv = get_current_cs(encoding);
00028 
00029     /* and finally set up the suggestion manager */
00030     maxSug = 25;
00031     pSMgr = new SuggestMgr(try_string, maxSug, pAMgr);
00032     if (try_string) free(try_string);
00033 }
00034 
00035 
00036 MySpell::~MySpell()
00037 {
00038     delete pSMgr;
00039     delete pAMgr;
00040     delete pHMgr;
00041     
00042     csconv= NULL;
00043     if (encoding) 
00044         free(encoding);
00045 }
00046 
00047 
00048 // make a copy of src at destination while removing all leading
00049 // blanks and removing any trailing periods after recording
00050 // their presence with the abbreviation flag
00051 // also since already going through character by character, 
00052 // set the capitalization type
00053 // return the length of the "cleaned" word
00054 
00055 int MySpell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev)
00056 { 
00057 
00058   // with the new breakiterator code this should not be needed anymore
00059    const char * special_chars = "._#$%&()* +,-/:;<=>[]\\^`{|}~\t \x0a\x0d\x01\'\"";
00060 
00061    unsigned char * p = (unsigned char *) dest;
00062    const unsigned char * q = (const unsigned char * ) src;
00063 
00064    // first skip over any leading special characters
00065    while ((*q != '\0') && (strchr(special_chars,(int)(*q)))) q++;
00066    
00067    // now strip off any trailing special characters 
00068    // if a period comes after a normal char record its presence
00069    *pabbrev = 0;
00070    int nl = strlen((const char *)q);
00071    while ((nl > 0) && (strchr(special_chars,(int)(*(q+nl-1))))) {
00072        nl--;
00073    }
00074    if ( *(q+nl) == '.' ) *pabbrev = 1;
00075    
00076    // if no characters are left it can't be an abbreviation and can't be capitalized
00077    if (nl <= 0) { 
00078        *pcaptype = NOCAP;
00079        *pabbrev = 0;
00080        *p = '\0';
00081        return 0;
00082    }
00083 
00084    // now determine the capitalization type of the first nl letters
00085    int ncap = 0;
00086    int nneutral = 0;
00087    int nc = 0;
00088    while (nl > 0) {
00089        nc++;
00090        if (csconv[(*q)].ccase) ncap++;
00091        if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
00092        *p++ = *q++;
00093        nl--;
00094    }
00095    // remember to terminate the destination string
00096    *p = '\0';
00097 
00098    // now finally set the captype
00099    if (ncap == 0) {
00100         *pcaptype = NOCAP;
00101    } else if ((ncap == 1) && csconv[(unsigned char)(*dest)].ccase) {
00102         *pcaptype = INITCAP;
00103   } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
00104         *pcaptype = ALLCAP;
00105   } else {
00106         *pcaptype = HUHCAP;
00107   }
00108   return nc;
00109 } 
00110        
00111 
00112 int MySpell::spell(const char * word)
00113 {
00114   char * rv=NULL;
00115   char cw[MAXWORDLEN+1];
00116   char wspace[MAXWORDLEN+1];
00117 
00118   int wl = strlen(word);
00119   if (wl > (MAXWORDLEN - 1)) return 0;
00120   int captype = 0;
00121   int abbv = 0;
00122   wl = cleanword(cw, word, &captype, &abbv);
00123   if (wl == 0) return 1;
00124 
00125   switch(captype) {
00126      case HUHCAP:
00127      case NOCAP:   { 
00128                      rv = check(cw); 
00129                      if ((abbv) && !(rv)) {
00130                        memcpy(wspace,cw,wl);
00131                          *(wspace+wl) = '.';
00132                          *(wspace+wl+1) = '\0';
00133                          rv = check(wspace);
00134                      }
00135                      break;
00136                    }
00137 
00138      case ALLCAP:  {
00139                      memcpy(wspace,cw,(wl+1));
00140                      mkallsmall(wspace, csconv);
00141                      rv = check(wspace);
00142                      if (!rv) {
00143                         mkinitcap(wspace, csconv);
00144                         rv = check(wspace);
00145                      }
00146                      if (!rv) rv = check(cw);
00147                      if ((abbv) && !(rv)) {
00148                        memcpy(wspace,cw,wl);
00149                          *(wspace+wl) = '.';
00150                          *(wspace+wl+1) = '\0';
00151                          rv = check(wspace);
00152                      }
00153                      break; 
00154                    }
00155      case INITCAP: { 
00156                      memcpy(wspace,cw,(wl+1));
00157                      mkallsmall(wspace, csconv);
00158                      rv = check(wspace);
00159                      if (!rv) rv = check(cw);
00160                      if ((abbv) && !(rv)) {
00161                        memcpy(wspace,cw,wl);
00162                          *(wspace+wl) = '.';
00163                          *(wspace+wl+1) = '\0';
00164                          rv = check(wspace);
00165                      }
00166                      break; 
00167                    }
00168   }
00169   if (rv) return 1;
00170   return 0;
00171 }
00172 
00173 
00174 char * MySpell::check(const char * word)
00175 {
00176   struct hentry * he = NULL;
00177   if (pHMgr)
00178      he = pHMgr->lookup (word);
00179 
00180   if ((he == NULL) && (pAMgr)) {
00181      // try stripping off affixes */
00182      he = pAMgr->affix_check(word, strlen(word));
00183 
00184      // try check compound word
00185      if ((he == NULL) && (pAMgr->get_compound())) {
00186           he = pAMgr->compound_check(word, strlen(word), (pAMgr->get_compound())[0]);
00187      }
00188 
00189   }
00190 
00191   if (he) return he->word;
00192   return NULL;
00193 }
00194 
00195 
00196 
00197 int MySpell::suggest(char*** slst, const char * word)
00198 {
00199   char cw[MAXWORDLEN+1];
00200   char wspace[MAXWORDLEN+1];
00201   if (! pSMgr) return 0;
00202   int wl = strlen(word);
00203   if (wl > (MAXWORDLEN-1)) return 0;
00204   int captype = 0;
00205   int abbv = 0;
00206   wl = cleanword(cw, word, &captype, &abbv);
00207   if (wl == 0) return 0;
00208 
00209   int ns = 0;
00210   char ** wlst = (char **) calloc(maxSug, sizeof(char *));
00211   if (wlst == NULL) return 0;
00212 
00213   switch(captype) {
00214      case NOCAP:   { 
00215                      ns = pSMgr->suggest(wlst, ns, cw); 
00216                      break;
00217                    }
00218 
00219      case INITCAP: { 
00220 
00221                      ns = pSMgr->suggest(wlst,ns,cw); 
00222                      if (ns != -1) {
00223                         memcpy(wspace,cw,(wl+1));
00224                         mkallsmall(wspace, csconv);
00225                         if (ns) {
00226                            ns = pSMgr->suggest(wlst, ns, wspace);
00227                         } else {
00228                            int ns2 = pSMgr->suggest(wlst, ns, wspace);
00229                            for (int j=ns; j < ns2; j++)
00230                               mkinitcap(wlst[j], csconv);
00231                            ns = ns2;
00232                      }    
00233                      }
00234                      break;
00235                    }
00236 
00237      case HUHCAP: { 
00238                      ns = pSMgr->suggest(wlst, ns, cw);
00239                      if (ns != -1) {
00240                        memcpy(wspace,cw,(wl+1));
00241                        mkallsmall(wspace, csconv);
00242                        ns = pSMgr->suggest(wlst, ns, wspace);
00243                      } 
00244                      break;
00245                    }
00246 
00247      case ALLCAP: { 
00248                      memcpy(wspace,cw,(wl+1));
00249                      mkallsmall(wspace, csconv);
00250                      ns = pSMgr->suggest(wlst, ns, wspace);
00251                      if (ns > 0) {
00252                        for (int j=0; j < ns; j++)
00253                          mkallcap(wlst[j], csconv);
00254                      } 
00255                      if (ns != -1) 
00256                          ns = pSMgr->suggest(wlst, ns , cw);
00257                      break;
00258                    }
00259   }
00260   if (ns > 0) {
00261        *slst = wlst;
00262        return ns;
00263   }
00264   // try ngram approach since found nothing
00265   if (ns == 0) { 
00266      ns = pSMgr->ngsuggest(wlst, cw, pHMgr);
00267      if (ns) {
00268          switch(captype) {
00269            case NOCAP:  break;
00270             case HUHCAP: break; 
00271             case INITCAP: { 
00272                             for (int j=0; j < ns; j++)
00273                               mkinitcap(wlst[j], csconv);
00274                           }
00275                           break;
00276 
00277             case ALLCAP: { 
00278                             for (int j=0; j < ns; j++)
00279                               mkallcap(wlst[j], csconv);
00280                          } 
00281                          break;
00282         }
00283          *slst = wlst;
00284          return ns;
00285      }
00286   }
00287   if (ns < 0) {
00288      // we ran out of memory - we should free up as much as possible
00289      for (int i=0;i<maxSug; i++)
00290         if (wlst[i] != NULL) free(wlst[i]);
00291   }
00292   if (wlst) free(wlst);
00293   *slst = NULL;
00294   return 0;
00295 }
00296 
00297 
00298 char * MySpell::get_dic_encoding()
00299 {
00300   return encoding;
00301 }
00302