Back to index

lightning-sunbird  0.9+nobinonly
affixmgr.cpp
Go to the documentation of this file.
00001 #include "license.readme"
00002 
00003 #include <stdlib.h> 
00004 #include <string.h>
00005 #include <stdio.h> 
00006 
00007 #include "affixmgr.hxx"
00008 #include "affentry.hxx"
00009 
00010 // using namespace std;
00011 
00012 
00013 // First some base level utility routines
00014 extern void   mychomp(char * s);
00015 extern char * mystrdup(const char * s);
00016 extern char * myrevstrdup(const char * s);
00017 extern char * mystrsep(char ** sptr, const char delim);
00018 extern int    isSubset(const char * s1, const char * s2); 
00019 extern int    isRevSubset(const char * s1, const char * end_of_s2, int len_s2); 
00020 
00021 
00022 AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) 
00023 {
00024   // register hash manager and load affix data from aff file
00025   pHMgr = ptr;
00026   trystring = NULL;
00027   encoding=NULL;
00028   reptable = NULL;
00029   numrep = 0;
00030   maptable = NULL;
00031   nummap = 0;
00032   compound=NULL;
00033   nosplitsugs= (0==1);
00034 
00035   cpdmin = 3;  // default value
00036   for (int i=0; i < SETSIZE; i++) {
00037      pStart[i] = NULL;
00038      sStart[i] = NULL;
00039      pFlag[i] = NULL;
00040      sFlag[i] = NULL;
00041   }
00042   if (parse_file(affpath)) {
00043      fprintf(stderr,"Failure loading aff file %s\n",affpath);
00044      fflush(stderr);
00045   }
00046 }
00047 
00048 
00049 AffixMgr::~AffixMgr() 
00050 {
00051  
00052   // pass through linked prefix entries and clean up
00053   for (int i=0; i < SETSIZE ;i++) {
00054        pFlag[i] = NULL;
00055        PfxEntry * ptr = (PfxEntry *)pStart[i];
00056        PfxEntry * nptr = NULL;
00057        while (ptr) {
00058             nptr = ptr->getNext();
00059             delete(ptr);
00060             ptr = nptr;
00061             nptr = NULL;
00062        }  
00063   }
00064 
00065   // pass through linked suffix entries and clean up
00066   for (int j=0; j < SETSIZE ; j++) {
00067        sFlag[j] = NULL;
00068        SfxEntry * ptr = (SfxEntry *)sStart[j];
00069        SfxEntry * nptr = NULL;
00070        while (ptr) {
00071             nptr = ptr->getNext();
00072             delete(ptr);
00073             ptr = nptr;
00074             nptr = NULL;
00075        }  
00076   }
00077 
00078   if (trystring) free(trystring);
00079   trystring=NULL;
00080   if (encoding) free(encoding);
00081   encoding=NULL;
00082   if (maptable) {  
00083      for (int j=0; j < nummap; j++) {
00084         free(maptable[j].set);
00085         maptable[j].set = NULL;
00086         maptable[j].len = 0;
00087      }
00088      free(maptable);  
00089      maptable = NULL;
00090   }
00091   nummap = 0;
00092   if (reptable) {  
00093      for (int j=0; j < numrep; j++) {
00094         free(reptable[j].pattern);
00095         free(reptable[j].replacement);
00096         reptable[j].pattern = NULL;
00097         reptable[j].replacement = NULL;
00098      }
00099      free(reptable);  
00100      reptable = NULL;
00101   }
00102   numrep = 0;
00103   if (compound) free(compound);
00104   compound=NULL;
00105   pHMgr = NULL;
00106   cpdmin = 0;
00107 }
00108 
00109 
00110 // read in aff file and build up prefix and suffix entry objects 
00111 int  AffixMgr::parse_file(const char * affpath)
00112 {
00113 
00114   // io buffers
00115   char line[MAXLNLEN+1];
00116  
00117   // affix type
00118   char ft;
00119 
00120   // open the affix file
00121   FILE * afflst;
00122   afflst = fopen(affpath,"r");
00123   if (!afflst) {
00124     fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
00125     return 1;
00126   }
00127 
00128   // step one is to parse the affix file building up the internal
00129   // affix data structures
00130 
00131 
00132     // read in each line ignoring any that do not
00133     // start with a known line type indicator
00134 
00135     while (fgets(line,MAXLNLEN,afflst)) {
00136        mychomp(line);
00137 
00138        /* parse in the try string */
00139        if (strncmp(line,"TRY",3) == 0) {
00140           if (parse_try(line)) {
00141              return 1;
00142           }
00143        }
00144 
00145        /* parse in the name of the character set used by the .dict and .aff */
00146        if (strncmp(line,"SET",3) == 0) {
00147           if (parse_set(line)) {
00148              return 1;
00149           }
00150        }
00151 
00152        /* parse in the flag used by the controlled compound words */
00153        if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
00154           if (parse_cpdflag(line)) {
00155              return 1;
00156           }
00157        }
00158 
00159        /* parse in the flag used by the controlled compound words */
00160        if (strncmp(line,"COMPOUNDMIN",11) == 0) {
00161           if (parse_cpdmin(line)) {
00162              return 1;
00163           }
00164        }
00165 
00166        /* parse in the typical fault correcting table */
00167        if (strncmp(line,"REP",3) == 0) {
00168           if (parse_reptable(line, afflst)) {
00169              return 1;
00170           }
00171        }
00172 
00173        /* parse in the related character map table */
00174        if (strncmp(line,"MAP",3) == 0) {
00175           if (parse_maptable(line, afflst)) {
00176              return 1;
00177           }
00178        }
00179 
00180        // parse this affix: P - prefix, S - suffix
00181        ft = ' ';
00182        if (strncmp(line,"PFX",3) == 0) ft = 'P';
00183        if (strncmp(line,"SFX",3) == 0) ft = 'S';
00184        if (ft != ' ') {
00185           if (parse_affix(line, ft, afflst)) {
00186              return 1;
00187           }
00188        }
00189 
00190        // handle NOSPLITSUGS
00191        if (strncmp(line,"NOSPLITSUGS",11) == 0)
00192                  nosplitsugs=(0==0);
00193 
00194     }
00195     fclose(afflst);
00196 
00197     // convert affix trees to sorted list
00198     process_pfx_tree_to_list();
00199     process_sfx_tree_to_list();
00200 
00201     // now we can speed up performance greatly taking advantage of the 
00202     // relationship between the affixes and the idea of "subsets".
00203 
00204     // View each prefix as a potential leading subset of another and view
00205     // each suffix (reversed) as a potential trailing subset of another.
00206 
00207     // To illustrate this relationship if we know the prefix "ab" is found in the
00208     // word to examine, only prefixes that "ab" is a leading subset of need be examined.
00209     // Furthermore is "ab" is not present then none of the prefixes that "ab" is
00210     // is a subset need be examined.
00211     // The same argument goes for suffix string that are reversed.
00212 
00213     // Then to top this off why not examine the first char of the word to quickly
00214     // limit the set of prefixes to examine (i.e. the prefixes to examine must 
00215     // be leading supersets of the first character of the word (if they exist)
00216  
00217     // To take advantage of this "subset" relationship, we need to add two links
00218     // from entry.  One to take next if the current prefix is found (call it nexteq)
00219     // and one to take next if the current prefix is not found (call it nextne).
00220 
00221     // Since we have built ordered lists, all that remains is to properly intialize 
00222     // the nextne and nexteq pointers that relate them
00223 
00224     process_pfx_order();
00225     process_sfx_order();
00226 
00227     return 0;
00228 }
00229 
00230 
00231 // we want to be able to quickly access prefix information
00232 // both by prefix flag, and sorted by prefix string itself 
00233 // so we need to set up two indexes
00234 
00235 int AffixMgr::build_pfxtree(AffEntry* pfxptr)
00236 {
00237   PfxEntry * ptr;
00238   PfxEntry * pptr;
00239   PfxEntry * ep = (PfxEntry*) pfxptr;
00240 
00241   // get the right starting points
00242   const char * key = ep->getKey();
00243   const unsigned char flg = ep->getFlag();
00244 
00245   // first index by flag which must exist
00246   ptr = (PfxEntry*)pFlag[flg];
00247   ep->setFlgNxt(ptr);
00248   pFlag[flg] = (AffEntry *) ep;
00249 
00250 
00251   // handle the special case of null affix string
00252   if (strlen(key) == 0) {
00253     // always inset them at head of list at element 0
00254      ptr = (PfxEntry*)pStart[0];
00255      ep->setNext(ptr);
00256      pStart[0] = (AffEntry*)ep;
00257      return 0;
00258   }
00259 
00260   // now handle the normal case
00261   ep->setNextEQ(NULL);
00262   ep->setNextNE(NULL);
00263 
00264   unsigned char sp = *((const unsigned char *)key);
00265   ptr = (PfxEntry*)pStart[sp];
00266   
00267   // handle the first insert 
00268   if (!ptr) {
00269      pStart[sp] = (AffEntry*)ep;
00270      return 0;
00271   }
00272 
00273 
00274   // otherwise use binary tree insertion so that a sorted
00275   // list can easily be generated later
00276   pptr = NULL;
00277   for (;;) {
00278     pptr = ptr;
00279     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
00280        ptr = ptr->getNextEQ();
00281        if (!ptr) {
00282          pptr->setNextEQ(ep);
00283           break;
00284        }
00285     } else {
00286        ptr = ptr->getNextNE();
00287        if (!ptr) {
00288          pptr->setNextNE(ep);
00289           break;
00290        }
00291     }
00292   }
00293   return 0;
00294 }
00295 
00296 
00297 
00298 // we want to be able to quickly access suffix information
00299 // both by suffix flag, and sorted by the reverse of the
00300 // suffix string itself; so we need to set up two indexes
00301 int AffixMgr::build_sfxtree(AffEntry* sfxptr)
00302 {
00303   SfxEntry * ptr;
00304   SfxEntry * pptr;
00305   SfxEntry * ep = (SfxEntry *) sfxptr;
00306 
00307   /* get the right starting point */
00308   const char * key = ep->getKey();
00309   const unsigned char flg = ep->getFlag();
00310 
00311   // first index by flag which must exist
00312   ptr = (SfxEntry*)sFlag[flg];
00313   ep->setFlgNxt(ptr);
00314   sFlag[flg] = (AffEntry *) ep;
00315 
00316 
00317   // next index by affix string
00318 
00319   // handle the special case of null affix string
00320   if (strlen(key) == 0) {
00321     // always inset them at head of list at element 0
00322      ptr = (SfxEntry*)sStart[0];
00323      ep->setNext(ptr);
00324      sStart[0] = (AffEntry*)ep;
00325      return 0;
00326   }
00327 
00328   // now handle the normal case
00329   ep->setNextEQ(NULL);
00330   ep->setNextNE(NULL);
00331 
00332   unsigned char sp = *((const unsigned char *)key);
00333   ptr = (SfxEntry*)sStart[sp];
00334   
00335   // handle the first insert 
00336   if (!ptr) {
00337      sStart[sp] = (AffEntry*)ep;
00338      return 0;
00339   }
00340 
00341 
00342   // otherwise use binary tree insertion so that a sorted
00343   // list can easily be generated later
00344   pptr = NULL;
00345   for (;;) {
00346     pptr = ptr;
00347     if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
00348        ptr = ptr->getNextEQ();
00349        if (!ptr) {
00350          pptr->setNextEQ(ep);
00351           break;
00352        }
00353     } else {
00354        ptr = ptr->getNextNE();
00355        if (!ptr) {
00356          pptr->setNextNE(ep);
00357           break;
00358        }
00359     }
00360   }
00361   return 0;
00362 }
00363 
00364 
00365 // convert from binary tree to sorted list
00366 int AffixMgr::process_pfx_tree_to_list()
00367 {
00368   for (int i=1; i< SETSIZE; i++) {
00369     pStart[i] = process_pfx_in_order(pStart[i],NULL);
00370   }
00371   return 0;
00372 }
00373 
00374 
00375 AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
00376 {
00377   if (ptr) {
00378     nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
00379     ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
00380     nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
00381   }
00382   return nptr;
00383 }
00384 
00385 
00386 // convert from binary tree to sorted list
00387 int AffixMgr:: process_sfx_tree_to_list()
00388 {
00389   for (int i=1; i< SETSIZE; i++) {
00390     sStart[i] = process_sfx_in_order(sStart[i],NULL);
00391   }
00392   return 0;
00393 }
00394 
00395 AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
00396 {
00397   if (ptr) {
00398     nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
00399     ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
00400     nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
00401   }
00402   return nptr;
00403 }
00404 
00405 
00406 
00407 // reinitialize the PfxEntry links NextEQ and NextNE to speed searching
00408 // using the idea of leading subsets this time
00409 int AffixMgr::process_pfx_order()
00410 {
00411     PfxEntry* ptr;
00412 
00413     // loop through each prefix list starting point
00414     for (int i=1; i < SETSIZE; i++) {
00415 
00416          ptr = (PfxEntry*)pStart[i];
00417 
00418          // look through the remainder of the list
00419          //  and find next entry with affix that 
00420          // the current one is not a subset of
00421          // mark that as destination for NextNE
00422          // use next in list that you are a subset
00423          // of as NextEQ
00424 
00425          for (; ptr != NULL; ptr = ptr->getNext()) {
00426 
00427             PfxEntry * nptr = ptr->getNext();
00428              for (; nptr != NULL; nptr = nptr->getNext()) {
00429                 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
00430              }
00431              ptr->setNextNE(nptr);
00432              ptr->setNextEQ(NULL);
00433              if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey())) 
00434                  ptr->setNextEQ(ptr->getNext());
00435          }
00436 
00437          // now clean up by adding smart search termination strings:
00438          // if you are already a superset of the previous prefix
00439          // but not a subset of the next, search can end here
00440          // so set NextNE properly
00441 
00442          ptr = (PfxEntry *) pStart[i];
00443          for (; ptr != NULL; ptr = ptr->getNext()) {
00444             PfxEntry * nptr = ptr->getNext();
00445              PfxEntry * mptr = NULL;
00446              for (; nptr != NULL; nptr = nptr->getNext()) {
00447                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
00448                  mptr = nptr;
00449              }
00450              if (mptr) mptr->setNextNE(NULL);
00451          }
00452     }
00453     return 0;
00454 }
00455 
00456 
00457 
00458 // reinitialize the SfxEntry links NextEQ and NextNE to speed searching
00459 // using the idea of leading subsets this time
00460 int AffixMgr::process_sfx_order()
00461 {
00462     SfxEntry* ptr;
00463 
00464     // loop through each prefix list starting point
00465     for (int i=1; i < SETSIZE; i++) {
00466 
00467          ptr = (SfxEntry *) sStart[i];
00468 
00469          // look through the remainder of the list
00470          //  and find next entry with affix that 
00471          // the current one is not a subset of
00472          // mark that as destination for NextNE
00473          // use next in list that you are a subset
00474          // of as NextEQ
00475 
00476          for (; ptr != NULL; ptr = ptr->getNext()) {
00477             SfxEntry * nptr = ptr->getNext();
00478              for (; nptr != NULL; nptr = nptr->getNext()) {
00479                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
00480              }
00481              ptr->setNextNE(nptr);
00482              ptr->setNextEQ(NULL);
00483              if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey())) 
00484                  ptr->setNextEQ(ptr->getNext());
00485          }
00486 
00487 
00488          // now clean up by adding smart search termination strings:
00489          // if you are already a superset of the previous suffix
00490          // but not a subset of the next, search can end here
00491          // so set NextNE properly
00492 
00493          ptr = (SfxEntry *) sStart[i];
00494          for (; ptr != NULL; ptr = ptr->getNext()) {
00495             SfxEntry * nptr = ptr->getNext();
00496              SfxEntry * mptr = NULL;
00497              for (; nptr != NULL; nptr = nptr->getNext()) {
00498                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
00499                  mptr = nptr;
00500              }
00501              if (mptr) mptr->setNextNE(NULL);
00502          }
00503     }
00504     return 0;
00505 }
00506 
00507 
00508 
00509 // takes aff file condition string and creates the
00510 // conds array - please see the appendix at the end of the
00511 // file affentry.cxx which describes what is going on here
00512 // in much more detail
00513 
00514 void AffixMgr::encodeit(struct affentry * ptr, char * cs)
00515 {
00516   unsigned char c;
00517   int i, j, k;
00518   unsigned char mbr[MAXLNLEN];
00519 
00520   // now clear the conditions array */
00521   for (i=0;i<SETSIZE;i++) ptr->conds[i] = (unsigned char) 0;
00522 
00523   // now parse the string to create the conds array */
00524   int nc = strlen(cs);
00525   int neg = 0;   // complement indicator
00526   int grp = 0;   // group indicator
00527   int n = 0;     // number of conditions
00528   int ec = 0;    // end condition indicator
00529   int nm = 0;    // number of member in group
00530 
00531   // if no condition just return
00532   if (strcmp(cs,".")==0) {
00533     ptr->numconds = 0;
00534     return;
00535   }
00536 
00537   i = 0;
00538   while (i < nc) {
00539     c = *((unsigned char *)(cs + i));
00540 
00541     // start group indicator
00542     if (c == '[') {
00543        grp = 1;
00544        c = 0;
00545     }
00546 
00547     // complement flag
00548     if ((grp == 1) && (c == '^')) {
00549        neg = 1;
00550        c = 0;
00551     }
00552 
00553     // end goup indicator
00554     if (c == ']') {
00555        ec = 1;
00556        c = 0;
00557     }
00558 
00559     // add character of group to list
00560     if ((grp == 1) && (c != 0)) {
00561       *(mbr + nm) = c;
00562       nm++;
00563       c = 0;
00564     }
00565 
00566     // end of condition 
00567     if (c != 0) {
00568        ec = 1;
00569     }
00570 
00571     
00572     if (ec) {
00573       if (grp == 1) {
00574         if (neg == 0) {
00575           // set the proper bits in the condition array vals for those chars
00576          for (j=0;j<nm;j++) {
00577             k = (unsigned int) mbr[j];
00578              ptr->conds[k] = ptr->conds[k] | (1 << n);
00579           }
00580        } else {
00581          // complement so set all of them and then unset indicated ones
00582           for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
00583           for (j=0;j<nm;j++) {
00584             k = (unsigned int) mbr[j];
00585              ptr->conds[k] = ptr->conds[k] & ~(1 << n);
00586           }
00587         }
00588         neg = 0;
00589         grp = 0;   
00590         nm = 0;
00591       } else {
00592          // not a group so just set the proper bit for this char
00593          // but first handle special case of . inside condition
00594          if (c == '.') {
00595            // wild card character so set them all
00596             for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
00597          } else {  
00598            ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
00599          }
00600       }
00601       n++;
00602       ec = 0;
00603     }
00604 
00605 
00606     i++;
00607   }
00608   ptr->numconds = n;
00609   return;
00610 }
00611 
00612 
00613 // check word for prefixes
00614 struct hentry * AffixMgr::prefix_check (const char * word, int len)
00615 {
00616     struct hentry * rv= NULL;
00617  
00618     // first handle the special case of 0 length prefixes
00619     PfxEntry * pe = (PfxEntry *) pStart[0];
00620     while (pe) {
00621        rv = pe->check(word,len);
00622        if (rv) return rv;
00623        pe = pe->getNext();
00624     }
00625   
00626     // now handle the general case
00627     unsigned char sp = *((const unsigned char *)word);
00628     PfxEntry * pptr = (PfxEntry *)pStart[sp];
00629 
00630     while (pptr) {
00631         if (isSubset(pptr->getKey(),word)) {
00632             rv = pptr->check(word,len);
00633              if (rv) return rv;
00634              pptr = pptr->getNextEQ();
00635         } else {
00636             pptr = pptr->getNextNE();
00637         }
00638     }
00639     
00640     return NULL;
00641 }
00642 
00643 // check if compound word is correctly spelled
00644 struct hentry * AffixMgr::compound_check (const char * word, int len, char compound_flag)
00645 {
00646     int i;
00647     struct hentry * rv= NULL;
00648     char * st;
00649     char ch;
00650     
00651     // handle case of string too short to be a piece of a compound word 
00652     if (len < cpdmin) return NULL;
00653 
00654     st = mystrdup(word);
00655     
00656     for (i=cpdmin; i < (len - (cpdmin-1)); i++) {
00657 
00658         ch = st[i];
00659        st[i] = '\0';
00660 
00661        rv = lookup(st);
00662         if (!rv) rv = affix_check(st,i);
00663 
00664        if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
00665            rv = lookup((word+i));
00666            if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
00667               free(st);
00668               return rv;
00669            }
00670            rv = affix_check((word+i),strlen(word+i));
00671            if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
00672               free(st);
00673               return rv;
00674            }
00675            rv = compound_check((word+i),strlen(word+i),compound_flag); 
00676            if (rv) {
00677               free(st);
00678               return rv;
00679            }
00680            
00681        }
00682         st[i] = ch;
00683     }
00684     free(st);
00685     return NULL;
00686 }    
00687 
00688 
00689 
00690 // check word for suffixes
00691 struct hentry * AffixMgr::suffix_check (const char * word, int len, 
00692                        int sfxopts, AffEntry * ppfx)
00693 {
00694     struct hentry * rv = NULL;
00695 
00696     // first handle the special case of 0 length suffixes
00697     SfxEntry * se = (SfxEntry *) sStart[0];
00698     while (se) {
00699        rv = se->check(word,len, sfxopts, ppfx);
00700        if (rv) return rv;
00701        se = se->getNext();
00702     }
00703   
00704     // now handle the general case
00705     unsigned char sp = *((const unsigned char *)(word + len - 1));
00706 
00707 
00708     SfxEntry * sptr = (SfxEntry *) sStart[sp];
00709 
00710     while (sptr) {
00711         if (isRevSubset(sptr->getKey(),(word+len-1), len)) {
00712             rv = sptr->check(word,len, sfxopts, ppfx);
00713              if (rv) {
00714                   return rv;
00715              }
00716              sptr = sptr->getNextEQ();
00717         } else {
00718             sptr = sptr->getNextNE();
00719         }
00720     }
00721     return NULL;
00722 }
00723 
00724 
00725 
00726 // check if word with affixes is correctly spelled
00727 struct hentry * AffixMgr::affix_check (const char * word, int len)
00728 {
00729     struct hentry * rv= NULL;
00730 
00731     // check all prefixes (also crossed with suffixes if allowed) 
00732     rv = prefix_check(word, len);
00733     if (rv) return rv;
00734 
00735     // if still not found check all suffixes
00736     rv = suffix_check(word, len, 0, NULL);
00737     return rv;
00738 }
00739 
00740 
00741 int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, 
00742                        const char * ts, int wl, const char * ap, int al)
00743 {
00744 
00745     int nh=0;
00746 
00747     // first add root word to list
00748 
00749     if (nh < maxn) {
00750        wlst[nh].word = mystrdup(ts);
00751        wlst[nh].allow = (1 == 0);
00752        nh++;
00753     }
00754 
00755     // handle suffixes
00756     for (int i = 0; i < al; i++) {
00757        unsigned char c = (unsigned char) ap[i];
00758        SfxEntry * sptr = (SfxEntry *)sFlag[c];
00759        while (sptr) {
00760         char * newword = sptr->add(ts, wl);
00761          if (newword) {
00762            if (nh < maxn) {
00763              wlst[nh].word = newword;
00764               wlst[nh].allow = sptr->allowCross();
00765               nh++;
00766           } else {
00767              free(newword);
00768            }
00769         }
00770          sptr = (SfxEntry *)sptr ->getFlgNxt();
00771        }
00772     }
00773 
00774     int n = nh;
00775 
00776     // handle cross products of prefixes and suffixes
00777     for (int j=1;j<n ;j++)
00778        if (wlst[j].allow) {
00779           for (int k = 0; k < al; k++) {
00780              unsigned char c = (unsigned char) ap[k];
00781              PfxEntry * cptr = (PfxEntry *) pFlag[c];
00782              while (cptr) {
00783                 if (cptr->allowCross()) {
00784                    int l1 = strlen(wlst[j].word);
00785                    char * newword = cptr->add(wlst[j].word, l1);
00786                     if (newword) {
00787                      if (nh < maxn) {
00788                          wlst[nh].word = newword;
00789                           wlst[nh].allow = cptr->allowCross();
00790                           nh++;
00791                      } else {
00792                        free(newword);
00793                        }
00794                    }
00795                 }
00796                 cptr = (PfxEntry *)cptr ->getFlgNxt();
00797              }
00798          }
00799        }
00800 
00801 
00802     // now handle pure prefixes
00803     for (int m = 0; m < al; m ++) {
00804        unsigned char c = (unsigned char) ap[m];
00805        PfxEntry * ptr = (PfxEntry *) pFlag[c];
00806        while (ptr) {
00807         char * newword = ptr->add(ts, wl);
00808          if (newword) {
00809             if (nh < maxn) {
00810                wlst[nh].word = newword;
00811                 wlst[nh].allow = ptr->allowCross();
00812                 nh++;
00813              } else {
00814                free(newword);
00815             } 
00816         }
00817          ptr = (PfxEntry *)ptr ->getFlgNxt();
00818        }
00819     }
00820 
00821     return nh;
00822 }
00823 
00824 
00825 // return length of replacing table
00826 int AffixMgr::get_numrep()
00827 {
00828   return numrep;
00829 }
00830 
00831 // return replacing table
00832 struct replentry * AffixMgr::get_reptable()
00833 {
00834   if (! reptable ) return NULL;
00835   return reptable;
00836 }
00837 
00838 
00839 // return length of character map table
00840 int AffixMgr::get_nummap()
00841 {
00842   return nummap;
00843 }
00844 
00845 // return character map table
00846 struct mapentry * AffixMgr::get_maptable()
00847 {
00848   if (! maptable ) return NULL;
00849   return maptable;
00850 }
00851 
00852 // return text encoding of dictionary
00853 char * AffixMgr::get_encoding()
00854 {
00855   if (! encoding ) {
00856       encoding = mystrdup("ISO8859-1");
00857   }
00858   return mystrdup(encoding);
00859 }
00860 
00861 
00862 // return the preferred try string for suggestions
00863 char * AffixMgr::get_try_string()
00864 {
00865   if (! trystring ) return NULL;
00866   return mystrdup(trystring);
00867 }
00868 
00869 // return the compound words control flag
00870 char * AffixMgr::get_compound()
00871 {
00872   if (! compound ) return NULL;
00873   return compound;
00874 }
00875 
00876 // utility method to look up root words in hash table
00877 struct hentry * AffixMgr::lookup(const char * word)
00878 {
00879   if (! pHMgr) return NULL;
00880   return pHMgr->lookup(word);
00881 }
00882 
00883 // return nosplitsugs
00884 bool AffixMgr::get_nosplitsugs(void)
00885 {
00886   return nosplitsugs;
00887 }
00888 
00889 /* parse in the try string */
00890 int  AffixMgr::parse_try(char * line)
00891 {
00892    if (trystring) {
00893       fprintf(stderr,"error: duplicate TRY strings\n");
00894       return 1;
00895    }
00896    char * tp = line;
00897    char * piece;
00898    int i = 0;
00899    int np = 0;
00900    while ((piece=mystrsep(&tp,' '))) {
00901       if (*piece != '\0') {
00902           switch(i) {
00903              case 0: { np++; break; }
00904               case 1: { trystring = mystrdup(piece); np++; break; }
00905              default: break;
00906           }
00907           i++;
00908       }
00909       free(piece);
00910    }
00911    if (np != 2) {
00912       fprintf(stderr,"error: missing TRY information\n");
00913       return 1;
00914    } 
00915    return 0;
00916 }
00917 
00918 
00919 /* parse in the name of the character set used by the .dict and .aff */
00920 int  AffixMgr::parse_set(char * line)
00921 {
00922    if (encoding) {
00923       fprintf(stderr,"error: duplicate SET strings\n");
00924       return 1;
00925    }
00926    char * tp = line;
00927    char * piece;
00928    int i = 0;
00929    int np = 0;
00930    while ((piece=mystrsep(&tp,' '))) {
00931       if (*piece != '\0') {
00932           switch(i) {
00933             case 0: { np++; break; }
00934              case 1: { encoding = mystrdup(piece); np++; break; }
00935             default: break;
00936           }
00937           i++;
00938       }
00939       free(piece);
00940    }
00941    if (np != 2) {
00942       fprintf(stderr,"error: missing SET information\n");
00943       return 1;
00944    } 
00945    return 0;
00946 }
00947 
00948 
00949 /* parse in the flag used by the controlled compound words */
00950 int  AffixMgr::parse_cpdflag(char * line)
00951 {
00952    if (compound) {
00953       fprintf(stderr,"error: duplicate compound flags used\n");
00954       return 1;
00955    }
00956    char * tp = line;
00957    char * piece;
00958    int i = 0;
00959    int np = 0;
00960    while ((piece=mystrsep(&tp,' '))) {
00961       if (*piece != '\0') {
00962           switch(i) {
00963             case 0: { np++; break; }
00964              case 1: { compound = mystrdup(piece); np++; break; }
00965             default: break;
00966           }
00967           i++;
00968       }
00969       free(piece);
00970    }
00971    if (np != 2) {
00972       fprintf(stderr,"error: missing compound flag information\n");
00973       return 1;
00974    }
00975    return 0;
00976 }
00977 
00978 
00979 /* parse in the min compound word length */
00980 int  AffixMgr::parse_cpdmin(char * line)
00981 {
00982    char * tp = line;
00983    char * piece;
00984    int i = 0;
00985    int np = 0;
00986    while ((piece=mystrsep(&tp,' '))) {
00987       if (*piece != '\0') {
00988           switch(i) {
00989             case 0: { np++; break; }
00990              case 1: { cpdmin = atoi(piece); np++; break; }
00991             default: break;
00992           }
00993           i++;
00994       }
00995       free(piece);
00996    }
00997    if (np != 2) {
00998       fprintf(stderr,"error: missing compound min information\n");
00999       return 1;
01000    } 
01001    if ((cpdmin < 1) || (cpdmin > 50)) cpdmin = 3;
01002    return 0;
01003 }
01004 
01005 
01006 /* parse in the typical fault correcting table */
01007 int  AffixMgr::parse_reptable(char * line, FILE * af)
01008 {
01009    if (numrep != 0) {
01010       fprintf(stderr,"error: duplicate REP tables used\n");
01011       return 1;
01012    }
01013    char * tp = line;
01014    char * piece;
01015    int i = 0;
01016    int np = 0;
01017    while ((piece=mystrsep(&tp,' '))) {
01018        if (*piece != '\0') {
01019           switch(i) {
01020             case 0: { np++; break; }
01021              case 1: { 
01022                        numrep = atoi(piece);
01023                       if (numrep < 1) {
01024                        fprintf(stderr,"incorrect number of entries in replacement table\n");
01025                        free(piece);
01026                           return 1;
01027                        }
01028                        reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
01029                        np++;
01030                        break;
01031                     }
01032             default: break;
01033           }
01034           i++;
01035        }
01036        free(piece);
01037    }
01038    if (np != 2) {
01039       fprintf(stderr,"error: missing replacement table information\n");
01040       return 1;
01041    } 
01042  
01043    /* now parse the numrep lines to read in the remainder of the table */
01044    char * nl = line;
01045    for (int j=0; j < numrep; j++) {
01046         fgets(nl,MAXLNLEN,af);
01047         mychomp(nl);
01048         tp = nl;
01049         i = 0;
01050         reptable[j].pattern = NULL;
01051         reptable[j].replacement = NULL;
01052         while ((piece=mystrsep(&tp,' '))) {
01053            if (*piece != '\0') {
01054                switch(i) {
01055                   case 0: {
01056                            if (strncmp(piece,"REP",3) != 0) {
01057                                fprintf(stderr,"error: replacement table is corrupt\n");
01058                                  free(piece);
01059                                  return 1;
01060                              }
01061                              break;
01062                         }
01063                   case 1: { reptable[j].pattern = mystrdup(piece); break; }
01064                   case 2: { reptable[j].replacement = mystrdup(piece); break; }
01065                 default: break;
01066                }
01067                i++;
01068            }
01069            free(piece);
01070         }
01071        if ((!(reptable[j].pattern)) || (!(reptable[j].replacement))) {
01072             fprintf(stderr,"error: replacement table is corrupt\n");
01073              return 1;
01074         }
01075    }
01076    return 0;
01077 }
01078 
01079 
01080 
01081 /* parse in the character map table */
01082 int  AffixMgr::parse_maptable(char * line, FILE * af)
01083 {
01084    if (nummap != 0) {
01085       fprintf(stderr,"error: duplicate MAP tables used\n");
01086       return 1;
01087    }
01088    char * tp = line;
01089    char * piece;
01090    int i = 0;
01091    int np = 0;
01092    while ((piece=mystrsep(&tp,' '))) {
01093        if (*piece != '\0') {
01094           switch(i) {
01095             case 0: { np++; break; }
01096              case 1: { 
01097                        nummap = atoi(piece);
01098                       if (nummap < 1) {
01099                        fprintf(stderr,"incorrect number of entries in map table\n");
01100                        free(piece);
01101                           return 1;
01102                        }
01103                        maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
01104                        np++;
01105                        break;
01106                     }
01107             default: break;
01108           }
01109           i++;
01110        }
01111        free(piece);
01112    }
01113    if (np != 2) {
01114       fprintf(stderr,"error: missing map table information\n");
01115       return 1;
01116    } 
01117  
01118    /* now parse the nummap lines to read in the remainder of the table */
01119    char * nl = line;
01120    for (int j=0; j < nummap; j++) {
01121         fgets(nl,MAXLNLEN,af);
01122         mychomp(nl);
01123         tp = nl;
01124         i = 0;
01125         maptable[j].set = NULL;
01126         maptable[j].len = 0;
01127         while ((piece=mystrsep(&tp,' '))) {
01128            if (*piece != '\0') {
01129                switch(i) {
01130                   case 0: {
01131                            if (strncmp(piece,"MAP",3) != 0) {
01132                                fprintf(stderr,"error: map table is corrupt\n");
01133                                  free(piece);
01134                                  return 1;
01135                              }
01136                              break;
01137                         }
01138                   case 1: { maptable[j].set = mystrdup(piece); 
01139                           maptable[j].len = strlen(maptable[j].set);
01140                             break; }
01141                 default: break;
01142                }
01143                i++;
01144            }
01145            free(piece);
01146         }
01147        if ((!(maptable[j].set)) || (!(maptable[j].len))) {
01148             fprintf(stderr,"error: map table is corrupt\n");
01149              return 1;
01150         }
01151    }
01152    return 0;
01153 }
01154 
01155 
01156 
01157 
01158 int  AffixMgr::parse_affix(char * line, const char at, FILE * af)
01159 {
01160    int numents = 0;      // number of affentry structures to parse
01161    char achar='\0';      // affix char identifier
01162    short ff=0;
01163    struct affentry * ptr= NULL;
01164    struct affentry * nptr= NULL;
01165 
01166    char * tp = line;
01167    char * nl = line;
01168    char * piece;
01169    int i = 0;
01170 
01171    // split affix header line into pieces
01172 
01173    int np = 0;
01174    while ((piece=mystrsep(&tp,' '))) {
01175       if (*piece != '\0') {
01176           switch(i) {
01177              // piece 1 - is type of affix
01178              case 0: { np++; break; }
01179           
01180              // piece 2 - is affix char
01181              case 1: { np++; achar = *piece; break; }
01182 
01183              // piece 3 - is cross product indicator 
01184              case 2: { np++; if (*piece == 'Y') ff = XPRODUCT; break; }
01185 
01186              // piece 4 - is number of affentries
01187              case 3: { 
01188                        np++;
01189                        numents = atoi(piece); 
01190                        ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
01191                        ptr->xpflg = ff;
01192                        ptr->achar = achar;
01193                        break;
01194                      }
01195 
01196             default: break;
01197           }
01198           i++;
01199       }
01200       free(piece);
01201    }
01202    // check to make sure we parsed enough pieces
01203    if (np != 4) {
01204        fprintf(stderr, "error: affix %c header has insufficient data in line %s\n",achar,nl);
01205        free(ptr);
01206        return 1;
01207    }
01208  
01209    // store away ptr to first affentry
01210    nptr = ptr;
01211 
01212    // now parse numents affentries for this affix
01213    for (int j=0; j < numents; j++) {
01214       fgets(nl,MAXLNLEN,af);
01215       mychomp(nl);
01216       tp = nl;
01217       i = 0;
01218       np = 0;
01219 
01220       // split line into pieces
01221       while ((piece=mystrsep(&tp,' '))) {
01222          if (*piece != '\0') {
01223              switch(i) {
01224 
01225                 // piece 1 - is type
01226                 case 0: { 
01227                           np++;
01228                           if (nptr != ptr) nptr->xpflg = ptr->xpflg;
01229                           break;
01230                         }
01231 
01232                 // piece 2 - is affix char
01233                 case 1: { 
01234                         np++;
01235                           if (*piece != achar) {
01236                               fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
01237                               fprintf(stderr, "error: possible incorrect count\n");
01238                               free(piece);
01239                               return 1;
01240                           }
01241                           if (nptr != ptr) nptr->achar = ptr->achar;
01242                           break;
01243                       }
01244 
01245                 // piece 3 - is string to strip or 0 for null 
01246                 case 2: { 
01247                           np++;
01248                           nptr->strip = mystrdup(piece);
01249                           nptr->stripl = strlen(nptr->strip);
01250                           if (strcmp(nptr->strip,"0") == 0) {
01251                               free(nptr->strip);
01252                               nptr->strip=mystrdup("");
01253                            nptr->stripl = 0;
01254                           }   
01255                           break; 
01256                         }
01257 
01258                 // piece 4 - is affix string or 0 for null
01259                 case 3: { 
01260                         np++;
01261                           nptr->appnd = mystrdup(piece);
01262                           nptr->appndl = strlen(nptr->appnd);
01263                           if (strcmp(nptr->appnd,"0") == 0) {
01264                               free(nptr->appnd);
01265                               nptr->appnd=mystrdup("");
01266                            nptr->appndl = 0;
01267                           }   
01268                           break; 
01269                         }
01270 
01271                 // piece 5 - is the conditions descriptions
01272                 case 4: { np++; encodeit(nptr,piece); }
01273 
01274               default: break;
01275              }
01276              i++;
01277          }
01278          free(piece);
01279       }
01280       // check to make sure we parsed enough pieces
01281       if (np != 5) {
01282           fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
01283           free(ptr);
01284           return 1;
01285       }
01286       nptr++;
01287    }
01288          
01289    // now create SfxEntry or PfxEntry objects and use links to
01290    // build an ordered (sorted by affix string) list
01291    nptr = ptr;
01292    for (int k = 0; k < numents; k++) {
01293       if (at == 'P') {
01294          PfxEntry * pfxptr = new PfxEntry(this,nptr);
01295           build_pfxtree((AffEntry *)pfxptr);
01296       } else {
01297          SfxEntry * sfxptr = new SfxEntry(this,nptr);
01298           build_sfxtree((AffEntry *)sfxptr); 
01299       }
01300       nptr++;
01301    }      
01302    free(ptr);
01303    return 0;
01304 }