Back to index

lightning-sunbird  0.9+nobinonly
affentry.cpp
Go to the documentation of this file.
00001 #include "license.readme"
00002 
00003 #include <ctype.h>
00004 #include <string.h>
00005 #include <stdlib.h> 
00006 #include <stdio.h> 
00007 
00008 #include "affentry.hxx"
00009 
00010 // using namespace std;
00011 
00012 extern char * mystrdup(const char * s);
00013 extern char *  myrevstrdup(const char * s);
00014 
00015 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
00016 {
00017   // register affix manager
00018   pmyMgr = pmgr;
00019 
00020   // set up its intial values
00021   achar = dp->achar;         // char flag 
00022   strip = dp->strip;         // string to strip
00023   appnd = dp->appnd;         // string to append
00024   stripl = dp->stripl;       // length of strip string
00025   appndl = dp->appndl;       // length of append string
00026   numconds = dp->numconds;   // number of conditions to match
00027   xpflg = dp->xpflg;         // cross product flag
00028   // then copy over all of the conditions
00029   memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
00030   next = NULL;
00031   nextne = NULL;
00032   nexteq = NULL;
00033 }
00034 
00035 
00036 PfxEntry::~PfxEntry()
00037 {
00038     achar = '\0';
00039     if (appnd) free(appnd);
00040     if (strip)free(strip);
00041     pmyMgr = NULL;
00042     appnd = NULL;
00043     strip = NULL;    
00044 }
00045 
00046 
00047 
00048 // add prefix to this word assuming conditions hold
00049 char * PfxEntry::add(const char * word, int len)
00050 {
00051     int                     cond;
00052     char              tword[MAXWORDLEN+1];
00053 
00054      /* make sure all conditions match */
00055      if ((len > stripl) && (len >= numconds)) {
00056             unsigned char * cp = (unsigned char *) word;
00057             for (cond = 0;  cond < numconds;  cond++) {
00058               if ((conds[*cp++] & (1 << cond)) == 0)
00059                  break;
00060             }
00061             if (cond >= numconds) {
00062              /* we have a match so add prefix */
00063               int tlen = 0;
00064               if (appndl) {
00065                  strcpy(tword,appnd);
00066                   tlen += appndl;
00067                } 
00068                char * pp = tword + tlen;
00069                strcpy(pp, (word + stripl));
00070                return mystrdup(tword);
00071            }
00072      }
00073      return NULL;    
00074 }
00075 
00076 
00077 
00078 
00079 // check if this prefix entry matches 
00080 struct hentry * PfxEntry::check(const char * word, int len)
00081 {
00082     int                     cond;  // condition number being examined
00083     int                       tmpl;   // length of tmpword
00084     struct hentry *     he;     // hash entry of root word or NULL
00085     unsigned char *  cp;           
00086     char              tmpword[MAXWORDLEN+1];
00087 
00088 
00089     // on entry prefix is 0 length or already matches the beginning of the word.
00090     // So if the remaining root word has positive length
00091     // and if there are enough chars in root word and added back strip chars
00092     // to meet the number of characters conditions, then test it
00093 
00094      tmpl = len - appndl;
00095 
00096      if ((tmpl > 0) &&  (tmpl + stripl >= numconds)) {
00097 
00098            // generate new root word by removing prefix and adding
00099            // back any characters that would have been stripped
00100 
00101            if (stripl) strcpy (tmpword, strip);
00102            strcpy ((tmpword + stripl), (word + appndl));
00103 
00104             // now make sure all of the conditions on characters
00105             // are met.  Please see the appendix at the end of
00106             // this file for more info on exactly what is being
00107             // tested
00108 
00109            cp = (unsigned char *)tmpword;
00110            for (cond = 0;  cond < numconds;  cond++) {
00111               if ((conds[*cp++] & (1 << cond)) == 0) break;
00112            }
00113 
00114             // if all conditions are met then check if resulting
00115             // root word in the dictionary
00116 
00117            if (cond >= numconds) {
00118               tmpl += stripl;
00119               if ((he = pmyMgr->lookup(tmpword)) != NULL) {
00120                  if (TESTAFF(he->astr, achar, he->alen)) return he;
00121               }
00122 
00123               // prefix matched but no root word was found 
00124                 // if XPRODUCT is allowed, try again but now 
00125                 // ross checked combined with a suffix
00126 
00127               if (xpflg & XPRODUCT) {
00128                  he = pmyMgr->suffix_check(tmpword, tmpl, XPRODUCT, (AffEntry *)this);
00129                    if (he) return he;
00130               }
00131            }
00132      }
00133     return NULL;
00134 }
00135 
00136 
00137 
00138 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
00139 {
00140   // register affix manager
00141   pmyMgr = pmgr;
00142 
00143   // set up its intial values
00144   achar = dp->achar;         // char flag 
00145   strip = dp->strip;         // string to strip
00146   appnd = dp->appnd;         // string to append
00147   stripl = dp->stripl;       // length of strip string
00148   appndl = dp->appndl;       // length of append string
00149   numconds = dp->numconds;   // number of conditions to match
00150   xpflg = dp->xpflg;         // cross product flag
00151 
00152   // then copy over all of the conditions
00153   memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
00154 
00155   rappnd = myrevstrdup(appnd);
00156 }
00157 
00158 
00159 SfxEntry::~SfxEntry()
00160 {
00161     achar = '\0';
00162     if (appnd) free(appnd);
00163     if (rappnd) free(rappnd);
00164     if (strip) free(strip);
00165     pmyMgr = NULL;
00166     appnd = NULL;
00167     strip = NULL;    
00168 }
00169 
00170 
00171 
00172 // add suffix to this word assuming conditions hold
00173 char * SfxEntry::add(const char * word, int len)
00174 {
00175     int                     cond;
00176     char              tword[MAXWORDLEN+1];
00177 
00178      /* make sure all conditions match */
00179      if ((len > stripl) && (len >= numconds)) {
00180             unsigned char * cp = (unsigned char *) (word + len);
00181             for (cond = numconds; --cond >=0; ) {
00182               if ((conds[*--cp] & (1 << cond)) == 0)
00183                  break;
00184             }
00185             if (cond < 0) {
00186              /* we have a match so add suffix */
00187               strcpy(tword,word);
00188               int tlen = len;
00189               if (stripl) {
00190                tlen -= stripl;
00191               }
00192               char * pp = (tword + tlen);
00193               if (appndl) {
00194                  strcpy(pp,appnd);
00195                   tlen += appndl;
00196              } else *pp = '\0';
00197                return mystrdup(tword);
00198            }
00199      }
00200      return NULL;
00201 }
00202 
00203 
00204 
00205 // see if this suffix is present in the word 
00206 struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEntry* ppfx)
00207 {
00208     int                       tmpl;               // length of tmpword 
00209     int                     cond;          // condition beng examined
00210     struct hentry *     he;              // hash entry pointer
00211     unsigned char *  cp;
00212     char              tmpword[MAXWORDLEN+1];
00213     PfxEntry* ep = (PfxEntry *) ppfx;
00214 
00215 
00216     // if this suffix is being cross checked with a prefix
00217     // but it does not support cross products skip it
00218 
00219     if ((optflags & XPRODUCT) != 0 &&  (xpflg & XPRODUCT) == 0)
00220         return NULL;
00221 
00222     // upon entry suffix is 0 length or already matches the end of the word.
00223     // So if the remaining root word has positive length
00224     // and if there are enough chars in root word and added back strip chars
00225     // to meet the number of characters conditions, then test it
00226 
00227     tmpl = len - appndl;
00228 
00229     if ((tmpl > 0)  &&  (tmpl + stripl >= numconds)) {
00230 
00231            // generate new root word by removing suffix and adding
00232            // back any characters that would have been stripped or
00233            // or null terminating the shorter string
00234 
00235            strcpy (tmpword, word);
00236            cp = (unsigned char *)(tmpword + tmpl);
00237            if (stripl) {
00238               strcpy ((char *)cp, strip);
00239               tmpl += stripl;
00240               cp = (unsigned char *)(tmpword + tmpl);
00241            } else *cp = '\0';
00242 
00243             // now make sure all of the conditions on characters
00244             // are met.  Please see the appendix at the end of
00245             // this file for more info on exactly what is being
00246             // tested
00247 
00248            for (cond = numconds;  --cond >= 0; ) {
00249               if ((conds[*--cp] & (1 << cond)) == 0) break;
00250            }
00251 
00252             // if all conditions are met then check if resulting
00253             // root word in the dictionary
00254 
00255            if (cond < 0) {
00256                if ((he = pmyMgr->lookup(tmpword)) != NULL) {
00257                      if (TESTAFF(he->astr, achar , he->alen) && 
00258                            ((optflags & XPRODUCT) == 0 || 
00259                            TESTAFF(he->astr, ep->getFlag(), he->alen))) return he;
00260                }  
00261            }
00262     }
00263     return NULL;
00264 }
00265 
00266 
00267 
00268 
00269 #if 0
00270 
00271 Appendix:  Understanding Affix Code
00272 
00273 
00274 An affix is either a  prefix or a suffix attached to root words to make 
00275 other words.
00276 
00277 Basically a Prefix or a Suffix is a set of AffEntry objects
00278 which store information about the prefix or suffix along 
00279 with supporting routines to check if a word has a particular 
00280 prefix or suffix or a combination.
00281 
00282 The structure affentry is defined as follows:
00283 
00284 struct affentry
00285 {
00286    unsigned char achar;   // char used to represent the affix
00287    char * strip;          // string to strip before adding affix
00288    char * appnd;          // the affix string to add
00289    short  stripl;         // length of the strip string
00290    short  appndl;         // length of the affix string
00291    short  numconds;       // the number of conditions that must be met
00292    short  xpflg;          // flag: XPRODUCT- combine both prefix and suffix 
00293    char   conds[SETSIZE]; // array which encodes the conditions to be met
00294 };
00295 
00296 
00297 Here is a suffix borrowed from the en_US.aff file.  This file 
00298 is whitespace delimited.
00299 
00300 SFX D Y 4 
00301 SFX D   0     d          e
00302 SFX D   y     ied        [^aeiou]y
00303 SFX D   0     ed         [^ey]
00304 SFX D   0     ed         [aeiou]y
00305 
00306 This information can be interpreted as follows:
00307 
00308 The first line has 4 fields:
00309 
00310 Field
00311 -----
00312 1     SFX - indicates this is a suffix
00313 2     D   - is the name of the character flag which represents this suffix
00314 3     Y   - indicates it can be combined with prefixes (cross product)
00315 4     4   - indicates that sequence of 4 affentry structures are needed to
00316                properly store the affix information
00317 
00318 The remaining lines describe the unique information for the 4 SfxEntry 
00319 objects that make up this affix.  Each line can be interpreted
00320 as follows: (note fields 1 and 2 are as a check against line 1 info)
00321 
00322 Field
00323 -----
00324 1     SFX         - indicates this is a suffix
00325 2     D           - is the name of the character flag for this affix
00326 3     y           - the string of chars to strip off before adding affix
00327                          (a 0 here indicates the NULL string)
00328 4     ied         - the string of affix characters to add
00329 5     [^aeiou]y   - the conditions which must be met before the affix
00330                     can be applied
00331 
00332 Field 5 is interesting.  Since this is a suffix, field 5 tells us that
00333 there are 2 conditions that must be met.  The first condition is that 
00334 the next to the last character in the word must *NOT* be any of the 
00335 following "a", "e", "i", "o" or "u".  The second condition is that
00336 the last character of the word must end in "y".
00337 
00338 So how can we encode this information concisely and be able to 
00339 test for both conditions in a fast manner?  The answer is found
00340 by studying the wonderful ispell code of Geoff Kuenning, et al.
00341 (now available under a normal BSD license).
00342 
00343 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
00344 using a character (cast to an unsigned char) of a string, we have 8 bits
00345 of information we can store about that character.  Specifically we
00346 could use each bit to say if that character is allowed in any of the 
00347 last (or first for prefixes) 8 characters of the word.
00348 
00349 Basically, each character at one end of the word (up to the number 
00350 of conditions) is used to index into the conds array and the resulting 
00351 value found there says whether that character is valid for a 
00352 specific character position in the word.  
00353 
00354 For prefixes, it does this by setting bit 0 if that char is valid 
00355 in the first position, bit 1 if valid in the second position, and so on. 
00356 
00357 If a bit is not set, then that char is not valid for that position in the
00358 word.
00359 
00360 If working with suffixes bit 0 is used for the character closest 
00361 to the front, bit 1 for the next character towards the end, ..., 
00362 with bit numconds-1 representing the last char at the end of the string. 
00363 
00364 Note: since entries in the conds[] array are 8 bits, only 8 conditions 
00365 (read that as only 8 character positions) can be examined at one
00366 end of a word (the beginning for prefixes and the end for suffixes).
00367 
00368 So to make this clearer, lets encode the conds array values for the 
00369 first two affentries for the suffix D described earlier.
00370 
00371 
00372   For the first affentry:    
00373      numconds = 1             (only examine the last character)
00374 
00375      conds['e'] =  (1 << 0)   (the word must end in an E)
00376      all others are all 0
00377 
00378   For the second affentry:
00379      numconds = 2             (only examine the last two characters)     
00380 
00381      conds[X] = conds[X] | (1 << 0)     (aeiou are not allowed)
00382          where X is all characters *but* a, e, i, o, or u
00383          
00384 
00385      conds['y'] = (1 << 1)     (the last char must be a y)
00386      all other bits for all other entries in the conds array are zero
00387 
00388 
00389 #endif
00390