Back to index

lightning-sunbird  0.9+nobinonly
regex.c
Go to the documentation of this file.
00001 /* ***** BEGIN LICENSE BLOCK *****
00002  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00003  *
00004  * The contents of this file are subject to the Mozilla Public License Version
00005  * 1.1 (the "License"); you may not use this file except in compliance with
00006  * the License. You may obtain a copy of the License at
00007  * http://www.mozilla.org/MPL/
00008  *
00009  * Software distributed under the License is distributed on an "AS IS" basis,
00010  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00011  * for the specific language governing rights and limitations under the
00012  * License.
00013  *
00014  * The Original Code is Mozilla Communicator client code, released
00015  * March 31, 1998.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998-1999
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either the GNU General Public License Version 2 or later (the "GPL"), or
00026  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 #include "ldap-int.h"
00038 #if defined( macintosh ) || defined( DOS ) || defined( _WINDOWS ) || defined( NEED_BSDREGEX ) || defined( XP_OS2)
00039 #include "regex.h"
00040 
00041 /*
00042  * regex - Regular expression pattern matching  and replacement
00043  *
00044  * By:  Ozan S. Yigit (oz)
00045  *      Dept. of Computer Science
00046  *      York University
00047  *
00048  * These routines are the PUBLIC DOMAIN equivalents of regex
00049  * routines as found in 4.nBSD UN*X, with minor extensions.
00050  *
00051  * These routines are derived from various implementations found
00052  * in software tools books, and Conroy's grep. They are NOT derived
00053  * from licensed/restricted software.
00054  * For more interesting/academic/complicated implementations,
00055  * see Henry Spencer's regexp routines, or GNU Emacs pattern
00056  * matching module.
00057  *
00058  * Use the actual CCL code in the CLO
00059  * section of pmatch. No need for a recursive
00060  * pmatch call.
00061  * 
00062  * Use a bitmap table to set char bits in an
00063  * 8-bit chunk.
00064  * 
00065  * Interfaces:
00066  *      re_comp:        compile a regular expression into a NFA.
00067  *
00068  *                   char *re_comp(s)
00069  *                   char *s;
00070  *
00071  *      re_exec:        execute the NFA to match a pattern.
00072  *
00073  *                   int re_exec(s)
00074  *                   char *s;
00075  *
00076  *     re_modw              change re_exec's understanding of what a "word"
00077  *                   looks like (for < and >) by adding into the
00078  *                   hidden word-syntax table.
00079  *
00080  *                   void re_modw(s)
00081  *                   char *s;
00082  *
00083  *      re_subs:     substitute the matched portions in a new string.
00084  *
00085  *                   int re_subs(src, dst)
00086  *                   char *src;
00087  *                   char *dst;
00088  *
00089  *     re_fail:      failure routine for re_exec.
00090  *
00091  *                   void re_fail(msg, op)
00092  *                   char *msg;
00093  *                   char op;
00094  *  
00095  * Regular Expressions:
00096  *
00097  *      [1]     char    matches itself, unless it is a special
00098  *                      character (metachar): . \ [ ] * + ^ $
00099  *
00100  *      [2]     .       matches any character.
00101  *
00102  *      [3]     \       matches the character following it, except
00103  *                   when followed by a left or right round bracket,
00104  *                   a digit 1 to 9 or a left or right angle bracket. 
00105  *                   (see [7], [8] and [9])
00106  *                   It is used as an escape character for all 
00107  *                   other meta-characters, and itself. When used
00108  *                   in a set ([4]), it is treated as an ordinary
00109  *                   character.
00110  *
00111  *      [4]     [set]   matches one of the characters in the set.
00112  *                      If the first character in the set is "^",
00113  *                      it matches a character NOT in the set, i.e. 
00114  *                   complements the set. A shorthand S-E is 
00115  *                   used to specify a set of characters S upto 
00116  *                   E, inclusive. The special characters "]" and 
00117  *                   "-" have no special meaning if they appear 
00118  *                   as the first chars in the set.
00119  *                      examples:        match:
00120  *
00121  *                              [a-z]    any lowercase alpha
00122  *
00123  *                              [^]-]    any char except ] and -
00124  *
00125  *                              [^A-Z]   any char except uppercase
00126  *                                       alpha
00127  *
00128  *                              [a-zA-Z] any alpha
00129  *
00130  *      [5]     *       any regular expression form [1] to [4], followed by
00131  *                      closure char (*) matches zero or more matches of
00132  *                      that form.
00133  *
00134  *      [6]     +       same as [5], except it matches one or more.
00135  *
00136  *      [7]             a regular expression in the form [1] to [10], enclosed
00137  *                      as \(form\) matches what form matches. The enclosure
00138  *                      creates a set of tags, used for [8] and for
00139  *                      pattern substution. The tagged forms are numbered
00140  *                   starting from 1.
00141  *
00142  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
00143  *                      previously tagged regular expression ([7]) matched.
00144  *
00145  *     [9]    <      a regular expression starting with a < construct
00146  *            >      and/or ending with a > construct, restricts the
00147  *                   pattern matching to the beginning of a word, and/or
00148  *                   the end of a word. A word is defined to be a character
00149  *                   string beginning and/or ending with the characters
00150  *                   A-Z a-z 0-9 and _. It must also be preceded and/or
00151  *                   followed by any character outside those mentioned.
00152  *
00153  *      [10]            a composite regular expression xy where x and y
00154  *                      are in the form [1] to [10] matches the longest
00155  *                      match of x followed by a match for y.
00156  *
00157  *      [11]  ^      a regular expression starting with a ^ character
00158  *            $      and/or ending with a $ character, restricts the
00159  *                      pattern matching to the beginning of the line,
00160  *                      or the end of line. [anchors] Elsewhere in the
00161  *                   pattern, ^ and $ are treated as ordinary characters.
00162  *
00163  *
00164  * Acknowledgements:
00165  *
00166  *     HCR's Hugh Redelmeier has been most helpful in various
00167  *     stages of development. He convinced me to include BOW
00168  *     and EOW constructs, originally invented by Rob Pike at
00169  *     the University of Toronto.
00170  *
00171  * References:
00172  *              Software tools                   Kernighan & Plauger
00173  *              Software tools in Pascal        Kernighan & Plauger
00174  *              Grep [rsx-11 C dist]            David Conroy
00175  *            ed - text editor            Un*x Programmer's Manual
00176  *            Advanced editing on Un*x    B. W. Kernighan
00177  *            RegExp routines                    Henry Spencer
00178  *
00179  * Notes:
00180  *
00181  *     This implementation uses a bit-set representation for character
00182  *     classes for speed and compactness. Each character is represented 
00183  *     by one bit in a 128-bit block. Thus, CCL always takes a 
00184  *     constant 16 bytes in the internal nfa, and re_exec does a single
00185  *     bit comparison to locate the character in the set.
00186  *
00187  * Examples:
00188  *
00189  *     pattern:      foo*.*
00190  *     compile:      CHR f CHR o CLO CHR o END CLO ANY END END
00191  *     matches:      fo foo fooo foobar fobar foxx ...
00192  *
00193  *     pattern:      fo[ob]a[rz]   
00194  *     compile:      CHR f CHR o CCL bitset CHR a CCL bitset END
00195  *     matches:      fobar fooar fobaz fooaz
00196  *
00197  *     pattern:      foo\\+
00198  *     compile:      CHR f CHR o CHR o CHR \ CLO CHR \ END END
00199  *     matches:      foo\ foo\\ foo\\\  ...
00200  *
00201  *     pattern:      \(foo\)[1-3]\1       (same as foo[1-3]foo)
00202  *     compile:      BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
00203  *     matches:      foo1foo foo2foo foo3foo
00204  *
00205  *     pattern:      \(fo.*\)-\1
00206  *     compile:      BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
00207  *     matches:      foo-foo fo-fo fob-fob foobar-foobar ...
00208  */
00209 
00210 #define MAXNFA  1024
00211 #define MAXTAG  10
00212 
00213 #define OKP     1
00214 #define NOP     0
00215 
00216 #define CHR     1
00217 #define ANY     2
00218 #define CCL     3
00219 #define BOL     4
00220 #define EOL     5
00221 #define BOT     6
00222 #define EOT     7
00223 #define BOW   8
00224 #define EOW   9
00225 #define REF     10
00226 #define CLO     11
00227 
00228 #define END     0
00229 
00230 /*
00231  * The following defines are not meant to be changeable.
00232  * They are for readability only.
00233  */
00234 #define MAXCHR       128
00235 #define CHRBIT       8
00236 #define BITBLK       MAXCHR/CHRBIT
00237 #define BLKIND       0170
00238 #define BITIND       07
00239 
00240 #define ASCIIB       0177
00241 
00242 /* Plain char, on the other hand, may be signed or unsigned; it depends on
00243  * the platform and perhaps a compiler option.  A hard fact of life, in C.
00244  *
00245  * 6-April-1999 mcs@netscape.com: replaced CHAR with REGEXCHAR to avoid
00246  *              conflicts with system types on Win32.   Changed typedef
00247  *              for REGEXCHAR to always be unsigned, which seems right.
00248  */
00249 typedef unsigned char REGEXCHAR;
00250 
00251 static int  tagstk[MAXTAG];             /* subpat tag stack..*/
00252 static REGEXCHAR nfa[MAXNFA];             /* automaton..       */
00253 static int  sta = NOP;                    /* status of lastpat */
00254 
00255 static REGEXCHAR bittab[BITBLK];   /* bit table for CCL */
00256                                    /* pre-set bits...   */
00257 static REGEXCHAR bitarr[] = {1,2,4,8,16,32,64,128};
00258 
00259 static void
00260 chset(REGEXCHAR c)
00261 {
00262        bittab[((c) & (unsigned)BLKIND) >> 3] |= bitarr[(c) & BITIND];
00263 }
00264 
00265 #define badpat(x)    (*nfa = END, x)
00266 #define store(x)     *mp++ = x
00267  
00268 char *     
00269 LDAP_CALL
00270 re_comp( char *pat )
00271 {
00272        register REGEXCHAR *p;          /* pattern pointer   */
00273        register REGEXCHAR *mp=nfa;     /* nfa pointer       */
00274        register REGEXCHAR *lp;         /* saved pointer..   */
00275        register REGEXCHAR *sp=nfa;     /* another one..     */
00276 
00277        register int tagi = 0;          /* tag stack index   */
00278        register int tagc = 1;          /* actual tag count  */
00279 
00280        register int n;
00281        register REGEXCHAR mask;    /* xor mask -CCL/NCL */
00282        int c1, c2;
00283               
00284        if (!pat || !*pat) {
00285               if (sta) {
00286                      return 0;
00287               } else {
00288                      return badpat("No previous regular expression");
00289               }
00290        }
00291        sta = NOP;
00292 
00293        for (p = (REGEXCHAR*)pat; *p; p++) {
00294               lp = mp;
00295               switch(*p) {
00296 
00297               case '.':               /* match any char..  */
00298                      store(ANY);
00299                      break;
00300 
00301               case '^':               /* match beginning.. */
00302                      if (p == (REGEXCHAR*)pat)
00303                             store(BOL);
00304                      else {
00305                             store(CHR);
00306                             store(*p);
00307                      }
00308                      break;
00309 
00310               case '$':               /* match endofline.. */
00311                      if (!*(p+1))
00312                             store(EOL);
00313                      else {
00314                             store(CHR);
00315                             store(*p);
00316                      }
00317                      break;
00318 
00319               case '[':               /* match char class..*/
00320                      store(CCL);
00321 
00322                      if (*++p == '^') {
00323                             mask = 0377;  
00324                             p++;
00325                      }
00326                      else
00327                             mask = 0;
00328 
00329                      if (*p == '-')              /* real dash */
00330                             chset(*p++);
00331                      if (*p == ']')              /* real brac */
00332                             chset(*p++);
00333                      while (*p && *p != ']') {
00334                             if (*p == '-' && *(p+1) && *(p+1) != ']') {
00335                                    p++;
00336                                    c1 = *(p-2) + 1;
00337                                    c2 = *p++;
00338                                    while (c1 <= c2)
00339                                           chset((REGEXCHAR)c1++);
00340                             }
00341 #ifdef EXTEND
00342                             else if (*p == '\\' && *(p+1)) {
00343                                    p++;
00344                                    chset(*p++);
00345                             }
00346 #endif
00347                             else
00348                                    chset(*p++);
00349                      }
00350                      if (!*p)
00351                             return badpat("Missing ]");
00352 
00353                      for (n = 0; n < BITBLK; bittab[n++] = (REGEXCHAR) 0)
00354                             store(mask ^ bittab[n]);
00355        
00356                      break;
00357 
00358               case '*':               /* match 0 or more.. */
00359               case '+':               /* match 1 or more.. */
00360                      if (p == (REGEXCHAR*)pat)
00361                             return badpat("Empty closure");
00362                      lp = sp;             /* previous opcode */
00363                      if (*lp == CLO)             /* equivalence..   */
00364                             break;
00365                      switch(*lp) {
00366 
00367                      case BOL:
00368                      case BOT:
00369                      case EOT:
00370                      case BOW:
00371                      case EOW:
00372                      case REF:
00373                             return badpat("Illegal closure");
00374                      default:
00375                             break;
00376                      }
00377 
00378                      if (*p == '+')
00379                             for (sp = mp; lp < sp; lp++)
00380                                    store(*lp);
00381 
00382                      store(END);
00383                      store(END);
00384                      sp = mp;
00385                      while (--mp > lp)
00386                             *mp = mp[-1];
00387                      store(CLO);
00388                      mp = sp;
00389                      break;
00390 
00391               case '\\':              /* tags, backrefs .. */
00392                      switch(*++p) {
00393 
00394                      case '(':
00395                             if (tagc < MAXTAG) {
00396                                    tagstk[++tagi] = tagc;
00397                                    store(BOT);
00398                                    store(tagc++);
00399                             }
00400                             else
00401                                    return badpat("Too many \\(\\) pairs");
00402                             break;
00403                      case ')':
00404                             if (*sp == BOT)
00405                                    return badpat("Null pattern inside \\(\\)");
00406                             if (tagi > 0) {
00407                                    store(EOT);
00408                                    store(tagstk[tagi--]);
00409                             }
00410                             else
00411                                    return badpat("Unmatched \\)");
00412                             break;
00413                      case '<':
00414                             store(BOW);
00415                             break;
00416                      case '>':
00417                             if (*sp == BOW)
00418                                    return badpat("Null pattern inside \\<\\>");
00419                             store(EOW);
00420                             break;
00421                      case '1':
00422                      case '2':
00423                      case '3':
00424                      case '4':
00425                      case '5':
00426                      case '6':
00427                      case '7':
00428                      case '8':
00429                      case '9':
00430                             n = *p-'0';
00431                             if (tagi > 0 && tagstk[tagi] == n)
00432                                    return badpat("Cyclical reference");
00433                             if (tagc > n) {
00434                                    store(REF);
00435                                    store(n);
00436                             }
00437                             else
00438                                    return badpat("Undetermined reference");
00439                             break;
00440 #ifdef EXTEND
00441                      case 'b':
00442                             store(CHR);
00443                             store('\b');
00444                             break;
00445                      case 'n':
00446                             store(CHR);
00447                             store('\n');
00448                             break;
00449                      case 'f':
00450                             store(CHR);
00451                             store('\f');
00452                             break;
00453                      case 'r':
00454                             store(CHR);
00455                             store('\r');
00456                             break;
00457                      case 't':
00458                             store(CHR);
00459                             store('\t');
00460                             break;
00461 #endif
00462                      default:
00463                             store(CHR);
00464                             store(*p);
00465                      }
00466                      break;
00467 
00468               default :               /* an ordinary char  */
00469                      store(CHR);
00470                      store(*p);
00471                      break;
00472               }
00473               sp = lp;
00474        }
00475        if (tagi > 0)
00476               return badpat("Unmatched \\(");
00477        store(END);
00478        sta = OKP;
00479        return 0;
00480 }
00481 
00482 
00483 static REGEXCHAR *bol;
00484 static REGEXCHAR *bopat[MAXTAG];
00485 static REGEXCHAR *eopat[MAXTAG];
00486 #ifdef NEEDPROTOS
00487 static REGEXCHAR *pmatch( REGEXCHAR *lp, REGEXCHAR *ap );
00488 #else /* NEEDPROTOS */
00489 static REGEXCHAR *pmatch();
00490 #endif /* NEEDPROTOS */
00491 
00492 /*
00493  * re_exec:
00494  *     execute nfa to find a match.
00495  *
00496  *     special cases: (nfa[0])     
00497  *            BOL
00498  *                   Match only once, starting from the
00499  *                   beginning.
00500  *            CHR
00501  *                   First locate the character without
00502  *                   calling pmatch, and if found, call
00503  *                   pmatch for the remaining string.
00504  *            END
00505  *                   re_comp failed, poor luser did not
00506  *                   check for it. Fail fast.
00507  *
00508  *     If a match is found, bopat[0] and eopat[0] are set
00509  *     to the beginning and the end of the matched fragment,
00510  *     respectively.
00511  *
00512  */
00513 
00514 int
00515 LDAP_CALL
00516 re_exec( char *lp )
00517 {
00518        register REGEXCHAR c;
00519        register REGEXCHAR *ep = 0;
00520        register REGEXCHAR *ap = nfa;
00521 
00522        bol = (REGEXCHAR*)lp;
00523 
00524        bopat[0] = 0;
00525        bopat[1] = 0;
00526        bopat[2] = 0;
00527        bopat[3] = 0;
00528        bopat[4] = 0;
00529        bopat[5] = 0;
00530        bopat[6] = 0;
00531        bopat[7] = 0;
00532        bopat[8] = 0;
00533        bopat[9] = 0;
00534 
00535        switch(*ap) {
00536 
00537        case BOL:                   /* anchored: match from BOL only */
00538               ep = pmatch((REGEXCHAR*)lp,ap);
00539               break;
00540        case CHR:                   /* ordinary char: locate it fast */
00541               c = *(ap+1);
00542               while (*lp && *(REGEXCHAR*)lp != c)
00543                      lp++;
00544               if (!*lp)            /* if EOS, fail, else fall thru. */
00545                      return 0;
00546        default:                    /* regular matching all the way. */
00547               do {
00548                      if ((ep = pmatch((REGEXCHAR*)lp,ap)))
00549                             break;
00550                      lp++;
00551               } while (*lp);
00552 
00553               break;
00554        case END:                   /* munged automaton. fail always */
00555               return 0;
00556        }
00557        if (!ep)
00558               return 0;
00559 
00560        bopat[0] = (REGEXCHAR*)lp;
00561        eopat[0] = ep;
00562        return 1;
00563 }
00564 
00565 /* 
00566  * pmatch: internal routine for the hard part
00567  *
00568  *     This code is partly snarfed from an early grep written by
00569  *     David Conroy. The backref and tag stuff, and various other
00570  *     innovations are by oz.
00571  *
00572  *     special case optimizations: (nfa[n], nfa[n+1])
00573  *            CLO ANY
00574  *                   We KNOW .* will match everything upto the
00575  *                   end of line. Thus, directly go to the end of
00576  *                   line, without recursive pmatch calls. As in
00577  *                   the other closure cases, the remaining pattern
00578  *                   must be matched by moving backwards on the
00579  *                   string recursively, to find a match for xy
00580  *                   (x is ".*" and y is the remaining pattern)
00581  *                   where the match satisfies the LONGEST match for
00582  *                   x followed by a match for y.
00583  *            CLO CHR
00584  *                   We can again scan the string forward for the
00585  *                   single char and at the point of failure, we
00586  *                   execute the remaining nfa recursively, same as
00587  *                   above.
00588  *
00589  *     At the end of a successful match, bopat[n] and eopat[n]
00590  *     are set to the beginning and end of subpatterns matched
00591  *     by tagged expressions (n = 1 to 9).       
00592  *
00593  */
00594 
00595 #ifndef re_fail
00596 extern void re_fail();
00597 #endif /* re_fail */
00598 
00599 /*
00600  * character classification table for word boundary operators BOW
00601  * and EOW. the reason for not using ctype macros is that we can
00602  * let the user add into our own table. see re_modw. This table
00603  * is not in the bitset form, since we may wish to extend it in the
00604  * future for other character classifications. 
00605  *
00606  *     TRUE for 0-9 A-Z a-z _
00607  */
00608 static char chrtyp[MAXCHR] = {
00609        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
00610        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
00611        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
00612        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
00613        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 
00614        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 
00615        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 
00616        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
00617        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
00618        1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 
00619        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
00620        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
00621        1, 1, 1, 0, 0, 0, 0, 0
00622        };
00623 
00624 #define HIBIT        0200
00625 #define inascii(x)   (0177&(x))
00626 #define iswordc(x)   chrtyp[inascii(x)]
00627 #define isinset(x,y)        (((y)&HIBIT)?0:((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]))
00628 
00629 /*
00630  * skip values for CLO XXX to skip past the closure
00631  */
00632 
00633 #define ANYSKIP      2      /* [CLO] ANY END ...      */
00634 #define CHRSKIP      3      /* [CLO] CHR chr END ...     */
00635 #define CCLSKIP 18   /* [CLO] CCL 16bytes END ... */
00636 
00637 static REGEXCHAR *
00638 pmatch( REGEXCHAR *lp, REGEXCHAR *ap)
00639 {
00640        register int op, c, n;
00641        register REGEXCHAR *e;             /* extra pointer for CLO */
00642        register REGEXCHAR *bp;            /* beginning of subpat.. */
00643        register REGEXCHAR *ep;            /* ending of subpat..        */
00644        REGEXCHAR *are;                    /* to save the line ptr. */
00645 
00646        while ((op = *ap++) != END)
00647               switch(op) {
00648 
00649               case CHR:
00650                      if (*lp++ != *ap++)
00651                             return 0;
00652                      break;
00653               case ANY:
00654                      if (!*lp++)
00655                             return 0;
00656                      break;
00657               case CCL:
00658                      c = *lp++;
00659                      if (!isinset(ap,c))
00660                             return 0;
00661                      ap += BITBLK;
00662                      break;
00663               case BOL:
00664                      if (lp != bol)
00665                             return 0;
00666                      break;
00667               case EOL:
00668                      if (*lp)
00669                             return 0;
00670                      break;
00671               case BOT:
00672                      bopat[*ap++] = lp;
00673                      break;
00674               case EOT:
00675                      eopat[*ap++] = lp;
00676                      break;
00677               case BOW:
00678                      if ((lp!=bol && iswordc(lp[-1])) || !iswordc(*lp))
00679                             return 0;
00680                      break;
00681               case EOW:
00682                      if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
00683                             return 0;
00684                      break;
00685               case REF:
00686                      n = *ap++;
00687                      bp = bopat[n];
00688                      ep = eopat[n];
00689                      while (bp < ep)
00690                             if (*bp++ != *lp++)
00691                                    return 0;
00692                      break;
00693               case CLO:
00694                      are = lp;
00695                      switch(*ap) {
00696 
00697                      case ANY:
00698                             while (*lp)
00699                                    lp++;
00700                             n = ANYSKIP;
00701                             break;
00702                      case CHR:
00703                             c = *(ap+1);
00704                             while (*lp && c == *lp)
00705                                    lp++;
00706                             n = CHRSKIP;
00707                             break;
00708                      case CCL:
00709                             while ((c = *lp) && isinset(ap+1,c))
00710                                    lp++;
00711                             n = CCLSKIP;
00712                             break;
00713                      default:
00714                             re_fail("closure: bad nfa.", *ap);
00715                             return 0;
00716                      }
00717 
00718                      ap += n;
00719 
00720                      while (lp >= are) {
00721                             if ((e = pmatch(lp, ap)))
00722                                    return e;
00723                             --lp;
00724                      }
00725                      return 0;
00726               default:
00727                      re_fail("re_exec: bad nfa.", op);
00728                      return 0;
00729               }
00730        return lp;
00731 }
00732 
00733 /*
00734  * re_modw:
00735  *     add new characters into the word table to change re_exec's
00736  *     understanding of what a word should look like. Note that we
00737  *     only accept additions into the word definition.
00738  *
00739  *     If the string parameter is 0 or null string, the table is
00740  *     reset back to the default containing A-Z a-z 0-9 _. [We use
00741  *     the compact bitset representation for the default table]
00742  */
00743 
00744 static REGEXCHAR deftab[16] = {    
00745        0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,  
00746        0376, 0377, 0377, 007 
00747 }; 
00748 
00749 void
00750 LDAP_CALL
00751 re_modw( char *s )
00752 {
00753        register int i;
00754 
00755        if (!s || !*s) {
00756               for (i = 0; i < MAXCHR; i++)
00757                      if (!isinset(deftab,i))
00758                             iswordc(i) = 0;
00759        }
00760        else
00761               while(*s)
00762                      iswordc(*s++) = 1;
00763 }
00764 
00765 /*
00766  * re_subs:
00767  *     substitute the matched portions of the src in dst.
00768  *
00769  *     &      substitute the entire matched pattern.
00770  *
00771  *     \digit substitute a subpattern, with the given   tag number.
00772  *            Tags are numbered from 1 to 9. If the particular
00773  *            tagged subpattern does not exist, null is substituted.
00774  */
00775 int
00776 LDAP_CALL
00777 re_subs( char *src, char *dst)
00778 {
00779        register char      c;
00780        register int       pin;
00781        register REGEXCHAR *bp;
00782        register REGEXCHAR *ep;
00783 
00784        if (!*src || !bopat[0])
00785               return 0;
00786 
00787        while ((c = *src++)) {
00788               switch(c) {
00789 
00790               case '&':
00791                      pin = 0;
00792                      break;
00793 
00794               case '\\':
00795                      c = *src++;
00796                      if (c >= '0' && c <= '9') {
00797                             pin = c - '0';
00798                             break;
00799                      }
00800                      
00801               default:
00802                      *dst++ = c;
00803                      continue;
00804               }
00805 
00806               if ((bp = bopat[pin]) && (ep = eopat[pin])) {
00807                      while (*bp && bp < ep)
00808                             *dst++ = *(char*)bp++;
00809                      if (bp < ep)
00810                             return 0;
00811               }
00812        }
00813        *dst = (char) 0;
00814        return 1;
00815 }
00816                      
00817 #ifdef DEBUG
00818 
00819 /* No printf or exit in 16-bit Windows */
00820 #if defined( _WINDOWS ) && !defined( _WIN32 )
00821 static int LDAP_C printf( const char* pszFormat, ...)
00822 {
00823     char buf[1024];
00824        va_list arglist;
00825        va_start(arglist, pszFormat);
00826     vsprintf(buf, pszFormat, arglist);
00827        va_end(arglist);
00828     OutputDebugString(buf);
00829        return 0;
00830 }
00831 #define exit(v) return
00832 #endif /* 16-bit Windows */
00833 
00834 
00835 #ifdef REGEX_DEBUG
00836 
00837 static void nfadump( REGEXCHAR *ap);
00838 
00839 /*
00840  * symbolic - produce a symbolic dump of the nfa
00841  */
00842 void
00843 symbolic( char *s ) 
00844 {
00845        printf("pattern: %s\n", s);
00846        printf("nfacode:\n");
00847        nfadump(nfa);
00848 }
00849 
00850 static void
00851 nfadump( REGEXCHAR *ap)
00852 {
00853        register int n;
00854 
00855        while (*ap != END)
00856               switch(*ap++) {
00857               case CLO:
00858                      printf("CLOSURE");
00859                      nfadump(ap);
00860                      switch(*ap) {
00861                      case CHR:
00862                             n = CHRSKIP;
00863                             break;
00864                      case ANY:
00865                             n = ANYSKIP;
00866                             break;
00867                      case CCL:
00868                             n = CCLSKIP;
00869                             break;
00870                      }
00871                      ap += n;
00872                      break;
00873               case CHR:
00874                      printf("\tCHR %c\n",*ap++);
00875                      break;
00876               case ANY:
00877                      printf("\tANY .\n");
00878                      break;
00879               case BOL:
00880                      printf("\tBOL -\n");
00881                      break;
00882               case EOL:
00883                      printf("\tEOL -\n");
00884                      break;
00885               case BOT:
00886                      printf("BOT: %d\n",*ap++);
00887                      break;
00888               case EOT:
00889                      printf("EOT: %d\n",*ap++);
00890                      break;
00891               case BOW:
00892                      printf("BOW\n");
00893                      break;
00894               case EOW:
00895                      printf("EOW\n");
00896                      break;
00897               case REF:
00898                      printf("REF: %d\n",*ap++);
00899                      break;
00900               case CCL:
00901                      printf("\tCCL [");
00902                      for (n = 0; n < MAXCHR; n++)
00903                             if (isinset(ap,(REGEXCHAR)n)) {
00904                                    if (n < ' ')
00905                                           printf("^%c", n ^ 0x040);
00906                                    else
00907                                           printf("%c", n);
00908                             }
00909                      printf("]\n");
00910                      ap += BITBLK;
00911                      break;
00912               default:
00913                      printf("bad nfa. opcode %o\n", ap[-1]);
00914                      exit(1);
00915                      break;
00916               }
00917 }
00918 #endif /* REGEX_DEBUG */
00919 #endif /* DEBUG */
00920 #endif /* macintosh or DOS or _WINDOWS or NEED_BSDREGEX */