Back to index

openldap  2.4.31
ure.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 1998-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 /* Copyright 1997, 1998, 1999 Computing Research Labs,
00016  * New Mexico State University
00017  *
00018  * Permission is hereby granted, free of charge, to any person obtaining a
00019  * copy of this software and associated documentation files (the "Software"),
00020  * to deal in the Software without restriction, including without limitation
00021  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00022  * and/or sell copies of the Software, and to permit persons to whom the
00023  * Software is furnished to do so, subject to the following conditions:
00024  *
00025  * The above copyright notice and this permission notice shall be included in
00026  * all copies or substantial portions of the Software.
00027  *
00028  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00029  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00030  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
00031  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
00032  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
00033  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
00034  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00035  */
00036 /* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */
00037 
00038 #include "portable.h"
00039 
00040 #include <ac/stdlib.h>
00041 #include <ac/string.h>
00042 #include <ac/unistd.h>
00043 
00044 #include "ure.h"
00045 
00046 /*
00047  * Flags used internally in the DFA.
00048  */
00049 #define _URE_DFA_CASEFOLD  0x01
00050 #define _URE_DFA_BLANKLINE 0x02
00051 
00052 static unsigned long cclass_flags[] = {
00053     0,
00054     _URE_NONSPACING,
00055     _URE_COMBINING,
00056     _URE_NUMDIGIT,
00057     _URE_NUMOTHER,
00058     _URE_SPACESEP,
00059     _URE_LINESEP,
00060     _URE_PARASEP,
00061     _URE_CNTRL,
00062     _URE_PUA,
00063     _URE_UPPER,
00064     _URE_LOWER,
00065     _URE_TITLE,
00066     _URE_MODIFIER,
00067     _URE_OTHERLETTER,
00068     _URE_DASHPUNCT,
00069     _URE_OPENPUNCT,
00070     _URE_CLOSEPUNCT,
00071     _URE_OTHERPUNCT,
00072     _URE_MATHSYM,
00073     _URE_CURRENCYSYM,
00074     _URE_OTHERSYM,
00075     _URE_LTR,
00076     _URE_RTL,
00077     _URE_EURONUM,
00078     _URE_EURONUMSEP,
00079     _URE_EURONUMTERM,
00080     _URE_ARABNUM,
00081     _URE_COMMONSEP,
00082     _URE_BLOCKSEP,
00083     _URE_SEGMENTSEP,
00084     _URE_WHITESPACE,
00085     _URE_OTHERNEUT,
00086 };
00087 
00088 /*
00089  * Symbol types for the DFA.
00090  */
00091 #define _URE_ANY_CHAR   1
00092 #define _URE_CHAR       2
00093 #define _URE_CCLASS     3
00094 #define _URE_NCCLASS    4
00095 #define _URE_BOL_ANCHOR 5
00096 #define _URE_EOL_ANCHOR 6
00097 
00098 /*
00099  * Op codes for converting the NFA to a DFA.
00100  */
00101 #define _URE_SYMBOL     10
00102 #define _URE_PAREN      11
00103 #define _URE_QUEST      12
00104 #define _URE_STAR       13
00105 #define _URE_PLUS       14
00106 #define _URE_ONE        15
00107 #define _URE_AND        16
00108 #define _URE_OR         17
00109 
00110 #define _URE_NOOP       0xffff
00111 
00112 #define _URE_REGSTART 0x8000
00113 #define _URE_REGEND   0x4000
00114 
00115 /*
00116  * Structure used to handle a compacted range of characters.
00117  */
00118 typedef struct {
00119     ucs4_t min_code;
00120     ucs4_t max_code;
00121 } _ure_range_t;
00122 
00123 typedef struct {
00124     _ure_range_t *ranges;
00125     ucs2_t ranges_used;
00126     ucs2_t ranges_size;
00127 } _ure_ccl_t;
00128 
00129 typedef union {
00130     ucs4_t chr;
00131     _ure_ccl_t ccl;
00132 } _ure_sym_t;
00133 
00134 /*
00135  * This is a general element structure used for expressions and stack
00136  * elements.
00137  */
00138 typedef struct {
00139     ucs2_t reg;
00140     ucs2_t onstack;
00141     ucs2_t type;
00142     ucs2_t lhs;
00143     ucs2_t rhs;
00144 } _ure_elt_t;
00145 
00146 /*
00147  * This is a structure used to track a list or a stack of states.
00148  */
00149 typedef struct {
00150     ucs2_t *slist;
00151     ucs2_t slist_size;
00152     ucs2_t slist_used;
00153 } _ure_stlist_t;
00154 
00155 /*
00156  * Structure to track the list of unique states for a symbol
00157  * during reduction.
00158  */
00159 typedef struct {
00160     ucs2_t id;
00161     ucs2_t type;
00162     unsigned long mods;
00163     unsigned long props;
00164     _ure_sym_t sym;
00165     _ure_stlist_t states;
00166 } _ure_symtab_t;
00167 
00168 /*
00169  * Structure to hold a single state.
00170  */
00171 typedef struct {
00172     ucs2_t id;
00173     ucs2_t accepting;
00174     ucs2_t pad;
00175     _ure_stlist_t st;
00176     _ure_elt_t *trans;
00177     ucs2_t trans_size;
00178     ucs2_t trans_used;
00179 } _ure_state_t;
00180 
00181 /*
00182  * Structure used for keeping lists of states.
00183  */
00184 typedef struct {
00185     _ure_state_t *states;
00186     ucs2_t states_size;
00187     ucs2_t states_used;
00188 } _ure_statetable_t;
00189 
00190 /*
00191  * Structure to track pairs of DFA states when equivalent states are
00192  * merged.
00193  */
00194 typedef struct {
00195     ucs2_t l;
00196     ucs2_t r;
00197 } _ure_equiv_t;
00198 
00199 /*
00200  * Structure used for constructing the NFA and reducing to a minimal DFA.
00201  */
00202 typedef struct _ure_buffer_t {
00203     int reducing;
00204     int error;
00205     unsigned long flags;
00206 
00207     _ure_stlist_t stack;
00208 
00209     /*
00210      * Table of unique symbols encountered.
00211      */
00212     _ure_symtab_t *symtab;
00213     ucs2_t symtab_size;
00214     ucs2_t symtab_used;
00215 
00216     /*
00217      * Tracks the unique expressions generated for the NFA and when the NFA is
00218      * reduced.
00219      */
00220     _ure_elt_t *expr;
00221     ucs2_t expr_used;
00222     ucs2_t expr_size;
00223 
00224     /*
00225      * The reduced table of unique groups of NFA states.
00226      */
00227     _ure_statetable_t states;
00228 
00229     /*
00230      * Tracks states when equivalent states are merged.
00231      */
00232     _ure_equiv_t *equiv;
00233     ucs2_t equiv_used;
00234     ucs2_t equiv_size;
00235 } _ure_buffer_t;
00236 
00237 typedef struct {
00238     ucs2_t symbol;
00239     ucs2_t next_state;
00240 } _ure_trans_t;
00241 
00242 typedef struct {
00243     ucs2_t accepting;
00244     ucs2_t ntrans;
00245     _ure_trans_t *trans;
00246 } _ure_dstate_t;
00247 
00248 typedef struct _ure_dfa_t {
00249     unsigned long flags;
00250 
00251     _ure_symtab_t *syms;
00252     ucs2_t nsyms;
00253 
00254     _ure_dstate_t *states;
00255     ucs2_t nstates;
00256 
00257     _ure_trans_t *trans;
00258     ucs2_t ntrans;
00259 } _ure_dfa_t;
00260 
00261 /*************************************************************************
00262  *
00263  * Functions.
00264  *
00265  *************************************************************************/
00266 
00267 static void
00268 _ure_memmove(char *dest, char *src, unsigned long bytes)
00269 {
00270     long i, j;
00271 
00272     i = (long) bytes;
00273     j = i & 7;
00274     i = (i + 7) >> 3;
00275 
00276     /*
00277      * Do a memmove using Ye Olde Duff's Device for efficiency.
00278      */
00279     if (src < dest) {
00280         src += bytes;
00281         dest += bytes;
00282 
00283         switch (j) {
00284           case 0: do {
00285               *--dest = *--src;
00286             case 7: *--dest = *--src;
00287             case 6: *--dest = *--src;
00288             case 5: *--dest = *--src;
00289             case 4: *--dest = *--src;
00290             case 3: *--dest = *--src;
00291             case 2: *--dest = *--src;
00292             case 1: *--dest = *--src;
00293           } while (--i > 0);
00294         }
00295     } else if (src > dest) {
00296         switch (j) {
00297           case 0: do {
00298               *dest++ = *src++;
00299             case 7: *dest++ = *src++;
00300             case 6: *dest++ = *src++;
00301             case 5: *dest++ = *src++;
00302             case 4: *dest++ = *src++;
00303             case 3: *dest++ = *src++;
00304             case 2: *dest++ = *src++;
00305             case 1: *dest++ = *src++;
00306           } while (--i > 0);
00307         }
00308     }
00309 }
00310 
00311 static void
00312 _ure_push(ucs2_t v, _ure_buffer_t *b)
00313 {
00314     _ure_stlist_t *s;
00315 
00316     if (b == 0)
00317       return;
00318 
00319     /*
00320      * If the `reducing' parameter is non-zero, check to see if the value
00321      * passed is already on the stack.
00322      */
00323     if (b->reducing != 0 && b->expr[v].onstack != 0)
00324       return;
00325 
00326     s = &b->stack;
00327     if (s->slist_used == s->slist_size) {
00328         if (s->slist_size == 0)
00329           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
00330         else
00331           s->slist = (ucs2_t *) realloc((char *) s->slist,
00332                                         sizeof(ucs2_t) * (s->slist_size + 8));
00333         s->slist_size += 8;
00334     }
00335     s->slist[s->slist_used++] = v;
00336 
00337     /*
00338      * If the `reducing' parameter is non-zero, flag the element as being on
00339      * the stack.
00340      */
00341     if (b->reducing != 0)
00342       b->expr[v].onstack = 1;
00343 }
00344 
00345 static ucs2_t
00346 _ure_peek(_ure_buffer_t *b)
00347 {
00348     if (b == 0 || b->stack.slist_used == 0)
00349       return _URE_NOOP;
00350 
00351     return b->stack.slist[b->stack.slist_used - 1];
00352 }
00353 
00354 static ucs2_t
00355 _ure_pop(_ure_buffer_t *b)
00356 {
00357     ucs2_t v;
00358 
00359     if (b == 0 || b->stack.slist_used == 0)
00360       return _URE_NOOP;
00361 
00362     v = b->stack.slist[--b->stack.slist_used];
00363     if (b->reducing)
00364       b->expr[v].onstack = 0;
00365 
00366     return v;
00367 }
00368 
00369 /*************************************************************************
00370  *
00371  * Start symbol parse functions.
00372  *
00373  *************************************************************************/
00374 
00375 /*
00376  * Parse a comma-separated list of integers that represent character
00377  * properties.  Combine them into a mask that is returned in the `mask'
00378  * variable, and return the number of characters consumed.
00379  */
00380 static unsigned long
00381 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
00382                _ure_buffer_t *b)
00383 {
00384     unsigned long n, m;
00385     ucs2_t *sp, *ep;
00386 
00387     sp = pp;
00388     ep = sp + limit;
00389 
00390     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
00391         if (*sp == ',') {
00392             /*
00393              * Encountered a comma, so select the next character property flag
00394              * and reset the number.
00395              */
00396             m |= cclass_flags[n];
00397             n = 0;
00398         } else if (*sp >= '0' && *sp <= '9')
00399           /*
00400            * Encountered a digit, so start or continue building the cardinal
00401            * that represents the character property flag.
00402            */
00403           n = (n * 10) + (*sp - '0');
00404         else
00405           /*
00406            * Encountered something that is not part of the property list.
00407            * Indicate that we are done.
00408            */
00409           break;
00410 
00411         /*
00412          * If a property number greater than 32 occurs, then there is a
00413          * problem.  Most likely a missing comma separator.
00414          */
00415         if (n > 32)
00416           b->error = _URE_INVALID_PROPERTY;
00417     }
00418 
00419     if (n != 0)
00420       m |= cclass_flags[n];
00421 
00422     /*
00423      * Set the mask that represents the group of character properties.
00424      */
00425     *mask = m;
00426 
00427     /*
00428      * Return the number of characters consumed.
00429      */
00430     return sp - pp;
00431 }
00432 
00433 /*
00434  * Collect a hex number with 1 to 4 digits and return the number
00435  * of characters used.
00436  */
00437 static unsigned long
00438 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
00439 {
00440     ucs2_t i;
00441     ucs2_t *sp, *ep;
00442     ucs4_t nn;
00443 
00444     sp = np;
00445     ep = sp + limit;
00446 
00447     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
00448         if (*sp >= '0' && *sp <= '9')
00449           nn = (nn << 4) + (*sp - '0');
00450         else if (*sp >= 'A' && *sp <= 'F')
00451           nn = (nn << 4) + ((*sp - 'A') + 10);
00452         else if (*sp >= 'a' && *sp <= 'f')
00453           nn = (nn << 4) + ((*sp - 'a') + 10);
00454         else
00455           /*
00456            * Encountered something that is not a hex digit.
00457            */
00458           break;
00459     }
00460 
00461     /*
00462      * Assign the character code collected and return the number of
00463      * characters used.
00464      */
00465     *n = nn;
00466 
00467     return sp - np;
00468 }
00469 
00470 /*
00471  * Insert a range into a character class, removing duplicates and ordering
00472  * them in increasing range-start order.
00473  */
00474 static void
00475 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
00476 {
00477     ucs2_t i;
00478     ucs4_t tmp;
00479     _ure_range_t *rp;
00480 
00481     /*
00482      * If the `casefold' flag is set, then make sure both endpoints of the
00483      * range are converted to lower case.
00484      */
00485     if (b->flags & _URE_DFA_CASEFOLD) {
00486         r->min_code = _ure_tolower(r->min_code);
00487         r->max_code = _ure_tolower(r->max_code);
00488     }
00489 
00490     /*
00491      * Swap the range endpoints if they are not in increasing order.
00492      */
00493     if (r->min_code > r->max_code) {
00494         tmp = r->min_code;
00495         r->min_code = r->max_code;
00496         r->max_code = tmp;
00497     }
00498 
00499     for (i = 0, rp = ccl->ranges;
00500          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
00501 
00502     /*
00503      * Check for a duplicate.
00504      */
00505     if (i < ccl->ranges_used &&
00506         r->min_code == rp->min_code && r->max_code == rp->max_code)
00507       return;
00508 
00509     if (ccl->ranges_used == ccl->ranges_size) {
00510         if (ccl->ranges_size == 0)
00511           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
00512         else
00513           ccl->ranges = (_ure_range_t *)
00514               realloc((char *) ccl->ranges,
00515                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
00516         ccl->ranges_size += 8;
00517     }
00518 
00519     rp = ccl->ranges + ccl->ranges_used;
00520 
00521     if (i < ccl->ranges_used)
00522       _ure_memmove((char *) (rp + 1), (char *) rp,
00523                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
00524 
00525     ccl->ranges_used++;
00526     rp->min_code = r->min_code;
00527     rp->max_code = r->max_code;
00528 }
00529 
00530 #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
00531 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
00532 #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
00533 #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
00534 _URE_OTHERPUNCT)
00535 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
00536 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
00537 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
00538 #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
00539 
00540 typedef void (*_ure_cclsetup_t)(
00541     _ure_symtab_t *sym,
00542     unsigned long mask,
00543     _ure_buffer_t *b
00544 );
00545 
00546 typedef struct {
00547     ucs2_t key;
00548     unsigned long len;
00549     unsigned long next;
00550     _ure_cclsetup_t func;
00551     unsigned long mask;
00552 } _ure_trie_t;
00553 
00554 static void
00555 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
00556 {
00557     sym->props |= mask;
00558 }
00559 
00560 static void
00561 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
00562 {
00563     _ure_range_t range;
00564 
00565     sym->props |= mask;
00566 
00567     /*
00568      * Add the additional characters needed for handling isspace().
00569      */
00570     range.min_code = range.max_code = '\t';
00571     _ure_add_range(&sym->sym.ccl, &range, b);
00572     range.min_code = range.max_code = '\r';
00573     _ure_add_range(&sym->sym.ccl, &range, b);
00574     range.min_code = range.max_code = '\n';
00575     _ure_add_range(&sym->sym.ccl, &range, b);
00576     range.min_code = range.max_code = '\f';
00577     _ure_add_range(&sym->sym.ccl, &range, b);
00578     range.min_code = range.max_code = 0xfeff;
00579     _ure_add_range(&sym->sym.ccl, &range, b);
00580 }
00581 
00582 static void
00583 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
00584 {
00585     _ure_range_t range;
00586 
00587     /*
00588      * Add the additional characters needed for handling isxdigit().
00589      */
00590     range.min_code = '0';
00591     range.max_code = '9';
00592     _ure_add_range(&sym->sym.ccl, &range, b);
00593     range.min_code = 'A';
00594     range.max_code = 'F';
00595     _ure_add_range(&sym->sym.ccl, &range, b);
00596     range.min_code = 'a';
00597     range.max_code = 'f';
00598     _ure_add_range(&sym->sym.ccl, &range, b);
00599 }
00600 
00601 static _ure_trie_t cclass_trie[] = {
00602     {0x003a, 1, 1, 0, 0},
00603     {0x0061, 9, 10, 0, 0},
00604     {0x0063, 8, 19, 0, 0},
00605     {0x0064, 7, 24, 0, 0},
00606     {0x0067, 6, 29, 0, 0},
00607     {0x006c, 5, 34, 0, 0},
00608     {0x0070, 4, 39, 0, 0},
00609     {0x0073, 3, 49, 0, 0},
00610     {0x0075, 2, 54, 0, 0},
00611     {0x0078, 1, 59, 0, 0},
00612     {0x006c, 1, 11, 0, 0},
00613     {0x006e, 2, 13, 0, 0},
00614     {0x0070, 1, 16, 0, 0},
00615     {0x0075, 1, 14, 0, 0},
00616     {0x006d, 1, 15, 0, 0},
00617     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
00618     {0x0068, 1, 17, 0, 0},
00619     {0x0061, 1, 18, 0, 0},
00620     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
00621     {0x006e, 1, 20, 0, 0},
00622     {0x0074, 1, 21, 0, 0},
00623     {0x0072, 1, 22, 0, 0},
00624     {0x006c, 1, 23, 0, 0},
00625     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
00626     {0x0069, 1, 25, 0, 0},
00627     {0x0067, 1, 26, 0, 0},
00628     {0x0069, 1, 27, 0, 0},
00629     {0x0074, 1, 28, 0, 0},
00630     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
00631     {0x0072, 1, 30, 0, 0},
00632     {0x0061, 1, 31, 0, 0},
00633     {0x0070, 1, 32, 0, 0},
00634     {0x0068, 1, 33, 0, 0},
00635     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
00636     {0x006f, 1, 35, 0, 0},
00637     {0x0077, 1, 36, 0, 0},
00638     {0x0065, 1, 37, 0, 0},
00639     {0x0072, 1, 38, 0, 0},
00640     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
00641     {0x0072, 2, 41, 0, 0},
00642     {0x0075, 1, 45, 0, 0},
00643     {0x0069, 1, 42, 0, 0},
00644     {0x006e, 1, 43, 0, 0},
00645     {0x0074, 1, 44, 0, 0},
00646     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
00647     {0x006e, 1, 46, 0, 0},
00648     {0x0063, 1, 47, 0, 0},
00649     {0x0074, 1, 48, 0, 0},
00650     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
00651     {0x0070, 1, 50, 0, 0},
00652     {0x0061, 1, 51, 0, 0},
00653     {0x0063, 1, 52, 0, 0},
00654     {0x0065, 1, 53, 0, 0},
00655     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
00656     {0x0070, 1, 55, 0, 0},
00657     {0x0070, 1, 56, 0, 0},
00658     {0x0065, 1, 57, 0, 0},
00659     {0x0072, 1, 58, 0, 0},
00660     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
00661     {0x0064, 1, 60, 0, 0},
00662     {0x0069, 1, 61, 0, 0},
00663     {0x0067, 1, 62, 0, 0},
00664     {0x0069, 1, 63, 0, 0},
00665     {0x0074, 1, 64, 0, 0},
00666     {0x003a, 1, 65, _ure_xdigit_setup, 0},
00667 };
00668 
00669 /*
00670  * Probe for one of the POSIX colon delimited character classes in the static
00671  * trie.
00672  */
00673 static unsigned long
00674 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
00675                _ure_buffer_t *b)
00676 {
00677     int i;
00678     unsigned long n;
00679     _ure_trie_t *tp;
00680     ucs2_t *sp, *ep;
00681 
00682     /*
00683      * If the number of characters left is less than 7, then this cannot be
00684      * interpreted as one of the colon delimited classes.
00685      */
00686     if (limit < 7)
00687       return 0;
00688 
00689     sp = cp;
00690     ep = sp + limit;
00691     tp = cclass_trie;
00692     for (i = 0; sp < ep && i < 8; i++, sp++) {
00693         n = tp->len;
00694 
00695         for (; n > 0 && tp->key != *sp; tp++, n--) ;
00696 
00697         if (n == 0)
00698           return 0;
00699 
00700         if (*sp == ':' && (i == 6 || i == 7)) {
00701             sp++;
00702             break;
00703         }
00704         if (sp + 1 < ep)
00705           tp = cclass_trie + tp->next;
00706     }
00707     if (tp->func == 0)
00708       return 0;
00709 
00710     (*tp->func)(sym, tp->mask, b);
00711 
00712     return sp - cp;
00713 }
00714 
00715 /*
00716  * Construct a list of ranges and return the number of characters consumed.
00717  */
00718 static unsigned long
00719 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
00720             _ure_buffer_t *b)
00721 {
00722     int range_end;
00723     unsigned long n;
00724     ucs2_t *sp, *ep;
00725     ucs4_t c, last;
00726     _ure_ccl_t *cclp;
00727     _ure_range_t range;
00728 
00729     sp = cp;
00730     ep = sp + limit;
00731 
00732     if (*sp == '^') {
00733       symp->type = _URE_NCCLASS;
00734       sp++;
00735     } else
00736       symp->type = _URE_CCLASS;
00737 
00738     for (last = 0, range_end = 0;
00739          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
00740         c = *sp++;
00741         if (c == '\\') {
00742             if (sp == ep) {
00743                 /*
00744                  * The EOS was encountered when expecting the reverse solidus
00745                  * to be followed by the character it is escaping.  Set an
00746                  * error code and return the number of characters consumed up
00747                  * to this point.
00748                  */
00749                 b->error = _URE_UNEXPECTED_EOS;
00750                 return sp - cp;
00751             }
00752 
00753             c = *sp++;
00754             switch (c) {
00755               case 'a':
00756                 c = 0x07;
00757                 break;
00758               case 'b':
00759                 c = 0x08;
00760                 break;
00761               case 'f':
00762                 c = 0x0c;
00763                 break;
00764               case 'n':
00765                 c = 0x0a;
00766                 break;
00767               case 'r':
00768                 c = 0x0d;
00769                 break;
00770               case 't':
00771                 c = 0x09;
00772                 break;
00773               case 'v':
00774                 c = 0x0b;
00775                 break;
00776               case 'p':
00777               case 'P':
00778                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
00779                 /*
00780                  * Invert the bit mask of the properties if this is a negated
00781                  * character class or if 'P' is used to specify a list of
00782                  * character properties that should *not* match in a
00783                  * character class.
00784                  */
00785                 if (c == 'P')
00786                   symp->props = ~symp->props;
00787                 continue;
00788                 break;
00789               case 'x':
00790               case 'X':
00791               case 'u':
00792               case 'U':
00793                 if (sp < ep &&
00794                     ((*sp >= '0' && *sp <= '9') ||
00795                      (*sp >= 'A' && *sp <= 'F') ||
00796                      (*sp >= 'a' && *sp <= 'f')))
00797                   sp += _ure_hex(sp, ep - sp, &c);
00798             }
00799         } else if (c == ':') {
00800             /*
00801              * Probe for a POSIX colon delimited character class.
00802              */
00803             sp--;
00804             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
00805               sp++;
00806             else {
00807                 sp += n;
00808                 continue;
00809             }
00810         }
00811 
00812         cclp = &symp->sym.ccl;
00813 
00814         /*
00815          * Check to see if the current character is a low surrogate that needs
00816          * to be combined with a preceding high surrogate.
00817          */
00818         if (last != 0) {
00819             if (c >= 0xdc00 && c <= 0xdfff)
00820               /*
00821                * Construct the UTF16 character code.
00822                */
00823               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
00824             else {
00825                 /*
00826                  * Add the isolated high surrogate to the range.
00827                  */
00828                 if (range_end == 1)
00829                   range.max_code = last & 0xffff;
00830                 else
00831                   range.min_code = range.max_code = last & 0xffff;
00832 
00833                 _ure_add_range(cclp, &range, b);
00834                 range_end = 0;
00835             }
00836         }
00837 
00838         /*
00839          * Clear the last character code.
00840          */
00841         last = 0;
00842 
00843         /*
00844          * This slightly awkward code handles the different cases needed to
00845          * construct a range.
00846          */
00847         if (c >= 0xd800 && c <= 0xdbff) {
00848             /*
00849              * If the high surrogate is followed by a range indicator, simply
00850              * add it as the range start.  Otherwise, save it in case the next
00851              * character is a low surrogate.
00852              */
00853             if (*sp == '-') {
00854                 sp++;
00855                 range.min_code = c;
00856                 range_end = 1;
00857             } else
00858               last = c;
00859         } else if (range_end == 1) {
00860             range.max_code = c;
00861             _ure_add_range(cclp, &range, b);
00862             range_end = 0;
00863         } else {
00864             range.min_code = range.max_code = c;
00865             if (*sp == '-') {
00866                 sp++;
00867                 range_end = 1;
00868             } else
00869               _ure_add_range(cclp, &range, b);
00870         }
00871     }
00872 
00873     if (sp < ep && *sp == ']')
00874       sp++;
00875     else
00876       /*
00877        * The parse was not terminated by the character class close symbol
00878        * (']'), so set an error code.
00879        */
00880       b->error = _URE_CCLASS_OPEN;
00881 
00882     return sp - cp;
00883 }
00884 
00885 /*
00886  * Probe for a low surrogate hex code.
00887  */
00888 static unsigned long
00889 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
00890 {
00891     ucs4_t i, code;
00892     ucs2_t *sp, *ep;
00893 
00894     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
00895         if (*sp >= '0' && *sp <= '9')
00896           code = (code << 4) + (*sp - '0');
00897         else if (*sp >= 'A' && *sp <= 'F')
00898           code = (code << 4) + ((*sp - 'A') + 10);
00899         else if (*sp >= 'a' && *sp <= 'f')
00900           code = (code << 4) + ((*sp - 'a') + 10);
00901         else
00902           break;
00903     }
00904 
00905     *c = code;
00906     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
00907 }
00908 
00909 static unsigned long
00910 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
00911                     _ure_buffer_t *b)
00912 {
00913     ucs4_t c;
00914     ucs2_t *sp, *ep;
00915 
00916     sp = sym;
00917     ep = sym + limit;
00918 
00919     if ((c = *sp++) == '\\') {
00920 
00921         if (sp == ep) {
00922             /*
00923              * The EOS was encountered when expecting the reverse solidus to
00924              * be followed by the character it is escaping.  Set an error code
00925              * and return the number of characters consumed up to this point.
00926              */
00927             b->error = _URE_UNEXPECTED_EOS;
00928             return sp - sym;
00929         }
00930 
00931         c = *sp++;
00932         switch (c) {
00933           case 'p':
00934           case 'P':
00935             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
00936             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
00937             break;
00938           case 'a':
00939             symp->type = _URE_CHAR;
00940             symp->sym.chr = 0x07;
00941             break;
00942           case 'b':
00943             symp->type = _URE_CHAR;
00944             symp->sym.chr = 0x08;
00945             break;
00946           case 'f':
00947             symp->type = _URE_CHAR;
00948             symp->sym.chr = 0x0c;
00949             break;
00950           case 'n':
00951             symp->type = _URE_CHAR;
00952             symp->sym.chr = 0x0a;
00953             break;
00954           case 'r':
00955             symp->type = _URE_CHAR;
00956             symp->sym.chr = 0x0d;
00957             break;
00958           case 't':
00959             symp->type = _URE_CHAR;
00960             symp->sym.chr = 0x09;
00961             break;
00962           case 'v':
00963             symp->type = _URE_CHAR;
00964             symp->sym.chr = 0x0b;
00965             break;
00966           case 'x':
00967           case 'X':
00968           case 'u':
00969           case 'U':
00970             /*
00971              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
00972              * through to the next case.
00973              */
00974             if (sp < ep &&
00975                 ((*sp >= '0' && *sp <= '9') ||
00976                  (*sp >= 'A' && *sp <= 'F') ||
00977                  (*sp >= 'a' && *sp <= 'f')))
00978               sp += _ure_hex(sp, ep - sp, &c);
00979             /* FALLTHROUGH */
00980           default:
00981             /*
00982              * Simply add an escaped character here.
00983              */
00984             symp->type = _URE_CHAR;
00985             symp->sym.chr = c;
00986         }
00987     } else if (c == '^' || c == '$')
00988       /*
00989        * Handle the BOL and EOL anchors.  This actually consists simply of
00990        * setting a flag that indicates that the user supplied anchor match
00991        * function should be called.  This needs to be done instead of simply
00992        * matching line/paragraph separators because beginning-of-text and
00993        * end-of-text tests are needed as well.
00994        */
00995       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
00996     else if (c == '[')
00997       /*
00998        * Construct a character class.
00999        */
01000       sp += _ure_cclass(sp, ep - sp, symp, b);
01001     else if (c == '.')
01002       symp->type = _URE_ANY_CHAR;
01003     else {
01004         symp->type = _URE_CHAR;
01005         symp->sym.chr = c;
01006     }
01007 
01008     /*
01009      * If the symbol type happens to be a character and is a high surrogate,
01010      * then probe forward to see if it is followed by a low surrogate that
01011      * needs to be added.
01012      */
01013     if (sp < ep && symp->type == _URE_CHAR &&
01014         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
01015 
01016         if (0xdc00 <= *sp && *sp <= 0xdfff) {
01017             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
01018                                        (*sp & 0x03ff));
01019             sp++;
01020         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
01021                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
01022             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
01023             if (0xdc00 <= c && c <= 0xdfff) {
01024                 /*
01025                  * Take into account the \[xu] in front of the hex code.
01026                  */
01027                 sp += 2;
01028                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
01029                                            (c & 0x03ff));
01030             }
01031         }
01032     }
01033 
01034     /*
01035      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
01036      * the `casefold' flag is set.
01037      */
01038     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
01039       symp->sym.chr = _ure_tolower(symp->sym.chr);
01040 
01041     /*
01042      * If the symbol constructed is anything other than one of the anchors,
01043      * make sure the _URE_DFA_BLANKLINE flag is removed.
01044      */
01045     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
01046       b->flags &= ~_URE_DFA_BLANKLINE;
01047 
01048     /*
01049      * Return the number of characters consumed.
01050      */
01051     return sp - sym;
01052 }
01053 
01054 static int
01055 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
01056 {
01057     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
01058       return 1;
01059 
01060     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
01061         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
01062           return 1;
01063         if (a->sym.ccl.ranges_used > 0 &&
01064             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
01065                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
01066           return 1;
01067     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
01068       return 1;
01069     return 0;
01070 }
01071 
01072 /*
01073  * Construct a symbol, but only keep unique symbols.
01074  */
01075 static ucs2_t
01076 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
01077                  _ure_buffer_t *b)
01078 {
01079     ucs2_t i;
01080     _ure_symtab_t *sp, symbol;
01081 
01082     /*
01083      * Build the next symbol so we can test to see if it is already in the
01084      * symbol table.
01085      */
01086     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
01087     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
01088 
01089     /*
01090      * Check to see if the symbol exists.
01091      */
01092     for (i = 0, sp = b->symtab;
01093          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
01094 
01095     if (i < b->symtab_used) {
01096         /*
01097          * Free up any ranges used for the symbol.
01098          */
01099         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
01100             symbol.sym.ccl.ranges_size > 0)
01101           free((char *) symbol.sym.ccl.ranges);
01102 
01103         return b->symtab[i].id;
01104     }
01105 
01106     /*
01107      * Need to add the new symbol.
01108      */
01109     if (b->symtab_used == b->symtab_size) {
01110         if (b->symtab_size == 0)
01111           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
01112         else
01113           b->symtab = (_ure_symtab_t *)
01114               realloc((char *) b->symtab,
01115                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
01116         sp = b->symtab + b->symtab_size;
01117         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
01118         b->symtab_size += 8;
01119     }
01120 
01121     symbol.id = b->symtab_used++;
01122     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
01123                   sizeof(_ure_symtab_t));
01124 
01125     return symbol.id;
01126 }
01127 
01128 /*************************************************************************
01129  *
01130  * End symbol parse functions.
01131  *
01132  *************************************************************************/
01133 
01134 static ucs2_t
01135 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
01136 {
01137     ucs2_t i;
01138 
01139     if (b == 0)
01140       return _URE_NOOP;
01141 
01142     /*
01143      * Determine if the expression already exists or not.
01144      */
01145     for (i = 0; i < b->expr_used; i++) {
01146         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
01147             b->expr[i].rhs == rhs)
01148           break;
01149     }
01150     if (i < b->expr_used)
01151       return i;
01152 
01153     /*
01154      * Need to add a new expression.
01155      */
01156     if (b->expr_used == b->expr_size) {
01157         if (b->expr_size == 0)
01158           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
01159         else
01160           b->expr = (_ure_elt_t *)
01161               realloc((char *) b->expr,
01162                       sizeof(_ure_elt_t) * (b->expr_size + 8));
01163         b->expr_size += 8;
01164     }
01165 
01166     b->expr[b->expr_used].onstack = 0;
01167     b->expr[b->expr_used].type = type;
01168     b->expr[b->expr_used].lhs = lhs;
01169     b->expr[b->expr_used].rhs = rhs;
01170 
01171     return b->expr_used++;
01172 }
01173 
01174 static unsigned char spmap[] = {
01175     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
01176     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
01177     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
01178 };
01179 
01180 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
01181                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
01182 
01183 /*
01184  * Convert the regular expression into an NFA in a form that will be easy to
01185  * reduce to a DFA.  The starting state for the reduction will be returned.
01186  */
01187 static ucs2_t
01188 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
01189 {
01190     ucs2_t c, state, top, sym, *sp, *ep;
01191     unsigned long used;
01192 
01193     state = _URE_NOOP;
01194 
01195     sp = re;
01196     ep = sp + relen;
01197     while (b->error == _URE_OK && sp < ep) {
01198         c = *sp++;
01199         switch (c) {
01200           case '(':
01201             _ure_push(_URE_PAREN, b);
01202             break;
01203           case ')':
01204             /*
01205              * Check for the case of too many close parentheses.
01206              */
01207             if (_ure_peek(b) == _URE_NOOP) {
01208                 b->error = _URE_UNBALANCED_GROUP;
01209                 break;
01210             }
01211 
01212             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
01213               /*
01214                * Make an expression with the AND or OR operator and its right
01215                * hand side.
01216                */
01217               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
01218 
01219             /*
01220              * Remove the _URE_PAREN off the stack.
01221              */
01222             (void) _ure_pop(b);
01223             break;
01224           case '*':
01225             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
01226             break;
01227           case '+':
01228             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
01229             break;
01230           case '?':
01231             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
01232             break;
01233           case '|':
01234             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
01235               /*
01236                * Make an expression with the AND or OR operator and its right
01237                * hand side.
01238                */
01239               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
01240 
01241             _ure_push(state, b);
01242             _ure_push(_URE_OR, b);
01243             break;
01244           default:
01245             sp--;
01246             sym = _ure_make_symbol(sp, ep - sp, &used, b);
01247             sp += used;
01248             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
01249             break;
01250         }
01251 
01252         if (c != '(' && c != '|' && sp < ep &&
01253             (!_ure_isspecial(*sp) || *sp == '(')) {
01254             _ure_push(state, b);
01255             _ure_push(_URE_AND, b);
01256         }
01257     }
01258     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
01259       /*
01260        * Make an expression with the AND or OR operator and its right
01261        * hand side.
01262        */
01263       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
01264 
01265     if (b->stack.slist_used > 0)
01266       b->error = _URE_UNBALANCED_GROUP;
01267 
01268     return (b->error == _URE_OK) ? state : _URE_NOOP;
01269 }
01270 
01271 static void
01272 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
01273 {
01274     ucs2_t i, *stp;
01275     _ure_symtab_t *sp;
01276 
01277     /*
01278      * Locate the symbol in the symbol table so the state can be added.
01279      * If the symbol doesn't exist, then a real problem exists.
01280      */
01281     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
01282          i++, sp++) ;
01283 
01284     /*
01285      * Now find out if the state exists in the symbol's state list.
01286      */
01287     for (i = 0, stp = sp->states.slist;
01288          i < sp->states.slist_used && state > *stp; i++, stp++) ;
01289 
01290     if (i == sp->states.slist_used || state < *stp) {
01291         /*
01292          * Need to add the state in order.
01293          */
01294         if (sp->states.slist_used == sp->states.slist_size) {
01295             if (sp->states.slist_size == 0)
01296               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
01297             else
01298               sp->states.slist = (ucs2_t *)
01299                   realloc((char *) sp->states.slist,
01300                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
01301             sp->states.slist_size += 8;
01302         }
01303         if (i < sp->states.slist_used)
01304           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
01305                               (char *) (sp->states.slist + i),
01306                               sizeof(ucs2_t) * (sp->states.slist_used - i));
01307         sp->states.slist[i] = state;
01308         sp->states.slist_used++;
01309     }
01310 }
01311 
01312 static ucs2_t
01313 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
01314 {
01315     ucs2_t i;
01316     _ure_state_t *sp;
01317 
01318     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
01319         if (sp->st.slist_used == nstates &&
01320             memcmp((char *) states, (char *) sp->st.slist,
01321                    sizeof(ucs2_t) * nstates) == 0)
01322           break;
01323     }
01324 
01325     if (i == b->states.states_used) {
01326         /*
01327          * Need to add a new DFA state (set of NFA states).
01328          */
01329         if (b->states.states_used == b->states.states_size) {
01330             if (b->states.states_size == 0)
01331               b->states.states = (_ure_state_t *)
01332                   malloc(sizeof(_ure_state_t) << 3);
01333             else
01334               b->states.states = (_ure_state_t *)
01335                   realloc((char *) b->states.states,
01336                           sizeof(_ure_state_t) * (b->states.states_size + 8));
01337             sp = b->states.states + b->states.states_size;
01338             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
01339             b->states.states_size += 8;
01340         }
01341 
01342         sp = b->states.states + b->states.states_used++;
01343         sp->id = i;
01344 
01345         if (sp->st.slist_used + nstates > sp->st.slist_size) {
01346             if (sp->st.slist_size == 0)
01347               sp->st.slist = (ucs2_t *)
01348                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
01349             else
01350               sp->st.slist = (ucs2_t *)
01351                   realloc((char *) sp->st.slist,
01352                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
01353             sp->st.slist_size = sp->st.slist_used + nstates;
01354         }
01355         sp->st.slist_used = nstates;
01356         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
01357                       sizeof(ucs2_t) * nstates);
01358     }
01359 
01360     /*
01361      * Return the ID of the DFA state representing a group of NFA states.
01362      */
01363     return i;
01364 }
01365 
01366 static void
01367 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
01368 {
01369     ucs2_t i, j, state, eval, syms, rhs;
01370     ucs2_t s1, s2, ns1, ns2;
01371     _ure_state_t *sp;
01372     _ure_symtab_t *smp;
01373 
01374     b->reducing = 1;
01375 
01376     /*
01377      * Add the starting state for the reduction.
01378      */
01379     _ure_add_state(1, &start, b);
01380 
01381     /*
01382      * Process each set of NFA states that get created.
01383      */
01384     for (i = 0; i < b->states.states_used; i++) {
01385         sp = b->states.states + i;
01386 
01387         /*
01388          * Push the current states on the stack.
01389          */
01390         for (j = 0; j < sp->st.slist_used; j++)
01391           _ure_push(sp->st.slist[j], b);
01392 
01393         /*
01394          * Reduce the NFA states.
01395          */
01396         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
01397             state = b->stack.slist[j];
01398             eval = 1;
01399 
01400             /*
01401              * This inner loop is the iterative equivalent of recursively
01402              * reducing subexpressions generated as a result of a reduction.
01403              */
01404             while (eval) {
01405                 switch (b->expr[state].type) {
01406                   case _URE_SYMBOL:
01407                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
01408                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
01409                     syms++;
01410                     eval = 0;
01411                     break;
01412                   case _URE_ONE:
01413                     sp->accepting = 1;
01414                     eval = 0;
01415                     break;
01416                   case _URE_QUEST:
01417                     s1 = b->expr[state].lhs;
01418                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
01419                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
01420                     break;
01421                   case _URE_PLUS:
01422                     s1 = b->expr[state].lhs;
01423                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
01424                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
01425                     break;
01426                   case _URE_STAR:
01427                     s1 = b->expr[state].lhs;
01428                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
01429                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
01430                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
01431                     break;
01432                   case _URE_OR:
01433                     s1 = b->expr[state].lhs;
01434                     s2 = b->expr[state].rhs;
01435                     _ure_push(s1, b);
01436                     _ure_push(s2, b);
01437                     eval = 0;
01438                     break;
01439                   case _URE_AND:
01440                     s1 = b->expr[state].lhs;
01441                     s2 = b->expr[state].rhs;
01442                     switch (b->expr[s1].type) {
01443                       case _URE_SYMBOL:
01444                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
01445                         syms++;
01446                         eval = 0;
01447                         break;
01448                       case _URE_ONE:
01449                         state = s2;
01450                         break;
01451                       case _URE_QUEST:
01452                         ns1 = b->expr[s1].lhs;
01453                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
01454                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
01455                         break;
01456                       case _URE_PLUS:
01457                         ns1 = b->expr[s1].lhs;
01458                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
01459                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
01460                         break;
01461                       case _URE_STAR:
01462                         ns1 = b->expr[s1].lhs;
01463                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
01464                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
01465                         break;
01466                       case _URE_OR:
01467                         ns1 = b->expr[s1].lhs;
01468                         ns2 = b->expr[s1].rhs;
01469                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
01470                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
01471                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
01472                         break;
01473                       case _URE_AND:
01474                         ns1 = b->expr[s1].lhs;
01475                         ns2 = b->expr[s1].rhs;
01476                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
01477                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
01478                         break;
01479                     }
01480                 }
01481             }
01482         }
01483 
01484         /*
01485          * Clear the state stack.
01486          */
01487         while (_ure_pop(b) != _URE_NOOP) ;
01488 
01489         /*
01490          * Reset the state pointer because the reduction may have moved it
01491          * during a reallocation.
01492          */
01493         sp = b->states.states + i;
01494 
01495         /*
01496          * Generate the DFA states for the symbols collected during the
01497          * current reduction.
01498          */
01499         if (sp->trans_used + syms > sp->trans_size) {
01500             if (sp->trans_size == 0)
01501               sp->trans = (_ure_elt_t *)
01502                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
01503             else
01504               sp->trans = (_ure_elt_t *)
01505                   realloc((char *) sp->trans,
01506                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
01507             sp->trans_size = sp->trans_used + syms;
01508         }
01509 
01510         /*
01511          * Go through the symbol table and generate the DFA state transitions
01512          * for each symbol that has collected NFA states.
01513          */
01514         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
01515             sp = b->states.states + i;
01516 
01517             if (smp->states.slist_used > 0) {
01518                 sp->trans[syms].lhs = smp->id;
01519                 rhs = _ure_add_state(smp->states.slist_used,
01520                                      smp->states.slist, b);
01521                 /*
01522                  * Reset the state pointer in case the reallocation moves it
01523                  * in memory.
01524                  */
01525                 sp = b->states.states + i;
01526                 sp->trans[syms].rhs = rhs;
01527 
01528                 smp->states.slist_used = 0;
01529                 syms++;
01530             }
01531         }
01532 
01533         /*
01534          * Set the number of transitions actually used.
01535          */
01536         sp->trans_used = syms;
01537     }
01538     b->reducing = 0;
01539 }
01540 
01541 static void
01542 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
01543 {
01544     ucs2_t tmp;
01545 
01546     l = b->states.states[l].id;
01547     r = b->states.states[r].id;
01548 
01549     if (l == r)
01550       return;
01551 
01552     if (l > r) {
01553         tmp = l;
01554         l = r;
01555         r = tmp;
01556     }
01557 
01558     /*
01559      * Check to see if the equivalence pair already exists.
01560      */
01561     for (tmp = 0; tmp < b->equiv_used &&
01562              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
01563          tmp++) ;
01564 
01565     if (tmp < b->equiv_used)
01566       return;
01567 
01568     if (b->equiv_used == b->equiv_size) {
01569         if (b->equiv_size == 0)
01570           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
01571         else
01572           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
01573                                               sizeof(_ure_equiv_t) *
01574                                               (b->equiv_size + 8));
01575         b->equiv_size += 8;
01576     }
01577     b->equiv[b->equiv_used].l = l;
01578     b->equiv[b->equiv_used].r = r;
01579     b->equiv_used++;
01580 }
01581 
01582 /*
01583  * Merge the DFA states that are equivalent.
01584  */
01585 static void
01586 _ure_merge_equiv(_ure_buffer_t *b)
01587 {
01588     ucs2_t i, j, k, eq, done;
01589     _ure_state_t *sp1, *sp2, *ls, *rs;
01590 
01591     for (i = 0; i < b->states.states_used; i++) {
01592         sp1 = b->states.states + i;
01593         if (sp1->id != i)
01594           continue;
01595         for (j = 0; j < i; j++) {
01596             sp2 = b->states.states + j;
01597             if (sp2->id != j)
01598               continue;
01599             b->equiv_used = 0;
01600             _ure_add_equiv(i, j, b);
01601             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
01602                 ls = b->states.states + b->equiv[eq].l;
01603                 rs = b->states.states + b->equiv[eq].r;
01604                 if (ls->accepting != rs->accepting ||
01605                     ls->trans_used != rs->trans_used) {
01606                     done = 1;
01607                     break;
01608                 }
01609                 for (k = 0; k < ls->trans_used &&
01610                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
01611                 if (k < ls->trans_used) {
01612                     done = 1;
01613                     break;
01614                 }
01615 
01616                 for (k = 0; k < ls->trans_used; k++)
01617                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
01618             }
01619             if (done == 0)
01620               break;
01621         }
01622         for (eq = 0; j < i && eq < b->equiv_used; eq++)
01623           b->states.states[b->equiv[eq].r].id =
01624               b->states.states[b->equiv[eq].l].id;
01625     }
01626 
01627     /*
01628      * Renumber the states appropriately.
01629      */
01630     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
01631          sp1++, i++)
01632       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
01633 }
01634 
01635 /*************************************************************************
01636  *
01637  * API.
01638  *
01639  *************************************************************************/
01640 
01641 ure_buffer_t
01642 ure_buffer_create(void)
01643 {
01644     ure_buffer_t b;
01645 
01646     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
01647 
01648     return b;
01649 }
01650 
01651 void
01652 ure_buffer_free(ure_buffer_t buf)
01653 {
01654     unsigned long i;
01655 
01656     if (buf == 0)
01657       return;
01658 
01659     if (buf->stack.slist_size > 0)
01660       free((char *) buf->stack.slist);
01661 
01662     if (buf->expr_size > 0)
01663       free((char *) buf->expr);
01664 
01665     for (i = 0; i < buf->symtab_size; i++) {
01666         if (buf->symtab[i].states.slist_size > 0)
01667           free((char *) buf->symtab[i].states.slist);
01668     }
01669 
01670     if (buf->symtab_size > 0)
01671       free((char *) buf->symtab);
01672 
01673     for (i = 0; i < buf->states.states_size; i++) {
01674         if (buf->states.states[i].trans_size > 0)
01675           free((char *) buf->states.states[i].trans);
01676         if (buf->states.states[i].st.slist_size > 0)
01677           free((char *) buf->states.states[i].st.slist);
01678     }
01679 
01680     if (buf->states.states_size > 0)
01681       free((char *) buf->states.states);
01682 
01683     if (buf->equiv_size > 0)
01684       free((char *) buf->equiv);
01685 
01686     free((char *) buf);
01687 }
01688 
01689 ure_dfa_t
01690 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
01691 {
01692     ucs2_t i, j, state;
01693     _ure_state_t *sp;
01694     _ure_dstate_t *dsp;
01695     _ure_trans_t *tp;
01696     ure_dfa_t dfa;
01697 
01698     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
01699       return 0;
01700 
01701     /*
01702      * Reset the various fields of the compilation buffer.  Default the flags
01703      * to indicate the presense of the "^$" pattern.  If any other pattern
01704      * occurs, then this flag will be removed.  This is done to catch this
01705      * special pattern and handle it specially when matching.
01706      */
01707     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
01708     buf->reducing = 0;
01709     buf->stack.slist_used = 0;
01710     buf->expr_used = 0;
01711 
01712     for (i = 0; i < buf->symtab_used; i++)
01713       buf->symtab[i].states.slist_used = 0;
01714     buf->symtab_used = 0;
01715 
01716     for (i = 0; i < buf->states.states_used; i++) {
01717         buf->states.states[i].st.slist_used = 0;
01718         buf->states.states[i].trans_used = 0;
01719     }
01720     buf->states.states_used = 0;
01721 
01722     /*
01723      * Construct the NFA.  If this stage returns a 0, then an error occured or
01724      * an empty expression was passed.
01725      */
01726     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
01727       return 0;
01728 
01729     /*
01730      * Do the expression reduction to get the initial DFA.
01731      */
01732     _ure_reduce(state, buf);
01733 
01734     /*
01735      * Merge all the equivalent DFA states.
01736      */
01737     _ure_merge_equiv(buf);
01738 
01739     /*
01740      * Construct the minimal DFA.
01741      */
01742     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
01743     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
01744 
01745     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
01746 
01747     /*
01748      * Free up the NFA state groups and transfer the symbols from the buffer
01749      * to the DFA.
01750      */
01751     for (i = 0; i < buf->symtab_size; i++) {
01752         if (buf->symtab[i].states.slist_size > 0)
01753           free((char *) buf->symtab[i].states.slist);
01754     }
01755     dfa->syms = buf->symtab;
01756     dfa->nsyms = buf->symtab_used;
01757 
01758     buf->symtab_used = buf->symtab_size = 0;
01759 
01760     /*
01761      * Collect the total number of states and transitions needed for the DFA.
01762      */
01763     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
01764          i++, sp++) {
01765         if (sp->id == state) {
01766             dfa->nstates++;
01767             dfa->ntrans += sp->trans_used;
01768             state++;
01769         }
01770     }
01771 
01772     /*
01773      * Allocate enough space for the states and transitions.
01774      */
01775     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
01776                                            dfa->nstates);
01777     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
01778 
01779     /*
01780      * Actually transfer the DFA states from the buffer.
01781      */
01782     dsp = dfa->states;
01783     tp = dfa->trans;
01784     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
01785          i++, sp++) {
01786         if (sp->id == state) {
01787             dsp->trans = tp;
01788             dsp->ntrans = sp->trans_used;
01789             dsp->accepting = sp->accepting;
01790 
01791             /*
01792              * Add the transitions for the state.
01793              */
01794             for (j = 0; j < dsp->ntrans; j++, tp++) {
01795                 tp->symbol = sp->trans[j].lhs;
01796                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
01797             }
01798 
01799             dsp++;
01800             state++;
01801         }
01802     }
01803 
01804     return dfa;
01805 }
01806 
01807 void
01808 ure_dfa_free(ure_dfa_t dfa)
01809 {
01810     ucs2_t i;
01811 
01812     if (dfa == 0)
01813       return;
01814 
01815     for (i = 0; i < dfa->nsyms; i++) {
01816         if ((dfa->syms[i].type == _URE_CCLASS ||
01817              dfa->syms[i].type == _URE_NCCLASS) &&
01818             dfa->syms[i].sym.ccl.ranges_size > 0)
01819           free((char *) dfa->syms[i].sym.ccl.ranges);
01820     }
01821     if (dfa->nsyms > 0)
01822       free((char *) dfa->syms);
01823 
01824     if (dfa->nstates > 0)
01825       free((char *) dfa->states);
01826     if (dfa->ntrans > 0)
01827       free((char *) dfa->trans);
01828     free((char *) dfa);
01829 }
01830 
01831 void
01832 ure_write_dfa(ure_dfa_t dfa, FILE *out)
01833 {
01834     ucs2_t i, j, k, h, l;
01835     _ure_dstate_t *sp;
01836     _ure_symtab_t *sym;
01837     _ure_range_t *rp;
01838 
01839     if (dfa == 0 || out == 0)
01840       return;
01841 
01842     /*
01843      * Write all the different character classes.
01844      */
01845     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
01846         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
01847             fprintf(out, "C%hd = ", sym->id);
01848             if (sym->sym.ccl.ranges_used > 0) {
01849                 putc('[', out);
01850                 if (sym->type == _URE_NCCLASS)
01851                   putc('^', out);
01852             }
01853             if (sym->props != 0) {
01854                 if (sym->type == _URE_NCCLASS)
01855                   fprintf(out, "\\P");
01856                 else
01857                   fprintf(out, "\\p");
01858                 for (k = h = 0; k < 32; k++) {
01859                     if (sym->props & (1 << k)) {
01860                         if (h != 0)
01861                           putc(',', out);
01862                         fprintf(out, "%hd", k + 1);
01863                         h = 1;
01864                     }
01865                 }
01866             }
01867             /*
01868              * Dump the ranges.
01869              */
01870             for (k = 0, rp = sym->sym.ccl.ranges;
01871                  k < sym->sym.ccl.ranges_used; k++, rp++) {
01872                 /*
01873                  * Check for UTF16 characters.
01874                  */
01875                 if (0x10000 <= rp->min_code &&
01876                     rp->min_code <= 0x10ffff) {
01877                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
01878                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
01879                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
01880                 } else
01881                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
01882                 if (rp->max_code != rp->min_code) {
01883                     putc('-', out);
01884                     if (rp->max_code >= 0x10000 &&
01885                         rp->max_code <= 0x10ffff) {
01886                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
01887                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
01888                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
01889                     } else
01890                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
01891                 }
01892             }
01893             if (sym->sym.ccl.ranges_used > 0)
01894               putc(']', out);
01895             putc('\n', out);
01896         }
01897     }
01898 
01899     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
01900         fprintf(out, "S%hd = ", i);
01901         if (sp->accepting) {
01902             fprintf(out, "1 ");
01903             if (sp->ntrans)
01904               fprintf(out, "| ");
01905         }
01906         for (j = 0; j < sp->ntrans; j++) {
01907             if (j > 0)
01908               fprintf(out, "| ");
01909 
01910             sym = dfa->syms + sp->trans[j].symbol;
01911             switch (sym->type) {
01912               case _URE_CHAR:
01913                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
01914                     /*
01915                      * Take care of UTF16 characters.
01916                      */
01917                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
01918                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
01919                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
01920                 } else
01921                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
01922                 break;
01923               case _URE_ANY_CHAR:
01924                 fprintf(out, "<any> ");
01925                 break;
01926               case _URE_BOL_ANCHOR:
01927                 fprintf(out, "<bol-anchor> ");
01928                 break;
01929               case _URE_EOL_ANCHOR:
01930                 fprintf(out, "<eol-anchor> ");
01931                 break;
01932               case _URE_CCLASS:
01933               case _URE_NCCLASS:
01934                 fprintf(out, "[C%hd] ", sym->id);
01935                 break;
01936             }
01937             fprintf(out, "S%hd", sp->trans[j].next_state);
01938             if (j + 1 < sp->ntrans)
01939               putc(' ', out);
01940         }
01941         putc('\n', out);
01942     }
01943 }
01944 
01945 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
01946                         (cc) == 0x2029)
01947 
01948 int
01949 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
01950          unsigned long *match_start, unsigned long *match_end)
01951 {
01952     int i, j, matched, found, skip;
01953     unsigned long ms, me;
01954     ucs4_t c;
01955     ucs2_t *sp, *ep, *lp;
01956     _ure_dstate_t *stp;
01957     _ure_symtab_t *sym;
01958     _ure_range_t *rp;
01959 
01960     if (dfa == 0 || text == 0)
01961       return 0;
01962 
01963     /*
01964      * Handle the special case of an empty string matching the "^$" pattern.
01965      */
01966     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
01967         *match_start = *match_end = 0;
01968         return 1;
01969     }
01970 
01971     sp = text;
01972     ep = sp + textlen;
01973 
01974     ms = me = ~0;
01975 
01976     stp = dfa->states;
01977 
01978     for (found = skip = 0; found == 0 && sp < ep; ) {
01979         lp = sp;
01980         c = *sp++;
01981 
01982         /*
01983          * Check to see if this is a high surrogate that should be
01984          * combined with a following low surrogate.
01985          */
01986         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
01987             0xdc00 <= *sp && *sp <= 0xdfff)
01988           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
01989 
01990         /*
01991          * Determine if the character is non-spacing and should be skipped.
01992          */
01993         if (_ure_matches_properties(_URE_NONSPACING, c) &&
01994             (flags & URE_IGNORE_NONSPACING)) {
01995             sp++;
01996             continue;
01997         }
01998 
01999         if (dfa->flags & _URE_DFA_CASEFOLD)
02000           c = _ure_tolower(c);
02001 
02002         /*
02003          * See if one of the transitions matches.
02004          */
02005         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
02006             sym = dfa->syms + stp->trans[i].symbol;
02007             switch (sym->type) {
02008               case _URE_ANY_CHAR:
02009                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
02010                     !_ure_issep(c))
02011                   matched = 1;
02012                 break;
02013               case _URE_CHAR:
02014                 if (c == sym->sym.chr)
02015                   matched = 1;
02016                 break;
02017               case _URE_BOL_ANCHOR:
02018                 if (lp == text) {
02019                     sp = lp;
02020                     matched = 1;
02021                 } else if (_ure_issep(c)) {
02022                     if (c == '\r' && sp < ep && *sp == '\n')
02023                       sp++;
02024                     lp = sp;
02025                     matched = 1;
02026                 }
02027                 break;
02028               case _URE_EOL_ANCHOR:
02029                 if (_ure_issep(c)) {
02030                     /*
02031                      * Put the pointer back before the separator so the match
02032                      * end position will be correct.  This case will also
02033                      * cause the `sp' pointer to be advanced over the current
02034                      * separator once the match end point has been recorded.
02035                      */
02036                     sp = lp;
02037                     matched = 1;
02038                 }
02039                 break;
02040               case _URE_CCLASS:
02041               case _URE_NCCLASS:
02042                 if (sym->props != 0)
02043                   matched = _ure_matches_properties(sym->props, c);
02044                 for (j = 0, rp = sym->sym.ccl.ranges;
02045                      j < sym->sym.ccl.ranges_used; j++, rp++) {
02046                     if (rp->min_code <= c && c <= rp->max_code)
02047                       matched = 1;
02048                 }
02049                 if (sym->type == _URE_NCCLASS)
02050                   matched = !matched;
02051                 break;
02052             }
02053 
02054             if (matched) {
02055                 if (ms == ~0UL)
02056                   ms = lp - text;
02057                 else
02058                   me = sp - text;
02059                 stp = dfa->states + stp->trans[i].next_state;
02060 
02061                 /*
02062                  * If the match was an EOL anchor, adjust the pointer past the
02063                  * separator that caused the match.  The correct match
02064                  * position has been recorded already.
02065                  */
02066                 if (sym->type == _URE_EOL_ANCHOR) {
02067                     /*
02068                      * Skip the character that caused the match.
02069                      */
02070                     sp++;
02071 
02072                     /*
02073                      * Handle the infamous CRLF situation.
02074                      */
02075                     if (sp < ep && c == '\r' && *sp == '\n')
02076                       sp++;
02077                 }
02078             }
02079         }
02080 
02081         if (matched == 0) {
02082             if (stp->accepting == 0) {
02083                 /*
02084                  * If the last state was not accepting, then reset
02085                  * and start over.
02086                  */
02087                 stp = dfa->states;
02088                 ms = me = ~0;
02089             } else
02090               /*
02091                * The last state was accepting, so terminate the matching
02092                * loop to avoid more work.
02093                */
02094               found = 1;
02095         } else if (sp == ep) {
02096             if (!stp->accepting) {
02097                 /*
02098                  * This ugly hack is to make sure the end-of-line anchors
02099                  * match when the source text hits the end.  This is only done
02100                  * if the last subexpression matches.
02101                  */
02102                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
02103                     sym = dfa->syms + stp->trans[i].symbol;
02104                     if (sym->type ==_URE_EOL_ANCHOR) {
02105                         stp = dfa->states + stp->trans[i].next_state;
02106                         if (stp->accepting) {
02107                             me = sp - text;
02108                             found = 1;
02109                         } else
02110                           break;
02111                     }
02112                 }
02113             } else {
02114                 /*
02115                  * Make sure any conditions that match all the way to the end
02116                  * of the string match.
02117                  */
02118                 found = 1;
02119                 me = sp - text;
02120             }
02121         }
02122     }
02123 
02124     if (found == 0)
02125       ms = me = ~0;
02126 
02127     *match_start = ms;
02128     *match_end = me;
02129 
02130     return (ms != ~0UL) ? 1 : 0;
02131 }