Back to index

glibc  2.9
regcomp.c
Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
00022                                      size_t length, reg_syntax_t syntax);
00023 static void re_compile_fastmap_iter (regex_t *bufp,
00024                                  const re_dfastate_t *init_state,
00025                                  char *fastmap);
00026 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
00027 #ifdef RE_ENABLE_I18N
00028 static void free_charset (re_charset_t *cset);
00029 #endif /* RE_ENABLE_I18N */
00030 static void free_workarea_compile (regex_t *preg);
00031 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
00032 #ifdef RE_ENABLE_I18N
00033 static void optimize_utf8 (re_dfa_t *dfa);
00034 #endif
00035 static reg_errcode_t analyze (regex_t *preg);
00036 static reg_errcode_t preorder (bin_tree_t *root,
00037                             reg_errcode_t (fn (void *, bin_tree_t *)),
00038                             void *extra);
00039 static reg_errcode_t postorder (bin_tree_t *root,
00040                             reg_errcode_t (fn (void *, bin_tree_t *)),
00041                             void *extra);
00042 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
00043 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
00044 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
00045                              bin_tree_t *node);
00046 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
00047 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
00048 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
00049 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
00050 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
00051                                unsigned int constraint);
00052 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
00053 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
00054                                     int node, int root);
00055 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
00056 static int fetch_number (re_string_t *input, re_token_t *token,
00057                       reg_syntax_t syntax);
00058 static int peek_token (re_token_t *token, re_string_t *input,
00059                      reg_syntax_t syntax) internal_function;
00060 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
00061                        reg_syntax_t syntax, reg_errcode_t *err);
00062 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
00063                               re_token_t *token, reg_syntax_t syntax,
00064                               int nest, reg_errcode_t *err);
00065 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
00066                              re_token_t *token, reg_syntax_t syntax,
00067                              int nest, reg_errcode_t *err);
00068 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
00069                                  re_token_t *token, reg_syntax_t syntax,
00070                                  int nest, reg_errcode_t *err);
00071 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
00072                               re_token_t *token, reg_syntax_t syntax,
00073                               int nest, reg_errcode_t *err);
00074 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
00075                              re_dfa_t *dfa, re_token_t *token,
00076                              reg_syntax_t syntax, reg_errcode_t *err);
00077 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
00078                                   re_token_t *token, reg_syntax_t syntax,
00079                                   reg_errcode_t *err);
00080 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
00081                                        re_string_t *regexp,
00082                                        re_token_t *token, int token_len,
00083                                        re_dfa_t *dfa,
00084                                        reg_syntax_t syntax,
00085                                        int accept_hyphen);
00086 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
00087                                      re_string_t *regexp,
00088                                      re_token_t *token);
00089 #ifdef RE_ENABLE_I18N
00090 static reg_errcode_t build_equiv_class (bitset_t sbcset,
00091                                    re_charset_t *mbcset,
00092                                    int *equiv_class_alloc,
00093                                    const unsigned char *name);
00094 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
00095                                   bitset_t sbcset,
00096                                   re_charset_t *mbcset,
00097                                   int *char_class_alloc,
00098                                   const unsigned char *class_name,
00099                                   reg_syntax_t syntax);
00100 #else  /* not RE_ENABLE_I18N */
00101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
00102                                    const unsigned char *name);
00103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
00104                                   bitset_t sbcset,
00105                                   const unsigned char *class_name,
00106                                   reg_syntax_t syntax);
00107 #endif /* not RE_ENABLE_I18N */
00108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
00109                                    RE_TRANSLATE_TYPE trans,
00110                                    const unsigned char *class_name,
00111                                    const unsigned char *extra,
00112                                    int non_match, reg_errcode_t *err);
00113 static bin_tree_t *create_tree (re_dfa_t *dfa,
00114                             bin_tree_t *left, bin_tree_t *right,
00115                             re_token_type_t type);
00116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
00117                                   bin_tree_t *left, bin_tree_t *right,
00118                                   const re_token_t *token);
00119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
00120 static void free_token (re_token_t *node);
00121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
00122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
00123 
00124 /* This table gives an error message for each of the error codes listed
00125    in regex.h.  Obviously the order here has to be same as there.
00126    POSIX doesn't require that we do anything for REG_NOERROR,
00127    but why not be nice?  */
00128 
00129 const char __re_error_msgid[] attribute_hidden =
00130   {
00131 #define REG_NOERROR_IDX     0
00132     gettext_noop ("Success")       /* REG_NOERROR */
00133     "\0"
00134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
00135     gettext_noop ("No match")      /* REG_NOMATCH */
00136     "\0"
00137 #define REG_BADPAT_IDX      (REG_NOMATCH_IDX + sizeof "No match")
00138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
00139     "\0"
00140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
00141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
00142     "\0"
00143 #define REG_ECTYPE_IDX      (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
00144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
00145     "\0"
00146 #define REG_EESCAPE_IDX     (REG_ECTYPE_IDX + sizeof "Invalid character class name")
00147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
00148     "\0"
00149 #define REG_ESUBREG_IDX     (REG_EESCAPE_IDX + sizeof "Trailing backslash")
00150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
00151     "\0"
00152 #define REG_EBRACK_IDX      (REG_ESUBREG_IDX + sizeof "Invalid back reference")
00153     gettext_noop ("Unmatched [ or [^")    /* REG_EBRACK */
00154     "\0"
00155 #define REG_EPAREN_IDX      (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
00156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
00157     "\0"
00158 #define REG_EBRACE_IDX      (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
00159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
00160     "\0"
00161 #define REG_BADBR_IDX       (REG_EBRACE_IDX + sizeof "Unmatched \\{")
00162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
00163     "\0"
00164 #define REG_ERANGE_IDX      (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
00165     gettext_noop ("Invalid range end")    /* REG_ERANGE */
00166     "\0"
00167 #define REG_ESPACE_IDX      (REG_ERANGE_IDX + sizeof "Invalid range end")
00168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
00169     "\0"
00170 #define REG_BADRPT_IDX      (REG_ESPACE_IDX + sizeof "Memory exhausted")
00171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
00172     "\0"
00173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
00174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
00175     "\0"
00176 #define REG_ESIZE_IDX       (REG_EEND_IDX + sizeof "Premature end of regular expression")
00177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
00178     "\0"
00179 #define REG_ERPAREN_IDX     (REG_ESIZE_IDX + sizeof "Regular expression too big")
00180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
00181   };
00182 
00183 const size_t __re_error_msgid_idx[] attribute_hidden =
00184   {
00185     REG_NOERROR_IDX,
00186     REG_NOMATCH_IDX,
00187     REG_BADPAT_IDX,
00188     REG_ECOLLATE_IDX,
00189     REG_ECTYPE_IDX,
00190     REG_EESCAPE_IDX,
00191     REG_ESUBREG_IDX,
00192     REG_EBRACK_IDX,
00193     REG_EPAREN_IDX,
00194     REG_EBRACE_IDX,
00195     REG_BADBR_IDX,
00196     REG_ERANGE_IDX,
00197     REG_ESPACE_IDX,
00198     REG_BADRPT_IDX,
00199     REG_EEND_IDX,
00200     REG_ESIZE_IDX,
00201     REG_ERPAREN_IDX
00202   };
00203 
00204 /* Entry points for GNU code.  */
00205 
00206 /* re_compile_pattern is the GNU regular expression compiler: it
00207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
00208    Returns 0 if the pattern was valid, otherwise an error string.
00209 
00210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
00211    are set in BUFP on entry.  */
00212 
00213 const char *
00214 re_compile_pattern (pattern, length, bufp)
00215     const char *pattern;
00216     size_t length;
00217     struct re_pattern_buffer *bufp;
00218 {
00219   reg_errcode_t ret;
00220 
00221   /* And GNU code determines whether or not to get register information
00222      by passing null for the REGS argument to re_match, etc., not by
00223      setting no_sub, unless RE_NO_SUB is set.  */
00224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
00225 
00226   /* Match anchors at newline.  */
00227   bufp->newline_anchor = 1;
00228 
00229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
00230 
00231   if (!ret)
00232     return NULL;
00233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00234 }
00235 #ifdef _LIBC
00236 weak_alias (__re_compile_pattern, re_compile_pattern)
00237 #endif
00238 
00239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
00240    also be assigned to arbitrarily: each pattern buffer stores its own
00241    syntax, so it can be changed between regex compilations.  */
00242 /* This has no initializer because initialized variables in Emacs
00243    become read-only after dumping.  */
00244 reg_syntax_t re_syntax_options;
00245 
00246 
00247 /* Specify the precise syntax of regexps for compilation.  This provides
00248    for compatibility for various utilities which historically have
00249    different, incompatible syntaxes.
00250 
00251    The argument SYNTAX is a bit mask comprised of the various bits
00252    defined in regex.h.  We return the old syntax.  */
00253 
00254 reg_syntax_t
00255 re_set_syntax (syntax)
00256     reg_syntax_t syntax;
00257 {
00258   reg_syntax_t ret = re_syntax_options;
00259 
00260   re_syntax_options = syntax;
00261   return ret;
00262 }
00263 #ifdef _LIBC
00264 weak_alias (__re_set_syntax, re_set_syntax)
00265 #endif
00266 
00267 int
00268 re_compile_fastmap (bufp)
00269     struct re_pattern_buffer *bufp;
00270 {
00271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00272   char *fastmap = bufp->fastmap;
00273 
00274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
00275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
00276   if (dfa->init_state != dfa->init_state_word)
00277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
00278   if (dfa->init_state != dfa->init_state_nl)
00279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
00280   if (dfa->init_state != dfa->init_state_begbuf)
00281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
00282   bufp->fastmap_accurate = 1;
00283   return 0;
00284 }
00285 #ifdef _LIBC
00286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
00287 #endif
00288 
00289 static inline void
00290 __attribute ((always_inline))
00291 re_set_fastmap (char *fastmap, int icase, int ch)
00292 {
00293   fastmap[ch] = 1;
00294   if (icase)
00295     fastmap[tolower (ch)] = 1;
00296 }
00297 
00298 /* Helper function for re_compile_fastmap.
00299    Compile fastmap for the initial_state INIT_STATE.  */
00300 
00301 static void
00302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
00303                       char *fastmap)
00304 {
00305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00306   int node_cnt;
00307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
00308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
00309     {
00310       int node = init_state->nodes.elems[node_cnt];
00311       re_token_type_t type = dfa->nodes[node].type;
00312 
00313       if (type == CHARACTER)
00314        {
00315          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
00316 #ifdef RE_ENABLE_I18N
00317          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
00318            {
00319              unsigned char *buf = alloca (dfa->mb_cur_max), *p;
00320              wchar_t wc;
00321              mbstate_t state;
00322 
00323              p = buf;
00324              *p++ = dfa->nodes[node].opr.c;
00325              while (++node < dfa->nodes_len
00326                    &&       dfa->nodes[node].type == CHARACTER
00327                    && dfa->nodes[node].mb_partial)
00328               *p++ = dfa->nodes[node].opr.c;
00329              memset (&state, '\0', sizeof (state));
00330              if (mbrtowc (&wc, (const char *) buf, p - buf,
00331                         &state) == p - buf
00332                 && (__wcrtomb ((char *) buf, towlower (wc), &state)
00333                     != (size_t) -1))
00334               re_set_fastmap (fastmap, 0, buf[0]);
00335            }
00336 #endif
00337        }
00338       else if (type == SIMPLE_BRACKET)
00339        {
00340          int i, ch;
00341          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00342            {
00343              int j;
00344              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
00345              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00346               if (w & ((bitset_word_t) 1 << j))
00347                 re_set_fastmap (fastmap, icase, ch);
00348            }
00349        }
00350 #ifdef RE_ENABLE_I18N
00351       else if (type == COMPLEX_BRACKET)
00352        {
00353          int i;
00354          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
00355          if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
00356              || cset->nranges || cset->nchar_classes)
00357            {
00358 # ifdef _LIBC
00359              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
00360               {
00361                 /* In this case we want to catch the bytes which are
00362                    the first byte of any collation elements.
00363                    e.g. In da_DK, we want to catch 'a' since "aa"
00364                        is a valid collation element, and don't catch
00365                        'b' since 'b' is the only collation element
00366                        which starts from 'b'.  */
00367                 const int32_t *table = (const int32_t *)
00368                   _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
00369                 for (i = 0; i < SBC_MAX; ++i)
00370                   if (table[i] < 0)
00371                     re_set_fastmap (fastmap, icase, i);
00372               }
00373 # else
00374              if (dfa->mb_cur_max > 1)
00375               for (i = 0; i < SBC_MAX; ++i)
00376                 if (__btowc (i) == WEOF)
00377                   re_set_fastmap (fastmap, icase, i);
00378 # endif /* not _LIBC */
00379            }
00380          for (i = 0; i < cset->nmbchars; ++i)
00381            {
00382              char buf[256];
00383              mbstate_t state;
00384              memset (&state, '\0', sizeof (state));
00385              if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
00386               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
00387              if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
00388               {
00389                 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
00390                     != (size_t) -1)
00391                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
00392               }
00393            }
00394        }
00395 #endif /* RE_ENABLE_I18N */
00396       else if (type == OP_PERIOD
00397 #ifdef RE_ENABLE_I18N
00398               || type == OP_UTF8_PERIOD
00399 #endif /* RE_ENABLE_I18N */
00400               || type == END_OF_RE)
00401        {
00402          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
00403          if (type == END_OF_RE)
00404            bufp->can_be_null = 1;
00405          return;
00406        }
00407     }
00408 }
00409 
00410 /* Entry point for POSIX code.  */
00411 /* regcomp takes a regular expression as a string and compiles it.
00412 
00413    PREG is a regex_t *.  We do not expect any fields to be initialized,
00414    since POSIX says we shouldn't.  Thus, we set
00415 
00416      `buffer' to the compiled pattern;
00417      `used' to the length of the compiled pattern;
00418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
00419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
00420        RE_SYNTAX_POSIX_BASIC;
00421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
00422      `fastmap' to an allocated space for the fastmap;
00423      `fastmap_accurate' to zero;
00424      `re_nsub' to the number of subexpressions in PATTERN.
00425 
00426    PATTERN is the address of the pattern string.
00427 
00428    CFLAGS is a series of bits which affect compilation.
00429 
00430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
00431      use POSIX basic syntax.
00432 
00433      If REG_NEWLINE is set, then . and [^...] don't match newline.
00434      Also, regexec will try a match beginning after every newline.
00435 
00436      If REG_ICASE is set, then we considers upper- and lowercase
00437      versions of letters to be equivalent when matching.
00438 
00439      If REG_NOSUB is set, then when PREG is passed to regexec, that
00440      routine will report only success or failure, and nothing about the
00441      registers.
00442 
00443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
00444    the return codes and their meanings.)  */
00445 
00446 int
00447 regcomp (preg, pattern, cflags)
00448     regex_t *__restrict preg;
00449     const char *__restrict pattern;
00450     int cflags;
00451 {
00452   reg_errcode_t ret;
00453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
00454                       : RE_SYNTAX_POSIX_BASIC);
00455 
00456   preg->buffer = NULL;
00457   preg->allocated = 0;
00458   preg->used = 0;
00459 
00460   /* Try to allocate space for the fastmap.  */
00461   preg->fastmap = re_malloc (char, SBC_MAX);
00462   if (BE (preg->fastmap == NULL, 0))
00463     return REG_ESPACE;
00464 
00465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
00466 
00467   /* If REG_NEWLINE is set, newlines are treated differently.  */
00468   if (cflags & REG_NEWLINE)
00469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
00470       syntax &= ~RE_DOT_NEWLINE;
00471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
00472       /* It also changes the matching behavior.  */
00473       preg->newline_anchor = 1;
00474     }
00475   else
00476     preg->newline_anchor = 0;
00477   preg->no_sub = !!(cflags & REG_NOSUB);
00478   preg->translate = NULL;
00479 
00480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
00481 
00482   /* POSIX doesn't distinguish between an unmatched open-group and an
00483      unmatched close-group: both are REG_EPAREN.  */
00484   if (ret == REG_ERPAREN)
00485     ret = REG_EPAREN;
00486 
00487   /* We have already checked preg->fastmap != NULL.  */
00488   if (BE (ret == REG_NOERROR, 1))
00489     /* Compute the fastmap now, since regexec cannot modify the pattern
00490        buffer.  This function never fails in this implementation.  */
00491     (void) re_compile_fastmap (preg);
00492   else
00493     {
00494       /* Some error occurred while compiling the expression.  */
00495       re_free (preg->fastmap);
00496       preg->fastmap = NULL;
00497     }
00498 
00499   return (int) ret;
00500 }
00501 #ifdef _LIBC
00502 weak_alias (__regcomp, regcomp)
00503 #endif
00504 
00505 /* Returns a message corresponding to an error code, ERRCODE, returned
00506    from either regcomp or regexec.   We don't use PREG here.  */
00507 
00508 size_t
00509 regerror (errcode, preg, errbuf, errbuf_size)
00510     int errcode;
00511     const regex_t *__restrict preg;
00512     char *__restrict errbuf;
00513     size_t errbuf_size;
00514 {
00515   const char *msg;
00516   size_t msg_size;
00517 
00518   if (BE (errcode < 0
00519          || errcode >= (int) (sizeof (__re_error_msgid_idx)
00520                             / sizeof (__re_error_msgid_idx[0])), 0))
00521     /* Only error codes returned by the rest of the code should be passed
00522        to this routine.  If we are given anything else, or if other regex
00523        code generates an invalid error code, then the program has a bug.
00524        Dump core so we can fix it.  */
00525     abort ();
00526 
00527   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
00528 
00529   msg_size = strlen (msg) + 1; /* Includes the null.  */
00530 
00531   if (BE (errbuf_size != 0, 1))
00532     {
00533       if (BE (msg_size > errbuf_size, 0))
00534        {
00535 #if defined HAVE_MEMPCPY || defined _LIBC
00536          *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
00537 #else
00538          memcpy (errbuf, msg, errbuf_size - 1);
00539          errbuf[errbuf_size - 1] = 0;
00540 #endif
00541        }
00542       else
00543        memcpy (errbuf, msg, msg_size);
00544     }
00545 
00546   return msg_size;
00547 }
00548 #ifdef _LIBC
00549 weak_alias (__regerror, regerror)
00550 #endif
00551 
00552 
00553 #ifdef RE_ENABLE_I18N
00554 /* This static array is used for the map to single-byte characters when
00555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
00556    it the same all the time.  UTF-8 is the preferred encoding so this is
00557    a worthwhile optimization.  */
00558 static const bitset_t utf8_sb_map =
00559 {
00560   /* Set the first 128 bits.  */
00561   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
00562 };
00563 #endif
00564 
00565 
00566 static void
00567 free_dfa_content (re_dfa_t *dfa)
00568 {
00569   int i, j;
00570 
00571   if (dfa->nodes)
00572     for (i = 0; i < dfa->nodes_len; ++i)
00573       free_token (dfa->nodes + i);
00574   re_free (dfa->nexts);
00575   for (i = 0; i < dfa->nodes_len; ++i)
00576     {
00577       if (dfa->eclosures != NULL)
00578        re_node_set_free (dfa->eclosures + i);
00579       if (dfa->inveclosures != NULL)
00580        re_node_set_free (dfa->inveclosures + i);
00581       if (dfa->edests != NULL)
00582        re_node_set_free (dfa->edests + i);
00583     }
00584   re_free (dfa->edests);
00585   re_free (dfa->eclosures);
00586   re_free (dfa->inveclosures);
00587   re_free (dfa->nodes);
00588 
00589   if (dfa->state_table)
00590     for (i = 0; i <= dfa->state_hash_mask; ++i)
00591       {
00592        struct re_state_table_entry *entry = dfa->state_table + i;
00593        for (j = 0; j < entry->num; ++j)
00594          {
00595            re_dfastate_t *state = entry->array[j];
00596            free_state (state);
00597          }
00598         re_free (entry->array);
00599       }
00600   re_free (dfa->state_table);
00601 #ifdef RE_ENABLE_I18N
00602   if (dfa->sb_char != utf8_sb_map)
00603     re_free (dfa->sb_char);
00604 #endif
00605   re_free (dfa->subexp_map);
00606 #ifdef DEBUG
00607   re_free (dfa->re_str);
00608 #endif
00609 
00610   re_free (dfa);
00611 }
00612 
00613 
00614 /* Free dynamically allocated space used by PREG.  */
00615 
00616 void
00617 regfree (preg)
00618     regex_t *preg;
00619 {
00620   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00621   if (BE (dfa != NULL, 1))
00622     free_dfa_content (dfa);
00623   preg->buffer = NULL;
00624   preg->allocated = 0;
00625 
00626   re_free (preg->fastmap);
00627   preg->fastmap = NULL;
00628 
00629   re_free (preg->translate);
00630   preg->translate = NULL;
00631 }
00632 #ifdef _LIBC
00633 weak_alias (__regfree, regfree)
00634 #endif
00635 
00636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
00637    them unless specifically requested.  */
00638 
00639 #if defined _REGEX_RE_COMP || defined _LIBC
00640 
00641 /* BSD has one and only one pattern buffer.  */
00642 static struct re_pattern_buffer re_comp_buf;
00643 
00644 char *
00645 # ifdef _LIBC
00646 /* Make these definitions weak in libc, so POSIX programs can redefine
00647    these names if they don't use our functions, and still use
00648    regcomp/regexec above without link errors.  */
00649 weak_function
00650 # endif
00651 re_comp (s)
00652      const char *s;
00653 {
00654   reg_errcode_t ret;
00655   char *fastmap;
00656 
00657   if (!s)
00658     {
00659       if (!re_comp_buf.buffer)
00660        return gettext ("No previous regular expression");
00661       return 0;
00662     }
00663 
00664   if (re_comp_buf.buffer)
00665     {
00666       fastmap = re_comp_buf.fastmap;
00667       re_comp_buf.fastmap = NULL;
00668       __regfree (&re_comp_buf);
00669       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
00670       re_comp_buf.fastmap = fastmap;
00671     }
00672 
00673   if (re_comp_buf.fastmap == NULL)
00674     {
00675       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
00676       if (re_comp_buf.fastmap == NULL)
00677        return (char *) gettext (__re_error_msgid
00678                              + __re_error_msgid_idx[(int) REG_ESPACE]);
00679     }
00680 
00681   /* Since `re_exec' always passes NULL for the `regs' argument, we
00682      don't need to initialize the pattern buffer fields which affect it.  */
00683 
00684   /* Match anchors at newlines.  */
00685   re_comp_buf.newline_anchor = 1;
00686 
00687   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
00688 
00689   if (!ret)
00690     return NULL;
00691 
00692   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
00693   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00694 }
00695 
00696 #ifdef _LIBC
00697 libc_freeres_fn (free_mem)
00698 {
00699   __regfree (&re_comp_buf);
00700 }
00701 #endif
00702 
00703 #endif /* _REGEX_RE_COMP */
00704 
00705 /* Internal entry point.
00706    Compile the regular expression PATTERN, whose length is LENGTH.
00707    SYNTAX indicate regular expression's syntax.  */
00708 
00709 static reg_errcode_t
00710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
00711                    reg_syntax_t syntax)
00712 {
00713   reg_errcode_t err = REG_NOERROR;
00714   re_dfa_t *dfa;
00715   re_string_t regexp;
00716 
00717   /* Initialize the pattern buffer.  */
00718   preg->fastmap_accurate = 0;
00719   preg->syntax = syntax;
00720   preg->not_bol = preg->not_eol = 0;
00721   preg->used = 0;
00722   preg->re_nsub = 0;
00723   preg->can_be_null = 0;
00724   preg->regs_allocated = REGS_UNALLOCATED;
00725 
00726   /* Initialize the dfa.  */
00727   dfa = (re_dfa_t *) preg->buffer;
00728   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
00729     {
00730       /* If zero allocated, but buffer is non-null, try to realloc
00731         enough space.  This loses if buffer's address is bogus, but
00732         that is the user's responsibility.  If ->buffer is NULL this
00733         is a simple allocation.  */
00734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
00735       if (dfa == NULL)
00736        return REG_ESPACE;
00737       preg->allocated = sizeof (re_dfa_t);
00738       preg->buffer = (unsigned char *) dfa;
00739     }
00740   preg->used = sizeof (re_dfa_t);
00741 
00742   err = init_dfa (dfa, length);
00743   if (BE (err != REG_NOERROR, 0))
00744     {
00745       free_dfa_content (dfa);
00746       preg->buffer = NULL;
00747       preg->allocated = 0;
00748       return err;
00749     }
00750 #ifdef DEBUG
00751   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
00752   dfa->re_str = re_malloc (char, length + 1);
00753   strncpy (dfa->re_str, pattern, length + 1);
00754 #endif
00755 
00756   __libc_lock_init (dfa->lock);
00757 
00758   err = re_string_construct (&regexp, pattern, length, preg->translate,
00759                           syntax & RE_ICASE, dfa);
00760   if (BE (err != REG_NOERROR, 0))
00761     {
00762     re_compile_internal_free_return:
00763       free_workarea_compile (preg);
00764       re_string_destruct (&regexp);
00765       free_dfa_content (dfa);
00766       preg->buffer = NULL;
00767       preg->allocated = 0;
00768       return err;
00769     }
00770 
00771   /* Parse the regular expression, and build a structure tree.  */
00772   preg->re_nsub = 0;
00773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
00774   if (BE (dfa->str_tree == NULL, 0))
00775     goto re_compile_internal_free_return;
00776 
00777   /* Analyze the tree and create the nfa.  */
00778   err = analyze (preg);
00779   if (BE (err != REG_NOERROR, 0))
00780     goto re_compile_internal_free_return;
00781 
00782 #ifdef RE_ENABLE_I18N
00783   /* If possible, do searching in single byte encoding to speed things up.  */
00784   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
00785     optimize_utf8 (dfa);
00786 #endif
00787 
00788   /* Then create the initial state of the dfa.  */
00789   err = create_initial_state (dfa);
00790 
00791   /* Release work areas.  */
00792   free_workarea_compile (preg);
00793   re_string_destruct (&regexp);
00794 
00795   if (BE (err != REG_NOERROR, 0))
00796     {
00797       free_dfa_content (dfa);
00798       preg->buffer = NULL;
00799       preg->allocated = 0;
00800     }
00801 
00802   return err;
00803 }
00804 
00805 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
00806    as the initial length of some arrays.  */
00807 
00808 static reg_errcode_t
00809 init_dfa (re_dfa_t *dfa, size_t pat_len)
00810 {
00811   unsigned int table_size;
00812 #ifndef _LIBC
00813   char *codeset_name;
00814 #endif
00815 
00816   memset (dfa, '\0', sizeof (re_dfa_t));
00817 
00818   /* Force allocation of str_tree_storage the first time.  */
00819   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
00820 
00821   /* Avoid overflows.  */
00822   if (pat_len == SIZE_MAX)
00823     return REG_ESPACE;
00824 
00825   dfa->nodes_alloc = pat_len + 1;
00826   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
00827 
00828   /*  table_size = 2 ^ ceil(log pat_len) */
00829   for (table_size = 1; ; table_size <<= 1)
00830     if (table_size > pat_len)
00831       break;
00832 
00833   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
00834   dfa->state_hash_mask = table_size - 1;
00835 
00836   dfa->mb_cur_max = MB_CUR_MAX;
00837 #ifdef _LIBC
00838   if (dfa->mb_cur_max == 6
00839       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
00840     dfa->is_utf8 = 1;
00841   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
00842                      != 0);
00843 #else
00844 # ifdef HAVE_LANGINFO_CODESET
00845   codeset_name = nl_langinfo (CODESET);
00846 # else
00847   codeset_name = getenv ("LC_ALL");
00848   if (codeset_name == NULL || codeset_name[0] == '\0')
00849     codeset_name = getenv ("LC_CTYPE");
00850   if (codeset_name == NULL || codeset_name[0] == '\0')
00851     codeset_name = getenv ("LANG");
00852   if (codeset_name == NULL)
00853     codeset_name = "";
00854   else if (strchr (codeset_name, '.') !=  NULL)
00855     codeset_name = strchr (codeset_name, '.') + 1;
00856 # endif
00857 
00858   if (strcasecmp (codeset_name, "UTF-8") == 0
00859       || strcasecmp (codeset_name, "UTF8") == 0)
00860     dfa->is_utf8 = 1;
00861 
00862   /* We check exhaustively in the loop below if this charset is a
00863      superset of ASCII.  */
00864   dfa->map_notascii = 0;
00865 #endif
00866 
00867 #ifdef RE_ENABLE_I18N
00868   if (dfa->mb_cur_max > 1)
00869     {
00870       if (dfa->is_utf8)
00871        dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
00872       else
00873        {
00874          int i, j, ch;
00875 
00876          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
00877          if (BE (dfa->sb_char == NULL, 0))
00878            return REG_ESPACE;
00879 
00880          /* Set the bits corresponding to single byte chars.  */
00881          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00882            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00883              {
00884               wint_t wch = __btowc (ch);
00885               if (wch != WEOF)
00886                 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
00887 # ifndef _LIBC
00888               if (isascii (ch) && wch != ch)
00889                 dfa->map_notascii = 1;
00890 # endif
00891              }
00892        }
00893     }
00894 #endif
00895 
00896   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
00897     return REG_ESPACE;
00898   return REG_NOERROR;
00899 }
00900 
00901 /* Initialize WORD_CHAR table, which indicate which character is
00902    "word".  In this case "word" means that it is the word construction
00903    character used by some operators like "<", ">", etc.  */
00904 
00905 static void
00906 internal_function
00907 init_word_char (re_dfa_t *dfa)
00908 {
00909   int i, j, ch;
00910   dfa->word_ops_used = 1;
00911   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00912     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00913       if (isalnum (ch) || ch == '_')
00914        dfa->word_char[i] |= (bitset_word_t) 1 << j;
00915 }
00916 
00917 /* Free the work area which are only used while compiling.  */
00918 
00919 static void
00920 free_workarea_compile (regex_t *preg)
00921 {
00922   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00923   bin_tree_storage_t *storage, *next;
00924   for (storage = dfa->str_tree_storage; storage; storage = next)
00925     {
00926       next = storage->next;
00927       re_free (storage);
00928     }
00929   dfa->str_tree_storage = NULL;
00930   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
00931   dfa->str_tree = NULL;
00932   re_free (dfa->org_indices);
00933   dfa->org_indices = NULL;
00934 }
00935 
00936 /* Create initial states for all contexts.  */
00937 
00938 static reg_errcode_t
00939 create_initial_state (re_dfa_t *dfa)
00940 {
00941   int first, i;
00942   reg_errcode_t err;
00943   re_node_set init_nodes;
00944 
00945   /* Initial states have the epsilon closure of the node which is
00946      the first node of the regular expression.  */
00947   first = dfa->str_tree->first->node_idx;
00948   dfa->init_node = first;
00949   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
00950   if (BE (err != REG_NOERROR, 0))
00951     return err;
00952 
00953   /* The back-references which are in initial states can epsilon transit,
00954      since in this case all of the subexpressions can be null.
00955      Then we add epsilon closures of the nodes which are the next nodes of
00956      the back-references.  */
00957   if (dfa->nbackref > 0)
00958     for (i = 0; i < init_nodes.nelem; ++i)
00959       {
00960        int node_idx = init_nodes.elems[i];
00961        re_token_type_t type = dfa->nodes[node_idx].type;
00962 
00963        int clexp_idx;
00964        if (type != OP_BACK_REF)
00965          continue;
00966        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
00967          {
00968            re_token_t *clexp_node;
00969            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
00970            if (clexp_node->type == OP_CLOSE_SUBEXP
00971               && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
00972              break;
00973          }
00974        if (clexp_idx == init_nodes.nelem)
00975          continue;
00976 
00977        if (type == OP_BACK_REF)
00978          {
00979            int dest_idx = dfa->edests[node_idx].elems[0];
00980            if (!re_node_set_contains (&init_nodes, dest_idx))
00981              {
00982               re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
00983               i = 0;
00984              }
00985          }
00986       }
00987 
00988   /* It must be the first time to invoke acquire_state.  */
00989   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
00990   /* We don't check ERR here, since the initial state must not be NULL.  */
00991   if (BE (dfa->init_state == NULL, 0))
00992     return err;
00993   if (dfa->init_state->has_constraint)
00994     {
00995       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
00996                                                  CONTEXT_WORD);
00997       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
00998                                                CONTEXT_NEWLINE);
00999       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
01000                                                   &init_nodes,
01001                                                   CONTEXT_NEWLINE
01002                                                   | CONTEXT_BEGBUF);
01003       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
01004              || dfa->init_state_begbuf == NULL, 0))
01005        return err;
01006     }
01007   else
01008     dfa->init_state_word = dfa->init_state_nl
01009       = dfa->init_state_begbuf = dfa->init_state;
01010 
01011   re_node_set_free (&init_nodes);
01012   return REG_NOERROR;
01013 }
01014 
01015 #ifdef RE_ENABLE_I18N
01016 /* If it is possible to do searching in single byte encoding instead of UTF-8
01017    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
01018    DFA nodes where needed.  */
01019 
01020 static void
01021 optimize_utf8 (re_dfa_t *dfa)
01022 {
01023   int node, i, mb_chars = 0, has_period = 0;
01024 
01025   for (node = 0; node < dfa->nodes_len; ++node)
01026     switch (dfa->nodes[node].type)
01027       {
01028       case CHARACTER:
01029        if (dfa->nodes[node].opr.c >= 0x80)
01030          mb_chars = 1;
01031        break;
01032       case ANCHOR:
01033        switch (dfa->nodes[node].opr.ctx_type)
01034          {
01035          case LINE_FIRST:
01036          case LINE_LAST:
01037          case BUF_FIRST:
01038          case BUF_LAST:
01039            break;
01040          default:
01041            /* Word anchors etc. cannot be handled.  It's okay to test
01042               opr.ctx_type since constraints (for all DFA nodes) are
01043               created by ORing one or more opr.ctx_type values.  */
01044            return;
01045          }
01046        break;
01047       case OP_PERIOD:
01048         has_period = 1;
01049         break;
01050       case OP_BACK_REF:
01051       case OP_ALT:
01052       case END_OF_RE:
01053       case OP_DUP_ASTERISK:
01054       case OP_OPEN_SUBEXP:
01055       case OP_CLOSE_SUBEXP:
01056        break;
01057       case COMPLEX_BRACKET:
01058        return;
01059       case SIMPLE_BRACKET:
01060        /* Just double check.  The non-ASCII range starts at 0x80.  */
01061        assert (0x80 % BITSET_WORD_BITS == 0);
01062         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
01063          if (dfa->nodes[node].opr.sbcset[i])
01064            return;
01065        break;
01066       default:
01067        abort ();
01068       }
01069 
01070   if (mb_chars || has_period)
01071     for (node = 0; node < dfa->nodes_len; ++node)
01072       {
01073        if (dfa->nodes[node].type == CHARACTER
01074            && dfa->nodes[node].opr.c >= 0x80)
01075          dfa->nodes[node].mb_partial = 0;
01076        else if (dfa->nodes[node].type == OP_PERIOD)
01077          dfa->nodes[node].type = OP_UTF8_PERIOD;
01078       }
01079 
01080   /* The search can be in single byte locale.  */
01081   dfa->mb_cur_max = 1;
01082   dfa->is_utf8 = 0;
01083   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
01084 }
01085 #endif
01086 
01087 /* Analyze the structure tree, and calculate "first", "next", "edest",
01088    "eclosure", and "inveclosure".  */
01089 
01090 static reg_errcode_t
01091 analyze (regex_t *preg)
01092 {
01093   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
01094   reg_errcode_t ret;
01095 
01096   /* Allocate arrays.  */
01097   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
01098   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
01099   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
01100   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
01101   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
01102          || dfa->eclosures == NULL, 0))
01103     return REG_ESPACE;
01104 
01105   dfa->subexp_map = re_malloc (int, preg->re_nsub);
01106   if (dfa->subexp_map != NULL)
01107     {
01108       int i;
01109       for (i = 0; i < preg->re_nsub; i++)
01110        dfa->subexp_map[i] = i;
01111       preorder (dfa->str_tree, optimize_subexps, dfa);
01112       for (i = 0; i < preg->re_nsub; i++)
01113        if (dfa->subexp_map[i] != i)
01114          break;
01115       if (i == preg->re_nsub)
01116        {
01117          free (dfa->subexp_map);
01118          dfa->subexp_map = NULL;
01119        }
01120     }
01121 
01122   ret = postorder (dfa->str_tree, lower_subexps, preg);
01123   if (BE (ret != REG_NOERROR, 0))
01124     return ret;
01125   ret = postorder (dfa->str_tree, calc_first, dfa);
01126   if (BE (ret != REG_NOERROR, 0))
01127     return ret;
01128   preorder (dfa->str_tree, calc_next, dfa);
01129   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
01130   if (BE (ret != REG_NOERROR, 0))
01131     return ret;
01132   ret = calc_eclosure (dfa);
01133   if (BE (ret != REG_NOERROR, 0))
01134     return ret;
01135 
01136   /* We only need this during the prune_impossible_nodes pass in regexec.c;
01137      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
01138   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
01139       || dfa->nbackref)
01140     {
01141       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
01142       if (BE (dfa->inveclosures == NULL, 0))
01143         return REG_ESPACE;
01144       ret = calc_inveclosure (dfa);
01145     }
01146 
01147   return ret;
01148 }
01149 
01150 /* Our parse trees are very unbalanced, so we cannot use a stack to
01151    implement parse tree visits.  Instead, we use parent pointers and
01152    some hairy code in these two functions.  */
01153 static reg_errcode_t
01154 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
01155           void *extra)
01156 {
01157   bin_tree_t *node, *prev;
01158 
01159   for (node = root; ; )
01160     {
01161       /* Descend down the tree, preferably to the left (or to the right
01162         if that's the only child).  */
01163       while (node->left || node->right)
01164        if (node->left)
01165           node = node->left;
01166         else
01167           node = node->right;
01168 
01169       do
01170        {
01171          reg_errcode_t err = fn (extra, node);
01172          if (BE (err != REG_NOERROR, 0))
01173            return err;
01174           if (node->parent == NULL)
01175            return REG_NOERROR;
01176          prev = node;
01177          node = node->parent;
01178        }
01179       /* Go up while we have a node that is reached from the right.  */
01180       while (node->right == prev || node->right == NULL);
01181       node = node->right;
01182     }
01183 }
01184 
01185 static reg_errcode_t
01186 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
01187          void *extra)
01188 {
01189   bin_tree_t *node;
01190 
01191   for (node = root; ; )
01192     {
01193       reg_errcode_t err = fn (extra, node);
01194       if (BE (err != REG_NOERROR, 0))
01195        return err;
01196 
01197       /* Go to the left node, or up and to the right.  */
01198       if (node->left)
01199        node = node->left;
01200       else
01201        {
01202          bin_tree_t *prev = NULL;
01203          while (node->right == prev || node->right == NULL)
01204            {
01205              prev = node;
01206              node = node->parent;
01207              if (!node)
01208                return REG_NOERROR;
01209            }
01210          node = node->right;
01211        }
01212     }
01213 }
01214 
01215 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
01216    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
01217    backreferences as well.  Requires a preorder visit.  */
01218 static reg_errcode_t
01219 optimize_subexps (void *extra, bin_tree_t *node)
01220 {
01221   re_dfa_t *dfa = (re_dfa_t *) extra;
01222 
01223   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
01224     {
01225       int idx = node->token.opr.idx;
01226       node->token.opr.idx = dfa->subexp_map[idx];
01227       dfa->used_bkref_map |= 1 << node->token.opr.idx;
01228     }
01229 
01230   else if (node->token.type == SUBEXP
01231            && node->left && node->left->token.type == SUBEXP)
01232     {
01233       int other_idx = node->left->token.opr.idx;
01234 
01235       node->left = node->left->left;
01236       if (node->left)
01237         node->left->parent = node;
01238 
01239       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
01240       if (other_idx < BITSET_WORD_BITS)
01241          dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
01242     }
01243 
01244   return REG_NOERROR;
01245 }
01246 
01247 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
01248    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
01249 static reg_errcode_t
01250 lower_subexps (void *extra, bin_tree_t *node)
01251 {
01252   regex_t *preg = (regex_t *) extra;
01253   reg_errcode_t err = REG_NOERROR;
01254 
01255   if (node->left && node->left->token.type == SUBEXP)
01256     {
01257       node->left = lower_subexp (&err, preg, node->left);
01258       if (node->left)
01259        node->left->parent = node;
01260     }
01261   if (node->right && node->right->token.type == SUBEXP)
01262     {
01263       node->right = lower_subexp (&err, preg, node->right);
01264       if (node->right)
01265        node->right->parent = node;
01266     }
01267 
01268   return err;
01269 }
01270 
01271 static bin_tree_t *
01272 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
01273 {
01274   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
01275   bin_tree_t *body = node->left;
01276   bin_tree_t *op, *cls, *tree1, *tree;
01277 
01278   if (preg->no_sub
01279       /* We do not optimize empty subexpressions, because otherwise we may
01280         have bad CONCAT nodes with NULL children.  This is obviously not
01281         very common, so we do not lose much.  An example that triggers
01282         this case is the sed "script" /\(\)/x.  */
01283       && node->left != NULL
01284       && (node->token.opr.idx >= BITSET_WORD_BITS
01285          || !(dfa->used_bkref_map
01286               & ((bitset_word_t) 1 << node->token.opr.idx))))
01287     return node->left;
01288 
01289   /* Convert the SUBEXP node to the concatenation of an
01290      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
01291   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
01292   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
01293   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
01294   tree = create_tree (dfa, op, tree1, CONCAT);
01295   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
01296     {
01297       *err = REG_ESPACE;
01298       return NULL;
01299     }
01300 
01301   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
01302   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
01303   return tree;
01304 }
01305 
01306 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
01307    nodes.  Requires a postorder visit.  */
01308 static reg_errcode_t
01309 calc_first (void *extra, bin_tree_t *node)
01310 {
01311   re_dfa_t *dfa = (re_dfa_t *) extra;
01312   if (node->token.type == CONCAT)
01313     {
01314       node->first = node->left->first;
01315       node->node_idx = node->left->node_idx;
01316     }
01317   else
01318     {
01319       node->first = node;
01320       node->node_idx = re_dfa_add_node (dfa, node->token);
01321       if (BE (node->node_idx == -1, 0))
01322         return REG_ESPACE;
01323       if (node->token.type == ANCHOR)
01324         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
01325     }
01326   return REG_NOERROR;
01327 }
01328 
01329 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
01330 static reg_errcode_t
01331 calc_next (void *extra, bin_tree_t *node)
01332 {
01333   switch (node->token.type)
01334     {
01335     case OP_DUP_ASTERISK:
01336       node->left->next = node;
01337       break;
01338     case CONCAT:
01339       node->left->next = node->right->first;
01340       node->right->next = node->next;
01341       break;
01342     default:
01343       if (node->left)
01344        node->left->next = node->next;
01345       if (node->right)
01346         node->right->next = node->next;
01347       break;
01348     }
01349   return REG_NOERROR;
01350 }
01351 
01352 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
01353 static reg_errcode_t
01354 link_nfa_nodes (void *extra, bin_tree_t *node)
01355 {
01356   re_dfa_t *dfa = (re_dfa_t *) extra;
01357   int idx = node->node_idx;
01358   reg_errcode_t err = REG_NOERROR;
01359 
01360   switch (node->token.type)
01361     {
01362     case CONCAT:
01363       break;
01364 
01365     case END_OF_RE:
01366       assert (node->next == NULL);
01367       break;
01368 
01369     case OP_DUP_ASTERISK:
01370     case OP_ALT:
01371       {
01372        int left, right;
01373        dfa->has_plural_match = 1;
01374        if (node->left != NULL)
01375          left = node->left->first->node_idx;
01376        else
01377          left = node->next->node_idx;
01378        if (node->right != NULL)
01379          right = node->right->first->node_idx;
01380        else
01381          right = node->next->node_idx;
01382        assert (left > -1);
01383        assert (right > -1);
01384        err = re_node_set_init_2 (dfa->edests + idx, left, right);
01385       }
01386       break;
01387 
01388     case ANCHOR:
01389     case OP_OPEN_SUBEXP:
01390     case OP_CLOSE_SUBEXP:
01391       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
01392       break;
01393 
01394     case OP_BACK_REF:
01395       dfa->nexts[idx] = node->next->node_idx;
01396       if (node->token.type == OP_BACK_REF)
01397        re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
01398       break;
01399 
01400     default:
01401       assert (!IS_EPSILON_NODE (node->token.type));
01402       dfa->nexts[idx] = node->next->node_idx;
01403       break;
01404     }
01405 
01406   return err;
01407 }
01408 
01409 /* Duplicate the epsilon closure of the node ROOT_NODE.
01410    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
01411    to their own constraint.  */
01412 
01413 static reg_errcode_t
01414 internal_function
01415 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
01416                      int root_node, unsigned int init_constraint)
01417 {
01418   int org_node, clone_node, ret;
01419   unsigned int constraint = init_constraint;
01420   for (org_node = top_org_node, clone_node = top_clone_node;;)
01421     {
01422       int org_dest, clone_dest;
01423       if (dfa->nodes[org_node].type == OP_BACK_REF)
01424        {
01425          /* If the back reference epsilon-transit, its destination must
01426             also have the constraint.  Then duplicate the epsilon closure
01427             of the destination of the back reference, and store it in
01428             edests of the back reference.  */
01429          org_dest = dfa->nexts[org_node];
01430          re_node_set_empty (dfa->edests + clone_node);
01431          clone_dest = duplicate_node (dfa, org_dest, constraint);
01432          if (BE (clone_dest == -1, 0))
01433            return REG_ESPACE;
01434          dfa->nexts[clone_node] = dfa->nexts[org_node];
01435          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01436          if (BE (ret < 0, 0))
01437            return REG_ESPACE;
01438        }
01439       else if (dfa->edests[org_node].nelem == 0)
01440        {
01441          /* In case of the node can't epsilon-transit, don't duplicate the
01442             destination and store the original destination as the
01443             destination of the node.  */
01444          dfa->nexts[clone_node] = dfa->nexts[org_node];
01445          break;
01446        }
01447       else if (dfa->edests[org_node].nelem == 1)
01448        {
01449          /* In case of the node can epsilon-transit, and it has only one
01450             destination.  */
01451          org_dest = dfa->edests[org_node].elems[0];
01452          re_node_set_empty (dfa->edests + clone_node);
01453          /* If the node is root_node itself, it means the epsilon clsoure
01454             has a loop.   Then tie it to the destination of the root_node.  */
01455          if (org_node == root_node && clone_node != org_node)
01456            {
01457              ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
01458              if (BE (ret < 0, 0))
01459               return REG_ESPACE;
01460              break;
01461            }
01462          /* In case of the node has another constraint, add it.  */
01463          constraint |= dfa->nodes[org_node].constraint;
01464          clone_dest = duplicate_node (dfa, org_dest, constraint);
01465          if (BE (clone_dest == -1, 0))
01466            return REG_ESPACE;
01467          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01468          if (BE (ret < 0, 0))
01469            return REG_ESPACE;
01470        }
01471       else /* dfa->edests[org_node].nelem == 2 */
01472        {
01473          /* In case of the node can epsilon-transit, and it has two
01474             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
01475          org_dest = dfa->edests[org_node].elems[0];
01476          re_node_set_empty (dfa->edests + clone_node);
01477          /* Search for a duplicated node which satisfies the constraint.  */
01478          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
01479          if (clone_dest == -1)
01480            {
01481              /* There is no such duplicated node, create a new one.  */
01482              reg_errcode_t err;
01483              clone_dest = duplicate_node (dfa, org_dest, constraint);
01484              if (BE (clone_dest == -1, 0))
01485               return REG_ESPACE;
01486              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01487              if (BE (ret < 0, 0))
01488               return REG_ESPACE;
01489              err = duplicate_node_closure (dfa, org_dest, clone_dest,
01490                                        root_node, constraint);
01491              if (BE (err != REG_NOERROR, 0))
01492               return err;
01493            }
01494          else
01495            {
01496              /* There is a duplicated node which satisfies the constraint,
01497                use it to avoid infinite loop.  */
01498              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01499              if (BE (ret < 0, 0))
01500               return REG_ESPACE;
01501            }
01502 
01503          org_dest = dfa->edests[org_node].elems[1];
01504          clone_dest = duplicate_node (dfa, org_dest, constraint);
01505          if (BE (clone_dest == -1, 0))
01506            return REG_ESPACE;
01507          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01508          if (BE (ret < 0, 0))
01509            return REG_ESPACE;
01510        }
01511       org_node = org_dest;
01512       clone_node = clone_dest;
01513     }
01514   return REG_NOERROR;
01515 }
01516 
01517 /* Search for a node which is duplicated from the node ORG_NODE, and
01518    satisfies the constraint CONSTRAINT.  */
01519 
01520 static int
01521 search_duplicated_node (const re_dfa_t *dfa, int org_node,
01522                      unsigned int constraint)
01523 {
01524   int idx;
01525   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
01526     {
01527       if (org_node == dfa->org_indices[idx]
01528          && constraint == dfa->nodes[idx].constraint)
01529        return idx; /* Found.  */
01530     }
01531   return -1; /* Not found.  */
01532 }
01533 
01534 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
01535    Return the index of the new node, or -1 if insufficient storage is
01536    available.  */
01537 
01538 static int
01539 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
01540 {
01541   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
01542   if (BE (dup_idx != -1, 1))
01543     {
01544       dfa->nodes[dup_idx].constraint = constraint;
01545       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
01546       dfa->nodes[dup_idx].duplicated = 1;
01547 
01548       /* Store the index of the original node.  */
01549       dfa->org_indices[dup_idx] = org_idx;
01550     }
01551   return dup_idx;
01552 }
01553 
01554 static reg_errcode_t
01555 calc_inveclosure (re_dfa_t *dfa)
01556 {
01557   int src, idx, ret;
01558   for (idx = 0; idx < dfa->nodes_len; ++idx)
01559     re_node_set_init_empty (dfa->inveclosures + idx);
01560 
01561   for (src = 0; src < dfa->nodes_len; ++src)
01562     {
01563       int *elems = dfa->eclosures[src].elems;
01564       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
01565        {
01566          ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
01567          if (BE (ret == -1, 0))
01568            return REG_ESPACE;
01569        }
01570     }
01571 
01572   return REG_NOERROR;
01573 }
01574 
01575 /* Calculate "eclosure" for all the node in DFA.  */
01576 
01577 static reg_errcode_t
01578 calc_eclosure (re_dfa_t *dfa)
01579 {
01580   int node_idx, incomplete;
01581 #ifdef DEBUG
01582   assert (dfa->nodes_len > 0);
01583 #endif
01584   incomplete = 0;
01585   /* For each nodes, calculate epsilon closure.  */
01586   for (node_idx = 0; ; ++node_idx)
01587     {
01588       reg_errcode_t err;
01589       re_node_set eclosure_elem;
01590       if (node_idx == dfa->nodes_len)
01591        {
01592          if (!incomplete)
01593            break;
01594          incomplete = 0;
01595          node_idx = 0;
01596        }
01597 
01598 #ifdef DEBUG
01599       assert (dfa->eclosures[node_idx].nelem != -1);
01600 #endif
01601 
01602       /* If we have already calculated, skip it.  */
01603       if (dfa->eclosures[node_idx].nelem != 0)
01604        continue;
01605       /* Calculate epsilon closure of `node_idx'.  */
01606       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
01607       if (BE (err != REG_NOERROR, 0))
01608        return err;
01609 
01610       if (dfa->eclosures[node_idx].nelem == 0)
01611        {
01612          incomplete = 1;
01613          re_node_set_free (&eclosure_elem);
01614        }
01615     }
01616   return REG_NOERROR;
01617 }
01618 
01619 /* Calculate epsilon closure of NODE.  */
01620 
01621 static reg_errcode_t
01622 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
01623 {
01624   reg_errcode_t err;
01625   int i, incomplete;
01626   re_node_set eclosure;
01627   incomplete = 0;
01628   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
01629   if (BE (err != REG_NOERROR, 0))
01630     return err;
01631 
01632   /* This indicates that we are calculating this node now.
01633      We reference this value to avoid infinite loop.  */
01634   dfa->eclosures[node].nelem = -1;
01635 
01636   /* If the current node has constraints, duplicate all nodes
01637      since they must inherit the constraints.  */
01638   if (dfa->nodes[node].constraint
01639       && dfa->edests[node].nelem
01640       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
01641     {
01642       err = duplicate_node_closure (dfa, node, node, node,
01643                                 dfa->nodes[node].constraint);
01644       if (BE (err != REG_NOERROR, 0))
01645        return err;
01646     }
01647 
01648   /* Expand each epsilon destination nodes.  */
01649   if (IS_EPSILON_NODE(dfa->nodes[node].type))
01650     for (i = 0; i < dfa->edests[node].nelem; ++i)
01651       {
01652        re_node_set eclosure_elem;
01653        int edest = dfa->edests[node].elems[i];
01654        /* If calculating the epsilon closure of `edest' is in progress,
01655           return intermediate result.  */
01656        if (dfa->eclosures[edest].nelem == -1)
01657          {
01658            incomplete = 1;
01659            continue;
01660          }
01661        /* If we haven't calculated the epsilon closure of `edest' yet,
01662           calculate now. Otherwise use calculated epsilon closure.  */
01663        if (dfa->eclosures[edest].nelem == 0)
01664          {
01665            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
01666            if (BE (err != REG_NOERROR, 0))
01667              return err;
01668          }
01669        else
01670          eclosure_elem = dfa->eclosures[edest];
01671        /* Merge the epsilon closure of `edest'.  */
01672        re_node_set_merge (&eclosure, &eclosure_elem);
01673        /* If the epsilon closure of `edest' is incomplete,
01674           the epsilon closure of this node is also incomplete.  */
01675        if (dfa->eclosures[edest].nelem == 0)
01676          {
01677            incomplete = 1;
01678            re_node_set_free (&eclosure_elem);
01679          }
01680       }
01681 
01682   /* Epsilon closures include itself.  */
01683   re_node_set_insert (&eclosure, node);
01684   if (incomplete && !root)
01685     dfa->eclosures[node].nelem = 0;
01686   else
01687     dfa->eclosures[node] = eclosure;
01688   *new_set = eclosure;
01689   return REG_NOERROR;
01690 }
01691 
01692 /* Functions for token which are used in the parser.  */
01693 
01694 /* Fetch a token from INPUT.
01695    We must not use this function inside bracket expressions.  */
01696 
01697 static void
01698 internal_function
01699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
01700 {
01701   re_string_skip_bytes (input, peek_token (result, input, syntax));
01702 }
01703 
01704 /* Peek a token from INPUT, and return the length of the token.
01705    We must not use this function inside bracket expressions.  */
01706 
01707 static int
01708 internal_function
01709 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
01710 {
01711   unsigned char c;
01712 
01713   if (re_string_eoi (input))
01714     {
01715       token->type = END_OF_RE;
01716       return 0;
01717     }
01718 
01719   c = re_string_peek_byte (input, 0);
01720   token->opr.c = c;
01721 
01722   token->word_char = 0;
01723 #ifdef RE_ENABLE_I18N
01724   token->mb_partial = 0;
01725   if (input->mb_cur_max > 1 &&
01726       !re_string_first_byte (input, re_string_cur_idx (input)))
01727     {
01728       token->type = CHARACTER;
01729       token->mb_partial = 1;
01730       return 1;
01731     }
01732 #endif
01733   if (c == '\\')
01734     {
01735       unsigned char c2;
01736       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
01737        {
01738          token->type = BACK_SLASH;
01739          return 1;
01740        }
01741 
01742       c2 = re_string_peek_byte_case (input, 1);
01743       token->opr.c = c2;
01744       token->type = CHARACTER;
01745 #ifdef RE_ENABLE_I18N
01746       if (input->mb_cur_max > 1)
01747        {
01748          wint_t wc = re_string_wchar_at (input,
01749                                      re_string_cur_idx (input) + 1);
01750          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
01751        }
01752       else
01753 #endif
01754        token->word_char = IS_WORD_CHAR (c2) != 0;
01755 
01756       switch (c2)
01757        {
01758        case '|':
01759          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
01760            token->type = OP_ALT;
01761          break;
01762        case '1': case '2': case '3': case '4': case '5':
01763        case '6': case '7': case '8': case '9':
01764          if (!(syntax & RE_NO_BK_REFS))
01765            {
01766              token->type = OP_BACK_REF;
01767              token->opr.idx = c2 - '1';
01768            }
01769          break;
01770        case '<':
01771          if (!(syntax & RE_NO_GNU_OPS))
01772            {
01773              token->type = ANCHOR;
01774              token->opr.ctx_type = WORD_FIRST;
01775            }
01776          break;
01777        case '>':
01778          if (!(syntax & RE_NO_GNU_OPS))
01779            {
01780              token->type = ANCHOR;
01781              token->opr.ctx_type = WORD_LAST;
01782            }
01783          break;
01784        case 'b':
01785          if (!(syntax & RE_NO_GNU_OPS))
01786            {
01787              token->type = ANCHOR;
01788              token->opr.ctx_type = WORD_DELIM;
01789            }
01790          break;
01791        case 'B':
01792          if (!(syntax & RE_NO_GNU_OPS))
01793            {
01794              token->type = ANCHOR;
01795              token->opr.ctx_type = NOT_WORD_DELIM;
01796            }
01797          break;
01798        case 'w':
01799          if (!(syntax & RE_NO_GNU_OPS))
01800            token->type = OP_WORD;
01801          break;
01802        case 'W':
01803          if (!(syntax & RE_NO_GNU_OPS))
01804            token->type = OP_NOTWORD;
01805          break;
01806        case 's':
01807          if (!(syntax & RE_NO_GNU_OPS))
01808            token->type = OP_SPACE;
01809          break;
01810        case 'S':
01811          if (!(syntax & RE_NO_GNU_OPS))
01812            token->type = OP_NOTSPACE;
01813          break;
01814        case '`':
01815          if (!(syntax & RE_NO_GNU_OPS))
01816            {
01817              token->type = ANCHOR;
01818              token->opr.ctx_type = BUF_FIRST;
01819            }
01820          break;
01821        case '\'':
01822          if (!(syntax & RE_NO_GNU_OPS))
01823            {
01824              token->type = ANCHOR;
01825              token->opr.ctx_type = BUF_LAST;
01826            }
01827          break;
01828        case '(':
01829          if (!(syntax & RE_NO_BK_PARENS))
01830            token->type = OP_OPEN_SUBEXP;
01831          break;
01832        case ')':
01833          if (!(syntax & RE_NO_BK_PARENS))
01834            token->type = OP_CLOSE_SUBEXP;
01835          break;
01836        case '+':
01837          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
01838            token->type = OP_DUP_PLUS;
01839          break;
01840        case '?':
01841          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
01842            token->type = OP_DUP_QUESTION;
01843          break;
01844        case '{':
01845          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
01846            token->type = OP_OPEN_DUP_NUM;
01847          break;
01848        case '}':
01849          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
01850            token->type = OP_CLOSE_DUP_NUM;
01851          break;
01852        default:
01853          break;
01854        }
01855       return 2;
01856     }
01857 
01858   token->type = CHARACTER;
01859 #ifdef RE_ENABLE_I18N
01860   if (input->mb_cur_max > 1)
01861     {
01862       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
01863       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
01864     }
01865   else
01866 #endif
01867     token->word_char = IS_WORD_CHAR (token->opr.c);
01868 
01869   switch (c)
01870     {
01871     case '\n':
01872       if (syntax & RE_NEWLINE_ALT)
01873        token->type = OP_ALT;
01874       break;
01875     case '|':
01876       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
01877        token->type = OP_ALT;
01878       break;
01879     case '*':
01880       token->type = OP_DUP_ASTERISK;
01881       break;
01882     case '+':
01883       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
01884        token->type = OP_DUP_PLUS;
01885       break;
01886     case '?':
01887       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
01888        token->type = OP_DUP_QUESTION;
01889       break;
01890     case '{':
01891       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
01892        token->type = OP_OPEN_DUP_NUM;
01893       break;
01894     case '}':
01895       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
01896        token->type = OP_CLOSE_DUP_NUM;
01897       break;
01898     case '(':
01899       if (syntax & RE_NO_BK_PARENS)
01900        token->type = OP_OPEN_SUBEXP;
01901       break;
01902     case ')':
01903       if (syntax & RE_NO_BK_PARENS)
01904        token->type = OP_CLOSE_SUBEXP;
01905       break;
01906     case '[':
01907       token->type = OP_OPEN_BRACKET;
01908       break;
01909     case '.':
01910       token->type = OP_PERIOD;
01911       break;
01912     case '^':
01913       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
01914          re_string_cur_idx (input) != 0)
01915        {
01916          char prev = re_string_peek_byte (input, -1);
01917          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
01918            break;
01919        }
01920       token->type = ANCHOR;
01921       token->opr.ctx_type = LINE_FIRST;
01922       break;
01923     case '$':
01924       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
01925          re_string_cur_idx (input) + 1 != re_string_length (input))
01926        {
01927          re_token_t next;
01928          re_string_skip_bytes (input, 1);
01929          peek_token (&next, input, syntax);
01930          re_string_skip_bytes (input, -1);
01931          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
01932            break;
01933        }
01934       token->type = ANCHOR;
01935       token->opr.ctx_type = LINE_LAST;
01936       break;
01937     default:
01938       break;
01939     }
01940   return 1;
01941 }
01942 
01943 /* Peek a token from INPUT, and return the length of the token.
01944    We must not use this function out of bracket expressions.  */
01945 
01946 static int
01947 internal_function
01948 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
01949 {
01950   unsigned char c;
01951   if (re_string_eoi (input))
01952     {
01953       token->type = END_OF_RE;
01954       return 0;
01955     }
01956   c = re_string_peek_byte (input, 0);
01957   token->opr.c = c;
01958 
01959 #ifdef RE_ENABLE_I18N
01960   if (input->mb_cur_max > 1 &&
01961       !re_string_first_byte (input, re_string_cur_idx (input)))
01962     {
01963       token->type = CHARACTER;
01964       return 1;
01965     }
01966 #endif /* RE_ENABLE_I18N */
01967 
01968   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
01969       && re_string_cur_idx (input) + 1 < re_string_length (input))
01970     {
01971       /* In this case, '\' escape a character.  */
01972       unsigned char c2;
01973       re_string_skip_bytes (input, 1);
01974       c2 = re_string_peek_byte (input, 0);
01975       token->opr.c = c2;
01976       token->type = CHARACTER;
01977       return 1;
01978     }
01979   if (c == '[') /* '[' is a special char in a bracket exps.  */
01980     {
01981       unsigned char c2;
01982       int token_len;
01983       if (re_string_cur_idx (input) + 1 < re_string_length (input))
01984        c2 = re_string_peek_byte (input, 1);
01985       else
01986        c2 = 0;
01987       token->opr.c = c2;
01988       token_len = 2;
01989       switch (c2)
01990        {
01991        case '.':
01992          token->type = OP_OPEN_COLL_ELEM;
01993          break;
01994        case '=':
01995          token->type = OP_OPEN_EQUIV_CLASS;
01996          break;
01997        case ':':
01998          if (syntax & RE_CHAR_CLASSES)
01999            {
02000              token->type = OP_OPEN_CHAR_CLASS;
02001              break;
02002            }
02003          /* else fall through.  */
02004        default:
02005          token->type = CHARACTER;
02006          token->opr.c = c;
02007          token_len = 1;
02008          break;
02009        }
02010       return token_len;
02011     }
02012   switch (c)
02013     {
02014     case '-':
02015       token->type = OP_CHARSET_RANGE;
02016       break;
02017     case ']':
02018       token->type = OP_CLOSE_BRACKET;
02019       break;
02020     case '^':
02021       token->type = OP_NON_MATCH_LIST;
02022       break;
02023     default:
02024       token->type = CHARACTER;
02025     }
02026   return 1;
02027 }
02028 
02029 /* Functions for parser.  */
02030 
02031 /* Entry point of the parser.
02032    Parse the regular expression REGEXP and return the structure tree.
02033    If an error is occured, ERR is set by error code, and return NULL.
02034    This function build the following tree, from regular expression <reg_exp>:
02035           CAT
02036           / \
02037          /   \
02038    <reg_exp>  EOR
02039 
02040    CAT means concatenation.
02041    EOR means end of regular expression.  */
02042 
02043 static bin_tree_t *
02044 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
02045        reg_errcode_t *err)
02046 {
02047   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02048   bin_tree_t *tree, *eor, *root;
02049   re_token_t current_token;
02050   dfa->syntax = syntax;
02051   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02052   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
02053   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02054     return NULL;
02055   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
02056   if (tree != NULL)
02057     root = create_tree (dfa, tree, eor, CONCAT);
02058   else
02059     root = eor;
02060   if (BE (eor == NULL || root == NULL, 0))
02061     {
02062       *err = REG_ESPACE;
02063       return NULL;
02064     }
02065   return root;
02066 }
02067 
02068 /* This function build the following tree, from regular expression
02069    <branch1>|<branch2>:
02070           ALT
02071           / \
02072          /   \
02073    <branch1> <branch2>
02074 
02075    ALT means alternative, which represents the operator `|'.  */
02076 
02077 static bin_tree_t *
02078 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
02079               reg_syntax_t syntax, int nest, reg_errcode_t *err)
02080 {
02081   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02082   bin_tree_t *tree, *branch = NULL;
02083   tree = parse_branch (regexp, preg, token, syntax, nest, err);
02084   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02085     return NULL;
02086 
02087   while (token->type == OP_ALT)
02088     {
02089       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02090       if (token->type != OP_ALT && token->type != END_OF_RE
02091          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
02092        {
02093          branch = parse_branch (regexp, preg, token, syntax, nest, err);
02094          if (BE (*err != REG_NOERROR && branch == NULL, 0))
02095            return NULL;
02096        }
02097       else
02098        branch = NULL;
02099       tree = create_tree (dfa, tree, branch, OP_ALT);
02100       if (BE (tree == NULL, 0))
02101        {
02102          *err = REG_ESPACE;
02103          return NULL;
02104        }
02105     }
02106   return tree;
02107 }
02108 
02109 /* This function build the following tree, from regular expression
02110    <exp1><exp2>:
02111        CAT
02112        / \
02113        /   \
02114    <exp1> <exp2>
02115 
02116    CAT means concatenation.  */
02117 
02118 static bin_tree_t *
02119 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
02120              reg_syntax_t syntax, int nest, reg_errcode_t *err)
02121 {
02122   bin_tree_t *tree, *exp;
02123   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02124   tree = parse_expression (regexp, preg, token, syntax, nest, err);
02125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02126     return NULL;
02127 
02128   while (token->type != OP_ALT && token->type != END_OF_RE
02129         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
02130     {
02131       exp = parse_expression (regexp, preg, token, syntax, nest, err);
02132       if (BE (*err != REG_NOERROR && exp == NULL, 0))
02133        {
02134          return NULL;
02135        }
02136       if (tree != NULL && exp != NULL)
02137        {
02138          tree = create_tree (dfa, tree, exp, CONCAT);
02139          if (tree == NULL)
02140            {
02141              *err = REG_ESPACE;
02142              return NULL;
02143            }
02144        }
02145       else if (tree == NULL)
02146        tree = exp;
02147       /* Otherwise exp == NULL, we don't need to create new tree.  */
02148     }
02149   return tree;
02150 }
02151 
02152 /* This function build the following tree, from regular expression a*:
02153         *
02154         |
02155         a
02156 */
02157 
02158 static bin_tree_t *
02159 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
02160                 reg_syntax_t syntax, int nest, reg_errcode_t *err)
02161 {
02162   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02163   bin_tree_t *tree;
02164   switch (token->type)
02165     {
02166     case CHARACTER:
02167       tree = create_token_tree (dfa, NULL, NULL, token);
02168       if (BE (tree == NULL, 0))
02169        {
02170          *err = REG_ESPACE;
02171          return NULL;
02172        }
02173 #ifdef RE_ENABLE_I18N
02174       if (dfa->mb_cur_max > 1)
02175        {
02176          while (!re_string_eoi (regexp)
02177                && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
02178            {
02179              bin_tree_t *mbc_remain;
02180              fetch_token (token, regexp, syntax);
02181              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
02182              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
02183              if (BE (mbc_remain == NULL || tree == NULL, 0))
02184               {
02185                 *err = REG_ESPACE;
02186                 return NULL;
02187               }
02188            }
02189        }
02190 #endif
02191       break;
02192     case OP_OPEN_SUBEXP:
02193       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
02194       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02195        return NULL;
02196       break;
02197     case OP_OPEN_BRACKET:
02198       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
02199       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02200        return NULL;
02201       break;
02202     case OP_BACK_REF:
02203       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
02204        {
02205          *err = REG_ESUBREG;
02206          return NULL;
02207        }
02208       dfa->used_bkref_map |= 1 << token->opr.idx;
02209       tree = create_token_tree (dfa, NULL, NULL, token);
02210       if (BE (tree == NULL, 0))
02211        {
02212          *err = REG_ESPACE;
02213          return NULL;
02214        }
02215       ++dfa->nbackref;
02216       dfa->has_mb_node = 1;
02217       break;
02218     case OP_OPEN_DUP_NUM:
02219       if (syntax & RE_CONTEXT_INVALID_DUP)
02220        {
02221          *err = REG_BADRPT;
02222          return NULL;
02223        }
02224       /* FALLTHROUGH */
02225     case OP_DUP_ASTERISK:
02226     case OP_DUP_PLUS:
02227     case OP_DUP_QUESTION:
02228       if (syntax & RE_CONTEXT_INVALID_OPS)
02229        {
02230          *err = REG_BADRPT;
02231          return NULL;
02232        }
02233       else if (syntax & RE_CONTEXT_INDEP_OPS)
02234        {
02235          fetch_token (token, regexp, syntax);
02236          return parse_expression (regexp, preg, token, syntax, nest, err);
02237        }
02238       /* else fall through  */
02239     case OP_CLOSE_SUBEXP:
02240       if ((token->type == OP_CLOSE_SUBEXP) &&
02241          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
02242        {
02243          *err = REG_ERPAREN;
02244          return NULL;
02245        }
02246       /* else fall through  */
02247     case OP_CLOSE_DUP_NUM:
02248       /* We treat it as a normal character.  */
02249 
02250       /* Then we can these characters as normal characters.  */
02251       token->type = CHARACTER;
02252       /* mb_partial and word_char bits should be initialized already
02253         by peek_token.  */
02254       tree = create_token_tree (dfa, NULL, NULL, token);
02255       if (BE (tree == NULL, 0))
02256        {
02257          *err = REG_ESPACE;
02258          return NULL;
02259        }
02260       break;
02261     case ANCHOR:
02262       if ((token->opr.ctx_type
02263           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
02264          && dfa->word_ops_used == 0)
02265        init_word_char (dfa);
02266       if (token->opr.ctx_type == WORD_DELIM
02267           || token->opr.ctx_type == NOT_WORD_DELIM)
02268        {
02269          bin_tree_t *tree_first, *tree_last;
02270          if (token->opr.ctx_type == WORD_DELIM)
02271            {
02272              token->opr.ctx_type = WORD_FIRST;
02273              tree_first = create_token_tree (dfa, NULL, NULL, token);
02274              token->opr.ctx_type = WORD_LAST;
02275             }
02276           else
02277             {
02278              token->opr.ctx_type = INSIDE_WORD;
02279              tree_first = create_token_tree (dfa, NULL, NULL, token);
02280              token->opr.ctx_type = INSIDE_NOTWORD;
02281             }
02282          tree_last = create_token_tree (dfa, NULL, NULL, token);
02283          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
02284          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
02285            {
02286              *err = REG_ESPACE;
02287              return NULL;
02288            }
02289        }
02290       else
02291        {
02292          tree = create_token_tree (dfa, NULL, NULL, token);
02293          if (BE (tree == NULL, 0))
02294            {
02295              *err = REG_ESPACE;
02296              return NULL;
02297            }
02298        }
02299       /* We must return here, since ANCHORs can't be followed
02300         by repetition operators.
02301         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
02302             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
02303       fetch_token (token, regexp, syntax);
02304       return tree;
02305     case OP_PERIOD:
02306       tree = create_token_tree (dfa, NULL, NULL, token);
02307       if (BE (tree == NULL, 0))
02308        {
02309          *err = REG_ESPACE;
02310          return NULL;
02311        }
02312       if (dfa->mb_cur_max > 1)
02313        dfa->has_mb_node = 1;
02314       break;
02315     case OP_WORD:
02316     case OP_NOTWORD:
02317       tree = build_charclass_op (dfa, regexp->trans,
02318                              (const unsigned char *) "alnum",
02319                              (const unsigned char *) "_",
02320                              token->type == OP_NOTWORD, err);
02321       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02322        return NULL;
02323       break;
02324     case OP_SPACE:
02325     case OP_NOTSPACE:
02326       tree = build_charclass_op (dfa, regexp->trans,
02327                              (const unsigned char *) "space",
02328                              (const unsigned char *) "",
02329                              token->type == OP_NOTSPACE, err);
02330       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02331        return NULL;
02332       break;
02333     case OP_ALT:
02334     case END_OF_RE:
02335       return NULL;
02336     case BACK_SLASH:
02337       *err = REG_EESCAPE;
02338       return NULL;
02339     default:
02340       /* Must not happen?  */
02341 #ifdef DEBUG
02342       assert (0);
02343 #endif
02344       return NULL;
02345     }
02346   fetch_token (token, regexp, syntax);
02347 
02348   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
02349         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
02350     {
02351       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
02352       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02353        return NULL;
02354       /* In BRE consecutive duplications are not allowed.  */
02355       if ((syntax & RE_CONTEXT_INVALID_DUP)
02356          && (token->type == OP_DUP_ASTERISK
02357              || token->type == OP_OPEN_DUP_NUM))
02358        {
02359          *err = REG_BADRPT;
02360          return NULL;
02361        }
02362     }
02363 
02364   return tree;
02365 }
02366 
02367 /* This function build the following tree, from regular expression
02368    (<reg_exp>):
02369         SUBEXP
02370            |
02371        <reg_exp>
02372 */
02373 
02374 static bin_tree_t *
02375 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
02376               reg_syntax_t syntax, int nest, reg_errcode_t *err)
02377 {
02378   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02379   bin_tree_t *tree;
02380   size_t cur_nsub;
02381   cur_nsub = preg->re_nsub++;
02382 
02383   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02384 
02385   /* The subexpression may be a null string.  */
02386   if (token->type == OP_CLOSE_SUBEXP)
02387     tree = NULL;
02388   else
02389     {
02390       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
02391       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
02392         *err = REG_EPAREN;
02393       if (BE (*err != REG_NOERROR, 0))
02394        return NULL;
02395     }
02396 
02397   if (cur_nsub <= '9' - '1')
02398     dfa->completed_bkref_map |= 1 << cur_nsub;
02399 
02400   tree = create_tree (dfa, tree, NULL, SUBEXP);
02401   if (BE (tree == NULL, 0))
02402     {
02403       *err = REG_ESPACE;
02404       return NULL;
02405     }
02406   tree->token.opr.idx = cur_nsub;
02407   return tree;
02408 }
02409 
02410 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
02411 
02412 static bin_tree_t *
02413 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
02414              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
02415 {
02416   bin_tree_t *tree = NULL, *old_tree = NULL;
02417   int i, start, end, start_idx = re_string_cur_idx (regexp);
02418   re_token_t start_token = *token;
02419 
02420   if (token->type == OP_OPEN_DUP_NUM)
02421     {
02422       end = 0;
02423       start = fetch_number (regexp, token, syntax);
02424       if (start == -1)
02425        {
02426          if (token->type == CHARACTER && token->opr.c == ',')
02427            start = 0; /* We treat "{,m}" as "{0,m}".  */
02428          else
02429            {
02430              *err = REG_BADBR; /* <re>{} is invalid.  */
02431              return NULL;
02432            }
02433        }
02434       if (BE (start != -2, 1))
02435        {
02436          /* We treat "{n}" as "{n,n}".  */
02437          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
02438                : ((token->type == CHARACTER && token->opr.c == ',')
02439                   ? fetch_number (regexp, token, syntax) : -2));
02440        }
02441       if (BE (start == -2 || end == -2, 0))
02442        {
02443          /* Invalid sequence.  */
02444          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
02445            {
02446              if (token->type == END_OF_RE)
02447               *err = REG_EBRACE;
02448              else
02449               *err = REG_BADBR;
02450 
02451              return NULL;
02452            }
02453 
02454          /* If the syntax bit is set, rollback.  */
02455          re_string_set_index (regexp, start_idx);
02456          *token = start_token;
02457          token->type = CHARACTER;
02458          /* mb_partial and word_char bits should be already initialized by
02459             peek_token.  */
02460          return elem;
02461        }
02462 
02463       if (BE (end != -1 && start > end, 0))
02464        {
02465          /* First number greater than second.  */
02466          *err = REG_BADBR;
02467          return NULL;
02468        }
02469     }
02470   else
02471     {
02472       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
02473       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
02474     }
02475 
02476   fetch_token (token, regexp, syntax);
02477 
02478   if (BE (elem == NULL, 0))
02479     return NULL;
02480   if (BE (start == 0 && end == 0, 0))
02481     {
02482       postorder (elem, free_tree, NULL);
02483       return NULL;
02484     }
02485 
02486   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
02487   if (BE (start > 0, 0))
02488     {
02489       tree = elem;
02490       for (i = 2; i <= start; ++i)
02491        {
02492          elem = duplicate_tree (elem, dfa);
02493          tree = create_tree (dfa, tree, elem, CONCAT);
02494          if (BE (elem == NULL || tree == NULL, 0))
02495            goto parse_dup_op_espace;
02496        }
02497 
02498       if (start == end)
02499        return tree;
02500 
02501       /* Duplicate ELEM before it is marked optional.  */
02502       elem = duplicate_tree (elem, dfa);
02503       old_tree = tree;
02504     }
02505   else
02506     old_tree = NULL;
02507 
02508   if (elem->token.type == SUBEXP)
02509     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
02510 
02511   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
02512   if (BE (tree == NULL, 0))
02513     goto parse_dup_op_espace;
02514 
02515   /* This loop is actually executed only when end != -1,
02516      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
02517      already created the start+1-th copy.  */
02518   for (i = start + 2; i <= end; ++i)
02519     {
02520       elem = duplicate_tree (elem, dfa);
02521       tree = create_tree (dfa, tree, elem, CONCAT);
02522       if (BE (elem == NULL || tree == NULL, 0))
02523         goto parse_dup_op_espace;
02524 
02525       tree = create_tree (dfa, tree, NULL, OP_ALT);
02526       if (BE (tree == NULL, 0))
02527         goto parse_dup_op_espace;
02528     }
02529 
02530   if (old_tree)
02531     tree = create_tree (dfa, old_tree, tree, CONCAT);
02532 
02533   return tree;
02534 
02535  parse_dup_op_espace:
02536   *err = REG_ESPACE;
02537   return NULL;
02538 }
02539 
02540 /* Size of the names for collating symbol/equivalence_class/character_class.
02541    I'm not sure, but maybe enough.  */
02542 #define BRACKET_NAME_BUF_SIZE 32
02543 
02544 #ifndef _LIBC
02545   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
02546      Build the range expression which starts from START_ELEM, and ends
02547      at END_ELEM.  The result are written to MBCSET and SBCSET.
02548      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
02549      mbcset->range_ends, is a pointer argument sinse we may
02550      update it.  */
02551 
02552 static reg_errcode_t
02553 internal_function
02554 # ifdef RE_ENABLE_I18N
02555 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
02556                bracket_elem_t *start_elem, bracket_elem_t *end_elem)
02557 # else /* not RE_ENABLE_I18N */
02558 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
02559                bracket_elem_t *end_elem)
02560 # endif /* not RE_ENABLE_I18N */
02561 {
02562   unsigned int start_ch, end_ch;
02563   /* Equivalence Classes and Character Classes can't be a range start/end.  */
02564   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
02565          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
02566          0))
02567     return REG_ERANGE;
02568 
02569   /* We can handle no multi character collating elements without libc
02570      support.  */
02571   if (BE ((start_elem->type == COLL_SYM
02572           && strlen ((char *) start_elem->opr.name) > 1)
02573          || (end_elem->type == COLL_SYM
02574              && strlen ((char *) end_elem->opr.name) > 1), 0))
02575     return REG_ECOLLATE;
02576 
02577 # ifdef RE_ENABLE_I18N
02578   {
02579     wchar_t wc;
02580     wint_t start_wc;
02581     wint_t end_wc;
02582     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
02583 
02584     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
02585               : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
02586                  : 0));
02587     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
02588              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
02589                : 0));
02590     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
02591               ? __btowc (start_ch) : start_elem->opr.wch);
02592     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
02593              ? __btowc (end_ch) : end_elem->opr.wch);
02594     if (start_wc == WEOF || end_wc == WEOF)
02595       return REG_ECOLLATE;
02596     cmp_buf[0] = start_wc;
02597     cmp_buf[4] = end_wc;
02598     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
02599       return REG_ERANGE;
02600 
02601     /* Got valid collation sequence values, add them as a new entry.
02602        However, for !_LIBC we have no collation elements: if the
02603        character set is single byte, the single byte character set
02604        that we build below suffices.  parse_bracket_exp passes
02605        no MBCSET if dfa->mb_cur_max == 1.  */
02606     if (mbcset)
02607       {
02608         /* Check the space of the arrays.  */
02609         if (BE (*range_alloc == mbcset->nranges, 0))
02610           {
02611            /* There is not enough space, need realloc.  */
02612            wchar_t *new_array_start, *new_array_end;
02613            int new_nranges;
02614 
02615            /* +1 in case of mbcset->nranges is 0.  */
02616            new_nranges = 2 * mbcset->nranges + 1;
02617            /* Use realloc since mbcset->range_starts and mbcset->range_ends
02618               are NULL if *range_alloc == 0.  */
02619            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
02620                                       new_nranges);
02621            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
02622                                     new_nranges);
02623 
02624            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
02625              return REG_ESPACE;
02626 
02627            mbcset->range_starts = new_array_start;
02628            mbcset->range_ends = new_array_end;
02629            *range_alloc = new_nranges;
02630           }
02631 
02632         mbcset->range_starts[mbcset->nranges] = start_wc;
02633         mbcset->range_ends[mbcset->nranges++] = end_wc;
02634       }
02635 
02636     /* Build the table for single byte characters.  */
02637     for (wc = 0; wc < SBC_MAX; ++wc)
02638       {
02639        cmp_buf[2] = wc;
02640        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
02641            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
02642          bitset_set (sbcset, wc);
02643       }
02644   }
02645 # else /* not RE_ENABLE_I18N */
02646   {
02647     unsigned int ch;
02648     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
02649               : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
02650                  : 0));
02651     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
02652              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
02653                : 0));
02654     if (start_ch > end_ch)
02655       return REG_ERANGE;
02656     /* Build the table for single byte characters.  */
02657     for (ch = 0; ch < SBC_MAX; ++ch)
02658       if (start_ch <= ch  && ch <= end_ch)
02659        bitset_set (sbcset, ch);
02660   }
02661 # endif /* not RE_ENABLE_I18N */
02662   return REG_NOERROR;
02663 }
02664 #endif /* not _LIBC */
02665 
02666 #ifndef _LIBC
02667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
02668    Build the collating element which is represented by NAME.
02669    The result are written to MBCSET and SBCSET.
02670    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
02671    pointer argument since we may update it.  */
02672 
02673 static reg_errcode_t
02674 internal_function
02675 # ifdef RE_ENABLE_I18N
02676 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
02677                      int *coll_sym_alloc, const unsigned char *name)
02678 # else /* not RE_ENABLE_I18N */
02679 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
02680 # endif /* not RE_ENABLE_I18N */
02681 {
02682   size_t name_len = strlen ((const char *) name);
02683   if (BE (name_len != 1, 0))
02684     return REG_ECOLLATE;
02685   else
02686     {
02687       bitset_set (sbcset, name[0]);
02688       return REG_NOERROR;
02689     }
02690 }
02691 #endif /* not _LIBC */
02692 
02693 /* This function parse bracket expression like "[abc]", "[a-c]",
02694    "[[.a-a.]]" etc.  */
02695 
02696 static bin_tree_t *
02697 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
02698                  reg_syntax_t syntax, reg_errcode_t *err)
02699 {
02700 #ifdef _LIBC
02701   const unsigned char *collseqmb;
02702   const char *collseqwc;
02703   uint32_t nrules;
02704   int32_t table_size;
02705   const int32_t *symb_table;
02706   const unsigned char *extra;
02707 
02708   /* Local function for parse_bracket_exp used in _LIBC environement.
02709      Seek the collating symbol entry correspondings to NAME.
02710      Return the index of the symbol in the SYMB_TABLE.  */
02711 
02712   auto inline int32_t
02713   __attribute ((always_inline))
02714   seek_collating_symbol_entry (name, name_len)
02715         const unsigned char *name;
02716         size_t name_len;
02717     {
02718       int32_t hash = elem_hash ((const char *) name, name_len);
02719       int32_t elem = hash % table_size;
02720       if (symb_table[2 * elem] != 0)
02721        {
02722          int32_t second = hash % (table_size - 2) + 1;
02723 
02724          do
02725            {
02726              /* First compare the hashing value.  */
02727              if (symb_table[2 * elem] == hash
02728                 /* Compare the length of the name.  */
02729                 && name_len == extra[symb_table[2 * elem + 1]]
02730                 /* Compare the name.  */
02731                 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
02732                           name_len) == 0)
02733               {
02734                 /* Yep, this is the entry.  */
02735                 break;
02736               }
02737 
02738              /* Next entry.  */
02739              elem += second;
02740            }
02741          while (symb_table[2 * elem] != 0);
02742        }
02743       return elem;
02744     }
02745 
02746   /* Local function for parse_bracket_exp used in _LIBC environment.
02747      Look up the collation sequence value of BR_ELEM.
02748      Return the value if succeeded, UINT_MAX otherwise.  */
02749 
02750   auto inline unsigned int
02751   __attribute ((always_inline))
02752   lookup_collation_sequence_value (br_elem)
02753         bracket_elem_t *br_elem;
02754     {
02755       if (br_elem->type == SB_CHAR)
02756        {
02757          /*
02758          if (MB_CUR_MAX == 1)
02759          */
02760          if (nrules == 0)
02761            return collseqmb[br_elem->opr.ch];
02762          else
02763            {
02764              wint_t wc = __btowc (br_elem->opr.ch);
02765              return __collseq_table_lookup (collseqwc, wc);
02766            }
02767        }
02768       else if (br_elem->type == MB_CHAR)
02769        {
02770          if (nrules != 0)
02771            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
02772        }
02773       else if (br_elem->type == COLL_SYM)
02774        {
02775          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
02776          if (nrules != 0)
02777            {
02778              int32_t elem, idx;
02779              elem = seek_collating_symbol_entry (br_elem->opr.name,
02780                                             sym_name_len);
02781              if (symb_table[2 * elem] != 0)
02782               {
02783                 /* We found the entry.  */
02784                 idx = symb_table[2 * elem + 1];
02785                 /* Skip the name of collating element name.  */
02786                 idx += 1 + extra[idx];
02787                 /* Skip the byte sequence of the collating element.  */
02788                 idx += 1 + extra[idx];
02789                 /* Adjust for the alignment.  */
02790                 idx = (idx + 3) & ~3;
02791                 /* Skip the multibyte collation sequence value.  */
02792                 idx += sizeof (unsigned int);
02793                 /* Skip the wide char sequence of the collating element.  */
02794                 idx += sizeof (unsigned int) *
02795                   (1 + *(unsigned int *) (extra + idx));
02796                 /* Return the collation sequence value.  */
02797                 return *(unsigned int *) (extra + idx);
02798               }
02799              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
02800               {
02801                 /* No valid character.  Match it as a single byte
02802                    character.  */
02803                 return collseqmb[br_elem->opr.name[0]];
02804               }
02805            }
02806          else if (sym_name_len == 1)
02807            return collseqmb[br_elem->opr.name[0]];
02808        }
02809       return UINT_MAX;
02810     }
02811 
02812   /* Local function for parse_bracket_exp used in _LIBC environement.
02813      Build the range expression which starts from START_ELEM, and ends
02814      at END_ELEM.  The result are written to MBCSET and SBCSET.
02815      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
02816      mbcset->range_ends, is a pointer argument sinse we may
02817      update it.  */
02818 
02819   auto inline reg_errcode_t
02820   __attribute ((always_inline))
02821   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
02822         re_charset_t *mbcset;
02823         int *range_alloc;
02824         bitset_t sbcset;
02825         bracket_elem_t *start_elem, *end_elem;
02826     {
02827       unsigned int ch;
02828       uint32_t start_collseq;
02829       uint32_t end_collseq;
02830 
02831       /* Equivalence Classes and Character Classes can't be a range
02832         start/end.  */
02833       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
02834              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
02835              0))
02836        return REG_ERANGE;
02837 
02838       start_collseq = lookup_collation_sequence_value (start_elem);
02839       end_collseq = lookup_collation_sequence_value (end_elem);
02840       /* Check start/end collation sequence values.  */
02841       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
02842        return REG_ECOLLATE;
02843       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
02844        return REG_ERANGE;
02845 
02846       /* Got valid collation sequence values, add them as a new entry.
02847         However, if we have no collation elements, and the character set
02848         is single byte, the single byte character set that we
02849         build below suffices. */
02850       if (nrules > 0 || dfa->mb_cur_max > 1)
02851        {
02852           /* Check the space of the arrays.  */
02853           if (BE (*range_alloc == mbcset->nranges, 0))
02854            {
02855              /* There is not enough space, need realloc.  */
02856              uint32_t *new_array_start;
02857              uint32_t *new_array_end;
02858              int new_nranges;
02859 
02860              /* +1 in case of mbcset->nranges is 0.  */
02861              new_nranges = 2 * mbcset->nranges + 1;
02862              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
02863                                        new_nranges);
02864              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
02865                                       new_nranges);
02866 
02867              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
02868                return REG_ESPACE;
02869 
02870              mbcset->range_starts = new_array_start;
02871              mbcset->range_ends = new_array_end;
02872              *range_alloc = new_nranges;
02873            }
02874 
02875           mbcset->range_starts[mbcset->nranges] = start_collseq;
02876           mbcset->range_ends[mbcset->nranges++] = end_collseq;
02877        }
02878 
02879       /* Build the table for single byte characters.  */
02880       for (ch = 0; ch < SBC_MAX; ch++)
02881        {
02882          uint32_t ch_collseq;
02883          /*
02884          if (MB_CUR_MAX == 1)
02885          */
02886          if (nrules == 0)
02887            ch_collseq = collseqmb[ch];
02888          else
02889            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
02890          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
02891            bitset_set (sbcset, ch);
02892        }
02893       return REG_NOERROR;
02894     }
02895 
02896   /* Local function for parse_bracket_exp used in _LIBC environement.
02897      Build the collating element which is represented by NAME.
02898      The result are written to MBCSET and SBCSET.
02899      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
02900      pointer argument sinse we may update it.  */
02901 
02902   auto inline reg_errcode_t
02903   __attribute ((always_inline))
02904   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
02905         re_charset_t *mbcset;
02906         int *coll_sym_alloc;
02907         bitset_t sbcset;
02908         const unsigned char *name;
02909     {
02910       int32_t elem, idx;
02911       size_t name_len = strlen ((const char *) name);
02912       if (nrules != 0)
02913        {
02914          elem = seek_collating_symbol_entry (name, name_len);
02915          if (symb_table[2 * elem] != 0)
02916            {
02917              /* We found the entry.  */
02918              idx = symb_table[2 * elem + 1];
02919              /* Skip the name of collating element name.  */
02920              idx += 1 + extra[idx];
02921            }
02922          else if (symb_table[2 * elem] == 0 && name_len == 1)
02923            {
02924              /* No valid character, treat it as a normal
02925                character.  */
02926              bitset_set (sbcset, name[0]);
02927              return REG_NOERROR;
02928            }
02929          else
02930            return REG_ECOLLATE;
02931 
02932          /* Got valid collation sequence, add it as a new entry.  */
02933          /* Check the space of the arrays.  */
02934          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
02935            {
02936              /* Not enough, realloc it.  */
02937              /* +1 in case of mbcset->ncoll_syms is 0.  */
02938              int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
02939              /* Use realloc since mbcset->coll_syms is NULL
02940                if *alloc == 0.  */
02941              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
02942                                              new_coll_sym_alloc);
02943              if (BE (new_coll_syms == NULL, 0))
02944               return REG_ESPACE;
02945              mbcset->coll_syms = new_coll_syms;
02946              *coll_sym_alloc = new_coll_sym_alloc;
02947            }
02948          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
02949          return REG_NOERROR;
02950        }
02951       else
02952        {
02953          if (BE (name_len != 1, 0))
02954            return REG_ECOLLATE;
02955          else
02956            {
02957              bitset_set (sbcset, name[0]);
02958              return REG_NOERROR;
02959            }
02960        }
02961     }
02962 #endif
02963 
02964   re_token_t br_token;
02965   re_bitset_ptr_t sbcset;
02966 #ifdef RE_ENABLE_I18N
02967   re_charset_t *mbcset;
02968   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
02969   int equiv_class_alloc = 0, char_class_alloc = 0;
02970 #endif /* not RE_ENABLE_I18N */
02971   int non_match = 0;
02972   bin_tree_t *work_tree;
02973   int token_len;
02974   int first_round = 1;
02975 #ifdef _LIBC
02976   collseqmb = (const unsigned char *)
02977     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
02978   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
02979   if (nrules)
02980     {
02981       /*
02982       if (MB_CUR_MAX > 1)
02983       */
02984       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
02985       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
02986       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
02987                                             _NL_COLLATE_SYMB_TABLEMB);
02988       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
02989                                              _NL_COLLATE_SYMB_EXTRAMB);
02990     }
02991 #endif
02992   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
02993 #ifdef RE_ENABLE_I18N
02994   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
02995 #endif /* RE_ENABLE_I18N */
02996 #ifdef RE_ENABLE_I18N
02997   if (BE (sbcset == NULL || mbcset == NULL, 0))
02998 #else
02999   if (BE (sbcset == NULL, 0))
03000 #endif /* RE_ENABLE_I18N */
03001     {
03002       *err = REG_ESPACE;
03003       return NULL;
03004     }
03005 
03006   token_len = peek_token_bracket (token, regexp, syntax);
03007   if (BE (token->type == END_OF_RE, 0))
03008     {
03009       *err = REG_BADPAT;
03010       goto parse_bracket_exp_free_return;
03011     }
03012   if (token->type == OP_NON_MATCH_LIST)
03013     {
03014 #ifdef RE_ENABLE_I18N
03015       mbcset->non_match = 1;
03016 #endif /* not RE_ENABLE_I18N */
03017       non_match = 1;
03018       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
03019        bitset_set (sbcset, '\n');
03020       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03021       token_len = peek_token_bracket (token, regexp, syntax);
03022       if (BE (token->type == END_OF_RE, 0))
03023        {
03024          *err = REG_BADPAT;
03025          goto parse_bracket_exp_free_return;
03026        }
03027     }
03028 
03029   /* We treat the first ']' as a normal character.  */
03030   if (token->type == OP_CLOSE_BRACKET)
03031     token->type = CHARACTER;
03032 
03033   while (1)
03034     {
03035       bracket_elem_t start_elem, end_elem;
03036       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
03037       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
03038       reg_errcode_t ret;
03039       int token_len2 = 0, is_range_exp = 0;
03040       re_token_t token2;
03041 
03042       start_elem.opr.name = start_name_buf;
03043       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
03044                                syntax, first_round);
03045       if (BE (ret != REG_NOERROR, 0))
03046        {
03047          *err = ret;
03048          goto parse_bracket_exp_free_return;
03049        }
03050       first_round = 0;
03051 
03052       /* Get information about the next token.  We need it in any case.  */
03053       token_len = peek_token_bracket (token, regexp, syntax);
03054 
03055       /* Do not check for ranges if we know they are not allowed.  */
03056       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
03057        {
03058          if (BE (token->type == END_OF_RE, 0))
03059            {
03060              *err = REG_EBRACK;
03061              goto parse_bracket_exp_free_return;
03062            }
03063          if (token->type == OP_CHARSET_RANGE)
03064            {
03065              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
03066              token_len2 = peek_token_bracket (&token2, regexp, syntax);
03067              if (BE (token2.type == END_OF_RE, 0))
03068               {
03069                 *err = REG_EBRACK;
03070                 goto parse_bracket_exp_free_return;
03071               }
03072              if (token2.type == OP_CLOSE_BRACKET)
03073               {
03074                 /* We treat the last '-' as a normal character.  */
03075                 re_string_skip_bytes (regexp, -token_len);
03076                 token->type = CHARACTER;
03077               }
03078              else
03079               is_range_exp = 1;
03080            }
03081        }
03082 
03083       if (is_range_exp == 1)
03084        {
03085          end_elem.opr.name = end_name_buf;
03086          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
03087                                    dfa, syntax, 1);
03088          if (BE (ret != REG_NOERROR, 0))
03089            {
03090              *err = ret;
03091              goto parse_bracket_exp_free_return;
03092            }
03093 
03094          token_len = peek_token_bracket (token, regexp, syntax);
03095 
03096 #ifdef _LIBC
03097          *err = build_range_exp (sbcset, mbcset, &range_alloc,
03098                               &start_elem, &end_elem);
03099 #else
03100 # ifdef RE_ENABLE_I18N
03101          *err = build_range_exp (sbcset,
03102                               dfa->mb_cur_max > 1 ? mbcset : NULL,
03103                               &range_alloc, &start_elem, &end_elem);
03104 # else
03105          *err = build_range_exp (sbcset, &start_elem, &end_elem);
03106 # endif
03107 #endif /* RE_ENABLE_I18N */
03108          if (BE (*err != REG_NOERROR, 0))
03109            goto parse_bracket_exp_free_return;
03110        }
03111       else
03112        {
03113          switch (start_elem.type)
03114            {
03115            case SB_CHAR:
03116              bitset_set (sbcset, start_elem.opr.ch);
03117              break;
03118 #ifdef RE_ENABLE_I18N
03119            case MB_CHAR:
03120              /* Check whether the array has enough space.  */
03121              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
03122               {
03123                 wchar_t *new_mbchars;
03124                 /* Not enough, realloc it.  */
03125                 /* +1 in case of mbcset->nmbchars is 0.  */
03126                 mbchar_alloc = 2 * mbcset->nmbchars + 1;
03127                 /* Use realloc since array is NULL if *alloc == 0.  */
03128                 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
03129                                        mbchar_alloc);
03130                 if (BE (new_mbchars == NULL, 0))
03131                   goto parse_bracket_exp_espace;
03132                 mbcset->mbchars = new_mbchars;
03133               }
03134              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
03135              break;
03136 #endif /* RE_ENABLE_I18N */
03137            case EQUIV_CLASS:
03138              *err = build_equiv_class (sbcset,
03139 #ifdef RE_ENABLE_I18N
03140                                    mbcset, &equiv_class_alloc,
03141 #endif /* RE_ENABLE_I18N */
03142                                    start_elem.opr.name);
03143              if (BE (*err != REG_NOERROR, 0))
03144               goto parse_bracket_exp_free_return;
03145              break;
03146            case COLL_SYM:
03147              *err = build_collating_symbol (sbcset,
03148 #ifdef RE_ENABLE_I18N
03149                                         mbcset, &coll_sym_alloc,
03150 #endif /* RE_ENABLE_I18N */
03151                                         start_elem.opr.name);
03152              if (BE (*err != REG_NOERROR, 0))
03153               goto parse_bracket_exp_free_return;
03154              break;
03155            case CHAR_CLASS:
03156              *err = build_charclass (regexp->trans, sbcset,
03157 #ifdef RE_ENABLE_I18N
03158                                   mbcset, &char_class_alloc,
03159 #endif /* RE_ENABLE_I18N */
03160                                   start_elem.opr.name, syntax);
03161              if (BE (*err != REG_NOERROR, 0))
03162               goto parse_bracket_exp_free_return;
03163              break;
03164            default:
03165              assert (0);
03166              break;
03167            }
03168        }
03169       if (BE (token->type == END_OF_RE, 0))
03170        {
03171          *err = REG_EBRACK;
03172          goto parse_bracket_exp_free_return;
03173        }
03174       if (token->type == OP_CLOSE_BRACKET)
03175        break;
03176     }
03177 
03178   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03179 
03180   /* If it is non-matching list.  */
03181   if (non_match)
03182     bitset_not (sbcset);
03183 
03184 #ifdef RE_ENABLE_I18N
03185   /* Ensure only single byte characters are set.  */
03186   if (dfa->mb_cur_max > 1)
03187     bitset_mask (sbcset, dfa->sb_char);
03188 
03189   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
03190       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
03191                                                || mbcset->non_match)))
03192     {
03193       bin_tree_t *mbc_tree;
03194       int sbc_idx;
03195       /* Build a tree for complex bracket.  */
03196       dfa->has_mb_node = 1;
03197       br_token.type = COMPLEX_BRACKET;
03198       br_token.opr.mbcset = mbcset;
03199       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03200       if (BE (mbc_tree == NULL, 0))
03201        goto parse_bracket_exp_espace;
03202       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
03203        if (sbcset[sbc_idx])
03204          break;
03205       /* If there are no bits set in sbcset, there is no point
03206         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
03207       if (sbc_idx < BITSET_WORDS)
03208        {
03209           /* Build a tree for simple bracket.  */
03210           br_token.type = SIMPLE_BRACKET;
03211           br_token.opr.sbcset = sbcset;
03212           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03213           if (BE (work_tree == NULL, 0))
03214             goto parse_bracket_exp_espace;
03215 
03216           /* Then join them by ALT node.  */
03217           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
03218           if (BE (work_tree == NULL, 0))
03219             goto parse_bracket_exp_espace;
03220        }
03221       else
03222        {
03223          re_free (sbcset);
03224          work_tree = mbc_tree;
03225        }
03226     }
03227   else
03228 #endif /* not RE_ENABLE_I18N */
03229     {
03230 #ifdef RE_ENABLE_I18N
03231       free_charset (mbcset);
03232 #endif
03233       /* Build a tree for simple bracket.  */
03234       br_token.type = SIMPLE_BRACKET;
03235       br_token.opr.sbcset = sbcset;
03236       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03237       if (BE (work_tree == NULL, 0))
03238         goto parse_bracket_exp_espace;
03239     }
03240   return work_tree;
03241 
03242  parse_bracket_exp_espace:
03243   *err = REG_ESPACE;
03244  parse_bracket_exp_free_return:
03245   re_free (sbcset);
03246 #ifdef RE_ENABLE_I18N
03247   free_charset (mbcset);
03248 #endif /* RE_ENABLE_I18N */
03249   return NULL;
03250 }
03251 
03252 /* Parse an element in the bracket expression.  */
03253 
03254 static reg_errcode_t
03255 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
03256                      re_token_t *token, int token_len, re_dfa_t *dfa,
03257                      reg_syntax_t syntax, int accept_hyphen)
03258 {
03259 #ifdef RE_ENABLE_I18N
03260   int cur_char_size;
03261   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
03262   if (cur_char_size > 1)
03263     {
03264       elem->type = MB_CHAR;
03265       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
03266       re_string_skip_bytes (regexp, cur_char_size);
03267       return REG_NOERROR;
03268     }
03269 #endif /* RE_ENABLE_I18N */
03270   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03271   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
03272       || token->type == OP_OPEN_EQUIV_CLASS)
03273     return parse_bracket_symbol (elem, regexp, token);
03274   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
03275     {
03276       /* A '-' must only appear as anything but a range indicator before
03277         the closing bracket.  Everything else is an error.  */
03278       re_token_t token2;
03279       (void) peek_token_bracket (&token2, regexp, syntax);
03280       if (token2.type != OP_CLOSE_BRACKET)
03281        /* The actual error value is not standardized since this whole
03282           case is undefined.  But ERANGE makes good sense.  */
03283        return REG_ERANGE;
03284     }
03285   elem->type = SB_CHAR;
03286   elem->opr.ch = token->opr.c;
03287   return REG_NOERROR;
03288 }
03289 
03290 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
03291    such as [:<character_class>:], [.<collating_element>.], and
03292    [=<equivalent_class>=].  */
03293 
03294 static reg_errcode_t
03295 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
03296                     re_token_t *token)
03297 {
03298   unsigned char ch, delim = token->opr.c;
03299   int i = 0;
03300   if (re_string_eoi(regexp))
03301     return REG_EBRACK;
03302   for (;; ++i)
03303     {
03304       if (i >= BRACKET_NAME_BUF_SIZE)
03305        return REG_EBRACK;
03306       if (token->type == OP_OPEN_CHAR_CLASS)
03307        ch = re_string_fetch_byte_case (regexp);
03308       else
03309        ch = re_string_fetch_byte (regexp);
03310       if (re_string_eoi(regexp))
03311        return REG_EBRACK;
03312       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
03313        break;
03314       elem->opr.name[i] = ch;
03315     }
03316   re_string_skip_bytes (regexp, 1);
03317   elem->opr.name[i] = '\0';
03318   switch (token->type)
03319     {
03320     case OP_OPEN_COLL_ELEM:
03321       elem->type = COLL_SYM;
03322       break;
03323     case OP_OPEN_EQUIV_CLASS:
03324       elem->type = EQUIV_CLASS;
03325       break;
03326     case OP_OPEN_CHAR_CLASS:
03327       elem->type = CHAR_CLASS;
03328       break;
03329     default:
03330       break;
03331     }
03332   return REG_NOERROR;
03333 }
03334 
03335   /* Helper function for parse_bracket_exp.
03336      Build the equivalence class which is represented by NAME.
03337      The result are written to MBCSET and SBCSET.
03338      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
03339      is a pointer argument sinse we may update it.  */
03340 
03341 static reg_errcode_t
03342 #ifdef RE_ENABLE_I18N
03343 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
03344                  int *equiv_class_alloc, const unsigned char *name)
03345 #else /* not RE_ENABLE_I18N */
03346 build_equiv_class (bitset_t sbcset, const unsigned char *name)
03347 #endif /* not RE_ENABLE_I18N */
03348 {
03349 #ifdef _LIBC
03350   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
03351   if (nrules != 0)
03352     {
03353       const int32_t *table, *indirect;
03354       const unsigned char *weights, *extra, *cp;
03355       unsigned char char_buf[2];
03356       int32_t idx1, idx2;
03357       unsigned int ch;
03358       size_t len;
03359       /* This #include defines a local function!  */
03360 # include <locale/weight.h>
03361       /* Calculate the index for equivalence class.  */
03362       cp = name;
03363       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
03364       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
03365                                           _NL_COLLATE_WEIGHTMB);
03366       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
03367                                              _NL_COLLATE_EXTRAMB);
03368       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
03369                                           _NL_COLLATE_INDIRECTMB);
03370       idx1 = findidx (&cp);
03371       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
03372        /* This isn't a valid character.  */
03373        return REG_ECOLLATE;
03374 
03375       /* Build single byte matcing table for this equivalence class.  */
03376       char_buf[1] = (unsigned char) '\0';
03377       len = weights[idx1 & 0xffffff];
03378       for (ch = 0; ch < SBC_MAX; ++ch)
03379        {
03380          char_buf[0] = ch;
03381          cp = char_buf;
03382          idx2 = findidx (&cp);
03383 /*
03384          idx2 = table[ch];
03385 */
03386          if (idx2 == 0)
03387            /* This isn't a valid character.  */
03388            continue;
03389          /* Compare only if the length matches and the collation rule
03390             index is the same.  */
03391          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
03392            {
03393              int cnt = 0;
03394 
03395              while (cnt <= len &&
03396                    weights[(idx1 & 0xffffff) + 1 + cnt]
03397                    == weights[(idx2 & 0xffffff) + 1 + cnt])
03398               ++cnt;
03399 
03400              if (cnt > len)
03401               bitset_set (sbcset, ch);
03402            }
03403        }
03404       /* Check whether the array has enough space.  */
03405       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
03406        {
03407          /* Not enough, realloc it.  */
03408          /* +1 in case of mbcset->nequiv_classes is 0.  */
03409          int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
03410          /* Use realloc since the array is NULL if *alloc == 0.  */
03411          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
03412                                              int32_t,
03413                                              new_equiv_class_alloc);
03414          if (BE (new_equiv_classes == NULL, 0))
03415            return REG_ESPACE;
03416          mbcset->equiv_classes = new_equiv_classes;
03417          *equiv_class_alloc = new_equiv_class_alloc;
03418        }
03419       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
03420     }
03421   else
03422 #endif /* _LIBC */
03423     {
03424       if (BE (strlen ((const char *) name) != 1, 0))
03425        return REG_ECOLLATE;
03426       bitset_set (sbcset, *name);
03427     }
03428   return REG_NOERROR;
03429 }
03430 
03431   /* Helper function for parse_bracket_exp.
03432      Build the character class which is represented by NAME.
03433      The result are written to MBCSET and SBCSET.
03434      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
03435      is a pointer argument sinse we may update it.  */
03436 
03437 static reg_errcode_t
03438 #ifdef RE_ENABLE_I18N
03439 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
03440                re_charset_t *mbcset, int *char_class_alloc,
03441                const unsigned char *class_name, reg_syntax_t syntax)
03442 #else /* not RE_ENABLE_I18N */
03443 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
03444                const unsigned char *class_name, reg_syntax_t syntax)
03445 #endif /* not RE_ENABLE_I18N */
03446 {
03447   int i;
03448   const char *name = (const char *) class_name;
03449 
03450   /* In case of REG_ICASE "upper" and "lower" match the both of
03451      upper and lower cases.  */
03452   if ((syntax & RE_ICASE)
03453       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
03454     name = "alpha";
03455 
03456 #ifdef RE_ENABLE_I18N
03457   /* Check the space of the arrays.  */
03458   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
03459     {
03460       /* Not enough, realloc it.  */
03461       /* +1 in case of mbcset->nchar_classes is 0.  */
03462       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
03463       /* Use realloc since array is NULL if *alloc == 0.  */
03464       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
03465                                           new_char_class_alloc);
03466       if (BE (new_char_classes == NULL, 0))
03467        return REG_ESPACE;
03468       mbcset->char_classes = new_char_classes;
03469       *char_class_alloc = new_char_class_alloc;
03470     }
03471   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
03472 #endif /* RE_ENABLE_I18N */
03473 
03474 #define BUILD_CHARCLASS_LOOP(ctype_func)  \
03475   do {                                    \
03476     if (BE (trans != NULL, 0))                   \
03477       {                                          \
03478        for (i = 0; i < SBC_MAX; ++i)             \
03479          if (ctype_func (i))                     \
03480            bitset_set (sbcset, trans[i]); \
03481       }                                          \
03482     else                                  \
03483       {                                          \
03484        for (i = 0; i < SBC_MAX; ++i)             \
03485          if (ctype_func (i))                     \
03486            bitset_set (sbcset, i);        \
03487       }                                          \
03488   } while (0)
03489 
03490   if (strcmp (name, "alnum") == 0)
03491     BUILD_CHARCLASS_LOOP (isalnum);
03492   else if (strcmp (name, "cntrl") == 0)
03493     BUILD_CHARCLASS_LOOP (iscntrl);
03494   else if (strcmp (name, "lower") == 0)
03495     BUILD_CHARCLASS_LOOP (islower);
03496   else if (strcmp (name, "space") == 0)
03497     BUILD_CHARCLASS_LOOP (isspace);
03498   else if (strcmp (name, "alpha") == 0)
03499     BUILD_CHARCLASS_LOOP (isalpha);
03500   else if (strcmp (name, "digit") == 0)
03501     BUILD_CHARCLASS_LOOP (isdigit);
03502   else if (strcmp (name, "print") == 0)
03503     BUILD_CHARCLASS_LOOP (isprint);
03504   else if (strcmp (name, "upper") == 0)
03505     BUILD_CHARCLASS_LOOP (isupper);
03506   else if (strcmp (name, "blank") == 0)
03507     BUILD_CHARCLASS_LOOP (isblank);
03508   else if (strcmp (name, "graph") == 0)
03509     BUILD_CHARCLASS_LOOP (isgraph);
03510   else if (strcmp (name, "punct") == 0)
03511     BUILD_CHARCLASS_LOOP (ispunct);
03512   else if (strcmp (name, "xdigit") == 0)
03513     BUILD_CHARCLASS_LOOP (isxdigit);
03514   else
03515     return REG_ECTYPE;
03516 
03517   return REG_NOERROR;
03518 }
03519 
03520 static bin_tree_t *
03521 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
03522                   const unsigned char *class_name,
03523                   const unsigned char *extra, int non_match,
03524                   reg_errcode_t *err)
03525 {
03526   re_bitset_ptr_t sbcset;
03527 #ifdef RE_ENABLE_I18N
03528   re_charset_t *mbcset;
03529   int alloc = 0;
03530 #endif /* not RE_ENABLE_I18N */
03531   reg_errcode_t ret;
03532   re_token_t br_token;
03533   bin_tree_t *tree;
03534 
03535   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
03536 #ifdef RE_ENABLE_I18N
03537   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
03538 #endif /* RE_ENABLE_I18N */
03539 
03540 #ifdef RE_ENABLE_I18N
03541   if (BE (sbcset == NULL || mbcset == NULL, 0))
03542 #else /* not RE_ENABLE_I18N */
03543   if (BE (sbcset == NULL, 0))
03544 #endif /* not RE_ENABLE_I18N */
03545     {
03546       *err = REG_ESPACE;
03547       return NULL;
03548     }
03549 
03550   if (non_match)
03551     {
03552 #ifdef RE_ENABLE_I18N
03553       mbcset->non_match = 1;
03554 #endif /* not RE_ENABLE_I18N */
03555     }
03556 
03557   /* We don't care the syntax in this case.  */
03558   ret = build_charclass (trans, sbcset,
03559 #ifdef RE_ENABLE_I18N
03560                       mbcset, &alloc,
03561 #endif /* RE_ENABLE_I18N */
03562                       class_name, 0);
03563 
03564   if (BE (ret != REG_NOERROR, 0))
03565     {
03566       re_free (sbcset);
03567 #ifdef RE_ENABLE_I18N
03568       free_charset (mbcset);
03569 #endif /* RE_ENABLE_I18N */
03570       *err = ret;
03571       return NULL;
03572     }
03573   /* \w match '_' also.  */
03574   for (; *extra; extra++)
03575     bitset_set (sbcset, *extra);
03576 
03577   /* If it is non-matching list.  */
03578   if (non_match)
03579     bitset_not (sbcset);
03580 
03581 #ifdef RE_ENABLE_I18N
03582   /* Ensure only single byte characters are set.  */
03583   if (dfa->mb_cur_max > 1)
03584     bitset_mask (sbcset, dfa->sb_char);
03585 #endif
03586 
03587   /* Build a tree for simple bracket.  */
03588   br_token.type = SIMPLE_BRACKET;
03589   br_token.opr.sbcset = sbcset;
03590   tree = create_token_tree (dfa, NULL, NULL, &br_token);
03591   if (BE (tree == NULL, 0))
03592     goto build_word_op_espace;
03593 
03594 #ifdef RE_ENABLE_I18N
03595   if (dfa->mb_cur_max > 1)
03596     {
03597       bin_tree_t *mbc_tree;
03598       /* Build a tree for complex bracket.  */
03599       br_token.type = COMPLEX_BRACKET;
03600       br_token.opr.mbcset = mbcset;
03601       dfa->has_mb_node = 1;
03602       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03603       if (BE (mbc_tree == NULL, 0))
03604        goto build_word_op_espace;
03605       /* Then join them by ALT node.  */
03606       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
03607       if (BE (mbc_tree != NULL, 1))
03608        return tree;
03609     }
03610   else
03611     {
03612       free_charset (mbcset);
03613       return tree;
03614     }
03615 #else /* not RE_ENABLE_I18N */
03616   return tree;
03617 #endif /* not RE_ENABLE_I18N */
03618 
03619  build_word_op_espace:
03620   re_free (sbcset);
03621 #ifdef RE_ENABLE_I18N
03622   free_charset (mbcset);
03623 #endif /* RE_ENABLE_I18N */
03624   *err = REG_ESPACE;
03625   return NULL;
03626 }
03627 
03628 /* This is intended for the expressions like "a{1,3}".
03629    Fetch a number from `input', and return the number.
03630    Return -1, if the number field is empty like "{,1}".
03631    Return -2, If an error is occured.  */
03632 
03633 static int
03634 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
03635 {
03636   int num = -1;
03637   unsigned char c;
03638   while (1)
03639     {
03640       fetch_token (token, input, syntax);
03641       c = token->opr.c;
03642       if (BE (token->type == END_OF_RE, 0))
03643        return -2;
03644       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
03645        break;
03646       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
03647             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
03648       num = (num > RE_DUP_MAX) ? -2 : num;
03649     }
03650   return num;
03651 }
03652 
03653 #ifdef RE_ENABLE_I18N
03654 static void
03655 free_charset (re_charset_t *cset)
03656 {
03657   re_free (cset->mbchars);
03658 # ifdef _LIBC
03659   re_free (cset->coll_syms);
03660   re_free (cset->equiv_classes);
03661   re_free (cset->range_starts);
03662   re_free (cset->range_ends);
03663 # endif
03664   re_free (cset->char_classes);
03665   re_free (cset);
03666 }
03667 #endif /* RE_ENABLE_I18N */
03668 
03669 /* Functions for binary tree operation.  */
03670 
03671 /* Create a tree node.  */
03672 
03673 static bin_tree_t *
03674 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
03675             re_token_type_t type)
03676 {
03677   re_token_t t;
03678   t.type = type;
03679   return create_token_tree (dfa, left, right, &t);
03680 }
03681 
03682 static bin_tree_t *
03683 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
03684                  const re_token_t *token)
03685 {
03686   bin_tree_t *tree;
03687   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
03688     {
03689       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
03690 
03691       if (storage == NULL)
03692        return NULL;
03693       storage->next = dfa->str_tree_storage;
03694       dfa->str_tree_storage = storage;
03695       dfa->str_tree_storage_idx = 0;
03696     }
03697   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
03698 
03699   tree->parent = NULL;
03700   tree->left = left;
03701   tree->right = right;
03702   tree->token = *token;
03703   tree->token.duplicated = 0;
03704   tree->token.opt_subexp = 0;
03705   tree->first = NULL;
03706   tree->next = NULL;
03707   tree->node_idx = -1;
03708 
03709   if (left != NULL)
03710     left->parent = tree;
03711   if (right != NULL)
03712     right->parent = tree;
03713   return tree;
03714 }
03715 
03716 /* Mark the tree SRC as an optional subexpression.
03717    To be called from preorder or postorder.  */
03718 
03719 static reg_errcode_t
03720 mark_opt_subexp (void *extra, bin_tree_t *node)
03721 {
03722   int idx = (int) (long) extra;
03723   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
03724     node->token.opt_subexp = 1;
03725 
03726   return REG_NOERROR;
03727 }
03728 
03729 /* Free the allocated memory inside NODE. */
03730 
03731 static void
03732 free_token (re_token_t *node)
03733 {
03734 #ifdef RE_ENABLE_I18N
03735   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
03736     free_charset (node->opr.mbcset);
03737   else
03738 #endif /* RE_ENABLE_I18N */
03739     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
03740       re_free (node->opr.sbcset);
03741 }
03742 
03743 /* Worker function for tree walking.  Free the allocated memory inside NODE
03744    and its children. */
03745 
03746 static reg_errcode_t
03747 free_tree (void *extra, bin_tree_t *node)
03748 {
03749   free_token (&node->token);
03750   return REG_NOERROR;
03751 }
03752 
03753 
03754 /* Duplicate the node SRC, and return new node.  This is a preorder
03755    visit similar to the one implemented by the generic visitor, but
03756    we need more infrastructure to maintain two parallel trees --- so,
03757    it's easier to duplicate.  */
03758 
03759 static bin_tree_t *
03760 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
03761 {
03762   const bin_tree_t *node;
03763   bin_tree_t *dup_root;
03764   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
03765 
03766   for (node = root; ; )
03767     {
03768       /* Create a new tree and link it back to the current parent.  */
03769       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
03770       if (*p_new == NULL)
03771        return NULL;
03772       (*p_new)->parent = dup_node;
03773       (*p_new)->token.duplicated = 1;
03774       dup_node = *p_new;
03775 
03776       /* Go to the left node, or up and to the right.  */
03777       if (node->left)
03778        {
03779          node = node->left;
03780          p_new = &dup_node->left;
03781        }
03782       else
03783        {
03784          const bin_tree_t *prev = NULL;
03785          while (node->right == prev || node->right == NULL)
03786            {
03787              prev = node;
03788              node = node->parent;
03789              dup_node = dup_node->parent;
03790              if (!node)
03791                return dup_root;
03792            }
03793          node = node->right;
03794          p_new = &dup_node->right;
03795        }
03796     }
03797 }