Back to index

nagios-plugins  1.4.16
regcomp.c
Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
00003    Software Foundation, Inc.
00004    This file is part of the GNU C Library.
00005    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 3, or (at your option)
00010    any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License along
00018    with this program; if not, write to the Free Software Foundation,
00019    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
00020 
00021 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
00022                                      size_t length, reg_syntax_t syntax);
00023 static void re_compile_fastmap_iter (regex_t *bufp,
00024                                  const re_dfastate_t *init_state,
00025                                  char *fastmap);
00026 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
00027 #ifdef RE_ENABLE_I18N
00028 static void free_charset (re_charset_t *cset);
00029 #endif /* RE_ENABLE_I18N */
00030 static void free_workarea_compile (regex_t *preg);
00031 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
00032 #ifdef RE_ENABLE_I18N
00033 static void optimize_utf8 (re_dfa_t *dfa);
00034 #endif
00035 static reg_errcode_t analyze (regex_t *preg);
00036 static reg_errcode_t preorder (bin_tree_t *root,
00037                             reg_errcode_t (fn (void *, bin_tree_t *)),
00038                             void *extra);
00039 static reg_errcode_t postorder (bin_tree_t *root,
00040                             reg_errcode_t (fn (void *, bin_tree_t *)),
00041                             void *extra);
00042 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
00043 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
00044 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
00045                              bin_tree_t *node);
00046 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
00047 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
00048 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
00049 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
00050 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
00051                                unsigned int constraint);
00052 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
00053 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
00054                                     Idx node, bool root);
00055 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
00056 static Idx fetch_number (re_string_t *input, re_token_t *token,
00057                       reg_syntax_t syntax);
00058 static int peek_token (re_token_t *token, re_string_t *input,
00059                      reg_syntax_t syntax) internal_function;
00060 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
00061                        reg_syntax_t syntax, reg_errcode_t *err);
00062 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
00063                               re_token_t *token, reg_syntax_t syntax,
00064                               Idx nest, reg_errcode_t *err);
00065 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
00066                              re_token_t *token, reg_syntax_t syntax,
00067                              Idx nest, reg_errcode_t *err);
00068 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
00069                                  re_token_t *token, reg_syntax_t syntax,
00070                                  Idx nest, reg_errcode_t *err);
00071 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
00072                               re_token_t *token, reg_syntax_t syntax,
00073                               Idx nest, reg_errcode_t *err);
00074 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
00075                              re_dfa_t *dfa, re_token_t *token,
00076                              reg_syntax_t syntax, reg_errcode_t *err);
00077 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
00078                                   re_token_t *token, reg_syntax_t syntax,
00079                                   reg_errcode_t *err);
00080 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
00081                                        re_string_t *regexp,
00082                                        re_token_t *token, int token_len,
00083                                        re_dfa_t *dfa,
00084                                        reg_syntax_t syntax,
00085                                        bool accept_hyphen);
00086 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
00087                                      re_string_t *regexp,
00088                                      re_token_t *token);
00089 #ifdef RE_ENABLE_I18N
00090 static reg_errcode_t build_equiv_class (bitset_t sbcset,
00091                                    re_charset_t *mbcset,
00092                                    Idx *equiv_class_alloc,
00093                                    const unsigned char *name);
00094 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
00095                                   bitset_t sbcset,
00096                                   re_charset_t *mbcset,
00097                                   Idx *char_class_alloc,
00098                                   const unsigned char *class_name,
00099                                   reg_syntax_t syntax);
00100 #else  /* not RE_ENABLE_I18N */
00101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
00102                                    const unsigned char *name);
00103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
00104                                   bitset_t sbcset,
00105                                   const unsigned char *class_name,
00106                                   reg_syntax_t syntax);
00107 #endif /* not RE_ENABLE_I18N */
00108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
00109                                    RE_TRANSLATE_TYPE trans,
00110                                    const unsigned char *class_name,
00111                                    const unsigned char *extra,
00112                                    bool non_match, reg_errcode_t *err);
00113 static bin_tree_t *create_tree (re_dfa_t *dfa,
00114                             bin_tree_t *left, bin_tree_t *right,
00115                             re_token_type_t type);
00116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
00117                                   bin_tree_t *left, bin_tree_t *right,
00118                                   const re_token_t *token);
00119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
00120 static void free_token (re_token_t *node);
00121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
00122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
00123 
00124 /* This table gives an error message for each of the error codes listed
00125    in regex.h.  Obviously the order here has to be same as there.
00126    POSIX doesn't require that we do anything for REG_NOERROR,
00127    but why not be nice?  */
00128 
00129 static const char __re_error_msgid[] =
00130   {
00131 #define REG_NOERROR_IDX     0
00132     gettext_noop ("Success")       /* REG_NOERROR */
00133     "\0"
00134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
00135     gettext_noop ("No match")      /* REG_NOMATCH */
00136     "\0"
00137 #define REG_BADPAT_IDX      (REG_NOMATCH_IDX + sizeof "No match")
00138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
00139     "\0"
00140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
00141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
00142     "\0"
00143 #define REG_ECTYPE_IDX      (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
00144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
00145     "\0"
00146 #define REG_EESCAPE_IDX     (REG_ECTYPE_IDX + sizeof "Invalid character class name")
00147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
00148     "\0"
00149 #define REG_ESUBREG_IDX     (REG_EESCAPE_IDX + sizeof "Trailing backslash")
00150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
00151     "\0"
00152 #define REG_EBRACK_IDX      (REG_ESUBREG_IDX + sizeof "Invalid back reference")
00153     gettext_noop ("Unmatched [ or [^")    /* REG_EBRACK */
00154     "\0"
00155 #define REG_EPAREN_IDX      (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
00156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
00157     "\0"
00158 #define REG_EBRACE_IDX      (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
00159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
00160     "\0"
00161 #define REG_BADBR_IDX       (REG_EBRACE_IDX + sizeof "Unmatched \\{")
00162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
00163     "\0"
00164 #define REG_ERANGE_IDX      (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
00165     gettext_noop ("Invalid range end")    /* REG_ERANGE */
00166     "\0"
00167 #define REG_ESPACE_IDX      (REG_ERANGE_IDX + sizeof "Invalid range end")
00168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
00169     "\0"
00170 #define REG_BADRPT_IDX      (REG_ESPACE_IDX + sizeof "Memory exhausted")
00171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
00172     "\0"
00173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
00174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
00175     "\0"
00176 #define REG_ESIZE_IDX       (REG_EEND_IDX + sizeof "Premature end of regular expression")
00177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
00178     "\0"
00179 #define REG_ERPAREN_IDX     (REG_ESIZE_IDX + sizeof "Regular expression too big")
00180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
00181   };
00182 
00183 static const size_t __re_error_msgid_idx[] =
00184   {
00185     REG_NOERROR_IDX,
00186     REG_NOMATCH_IDX,
00187     REG_BADPAT_IDX,
00188     REG_ECOLLATE_IDX,
00189     REG_ECTYPE_IDX,
00190     REG_EESCAPE_IDX,
00191     REG_ESUBREG_IDX,
00192     REG_EBRACK_IDX,
00193     REG_EPAREN_IDX,
00194     REG_EBRACE_IDX,
00195     REG_BADBR_IDX,
00196     REG_ERANGE_IDX,
00197     REG_ESPACE_IDX,
00198     REG_BADRPT_IDX,
00199     REG_EEND_IDX,
00200     REG_ESIZE_IDX,
00201     REG_ERPAREN_IDX
00202   };
00203 
00204 /* Entry points for GNU code.  */
00205 
00206 /* re_compile_pattern is the GNU regular expression compiler: it
00207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
00208    Returns 0 if the pattern was valid, otherwise an error string.
00209 
00210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
00211    are set in BUFP on entry.  */
00212 
00213 #ifdef _LIBC
00214 const char *
00215 re_compile_pattern (pattern, length, bufp)
00216     const char *pattern;
00217     size_t length;
00218     struct re_pattern_buffer *bufp;
00219 #else /* size_t might promote */
00220 const char *
00221 re_compile_pattern (const char *pattern, size_t length,
00222                   struct re_pattern_buffer *bufp)
00223 #endif
00224 {
00225   reg_errcode_t ret;
00226 
00227   /* And GNU code determines whether or not to get register information
00228      by passing null for the REGS argument to re_match, etc., not by
00229      setting no_sub, unless RE_NO_SUB is set.  */
00230   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
00231 
00232   /* Match anchors at newline.  */
00233   bufp->newline_anchor = 1;
00234 
00235   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
00236 
00237   if (!ret)
00238     return NULL;
00239   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00240 }
00241 #ifdef _LIBC
00242 weak_alias (__re_compile_pattern, re_compile_pattern)
00243 #endif
00244 
00245 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
00246    also be assigned to arbitrarily: each pattern buffer stores its own
00247    syntax, so it can be changed between regex compilations.  */
00248 /* This has no initializer because initialized variables in Emacs
00249    become read-only after dumping.  */
00250 reg_syntax_t re_syntax_options;
00251 
00252 
00253 /* Specify the precise syntax of regexps for compilation.  This provides
00254    for compatibility for various utilities which historically have
00255    different, incompatible syntaxes.
00256 
00257    The argument SYNTAX is a bit mask comprised of the various bits
00258    defined in regex.h.  We return the old syntax.  */
00259 
00260 reg_syntax_t
00261 re_set_syntax (syntax)
00262     reg_syntax_t syntax;
00263 {
00264   reg_syntax_t ret = re_syntax_options;
00265 
00266   re_syntax_options = syntax;
00267   return ret;
00268 }
00269 #ifdef _LIBC
00270 weak_alias (__re_set_syntax, re_set_syntax)
00271 #endif
00272 
00273 int
00274 re_compile_fastmap (bufp)
00275     struct re_pattern_buffer *bufp;
00276 {
00277   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00278   char *fastmap = bufp->fastmap;
00279 
00280   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
00281   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
00282   if (dfa->init_state != dfa->init_state_word)
00283     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
00284   if (dfa->init_state != dfa->init_state_nl)
00285     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
00286   if (dfa->init_state != dfa->init_state_begbuf)
00287     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
00288   bufp->fastmap_accurate = 1;
00289   return 0;
00290 }
00291 #ifdef _LIBC
00292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
00293 #endif
00294 
00295 static inline void
00296 __attribute ((always_inline))
00297 re_set_fastmap (char *fastmap, bool icase, int ch)
00298 {
00299   fastmap[ch] = 1;
00300   if (icase)
00301     fastmap[tolower (ch)] = 1;
00302 }
00303 
00304 /* Helper function for re_compile_fastmap.
00305    Compile fastmap for the initial_state INIT_STATE.  */
00306 
00307 static void
00308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
00309                       char *fastmap)
00310 {
00311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00312   Idx node_cnt;
00313   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
00314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
00315     {
00316       Idx node = init_state->nodes.elems[node_cnt];
00317       re_token_type_t type = dfa->nodes[node].type;
00318 
00319       if (type == CHARACTER)
00320        {
00321          re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
00322 #ifdef RE_ENABLE_I18N
00323          if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
00324            {
00325              unsigned char buf[MB_LEN_MAX];
00326              unsigned char *p;
00327              wchar_t wc;
00328              mbstate_t state;
00329 
00330              p = buf;
00331              *p++ = dfa->nodes[node].opr.c;
00332              while (++node < dfa->nodes_len
00333                    &&       dfa->nodes[node].type == CHARACTER
00334                    && dfa->nodes[node].mb_partial)
00335               *p++ = dfa->nodes[node].opr.c;
00336              memset (&state, '\0', sizeof (state));
00337              if (__mbrtowc (&wc, (const char *) buf, p - buf,
00338                           &state) == p - buf
00339                 && (__wcrtomb ((char *) buf, towlower (wc), &state)
00340                     != (size_t) -1))
00341               re_set_fastmap (fastmap, false, buf[0]);
00342            }
00343 #endif
00344        }
00345       else if (type == SIMPLE_BRACKET)
00346        {
00347          int i, ch;
00348          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00349            {
00350              int j;
00351              bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
00352              for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00353               if (w & ((bitset_word_t) 1 << j))
00354                 re_set_fastmap (fastmap, icase, ch);
00355            }
00356        }
00357 #ifdef RE_ENABLE_I18N
00358       else if (type == COMPLEX_BRACKET)
00359        {
00360          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
00361          Idx i;
00362 
00363 # ifdef _LIBC
00364          /* See if we have to try all bytes which start multiple collation
00365             elements.
00366             e.g. In da_DK, we want to catch 'a' since "aa" is a valid
00367                 collation element, and don't catch 'b' since 'b' is
00368                 the only collation element which starts from 'b' (and
00369                 it is caught by SIMPLE_BRACKET).  */
00370              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
00371                 && (cset->ncoll_syms || cset->nranges))
00372               {
00373                 const int32_t *table = (const int32_t *)
00374                   _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
00375                 for (i = 0; i < SBC_MAX; ++i)
00376                   if (table[i] < 0)
00377                     re_set_fastmap (fastmap, icase, i);
00378               }
00379 # endif /* _LIBC */
00380 
00381          /* See if we have to start the match at all multibyte characters,
00382             i.e. where we would not find an invalid sequence.  This only
00383             applies to multibyte character sets; for single byte character
00384             sets, the SIMPLE_BRACKET again suffices.  */
00385          if (dfa->mb_cur_max > 1
00386              && (cset->nchar_classes || cset->non_match || cset->nranges
00387 # ifdef _LIBC
00388                 || cset->nequiv_classes
00389 # endif /* _LIBC */
00390                ))
00391            {
00392              unsigned char c = 0;
00393              do
00394               {
00395                 mbstate_t mbs;
00396                 memset (&mbs, 0, sizeof (mbs));
00397                 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
00398                   re_set_fastmap (fastmap, false, (int) c);
00399               }
00400              while (++c != 0);
00401            }
00402 
00403          else
00404            {
00405              /* ... Else catch all bytes which can start the mbchars.  */
00406              for (i = 0; i < cset->nmbchars; ++i)
00407               {
00408                 char buf[256];
00409                 mbstate_t state;
00410                 memset (&state, '\0', sizeof (state));
00411                 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
00412                   re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
00413                 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
00414                   {
00415                     if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
00416                        != (size_t) -1)
00417                      re_set_fastmap (fastmap, false, *(unsigned char *) buf);
00418                   }
00419               }
00420            }
00421        }
00422 #endif /* RE_ENABLE_I18N */
00423       else if (type == OP_PERIOD
00424 #ifdef RE_ENABLE_I18N
00425               || type == OP_UTF8_PERIOD
00426 #endif /* RE_ENABLE_I18N */
00427               || type == END_OF_RE)
00428        {
00429          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
00430          if (type == END_OF_RE)
00431            bufp->can_be_null = 1;
00432          return;
00433        }
00434     }
00435 }
00436 
00437 /* Entry point for POSIX code.  */
00438 /* regcomp takes a regular expression as a string and compiles it.
00439 
00440    PREG is a regex_t *.  We do not expect any fields to be initialized,
00441    since POSIX says we shouldn't.  Thus, we set
00442 
00443      `buffer' to the compiled pattern;
00444      `used' to the length of the compiled pattern;
00445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
00446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
00447        RE_SYNTAX_POSIX_BASIC;
00448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
00449      `fastmap' to an allocated space for the fastmap;
00450      `fastmap_accurate' to zero;
00451      `re_nsub' to the number of subexpressions in PATTERN.
00452 
00453    PATTERN is the address of the pattern string.
00454 
00455    CFLAGS is a series of bits which affect compilation.
00456 
00457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
00458      use POSIX basic syntax.
00459 
00460      If REG_NEWLINE is set, then . and [^...] don't match newline.
00461      Also, regexec will try a match beginning after every newline.
00462 
00463      If REG_ICASE is set, then we considers upper- and lowercase
00464      versions of letters to be equivalent when matching.
00465 
00466      If REG_NOSUB is set, then when PREG is passed to regexec, that
00467      routine will report only success or failure, and nothing about the
00468      registers.
00469 
00470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
00471    the return codes and their meanings.)  */
00472 
00473 int
00474 regcomp (preg, pattern, cflags)
00475     regex_t *_Restrict_ preg;
00476     const char *_Restrict_ pattern;
00477     int cflags;
00478 {
00479   reg_errcode_t ret;
00480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
00481                       : RE_SYNTAX_POSIX_BASIC);
00482 
00483   preg->buffer = NULL;
00484   preg->allocated = 0;
00485   preg->used = 0;
00486 
00487   /* Try to allocate space for the fastmap.  */
00488   preg->fastmap = re_malloc (char, SBC_MAX);
00489   if (BE (preg->fastmap == NULL, 0))
00490     return REG_ESPACE;
00491 
00492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
00493 
00494   /* If REG_NEWLINE is set, newlines are treated differently.  */
00495   if (cflags & REG_NEWLINE)
00496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
00497       syntax &= ~RE_DOT_NEWLINE;
00498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
00499       /* It also changes the matching behavior.  */
00500       preg->newline_anchor = 1;
00501     }
00502   else
00503     preg->newline_anchor = 0;
00504   preg->no_sub = !!(cflags & REG_NOSUB);
00505   preg->translate = NULL;
00506 
00507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
00508 
00509   /* POSIX doesn't distinguish between an unmatched open-group and an
00510      unmatched close-group: both are REG_EPAREN.  */
00511   if (ret == REG_ERPAREN)
00512     ret = REG_EPAREN;
00513 
00514   /* We have already checked preg->fastmap != NULL.  */
00515   if (BE (ret == REG_NOERROR, 1))
00516     /* Compute the fastmap now, since regexec cannot modify the pattern
00517        buffer.  This function never fails in this implementation.  */
00518     (void) re_compile_fastmap (preg);
00519   else
00520     {
00521       /* Some error occurred while compiling the expression.  */
00522       re_free (preg->fastmap);
00523       preg->fastmap = NULL;
00524     }
00525 
00526   return (int) ret;
00527 }
00528 #ifdef _LIBC
00529 weak_alias (__regcomp, regcomp)
00530 #endif
00531 
00532 /* Returns a message corresponding to an error code, ERRCODE, returned
00533    from either regcomp or regexec.   We don't use PREG here.  */
00534 
00535 #ifdef _LIBC
00536 size_t
00537 regerror (errcode, preg, errbuf, errbuf_size)
00538     int errcode;
00539     const regex_t *_Restrict_ preg;
00540     char *_Restrict_ errbuf;
00541     size_t errbuf_size;
00542 #else /* size_t might promote */
00543 size_t
00544 regerror (int errcode, const regex_t *_Restrict_ preg,
00545          char *_Restrict_ errbuf, size_t errbuf_size)
00546 #endif
00547 {
00548   const char *msg;
00549   size_t msg_size;
00550 
00551   if (BE (errcode < 0
00552          || errcode >= (int) (sizeof (__re_error_msgid_idx)
00553                             / sizeof (__re_error_msgid_idx[0])), 0))
00554     /* Only error codes returned by the rest of the code should be passed
00555        to this routine.  If we are given anything else, or if other regex
00556        code generates an invalid error code, then the program has a bug.
00557        Dump core so we can fix it.  */
00558     abort ();
00559 
00560   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
00561 
00562   msg_size = strlen (msg) + 1; /* Includes the null.  */
00563 
00564   if (BE (errbuf_size != 0, 1))
00565     {
00566       size_t cpy_size = msg_size;
00567       if (BE (msg_size > errbuf_size, 0))
00568        {
00569          cpy_size = errbuf_size - 1;
00570          errbuf[cpy_size] = '\0';
00571        }
00572       memcpy (errbuf, msg, cpy_size);
00573     }
00574 
00575   return msg_size;
00576 }
00577 #ifdef _LIBC
00578 weak_alias (__regerror, regerror)
00579 #endif
00580 
00581 
00582 #ifdef RE_ENABLE_I18N
00583 /* This static array is used for the map to single-byte characters when
00584    UTF-8 is used.  Otherwise we would allocate memory just to initialize
00585    it the same all the time.  UTF-8 is the preferred encoding so this is
00586    a worthwhile optimization.  */
00587 static const bitset_t utf8_sb_map =
00588 {
00589   /* Set the first 128 bits.  */
00590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
00591 #  error "bitset_word_t is narrower than 32 bits"
00592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
00593   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
00594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
00595   BITSET_WORD_MAX, BITSET_WORD_MAX,
00596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
00597   BITSET_WORD_MAX,
00598 # endif
00599   (BITSET_WORD_MAX
00600    >> (SBC_MAX % BITSET_WORD_BITS == 0
00601        ? 0
00602        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
00603 };
00604 #endif
00605 
00606 
00607 static void
00608 free_dfa_content (re_dfa_t *dfa)
00609 {
00610   Idx i, j;
00611 
00612   if (dfa->nodes)
00613     for (i = 0; i < dfa->nodes_len; ++i)
00614       free_token (dfa->nodes + i);
00615   re_free (dfa->nexts);
00616   for (i = 0; i < dfa->nodes_len; ++i)
00617     {
00618       if (dfa->eclosures != NULL)
00619        re_node_set_free (dfa->eclosures + i);
00620       if (dfa->inveclosures != NULL)
00621        re_node_set_free (dfa->inveclosures + i);
00622       if (dfa->edests != NULL)
00623        re_node_set_free (dfa->edests + i);
00624     }
00625   re_free (dfa->edests);
00626   re_free (dfa->eclosures);
00627   re_free (dfa->inveclosures);
00628   re_free (dfa->nodes);
00629 
00630   if (dfa->state_table)
00631     for (i = 0; i <= dfa->state_hash_mask; ++i)
00632       {
00633        struct re_state_table_entry *entry = dfa->state_table + i;
00634        for (j = 0; j < entry->num; ++j)
00635          {
00636            re_dfastate_t *state = entry->array[j];
00637            free_state (state);
00638          }
00639        re_free (entry->array);
00640       }
00641   re_free (dfa->state_table);
00642 #ifdef RE_ENABLE_I18N
00643   if (dfa->sb_char != utf8_sb_map)
00644     re_free (dfa->sb_char);
00645 #endif
00646   re_free (dfa->subexp_map);
00647 #ifdef DEBUG
00648   re_free (dfa->re_str);
00649 #endif
00650 
00651   re_free (dfa);
00652 }
00653 
00654 
00655 /* Free dynamically allocated space used by PREG.  */
00656 
00657 void
00658 regfree (preg)
00659     regex_t *preg;
00660 {
00661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00662   if (BE (dfa != NULL, 1))
00663     free_dfa_content (dfa);
00664   preg->buffer = NULL;
00665   preg->allocated = 0;
00666 
00667   re_free (preg->fastmap);
00668   preg->fastmap = NULL;
00669 
00670   re_free (preg->translate);
00671   preg->translate = NULL;
00672 }
00673 #ifdef _LIBC
00674 weak_alias (__regfree, regfree)
00675 #endif
00676 
00677 /* Entry points compatible with 4.2 BSD regex library.  We don't define
00678    them unless specifically requested.  */
00679 
00680 #if defined _REGEX_RE_COMP || defined _LIBC
00681 
00682 /* BSD has one and only one pattern buffer.  */
00683 static struct re_pattern_buffer re_comp_buf;
00684 
00685 char *
00686 # ifdef _LIBC
00687 /* Make these definitions weak in libc, so POSIX programs can redefine
00688    these names if they don't use our functions, and still use
00689    regcomp/regexec above without link errors.  */
00690 weak_function
00691 # endif
00692 re_comp (s)
00693      const char *s;
00694 {
00695   reg_errcode_t ret;
00696   char *fastmap;
00697 
00698   if (!s)
00699     {
00700       if (!re_comp_buf.buffer)
00701        return gettext ("No previous regular expression");
00702       return 0;
00703     }
00704 
00705   if (re_comp_buf.buffer)
00706     {
00707       fastmap = re_comp_buf.fastmap;
00708       re_comp_buf.fastmap = NULL;
00709       __regfree (&re_comp_buf);
00710       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
00711       re_comp_buf.fastmap = fastmap;
00712     }
00713 
00714   if (re_comp_buf.fastmap == NULL)
00715     {
00716       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
00717       if (re_comp_buf.fastmap == NULL)
00718        return (char *) gettext (__re_error_msgid
00719                              + __re_error_msgid_idx[(int) REG_ESPACE]);
00720     }
00721 
00722   /* Since `re_exec' always passes NULL for the `regs' argument, we
00723      don't need to initialize the pattern buffer fields which affect it.  */
00724 
00725   /* Match anchors at newlines.  */
00726   re_comp_buf.newline_anchor = 1;
00727 
00728   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
00729 
00730   if (!ret)
00731     return NULL;
00732 
00733   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
00734   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00735 }
00736 
00737 #ifdef _LIBC
00738 libc_freeres_fn (free_mem)
00739 {
00740   __regfree (&re_comp_buf);
00741 }
00742 #endif
00743 
00744 #endif /* _REGEX_RE_COMP */
00745 
00746 /* Internal entry point.
00747    Compile the regular expression PATTERN, whose length is LENGTH.
00748    SYNTAX indicate regular expression's syntax.  */
00749 
00750 static reg_errcode_t
00751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
00752                    reg_syntax_t syntax)
00753 {
00754   reg_errcode_t err = REG_NOERROR;
00755   re_dfa_t *dfa;
00756   re_string_t regexp;
00757 
00758   /* Initialize the pattern buffer.  */
00759   preg->fastmap_accurate = 0;
00760   preg->syntax = syntax;
00761   preg->not_bol = preg->not_eol = 0;
00762   preg->used = 0;
00763   preg->re_nsub = 0;
00764   preg->can_be_null = 0;
00765   preg->regs_allocated = REGS_UNALLOCATED;
00766 
00767   /* Initialize the dfa.  */
00768   dfa = (re_dfa_t *) preg->buffer;
00769   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
00770     {
00771       /* If zero allocated, but buffer is non-null, try to realloc
00772         enough space.  This loses if buffer's address is bogus, but
00773         that is the user's responsibility.  If ->buffer is NULL this
00774         is a simple allocation.  */
00775       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
00776       if (dfa == NULL)
00777        return REG_ESPACE;
00778       preg->allocated = sizeof (re_dfa_t);
00779       preg->buffer = (unsigned char *) dfa;
00780     }
00781   preg->used = sizeof (re_dfa_t);
00782 
00783   err = init_dfa (dfa, length);
00784   if (BE (err != REG_NOERROR, 0))
00785     {
00786       free_dfa_content (dfa);
00787       preg->buffer = NULL;
00788       preg->allocated = 0;
00789       return err;
00790     }
00791 #ifdef DEBUG
00792   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
00793   dfa->re_str = re_malloc (char, length + 1);
00794   strncpy (dfa->re_str, pattern, length + 1);
00795 #endif
00796 
00797   __libc_lock_init (dfa->lock);
00798 
00799   err = re_string_construct (&regexp, pattern, length, preg->translate,
00800                           (syntax & RE_ICASE) != 0, dfa);
00801   if (BE (err != REG_NOERROR, 0))
00802     {
00803     re_compile_internal_free_return:
00804       free_workarea_compile (preg);
00805       re_string_destruct (&regexp);
00806       free_dfa_content (dfa);
00807       preg->buffer = NULL;
00808       preg->allocated = 0;
00809       return err;
00810     }
00811 
00812   /* Parse the regular expression, and build a structure tree.  */
00813   preg->re_nsub = 0;
00814   dfa->str_tree = parse (&regexp, preg, syntax, &err);
00815   if (BE (dfa->str_tree == NULL, 0))
00816     goto re_compile_internal_free_return;
00817 
00818   /* Analyze the tree and create the nfa.  */
00819   err = analyze (preg);
00820   if (BE (err != REG_NOERROR, 0))
00821     goto re_compile_internal_free_return;
00822 
00823 #ifdef RE_ENABLE_I18N
00824   /* If possible, do searching in single byte encoding to speed things up.  */
00825   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
00826     optimize_utf8 (dfa);
00827 #endif
00828 
00829   /* Then create the initial state of the dfa.  */
00830   err = create_initial_state (dfa);
00831 
00832   /* Release work areas.  */
00833   free_workarea_compile (preg);
00834   re_string_destruct (&regexp);
00835 
00836   if (BE (err != REG_NOERROR, 0))
00837     {
00838       free_dfa_content (dfa);
00839       preg->buffer = NULL;
00840       preg->allocated = 0;
00841     }
00842 
00843   return err;
00844 }
00845 
00846 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
00847    as the initial length of some arrays.  */
00848 
00849 static reg_errcode_t
00850 init_dfa (re_dfa_t *dfa, size_t pat_len)
00851 {
00852   __re_size_t table_size;
00853 #ifndef _LIBC
00854   char *codeset_name;
00855 #endif
00856 #ifdef RE_ENABLE_I18N
00857   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
00858 #else
00859   size_t max_i18n_object_size = 0;
00860 #endif
00861   size_t max_object_size =
00862     MAX (sizeof (struct re_state_table_entry),
00863         MAX (sizeof (re_token_t),
00864              MAX (sizeof (re_node_set),
00865                  MAX (sizeof (regmatch_t),
00866                      max_i18n_object_size))));
00867 
00868   memset (dfa, '\0', sizeof (re_dfa_t));
00869 
00870   /* Force allocation of str_tree_storage the first time.  */
00871   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
00872 
00873   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
00874      calculation below, and for similar doubling calculations
00875      elsewhere.  And it's <= rather than <, because some of the
00876      doubling calculations add 1 afterwards.  */
00877   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
00878     return REG_ESPACE;
00879 
00880   dfa->nodes_alloc = pat_len + 1;
00881   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
00882 
00883   /*  table_size = 2 ^ ceil(log pat_len) */
00884   for (table_size = 1; ; table_size <<= 1)
00885     if (table_size > pat_len)
00886       break;
00887 
00888   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
00889   dfa->state_hash_mask = table_size - 1;
00890 
00891   dfa->mb_cur_max = MB_CUR_MAX;
00892 #ifdef _LIBC
00893   if (dfa->mb_cur_max == 6
00894       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
00895     dfa->is_utf8 = 1;
00896   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
00897                      != 0);
00898 #else
00899   codeset_name = nl_langinfo (CODESET);
00900   if (strcasecmp (codeset_name, "UTF-8") == 0
00901       || strcasecmp (codeset_name, "UTF8") == 0)
00902     dfa->is_utf8 = 1;
00903 
00904   /* We check exhaustively in the loop below if this charset is a
00905      superset of ASCII.  */
00906   dfa->map_notascii = 0;
00907 #endif
00908 
00909 #ifdef RE_ENABLE_I18N
00910   if (dfa->mb_cur_max > 1)
00911     {
00912       if (dfa->is_utf8)
00913        dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
00914       else
00915        {
00916          int i, j, ch;
00917 
00918          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
00919          if (BE (dfa->sb_char == NULL, 0))
00920            return REG_ESPACE;
00921 
00922          /* Set the bits corresponding to single byte chars.  */
00923          for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00924            for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00925              {
00926               wint_t wch = __btowc (ch);
00927               if (wch != WEOF)
00928                 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
00929 # ifndef _LIBC
00930               if (isascii (ch) && wch != ch)
00931                 dfa->map_notascii = 1;
00932 # endif
00933              }
00934        }
00935     }
00936 #endif
00937 
00938   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
00939     return REG_ESPACE;
00940   return REG_NOERROR;
00941 }
00942 
00943 /* Initialize WORD_CHAR table, which indicate which character is
00944    "word".  In this case "word" means that it is the word construction
00945    character used by some operators like "<", ">", etc.  */
00946 
00947 static void
00948 internal_function
00949 init_word_char (re_dfa_t *dfa)
00950 {
00951   int i, j, ch;
00952   dfa->word_ops_used = 1;
00953   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
00954     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
00955       if (isalnum (ch) || ch == '_')
00956        dfa->word_char[i] |= (bitset_word_t) 1 << j;
00957 }
00958 
00959 /* Free the work area which are only used while compiling.  */
00960 
00961 static void
00962 free_workarea_compile (regex_t *preg)
00963 {
00964   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00965   bin_tree_storage_t *storage, *next;
00966   for (storage = dfa->str_tree_storage; storage; storage = next)
00967     {
00968       next = storage->next;
00969       re_free (storage);
00970     }
00971   dfa->str_tree_storage = NULL;
00972   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
00973   dfa->str_tree = NULL;
00974   re_free (dfa->org_indices);
00975   dfa->org_indices = NULL;
00976 }
00977 
00978 /* Create initial states for all contexts.  */
00979 
00980 static reg_errcode_t
00981 create_initial_state (re_dfa_t *dfa)
00982 {
00983   Idx first, i;
00984   reg_errcode_t err;
00985   re_node_set init_nodes;
00986 
00987   /* Initial states have the epsilon closure of the node which is
00988      the first node of the regular expression.  */
00989   first = dfa->str_tree->first->node_idx;
00990   dfa->init_node = first;
00991   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
00992   if (BE (err != REG_NOERROR, 0))
00993     return err;
00994 
00995   /* The back-references which are in initial states can epsilon transit,
00996      since in this case all of the subexpressions can be null.
00997      Then we add epsilon closures of the nodes which are the next nodes of
00998      the back-references.  */
00999   if (dfa->nbackref > 0)
01000     for (i = 0; i < init_nodes.nelem; ++i)
01001       {
01002        Idx node_idx = init_nodes.elems[i];
01003        re_token_type_t type = dfa->nodes[node_idx].type;
01004 
01005        Idx clexp_idx;
01006        if (type != OP_BACK_REF)
01007          continue;
01008        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
01009          {
01010            re_token_t *clexp_node;
01011            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
01012            if (clexp_node->type == OP_CLOSE_SUBEXP
01013               && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
01014              break;
01015          }
01016        if (clexp_idx == init_nodes.nelem)
01017          continue;
01018 
01019        if (type == OP_BACK_REF)
01020          {
01021            Idx dest_idx = dfa->edests[node_idx].elems[0];
01022            if (!re_node_set_contains (&init_nodes, dest_idx))
01023              {
01024               reg_errcode_t merge_err
01025                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
01026               if (merge_err != REG_NOERROR)
01027                 return merge_err;
01028               i = 0;
01029              }
01030          }
01031       }
01032 
01033   /* It must be the first time to invoke acquire_state.  */
01034   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
01035   /* We don't check ERR here, since the initial state must not be NULL.  */
01036   if (BE (dfa->init_state == NULL, 0))
01037     return err;
01038   if (dfa->init_state->has_constraint)
01039     {
01040       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
01041                                                  CONTEXT_WORD);
01042       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
01043                                                CONTEXT_NEWLINE);
01044       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
01045                                                   &init_nodes,
01046                                                   CONTEXT_NEWLINE
01047                                                   | CONTEXT_BEGBUF);
01048       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
01049              || dfa->init_state_begbuf == NULL, 0))
01050        return err;
01051     }
01052   else
01053     dfa->init_state_word = dfa->init_state_nl
01054       = dfa->init_state_begbuf = dfa->init_state;
01055 
01056   re_node_set_free (&init_nodes);
01057   return REG_NOERROR;
01058 }
01059 
01060 #ifdef RE_ENABLE_I18N
01061 /* If it is possible to do searching in single byte encoding instead of UTF-8
01062    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
01063    DFA nodes where needed.  */
01064 
01065 static void
01066 optimize_utf8 (re_dfa_t *dfa)
01067 {
01068   Idx node;
01069   int i;
01070   bool mb_chars = false;
01071   bool has_period = false;
01072 
01073   for (node = 0; node < dfa->nodes_len; ++node)
01074     switch (dfa->nodes[node].type)
01075       {
01076       case CHARACTER:
01077        if (dfa->nodes[node].opr.c >= ASCII_CHARS)
01078          mb_chars = true;
01079        break;
01080       case ANCHOR:
01081        switch (dfa->nodes[node].opr.ctx_type)
01082          {
01083          case LINE_FIRST:
01084          case LINE_LAST:
01085          case BUF_FIRST:
01086          case BUF_LAST:
01087            break;
01088          default:
01089            /* Word anchors etc. cannot be handled.  It's okay to test
01090               opr.ctx_type since constraints (for all DFA nodes) are
01091               created by ORing one or more opr.ctx_type values.  */
01092            return;
01093          }
01094        break;
01095       case OP_PERIOD:
01096        has_period = true;
01097        break;
01098       case OP_BACK_REF:
01099       case OP_ALT:
01100       case END_OF_RE:
01101       case OP_DUP_ASTERISK:
01102       case OP_OPEN_SUBEXP:
01103       case OP_CLOSE_SUBEXP:
01104        break;
01105       case COMPLEX_BRACKET:
01106        return;
01107       case SIMPLE_BRACKET:
01108        /* Just double check.  */
01109        {
01110          int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
01111                      ? 0
01112                      : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
01113          for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
01114            {
01115              if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
01116               return;
01117              rshift = 0;
01118            }
01119        }
01120        break;
01121       default:
01122        abort ();
01123       }
01124 
01125   if (mb_chars || has_period)
01126     for (node = 0; node < dfa->nodes_len; ++node)
01127       {
01128        if (dfa->nodes[node].type == CHARACTER
01129            && dfa->nodes[node].opr.c >= ASCII_CHARS)
01130          dfa->nodes[node].mb_partial = 0;
01131        else if (dfa->nodes[node].type == OP_PERIOD)
01132          dfa->nodes[node].type = OP_UTF8_PERIOD;
01133       }
01134 
01135   /* The search can be in single byte locale.  */
01136   dfa->mb_cur_max = 1;
01137   dfa->is_utf8 = 0;
01138   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
01139 }
01140 #endif
01141 
01142 /* Analyze the structure tree, and calculate "first", "next", "edest",
01143    "eclosure", and "inveclosure".  */
01144 
01145 static reg_errcode_t
01146 analyze (regex_t *preg)
01147 {
01148   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
01149   reg_errcode_t ret;
01150 
01151   /* Allocate arrays.  */
01152   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
01153   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
01154   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
01155   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
01156   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
01157          || dfa->eclosures == NULL, 0))
01158     return REG_ESPACE;
01159 
01160   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
01161   if (dfa->subexp_map != NULL)
01162     {
01163       Idx i;
01164       for (i = 0; i < preg->re_nsub; i++)
01165        dfa->subexp_map[i] = i;
01166       preorder (dfa->str_tree, optimize_subexps, dfa);
01167       for (i = 0; i < preg->re_nsub; i++)
01168        if (dfa->subexp_map[i] != i)
01169          break;
01170       if (i == preg->re_nsub)
01171        {
01172          free (dfa->subexp_map);
01173          dfa->subexp_map = NULL;
01174        }
01175     }
01176 
01177   ret = postorder (dfa->str_tree, lower_subexps, preg);
01178   if (BE (ret != REG_NOERROR, 0))
01179     return ret;
01180   ret = postorder (dfa->str_tree, calc_first, dfa);
01181   if (BE (ret != REG_NOERROR, 0))
01182     return ret;
01183   preorder (dfa->str_tree, calc_next, dfa);
01184   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
01185   if (BE (ret != REG_NOERROR, 0))
01186     return ret;
01187   ret = calc_eclosure (dfa);
01188   if (BE (ret != REG_NOERROR, 0))
01189     return ret;
01190 
01191   /* We only need this during the prune_impossible_nodes pass in regexec.c;
01192      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
01193   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
01194       || dfa->nbackref)
01195     {
01196       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
01197       if (BE (dfa->inveclosures == NULL, 0))
01198        return REG_ESPACE;
01199       ret = calc_inveclosure (dfa);
01200     }
01201 
01202   return ret;
01203 }
01204 
01205 /* Our parse trees are very unbalanced, so we cannot use a stack to
01206    implement parse tree visits.  Instead, we use parent pointers and
01207    some hairy code in these two functions.  */
01208 static reg_errcode_t
01209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
01210           void *extra)
01211 {
01212   bin_tree_t *node, *prev;
01213 
01214   for (node = root; ; )
01215     {
01216       /* Descend down the tree, preferably to the left (or to the right
01217         if that's the only child).  */
01218       while (node->left || node->right)
01219        if (node->left)
01220          node = node->left;
01221        else
01222          node = node->right;
01223 
01224       do
01225        {
01226          reg_errcode_t err = fn (extra, node);
01227          if (BE (err != REG_NOERROR, 0))
01228            return err;
01229          if (node->parent == NULL)
01230            return REG_NOERROR;
01231          prev = node;
01232          node = node->parent;
01233        }
01234       /* Go up while we have a node that is reached from the right.  */
01235       while (node->right == prev || node->right == NULL);
01236       node = node->right;
01237     }
01238 }
01239 
01240 static reg_errcode_t
01241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
01242          void *extra)
01243 {
01244   bin_tree_t *node;
01245 
01246   for (node = root; ; )
01247     {
01248       reg_errcode_t err = fn (extra, node);
01249       if (BE (err != REG_NOERROR, 0))
01250        return err;
01251 
01252       /* Go to the left node, or up and to the right.  */
01253       if (node->left)
01254        node = node->left;
01255       else
01256        {
01257          bin_tree_t *prev = NULL;
01258          while (node->right == prev || node->right == NULL)
01259            {
01260              prev = node;
01261              node = node->parent;
01262              if (!node)
01263               return REG_NOERROR;
01264            }
01265          node = node->right;
01266        }
01267     }
01268 }
01269 
01270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
01271    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
01272    backreferences as well.  Requires a preorder visit.  */
01273 static reg_errcode_t
01274 optimize_subexps (void *extra, bin_tree_t *node)
01275 {
01276   re_dfa_t *dfa = (re_dfa_t *) extra;
01277 
01278   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
01279     {
01280       int idx = node->token.opr.idx;
01281       node->token.opr.idx = dfa->subexp_map[idx];
01282       dfa->used_bkref_map |= 1 << node->token.opr.idx;
01283     }
01284 
01285   else if (node->token.type == SUBEXP
01286           && node->left && node->left->token.type == SUBEXP)
01287     {
01288       Idx other_idx = node->left->token.opr.idx;
01289 
01290       node->left = node->left->left;
01291       if (node->left)
01292        node->left->parent = node;
01293 
01294       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
01295       if (other_idx < BITSET_WORD_BITS)
01296        dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
01297     }
01298 
01299   return REG_NOERROR;
01300 }
01301 
01302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
01303    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
01304 static reg_errcode_t
01305 lower_subexps (void *extra, bin_tree_t *node)
01306 {
01307   regex_t *preg = (regex_t *) extra;
01308   reg_errcode_t err = REG_NOERROR;
01309 
01310   if (node->left && node->left->token.type == SUBEXP)
01311     {
01312       node->left = lower_subexp (&err, preg, node->left);
01313       if (node->left)
01314        node->left->parent = node;
01315     }
01316   if (node->right && node->right->token.type == SUBEXP)
01317     {
01318       node->right = lower_subexp (&err, preg, node->right);
01319       if (node->right)
01320        node->right->parent = node;
01321     }
01322 
01323   return err;
01324 }
01325 
01326 static bin_tree_t *
01327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
01328 {
01329   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
01330   bin_tree_t *body = node->left;
01331   bin_tree_t *op, *cls, *tree1, *tree;
01332 
01333   if (preg->no_sub
01334       /* We do not optimize empty subexpressions, because otherwise we may
01335         have bad CONCAT nodes with NULL children.  This is obviously not
01336         very common, so we do not lose much.  An example that triggers
01337         this case is the sed "script" /\(\)/x.  */
01338       && node->left != NULL
01339       && (node->token.opr.idx >= BITSET_WORD_BITS
01340          || !(dfa->used_bkref_map
01341               & ((bitset_word_t) 1 << node->token.opr.idx))))
01342     return node->left;
01343 
01344   /* Convert the SUBEXP node to the concatenation of an
01345      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
01346   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
01347   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
01348   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
01349   tree = create_tree (dfa, op, tree1, CONCAT);
01350   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
01351     {
01352       *err = REG_ESPACE;
01353       return NULL;
01354     }
01355 
01356   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
01357   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
01358   return tree;
01359 }
01360 
01361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
01362    nodes.  Requires a postorder visit.  */
01363 static reg_errcode_t
01364 calc_first (void *extra, bin_tree_t *node)
01365 {
01366   re_dfa_t *dfa = (re_dfa_t *) extra;
01367   if (node->token.type == CONCAT)
01368     {
01369       node->first = node->left->first;
01370       node->node_idx = node->left->node_idx;
01371     }
01372   else
01373     {
01374       node->first = node;
01375       node->node_idx = re_dfa_add_node (dfa, node->token);
01376       if (BE (node->node_idx == REG_MISSING, 0))
01377        return REG_ESPACE;
01378       if (node->token.type == ANCHOR)
01379        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
01380     }
01381   return REG_NOERROR;
01382 }
01383 
01384 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
01385 static reg_errcode_t
01386 calc_next (void *extra, bin_tree_t *node)
01387 {
01388   switch (node->token.type)
01389     {
01390     case OP_DUP_ASTERISK:
01391       node->left->next = node;
01392       break;
01393     case CONCAT:
01394       node->left->next = node->right->first;
01395       node->right->next = node->next;
01396       break;
01397     default:
01398       if (node->left)
01399        node->left->next = node->next;
01400       if (node->right)
01401        node->right->next = node->next;
01402       break;
01403     }
01404   return REG_NOERROR;
01405 }
01406 
01407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
01408 static reg_errcode_t
01409 link_nfa_nodes (void *extra, bin_tree_t *node)
01410 {
01411   re_dfa_t *dfa = (re_dfa_t *) extra;
01412   Idx idx = node->node_idx;
01413   reg_errcode_t err = REG_NOERROR;
01414 
01415   switch (node->token.type)
01416     {
01417     case CONCAT:
01418       break;
01419 
01420     case END_OF_RE:
01421       assert (node->next == NULL);
01422       break;
01423 
01424     case OP_DUP_ASTERISK:
01425     case OP_ALT:
01426       {
01427        Idx left, right;
01428        dfa->has_plural_match = 1;
01429        if (node->left != NULL)
01430          left = node->left->first->node_idx;
01431        else
01432          left = node->next->node_idx;
01433        if (node->right != NULL)
01434          right = node->right->first->node_idx;
01435        else
01436          right = node->next->node_idx;
01437        assert (REG_VALID_INDEX (left));
01438        assert (REG_VALID_INDEX (right));
01439        err = re_node_set_init_2 (dfa->edests + idx, left, right);
01440       }
01441       break;
01442 
01443     case ANCHOR:
01444     case OP_OPEN_SUBEXP:
01445     case OP_CLOSE_SUBEXP:
01446       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
01447       break;
01448 
01449     case OP_BACK_REF:
01450       dfa->nexts[idx] = node->next->node_idx;
01451       if (node->token.type == OP_BACK_REF)
01452        err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
01453       break;
01454 
01455     default:
01456       assert (!IS_EPSILON_NODE (node->token.type));
01457       dfa->nexts[idx] = node->next->node_idx;
01458       break;
01459     }
01460 
01461   return err;
01462 }
01463 
01464 /* Duplicate the epsilon closure of the node ROOT_NODE.
01465    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
01466    to their own constraint.  */
01467 
01468 static reg_errcode_t
01469 internal_function
01470 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
01471                      Idx root_node, unsigned int init_constraint)
01472 {
01473   Idx org_node, clone_node;
01474   bool ok;
01475   unsigned int constraint = init_constraint;
01476   for (org_node = top_org_node, clone_node = top_clone_node;;)
01477     {
01478       Idx org_dest, clone_dest;
01479       if (dfa->nodes[org_node].type == OP_BACK_REF)
01480        {
01481          /* If the back reference epsilon-transit, its destination must
01482             also have the constraint.  Then duplicate the epsilon closure
01483             of the destination of the back reference, and store it in
01484             edests of the back reference.  */
01485          org_dest = dfa->nexts[org_node];
01486          re_node_set_empty (dfa->edests + clone_node);
01487          clone_dest = duplicate_node (dfa, org_dest, constraint);
01488          if (BE (clone_dest == REG_MISSING, 0))
01489            return REG_ESPACE;
01490          dfa->nexts[clone_node] = dfa->nexts[org_node];
01491          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01492          if (BE (! ok, 0))
01493            return REG_ESPACE;
01494        }
01495       else if (dfa->edests[org_node].nelem == 0)
01496        {
01497          /* In case of the node can't epsilon-transit, don't duplicate the
01498             destination and store the original destination as the
01499             destination of the node.  */
01500          dfa->nexts[clone_node] = dfa->nexts[org_node];
01501          break;
01502        }
01503       else if (dfa->edests[org_node].nelem == 1)
01504        {
01505          /* In case of the node can epsilon-transit, and it has only one
01506             destination.  */
01507          org_dest = dfa->edests[org_node].elems[0];
01508          re_node_set_empty (dfa->edests + clone_node);
01509          /* If the node is root_node itself, it means the epsilon closure
01510             has a loop.  Then tie it to the destination of the root_node.  */
01511          if (org_node == root_node && clone_node != org_node)
01512            {
01513              ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
01514              if (BE (! ok, 0))
01515                return REG_ESPACE;
01516              break;
01517            }
01518          /* In case the node has another constraint, append it.  */
01519          constraint |= dfa->nodes[org_node].constraint;
01520          clone_dest = duplicate_node (dfa, org_dest, constraint);
01521          if (BE (clone_dest == REG_MISSING, 0))
01522            return REG_ESPACE;
01523          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01524          if (BE (! ok, 0))
01525            return REG_ESPACE;
01526        }
01527       else /* dfa->edests[org_node].nelem == 2 */
01528        {
01529          /* In case of the node can epsilon-transit, and it has two
01530             destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
01531          org_dest = dfa->edests[org_node].elems[0];
01532          re_node_set_empty (dfa->edests + clone_node);
01533          /* Search for a duplicated node which satisfies the constraint.  */
01534          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
01535          if (clone_dest == REG_MISSING)
01536            {
01537              /* There is no such duplicated node, create a new one.  */
01538              reg_errcode_t err;
01539              clone_dest = duplicate_node (dfa, org_dest, constraint);
01540              if (BE (clone_dest == REG_MISSING, 0))
01541               return REG_ESPACE;
01542              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01543              if (BE (! ok, 0))
01544               return REG_ESPACE;
01545              err = duplicate_node_closure (dfa, org_dest, clone_dest,
01546                                        root_node, constraint);
01547              if (BE (err != REG_NOERROR, 0))
01548               return err;
01549            }
01550          else
01551            {
01552              /* There is a duplicated node which satisfies the constraint,
01553                use it to avoid infinite loop.  */
01554              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01555              if (BE (! ok, 0))
01556               return REG_ESPACE;
01557            }
01558 
01559          org_dest = dfa->edests[org_node].elems[1];
01560          clone_dest = duplicate_node (dfa, org_dest, constraint);
01561          if (BE (clone_dest == REG_MISSING, 0))
01562            return REG_ESPACE;
01563          ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
01564          if (BE (! ok, 0))
01565            return REG_ESPACE;
01566        }
01567       org_node = org_dest;
01568       clone_node = clone_dest;
01569     }
01570   return REG_NOERROR;
01571 }
01572 
01573 /* Search for a node which is duplicated from the node ORG_NODE, and
01574    satisfies the constraint CONSTRAINT.  */
01575 
01576 static Idx
01577 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
01578                      unsigned int constraint)
01579 {
01580   Idx idx;
01581   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
01582     {
01583       if (org_node == dfa->org_indices[idx]
01584          && constraint == dfa->nodes[idx].constraint)
01585        return idx; /* Found.  */
01586     }
01587   return REG_MISSING; /* Not found.  */
01588 }
01589 
01590 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
01591    Return the index of the new node, or REG_MISSING if insufficient storage is
01592    available.  */
01593 
01594 static Idx
01595 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
01596 {
01597   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
01598   if (BE (dup_idx != REG_MISSING, 1))
01599     {
01600       dfa->nodes[dup_idx].constraint = constraint;
01601       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
01602       dfa->nodes[dup_idx].duplicated = 1;
01603 
01604       /* Store the index of the original node.  */
01605       dfa->org_indices[dup_idx] = org_idx;
01606     }
01607   return dup_idx;
01608 }
01609 
01610 static reg_errcode_t
01611 calc_inveclosure (re_dfa_t *dfa)
01612 {
01613   Idx src, idx;
01614   bool ok;
01615   for (idx = 0; idx < dfa->nodes_len; ++idx)
01616     re_node_set_init_empty (dfa->inveclosures + idx);
01617 
01618   for (src = 0; src < dfa->nodes_len; ++src)
01619     {
01620       Idx *elems = dfa->eclosures[src].elems;
01621       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
01622        {
01623          ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
01624          if (BE (! ok, 0))
01625            return REG_ESPACE;
01626        }
01627     }
01628 
01629   return REG_NOERROR;
01630 }
01631 
01632 /* Calculate "eclosure" for all the node in DFA.  */
01633 
01634 static reg_errcode_t
01635 calc_eclosure (re_dfa_t *dfa)
01636 {
01637   Idx node_idx;
01638   bool incomplete;
01639 #ifdef DEBUG
01640   assert (dfa->nodes_len > 0);
01641 #endif
01642   incomplete = false;
01643   /* For each nodes, calculate epsilon closure.  */
01644   for (node_idx = 0; ; ++node_idx)
01645     {
01646       reg_errcode_t err;
01647       re_node_set eclosure_elem;
01648       if (node_idx == dfa->nodes_len)
01649        {
01650          if (!incomplete)
01651            break;
01652          incomplete = false;
01653          node_idx = 0;
01654        }
01655 
01656 #ifdef DEBUG
01657       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
01658 #endif
01659 
01660       /* If we have already calculated, skip it.  */
01661       if (dfa->eclosures[node_idx].nelem != 0)
01662        continue;
01663       /* Calculate epsilon closure of `node_idx'.  */
01664       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
01665       if (BE (err != REG_NOERROR, 0))
01666        return err;
01667 
01668       if (dfa->eclosures[node_idx].nelem == 0)
01669        {
01670          incomplete = true;
01671          re_node_set_free (&eclosure_elem);
01672        }
01673     }
01674   return REG_NOERROR;
01675 }
01676 
01677 /* Calculate epsilon closure of NODE.  */
01678 
01679 static reg_errcode_t
01680 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
01681 {
01682   reg_errcode_t err;
01683   Idx i;
01684   re_node_set eclosure;
01685   bool ok;
01686   bool incomplete = false;
01687   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
01688   if (BE (err != REG_NOERROR, 0))
01689     return err;
01690 
01691   /* This indicates that we are calculating this node now.
01692      We reference this value to avoid infinite loop.  */
01693   dfa->eclosures[node].nelem = REG_MISSING;
01694 
01695   /* If the current node has constraints, duplicate all nodes
01696      since they must inherit the constraints.  */
01697   if (dfa->nodes[node].constraint
01698       && dfa->edests[node].nelem
01699       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
01700     {
01701       err = duplicate_node_closure (dfa, node, node, node,
01702                                 dfa->nodes[node].constraint);
01703       if (BE (err != REG_NOERROR, 0))
01704        return err;
01705     }
01706 
01707   /* Expand each epsilon destination nodes.  */
01708   if (IS_EPSILON_NODE(dfa->nodes[node].type))
01709     for (i = 0; i < dfa->edests[node].nelem; ++i)
01710       {
01711        re_node_set eclosure_elem;
01712        Idx edest = dfa->edests[node].elems[i];
01713        /* If calculating the epsilon closure of `edest' is in progress,
01714           return intermediate result.  */
01715        if (dfa->eclosures[edest].nelem == REG_MISSING)
01716          {
01717            incomplete = true;
01718            continue;
01719          }
01720        /* If we haven't calculated the epsilon closure of `edest' yet,
01721           calculate now. Otherwise use calculated epsilon closure.  */
01722        if (dfa->eclosures[edest].nelem == 0)
01723          {
01724            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
01725            if (BE (err != REG_NOERROR, 0))
01726              return err;
01727          }
01728        else
01729          eclosure_elem = dfa->eclosures[edest];
01730        /* Merge the epsilon closure of `edest'.  */
01731        err = re_node_set_merge (&eclosure, &eclosure_elem);
01732        if (BE (err != REG_NOERROR, 0))
01733          return err;
01734        /* If the epsilon closure of `edest' is incomplete,
01735           the epsilon closure of this node is also incomplete.  */
01736        if (dfa->eclosures[edest].nelem == 0)
01737          {
01738            incomplete = true;
01739            re_node_set_free (&eclosure_elem);
01740          }
01741       }
01742 
01743   /* An epsilon closure includes itself.  */
01744   ok = re_node_set_insert (&eclosure, node);
01745   if (BE (! ok, 0))
01746     return REG_ESPACE;
01747   if (incomplete && !root)
01748     dfa->eclosures[node].nelem = 0;
01749   else
01750     dfa->eclosures[node] = eclosure;
01751   *new_set = eclosure;
01752   return REG_NOERROR;
01753 }
01754 
01755 /* Functions for token which are used in the parser.  */
01756 
01757 /* Fetch a token from INPUT.
01758    We must not use this function inside bracket expressions.  */
01759 
01760 static void
01761 internal_function
01762 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
01763 {
01764   re_string_skip_bytes (input, peek_token (result, input, syntax));
01765 }
01766 
01767 /* Peek a token from INPUT, and return the length of the token.
01768    We must not use this function inside bracket expressions.  */
01769 
01770 static int
01771 internal_function
01772 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
01773 {
01774   unsigned char c;
01775 
01776   if (re_string_eoi (input))
01777     {
01778       token->type = END_OF_RE;
01779       return 0;
01780     }
01781 
01782   c = re_string_peek_byte (input, 0);
01783   token->opr.c = c;
01784 
01785   token->word_char = 0;
01786 #ifdef RE_ENABLE_I18N
01787   token->mb_partial = 0;
01788   if (input->mb_cur_max > 1 &&
01789       !re_string_first_byte (input, re_string_cur_idx (input)))
01790     {
01791       token->type = CHARACTER;
01792       token->mb_partial = 1;
01793       return 1;
01794     }
01795 #endif
01796   if (c == '\\')
01797     {
01798       unsigned char c2;
01799       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
01800        {
01801          token->type = BACK_SLASH;
01802          return 1;
01803        }
01804 
01805       c2 = re_string_peek_byte_case (input, 1);
01806       token->opr.c = c2;
01807       token->type = CHARACTER;
01808 #ifdef RE_ENABLE_I18N
01809       if (input->mb_cur_max > 1)
01810        {
01811          wint_t wc = re_string_wchar_at (input,
01812                                      re_string_cur_idx (input) + 1);
01813          token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
01814        }
01815       else
01816 #endif
01817        token->word_char = IS_WORD_CHAR (c2) != 0;
01818 
01819       switch (c2)
01820        {
01821        case '|':
01822          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
01823            token->type = OP_ALT;
01824          break;
01825        case '1': case '2': case '3': case '4': case '5':
01826        case '6': case '7': case '8': case '9':
01827          if (!(syntax & RE_NO_BK_REFS))
01828            {
01829              token->type = OP_BACK_REF;
01830              token->opr.idx = c2 - '1';
01831            }
01832          break;
01833        case '<':
01834          if (!(syntax & RE_NO_GNU_OPS))
01835            {
01836              token->type = ANCHOR;
01837              token->opr.ctx_type = WORD_FIRST;
01838            }
01839          break;
01840        case '>':
01841          if (!(syntax & RE_NO_GNU_OPS))
01842            {
01843              token->type = ANCHOR;
01844              token->opr.ctx_type = WORD_LAST;
01845            }
01846          break;
01847        case 'b':
01848          if (!(syntax & RE_NO_GNU_OPS))
01849            {
01850              token->type = ANCHOR;
01851              token->opr.ctx_type = WORD_DELIM;
01852            }
01853          break;
01854        case 'B':
01855          if (!(syntax & RE_NO_GNU_OPS))
01856            {
01857              token->type = ANCHOR;
01858              token->opr.ctx_type = NOT_WORD_DELIM;
01859            }
01860          break;
01861        case 'w':
01862          if (!(syntax & RE_NO_GNU_OPS))
01863            token->type = OP_WORD;
01864          break;
01865        case 'W':
01866          if (!(syntax & RE_NO_GNU_OPS))
01867            token->type = OP_NOTWORD;
01868          break;
01869        case 's':
01870          if (!(syntax & RE_NO_GNU_OPS))
01871            token->type = OP_SPACE;
01872          break;
01873        case 'S':
01874          if (!(syntax & RE_NO_GNU_OPS))
01875            token->type = OP_NOTSPACE;
01876          break;
01877        case '`':
01878          if (!(syntax & RE_NO_GNU_OPS))
01879            {
01880              token->type = ANCHOR;
01881              token->opr.ctx_type = BUF_FIRST;
01882            }
01883          break;
01884        case '\'':
01885          if (!(syntax & RE_NO_GNU_OPS))
01886            {
01887              token->type = ANCHOR;
01888              token->opr.ctx_type = BUF_LAST;
01889            }
01890          break;
01891        case '(':
01892          if (!(syntax & RE_NO_BK_PARENS))
01893            token->type = OP_OPEN_SUBEXP;
01894          break;
01895        case ')':
01896          if (!(syntax & RE_NO_BK_PARENS))
01897            token->type = OP_CLOSE_SUBEXP;
01898          break;
01899        case '+':
01900          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
01901            token->type = OP_DUP_PLUS;
01902          break;
01903        case '?':
01904          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
01905            token->type = OP_DUP_QUESTION;
01906          break;
01907        case '{':
01908          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
01909            token->type = OP_OPEN_DUP_NUM;
01910          break;
01911        case '}':
01912          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
01913            token->type = OP_CLOSE_DUP_NUM;
01914          break;
01915        default:
01916          break;
01917        }
01918       return 2;
01919     }
01920 
01921   token->type = CHARACTER;
01922 #ifdef RE_ENABLE_I18N
01923   if (input->mb_cur_max > 1)
01924     {
01925       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
01926       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
01927     }
01928   else
01929 #endif
01930     token->word_char = IS_WORD_CHAR (token->opr.c);
01931 
01932   switch (c)
01933     {
01934     case '\n':
01935       if (syntax & RE_NEWLINE_ALT)
01936        token->type = OP_ALT;
01937       break;
01938     case '|':
01939       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
01940        token->type = OP_ALT;
01941       break;
01942     case '*':
01943       token->type = OP_DUP_ASTERISK;
01944       break;
01945     case '+':
01946       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
01947        token->type = OP_DUP_PLUS;
01948       break;
01949     case '?':
01950       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
01951        token->type = OP_DUP_QUESTION;
01952       break;
01953     case '{':
01954       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
01955        token->type = OP_OPEN_DUP_NUM;
01956       break;
01957     case '}':
01958       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
01959        token->type = OP_CLOSE_DUP_NUM;
01960       break;
01961     case '(':
01962       if (syntax & RE_NO_BK_PARENS)
01963        token->type = OP_OPEN_SUBEXP;
01964       break;
01965     case ')':
01966       if (syntax & RE_NO_BK_PARENS)
01967        token->type = OP_CLOSE_SUBEXP;
01968       break;
01969     case '[':
01970       token->type = OP_OPEN_BRACKET;
01971       break;
01972     case '.':
01973       token->type = OP_PERIOD;
01974       break;
01975     case '^':
01976       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
01977          re_string_cur_idx (input) != 0)
01978        {
01979          char prev = re_string_peek_byte (input, -1);
01980          if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
01981            break;
01982        }
01983       token->type = ANCHOR;
01984       token->opr.ctx_type = LINE_FIRST;
01985       break;
01986     case '$':
01987       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
01988          re_string_cur_idx (input) + 1 != re_string_length (input))
01989        {
01990          re_token_t next;
01991          re_string_skip_bytes (input, 1);
01992          peek_token (&next, input, syntax);
01993          re_string_skip_bytes (input, -1);
01994          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
01995            break;
01996        }
01997       token->type = ANCHOR;
01998       token->opr.ctx_type = LINE_LAST;
01999       break;
02000     default:
02001       break;
02002     }
02003   return 1;
02004 }
02005 
02006 /* Peek a token from INPUT, and return the length of the token.
02007    We must not use this function out of bracket expressions.  */
02008 
02009 static int
02010 internal_function
02011 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
02012 {
02013   unsigned char c;
02014   if (re_string_eoi (input))
02015     {
02016       token->type = END_OF_RE;
02017       return 0;
02018     }
02019   c = re_string_peek_byte (input, 0);
02020   token->opr.c = c;
02021 
02022 #ifdef RE_ENABLE_I18N
02023   if (input->mb_cur_max > 1 &&
02024       !re_string_first_byte (input, re_string_cur_idx (input)))
02025     {
02026       token->type = CHARACTER;
02027       return 1;
02028     }
02029 #endif /* RE_ENABLE_I18N */
02030 
02031   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
02032       && re_string_cur_idx (input) + 1 < re_string_length (input))
02033     {
02034       /* In this case, '\' escape a character.  */
02035       unsigned char c2;
02036       re_string_skip_bytes (input, 1);
02037       c2 = re_string_peek_byte (input, 0);
02038       token->opr.c = c2;
02039       token->type = CHARACTER;
02040       return 1;
02041     }
02042   if (c == '[') /* '[' is a special char in a bracket exps.  */
02043     {
02044       unsigned char c2;
02045       int token_len;
02046       if (re_string_cur_idx (input) + 1 < re_string_length (input))
02047        c2 = re_string_peek_byte (input, 1);
02048       else
02049        c2 = 0;
02050       token->opr.c = c2;
02051       token_len = 2;
02052       switch (c2)
02053        {
02054        case '.':
02055          token->type = OP_OPEN_COLL_ELEM;
02056          break;
02057        case '=':
02058          token->type = OP_OPEN_EQUIV_CLASS;
02059          break;
02060        case ':':
02061          if (syntax & RE_CHAR_CLASSES)
02062            {
02063              token->type = OP_OPEN_CHAR_CLASS;
02064              break;
02065            }
02066          /* else fall through.  */
02067        default:
02068          token->type = CHARACTER;
02069          token->opr.c = c;
02070          token_len = 1;
02071          break;
02072        }
02073       return token_len;
02074     }
02075   switch (c)
02076     {
02077     case '-':
02078       token->type = OP_CHARSET_RANGE;
02079       break;
02080     case ']':
02081       token->type = OP_CLOSE_BRACKET;
02082       break;
02083     case '^':
02084       token->type = OP_NON_MATCH_LIST;
02085       break;
02086     default:
02087       token->type = CHARACTER;
02088     }
02089   return 1;
02090 }
02091 
02092 /* Functions for parser.  */
02093 
02094 /* Entry point of the parser.
02095    Parse the regular expression REGEXP and return the structure tree.
02096    If an error is occured, ERR is set by error code, and return NULL.
02097    This function build the following tree, from regular expression <reg_exp>:
02098           CAT
02099           / \
02100          /   \
02101    <reg_exp>  EOR
02102 
02103    CAT means concatenation.
02104    EOR means end of regular expression.  */
02105 
02106 static bin_tree_t *
02107 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
02108        reg_errcode_t *err)
02109 {
02110   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02111   bin_tree_t *tree, *eor, *root;
02112   re_token_t current_token;
02113   dfa->syntax = syntax;
02114   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02115   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
02116   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02117     return NULL;
02118   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
02119   if (tree != NULL)
02120     root = create_tree (dfa, tree, eor, CONCAT);
02121   else
02122     root = eor;
02123   if (BE (eor == NULL || root == NULL, 0))
02124     {
02125       *err = REG_ESPACE;
02126       return NULL;
02127     }
02128   return root;
02129 }
02130 
02131 /* This function build the following tree, from regular expression
02132    <branch1>|<branch2>:
02133           ALT
02134           / \
02135          /   \
02136    <branch1> <branch2>
02137 
02138    ALT means alternative, which represents the operator `|'.  */
02139 
02140 static bin_tree_t *
02141 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
02142               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
02143 {
02144   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02145   bin_tree_t *tree, *branch = NULL;
02146   tree = parse_branch (regexp, preg, token, syntax, nest, err);
02147   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02148     return NULL;
02149 
02150   while (token->type == OP_ALT)
02151     {
02152       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02153       if (token->type != OP_ALT && token->type != END_OF_RE
02154          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
02155        {
02156          branch = parse_branch (regexp, preg, token, syntax, nest, err);
02157          if (BE (*err != REG_NOERROR && branch == NULL, 0))
02158            return NULL;
02159        }
02160       else
02161        branch = NULL;
02162       tree = create_tree (dfa, tree, branch, OP_ALT);
02163       if (BE (tree == NULL, 0))
02164        {
02165          *err = REG_ESPACE;
02166          return NULL;
02167        }
02168     }
02169   return tree;
02170 }
02171 
02172 /* This function build the following tree, from regular expression
02173    <exp1><exp2>:
02174        CAT
02175        / \
02176        /   \
02177    <exp1> <exp2>
02178 
02179    CAT means concatenation.  */
02180 
02181 static bin_tree_t *
02182 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
02183              reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
02184 {
02185   bin_tree_t *tree, *expr;
02186   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02187   tree = parse_expression (regexp, preg, token, syntax, nest, err);
02188   if (BE (*err != REG_NOERROR && tree == NULL, 0))
02189     return NULL;
02190 
02191   while (token->type != OP_ALT && token->type != END_OF_RE
02192         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
02193     {
02194       expr = parse_expression (regexp, preg, token, syntax, nest, err);
02195       if (BE (*err != REG_NOERROR && expr == NULL, 0))
02196        {
02197          return NULL;
02198        }
02199       if (tree != NULL && expr != NULL)
02200        {
02201          tree = create_tree (dfa, tree, expr, CONCAT);
02202          if (tree == NULL)
02203            {
02204              *err = REG_ESPACE;
02205              return NULL;
02206            }
02207        }
02208       else if (tree == NULL)
02209        tree = expr;
02210       /* Otherwise expr == NULL, we don't need to create new tree.  */
02211     }
02212   return tree;
02213 }
02214 
02215 /* This function build the following tree, from regular expression a*:
02216         *
02217         |
02218         a
02219 */
02220 
02221 static bin_tree_t *
02222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
02223                 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
02224 {
02225   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02226   bin_tree_t *tree;
02227   switch (token->type)
02228     {
02229     case CHARACTER:
02230       tree = create_token_tree (dfa, NULL, NULL, token);
02231       if (BE (tree == NULL, 0))
02232        {
02233          *err = REG_ESPACE;
02234          return NULL;
02235        }
02236 #ifdef RE_ENABLE_I18N
02237       if (dfa->mb_cur_max > 1)
02238        {
02239          while (!re_string_eoi (regexp)
02240                && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
02241            {
02242              bin_tree_t *mbc_remain;
02243              fetch_token (token, regexp, syntax);
02244              mbc_remain = create_token_tree (dfa, NULL, NULL, token);
02245              tree = create_tree (dfa, tree, mbc_remain, CONCAT);
02246              if (BE (mbc_remain == NULL || tree == NULL, 0))
02247               {
02248                 *err = REG_ESPACE;
02249                 return NULL;
02250               }
02251            }
02252        }
02253 #endif
02254       break;
02255     case OP_OPEN_SUBEXP:
02256       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
02257       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02258        return NULL;
02259       break;
02260     case OP_OPEN_BRACKET:
02261       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
02262       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02263        return NULL;
02264       break;
02265     case OP_BACK_REF:
02266       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
02267        {
02268          *err = REG_ESUBREG;
02269          return NULL;
02270        }
02271       dfa->used_bkref_map |= 1 << token->opr.idx;
02272       tree = create_token_tree (dfa, NULL, NULL, token);
02273       if (BE (tree == NULL, 0))
02274        {
02275          *err = REG_ESPACE;
02276          return NULL;
02277        }
02278       ++dfa->nbackref;
02279       dfa->has_mb_node = 1;
02280       break;
02281     case OP_OPEN_DUP_NUM:
02282       if (syntax & RE_CONTEXT_INVALID_DUP)
02283        {
02284          *err = REG_BADRPT;
02285          return NULL;
02286        }
02287       /* FALLTHROUGH */
02288     case OP_DUP_ASTERISK:
02289     case OP_DUP_PLUS:
02290     case OP_DUP_QUESTION:
02291       if (syntax & RE_CONTEXT_INVALID_OPS)
02292        {
02293          *err = REG_BADRPT;
02294          return NULL;
02295        }
02296       else if (syntax & RE_CONTEXT_INDEP_OPS)
02297        {
02298          fetch_token (token, regexp, syntax);
02299          return parse_expression (regexp, preg, token, syntax, nest, err);
02300        }
02301       /* else fall through  */
02302     case OP_CLOSE_SUBEXP:
02303       if ((token->type == OP_CLOSE_SUBEXP) &&
02304          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
02305        {
02306          *err = REG_ERPAREN;
02307          return NULL;
02308        }
02309       /* else fall through  */
02310     case OP_CLOSE_DUP_NUM:
02311       /* We treat it as a normal character.  */
02312 
02313       /* Then we can these characters as normal characters.  */
02314       token->type = CHARACTER;
02315       /* mb_partial and word_char bits should be initialized already
02316         by peek_token.  */
02317       tree = create_token_tree (dfa, NULL, NULL, token);
02318       if (BE (tree == NULL, 0))
02319        {
02320          *err = REG_ESPACE;
02321          return NULL;
02322        }
02323       break;
02324     case ANCHOR:
02325       if ((token->opr.ctx_type
02326           & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
02327          && dfa->word_ops_used == 0)
02328        init_word_char (dfa);
02329       if (token->opr.ctx_type == WORD_DELIM
02330          || token->opr.ctx_type == NOT_WORD_DELIM)
02331        {
02332          bin_tree_t *tree_first, *tree_last;
02333          if (token->opr.ctx_type == WORD_DELIM)
02334            {
02335              token->opr.ctx_type = WORD_FIRST;
02336              tree_first = create_token_tree (dfa, NULL, NULL, token);
02337              token->opr.ctx_type = WORD_LAST;
02338            }
02339          else
02340            {
02341              token->opr.ctx_type = INSIDE_WORD;
02342              tree_first = create_token_tree (dfa, NULL, NULL, token);
02343              token->opr.ctx_type = INSIDE_NOTWORD;
02344            }
02345          tree_last = create_token_tree (dfa, NULL, NULL, token);
02346          tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
02347          if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
02348            {
02349              *err = REG_ESPACE;
02350              return NULL;
02351            }
02352        }
02353       else
02354        {
02355          tree = create_token_tree (dfa, NULL, NULL, token);
02356          if (BE (tree == NULL, 0))
02357            {
02358              *err = REG_ESPACE;
02359              return NULL;
02360            }
02361        }
02362       /* We must return here, since ANCHORs can't be followed
02363         by repetition operators.
02364         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
02365             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
02366       fetch_token (token, regexp, syntax);
02367       return tree;
02368     case OP_PERIOD:
02369       tree = create_token_tree (dfa, NULL, NULL, token);
02370       if (BE (tree == NULL, 0))
02371        {
02372          *err = REG_ESPACE;
02373          return NULL;
02374        }
02375       if (dfa->mb_cur_max > 1)
02376        dfa->has_mb_node = 1;
02377       break;
02378     case OP_WORD:
02379     case OP_NOTWORD:
02380       tree = build_charclass_op (dfa, regexp->trans,
02381                              (const unsigned char *) "alnum",
02382                              (const unsigned char *) "_",
02383                              token->type == OP_NOTWORD, err);
02384       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02385        return NULL;
02386       break;
02387     case OP_SPACE:
02388     case OP_NOTSPACE:
02389       tree = build_charclass_op (dfa, regexp->trans,
02390                              (const unsigned char *) "space",
02391                              (const unsigned char *) "",
02392                              token->type == OP_NOTSPACE, err);
02393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02394        return NULL;
02395       break;
02396     case OP_ALT:
02397     case END_OF_RE:
02398       return NULL;
02399     case BACK_SLASH:
02400       *err = REG_EESCAPE;
02401       return NULL;
02402     default:
02403       /* Must not happen?  */
02404 #ifdef DEBUG
02405       assert (0);
02406 #endif
02407       return NULL;
02408     }
02409   fetch_token (token, regexp, syntax);
02410 
02411   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
02412         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
02413     {
02414       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
02415       if (BE (*err != REG_NOERROR && tree == NULL, 0))
02416        return NULL;
02417       /* In BRE consecutive duplications are not allowed.  */
02418       if ((syntax & RE_CONTEXT_INVALID_DUP)
02419          && (token->type == OP_DUP_ASTERISK
02420              || token->type == OP_OPEN_DUP_NUM))
02421        {
02422          *err = REG_BADRPT;
02423          return NULL;
02424        }
02425     }
02426 
02427   return tree;
02428 }
02429 
02430 /* This function build the following tree, from regular expression
02431    (<reg_exp>):
02432         SUBEXP
02433            |
02434        <reg_exp>
02435 */
02436 
02437 static bin_tree_t *
02438 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
02439               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
02440 {
02441   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
02442   bin_tree_t *tree;
02443   size_t cur_nsub;
02444   cur_nsub = preg->re_nsub++;
02445 
02446   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
02447 
02448   /* The subexpression may be a null string.  */
02449   if (token->type == OP_CLOSE_SUBEXP)
02450     tree = NULL;
02451   else
02452     {
02453       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
02454       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
02455        *err = REG_EPAREN;
02456       if (BE (*err != REG_NOERROR, 0))
02457        return NULL;
02458     }
02459 
02460   if (cur_nsub <= '9' - '1')
02461     dfa->completed_bkref_map |= 1 << cur_nsub;
02462 
02463   tree = create_tree (dfa, tree, NULL, SUBEXP);
02464   if (BE (tree == NULL, 0))
02465     {
02466       *err = REG_ESPACE;
02467       return NULL;
02468     }
02469   tree->token.opr.idx = cur_nsub;
02470   return tree;
02471 }
02472 
02473 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
02474 
02475 static bin_tree_t *
02476 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
02477              re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
02478 {
02479   bin_tree_t *tree = NULL, *old_tree = NULL;
02480   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
02481   re_token_t start_token = *token;
02482 
02483   if (token->type == OP_OPEN_DUP_NUM)
02484     {
02485       end = 0;
02486       start = fetch_number (regexp, token, syntax);
02487       if (start == REG_MISSING)
02488        {
02489          if (token->type == CHARACTER && token->opr.c == ',')
02490            start = 0; /* We treat "{,m}" as "{0,m}".  */
02491          else
02492            {
02493              *err = REG_BADBR; /* <re>{} is invalid.  */
02494              return NULL;
02495            }
02496        }
02497       if (BE (start != REG_ERROR, 1))
02498        {
02499          /* We treat "{n}" as "{n,n}".  */
02500          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
02501                : ((token->type == CHARACTER && token->opr.c == ',')
02502                   ? fetch_number (regexp, token, syntax) : REG_ERROR));
02503        }
02504       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
02505        {
02506          /* Invalid sequence.  */
02507          if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
02508            {
02509              if (token->type == END_OF_RE)
02510               *err = REG_EBRACE;
02511              else
02512               *err = REG_BADBR;
02513 
02514              return NULL;
02515            }
02516 
02517          /* If the syntax bit is set, rollback.  */
02518          re_string_set_index (regexp, start_idx);
02519          *token = start_token;
02520          token->type = CHARACTER;
02521          /* mb_partial and word_char bits should be already initialized by
02522             peek_token.  */
02523          return elem;
02524        }
02525 
02526       if (BE ((end != REG_MISSING && start > end)
02527              || token->type != OP_CLOSE_DUP_NUM, 0))
02528        {
02529          /* First number greater than second.  */
02530          *err = REG_BADBR;
02531          return NULL;
02532        }
02533     }
02534   else
02535     {
02536       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
02537       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
02538     }
02539 
02540   fetch_token (token, regexp, syntax);
02541 
02542   if (BE (elem == NULL, 0))
02543     return NULL;
02544   if (BE (start == 0 && end == 0, 0))
02545     {
02546       postorder (elem, free_tree, NULL);
02547       return NULL;
02548     }
02549 
02550   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
02551   if (BE (start > 0, 0))
02552     {
02553       tree = elem;
02554       for (i = 2; i <= start; ++i)
02555        {
02556          elem = duplicate_tree (elem, dfa);
02557          tree = create_tree (dfa, tree, elem, CONCAT);
02558          if (BE (elem == NULL || tree == NULL, 0))
02559            goto parse_dup_op_espace;
02560        }
02561 
02562       if (start == end)
02563        return tree;
02564 
02565       /* Duplicate ELEM before it is marked optional.  */
02566       elem = duplicate_tree (elem, dfa);
02567       old_tree = tree;
02568     }
02569   else
02570     old_tree = NULL;
02571 
02572   if (elem->token.type == SUBEXP)
02573     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
02574 
02575   tree = create_tree (dfa, elem, NULL,
02576                     (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
02577   if (BE (tree == NULL, 0))
02578     goto parse_dup_op_espace;
02579 
02580 /* From gnulib's "intprops.h":
02581    True if the arithmetic type T is signed.  */
02582 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
02583 
02584   /* This loop is actually executed only when end != REG_MISSING,
02585      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
02586      already created the start+1-th copy.  */
02587   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
02588     for (i = start + 2; i <= end; ++i)
02589       {
02590        elem = duplicate_tree (elem, dfa);
02591        tree = create_tree (dfa, tree, elem, CONCAT);
02592        if (BE (elem == NULL || tree == NULL, 0))
02593          goto parse_dup_op_espace;
02594 
02595        tree = create_tree (dfa, tree, NULL, OP_ALT);
02596        if (BE (tree == NULL, 0))
02597          goto parse_dup_op_espace;
02598       }
02599 
02600   if (old_tree)
02601     tree = create_tree (dfa, old_tree, tree, CONCAT);
02602 
02603   return tree;
02604 
02605  parse_dup_op_espace:
02606   *err = REG_ESPACE;
02607   return NULL;
02608 }
02609 
02610 /* Size of the names for collating symbol/equivalence_class/character_class.
02611    I'm not sure, but maybe enough.  */
02612 #define BRACKET_NAME_BUF_SIZE 32
02613 
02614 #ifndef _LIBC
02615   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
02616      Build the range expression which starts from START_ELEM, and ends
02617      at END_ELEM.  The result are written to MBCSET and SBCSET.
02618      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
02619      mbcset->range_ends, is a pointer argument sinse we may
02620      update it.  */
02621 
02622 static reg_errcode_t
02623 internal_function
02624 # ifdef RE_ENABLE_I18N
02625 build_range_exp (const reg_syntax_t syntax,
02626                  bitset_t sbcset,
02627                  re_charset_t *mbcset,
02628                  Idx *range_alloc,
02629                  const bracket_elem_t *start_elem,
02630                  const bracket_elem_t *end_elem)
02631 # else /* not RE_ENABLE_I18N */
02632 build_range_exp (const reg_syntax_t syntax,
02633                  bitset_t sbcset,
02634                  const bracket_elem_t *start_elem,
02635                  const bracket_elem_t *end_elem)
02636 # endif /* not RE_ENABLE_I18N */
02637 {
02638   unsigned int start_ch, end_ch;
02639   /* Equivalence Classes and Character Classes can't be a range start/end.  */
02640   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
02641          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
02642          0))
02643     return REG_ERANGE;
02644 
02645   /* We can handle no multi character collating elements without libc
02646      support.  */
02647   if (BE ((start_elem->type == COLL_SYM
02648           && strlen ((char *) start_elem->opr.name) > 1)
02649          || (end_elem->type == COLL_SYM
02650              && strlen ((char *) end_elem->opr.name) > 1), 0))
02651     return REG_ECOLLATE;
02652 
02653 # ifdef RE_ENABLE_I18N
02654   {
02655     wchar_t wc;
02656     wint_t start_wc;
02657     wint_t end_wc;
02658     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
02659 
02660     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
02661               : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
02662                  : 0));
02663     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
02664              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
02665                : 0));
02666     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
02667               ? __btowc (start_ch) : start_elem->opr.wch);
02668     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
02669              ? __btowc (end_ch) : end_elem->opr.wch);
02670     if (start_wc == WEOF || end_wc == WEOF)
02671       return REG_ECOLLATE;
02672     cmp_buf[0] = start_wc;
02673     cmp_buf[4] = end_wc;
02674 
02675     if (BE ((syntax & RE_NO_EMPTY_RANGES)
02676             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
02677       return REG_ERANGE;
02678 
02679     /* Got valid collation sequence values, add them as a new entry.
02680        However, for !_LIBC we have no collation elements: if the
02681        character set is single byte, the single byte character set
02682        that we build below suffices.  parse_bracket_exp passes
02683        no MBCSET if dfa->mb_cur_max == 1.  */
02684     if (mbcset)
02685       {
02686        /* Check the space of the arrays.  */
02687        if (BE (*range_alloc == mbcset->nranges, 0))
02688          {
02689            /* There is not enough space, need realloc.  */
02690            wchar_t *new_array_start, *new_array_end;
02691            Idx new_nranges;
02692 
02693            /* +1 in case of mbcset->nranges is 0.  */
02694            new_nranges = 2 * mbcset->nranges + 1;
02695            /* Use realloc since mbcset->range_starts and mbcset->range_ends
02696               are NULL if *range_alloc == 0.  */
02697            new_array_start = re_realloc (mbcset->range_starts, wchar_t,
02698                                      new_nranges);
02699            new_array_end = re_realloc (mbcset->range_ends, wchar_t,
02700                                    new_nranges);
02701 
02702            if (BE (new_array_start == NULL || new_array_end == NULL, 0))
02703              return REG_ESPACE;
02704 
02705            mbcset->range_starts = new_array_start;
02706            mbcset->range_ends = new_array_end;
02707            *range_alloc = new_nranges;
02708          }
02709 
02710        mbcset->range_starts[mbcset->nranges] = start_wc;
02711        mbcset->range_ends[mbcset->nranges++] = end_wc;
02712       }
02713 
02714     /* Build the table for single byte characters.  */
02715     for (wc = 0; wc < SBC_MAX; ++wc)
02716       {
02717        cmp_buf[2] = wc;
02718        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
02719            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
02720          bitset_set (sbcset, wc);
02721       }
02722   }
02723 # else /* not RE_ENABLE_I18N */
02724   {
02725     unsigned int ch;
02726     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
02727               : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
02728                  : 0));
02729     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
02730              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
02731                : 0));
02732     if (start_ch > end_ch)
02733       return REG_ERANGE;
02734     /* Build the table for single byte characters.  */
02735     for (ch = 0; ch < SBC_MAX; ++ch)
02736       if (start_ch <= ch  && ch <= end_ch)
02737        bitset_set (sbcset, ch);
02738   }
02739 # endif /* not RE_ENABLE_I18N */
02740   return REG_NOERROR;
02741 }
02742 #endif /* not _LIBC */
02743 
02744 #ifndef _LIBC
02745 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
02746    Build the collating element which is represented by NAME.
02747    The result are written to MBCSET and SBCSET.
02748    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
02749    pointer argument since we may update it.  */
02750 
02751 static reg_errcode_t
02752 internal_function
02753 build_collating_symbol (bitset_t sbcset,
02754 # ifdef RE_ENABLE_I18N
02755                      re_charset_t *mbcset, Idx *coll_sym_alloc,
02756 # endif
02757                      const unsigned char *name)
02758 {
02759   size_t name_len = strlen ((const char *) name);
02760   if (BE (name_len != 1, 0))
02761     return REG_ECOLLATE;
02762   else
02763     {
02764       bitset_set (sbcset, name[0]);
02765       return REG_NOERROR;
02766     }
02767 }
02768 #endif /* not _LIBC */
02769 
02770 /* This function parse bracket expression like "[abc]", "[a-c]",
02771    "[[.a-a.]]" etc.  */
02772 
02773 static bin_tree_t *
02774 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
02775                  reg_syntax_t syntax, reg_errcode_t *err)
02776 {
02777 #ifdef _LIBC
02778   const unsigned char *collseqmb;
02779   const char *collseqwc;
02780   uint32_t nrules;
02781   int32_t table_size;
02782   const int32_t *symb_table;
02783   const unsigned char *extra;
02784 
02785   /* Local function for parse_bracket_exp used in _LIBC environement.
02786      Seek the collating symbol entry correspondings to NAME.
02787      Return the index of the symbol in the SYMB_TABLE.  */
02788 
02789   auto inline int32_t
02790   __attribute ((always_inline))
02791   seek_collating_symbol_entry (name, name_len)
02792         const unsigned char *name;
02793         size_t name_len;
02794     {
02795       int32_t hash = elem_hash ((const char *) name, name_len);
02796       int32_t elem = hash % table_size;
02797       if (symb_table[2 * elem] != 0)
02798        {
02799          int32_t second = hash % (table_size - 2) + 1;
02800 
02801          do
02802            {
02803              /* First compare the hashing value.  */
02804              if (symb_table[2 * elem] == hash
02805                 /* Compare the length of the name.  */
02806                 && name_len == extra[symb_table[2 * elem + 1]]
02807                 /* Compare the name.  */
02808                 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
02809                           name_len) == 0)
02810               {
02811                 /* Yep, this is the entry.  */
02812                 break;
02813               }
02814 
02815              /* Next entry.  */
02816              elem += second;
02817            }
02818          while (symb_table[2 * elem] != 0);
02819        }
02820       return elem;
02821     }
02822 
02823   /* Local function for parse_bracket_exp used in _LIBC environment.
02824      Look up the collation sequence value of BR_ELEM.
02825      Return the value if succeeded, UINT_MAX otherwise.  */
02826 
02827   auto inline unsigned int
02828   __attribute ((always_inline))
02829   lookup_collation_sequence_value (br_elem)
02830         bracket_elem_t *br_elem;
02831     {
02832       if (br_elem->type == SB_CHAR)
02833        {
02834          /*
02835          if (MB_CUR_MAX == 1)
02836          */
02837          if (nrules == 0)
02838            return collseqmb[br_elem->opr.ch];
02839          else
02840            {
02841              wint_t wc = __btowc (br_elem->opr.ch);
02842              return __collseq_table_lookup (collseqwc, wc);
02843            }
02844        }
02845       else if (br_elem->type == MB_CHAR)
02846        {
02847          if (nrules != 0)
02848            return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
02849        }
02850       else if (br_elem->type == COLL_SYM)
02851        {
02852          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
02853          if (nrules != 0)
02854            {
02855              int32_t elem, idx;
02856              elem = seek_collating_symbol_entry (br_elem->opr.name,
02857                                             sym_name_len);
02858              if (symb_table[2 * elem] != 0)
02859               {
02860                 /* We found the entry.  */
02861                 idx = symb_table[2 * elem + 1];
02862                 /* Skip the name of collating element name.  */
02863                 idx += 1 + extra[idx];
02864                 /* Skip the byte sequence of the collating element.  */
02865                 idx += 1 + extra[idx];
02866                 /* Adjust for the alignment.  */
02867                 idx = (idx + 3) & ~3;
02868                 /* Skip the multibyte collation sequence value.  */
02869                 idx += sizeof (unsigned int);
02870                 /* Skip the wide char sequence of the collating element.  */
02871                 idx += sizeof (unsigned int) *
02872                   (1 + *(unsigned int *) (extra + idx));
02873                 /* Return the collation sequence value.  */
02874                 return *(unsigned int *) (extra + idx);
02875               }
02876              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
02877               {
02878                 /* No valid character.  Match it as a single byte
02879                    character.  */
02880                 return collseqmb[br_elem->opr.name[0]];
02881               }
02882            }
02883          else if (sym_name_len == 1)
02884            return collseqmb[br_elem->opr.name[0]];
02885        }
02886       return UINT_MAX;
02887     }
02888 
02889   /* Local function for parse_bracket_exp used in _LIBC environement.
02890      Build the range expression which starts from START_ELEM, and ends
02891      at END_ELEM.  The result are written to MBCSET and SBCSET.
02892      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
02893      mbcset->range_ends, is a pointer argument sinse we may
02894      update it.  */
02895 
02896   auto inline reg_errcode_t
02897   __attribute ((always_inline))
02898   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
02899         re_charset_t *mbcset;
02900         Idx *range_alloc;
02901         bitset_t sbcset;
02902         bracket_elem_t *start_elem, *end_elem;
02903     {
02904       unsigned int ch;
02905       uint32_t start_collseq;
02906       uint32_t end_collseq;
02907 
02908       /* Equivalence Classes and Character Classes can't be a range
02909         start/end.  */
02910       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
02911              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
02912              0))
02913        return REG_ERANGE;
02914 
02915       start_collseq = lookup_collation_sequence_value (start_elem);
02916       end_collseq = lookup_collation_sequence_value (end_elem);
02917       /* Check start/end collation sequence values.  */
02918       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
02919        return REG_ECOLLATE;
02920       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
02921        return REG_ERANGE;
02922 
02923       /* Got valid collation sequence values, add them as a new entry.
02924         However, if we have no collation elements, and the character set
02925         is single byte, the single byte character set that we
02926         build below suffices. */
02927       if (nrules > 0 || dfa->mb_cur_max > 1)
02928        {
02929          /* Check the space of the arrays.  */
02930          if (BE (*range_alloc == mbcset->nranges, 0))
02931            {
02932              /* There is not enough space, need realloc.  */
02933              uint32_t *new_array_start;
02934              uint32_t *new_array_end;
02935              Idx new_nranges;
02936 
02937              /* +1 in case of mbcset->nranges is 0.  */
02938              new_nranges = 2 * mbcset->nranges + 1;
02939              new_array_start = re_realloc (mbcset->range_starts, uint32_t,
02940                                        new_nranges);
02941              new_array_end = re_realloc (mbcset->range_ends, uint32_t,
02942                                      new_nranges);
02943 
02944              if (BE (new_array_start == NULL || new_array_end == NULL, 0))
02945               return REG_ESPACE;
02946 
02947              mbcset->range_starts = new_array_start;
02948              mbcset->range_ends = new_array_end;
02949              *range_alloc = new_nranges;
02950            }
02951 
02952          mbcset->range_starts[mbcset->nranges] = start_collseq;
02953          mbcset->range_ends[mbcset->nranges++] = end_collseq;
02954        }
02955 
02956       /* Build the table for single byte characters.  */
02957       for (ch = 0; ch < SBC_MAX; ch++)
02958        {
02959          uint32_t ch_collseq;
02960          /*
02961          if (MB_CUR_MAX == 1)
02962          */
02963          if (nrules == 0)
02964            ch_collseq = collseqmb[ch];
02965          else
02966            ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
02967          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
02968            bitset_set (sbcset, ch);
02969        }
02970       return REG_NOERROR;
02971     }
02972 
02973   /* Local function for parse_bracket_exp used in _LIBC environement.
02974      Build the collating element which is represented by NAME.
02975      The result are written to MBCSET and SBCSET.
02976      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
02977      pointer argument sinse we may update it.  */
02978 
02979   auto inline reg_errcode_t
02980   __attribute ((always_inline))
02981   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
02982         re_charset_t *mbcset;
02983         Idx *coll_sym_alloc;
02984         bitset_t sbcset;
02985         const unsigned char *name;
02986     {
02987       int32_t elem, idx;
02988       size_t name_len = strlen ((const char *) name);
02989       if (nrules != 0)
02990        {
02991          elem = seek_collating_symbol_entry (name, name_len);
02992          if (symb_table[2 * elem] != 0)
02993            {
02994              /* We found the entry.  */
02995              idx = symb_table[2 * elem + 1];
02996              /* Skip the name of collating element name.  */
02997              idx += 1 + extra[idx];
02998            }
02999          else if (symb_table[2 * elem] == 0 && name_len == 1)
03000            {
03001              /* No valid character, treat it as a normal
03002                character.  */
03003              bitset_set (sbcset, name[0]);
03004              return REG_NOERROR;
03005            }
03006          else
03007            return REG_ECOLLATE;
03008 
03009          /* Got valid collation sequence, add it as a new entry.  */
03010          /* Check the space of the arrays.  */
03011          if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
03012            {
03013              /* Not enough, realloc it.  */
03014              /* +1 in case of mbcset->ncoll_syms is 0.  */
03015              Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
03016              /* Use realloc since mbcset->coll_syms is NULL
03017                if *alloc == 0.  */
03018              int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
03019                                              new_coll_sym_alloc);
03020              if (BE (new_coll_syms == NULL, 0))
03021               return REG_ESPACE;
03022              mbcset->coll_syms = new_coll_syms;
03023              *coll_sym_alloc = new_coll_sym_alloc;
03024            }
03025          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
03026          return REG_NOERROR;
03027        }
03028       else
03029        {
03030          if (BE (name_len != 1, 0))
03031            return REG_ECOLLATE;
03032          else
03033            {
03034              bitset_set (sbcset, name[0]);
03035              return REG_NOERROR;
03036            }
03037        }
03038     }
03039 #endif
03040 
03041   re_token_t br_token;
03042   re_bitset_ptr_t sbcset;
03043 #ifdef RE_ENABLE_I18N
03044   re_charset_t *mbcset;
03045   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
03046   Idx equiv_class_alloc = 0, char_class_alloc = 0;
03047 #endif /* not RE_ENABLE_I18N */
03048   bool non_match = false;
03049   bin_tree_t *work_tree;
03050   int token_len;
03051   bool first_round = true;
03052 #ifdef _LIBC
03053   collseqmb = (const unsigned char *)
03054     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
03055   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
03056   if (nrules)
03057     {
03058       /*
03059       if (MB_CUR_MAX > 1)
03060       */
03061       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
03062       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
03063       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
03064                                             _NL_COLLATE_SYMB_TABLEMB);
03065       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
03066                                              _NL_COLLATE_SYMB_EXTRAMB);
03067     }
03068 #endif
03069   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
03070 #ifdef RE_ENABLE_I18N
03071   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
03072 #endif /* RE_ENABLE_I18N */
03073 #ifdef RE_ENABLE_I18N
03074   if (BE (sbcset == NULL || mbcset == NULL, 0))
03075 #else
03076   if (BE (sbcset == NULL, 0))
03077 #endif /* RE_ENABLE_I18N */
03078     {
03079       *err = REG_ESPACE;
03080       return NULL;
03081     }
03082 
03083   token_len = peek_token_bracket (token, regexp, syntax);
03084   if (BE (token->type == END_OF_RE, 0))
03085     {
03086       *err = REG_BADPAT;
03087       goto parse_bracket_exp_free_return;
03088     }
03089   if (token->type == OP_NON_MATCH_LIST)
03090     {
03091 #ifdef RE_ENABLE_I18N
03092       mbcset->non_match = 1;
03093 #endif /* not RE_ENABLE_I18N */
03094       non_match = true;
03095       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
03096        bitset_set (sbcset, '\n');
03097       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03098       token_len = peek_token_bracket (token, regexp, syntax);
03099       if (BE (token->type == END_OF_RE, 0))
03100        {
03101          *err = REG_BADPAT;
03102          goto parse_bracket_exp_free_return;
03103        }
03104     }
03105 
03106   /* We treat the first ']' as a normal character.  */
03107   if (token->type == OP_CLOSE_BRACKET)
03108     token->type = CHARACTER;
03109 
03110   while (1)
03111     {
03112       bracket_elem_t start_elem, end_elem;
03113       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
03114       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
03115       reg_errcode_t ret;
03116       int token_len2 = 0;
03117       bool is_range_exp = false;
03118       re_token_t token2;
03119 
03120       start_elem.opr.name = start_name_buf;
03121       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
03122                                syntax, first_round);
03123       if (BE (ret != REG_NOERROR, 0))
03124        {
03125          *err = ret;
03126          goto parse_bracket_exp_free_return;
03127        }
03128       first_round = false;
03129 
03130       /* Get information about the next token.  We need it in any case.  */
03131       token_len = peek_token_bracket (token, regexp, syntax);
03132 
03133       /* Do not check for ranges if we know they are not allowed.  */
03134       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
03135        {
03136          if (BE (token->type == END_OF_RE, 0))
03137            {
03138              *err = REG_EBRACK;
03139              goto parse_bracket_exp_free_return;
03140            }
03141          if (token->type == OP_CHARSET_RANGE)
03142            {
03143              re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
03144              token_len2 = peek_token_bracket (&token2, regexp, syntax);
03145              if (BE (token2.type == END_OF_RE, 0))
03146               {
03147                 *err = REG_EBRACK;
03148                 goto parse_bracket_exp_free_return;
03149               }
03150              if (token2.type == OP_CLOSE_BRACKET)
03151               {
03152                 /* We treat the last '-' as a normal character.  */
03153                 re_string_skip_bytes (regexp, -token_len);
03154                 token->type = CHARACTER;
03155               }
03156              else
03157               is_range_exp = true;
03158            }
03159        }
03160 
03161       if (is_range_exp == true)
03162        {
03163          end_elem.opr.name = end_name_buf;
03164          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
03165                                    dfa, syntax, true);
03166          if (BE (ret != REG_NOERROR, 0))
03167            {
03168              *err = ret;
03169              goto parse_bracket_exp_free_return;
03170            }
03171 
03172          token_len = peek_token_bracket (token, regexp, syntax);
03173 
03174 #ifdef _LIBC
03175          *err = build_range_exp (sbcset, mbcset, &range_alloc,
03176                               &start_elem, &end_elem);
03177 #else
03178 # ifdef RE_ENABLE_I18N
03179          *err = build_range_exp (syntax, sbcset,
03180                               dfa->mb_cur_max > 1 ? mbcset : NULL,
03181                               &range_alloc, &start_elem, &end_elem);
03182 # else
03183          *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
03184 # endif
03185 #endif /* RE_ENABLE_I18N */
03186          if (BE (*err != REG_NOERROR, 0))
03187            goto parse_bracket_exp_free_return;
03188        }
03189       else
03190        {
03191          switch (start_elem.type)
03192            {
03193            case SB_CHAR:
03194              bitset_set (sbcset, start_elem.opr.ch);
03195              break;
03196 #ifdef RE_ENABLE_I18N
03197            case MB_CHAR:
03198              /* Check whether the array has enough space.  */
03199              if (BE (mbchar_alloc == mbcset->nmbchars, 0))
03200               {
03201                 wchar_t *new_mbchars;
03202                 /* Not enough, realloc it.  */
03203                 /* +1 in case of mbcset->nmbchars is 0.  */
03204                 mbchar_alloc = 2 * mbcset->nmbchars + 1;
03205                 /* Use realloc since array is NULL if *alloc == 0.  */
03206                 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
03207                                        mbchar_alloc);
03208                 if (BE (new_mbchars == NULL, 0))
03209                   goto parse_bracket_exp_espace;
03210                 mbcset->mbchars = new_mbchars;
03211               }
03212              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
03213              break;
03214 #endif /* RE_ENABLE_I18N */
03215            case EQUIV_CLASS:
03216              *err = build_equiv_class (sbcset,
03217 #ifdef RE_ENABLE_I18N
03218                                    mbcset, &equiv_class_alloc,
03219 #endif /* RE_ENABLE_I18N */
03220                                    start_elem.opr.name);
03221              if (BE (*err != REG_NOERROR, 0))
03222               goto parse_bracket_exp_free_return;
03223              break;
03224            case COLL_SYM:
03225              *err = build_collating_symbol (sbcset,
03226 #ifdef RE_ENABLE_I18N
03227                                         mbcset, &coll_sym_alloc,
03228 #endif /* RE_ENABLE_I18N */
03229                                         start_elem.opr.name);
03230              if (BE (*err != REG_NOERROR, 0))
03231               goto parse_bracket_exp_free_return;
03232              break;
03233            case CHAR_CLASS:
03234              *err = build_charclass (regexp->trans, sbcset,
03235 #ifdef RE_ENABLE_I18N
03236                                   mbcset, &char_class_alloc,
03237 #endif /* RE_ENABLE_I18N */
03238                                   start_elem.opr.name, syntax);
03239              if (BE (*err != REG_NOERROR, 0))
03240               goto parse_bracket_exp_free_return;
03241              break;
03242            default:
03243              assert (0);
03244              break;
03245            }
03246        }
03247       if (BE (token->type == END_OF_RE, 0))
03248        {
03249          *err = REG_EBRACK;
03250          goto parse_bracket_exp_free_return;
03251        }
03252       if (token->type == OP_CLOSE_BRACKET)
03253        break;
03254     }
03255 
03256   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03257 
03258   /* If it is non-matching list.  */
03259   if (non_match)
03260     bitset_not (sbcset);
03261 
03262 #ifdef RE_ENABLE_I18N
03263   /* Ensure only single byte characters are set.  */
03264   if (dfa->mb_cur_max > 1)
03265     bitset_mask (sbcset, dfa->sb_char);
03266 
03267   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
03268       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
03269                                                || mbcset->non_match)))
03270     {
03271       bin_tree_t *mbc_tree;
03272       int sbc_idx;
03273       /* Build a tree for complex bracket.  */
03274       dfa->has_mb_node = 1;
03275       br_token.type = COMPLEX_BRACKET;
03276       br_token.opr.mbcset = mbcset;
03277       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03278       if (BE (mbc_tree == NULL, 0))
03279        goto parse_bracket_exp_espace;
03280       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
03281        if (sbcset[sbc_idx])
03282          break;
03283       /* If there are no bits set in sbcset, there is no point
03284         of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
03285       if (sbc_idx < BITSET_WORDS)
03286        {
03287          /* Build a tree for simple bracket.  */
03288          br_token.type = SIMPLE_BRACKET;
03289          br_token.opr.sbcset = sbcset;
03290          work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03291          if (BE (work_tree == NULL, 0))
03292            goto parse_bracket_exp_espace;
03293 
03294          /* Then join them by ALT node.  */
03295          work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
03296          if (BE (work_tree == NULL, 0))
03297            goto parse_bracket_exp_espace;
03298        }
03299       else
03300        {
03301          re_free (sbcset);
03302          work_tree = mbc_tree;
03303        }
03304     }
03305   else
03306 #endif /* not RE_ENABLE_I18N */
03307     {
03308 #ifdef RE_ENABLE_I18N
03309       free_charset (mbcset);
03310 #endif
03311       /* Build a tree for simple bracket.  */
03312       br_token.type = SIMPLE_BRACKET;
03313       br_token.opr.sbcset = sbcset;
03314       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03315       if (BE (work_tree == NULL, 0))
03316        goto parse_bracket_exp_espace;
03317     }
03318   return work_tree;
03319 
03320  parse_bracket_exp_espace:
03321   *err = REG_ESPACE;
03322  parse_bracket_exp_free_return:
03323   re_free (sbcset);
03324 #ifdef RE_ENABLE_I18N
03325   free_charset (mbcset);
03326 #endif /* RE_ENABLE_I18N */
03327   return NULL;
03328 }
03329 
03330 /* Parse an element in the bracket expression.  */
03331 
03332 static reg_errcode_t
03333 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
03334                      re_token_t *token, int token_len, re_dfa_t *dfa,
03335                      reg_syntax_t syntax, bool accept_hyphen)
03336 {
03337 #ifdef RE_ENABLE_I18N
03338   int cur_char_size;
03339   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
03340   if (cur_char_size > 1)
03341     {
03342       elem->type = MB_CHAR;
03343       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
03344       re_string_skip_bytes (regexp, cur_char_size);
03345       return REG_NOERROR;
03346     }
03347 #endif /* RE_ENABLE_I18N */
03348   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
03349   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
03350       || token->type == OP_OPEN_EQUIV_CLASS)
03351     return parse_bracket_symbol (elem, regexp, token);
03352   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
03353     {
03354       /* A '-' must only appear as anything but a range indicator before
03355         the closing bracket.  Everything else is an error.  */
03356       re_token_t token2;
03357       (void) peek_token_bracket (&token2, regexp, syntax);
03358       if (token2.type != OP_CLOSE_BRACKET)
03359        /* The actual error value is not standardized since this whole
03360           case is undefined.  But ERANGE makes good sense.  */
03361        return REG_ERANGE;
03362     }
03363   elem->type = SB_CHAR;
03364   elem->opr.ch = token->opr.c;
03365   return REG_NOERROR;
03366 }
03367 
03368 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
03369    such as [:<character_class>:], [.<collating_element>.], and
03370    [=<equivalent_class>=].  */
03371 
03372 static reg_errcode_t
03373 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
03374                     re_token_t *token)
03375 {
03376   unsigned char ch, delim = token->opr.c;
03377   int i = 0;
03378   if (re_string_eoi(regexp))
03379     return REG_EBRACK;
03380   for (;; ++i)
03381     {
03382       if (i >= BRACKET_NAME_BUF_SIZE)
03383        return REG_EBRACK;
03384       if (token->type == OP_OPEN_CHAR_CLASS)
03385        ch = re_string_fetch_byte_case (regexp);
03386       else
03387        ch = re_string_fetch_byte (regexp);
03388       if (re_string_eoi(regexp))
03389        return REG_EBRACK;
03390       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
03391        break;
03392       elem->opr.name[i] = ch;
03393     }
03394   re_string_skip_bytes (regexp, 1);
03395   elem->opr.name[i] = '\0';
03396   switch (token->type)
03397     {
03398     case OP_OPEN_COLL_ELEM:
03399       elem->type = COLL_SYM;
03400       break;
03401     case OP_OPEN_EQUIV_CLASS:
03402       elem->type = EQUIV_CLASS;
03403       break;
03404     case OP_OPEN_CHAR_CLASS:
03405       elem->type = CHAR_CLASS;
03406       break;
03407     default:
03408       break;
03409     }
03410   return REG_NOERROR;
03411 }
03412 
03413   /* Helper function for parse_bracket_exp.
03414      Build the equivalence class which is represented by NAME.
03415      The result are written to MBCSET and SBCSET.
03416      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
03417      is a pointer argument sinse we may update it.  */
03418 
03419 static reg_errcode_t
03420 #ifdef RE_ENABLE_I18N
03421 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
03422                  Idx *equiv_class_alloc, const unsigned char *name)
03423 #else /* not RE_ENABLE_I18N */
03424 build_equiv_class (bitset_t sbcset, const unsigned char *name)
03425 #endif /* not RE_ENABLE_I18N */
03426 {
03427 #ifdef _LIBC
03428   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
03429   if (nrules != 0)
03430     {
03431       const int32_t *table, *indirect;
03432       const unsigned char *weights, *extra, *cp;
03433       unsigned char char_buf[2];
03434       int32_t idx1, idx2;
03435       unsigned int ch;
03436       size_t len;
03437       /* This #include defines a local function!  */
03438 # include <locale/weight.h>
03439       /* Calculate the index for equivalence class.  */
03440       cp = name;
03441       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
03442       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
03443                                           _NL_COLLATE_WEIGHTMB);
03444       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
03445                                              _NL_COLLATE_EXTRAMB);
03446       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
03447                                           _NL_COLLATE_INDIRECTMB);
03448       idx1 = findidx (&cp);
03449       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
03450        /* This isn't a valid character.  */
03451        return REG_ECOLLATE;
03452 
03453       /* Build single byte matcing table for this equivalence class.  */
03454       char_buf[1] = (unsigned char) '\0';
03455       len = weights[idx1 & 0xffffff];
03456       for (ch = 0; ch < SBC_MAX; ++ch)
03457        {
03458          char_buf[0] = ch;
03459          cp = char_buf;
03460          idx2 = findidx (&cp);
03461 /*
03462          idx2 = table[ch];
03463 */
03464          if (idx2 == 0)
03465            /* This isn't a valid character.  */
03466            continue;
03467          /* Compare only if the length matches and the collation rule
03468             index is the same.  */
03469          if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
03470            {
03471              int cnt = 0;
03472 
03473              while (cnt <= len &&
03474                    weights[(idx1 & 0xffffff) + 1 + cnt]
03475                    == weights[(idx2 & 0xffffff) + 1 + cnt])
03476               ++cnt;
03477 
03478              if (cnt > len)
03479               bitset_set (sbcset, ch);
03480            }
03481        }
03482       /* Check whether the array has enough space.  */
03483       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
03484        {
03485          /* Not enough, realloc it.  */
03486          /* +1 in case of mbcset->nequiv_classes is 0.  */
03487          Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
03488          /* Use realloc since the array is NULL if *alloc == 0.  */
03489          int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
03490                                              int32_t,
03491                                              new_equiv_class_alloc);
03492          if (BE (new_equiv_classes == NULL, 0))
03493            return REG_ESPACE;
03494          mbcset->equiv_classes = new_equiv_classes;
03495          *equiv_class_alloc = new_equiv_class_alloc;
03496        }
03497       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
03498     }
03499   else
03500 #endif /* _LIBC */
03501     {
03502       if (BE (strlen ((const char *) name) != 1, 0))
03503        return REG_ECOLLATE;
03504       bitset_set (sbcset, *name);
03505     }
03506   return REG_NOERROR;
03507 }
03508 
03509   /* Helper function for parse_bracket_exp.
03510      Build the character class which is represented by NAME.
03511      The result are written to MBCSET and SBCSET.
03512      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
03513      is a pointer argument sinse we may update it.  */
03514 
03515 static reg_errcode_t
03516 #ifdef RE_ENABLE_I18N
03517 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
03518                re_charset_t *mbcset, Idx *char_class_alloc,
03519                const unsigned char *class_name, reg_syntax_t syntax)
03520 #else /* not RE_ENABLE_I18N */
03521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
03522                const unsigned char *class_name, reg_syntax_t syntax)
03523 #endif /* not RE_ENABLE_I18N */
03524 {
03525   int i;
03526   const char *name = (const char *) class_name;
03527 
03528   /* In case of REG_ICASE "upper" and "lower" match the both of
03529      upper and lower cases.  */
03530   if ((syntax & RE_ICASE)
03531       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
03532     name = "alpha";
03533 
03534 #ifdef RE_ENABLE_I18N
03535   /* Check the space of the arrays.  */
03536   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
03537     {
03538       /* Not enough, realloc it.  */
03539       /* +1 in case of mbcset->nchar_classes is 0.  */
03540       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
03541       /* Use realloc since array is NULL if *alloc == 0.  */
03542       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
03543                                           new_char_class_alloc);
03544       if (BE (new_char_classes == NULL, 0))
03545        return REG_ESPACE;
03546       mbcset->char_classes = new_char_classes;
03547       *char_class_alloc = new_char_class_alloc;
03548     }
03549   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
03550 #endif /* RE_ENABLE_I18N */
03551 
03552 #define BUILD_CHARCLASS_LOOP(ctype_func)  \
03553   do {                                    \
03554     if (BE (trans != NULL, 0))                   \
03555       {                                          \
03556        for (i = 0; i < SBC_MAX; ++i)             \
03557          if (ctype_func (i))                     \
03558            bitset_set (sbcset, trans[i]); \
03559       }                                          \
03560     else                                  \
03561       {                                          \
03562        for (i = 0; i < SBC_MAX; ++i)             \
03563          if (ctype_func (i))                     \
03564            bitset_set (sbcset, i);        \
03565       }                                          \
03566   } while (0)
03567 
03568   if (strcmp (name, "alnum") == 0)
03569     BUILD_CHARCLASS_LOOP (isalnum);
03570   else if (strcmp (name, "cntrl") == 0)
03571     BUILD_CHARCLASS_LOOP (iscntrl);
03572   else if (strcmp (name, "lower") == 0)
03573     BUILD_CHARCLASS_LOOP (islower);
03574   else if (strcmp (name, "space") == 0)
03575     BUILD_CHARCLASS_LOOP (isspace);
03576   else if (strcmp (name, "alpha") == 0)
03577     BUILD_CHARCLASS_LOOP (isalpha);
03578   else if (strcmp (name, "digit") == 0)
03579     BUILD_CHARCLASS_LOOP (isdigit);
03580   else if (strcmp (name, "print") == 0)
03581     BUILD_CHARCLASS_LOOP (isprint);
03582   else if (strcmp (name, "upper") == 0)
03583     BUILD_CHARCLASS_LOOP (isupper);
03584   else if (strcmp (name, "blank") == 0)
03585     BUILD_CHARCLASS_LOOP (isblank);
03586   else if (strcmp (name, "graph") == 0)
03587     BUILD_CHARCLASS_LOOP (isgraph);
03588   else if (strcmp (name, "punct") == 0)
03589     BUILD_CHARCLASS_LOOP (ispunct);
03590   else if (strcmp (name, "xdigit") == 0)
03591     BUILD_CHARCLASS_LOOP (isxdigit);
03592   else
03593     return REG_ECTYPE;
03594 
03595   return REG_NOERROR;
03596 }
03597 
03598 static bin_tree_t *
03599 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
03600                   const unsigned char *class_name,
03601                   const unsigned char *extra, bool non_match,
03602                   reg_errcode_t *err)
03603 {
03604   re_bitset_ptr_t sbcset;
03605 #ifdef RE_ENABLE_I18N
03606   re_charset_t *mbcset;
03607   Idx alloc = 0;
03608 #endif /* not RE_ENABLE_I18N */
03609   reg_errcode_t ret;
03610   re_token_t br_token;
03611   bin_tree_t *tree;
03612 
03613   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
03614 #ifdef RE_ENABLE_I18N
03615   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
03616 #endif /* RE_ENABLE_I18N */
03617 
03618 #ifdef RE_ENABLE_I18N
03619   if (BE (sbcset == NULL || mbcset == NULL, 0))
03620 #else /* not RE_ENABLE_I18N */
03621   if (BE (sbcset == NULL, 0))
03622 #endif /* not RE_ENABLE_I18N */
03623     {
03624       *err = REG_ESPACE;
03625       return NULL;
03626     }
03627 
03628   if (non_match)
03629     {
03630 #ifdef RE_ENABLE_I18N
03631       mbcset->non_match = 1;
03632 #endif /* not RE_ENABLE_I18N */
03633     }
03634 
03635   /* We don't care the syntax in this case.  */
03636   ret = build_charclass (trans, sbcset,
03637 #ifdef RE_ENABLE_I18N
03638                       mbcset, &alloc,
03639 #endif /* RE_ENABLE_I18N */
03640                       class_name, 0);
03641 
03642   if (BE (ret != REG_NOERROR, 0))
03643     {
03644       re_free (sbcset);
03645 #ifdef RE_ENABLE_I18N
03646       free_charset (mbcset);
03647 #endif /* RE_ENABLE_I18N */
03648       *err = ret;
03649       return NULL;
03650     }
03651   /* \w match '_' also.  */
03652   for (; *extra; extra++)
03653     bitset_set (sbcset, *extra);
03654 
03655   /* If it is non-matching list.  */
03656   if (non_match)
03657     bitset_not (sbcset);
03658 
03659 #ifdef RE_ENABLE_I18N
03660   /* Ensure only single byte characters are set.  */
03661   if (dfa->mb_cur_max > 1)
03662     bitset_mask (sbcset, dfa->sb_char);
03663 #endif
03664 
03665   /* Build a tree for simple bracket.  */
03666   br_token.type = SIMPLE_BRACKET;
03667   br_token.opr.sbcset = sbcset;
03668   tree = create_token_tree (dfa, NULL, NULL, &br_token);
03669   if (BE (tree == NULL, 0))
03670     goto build_word_op_espace;
03671 
03672 #ifdef RE_ENABLE_I18N
03673   if (dfa->mb_cur_max > 1)
03674     {
03675       bin_tree_t *mbc_tree;
03676       /* Build a tree for complex bracket.  */
03677       br_token.type = COMPLEX_BRACKET;
03678       br_token.opr.mbcset = mbcset;
03679       dfa->has_mb_node = 1;
03680       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
03681       if (BE (mbc_tree == NULL, 0))
03682        goto build_word_op_espace;
03683       /* Then join them by ALT node.  */
03684       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
03685       if (BE (mbc_tree != NULL, 1))
03686        return tree;
03687     }
03688   else
03689     {
03690       free_charset (mbcset);
03691       return tree;
03692     }
03693 #else /* not RE_ENABLE_I18N */
03694   return tree;
03695 #endif /* not RE_ENABLE_I18N */
03696 
03697  build_word_op_espace:
03698   re_free (sbcset);
03699 #ifdef RE_ENABLE_I18N
03700   free_charset (mbcset);
03701 #endif /* RE_ENABLE_I18N */
03702   *err = REG_ESPACE;
03703   return NULL;
03704 }
03705 
03706 /* This is intended for the expressions like "a{1,3}".
03707    Fetch a number from `input', and return the number.
03708    Return REG_MISSING if the number field is empty like "{,1}".
03709    Return REG_ERROR if an error occurred.  */
03710 
03711 static Idx
03712 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
03713 {
03714   Idx num = REG_MISSING;
03715   unsigned char c;
03716   while (1)
03717     {
03718       fetch_token (token, input, syntax);
03719       c = token->opr.c;
03720       if (BE (token->type == END_OF_RE, 0))
03721        return REG_ERROR;
03722       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
03723        break;
03724       num = ((token->type != CHARACTER || c < '0' || '9' < c
03725              || num == REG_ERROR)
03726             ? REG_ERROR
03727             : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
03728       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
03729     }
03730   return num;
03731 }
03732 
03733 #ifdef RE_ENABLE_I18N
03734 static void
03735 free_charset (re_charset_t *cset)
03736 {
03737   re_free (cset->mbchars);
03738 # ifdef _LIBC
03739   re_free (cset->coll_syms);
03740   re_free (cset->equiv_classes);
03741   re_free (cset->range_starts);
03742   re_free (cset->range_ends);
03743 # endif
03744   re_free (cset->char_classes);
03745   re_free (cset);
03746 }
03747 #endif /* RE_ENABLE_I18N */
03748 
03749 /* Functions for binary tree operation.  */
03750 
03751 /* Create a tree node.  */
03752 
03753 static bin_tree_t *
03754 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
03755             re_token_type_t type)
03756 {
03757   re_token_t t;
03758   t.type = type;
03759   return create_token_tree (dfa, left, right, &t);
03760 }
03761 
03762 static bin_tree_t *
03763 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
03764                  const re_token_t *token)
03765 {
03766   bin_tree_t *tree;
03767   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
03768     {
03769       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
03770 
03771       if (storage == NULL)
03772        return NULL;
03773       storage->next = dfa->str_tree_storage;
03774       dfa->str_tree_storage = storage;
03775       dfa->str_tree_storage_idx = 0;
03776     }
03777   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
03778 
03779   tree->parent = NULL;
03780   tree->left = left;
03781   tree->right = right;
03782   tree->token = *token;
03783   tree->token.duplicated = 0;
03784   tree->token.opt_subexp = 0;
03785   tree->first = NULL;
03786   tree->next = NULL;
03787   tree->node_idx = REG_MISSING;
03788 
03789   if (left != NULL)
03790     left->parent = tree;
03791   if (right != NULL)
03792     right->parent = tree;
03793   return tree;
03794 }
03795 
03796 /* Mark the tree SRC as an optional subexpression.
03797    To be called from preorder or postorder.  */
03798 
03799 static reg_errcode_t
03800 mark_opt_subexp (void *extra, bin_tree_t *node)
03801 {
03802   Idx idx = (Idx) (long) extra;
03803   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
03804     node->token.opt_subexp = 1;
03805 
03806   return REG_NOERROR;
03807 }
03808 
03809 /* Free the allocated memory inside NODE. */
03810 
03811 static void
03812 free_token (re_token_t *node)
03813 {
03814 #ifdef RE_ENABLE_I18N
03815   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
03816     free_charset (node->opr.mbcset);
03817   else
03818 #endif /* RE_ENABLE_I18N */
03819     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
03820       re_free (node->opr.sbcset);
03821 }
03822 
03823 /* Worker function for tree walking.  Free the allocated memory inside NODE
03824    and its children. */
03825 
03826 static reg_errcode_t
03827 free_tree (void *extra, bin_tree_t *node)
03828 {
03829   free_token (&node->token);
03830   return REG_NOERROR;
03831 }
03832 
03833 
03834 /* Duplicate the node SRC, and return new node.  This is a preorder
03835    visit similar to the one implemented by the generic visitor, but
03836    we need more infrastructure to maintain two parallel trees --- so,
03837    it's easier to duplicate.  */
03838 
03839 static bin_tree_t *
03840 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
03841 {
03842   const bin_tree_t *node;
03843   bin_tree_t *dup_root;
03844   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
03845 
03846   for (node = root; ; )
03847     {
03848       /* Create a new tree and link it back to the current parent.  */
03849       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
03850       if (*p_new == NULL)
03851        return NULL;
03852       (*p_new)->parent = dup_node;
03853       (*p_new)->token.duplicated = 1;
03854       dup_node = *p_new;
03855 
03856       /* Go to the left node, or up and to the right.  */
03857       if (node->left)
03858        {
03859          node = node->left;
03860          p_new = &dup_node->left;
03861        }
03862       else
03863        {
03864          const bin_tree_t *prev = NULL;
03865          while (node->right == prev || node->right == NULL)
03866            {
03867              prev = node;
03868              node = node->parent;
03869              dup_node = dup_node->parent;
03870              if (!node)
03871               return dup_root;
03872            }
03873          node = node->right;
03874          p_new = &dup_node->right;
03875        }
03876     }
03877 }