Back to index

nagios-plugins  1.4.16
regex_internal.c
Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
00003    Software Foundation, Inc.
00004    This file is part of the GNU C Library.
00005    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00006 
00007    This program is free software; you can redistribute it and/or modify
00008    it under the terms of the GNU General Public License as published by
00009    the Free Software Foundation; either version 3, or (at your option)
00010    any later version.
00011 
00012    This program is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015    GNU General Public License for more details.
00016 
00017    You should have received a copy of the GNU General Public License along
00018    with this program; if not, write to the Free Software Foundation,
00019    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
00020 
00021 static void re_string_construct_common (const char *str, Idx len,
00022                                    re_string_t *pstr,
00023                                    RE_TRANSLATE_TYPE trans, bool icase,
00024                                    const re_dfa_t *dfa) internal_function;
00025 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
00026                                      const re_node_set *nodes,
00027                                      re_hashval_t hash) internal_function;
00028 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
00029                                      const re_node_set *nodes,
00030                                      unsigned int context,
00031                                      re_hashval_t hash) internal_function;
00032 
00033 /* Functions for string operation.  */
00034 
00035 /* This function allocate the buffers.  It is necessary to call
00036    re_string_reconstruct before using the object.  */
00037 
00038 static reg_errcode_t
00039 internal_function __attribute_warn_unused_result__
00040 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
00041                   RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
00042 {
00043   reg_errcode_t ret;
00044   Idx init_buf_len;
00045 
00046   /* Ensure at least one character fits into the buffers.  */
00047   if (init_len < dfa->mb_cur_max)
00048     init_len = dfa->mb_cur_max;
00049   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
00050   re_string_construct_common (str, len, pstr, trans, icase, dfa);
00051 
00052   ret = re_string_realloc_buffers (pstr, init_buf_len);
00053   if (BE (ret != REG_NOERROR, 0))
00054     return ret;
00055 
00056   pstr->word_char = dfa->word_char;
00057   pstr->word_ops_used = dfa->word_ops_used;
00058   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
00059   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
00060   pstr->valid_raw_len = pstr->valid_len;
00061   return REG_NOERROR;
00062 }
00063 
00064 /* This function allocate the buffers, and initialize them.  */
00065 
00066 static reg_errcode_t
00067 internal_function __attribute_warn_unused_result__
00068 re_string_construct (re_string_t *pstr, const char *str, Idx len,
00069                    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
00070 {
00071   reg_errcode_t ret;
00072   memset (pstr, '\0', sizeof (re_string_t));
00073   re_string_construct_common (str, len, pstr, trans, icase, dfa);
00074 
00075   if (len > 0)
00076     {
00077       ret = re_string_realloc_buffers (pstr, len + 1);
00078       if (BE (ret != REG_NOERROR, 0))
00079        return ret;
00080     }
00081   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
00082 
00083   if (icase)
00084     {
00085 #ifdef RE_ENABLE_I18N
00086       if (dfa->mb_cur_max > 1)
00087        {
00088          while (1)
00089            {
00090              ret = build_wcs_upper_buffer (pstr);
00091              if (BE (ret != REG_NOERROR, 0))
00092               return ret;
00093              if (pstr->valid_raw_len >= len)
00094               break;
00095              if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
00096               break;
00097              ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
00098              if (BE (ret != REG_NOERROR, 0))
00099               return ret;
00100            }
00101        }
00102       else
00103 #endif /* RE_ENABLE_I18N  */
00104        build_upper_buffer (pstr);
00105     }
00106   else
00107     {
00108 #ifdef RE_ENABLE_I18N
00109       if (dfa->mb_cur_max > 1)
00110        build_wcs_buffer (pstr);
00111       else
00112 #endif /* RE_ENABLE_I18N  */
00113        {
00114          if (trans != NULL)
00115            re_string_translate_buffer (pstr);
00116          else
00117            {
00118              pstr->valid_len = pstr->bufs_len;
00119              pstr->valid_raw_len = pstr->bufs_len;
00120            }
00121        }
00122     }
00123 
00124   return REG_NOERROR;
00125 }
00126 
00127 /* Helper functions for re_string_allocate, and re_string_construct.  */
00128 
00129 static reg_errcode_t
00130 internal_function __attribute_warn_unused_result__
00131 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
00132 {
00133 #ifdef RE_ENABLE_I18N
00134   if (pstr->mb_cur_max > 1)
00135     {
00136       wint_t *new_wcs;
00137 
00138       /* Avoid overflow.  */
00139       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
00140       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
00141        return REG_ESPACE;
00142 
00143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
00144       if (BE (new_wcs == NULL, 0))
00145        return REG_ESPACE;
00146       pstr->wcs = new_wcs;
00147       if (pstr->offsets != NULL)
00148        {
00149          Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
00150          if (BE (new_offsets == NULL, 0))
00151            return REG_ESPACE;
00152          pstr->offsets = new_offsets;
00153        }
00154     }
00155 #endif /* RE_ENABLE_I18N  */
00156   if (pstr->mbs_allocated)
00157     {
00158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
00159                                       new_buf_len);
00160       if (BE (new_mbs == NULL, 0))
00161        return REG_ESPACE;
00162       pstr->mbs = new_mbs;
00163     }
00164   pstr->bufs_len = new_buf_len;
00165   return REG_NOERROR;
00166 }
00167 
00168 
00169 static void
00170 internal_function
00171 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
00172                          RE_TRANSLATE_TYPE trans, bool icase,
00173                          const re_dfa_t *dfa)
00174 {
00175   pstr->raw_mbs = (const unsigned char *) str;
00176   pstr->len = len;
00177   pstr->raw_len = len;
00178   pstr->trans = trans;
00179   pstr->icase = icase;
00180   pstr->mbs_allocated = (trans != NULL || icase);
00181   pstr->mb_cur_max = dfa->mb_cur_max;
00182   pstr->is_utf8 = dfa->is_utf8;
00183   pstr->map_notascii = dfa->map_notascii;
00184   pstr->stop = pstr->len;
00185   pstr->raw_stop = pstr->stop;
00186 }
00187 
00188 #ifdef RE_ENABLE_I18N
00189 
00190 /* Build wide character buffer PSTR->WCS.
00191    If the byte sequence of the string are:
00192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
00193    Then wide character buffer will be:
00194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
00195    We use WEOF for padding, they indicate that the position isn't
00196    a first byte of a multibyte character.
00197 
00198    Note that this function assumes PSTR->VALID_LEN elements are already
00199    built and starts from PSTR->VALID_LEN.  */
00200 
00201 static void
00202 internal_function
00203 build_wcs_buffer (re_string_t *pstr)
00204 {
00205 #ifdef _LIBC
00206   unsigned char buf[MB_LEN_MAX];
00207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
00208 #else
00209   unsigned char buf[64];
00210 #endif
00211   mbstate_t prev_st;
00212   Idx byte_idx, end_idx, remain_len;
00213   size_t mbclen;
00214 
00215   /* Build the buffers from pstr->valid_len to either pstr->len or
00216      pstr->bufs_len.  */
00217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
00219     {
00220       wchar_t wc;
00221       const char *p;
00222 
00223       remain_len = end_idx - byte_idx;
00224       prev_st = pstr->cur_state;
00225       /* Apply the translation if we need.  */
00226       if (BE (pstr->trans != NULL, 0))
00227        {
00228          int i, ch;
00229 
00230          for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
00231            {
00232              ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
00233              buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
00234            }
00235          p = (const char *) buf;
00236        }
00237       else
00238        p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
00239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
00240       if (BE (mbclen == (size_t) -2, 0))
00241        {
00242          /* The buffer doesn't have enough space, finish to build.  */
00243          pstr->cur_state = prev_st;
00244          break;
00245        }
00246       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
00247        {
00248          /* We treat these cases as a singlebyte character.  */
00249          mbclen = 1;
00250          wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00251          if (BE (pstr->trans != NULL, 0))
00252            wc = pstr->trans[wc];
00253          pstr->cur_state = prev_st;
00254        }
00255 
00256       /* Write wide character and padding.  */
00257       pstr->wcs[byte_idx++] = wc;
00258       /* Write paddings.  */
00259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00260        pstr->wcs[byte_idx++] = WEOF;
00261     }
00262   pstr->valid_len = byte_idx;
00263   pstr->valid_raw_len = byte_idx;
00264 }
00265 
00266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
00267    but for REG_ICASE.  */
00268 
00269 static reg_errcode_t
00270 internal_function __attribute_warn_unused_result__
00271 build_wcs_upper_buffer (re_string_t *pstr)
00272 {
00273   mbstate_t prev_st;
00274   Idx src_idx, byte_idx, end_idx, remain_len;
00275   size_t mbclen;
00276 #ifdef _LIBC
00277   char buf[MB_LEN_MAX];
00278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
00279 #else
00280   char buf[64];
00281 #endif
00282 
00283   byte_idx = pstr->valid_len;
00284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00285 
00286   /* The following optimization assumes that ASCII characters can be
00287      mapped to wide characters with a simple cast.  */
00288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
00289     {
00290       while (byte_idx < end_idx)
00291        {
00292          wchar_t wc;
00293 
00294          if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
00295              && mbsinit (&pstr->cur_state))
00296            {
00297              /* In case of a singlebyte character.  */
00298              pstr->mbs[byte_idx]
00299               = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
00300              /* The next step uses the assumption that wchar_t is encoded
00301                ASCII-safe: all ASCII values can be converted like this.  */
00302              pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
00303              ++byte_idx;
00304              continue;
00305            }
00306 
00307          remain_len = end_idx - byte_idx;
00308          prev_st = pstr->cur_state;
00309          mbclen = __mbrtowc (&wc,
00310                            ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
00311                             + byte_idx), remain_len, &pstr->cur_state);
00312          if (BE (mbclen < (size_t) -2, 1))
00313            {
00314              wchar_t wcu = wc;
00315              if (iswlower (wc))
00316               {
00317                 size_t mbcdlen;
00318 
00319                 wcu = towupper (wc);
00320                 mbcdlen = wcrtomb (buf, wcu, &prev_st);
00321                 if (BE (mbclen == mbcdlen, 1))
00322                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
00323                 else
00324                   {
00325                     src_idx = byte_idx;
00326                     goto offsets_needed;
00327                   }
00328               }
00329              else
00330               memcpy (pstr->mbs + byte_idx,
00331                      pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
00332              pstr->wcs[byte_idx++] = wcu;
00333              /* Write paddings.  */
00334              for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00335               pstr->wcs[byte_idx++] = WEOF;
00336            }
00337          else if (mbclen == (size_t) -1 || mbclen == 0)
00338            {
00339              /* It is an invalid character or '\0'.  Just use the byte.  */
00340              int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00341              pstr->mbs[byte_idx] = ch;
00342              /* And also cast it to wide char.  */
00343              pstr->wcs[byte_idx++] = (wchar_t) ch;
00344              if (BE (mbclen == (size_t) -1, 0))
00345               pstr->cur_state = prev_st;
00346            }
00347          else
00348            {
00349              /* The buffer doesn't have enough space, finish to build.  */
00350              pstr->cur_state = prev_st;
00351              break;
00352            }
00353        }
00354       pstr->valid_len = byte_idx;
00355       pstr->valid_raw_len = byte_idx;
00356       return REG_NOERROR;
00357     }
00358   else
00359     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
00360       {
00361        wchar_t wc;
00362        const char *p;
00363       offsets_needed:
00364        remain_len = end_idx - byte_idx;
00365        prev_st = pstr->cur_state;
00366        if (BE (pstr->trans != NULL, 0))
00367          {
00368            int i, ch;
00369 
00370            for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
00371              {
00372               ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
00373               buf[i] = pstr->trans[ch];
00374              }
00375            p = (const char *) buf;
00376          }
00377        else
00378          p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
00379        mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
00380        if (BE (mbclen < (size_t) -2, 1))
00381          {
00382            wchar_t wcu = wc;
00383            if (iswlower (wc))
00384              {
00385               size_t mbcdlen;
00386 
00387               wcu = towupper (wc);
00388               mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
00389               if (BE (mbclen == mbcdlen, 1))
00390                 memcpy (pstr->mbs + byte_idx, buf, mbclen);
00391               else if (mbcdlen != (size_t) -1)
00392                 {
00393                   size_t i;
00394 
00395                   if (byte_idx + mbcdlen > pstr->bufs_len)
00396                     {
00397                      pstr->cur_state = prev_st;
00398                      break;
00399                     }
00400 
00401                   if (pstr->offsets == NULL)
00402                     {
00403                      pstr->offsets = re_malloc (Idx, pstr->bufs_len);
00404 
00405                      if (pstr->offsets == NULL)
00406                        return REG_ESPACE;
00407                     }
00408                   if (!pstr->offsets_needed)
00409                     {
00410                      for (i = 0; i < (size_t) byte_idx; ++i)
00411                        pstr->offsets[i] = i;
00412                      pstr->offsets_needed = 1;
00413                     }
00414 
00415                   memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
00416                   pstr->wcs[byte_idx] = wcu;
00417                   pstr->offsets[byte_idx] = src_idx;
00418                   for (i = 1; i < mbcdlen; ++i)
00419                     {
00420                      pstr->offsets[byte_idx + i]
00421                        = src_idx + (i < mbclen ? i : mbclen - 1);
00422                      pstr->wcs[byte_idx + i] = WEOF;
00423                     }
00424                   pstr->len += mbcdlen - mbclen;
00425                   if (pstr->raw_stop > src_idx)
00426                     pstr->stop += mbcdlen - mbclen;
00427                   end_idx = (pstr->bufs_len > pstr->len)
00428                            ? pstr->len : pstr->bufs_len;
00429                   byte_idx += mbcdlen;
00430                   src_idx += mbclen;
00431                   continue;
00432                 }
00433               else
00434                 memcpy (pstr->mbs + byte_idx, p, mbclen);
00435              }
00436            else
00437              memcpy (pstr->mbs + byte_idx, p, mbclen);
00438 
00439            if (BE (pstr->offsets_needed != 0, 0))
00440              {
00441               size_t i;
00442               for (i = 0; i < mbclen; ++i)
00443                 pstr->offsets[byte_idx + i] = src_idx + i;
00444              }
00445            src_idx += mbclen;
00446 
00447            pstr->wcs[byte_idx++] = wcu;
00448            /* Write paddings.  */
00449            for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00450              pstr->wcs[byte_idx++] = WEOF;
00451          }
00452        else if (mbclen == (size_t) -1 || mbclen == 0)
00453          {
00454            /* It is an invalid character or '\0'.  Just use the byte.  */
00455            int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
00456 
00457            if (BE (pstr->trans != NULL, 0))
00458              ch = pstr->trans [ch];
00459            pstr->mbs[byte_idx] = ch;
00460 
00461            if (BE (pstr->offsets_needed != 0, 0))
00462              pstr->offsets[byte_idx] = src_idx;
00463            ++src_idx;
00464 
00465            /* And also cast it to wide char.  */
00466            pstr->wcs[byte_idx++] = (wchar_t) ch;
00467            if (BE (mbclen == (size_t) -1, 0))
00468              pstr->cur_state = prev_st;
00469          }
00470        else
00471          {
00472            /* The buffer doesn't have enough space, finish to build.  */
00473            pstr->cur_state = prev_st;
00474            break;
00475          }
00476       }
00477   pstr->valid_len = byte_idx;
00478   pstr->valid_raw_len = src_idx;
00479   return REG_NOERROR;
00480 }
00481 
00482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
00483    Return the index.  */
00484 
00485 static Idx
00486 internal_function
00487 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
00488 {
00489   mbstate_t prev_st;
00490   Idx rawbuf_idx;
00491   size_t mbclen;
00492   wint_t wc = WEOF;
00493 
00494   /* Skip the characters which are not necessary to check.  */
00495   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
00496        rawbuf_idx < new_raw_idx;)
00497     {
00498       wchar_t wc2;
00499       Idx remain_len;
00500       remain_len = pstr->len - rawbuf_idx;
00501       prev_st = pstr->cur_state;
00502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
00503                        remain_len, &pstr->cur_state);
00504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
00505        {
00506          /* We treat these cases as a single byte character.  */
00507          if (mbclen == 0 || remain_len == 0)
00508            wc = L'\0';
00509          else
00510            wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
00511          mbclen = 1;
00512          pstr->cur_state = prev_st;
00513        }
00514       else
00515        wc = wc2;
00516       /* Then proceed the next character.  */
00517       rawbuf_idx += mbclen;
00518     }
00519   *last_wc = wc;
00520   return rawbuf_idx;
00521 }
00522 #endif /* RE_ENABLE_I18N  */
00523 
00524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
00525    This function is used in case of REG_ICASE.  */
00526 
00527 static void
00528 internal_function
00529 build_upper_buffer (re_string_t *pstr)
00530 {
00531   Idx char_idx, end_idx;
00532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00533 
00534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
00535     {
00536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
00537       if (BE (pstr->trans != NULL, 0))
00538        ch = pstr->trans[ch];
00539       if (islower (ch))
00540        pstr->mbs[char_idx] = toupper (ch);
00541       else
00542        pstr->mbs[char_idx] = ch;
00543     }
00544   pstr->valid_len = char_idx;
00545   pstr->valid_raw_len = char_idx;
00546 }
00547 
00548 /* Apply TRANS to the buffer in PSTR.  */
00549 
00550 static void
00551 internal_function
00552 re_string_translate_buffer (re_string_t *pstr)
00553 {
00554   Idx buf_idx, end_idx;
00555   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00556 
00557   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
00558     {
00559       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
00560       pstr->mbs[buf_idx] = pstr->trans[ch];
00561     }
00562 
00563   pstr->valid_len = buf_idx;
00564   pstr->valid_raw_len = buf_idx;
00565 }
00566 
00567 /* This function re-construct the buffers.
00568    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
00569    convert to upper case in case of REG_ICASE, apply translation.  */
00570 
00571 static reg_errcode_t
00572 internal_function __attribute_warn_unused_result__
00573 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
00574 {
00575   Idx offset;
00576 
00577   if (BE (pstr->raw_mbs_idx <= idx, 0))
00578     offset = idx - pstr->raw_mbs_idx;
00579   else
00580     {
00581       /* Reset buffer.  */
00582 #ifdef RE_ENABLE_I18N
00583       if (pstr->mb_cur_max > 1)
00584        memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
00585 #endif /* RE_ENABLE_I18N */
00586       pstr->len = pstr->raw_len;
00587       pstr->stop = pstr->raw_stop;
00588       pstr->valid_len = 0;
00589       pstr->raw_mbs_idx = 0;
00590       pstr->valid_raw_len = 0;
00591       pstr->offsets_needed = 0;
00592       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
00593                         : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
00594       if (!pstr->mbs_allocated)
00595        pstr->mbs = (unsigned char *) pstr->raw_mbs;
00596       offset = idx;
00597     }
00598 
00599   if (BE (offset != 0, 1))
00600     {
00601       /* Should the already checked characters be kept?  */
00602       if (BE (offset < pstr->valid_raw_len, 1))
00603        {
00604          /* Yes, move them to the front of the buffer.  */
00605 #ifdef RE_ENABLE_I18N
00606          if (BE (pstr->offsets_needed, 0))
00607            {
00608              Idx low = 0, high = pstr->valid_len, mid;
00609              do
00610               {
00611                 mid = (high + low) / 2;
00612                 if (pstr->offsets[mid] > offset)
00613                   high = mid;
00614                 else if (pstr->offsets[mid] < offset)
00615                   low = mid + 1;
00616                 else
00617                   break;
00618               }
00619              while (low < high);
00620              if (pstr->offsets[mid] < offset)
00621               ++mid;
00622              pstr->tip_context = re_string_context_at (pstr, mid - 1,
00623                                                  eflags);
00624              /* This can be quite complicated, so handle specially
00625                only the common and easy case where the character with
00626                different length representation of lower and upper
00627                case is present at or after offset.  */
00628              if (pstr->valid_len > offset
00629                 && mid == offset && pstr->offsets[mid] == offset)
00630               {
00631                 memmove (pstr->wcs, pstr->wcs + offset,
00632                         (pstr->valid_len - offset) * sizeof (wint_t));
00633                 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
00634                 pstr->valid_len -= offset;
00635                 pstr->valid_raw_len -= offset;
00636                 for (low = 0; low < pstr->valid_len; low++)
00637                   pstr->offsets[low] = pstr->offsets[low + offset] - offset;
00638               }
00639              else
00640               {
00641                 /* Otherwise, just find out how long the partial multibyte
00642                    character at offset is and fill it with WEOF/255.  */
00643                 pstr->len = pstr->raw_len - idx + offset;
00644                 pstr->stop = pstr->raw_stop - idx + offset;
00645                 pstr->offsets_needed = 0;
00646                 while (mid > 0 && pstr->offsets[mid - 1] == offset)
00647                   --mid;
00648                 while (mid < pstr->valid_len)
00649                   if (pstr->wcs[mid] != WEOF)
00650                     break;
00651                   else
00652                     ++mid;
00653                 if (mid == pstr->valid_len)
00654                   pstr->valid_len = 0;
00655                 else
00656                   {
00657                     pstr->valid_len = pstr->offsets[mid] - offset;
00658                     if (pstr->valid_len)
00659                      {
00660                        for (low = 0; low < pstr->valid_len; ++low)
00661                          pstr->wcs[low] = WEOF;
00662                        memset (pstr->mbs, 255, pstr->valid_len);
00663                      }
00664                   }
00665                 pstr->valid_raw_len = pstr->valid_len;
00666               }
00667            }
00668          else
00669 #endif
00670            {
00671              pstr->tip_context = re_string_context_at (pstr, offset - 1,
00672                                                  eflags);
00673 #ifdef RE_ENABLE_I18N
00674              if (pstr->mb_cur_max > 1)
00675               memmove (pstr->wcs, pstr->wcs + offset,
00676                       (pstr->valid_len - offset) * sizeof (wint_t));
00677 #endif /* RE_ENABLE_I18N */
00678              if (BE (pstr->mbs_allocated, 0))
00679               memmove (pstr->mbs, pstr->mbs + offset,
00680                       pstr->valid_len - offset);
00681              pstr->valid_len -= offset;
00682              pstr->valid_raw_len -= offset;
00683 #if DEBUG
00684              assert (pstr->valid_len > 0);
00685 #endif
00686            }
00687        }
00688       else
00689        {
00690 #ifdef RE_ENABLE_I18N
00691          /* No, skip all characters until IDX.  */
00692          Idx prev_valid_len = pstr->valid_len;
00693 
00694          if (BE (pstr->offsets_needed, 0))
00695            {
00696              pstr->len = pstr->raw_len - idx + offset;
00697              pstr->stop = pstr->raw_stop - idx + offset;
00698              pstr->offsets_needed = 0;
00699            }
00700 #endif
00701          pstr->valid_len = 0;
00702 #ifdef RE_ENABLE_I18N
00703          if (pstr->mb_cur_max > 1)
00704            {
00705              Idx wcs_idx;
00706              wint_t wc = WEOF;
00707 
00708              if (pstr->is_utf8)
00709               {
00710                 const unsigned char *raw, *p, *end;
00711 
00712                 /* Special case UTF-8.  Multi-byte chars start with any
00713                    byte other than 0x80 - 0xbf.  */
00714                 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
00715                 end = raw + (offset - pstr->mb_cur_max);
00716                 if (end < pstr->raw_mbs)
00717                   end = pstr->raw_mbs;
00718                 p = raw + offset - 1;
00719 #ifdef _LIBC
00720                 /* We know the wchar_t encoding is UCS4, so for the simple
00721                    case, ASCII characters, skip the conversion step.  */
00722                 if (isascii (*p) && BE (pstr->trans == NULL, 1))
00723                   {
00724                     memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
00725                     /* pstr->valid_len = 0; */
00726                     wc = (wchar_t) *p;
00727                   }
00728                 else
00729 #endif
00730                   for (; p >= end; --p)
00731                     if ((*p & 0xc0) != 0x80)
00732                      {
00733                        mbstate_t cur_state;
00734                        wchar_t wc2;
00735                        Idx mlen = raw + pstr->len - p;
00736                        size_t mbclen;
00737 
00738 #if 0 /* dead code: buf is set but never used */
00739                        unsigned char buf[6];
00740                        if (BE (pstr->trans != NULL, 0))
00741                          {
00742                            int i = mlen < 6 ? mlen : 6;
00743                            while (--i >= 0)
00744                             buf[i] = pstr->trans[p[i]];
00745                          }
00746 #endif
00747                        /* XXX Don't use mbrtowc, we know which conversion
00748                           to use (UTF-8 -> UCS4).  */
00749                        memset (&cur_state, 0, sizeof (cur_state));
00750                        mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
00751                                          &cur_state);
00752                        if (raw + offset - p <= mbclen
00753                            && mbclen < (size_t) -2)
00754                          {
00755                            memset (&pstr->cur_state, '\0',
00756                                   sizeof (mbstate_t));
00757                            pstr->valid_len = mbclen - (raw + offset - p);
00758                            wc = wc2;
00759                          }
00760                        break;
00761                      }
00762               }
00763 
00764              if (wc == WEOF)
00765               pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
00766              if (wc == WEOF)
00767               pstr->tip_context
00768                 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
00769              else
00770               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
00771                                   && IS_WIDE_WORD_CHAR (wc))
00772                                  ? CONTEXT_WORD
00773                                  : ((IS_WIDE_NEWLINE (wc)
00774                                     && pstr->newline_anchor)
00775                                    ? CONTEXT_NEWLINE : 0));
00776              if (BE (pstr->valid_len, 0))
00777               {
00778                 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
00779                   pstr->wcs[wcs_idx] = WEOF;
00780                 if (pstr->mbs_allocated)
00781                   memset (pstr->mbs, 255, pstr->valid_len);
00782               }
00783              pstr->valid_raw_len = pstr->valid_len;
00784            }
00785          else
00786 #endif /* RE_ENABLE_I18N */
00787            {
00788              int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
00789              pstr->valid_raw_len = 0;
00790              if (pstr->trans)
00791               c = pstr->trans[c];
00792              pstr->tip_context = (bitset_contain (pstr->word_char, c)
00793                                ? CONTEXT_WORD
00794                                : ((IS_NEWLINE (c) && pstr->newline_anchor)
00795                                   ? CONTEXT_NEWLINE : 0));
00796            }
00797        }
00798       if (!BE (pstr->mbs_allocated, 0))
00799        pstr->mbs += offset;
00800     }
00801   pstr->raw_mbs_idx = idx;
00802   pstr->len -= offset;
00803   pstr->stop -= offset;
00804 
00805   /* Then build the buffers.  */
00806 #ifdef RE_ENABLE_I18N
00807   if (pstr->mb_cur_max > 1)
00808     {
00809       if (pstr->icase)
00810        {
00811          reg_errcode_t ret = build_wcs_upper_buffer (pstr);
00812          if (BE (ret != REG_NOERROR, 0))
00813            return ret;
00814        }
00815       else
00816        build_wcs_buffer (pstr);
00817     }
00818   else
00819 #endif /* RE_ENABLE_I18N */
00820     if (BE (pstr->mbs_allocated, 0))
00821       {
00822        if (pstr->icase)
00823          build_upper_buffer (pstr);
00824        else if (pstr->trans != NULL)
00825          re_string_translate_buffer (pstr);
00826       }
00827     else
00828       pstr->valid_len = pstr->len;
00829 
00830   pstr->cur_idx = 0;
00831   return REG_NOERROR;
00832 }
00833 
00834 static unsigned char
00835 internal_function __attribute ((pure))
00836 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
00837 {
00838   int ch;
00839   Idx off;
00840 
00841   /* Handle the common (easiest) cases first.  */
00842   if (BE (!pstr->mbs_allocated, 1))
00843     return re_string_peek_byte (pstr, idx);
00844 
00845 #ifdef RE_ENABLE_I18N
00846   if (pstr->mb_cur_max > 1
00847       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
00848     return re_string_peek_byte (pstr, idx);
00849 #endif
00850 
00851   off = pstr->cur_idx + idx;
00852 #ifdef RE_ENABLE_I18N
00853   if (pstr->offsets_needed)
00854     off = pstr->offsets[off];
00855 #endif
00856 
00857   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
00858 
00859 #ifdef RE_ENABLE_I18N
00860   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
00861      this function returns CAPITAL LETTER I instead of first byte of
00862      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
00863      since peek_byte_case doesn't advance cur_idx in any way.  */
00864   if (pstr->offsets_needed && !isascii (ch))
00865     return re_string_peek_byte (pstr, idx);
00866 #endif
00867 
00868   return ch;
00869 }
00870 
00871 static unsigned char
00872 internal_function __attribute ((pure))
00873 re_string_fetch_byte_case (re_string_t *pstr)
00874 {
00875   if (BE (!pstr->mbs_allocated, 1))
00876     return re_string_fetch_byte (pstr);
00877 
00878 #ifdef RE_ENABLE_I18N
00879   if (pstr->offsets_needed)
00880     {
00881       Idx off;
00882       int ch;
00883 
00884       /* For tr_TR.UTF-8 [[:islower:]] there is
00885         [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
00886         in that case the whole multi-byte character and return
00887         the original letter.  On the other side, with
00888         [[: DOTLESS SMALL LETTER I return [[:I, as doing
00889         anything else would complicate things too much.  */
00890 
00891       if (!re_string_first_byte (pstr, pstr->cur_idx))
00892        return re_string_fetch_byte (pstr);
00893 
00894       off = pstr->offsets[pstr->cur_idx];
00895       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
00896 
00897       if (! isascii (ch))
00898        return re_string_fetch_byte (pstr);
00899 
00900       re_string_skip_bytes (pstr,
00901                          re_string_char_size_at (pstr, pstr->cur_idx));
00902       return ch;
00903     }
00904 #endif
00905 
00906   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
00907 }
00908 
00909 static void
00910 internal_function
00911 re_string_destruct (re_string_t *pstr)
00912 {
00913 #ifdef RE_ENABLE_I18N
00914   re_free (pstr->wcs);
00915   re_free (pstr->offsets);
00916 #endif /* RE_ENABLE_I18N  */
00917   if (pstr->mbs_allocated)
00918     re_free (pstr->mbs);
00919 }
00920 
00921 /* Return the context at IDX in INPUT.  */
00922 
00923 static unsigned int
00924 internal_function
00925 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
00926 {
00927   int c;
00928   if (BE (! REG_VALID_INDEX (idx), 0))
00929     /* In this case, we use the value stored in input->tip_context,
00930        since we can't know the character in input->mbs[-1] here.  */
00931     return input->tip_context;
00932   if (BE (idx == input->len, 0))
00933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
00934            : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
00935 #ifdef RE_ENABLE_I18N
00936   if (input->mb_cur_max > 1)
00937     {
00938       wint_t wc;
00939       Idx wc_idx = idx;
00940       while(input->wcs[wc_idx] == WEOF)
00941        {
00942 #ifdef DEBUG
00943          /* It must not happen.  */
00944          assert (REG_VALID_INDEX (wc_idx));
00945 #endif
00946          --wc_idx;
00947          if (! REG_VALID_INDEX (wc_idx))
00948            return input->tip_context;
00949        }
00950       wc = input->wcs[wc_idx];
00951       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
00952        return CONTEXT_WORD;
00953       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
00954              ? CONTEXT_NEWLINE : 0);
00955     }
00956   else
00957 #endif
00958     {
00959       c = re_string_byte_at (input, idx);
00960       if (bitset_contain (input->word_char, c))
00961        return CONTEXT_WORD;
00962       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
00963     }
00964 }
00965 
00966 /* Functions for set operation.  */
00967 
00968 static reg_errcode_t
00969 internal_function __attribute_warn_unused_result__
00970 re_node_set_alloc (re_node_set *set, Idx size)
00971 {
00972   set->alloc = size;
00973   set->nelem = 0;
00974   set->elems = re_malloc (Idx, size);
00975   if (BE (set->elems == NULL, 0))
00976     return REG_ESPACE;
00977   return REG_NOERROR;
00978 }
00979 
00980 static reg_errcode_t
00981 internal_function __attribute_warn_unused_result__
00982 re_node_set_init_1 (re_node_set *set, Idx elem)
00983 {
00984   set->alloc = 1;
00985   set->nelem = 1;
00986   set->elems = re_malloc (Idx, 1);
00987   if (BE (set->elems == NULL, 0))
00988     {
00989       set->alloc = set->nelem = 0;
00990       return REG_ESPACE;
00991     }
00992   set->elems[0] = elem;
00993   return REG_NOERROR;
00994 }
00995 
00996 static reg_errcode_t
00997 internal_function __attribute_warn_unused_result__
00998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
00999 {
01000   set->alloc = 2;
01001   set->elems = re_malloc (Idx, 2);
01002   if (BE (set->elems == NULL, 0))
01003     return REG_ESPACE;
01004   if (elem1 == elem2)
01005     {
01006       set->nelem = 1;
01007       set->elems[0] = elem1;
01008     }
01009   else
01010     {
01011       set->nelem = 2;
01012       if (elem1 < elem2)
01013        {
01014          set->elems[0] = elem1;
01015          set->elems[1] = elem2;
01016        }
01017       else
01018        {
01019          set->elems[0] = elem2;
01020          set->elems[1] = elem1;
01021        }
01022     }
01023   return REG_NOERROR;
01024 }
01025 
01026 static reg_errcode_t
01027 internal_function __attribute_warn_unused_result__
01028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
01029 {
01030   dest->nelem = src->nelem;
01031   if (src->nelem > 0)
01032     {
01033       dest->alloc = dest->nelem;
01034       dest->elems = re_malloc (Idx, dest->alloc);
01035       if (BE (dest->elems == NULL, 0))
01036        {
01037          dest->alloc = dest->nelem = 0;
01038          return REG_ESPACE;
01039        }
01040       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
01041     }
01042   else
01043     re_node_set_init_empty (dest);
01044   return REG_NOERROR;
01045 }
01046 
01047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
01048    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
01049    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
01050 
01051 static reg_errcode_t
01052 internal_function __attribute_warn_unused_result__
01053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
01054                         const re_node_set *src2)
01055 {
01056   Idx i1, i2, is, id, delta, sbase;
01057   if (src1->nelem == 0 || src2->nelem == 0)
01058     return REG_NOERROR;
01059 
01060   /* We need dest->nelem + 2 * elems_in_intersection; this is a
01061      conservative estimate.  */
01062   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
01063     {
01064       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
01065       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
01066       if (BE (new_elems == NULL, 0))
01067        return REG_ESPACE;
01068       dest->elems = new_elems;
01069       dest->alloc = new_alloc;
01070     }
01071 
01072   /* Find the items in the intersection of SRC1 and SRC2, and copy
01073      into the top of DEST those that are not already in DEST itself.  */
01074   sbase = dest->nelem + src1->nelem + src2->nelem;
01075   i1 = src1->nelem - 1;
01076   i2 = src2->nelem - 1;
01077   id = dest->nelem - 1;
01078   for (;;)
01079     {
01080       if (src1->elems[i1] == src2->elems[i2])
01081        {
01082          /* Try to find the item in DEST.  Maybe we could binary search?  */
01083          while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
01084            --id;
01085 
01086           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
01087             dest->elems[--sbase] = src1->elems[i1];
01088 
01089          if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
01090            break;
01091        }
01092 
01093       /* Lower the highest of the two items.  */
01094       else if (src1->elems[i1] < src2->elems[i2])
01095        {
01096          if (! REG_VALID_INDEX (--i2))
01097            break;
01098        }
01099       else
01100        {
01101          if (! REG_VALID_INDEX (--i1))
01102            break;
01103        }
01104     }
01105 
01106   id = dest->nelem - 1;
01107   is = dest->nelem + src1->nelem + src2->nelem - 1;
01108   delta = is - sbase + 1;
01109 
01110   /* Now copy.  When DELTA becomes zero, the remaining
01111      DEST elements are already in place; this is more or
01112      less the same loop that is in re_node_set_merge.  */
01113   dest->nelem += delta;
01114   if (delta > 0 && REG_VALID_INDEX (id))
01115     for (;;)
01116       {
01117        if (dest->elems[is] > dest->elems[id])
01118          {
01119            /* Copy from the top.  */
01120            dest->elems[id + delta--] = dest->elems[is--];
01121            if (delta == 0)
01122              break;
01123          }
01124        else
01125          {
01126            /* Slide from the bottom.  */
01127            dest->elems[id + delta] = dest->elems[id];
01128            if (! REG_VALID_INDEX (--id))
01129              break;
01130          }
01131       }
01132 
01133   /* Copy remaining SRC elements.  */
01134   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
01135 
01136   return REG_NOERROR;
01137 }
01138 
01139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
01140    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
01141 
01142 static reg_errcode_t
01143 internal_function __attribute_warn_unused_result__
01144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
01145                      const re_node_set *src2)
01146 {
01147   Idx i1, i2, id;
01148   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
01149     {
01150       dest->alloc = src1->nelem + src2->nelem;
01151       dest->elems = re_malloc (Idx, dest->alloc);
01152       if (BE (dest->elems == NULL, 0))
01153        return REG_ESPACE;
01154     }
01155   else
01156     {
01157       if (src1 != NULL && src1->nelem > 0)
01158        return re_node_set_init_copy (dest, src1);
01159       else if (src2 != NULL && src2->nelem > 0)
01160        return re_node_set_init_copy (dest, src2);
01161       else
01162        re_node_set_init_empty (dest);
01163       return REG_NOERROR;
01164     }
01165   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
01166     {
01167       if (src1->elems[i1] > src2->elems[i2])
01168        {
01169          dest->elems[id++] = src2->elems[i2++];
01170          continue;
01171        }
01172       if (src1->elems[i1] == src2->elems[i2])
01173        ++i2;
01174       dest->elems[id++] = src1->elems[i1++];
01175     }
01176   if (i1 < src1->nelem)
01177     {
01178       memcpy (dest->elems + id, src1->elems + i1,
01179             (src1->nelem - i1) * sizeof (Idx));
01180       id += src1->nelem - i1;
01181     }
01182   else if (i2 < src2->nelem)
01183     {
01184       memcpy (dest->elems + id, src2->elems + i2,
01185             (src2->nelem - i2) * sizeof (Idx));
01186       id += src2->nelem - i2;
01187     }
01188   dest->nelem = id;
01189   return REG_NOERROR;
01190 }
01191 
01192 /* Calculate the union set of the sets DEST and SRC. And store it to
01193    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
01194 
01195 static reg_errcode_t
01196 internal_function __attribute_warn_unused_result__
01197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
01198 {
01199   Idx is, id, sbase, delta;
01200   if (src == NULL || src->nelem == 0)
01201     return REG_NOERROR;
01202   if (dest->alloc < 2 * src->nelem + dest->nelem)
01203     {
01204       Idx new_alloc = 2 * (src->nelem + dest->alloc);
01205       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
01206       if (BE (new_buffer == NULL, 0))
01207        return REG_ESPACE;
01208       dest->elems = new_buffer;
01209       dest->alloc = new_alloc;
01210     }
01211 
01212   if (BE (dest->nelem == 0, 0))
01213     {
01214       dest->nelem = src->nelem;
01215       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
01216       return REG_NOERROR;
01217     }
01218 
01219   /* Copy into the top of DEST the items of SRC that are not
01220      found in DEST.  Maybe we could binary search in DEST?  */
01221   for (sbase = dest->nelem + 2 * src->nelem,
01222        is = src->nelem - 1, id = dest->nelem - 1;
01223        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
01224     {
01225       if (dest->elems[id] == src->elems[is])
01226        is--, id--;
01227       else if (dest->elems[id] < src->elems[is])
01228        dest->elems[--sbase] = src->elems[is--];
01229       else /* if (dest->elems[id] > src->elems[is]) */
01230        --id;
01231     }
01232 
01233   if (REG_VALID_INDEX (is))
01234     {
01235       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
01236       sbase -= is + 1;
01237       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
01238     }
01239 
01240   id = dest->nelem - 1;
01241   is = dest->nelem + 2 * src->nelem - 1;
01242   delta = is - sbase + 1;
01243   if (delta == 0)
01244     return REG_NOERROR;
01245 
01246   /* Now copy.  When DELTA becomes zero, the remaining
01247      DEST elements are already in place.  */
01248   dest->nelem += delta;
01249   for (;;)
01250     {
01251       if (dest->elems[is] > dest->elems[id])
01252        {
01253          /* Copy from the top.  */
01254          dest->elems[id + delta--] = dest->elems[is--];
01255          if (delta == 0)
01256            break;
01257        }
01258       else
01259        {
01260          /* Slide from the bottom.  */
01261          dest->elems[id + delta] = dest->elems[id];
01262          if (! REG_VALID_INDEX (--id))
01263            {
01264              /* Copy remaining SRC elements.  */
01265              memcpy (dest->elems, dest->elems + sbase,
01266                     delta * sizeof (Idx));
01267              break;
01268            }
01269        }
01270     }
01271 
01272   return REG_NOERROR;
01273 }
01274 
01275 /* Insert the new element ELEM to the re_node_set* SET.
01276    SET should not already have ELEM.
01277    Return true if successful.  */
01278 
01279 static bool
01280 internal_function __attribute_warn_unused_result__
01281 re_node_set_insert (re_node_set *set, Idx elem)
01282 {
01283   Idx idx;
01284   /* In case the set is empty.  */
01285   if (set->alloc == 0)
01286     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
01287 
01288   if (BE (set->nelem, 0) == 0)
01289     {
01290       /* We already guaranteed above that set->alloc != 0.  */
01291       set->elems[0] = elem;
01292       ++set->nelem;
01293       return true;
01294     }
01295 
01296   /* Realloc if we need.  */
01297   if (set->alloc == set->nelem)
01298     {
01299       Idx *new_elems;
01300       set->alloc = set->alloc * 2;
01301       new_elems = re_realloc (set->elems, Idx, set->alloc);
01302       if (BE (new_elems == NULL, 0))
01303        return false;
01304       set->elems = new_elems;
01305     }
01306 
01307   /* Move the elements which follows the new element.  Test the
01308      first element separately to skip a check in the inner loop.  */
01309   if (elem < set->elems[0])
01310     {
01311       idx = 0;
01312       for (idx = set->nelem; idx > 0; idx--)
01313        set->elems[idx] = set->elems[idx - 1];
01314     }
01315   else
01316     {
01317       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
01318        set->elems[idx] = set->elems[idx - 1];
01319     }
01320 
01321   /* Insert the new element.  */
01322   set->elems[idx] = elem;
01323   ++set->nelem;
01324   return true;
01325 }
01326 
01327 /* Insert the new element ELEM to the re_node_set* SET.
01328    SET should not already have any element greater than or equal to ELEM.
01329    Return true if successful.  */
01330 
01331 static bool
01332 internal_function __attribute_warn_unused_result__
01333 re_node_set_insert_last (re_node_set *set, Idx elem)
01334 {
01335   /* Realloc if we need.  */
01336   if (set->alloc == set->nelem)
01337     {
01338       Idx *new_elems;
01339       set->alloc = (set->alloc + 1) * 2;
01340       new_elems = re_realloc (set->elems, Idx, set->alloc);
01341       if (BE (new_elems == NULL, 0))
01342        return false;
01343       set->elems = new_elems;
01344     }
01345 
01346   /* Insert the new element.  */
01347   set->elems[set->nelem++] = elem;
01348   return true;
01349 }
01350 
01351 /* Compare two node sets SET1 and SET2.
01352    Return true if SET1 and SET2 are equivalent.  */
01353 
01354 static bool
01355 internal_function __attribute ((pure))
01356 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
01357 {
01358   Idx i;
01359   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
01360     return false;
01361   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
01362     if (set1->elems[i] != set2->elems[i])
01363       return false;
01364   return true;
01365 }
01366 
01367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
01368 
01369 static Idx
01370 internal_function __attribute ((pure))
01371 re_node_set_contains (const re_node_set *set, Idx elem)
01372 {
01373   __re_size_t idx, right, mid;
01374   if (! REG_VALID_NONZERO_INDEX (set->nelem))
01375     return 0;
01376 
01377   /* Binary search the element.  */
01378   idx = 0;
01379   right = set->nelem - 1;
01380   while (idx < right)
01381     {
01382       mid = (idx + right) / 2;
01383       if (set->elems[mid] < elem)
01384        idx = mid + 1;
01385       else
01386        right = mid;
01387     }
01388   return set->elems[idx] == elem ? idx + 1 : 0;
01389 }
01390 
01391 static void
01392 internal_function
01393 re_node_set_remove_at (re_node_set *set, Idx idx)
01394 {
01395   if (idx < 0 || idx >= set->nelem)
01396     return;
01397   --set->nelem;
01398   for (; idx < set->nelem; idx++)
01399     set->elems[idx] = set->elems[idx + 1];
01400 }
01401 
01402 
01403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
01404    Or return REG_MISSING if an error occurred.  */
01405 
01406 static Idx
01407 internal_function
01408 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
01409 {
01410   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
01411     {
01412       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
01413       Idx *new_nexts, *new_indices;
01414       re_node_set *new_edests, *new_eclosures;
01415       re_token_t *new_nodes;
01416       size_t max_object_size =
01417        MAX (sizeof (re_token_t),
01418             MAX (sizeof (re_node_set),
01419                 sizeof (Idx)));
01420 
01421       /* Avoid overflows.  */
01422       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
01423        return REG_MISSING;
01424 
01425       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
01426       if (BE (new_nodes == NULL, 0))
01427        return REG_MISSING;
01428       dfa->nodes = new_nodes;
01429       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
01430       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
01431       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
01432       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
01433       if (BE (new_nexts == NULL || new_indices == NULL
01434              || new_edests == NULL || new_eclosures == NULL, 0))
01435        return REG_MISSING;
01436       dfa->nexts = new_nexts;
01437       dfa->org_indices = new_indices;
01438       dfa->edests = new_edests;
01439       dfa->eclosures = new_eclosures;
01440       dfa->nodes_alloc = new_nodes_alloc;
01441     }
01442   dfa->nodes[dfa->nodes_len] = token;
01443   dfa->nodes[dfa->nodes_len].constraint = 0;
01444 #ifdef RE_ENABLE_I18N
01445   {
01446   int type = token.type;
01447   dfa->nodes[dfa->nodes_len].accept_mb =
01448     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
01449   }
01450 #endif
01451   dfa->nexts[dfa->nodes_len] = REG_MISSING;
01452   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
01453   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
01454   return dfa->nodes_len++;
01455 }
01456 
01457 static inline re_hashval_t
01458 internal_function
01459 calc_state_hash (const re_node_set *nodes, unsigned int context)
01460 {
01461   re_hashval_t hash = nodes->nelem + context;
01462   Idx i;
01463   for (i = 0 ; i < nodes->nelem ; i++)
01464     hash += nodes->elems[i];
01465   return hash;
01466 }
01467 
01468 /* Search for the state whose node_set is equivalent to NODES.
01469    Return the pointer to the state, if we found it in the DFA.
01470    Otherwise create the new one and return it.  In case of an error
01471    return NULL and set the error code in ERR.
01472    Note: - We assume NULL as the invalid state, then it is possible that
01473           return value is NULL and ERR is REG_NOERROR.
01474         - We never return non-NULL value in case of any errors, it is for
01475           optimization.  */
01476 
01477 static re_dfastate_t *
01478 internal_function __attribute_warn_unused_result__
01479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
01480                 const re_node_set *nodes)
01481 {
01482   re_hashval_t hash;
01483   re_dfastate_t *new_state;
01484   struct re_state_table_entry *spot;
01485   Idx i;
01486 #ifdef lint
01487   /* Suppress bogus uninitialized-variable warnings.  */
01488   *err = REG_NOERROR;
01489 #endif
01490   if (BE (nodes->nelem == 0, 0))
01491     {
01492       *err = REG_NOERROR;
01493       return NULL;
01494     }
01495   hash = calc_state_hash (nodes, 0);
01496   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01497 
01498   for (i = 0 ; i < spot->num ; i++)
01499     {
01500       re_dfastate_t *state = spot->array[i];
01501       if (hash != state->hash)
01502        continue;
01503       if (re_node_set_compare (&state->nodes, nodes))
01504        return state;
01505     }
01506 
01507   /* There are no appropriate state in the dfa, create the new one.  */
01508   new_state = create_ci_newstate (dfa, nodes, hash);
01509   if (BE (new_state == NULL, 0))
01510     *err = REG_ESPACE;
01511 
01512   return new_state;
01513 }
01514 
01515 /* Search for the state whose node_set is equivalent to NODES and
01516    whose context is equivalent to CONTEXT.
01517    Return the pointer to the state, if we found it in the DFA.
01518    Otherwise create the new one and return it.  In case of an error
01519    return NULL and set the error code in ERR.
01520    Note: - We assume NULL as the invalid state, then it is possible that
01521           return value is NULL and ERR is REG_NOERROR.
01522         - We never return non-NULL value in case of any errors, it is for
01523           optimization.  */
01524 
01525 static re_dfastate_t *
01526 internal_function __attribute_warn_unused_result__
01527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
01528                        const re_node_set *nodes, unsigned int context)
01529 {
01530   re_hashval_t hash;
01531   re_dfastate_t *new_state;
01532   struct re_state_table_entry *spot;
01533   Idx i;
01534 #ifdef lint
01535   /* Suppress bogus uninitialized-variable warnings.  */
01536   *err = REG_NOERROR;
01537 #endif
01538   if (nodes->nelem == 0)
01539     {
01540       *err = REG_NOERROR;
01541       return NULL;
01542     }
01543   hash = calc_state_hash (nodes, context);
01544   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01545 
01546   for (i = 0 ; i < spot->num ; i++)
01547     {
01548       re_dfastate_t *state = spot->array[i];
01549       if (state->hash == hash
01550          && state->context == context
01551          && re_node_set_compare (state->entrance_nodes, nodes))
01552        return state;
01553     }
01554   /* There are no appropriate state in `dfa', create the new one.  */
01555   new_state = create_cd_newstate (dfa, nodes, context, hash);
01556   if (BE (new_state == NULL, 0))
01557     *err = REG_ESPACE;
01558 
01559   return new_state;
01560 }
01561 
01562 /* Finish initialization of the new state NEWSTATE, and using its hash value
01563    HASH put in the appropriate bucket of DFA's state table.  Return value
01564    indicates the error code if failed.  */
01565 
01566 static reg_errcode_t
01567 __attribute_warn_unused_result__
01568 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
01569               re_hashval_t hash)
01570 {
01571   struct re_state_table_entry *spot;
01572   reg_errcode_t err;
01573   Idx i;
01574 
01575   newstate->hash = hash;
01576   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
01577   if (BE (err != REG_NOERROR, 0))
01578     return REG_ESPACE;
01579   for (i = 0; i < newstate->nodes.nelem; i++)
01580     {
01581       Idx elem = newstate->nodes.elems[i];
01582       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
01583        if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
01584          return REG_ESPACE;
01585     }
01586 
01587   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01588   if (BE (spot->alloc <= spot->num, 0))
01589     {
01590       Idx new_alloc = 2 * spot->num + 2;
01591       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
01592                                          new_alloc);
01593       if (BE (new_array == NULL, 0))
01594        return REG_ESPACE;
01595       spot->array = new_array;
01596       spot->alloc = new_alloc;
01597     }
01598   spot->array[spot->num++] = newstate;
01599   return REG_NOERROR;
01600 }
01601 
01602 static void
01603 free_state (re_dfastate_t *state)
01604 {
01605   re_node_set_free (&state->non_eps_nodes);
01606   re_node_set_free (&state->inveclosure);
01607   if (state->entrance_nodes != &state->nodes)
01608     {
01609       re_node_set_free (state->entrance_nodes);
01610       re_free (state->entrance_nodes);
01611     }
01612   re_node_set_free (&state->nodes);
01613   re_free (state->word_trtable);
01614   re_free (state->trtable);
01615   re_free (state);
01616 }
01617 
01618 /* Create the new state which is independ of contexts.
01619    Return the new state if succeeded, otherwise return NULL.  */
01620 
01621 static re_dfastate_t *
01622 internal_function __attribute_warn_unused_result__
01623 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
01624                   re_hashval_t hash)
01625 {
01626   Idx i;
01627   reg_errcode_t err;
01628   re_dfastate_t *newstate;
01629 
01630   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
01631   if (BE (newstate == NULL, 0))
01632     return NULL;
01633   err = re_node_set_init_copy (&newstate->nodes, nodes);
01634   if (BE (err != REG_NOERROR, 0))
01635     {
01636       re_free (newstate);
01637       return NULL;
01638     }
01639 
01640   newstate->entrance_nodes = &newstate->nodes;
01641   for (i = 0 ; i < nodes->nelem ; i++)
01642     {
01643       re_token_t *node = dfa->nodes + nodes->elems[i];
01644       re_token_type_t type = node->type;
01645       if (type == CHARACTER && !node->constraint)
01646        continue;
01647 #ifdef RE_ENABLE_I18N
01648       newstate->accept_mb |= node->accept_mb;
01649 #endif /* RE_ENABLE_I18N */
01650 
01651       /* If the state has the halt node, the state is a halt state.  */
01652       if (type == END_OF_RE)
01653        newstate->halt = 1;
01654       else if (type == OP_BACK_REF)
01655        newstate->has_backref = 1;
01656       else if (type == ANCHOR || node->constraint)
01657        newstate->has_constraint = 1;
01658     }
01659   err = register_state (dfa, newstate, hash);
01660   if (BE (err != REG_NOERROR, 0))
01661     {
01662       free_state (newstate);
01663       newstate = NULL;
01664     }
01665   return newstate;
01666 }
01667 
01668 /* Create the new state which is depend on the context CONTEXT.
01669    Return the new state if succeeded, otherwise return NULL.  */
01670 
01671 static re_dfastate_t *
01672 internal_function __attribute_warn_unused_result__
01673 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
01674                   unsigned int context, re_hashval_t hash)
01675 {
01676   Idx i, nctx_nodes = 0;
01677   reg_errcode_t err;
01678   re_dfastate_t *newstate;
01679 
01680   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
01681   if (BE (newstate == NULL, 0))
01682     return NULL;
01683   err = re_node_set_init_copy (&newstate->nodes, nodes);
01684   if (BE (err != REG_NOERROR, 0))
01685     {
01686       re_free (newstate);
01687       return NULL;
01688     }
01689 
01690   newstate->context = context;
01691   newstate->entrance_nodes = &newstate->nodes;
01692 
01693   for (i = 0 ; i < nodes->nelem ; i++)
01694     {
01695       re_token_t *node = dfa->nodes + nodes->elems[i];
01696       re_token_type_t type = node->type;
01697       unsigned int constraint = node->constraint;
01698 
01699       if (type == CHARACTER && !constraint)
01700        continue;
01701 #ifdef RE_ENABLE_I18N
01702       newstate->accept_mb |= node->accept_mb;
01703 #endif /* RE_ENABLE_I18N */
01704 
01705       /* If the state has the halt node, the state is a halt state.  */
01706       if (type == END_OF_RE)
01707        newstate->halt = 1;
01708       else if (type == OP_BACK_REF)
01709        newstate->has_backref = 1;
01710 
01711       if (constraint)
01712        {
01713          if (newstate->entrance_nodes == &newstate->nodes)
01714            {
01715              newstate->entrance_nodes = re_malloc (re_node_set, 1);
01716              if (BE (newstate->entrance_nodes == NULL, 0))
01717               {
01718                 free_state (newstate);
01719                 return NULL;
01720               }
01721              if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
01722                 != REG_NOERROR)
01723               return NULL;
01724              nctx_nodes = 0;
01725              newstate->has_constraint = 1;
01726            }
01727 
01728          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
01729            {
01730              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
01731              ++nctx_nodes;
01732            }
01733        }
01734     }
01735   err = register_state (dfa, newstate, hash);
01736   if (BE (err != REG_NOERROR, 0))
01737     {
01738       free_state (newstate);
01739       newstate = NULL;
01740     }
01741   return  newstate;
01742 }