Back to index

glibc  2.9
regex_internal.c
Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 static void re_string_construct_common (const char *str, int len,
00022                                    re_string_t *pstr,
00023                                    RE_TRANSLATE_TYPE trans, int icase,
00024                                    const re_dfa_t *dfa) internal_function;
00025 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
00026                                      const re_node_set *nodes,
00027                                      unsigned int hash) internal_function;
00028 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
00029                                      const re_node_set *nodes,
00030                                      unsigned int context,
00031                                      unsigned int hash) internal_function;
00032 
00033 /* Functions for string operation.  */
00034 
00035 /* This function allocate the buffers.  It is necessary to call
00036    re_string_reconstruct before using the object.  */
00037 
00038 static reg_errcode_t
00039 internal_function
00040 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
00041                   RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
00042 {
00043   reg_errcode_t ret;
00044   int init_buf_len;
00045 
00046   /* Ensure at least one character fits into the buffers.  */
00047   if (init_len < dfa->mb_cur_max)
00048     init_len = dfa->mb_cur_max;
00049   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
00050   re_string_construct_common (str, len, pstr, trans, icase, dfa);
00051 
00052   ret = re_string_realloc_buffers (pstr, init_buf_len);
00053   if (BE (ret != REG_NOERROR, 0))
00054     return ret;
00055 
00056   pstr->word_char = dfa->word_char;
00057   pstr->word_ops_used = dfa->word_ops_used;
00058   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
00059   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
00060   pstr->valid_raw_len = pstr->valid_len;
00061   return REG_NOERROR;
00062 }
00063 
00064 /* This function allocate the buffers, and initialize them.  */
00065 
00066 static reg_errcode_t
00067 internal_function
00068 re_string_construct (re_string_t *pstr, const char *str, int len,
00069                    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
00070 {
00071   reg_errcode_t ret;
00072   memset (pstr, '\0', sizeof (re_string_t));
00073   re_string_construct_common (str, len, pstr, trans, icase, dfa);
00074 
00075   if (len > 0)
00076     {
00077       ret = re_string_realloc_buffers (pstr, len + 1);
00078       if (BE (ret != REG_NOERROR, 0))
00079        return ret;
00080     }
00081   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
00082 
00083   if (icase)
00084     {
00085 #ifdef RE_ENABLE_I18N
00086       if (dfa->mb_cur_max > 1)
00087        {
00088          while (1)
00089            {
00090              ret = build_wcs_upper_buffer (pstr);
00091              if (BE (ret != REG_NOERROR, 0))
00092               return ret;
00093              if (pstr->valid_raw_len >= len)
00094               break;
00095              if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
00096               break;
00097              ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
00098              if (BE (ret != REG_NOERROR, 0))
00099               return ret;
00100            }
00101        }
00102       else
00103 #endif /* RE_ENABLE_I18N  */
00104        build_upper_buffer (pstr);
00105     }
00106   else
00107     {
00108 #ifdef RE_ENABLE_I18N
00109       if (dfa->mb_cur_max > 1)
00110        build_wcs_buffer (pstr);
00111       else
00112 #endif /* RE_ENABLE_I18N  */
00113        {
00114          if (trans != NULL)
00115            re_string_translate_buffer (pstr);
00116          else
00117            {
00118              pstr->valid_len = pstr->bufs_len;
00119              pstr->valid_raw_len = pstr->bufs_len;
00120            }
00121        }
00122     }
00123 
00124   return REG_NOERROR;
00125 }
00126 
00127 /* Helper functions for re_string_allocate, and re_string_construct.  */
00128 
00129 static reg_errcode_t
00130 internal_function
00131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
00132 {
00133 #ifdef RE_ENABLE_I18N
00134   if (pstr->mb_cur_max > 1)
00135     {
00136       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
00137       if (BE (new_wcs == NULL, 0))
00138        return REG_ESPACE;
00139       pstr->wcs = new_wcs;
00140       if (pstr->offsets != NULL)
00141        {
00142          int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
00143          if (BE (new_offsets == NULL, 0))
00144            return REG_ESPACE;
00145          pstr->offsets = new_offsets;
00146        }
00147     }
00148 #endif /* RE_ENABLE_I18N  */
00149   if (pstr->mbs_allocated)
00150     {
00151       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
00152                                       new_buf_len);
00153       if (BE (new_mbs == NULL, 0))
00154        return REG_ESPACE;
00155       pstr->mbs = new_mbs;
00156     }
00157   pstr->bufs_len = new_buf_len;
00158   return REG_NOERROR;
00159 }
00160 
00161 
00162 static void
00163 internal_function
00164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
00165                          RE_TRANSLATE_TYPE trans, int icase,
00166                          const re_dfa_t *dfa)
00167 {
00168   pstr->raw_mbs = (const unsigned char *) str;
00169   pstr->len = len;
00170   pstr->raw_len = len;
00171   pstr->trans = trans;
00172   pstr->icase = icase ? 1 : 0;
00173   pstr->mbs_allocated = (trans != NULL || icase);
00174   pstr->mb_cur_max = dfa->mb_cur_max;
00175   pstr->is_utf8 = dfa->is_utf8;
00176   pstr->map_notascii = dfa->map_notascii;
00177   pstr->stop = pstr->len;
00178   pstr->raw_stop = pstr->stop;
00179 }
00180 
00181 #ifdef RE_ENABLE_I18N
00182 
00183 /* Build wide character buffer PSTR->WCS.
00184    If the byte sequence of the string are:
00185      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
00186    Then wide character buffer will be:
00187      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
00188    We use WEOF for padding, they indicate that the position isn't
00189    a first byte of a multibyte character.
00190 
00191    Note that this function assumes PSTR->VALID_LEN elements are already
00192    built and starts from PSTR->VALID_LEN.  */
00193 
00194 static void
00195 internal_function
00196 build_wcs_buffer (re_string_t *pstr)
00197 {
00198 #ifdef _LIBC
00199   unsigned char buf[MB_LEN_MAX];
00200   assert (MB_LEN_MAX >= pstr->mb_cur_max);
00201 #else
00202   unsigned char buf[64];
00203 #endif
00204   mbstate_t prev_st;
00205   int byte_idx, end_idx, remain_len;
00206   size_t mbclen;
00207 
00208   /* Build the buffers from pstr->valid_len to either pstr->len or
00209      pstr->bufs_len.  */
00210   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00211   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
00212     {
00213       wchar_t wc;
00214       const char *p;
00215 
00216       remain_len = end_idx - byte_idx;
00217       prev_st = pstr->cur_state;
00218       /* Apply the translation if we need.  */
00219       if (BE (pstr->trans != NULL, 0))
00220        {
00221          int i, ch;
00222 
00223          for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
00224            {
00225              ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
00226              buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
00227            }
00228          p = (const char *) buf;
00229        }
00230       else
00231        p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
00232       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
00233       if (BE (mbclen == (size_t) -2, 0))
00234        {
00235          /* The buffer doesn't have enough space, finish to build.  */
00236          pstr->cur_state = prev_st;
00237          break;
00238        }
00239       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
00240        {
00241          /* We treat these cases as a singlebyte character.  */
00242          mbclen = 1;
00243          wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00244          if (BE (pstr->trans != NULL, 0))
00245            wc = pstr->trans[wc];
00246          pstr->cur_state = prev_st;
00247        }
00248 
00249       /* Write wide character and padding.  */
00250       pstr->wcs[byte_idx++] = wc;
00251       /* Write paddings.  */
00252       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00253        pstr->wcs[byte_idx++] = WEOF;
00254     }
00255   pstr->valid_len = byte_idx;
00256   pstr->valid_raw_len = byte_idx;
00257 }
00258 
00259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
00260    but for REG_ICASE.  */
00261 
00262 static reg_errcode_t
00263 internal_function
00264 build_wcs_upper_buffer (re_string_t *pstr)
00265 {
00266   mbstate_t prev_st;
00267   int src_idx, byte_idx, end_idx, remain_len;
00268   size_t mbclen;
00269 #ifdef _LIBC
00270   char buf[MB_LEN_MAX];
00271   assert (MB_LEN_MAX >= pstr->mb_cur_max);
00272 #else
00273   char buf[64];
00274 #endif
00275 
00276   byte_idx = pstr->valid_len;
00277   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00278 
00279   /* The following optimization assumes that ASCII characters can be
00280      mapped to wide characters with a simple cast.  */
00281   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
00282     {
00283       while (byte_idx < end_idx)
00284        {
00285          wchar_t wc;
00286 
00287          if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
00288              && mbsinit (&pstr->cur_state))
00289            {
00290              /* In case of a singlebyte character.  */
00291              pstr->mbs[byte_idx]
00292               = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
00293              /* The next step uses the assumption that wchar_t is encoded
00294                ASCII-safe: all ASCII values can be converted like this.  */
00295              pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
00296              ++byte_idx;
00297              continue;
00298            }
00299 
00300          remain_len = end_idx - byte_idx;
00301          prev_st = pstr->cur_state;
00302          mbclen = mbrtowc (&wc,
00303                          ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
00304                           + byte_idx), remain_len, &pstr->cur_state);
00305          if (BE (mbclen + 2 > 2, 1))
00306            {
00307              wchar_t wcu = wc;
00308              if (iswlower (wc))
00309               {
00310                 size_t mbcdlen;
00311 
00312                 wcu = towupper (wc);
00313                 mbcdlen = wcrtomb (buf, wcu, &prev_st);
00314                 if (BE (mbclen == mbcdlen, 1))
00315                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
00316                 else
00317                   {
00318                     src_idx = byte_idx;
00319                     goto offsets_needed;
00320                   }
00321               }
00322              else
00323               memcpy (pstr->mbs + byte_idx,
00324                      pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
00325              pstr->wcs[byte_idx++] = wcu;
00326              /* Write paddings.  */
00327              for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00328               pstr->wcs[byte_idx++] = WEOF;
00329            }
00330          else if (mbclen == (size_t) -1 || mbclen == 0)
00331            {
00332              /* It is an invalid character or '\0'.  Just use the byte.  */
00333              int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00334              pstr->mbs[byte_idx] = ch;
00335              /* And also cast it to wide char.  */
00336              pstr->wcs[byte_idx++] = (wchar_t) ch;
00337              if (BE (mbclen == (size_t) -1, 0))
00338               pstr->cur_state = prev_st;
00339            }
00340          else
00341            {
00342              /* The buffer doesn't have enough space, finish to build.  */
00343              pstr->cur_state = prev_st;
00344              break;
00345            }
00346        }
00347       pstr->valid_len = byte_idx;
00348       pstr->valid_raw_len = byte_idx;
00349       return REG_NOERROR;
00350     }
00351   else
00352     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
00353       {
00354        wchar_t wc;
00355        const char *p;
00356       offsets_needed:
00357        remain_len = end_idx - byte_idx;
00358        prev_st = pstr->cur_state;
00359        if (BE (pstr->trans != NULL, 0))
00360          {
00361            int i, ch;
00362 
00363            for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
00364              {
00365               ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
00366               buf[i] = pstr->trans[ch];
00367              }
00368            p = (const char *) buf;
00369          }
00370        else
00371          p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
00372        mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
00373        if (BE (mbclen + 2 > 2, 1))
00374          {
00375            wchar_t wcu = wc;
00376            if (iswlower (wc))
00377              {
00378               size_t mbcdlen;
00379 
00380               wcu = towupper (wc);
00381               mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
00382               if (BE (mbclen == mbcdlen, 1))
00383                 memcpy (pstr->mbs + byte_idx, buf, mbclen);
00384               else if (mbcdlen != (size_t) -1)
00385                 {
00386                   size_t i;
00387 
00388                   if (byte_idx + mbcdlen > pstr->bufs_len)
00389                     {
00390                      pstr->cur_state = prev_st;
00391                      break;
00392                     }
00393 
00394                   if (pstr->offsets == NULL)
00395                     {
00396                      pstr->offsets = re_malloc (int, pstr->bufs_len);
00397 
00398                      if (pstr->offsets == NULL)
00399                        return REG_ESPACE;
00400                     }
00401                   if (!pstr->offsets_needed)
00402                     {
00403                      for (i = 0; i < (size_t) byte_idx; ++i)
00404                        pstr->offsets[i] = i;
00405                      pstr->offsets_needed = 1;
00406                     }
00407 
00408                   memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
00409                   pstr->wcs[byte_idx] = wcu;
00410                   pstr->offsets[byte_idx] = src_idx;
00411                   for (i = 1; i < mbcdlen; ++i)
00412                     {
00413                      pstr->offsets[byte_idx + i]
00414                        = src_idx + (i < mbclen ? i : mbclen - 1);
00415                      pstr->wcs[byte_idx + i] = WEOF;
00416                     }
00417                   pstr->len += mbcdlen - mbclen;
00418                   if (pstr->raw_stop > src_idx)
00419                     pstr->stop += mbcdlen - mbclen;
00420                   end_idx = (pstr->bufs_len > pstr->len)
00421                            ? pstr->len : pstr->bufs_len;
00422                   byte_idx += mbcdlen;
00423                   src_idx += mbclen;
00424                   continue;
00425                 }
00426                 else
00427                   memcpy (pstr->mbs + byte_idx, p, mbclen);
00428              }
00429            else
00430              memcpy (pstr->mbs + byte_idx, p, mbclen);
00431 
00432            if (BE (pstr->offsets_needed != 0, 0))
00433              {
00434               size_t i;
00435               for (i = 0; i < mbclen; ++i)
00436                 pstr->offsets[byte_idx + i] = src_idx + i;
00437              }
00438            src_idx += mbclen;
00439 
00440            pstr->wcs[byte_idx++] = wcu;
00441            /* Write paddings.  */
00442            for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00443              pstr->wcs[byte_idx++] = WEOF;
00444          }
00445        else if (mbclen == (size_t) -1 || mbclen == 0)
00446          {
00447            /* It is an invalid character or '\0'.  Just use the byte.  */
00448            int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
00449 
00450            if (BE (pstr->trans != NULL, 0))
00451              ch = pstr->trans [ch];
00452            pstr->mbs[byte_idx] = ch;
00453 
00454            if (BE (pstr->offsets_needed != 0, 0))
00455              pstr->offsets[byte_idx] = src_idx;
00456            ++src_idx;
00457 
00458            /* And also cast it to wide char.  */
00459            pstr->wcs[byte_idx++] = (wchar_t) ch;
00460            if (BE (mbclen == (size_t) -1, 0))
00461              pstr->cur_state = prev_st;
00462          }
00463        else
00464          {
00465            /* The buffer doesn't have enough space, finish to build.  */
00466            pstr->cur_state = prev_st;
00467            break;
00468          }
00469       }
00470   pstr->valid_len = byte_idx;
00471   pstr->valid_raw_len = src_idx;
00472   return REG_NOERROR;
00473 }
00474 
00475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
00476    Return the index.  */
00477 
00478 static int
00479 internal_function
00480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
00481 {
00482   mbstate_t prev_st;
00483   int rawbuf_idx;
00484   size_t mbclen;
00485   wchar_t wc = WEOF;
00486 
00487   /* Skip the characters which are not necessary to check.  */
00488   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
00489        rawbuf_idx < new_raw_idx;)
00490     {
00491       int remain_len;
00492       remain_len = pstr->len - rawbuf_idx;
00493       prev_st = pstr->cur_state;
00494       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
00495                      remain_len, &pstr->cur_state);
00496       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
00497        {
00498          /* We treat these cases as a single byte character.  */
00499          if (mbclen == 0 || remain_len == 0)
00500            wc = L'\0';
00501          else
00502            wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
00503          mbclen = 1;
00504          pstr->cur_state = prev_st;
00505        }
00506       /* Then proceed the next character.  */
00507       rawbuf_idx += mbclen;
00508     }
00509   *last_wc = (wint_t) wc;
00510   return rawbuf_idx;
00511 }
00512 #endif /* RE_ENABLE_I18N  */
00513 
00514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
00515    This function is used in case of REG_ICASE.  */
00516 
00517 static void
00518 internal_function
00519 build_upper_buffer (re_string_t *pstr)
00520 {
00521   int char_idx, end_idx;
00522   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00523 
00524   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
00525     {
00526       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
00527       if (BE (pstr->trans != NULL, 0))
00528        ch = pstr->trans[ch];
00529       if (islower (ch))
00530        pstr->mbs[char_idx] = toupper (ch);
00531       else
00532        pstr->mbs[char_idx] = ch;
00533     }
00534   pstr->valid_len = char_idx;
00535   pstr->valid_raw_len = char_idx;
00536 }
00537 
00538 /* Apply TRANS to the buffer in PSTR.  */
00539 
00540 static void
00541 internal_function
00542 re_string_translate_buffer (re_string_t *pstr)
00543 {
00544   int buf_idx, end_idx;
00545   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00546 
00547   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
00548     {
00549       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
00550       pstr->mbs[buf_idx] = pstr->trans[ch];
00551     }
00552 
00553   pstr->valid_len = buf_idx;
00554   pstr->valid_raw_len = buf_idx;
00555 }
00556 
00557 /* This function re-construct the buffers.
00558    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
00559    convert to upper case in case of REG_ICASE, apply translation.  */
00560 
00561 static reg_errcode_t
00562 internal_function
00563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
00564 {
00565   int offset = idx - pstr->raw_mbs_idx;
00566   if (BE (offset < 0, 0))
00567     {
00568       /* Reset buffer.  */
00569 #ifdef RE_ENABLE_I18N
00570       if (pstr->mb_cur_max > 1)
00571        memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
00572 #endif /* RE_ENABLE_I18N */
00573       pstr->len = pstr->raw_len;
00574       pstr->stop = pstr->raw_stop;
00575       pstr->valid_len = 0;
00576       pstr->raw_mbs_idx = 0;
00577       pstr->valid_raw_len = 0;
00578       pstr->offsets_needed = 0;
00579       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
00580                         : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
00581       if (!pstr->mbs_allocated)
00582        pstr->mbs = (unsigned char *) pstr->raw_mbs;
00583       offset = idx;
00584     }
00585 
00586   if (BE (offset != 0, 1))
00587     {
00588       /* Should the already checked characters be kept?  */
00589       if (BE (offset < pstr->valid_raw_len, 1))
00590        {
00591          /* Yes, move them to the front of the buffer.  */
00592 #ifdef RE_ENABLE_I18N
00593          if (BE (pstr->offsets_needed, 0))
00594            {
00595              int low = 0, high = pstr->valid_len, mid;
00596              do
00597               {
00598                 mid = (high + low) / 2;
00599                 if (pstr->offsets[mid] > offset)
00600                   high = mid;
00601                 else if (pstr->offsets[mid] < offset)
00602                   low = mid + 1;
00603                 else
00604                   break;
00605               }
00606              while (low < high);
00607              if (pstr->offsets[mid] < offset)
00608               ++mid;
00609              pstr->tip_context = re_string_context_at (pstr, mid - 1,
00610                                                  eflags);
00611              /* This can be quite complicated, so handle specially
00612                only the common and easy case where the character with
00613                different length representation of lower and upper
00614                case is present at or after offset.  */
00615              if (pstr->valid_len > offset
00616                 && mid == offset && pstr->offsets[mid] == offset)
00617               {
00618                 memmove (pstr->wcs, pstr->wcs + offset,
00619                         (pstr->valid_len - offset) * sizeof (wint_t));
00620                 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
00621                 pstr->valid_len -= offset;
00622                 pstr->valid_raw_len -= offset;
00623                 for (low = 0; low < pstr->valid_len; low++)
00624                   pstr->offsets[low] = pstr->offsets[low + offset] - offset;
00625               }
00626              else
00627               {
00628                 /* Otherwise, just find out how long the partial multibyte
00629                    character at offset is and fill it with WEOF/255.  */
00630                 pstr->len = pstr->raw_len - idx + offset;
00631                 pstr->stop = pstr->raw_stop - idx + offset;
00632                 pstr->offsets_needed = 0;
00633                 while (mid > 0 && pstr->offsets[mid - 1] == offset)
00634                   --mid;
00635                 while (mid < pstr->valid_len)
00636                   if (pstr->wcs[mid] != WEOF)
00637                     break;
00638                   else
00639                     ++mid;
00640                 if (mid == pstr->valid_len)
00641                   pstr->valid_len = 0;
00642                 else
00643                   {
00644                     pstr->valid_len = pstr->offsets[mid] - offset;
00645                     if (pstr->valid_len)
00646                      {
00647                        for (low = 0; low < pstr->valid_len; ++low)
00648                          pstr->wcs[low] = WEOF;
00649                        memset (pstr->mbs, 255, pstr->valid_len);
00650                      }
00651                   }
00652                 pstr->valid_raw_len = pstr->valid_len;
00653               }
00654            }
00655          else
00656 #endif
00657            {
00658              pstr->tip_context = re_string_context_at (pstr, offset - 1,
00659                                                  eflags);
00660 #ifdef RE_ENABLE_I18N
00661              if (pstr->mb_cur_max > 1)
00662               memmove (pstr->wcs, pstr->wcs + offset,
00663                       (pstr->valid_len - offset) * sizeof (wint_t));
00664 #endif /* RE_ENABLE_I18N */
00665              if (BE (pstr->mbs_allocated, 0))
00666               memmove (pstr->mbs, pstr->mbs + offset,
00667                       pstr->valid_len - offset);
00668              pstr->valid_len -= offset;
00669              pstr->valid_raw_len -= offset;
00670 #if DEBUG
00671              assert (pstr->valid_len > 0);
00672 #endif
00673            }
00674        }
00675       else
00676        {
00677          /* No, skip all characters until IDX.  */
00678          int prev_valid_len = pstr->valid_len;
00679 
00680 #ifdef RE_ENABLE_I18N
00681          if (BE (pstr->offsets_needed, 0))
00682            {
00683              pstr->len = pstr->raw_len - idx + offset;
00684              pstr->stop = pstr->raw_stop - idx + offset;
00685              pstr->offsets_needed = 0;
00686            }
00687 #endif
00688          pstr->valid_len = 0;
00689 #ifdef RE_ENABLE_I18N
00690          if (pstr->mb_cur_max > 1)
00691            {
00692              int wcs_idx;
00693              wint_t wc = WEOF;
00694 
00695              if (pstr->is_utf8)
00696               {
00697                 const unsigned char *raw, *p, *q, *end;
00698 
00699                 /* Special case UTF-8.  Multi-byte chars start with any
00700                    byte other than 0x80 - 0xbf.  */
00701                 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
00702                 end = raw + (offset - pstr->mb_cur_max);
00703                 if (end < pstr->raw_mbs)
00704                   end = pstr->raw_mbs;
00705                 p = raw + offset - 1;
00706 #ifdef _LIBC
00707                 /* We know the wchar_t encoding is UCS4, so for the simple
00708                    case, ASCII characters, skip the conversion step.  */
00709                 if (isascii (*p) && BE (pstr->trans == NULL, 1))
00710                   {
00711                     memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
00712                     /* pstr->valid_len = 0; */
00713                     wc = (wchar_t) *p;
00714                   }
00715                 else
00716 #endif
00717                   for (; p >= end; --p)
00718                     if ((*p & 0xc0) != 0x80)
00719                      {
00720                        mbstate_t cur_state;
00721                        wchar_t wc2;
00722                        int mlen = raw + pstr->len - p;
00723                        unsigned char buf[6];
00724                        size_t mbclen;
00725 
00726                        q = p;
00727                        if (BE (pstr->trans != NULL, 0))
00728                          {
00729                            int i = mlen < 6 ? mlen : 6;
00730                            while (--i >= 0)
00731                             buf[i] = pstr->trans[p[i]];
00732                            q = buf;
00733                          }
00734                        /* XXX Don't use mbrtowc, we know which conversion
00735                           to use (UTF-8 -> UCS4).  */
00736                        memset (&cur_state, 0, sizeof (cur_state));
00737                        mbclen = mbrtowc (&wc2, (const char *) p, mlen,
00738                                        &cur_state);
00739                        if (raw + offset - p <= mbclen
00740                            && mbclen < (size_t) -2)
00741                          {
00742                            memset (&pstr->cur_state, '\0',
00743                                   sizeof (mbstate_t));
00744                            pstr->valid_len = mbclen - (raw + offset - p);
00745                            wc = wc2;
00746                          }
00747                        break;
00748                      }
00749               }
00750 
00751              if (wc == WEOF)
00752               pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
00753              if (wc == WEOF)
00754               pstr->tip_context
00755                 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
00756              else
00757               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
00758                                   && IS_WIDE_WORD_CHAR (wc))
00759                                  ? CONTEXT_WORD
00760                                  : ((IS_WIDE_NEWLINE (wc)
00761                                     && pstr->newline_anchor)
00762                                    ? CONTEXT_NEWLINE : 0));
00763              if (BE (pstr->valid_len, 0))
00764               {
00765                 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
00766                   pstr->wcs[wcs_idx] = WEOF;
00767                 if (pstr->mbs_allocated)
00768                   memset (pstr->mbs, 255, pstr->valid_len);
00769               }
00770              pstr->valid_raw_len = pstr->valid_len;
00771            }
00772          else
00773 #endif /* RE_ENABLE_I18N */
00774            {
00775              int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
00776              pstr->valid_raw_len = 0;
00777              if (pstr->trans)
00778               c = pstr->trans[c];
00779              pstr->tip_context = (bitset_contain (pstr->word_char, c)
00780                                ? CONTEXT_WORD
00781                                : ((IS_NEWLINE (c) && pstr->newline_anchor)
00782                                   ? CONTEXT_NEWLINE : 0));
00783            }
00784        }
00785       if (!BE (pstr->mbs_allocated, 0))
00786        pstr->mbs += offset;
00787     }
00788   pstr->raw_mbs_idx = idx;
00789   pstr->len -= offset;
00790   pstr->stop -= offset;
00791 
00792   /* Then build the buffers.  */
00793 #ifdef RE_ENABLE_I18N
00794   if (pstr->mb_cur_max > 1)
00795     {
00796       if (pstr->icase)
00797        {
00798          reg_errcode_t ret = build_wcs_upper_buffer (pstr);
00799          if (BE (ret != REG_NOERROR, 0))
00800            return ret;
00801        }
00802       else
00803        build_wcs_buffer (pstr);
00804     }
00805   else
00806 #endif /* RE_ENABLE_I18N */
00807     if (BE (pstr->mbs_allocated, 0))
00808       {
00809        if (pstr->icase)
00810          build_upper_buffer (pstr);
00811        else if (pstr->trans != NULL)
00812          re_string_translate_buffer (pstr);
00813       }
00814     else
00815       pstr->valid_len = pstr->len;
00816 
00817   pstr->cur_idx = 0;
00818   return REG_NOERROR;
00819 }
00820 
00821 static unsigned char
00822 internal_function __attribute ((pure))
00823 re_string_peek_byte_case (const re_string_t *pstr, int idx)
00824 {
00825   int ch, off;
00826 
00827   /* Handle the common (easiest) cases first.  */
00828   if (BE (!pstr->mbs_allocated, 1))
00829     return re_string_peek_byte (pstr, idx);
00830 
00831 #ifdef RE_ENABLE_I18N
00832   if (pstr->mb_cur_max > 1
00833       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
00834     return re_string_peek_byte (pstr, idx);
00835 #endif
00836 
00837   off = pstr->cur_idx + idx;
00838 #ifdef RE_ENABLE_I18N
00839   if (pstr->offsets_needed)
00840     off = pstr->offsets[off];
00841 #endif
00842 
00843   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
00844 
00845 #ifdef RE_ENABLE_I18N
00846   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
00847      this function returns CAPITAL LETTER I instead of first byte of
00848      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
00849      since peek_byte_case doesn't advance cur_idx in any way.  */
00850   if (pstr->offsets_needed && !isascii (ch))
00851     return re_string_peek_byte (pstr, idx);
00852 #endif
00853 
00854   return ch;
00855 }
00856 
00857 static unsigned char
00858 internal_function __attribute ((pure))
00859 re_string_fetch_byte_case (re_string_t *pstr)
00860 {
00861   if (BE (!pstr->mbs_allocated, 1))
00862     return re_string_fetch_byte (pstr);
00863 
00864 #ifdef RE_ENABLE_I18N
00865   if (pstr->offsets_needed)
00866     {
00867       int off, ch;
00868 
00869       /* For tr_TR.UTF-8 [[:islower:]] there is
00870         [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
00871         in that case the whole multi-byte character and return
00872         the original letter.  On the other side, with
00873         [[: DOTLESS SMALL LETTER I return [[:I, as doing
00874         anything else would complicate things too much.  */
00875 
00876       if (!re_string_first_byte (pstr, pstr->cur_idx))
00877        return re_string_fetch_byte (pstr);
00878 
00879       off = pstr->offsets[pstr->cur_idx];
00880       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
00881 
00882       if (! isascii (ch))
00883        return re_string_fetch_byte (pstr);
00884 
00885       re_string_skip_bytes (pstr,
00886                          re_string_char_size_at (pstr, pstr->cur_idx));
00887       return ch;
00888     }
00889 #endif
00890 
00891   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
00892 }
00893 
00894 static void
00895 internal_function
00896 re_string_destruct (re_string_t *pstr)
00897 {
00898 #ifdef RE_ENABLE_I18N
00899   re_free (pstr->wcs);
00900   re_free (pstr->offsets);
00901 #endif /* RE_ENABLE_I18N  */
00902   if (pstr->mbs_allocated)
00903     re_free (pstr->mbs);
00904 }
00905 
00906 /* Return the context at IDX in INPUT.  */
00907 
00908 static unsigned int
00909 internal_function
00910 re_string_context_at (const re_string_t *input, int idx, int eflags)
00911 {
00912   int c;
00913   if (BE (idx < 0, 0))
00914     /* In this case, we use the value stored in input->tip_context,
00915        since we can't know the character in input->mbs[-1] here.  */
00916     return input->tip_context;
00917   if (BE (idx == input->len, 0))
00918     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
00919            : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
00920 #ifdef RE_ENABLE_I18N
00921   if (input->mb_cur_max > 1)
00922     {
00923       wint_t wc;
00924       int wc_idx = idx;
00925       while(input->wcs[wc_idx] == WEOF)
00926        {
00927 #ifdef DEBUG
00928          /* It must not happen.  */
00929          assert (wc_idx >= 0);
00930 #endif
00931          --wc_idx;
00932          if (wc_idx < 0)
00933            return input->tip_context;
00934        }
00935       wc = input->wcs[wc_idx];
00936       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
00937        return CONTEXT_WORD;
00938       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
00939              ? CONTEXT_NEWLINE : 0);
00940     }
00941   else
00942 #endif
00943     {
00944       c = re_string_byte_at (input, idx);
00945       if (bitset_contain (input->word_char, c))
00946        return CONTEXT_WORD;
00947       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
00948     }
00949 }
00950 
00951 /* Functions for set operation.  */
00952 
00953 static reg_errcode_t
00954 internal_function
00955 re_node_set_alloc (re_node_set *set, int size)
00956 {
00957   set->alloc = size;
00958   set->nelem = 0;
00959   set->elems = re_malloc (int, size);
00960   if (BE (set->elems == NULL, 0))
00961     return REG_ESPACE;
00962   return REG_NOERROR;
00963 }
00964 
00965 static reg_errcode_t
00966 internal_function
00967 re_node_set_init_1 (re_node_set *set, int elem)
00968 {
00969   set->alloc = 1;
00970   set->nelem = 1;
00971   set->elems = re_malloc (int, 1);
00972   if (BE (set->elems == NULL, 0))
00973     {
00974       set->alloc = set->nelem = 0;
00975       return REG_ESPACE;
00976     }
00977   set->elems[0] = elem;
00978   return REG_NOERROR;
00979 }
00980 
00981 static reg_errcode_t
00982 internal_function
00983 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
00984 {
00985   set->alloc = 2;
00986   set->elems = re_malloc (int, 2);
00987   if (BE (set->elems == NULL, 0))
00988     return REG_ESPACE;
00989   if (elem1 == elem2)
00990     {
00991       set->nelem = 1;
00992       set->elems[0] = elem1;
00993     }
00994   else
00995     {
00996       set->nelem = 2;
00997       if (elem1 < elem2)
00998        {
00999          set->elems[0] = elem1;
01000          set->elems[1] = elem2;
01001        }
01002       else
01003        {
01004          set->elems[0] = elem2;
01005          set->elems[1] = elem1;
01006        }
01007     }
01008   return REG_NOERROR;
01009 }
01010 
01011 static reg_errcode_t
01012 internal_function
01013 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
01014 {
01015   dest->nelem = src->nelem;
01016   if (src->nelem > 0)
01017     {
01018       dest->alloc = dest->nelem;
01019       dest->elems = re_malloc (int, dest->alloc);
01020       if (BE (dest->elems == NULL, 0))
01021        {
01022          dest->alloc = dest->nelem = 0;
01023          return REG_ESPACE;
01024        }
01025       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
01026     }
01027   else
01028     re_node_set_init_empty (dest);
01029   return REG_NOERROR;
01030 }
01031 
01032 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
01033    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
01034    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
01035 
01036 static reg_errcode_t
01037 internal_function
01038 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
01039                         const re_node_set *src2)
01040 {
01041   int i1, i2, is, id, delta, sbase;
01042   if (src1->nelem == 0 || src2->nelem == 0)
01043     return REG_NOERROR;
01044 
01045   /* We need dest->nelem + 2 * elems_in_intersection; this is a
01046      conservative estimate.  */
01047   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
01048     {
01049       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
01050       int *new_elems = re_realloc (dest->elems, int, new_alloc);
01051       if (BE (new_elems == NULL, 0))
01052         return REG_ESPACE;
01053       dest->elems = new_elems;
01054       dest->alloc = new_alloc;
01055     }
01056 
01057   /* Find the items in the intersection of SRC1 and SRC2, and copy
01058      into the top of DEST those that are not already in DEST itself.  */
01059   sbase = dest->nelem + src1->nelem + src2->nelem;
01060   i1 = src1->nelem - 1;
01061   i2 = src2->nelem - 1;
01062   id = dest->nelem - 1;
01063   for (;;)
01064     {
01065       if (src1->elems[i1] == src2->elems[i2])
01066        {
01067          /* Try to find the item in DEST.  Maybe we could binary search?  */
01068          while (id >= 0 && dest->elems[id] > src1->elems[i1])
01069            --id;
01070 
01071           if (id < 0 || dest->elems[id] != src1->elems[i1])
01072             dest->elems[--sbase] = src1->elems[i1];
01073 
01074          if (--i1 < 0 || --i2 < 0)
01075            break;
01076        }
01077 
01078       /* Lower the highest of the two items.  */
01079       else if (src1->elems[i1] < src2->elems[i2])
01080        {
01081          if (--i2 < 0)
01082            break;
01083        }
01084       else
01085        {
01086          if (--i1 < 0)
01087            break;
01088        }
01089     }
01090 
01091   id = dest->nelem - 1;
01092   is = dest->nelem + src1->nelem + src2->nelem - 1;
01093   delta = is - sbase + 1;
01094 
01095   /* Now copy.  When DELTA becomes zero, the remaining
01096      DEST elements are already in place; this is more or
01097      less the same loop that is in re_node_set_merge.  */
01098   dest->nelem += delta;
01099   if (delta > 0 && id >= 0)
01100     for (;;)
01101       {
01102         if (dest->elems[is] > dest->elems[id])
01103           {
01104             /* Copy from the top.  */
01105             dest->elems[id + delta--] = dest->elems[is--];
01106             if (delta == 0)
01107               break;
01108           }
01109         else
01110           {
01111             /* Slide from the bottom.  */
01112             dest->elems[id + delta] = dest->elems[id];
01113             if (--id < 0)
01114               break;
01115           }
01116       }
01117 
01118   /* Copy remaining SRC elements.  */
01119   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
01120 
01121   return REG_NOERROR;
01122 }
01123 
01124 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
01125    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
01126 
01127 static reg_errcode_t
01128 internal_function
01129 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
01130                      const re_node_set *src2)
01131 {
01132   int i1, i2, id;
01133   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
01134     {
01135       dest->alloc = src1->nelem + src2->nelem;
01136       dest->elems = re_malloc (int, dest->alloc);
01137       if (BE (dest->elems == NULL, 0))
01138        return REG_ESPACE;
01139     }
01140   else
01141     {
01142       if (src1 != NULL && src1->nelem > 0)
01143        return re_node_set_init_copy (dest, src1);
01144       else if (src2 != NULL && src2->nelem > 0)
01145        return re_node_set_init_copy (dest, src2);
01146       else
01147        re_node_set_init_empty (dest);
01148       return REG_NOERROR;
01149     }
01150   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
01151     {
01152       if (src1->elems[i1] > src2->elems[i2])
01153        {
01154          dest->elems[id++] = src2->elems[i2++];
01155          continue;
01156        }
01157       if (src1->elems[i1] == src2->elems[i2])
01158        ++i2;
01159       dest->elems[id++] = src1->elems[i1++];
01160     }
01161   if (i1 < src1->nelem)
01162     {
01163       memcpy (dest->elems + id, src1->elems + i1,
01164             (src1->nelem - i1) * sizeof (int));
01165       id += src1->nelem - i1;
01166     }
01167   else if (i2 < src2->nelem)
01168     {
01169       memcpy (dest->elems + id, src2->elems + i2,
01170             (src2->nelem - i2) * sizeof (int));
01171       id += src2->nelem - i2;
01172     }
01173   dest->nelem = id;
01174   return REG_NOERROR;
01175 }
01176 
01177 /* Calculate the union set of the sets DEST and SRC. And store it to
01178    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
01179 
01180 static reg_errcode_t
01181 internal_function
01182 re_node_set_merge (re_node_set *dest, const re_node_set *src)
01183 {
01184   int is, id, sbase, delta;
01185   if (src == NULL || src->nelem == 0)
01186     return REG_NOERROR;
01187   if (dest->alloc < 2 * src->nelem + dest->nelem)
01188     {
01189       int new_alloc = 2 * (src->nelem + dest->alloc);
01190       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
01191       if (BE (new_buffer == NULL, 0))
01192        return REG_ESPACE;
01193       dest->elems = new_buffer;
01194       dest->alloc = new_alloc;
01195     }
01196 
01197   if (BE (dest->nelem == 0, 0))
01198     {
01199       dest->nelem = src->nelem;
01200       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
01201       return REG_NOERROR;
01202     }
01203 
01204   /* Copy into the top of DEST the items of SRC that are not
01205      found in DEST.  Maybe we could binary search in DEST?  */
01206   for (sbase = dest->nelem + 2 * src->nelem,
01207        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
01208     {
01209       if (dest->elems[id] == src->elems[is])
01210         is--, id--;
01211       else if (dest->elems[id] < src->elems[is])
01212         dest->elems[--sbase] = src->elems[is--];
01213       else /* if (dest->elems[id] > src->elems[is]) */
01214         --id;
01215     }
01216 
01217   if (is >= 0)
01218     {
01219       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
01220       sbase -= is + 1;
01221       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
01222     }
01223 
01224   id = dest->nelem - 1;
01225   is = dest->nelem + 2 * src->nelem - 1;
01226   delta = is - sbase + 1;
01227   if (delta == 0)
01228     return REG_NOERROR;
01229 
01230   /* Now copy.  When DELTA becomes zero, the remaining
01231      DEST elements are already in place.  */
01232   dest->nelem += delta;
01233   for (;;)
01234     {
01235       if (dest->elems[is] > dest->elems[id])
01236         {
01237          /* Copy from the top.  */
01238           dest->elems[id + delta--] = dest->elems[is--];
01239          if (delta == 0)
01240            break;
01241        }
01242       else
01243         {
01244           /* Slide from the bottom.  */
01245           dest->elems[id + delta] = dest->elems[id];
01246          if (--id < 0)
01247            {
01248              /* Copy remaining SRC elements.  */
01249              memcpy (dest->elems, dest->elems + sbase,
01250                      delta * sizeof (int));
01251              break;
01252            }
01253        }
01254     }
01255 
01256   return REG_NOERROR;
01257 }
01258 
01259 /* Insert the new element ELEM to the re_node_set* SET.
01260    SET should not already have ELEM.
01261    return -1 if an error is occured, return 1 otherwise.  */
01262 
01263 static int
01264 internal_function
01265 re_node_set_insert (re_node_set *set, int elem)
01266 {
01267   int idx;
01268   /* In case the set is empty.  */
01269   if (set->alloc == 0)
01270     {
01271       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
01272        return 1;
01273       else
01274        return -1;
01275     }
01276 
01277   if (BE (set->nelem, 0) == 0)
01278     {
01279       /* We already guaranteed above that set->alloc != 0.  */
01280       set->elems[0] = elem;
01281       ++set->nelem;
01282       return 1;
01283     }
01284 
01285   /* Realloc if we need.  */
01286   if (set->alloc == set->nelem)
01287     {
01288       int *new_elems;
01289       set->alloc = set->alloc * 2;
01290       new_elems = re_realloc (set->elems, int, set->alloc);
01291       if (BE (new_elems == NULL, 0))
01292        return -1;
01293       set->elems = new_elems;
01294     }
01295 
01296   /* Move the elements which follows the new element.  Test the
01297      first element separately to skip a check in the inner loop.  */
01298   if (elem < set->elems[0])
01299     {
01300       idx = 0;
01301       for (idx = set->nelem; idx > 0; idx--)
01302         set->elems[idx] = set->elems[idx - 1];
01303     }
01304   else
01305     {
01306       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
01307         set->elems[idx] = set->elems[idx - 1];
01308     }
01309 
01310   /* Insert the new element.  */
01311   set->elems[idx] = elem;
01312   ++set->nelem;
01313   return 1;
01314 }
01315 
01316 /* Insert the new element ELEM to the re_node_set* SET.
01317    SET should not already have any element greater than or equal to ELEM.
01318    Return -1 if an error is occured, return 1 otherwise.  */
01319 
01320 static int
01321 internal_function
01322 re_node_set_insert_last (re_node_set *set, int elem)
01323 {
01324   /* Realloc if we need.  */
01325   if (set->alloc == set->nelem)
01326     {
01327       int *new_elems;
01328       set->alloc = (set->alloc + 1) * 2;
01329       new_elems = re_realloc (set->elems, int, set->alloc);
01330       if (BE (new_elems == NULL, 0))
01331        return -1;
01332       set->elems = new_elems;
01333     }
01334 
01335   /* Insert the new element.  */
01336   set->elems[set->nelem++] = elem;
01337   return 1;
01338 }
01339 
01340 /* Compare two node sets SET1 and SET2.
01341    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
01342 
01343 static int
01344 internal_function __attribute ((pure))
01345 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
01346 {
01347   int i;
01348   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
01349     return 0;
01350   for (i = set1->nelem ; --i >= 0 ; )
01351     if (set1->elems[i] != set2->elems[i])
01352       return 0;
01353   return 1;
01354 }
01355 
01356 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
01357 
01358 static int
01359 internal_function __attribute ((pure))
01360 re_node_set_contains (const re_node_set *set, int elem)
01361 {
01362   unsigned int idx, right, mid;
01363   if (set->nelem <= 0)
01364     return 0;
01365 
01366   /* Binary search the element.  */
01367   idx = 0;
01368   right = set->nelem - 1;
01369   while (idx < right)
01370     {
01371       mid = (idx + right) / 2;
01372       if (set->elems[mid] < elem)
01373        idx = mid + 1;
01374       else
01375        right = mid;
01376     }
01377   return set->elems[idx] == elem ? idx + 1 : 0;
01378 }
01379 
01380 static void
01381 internal_function
01382 re_node_set_remove_at (re_node_set *set, int idx)
01383 {
01384   if (idx < 0 || idx >= set->nelem)
01385     return;
01386   --set->nelem;
01387   for (; idx < set->nelem; idx++)
01388     set->elems[idx] = set->elems[idx + 1];
01389 }
01390 
01391 
01392 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
01393    Or return -1, if an error will be occured.  */
01394 
01395 static int
01396 internal_function
01397 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
01398 {
01399   int type = token.type;
01400   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
01401     {
01402       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
01403       int *new_nexts, *new_indices;
01404       re_node_set *new_edests, *new_eclosures;
01405       re_token_t *new_nodes;
01406 
01407       /* Avoid overflows.  */
01408       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
01409        return -1;
01410 
01411       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
01412       if (BE (new_nodes == NULL, 0))
01413        return -1;
01414       dfa->nodes = new_nodes;
01415       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
01416       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
01417       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
01418       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
01419       if (BE (new_nexts == NULL || new_indices == NULL
01420              || new_edests == NULL || new_eclosures == NULL, 0))
01421        return -1;
01422       dfa->nexts = new_nexts;
01423       dfa->org_indices = new_indices;
01424       dfa->edests = new_edests;
01425       dfa->eclosures = new_eclosures;
01426       dfa->nodes_alloc = new_nodes_alloc;
01427     }
01428   dfa->nodes[dfa->nodes_len] = token;
01429   dfa->nodes[dfa->nodes_len].constraint = 0;
01430 #ifdef RE_ENABLE_I18N
01431   dfa->nodes[dfa->nodes_len].accept_mb =
01432     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
01433 #endif
01434   dfa->nexts[dfa->nodes_len] = -1;
01435   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
01436   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
01437   return dfa->nodes_len++;
01438 }
01439 
01440 static inline unsigned int
01441 internal_function
01442 calc_state_hash (const re_node_set *nodes, unsigned int context)
01443 {
01444   unsigned int hash = nodes->nelem + context;
01445   int i;
01446   for (i = 0 ; i < nodes->nelem ; i++)
01447     hash += nodes->elems[i];
01448   return hash;
01449 }
01450 
01451 /* Search for the state whose node_set is equivalent to NODES.
01452    Return the pointer to the state, if we found it in the DFA.
01453    Otherwise create the new one and return it.  In case of an error
01454    return NULL and set the error code in ERR.
01455    Note: - We assume NULL as the invalid state, then it is possible that
01456           return value is NULL and ERR is REG_NOERROR.
01457         - We never return non-NULL value in case of any errors, it is for
01458           optimization.  */
01459 
01460 static re_dfastate_t *
01461 internal_function
01462 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
01463                 const re_node_set *nodes)
01464 {
01465   unsigned int hash;
01466   re_dfastate_t *new_state;
01467   struct re_state_table_entry *spot;
01468   int i;
01469   if (BE (nodes->nelem == 0, 0))
01470     {
01471       *err = REG_NOERROR;
01472       return NULL;
01473     }
01474   hash = calc_state_hash (nodes, 0);
01475   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01476 
01477   for (i = 0 ; i < spot->num ; i++)
01478     {
01479       re_dfastate_t *state = spot->array[i];
01480       if (hash != state->hash)
01481        continue;
01482       if (re_node_set_compare (&state->nodes, nodes))
01483        return state;
01484     }
01485 
01486   /* There are no appropriate state in the dfa, create the new one.  */
01487   new_state = create_ci_newstate (dfa, nodes, hash);
01488   if (BE (new_state == NULL, 0))
01489     *err = REG_ESPACE;
01490 
01491   return new_state;
01492 }
01493 
01494 /* Search for the state whose node_set is equivalent to NODES and
01495    whose context is equivalent to CONTEXT.
01496    Return the pointer to the state, if we found it in the DFA.
01497    Otherwise create the new one and return it.  In case of an error
01498    return NULL and set the error code in ERR.
01499    Note: - We assume NULL as the invalid state, then it is possible that
01500           return value is NULL and ERR is REG_NOERROR.
01501         - We never return non-NULL value in case of any errors, it is for
01502           optimization.  */
01503 
01504 static re_dfastate_t *
01505 internal_function
01506 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
01507                        const re_node_set *nodes, unsigned int context)
01508 {
01509   unsigned int hash;
01510   re_dfastate_t *new_state;
01511   struct re_state_table_entry *spot;
01512   int i;
01513   if (nodes->nelem == 0)
01514     {
01515       *err = REG_NOERROR;
01516       return NULL;
01517     }
01518   hash = calc_state_hash (nodes, context);
01519   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01520 
01521   for (i = 0 ; i < spot->num ; i++)
01522     {
01523       re_dfastate_t *state = spot->array[i];
01524       if (state->hash == hash
01525          && state->context == context
01526          && re_node_set_compare (state->entrance_nodes, nodes))
01527        return state;
01528     }
01529   /* There are no appropriate state in `dfa', create the new one.  */
01530   new_state = create_cd_newstate (dfa, nodes, context, hash);
01531   if (BE (new_state == NULL, 0))
01532     *err = REG_ESPACE;
01533 
01534   return new_state;
01535 }
01536 
01537 /* Finish initialization of the new state NEWSTATE, and using its hash value
01538    HASH put in the appropriate bucket of DFA's state table.  Return value
01539    indicates the error code if failed.  */
01540 
01541 static reg_errcode_t
01542 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
01543               unsigned int hash)
01544 {
01545   struct re_state_table_entry *spot;
01546   reg_errcode_t err;
01547   int i;
01548 
01549   newstate->hash = hash;
01550   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
01551   if (BE (err != REG_NOERROR, 0))
01552     return REG_ESPACE;
01553   for (i = 0; i < newstate->nodes.nelem; i++)
01554     {
01555       int elem = newstate->nodes.elems[i];
01556       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
01557         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
01558     }
01559 
01560   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01561   if (BE (spot->alloc <= spot->num, 0))
01562     {
01563       int new_alloc = 2 * spot->num + 2;
01564       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
01565                                          new_alloc);
01566       if (BE (new_array == NULL, 0))
01567        return REG_ESPACE;
01568       spot->array = new_array;
01569       spot->alloc = new_alloc;
01570     }
01571   spot->array[spot->num++] = newstate;
01572   return REG_NOERROR;
01573 }
01574 
01575 static void
01576 free_state (re_dfastate_t *state)
01577 {
01578   re_node_set_free (&state->non_eps_nodes);
01579   re_node_set_free (&state->inveclosure);
01580   if (state->entrance_nodes != &state->nodes)
01581     {
01582       re_node_set_free (state->entrance_nodes);
01583       re_free (state->entrance_nodes);
01584     }
01585   re_node_set_free (&state->nodes);
01586   re_free (state->word_trtable);
01587   re_free (state->trtable);
01588   re_free (state);
01589 }
01590 
01591 /* Create the new state which is independ of contexts.
01592    Return the new state if succeeded, otherwise return NULL.  */
01593 
01594 static re_dfastate_t *
01595 internal_function
01596 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
01597                   unsigned int hash)
01598 {
01599   int i;
01600   reg_errcode_t err;
01601   re_dfastate_t *newstate;
01602 
01603   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
01604   if (BE (newstate == NULL, 0))
01605     return NULL;
01606   err = re_node_set_init_copy (&newstate->nodes, nodes);
01607   if (BE (err != REG_NOERROR, 0))
01608     {
01609       re_free (newstate);
01610       return NULL;
01611     }
01612 
01613   newstate->entrance_nodes = &newstate->nodes;
01614   for (i = 0 ; i < nodes->nelem ; i++)
01615     {
01616       re_token_t *node = dfa->nodes + nodes->elems[i];
01617       re_token_type_t type = node->type;
01618       if (type == CHARACTER && !node->constraint)
01619        continue;
01620 #ifdef RE_ENABLE_I18N
01621       newstate->accept_mb |= node->accept_mb;
01622 #endif /* RE_ENABLE_I18N */
01623 
01624       /* If the state has the halt node, the state is a halt state.  */
01625       if (type == END_OF_RE)
01626        newstate->halt = 1;
01627       else if (type == OP_BACK_REF)
01628        newstate->has_backref = 1;
01629       else if (type == ANCHOR || node->constraint)
01630        newstate->has_constraint = 1;
01631     }
01632   err = register_state (dfa, newstate, hash);
01633   if (BE (err != REG_NOERROR, 0))
01634     {
01635       free_state (newstate);
01636       newstate = NULL;
01637     }
01638   return newstate;
01639 }
01640 
01641 /* Create the new state which is depend on the context CONTEXT.
01642    Return the new state if succeeded, otherwise return NULL.  */
01643 
01644 static re_dfastate_t *
01645 internal_function
01646 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
01647                   unsigned int context, unsigned int hash)
01648 {
01649   int i, nctx_nodes = 0;
01650   reg_errcode_t err;
01651   re_dfastate_t *newstate;
01652 
01653   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
01654   if (BE (newstate == NULL, 0))
01655     return NULL;
01656   err = re_node_set_init_copy (&newstate->nodes, nodes);
01657   if (BE (err != REG_NOERROR, 0))
01658     {
01659       re_free (newstate);
01660       return NULL;
01661     }
01662 
01663   newstate->context = context;
01664   newstate->entrance_nodes = &newstate->nodes;
01665 
01666   for (i = 0 ; i < nodes->nelem ; i++)
01667     {
01668       re_token_t *node = dfa->nodes + nodes->elems[i];
01669       re_token_type_t type = node->type;
01670       unsigned int constraint = node->constraint;
01671 
01672       if (type == CHARACTER && !constraint)
01673        continue;
01674 #ifdef RE_ENABLE_I18N
01675       newstate->accept_mb |= node->accept_mb;
01676 #endif /* RE_ENABLE_I18N */
01677 
01678       /* If the state has the halt node, the state is a halt state.  */
01679       if (type == END_OF_RE)
01680        newstate->halt = 1;
01681       else if (type == OP_BACK_REF)
01682        newstate->has_backref = 1;
01683 
01684       if (constraint)
01685        {
01686          if (newstate->entrance_nodes == &newstate->nodes)
01687            {
01688              newstate->entrance_nodes = re_malloc (re_node_set, 1);
01689              if (BE (newstate->entrance_nodes == NULL, 0))
01690               {
01691                 free_state (newstate);
01692                 return NULL;
01693               }
01694              re_node_set_init_copy (newstate->entrance_nodes, nodes);
01695              nctx_nodes = 0;
01696              newstate->has_constraint = 1;
01697            }
01698 
01699          if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
01700            {
01701              re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
01702              ++nctx_nodes;
01703            }
01704        }
01705     }
01706   err = register_state (dfa, newstate, hash);
01707   if (BE (err != REG_NOERROR, 0))
01708     {
01709       free_state (newstate);
01710       newstate = NULL;
01711     }
01712   return  newstate;
01713 }