Back to index

php5  5.3.10
pcre_study.c
Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2010 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_study(), along with local
00042 supporting functions. */
00043 
00044 
00045 #include "config.h"
00046 
00047 #include "pcre_internal.h"
00048 
00049 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
00050 
00051 /* Returns from set_start_bits() */
00052 
00053 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
00054 
00055 
00056 
00057 /*************************************************
00058 *   Find the minimum subject length for a group  *
00059 *************************************************/
00060 
00061 /* Scan a parenthesized group and compute the minimum length of subject that
00062 is needed to match it. This is a lower bound; it does not mean there is a
00063 string of that length that matches. In UTF8 mode, the result is in characters
00064 rather than bytes.
00065 
00066 Arguments:
00067   code       pointer to start of group (the bracket)
00068   startcode  pointer to start of the whole pattern
00069   options    the compiling options
00070 
00071 Returns:   the minimum length
00072            -1 if \C was encountered
00073            -2 internal error (missing capturing bracket)
00074 */
00075 
00076 static int
00077 find_minlength(const uschar *code, const uschar *startcode, int options)
00078 {
00079 int length = -1;
00080 BOOL utf8 = (options & PCRE_UTF8) != 0;
00081 BOOL had_recurse = FALSE;
00082 register int branchlength = 0;
00083 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
00084 
00085 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
00086 
00087 /* Scan along the opcodes for this branch. If we get to the end of the
00088 branch, check the length against that of the other branches. */
00089 
00090 for (;;)
00091   {
00092   int d, min;
00093   uschar *cs, *ce;
00094   register int op = *cc;
00095 
00096   switch (op)
00097     {
00098     case OP_COND:
00099     case OP_SCOND:
00100 
00101     /* If there is only one branch in a condition, the implied branch has zero
00102     length, so we don't add anything. This covers the DEFINE "condition"
00103     automatically. */
00104 
00105     cs = cc + GET(cc, 1);
00106     if (*cs != OP_ALT)
00107       {
00108       cc = cs + 1 + LINK_SIZE;
00109       break;
00110       }
00111 
00112     /* Otherwise we can fall through and treat it the same as any other
00113     subpattern. */
00114 
00115     case OP_CBRA:
00116     case OP_SCBRA:
00117     case OP_BRA:
00118     case OP_SBRA:
00119     case OP_ONCE:
00120     d = find_minlength(cc, startcode, options);
00121     if (d < 0) return d;
00122     branchlength += d;
00123     do cc += GET(cc, 1); while (*cc == OP_ALT);
00124     cc += 1 + LINK_SIZE;
00125     break;
00126 
00127     /* Reached end of a branch; if it's a ket it is the end of a nested
00128     call. If it's ALT it is an alternation in a nested call. If it is
00129     END it's the end of the outer call. All can be handled by the same code. */
00130 
00131     case OP_ALT:
00132     case OP_KET:
00133     case OP_KETRMAX:
00134     case OP_KETRMIN:
00135     case OP_END:
00136     if (length < 0 || (!had_recurse && branchlength < length))
00137       length = branchlength;
00138     if (*cc != OP_ALT) return length;
00139     cc += 1 + LINK_SIZE;
00140     branchlength = 0;
00141     had_recurse = FALSE;
00142     break;
00143 
00144     /* Skip over assertive subpatterns */
00145 
00146     case OP_ASSERT:
00147     case OP_ASSERT_NOT:
00148     case OP_ASSERTBACK:
00149     case OP_ASSERTBACK_NOT:
00150     do cc += GET(cc, 1); while (*cc == OP_ALT);
00151     /* Fall through */
00152 
00153     /* Skip over things that don't match chars */
00154 
00155     case OP_REVERSE:
00156     case OP_CREF:
00157     case OP_NCREF:
00158     case OP_RREF:
00159     case OP_NRREF:
00160     case OP_DEF:
00161     case OP_OPT:
00162     case OP_CALLOUT:
00163     case OP_SOD:
00164     case OP_SOM:
00165     case OP_EOD:
00166     case OP_EODN:
00167     case OP_CIRC:
00168     case OP_DOLL:
00169     case OP_NOT_WORD_BOUNDARY:
00170     case OP_WORD_BOUNDARY:
00171     cc += _pcre_OP_lengths[*cc];
00172     break;
00173 
00174     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
00175 
00176     case OP_BRAZERO:
00177     case OP_BRAMINZERO:
00178     case OP_SKIPZERO:
00179     cc += _pcre_OP_lengths[*cc];
00180     do cc += GET(cc, 1); while (*cc == OP_ALT);
00181     cc += 1 + LINK_SIZE;
00182     break;
00183 
00184     /* Handle literal characters and + repetitions */
00185 
00186     case OP_CHAR:
00187     case OP_CHARNC:
00188     case OP_NOT:
00189     case OP_PLUS:
00190     case OP_MINPLUS:
00191     case OP_POSPLUS:
00192     case OP_NOTPLUS:
00193     case OP_NOTMINPLUS:
00194     case OP_NOTPOSPLUS:
00195     branchlength++;
00196     cc += 2;
00197 #ifdef SUPPORT_UTF8
00198     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
00199 #endif
00200     break;
00201 
00202     case OP_TYPEPLUS:
00203     case OP_TYPEMINPLUS:
00204     case OP_TYPEPOSPLUS:
00205     branchlength++;
00206     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
00207     break;
00208 
00209     /* Handle exact repetitions. The count is already in characters, but we
00210     need to skip over a multibyte character in UTF8 mode.  */
00211 
00212     case OP_EXACT:
00213     case OP_NOTEXACT:
00214     branchlength += GET2(cc,1);
00215     cc += 4;
00216 #ifdef SUPPORT_UTF8
00217     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
00218 #endif
00219     break;
00220 
00221     case OP_TYPEEXACT:
00222     branchlength += GET2(cc,1);
00223     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
00224     break;
00225 
00226     /* Handle single-char non-literal matchers */
00227 
00228     case OP_PROP:
00229     case OP_NOTPROP:
00230     cc += 2;
00231     /* Fall through */
00232 
00233     case OP_NOT_DIGIT:
00234     case OP_DIGIT:
00235     case OP_NOT_WHITESPACE:
00236     case OP_WHITESPACE:
00237     case OP_NOT_WORDCHAR:
00238     case OP_WORDCHAR:
00239     case OP_ANY:
00240     case OP_ALLANY:
00241     case OP_EXTUNI:
00242     case OP_HSPACE:
00243     case OP_NOT_HSPACE:
00244     case OP_VSPACE:
00245     case OP_NOT_VSPACE:
00246     branchlength++;
00247     cc++;
00248     break;
00249 
00250     /* "Any newline" might match two characters */
00251 
00252     case OP_ANYNL:
00253     branchlength += 2;
00254     cc++;
00255     break;
00256 
00257     /* The single-byte matcher means we can't proceed in UTF-8 mode */
00258 
00259     case OP_ANYBYTE:
00260 #ifdef SUPPORT_UTF8
00261     if (utf8) return -1;
00262 #endif
00263     branchlength++;
00264     cc++;
00265     break;
00266 
00267     /* For repeated character types, we have to test for \p and \P, which have
00268     an extra two bytes of parameters. */
00269 
00270     case OP_TYPESTAR:
00271     case OP_TYPEMINSTAR:
00272     case OP_TYPEQUERY:
00273     case OP_TYPEMINQUERY:
00274     case OP_TYPEPOSSTAR:
00275     case OP_TYPEPOSQUERY:
00276     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
00277     cc += _pcre_OP_lengths[op];
00278     break;
00279 
00280     case OP_TYPEUPTO:
00281     case OP_TYPEMINUPTO:
00282     case OP_TYPEPOSUPTO:
00283     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
00284     cc += _pcre_OP_lengths[op];
00285     break;
00286 
00287     /* Check a class for variable quantification */
00288 
00289 #ifdef SUPPORT_UTF8
00290     case OP_XCLASS:
00291     cc += GET(cc, 1) - 33;
00292     /* Fall through */
00293 #endif
00294 
00295     case OP_CLASS:
00296     case OP_NCLASS:
00297     cc += 33;
00298 
00299     switch (*cc)
00300       {
00301       case OP_CRPLUS:
00302       case OP_CRMINPLUS:
00303       branchlength++;
00304       /* Fall through */
00305 
00306       case OP_CRSTAR:
00307       case OP_CRMINSTAR:
00308       case OP_CRQUERY:
00309       case OP_CRMINQUERY:
00310       cc++;
00311       break;
00312 
00313       case OP_CRRANGE:
00314       case OP_CRMINRANGE:
00315       branchlength += GET2(cc,1);
00316       cc += 5;
00317       break;
00318 
00319       default:
00320       branchlength++;
00321       break;
00322       }
00323     break;
00324 
00325     /* Backreferences and subroutine calls are treated in the same way: we find
00326     the minimum length for the subpattern. A recursion, however, causes an
00327     a flag to be set that causes the length of this branch to be ignored. The
00328     logic is that a recursion can only make sense if there is another
00329     alternation that stops the recursing. That will provide the minimum length
00330     (when no recursion happens). A backreference within the group that it is
00331     referencing behaves in the same way.
00332 
00333     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
00334     matches an empty string (by default it causes a matching failure), so in
00335     that case we must set the minimum length to zero. */
00336 
00337     case OP_REF:
00338     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
00339       {
00340       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
00341       if (cs == NULL) return -2;
00342       do ce += GET(ce, 1); while (*ce == OP_ALT);
00343       if (cc > cs && cc < ce)
00344         {
00345         d = 0;
00346         had_recurse = TRUE;
00347         }
00348       else d = find_minlength(cs, startcode, options);
00349       }
00350     else d = 0;
00351     cc += 3;
00352 
00353     /* Handle repeated back references */
00354 
00355     switch (*cc)
00356       {
00357       case OP_CRSTAR:
00358       case OP_CRMINSTAR:
00359       case OP_CRQUERY:
00360       case OP_CRMINQUERY:
00361       min = 0;
00362       cc++;
00363       break;
00364 
00365       case OP_CRRANGE:
00366       case OP_CRMINRANGE:
00367       min = GET2(cc, 1);
00368       cc += 5;
00369       break;
00370 
00371       default:
00372       min = 1;
00373       break;
00374       }
00375 
00376     branchlength += min * d;
00377     break;
00378 
00379     case OP_RECURSE:
00380     cs = ce = (uschar *)startcode + GET(cc, 1);
00381     if (cs == NULL) return -2;
00382     do ce += GET(ce, 1); while (*ce == OP_ALT);
00383     if (cc > cs && cc < ce)
00384       had_recurse = TRUE;
00385     else
00386       branchlength += find_minlength(cs, startcode, options);
00387     cc += 1 + LINK_SIZE;
00388     break;
00389 
00390     /* Anything else does not or need not match a character. We can get the
00391     item's length from the table, but for those that can match zero occurrences
00392     of a character, we must take special action for UTF-8 characters. */
00393 
00394     case OP_UPTO:
00395     case OP_NOTUPTO:
00396     case OP_MINUPTO:
00397     case OP_NOTMINUPTO:
00398     case OP_POSUPTO:
00399     case OP_STAR:
00400     case OP_MINSTAR:
00401     case OP_NOTMINSTAR:
00402     case OP_POSSTAR:
00403     case OP_NOTPOSSTAR:
00404     case OP_QUERY:
00405     case OP_MINQUERY:
00406     case OP_NOTMINQUERY:
00407     case OP_POSQUERY:
00408     case OP_NOTPOSQUERY:
00409     cc += _pcre_OP_lengths[op];
00410 #ifdef SUPPORT_UTF8
00411     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
00412 #endif
00413     break;
00414 
00415     /* Skip these, but we need to add in the name length. */
00416 
00417     case OP_MARK:
00418     case OP_PRUNE_ARG:
00419     case OP_SKIP_ARG:
00420     cc += _pcre_OP_lengths[op] + cc[1];
00421     break;
00422 
00423     case OP_THEN_ARG:
00424     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
00425     break;
00426 
00427     /* For the record, these are the opcodes that are matched by "default":
00428     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
00429     OP_THEN. */
00430 
00431     default:
00432     cc += _pcre_OP_lengths[op];
00433     break;
00434     }
00435   }
00436 /* Control never gets here */
00437 }
00438 
00439 
00440 
00441 /*************************************************
00442 *      Set a bit and maybe its alternate case    *
00443 *************************************************/
00444 
00445 /* Given a character, set its first byte's bit in the table, and also the
00446 corresponding bit for the other version of a letter if we are caseless. In
00447 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
00448 when Unicode property support is available.
00449 
00450 Arguments:
00451   start_bits    points to the bit map
00452   p             points to the character
00453   caseless      the caseless flag
00454   cd            the block with char table pointers
00455   utf8          TRUE for UTF-8 mode
00456 
00457 Returns:        pointer after the character
00458 */
00459 
00460 static const uschar *
00461 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
00462   compile_data *cd, BOOL utf8)
00463 {
00464 unsigned int c = *p;
00465 
00466 SET_BIT(c);
00467 
00468 #ifdef SUPPORT_UTF8
00469 if (utf8 && c > 127)
00470   {
00471   GETCHARINC(c, p);
00472 #ifdef SUPPORT_UCP
00473   if (caseless)
00474     {
00475     uschar buff[8];
00476     c = UCD_OTHERCASE(c);
00477     (void)_pcre_ord2utf8(c, buff);
00478     SET_BIT(buff[0]);
00479     }
00480 #endif
00481   return p;
00482   }
00483 #endif
00484 
00485 /* Not UTF-8 mode, or character is less than 127. */
00486 
00487 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
00488 return p + 1;
00489 }
00490 
00491 
00492 
00493 /*************************************************
00494 *     Set bits for a positive character type     *
00495 *************************************************/
00496 
00497 /* This function sets starting bits for a character type. In UTF-8 mode, we can
00498 only do a direct setting for bytes less than 128, as otherwise there can be
00499 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
00500 environment, the tables will only recognize ASCII characters anyway, but in at
00501 least one Windows environment, some higher bytes bits were set in the tables.
00502 So we deal with that case by considering the UTF-8 encoding.
00503 
00504 Arguments:
00505   start_bits     the starting bitmap
00506   cbit type      the type of character wanted
00507   table_limit    32 for non-UTF-8; 16 for UTF-8
00508   cd             the block with char table pointers
00509 
00510 Returns:         nothing
00511 */
00512 
00513 static void
00514 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
00515   compile_data *cd)
00516 {
00517 register int c;
00518 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
00519 if (table_limit == 32) return;
00520 for (c = 128; c < 256; c++)
00521   {
00522   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
00523     {
00524     uschar buff[8];
00525     (void)_pcre_ord2utf8(c, buff);
00526     SET_BIT(buff[0]);
00527     }
00528   }
00529 }
00530 
00531 
00532 /*************************************************
00533 *     Set bits for a negative character type     *
00534 *************************************************/
00535 
00536 /* This function sets starting bits for a negative character type such as \D.
00537 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
00538 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
00539 Unlike in the positive case, where we can set appropriate starting bits for
00540 specific high-valued UTF-8 characters, in this case we have to set the bits for
00541 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
00542 0xc0 (192) for simplicity.
00543 
00544 Arguments:
00545   start_bits     the starting bitmap
00546   cbit type      the type of character wanted
00547   table_limit    32 for non-UTF-8; 16 for UTF-8
00548   cd             the block with char table pointers
00549 
00550 Returns:         nothing
00551 */
00552 
00553 static void
00554 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
00555   compile_data *cd)
00556 {
00557 register int c;
00558 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
00559 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
00560 }
00561 
00562 
00563 
00564 /*************************************************
00565 *          Create bitmap of starting bytes       *
00566 *************************************************/
00567 
00568 /* This function scans a compiled unanchored expression recursively and
00569 attempts to build a bitmap of the set of possible starting bytes. As time goes
00570 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
00571 useful for parenthesized groups in patterns such as (a*)b where the group
00572 provides some optional starting bytes but scanning must continue at the outer
00573 level to find at least one mandatory byte. At the outermost level, this
00574 function fails unless the result is SSB_DONE.
00575 
00576 Arguments:
00577   code         points to an expression
00578   start_bits   points to a 32-byte table, initialized to 0
00579   caseless     the current state of the caseless flag
00580   utf8         TRUE if in UTF-8 mode
00581   cd           the block with char table pointers
00582 
00583 Returns:       SSB_FAIL     => Failed to find any starting bytes
00584                SSB_DONE     => Found mandatory starting bytes
00585                SSB_CONTINUE => Found optional starting bytes
00586 */
00587 
00588 static int
00589 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
00590   BOOL utf8, compile_data *cd)
00591 {
00592 register int c;
00593 int yield = SSB_DONE;
00594 int table_limit = utf8? 16:32;
00595 
00596 #if 0
00597 /* ========================================================================= */
00598 /* The following comment and code was inserted in January 1999. In May 2006,
00599 when it was observed to cause compiler warnings about unused values, I took it
00600 out again. If anybody is still using OS/2, they will have to put it back
00601 manually. */
00602 
00603 /* This next statement and the later reference to dummy are here in order to
00604 trick the optimizer of the IBM C compiler for OS/2 into generating correct
00605 code. Apparently IBM isn't going to fix the problem, and we would rather not
00606 disable optimization (in this module it actually makes a big difference, and
00607 the pcre module can use all the optimization it can get). */
00608 
00609 volatile int dummy;
00610 /* ========================================================================= */
00611 #endif
00612 
00613 do
00614   {
00615   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
00616   BOOL try_next = TRUE;
00617 
00618   while (try_next)    /* Loop for items in this branch */
00619     {
00620     int rc;
00621     switch(*tcode)
00622       {
00623       /* Fail if we reach something we don't understand */
00624 
00625       default:
00626       return SSB_FAIL;
00627 
00628       /* If we hit a bracket or a positive lookahead assertion, recurse to set
00629       bits from within the subpattern. If it can't find anything, we have to
00630       give up. If it finds some mandatory character(s), we are done for this
00631       branch. Otherwise, carry on scanning after the subpattern. */
00632 
00633       case OP_BRA:
00634       case OP_SBRA:
00635       case OP_CBRA:
00636       case OP_SCBRA:
00637       case OP_ONCE:
00638       case OP_ASSERT:
00639       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
00640       if (rc == SSB_FAIL) return SSB_FAIL;
00641       if (rc == SSB_DONE) try_next = FALSE; else
00642         {
00643         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
00644         tcode += 1 + LINK_SIZE;
00645         }
00646       break;
00647 
00648       /* If we hit ALT or KET, it means we haven't found anything mandatory in
00649       this branch, though we might have found something optional. For ALT, we
00650       continue with the next alternative, but we have to arrange that the final
00651       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
00652       return SSB_CONTINUE: if this is the top level, that indicates failure,
00653       but after a nested subpattern, it causes scanning to continue. */
00654 
00655       case OP_ALT:
00656       yield = SSB_CONTINUE;
00657       try_next = FALSE;
00658       break;
00659 
00660       case OP_KET:
00661       case OP_KETRMAX:
00662       case OP_KETRMIN:
00663       return SSB_CONTINUE;
00664 
00665       /* Skip over callout */
00666 
00667       case OP_CALLOUT:
00668       tcode += 2 + 2*LINK_SIZE;
00669       break;
00670 
00671       /* Skip over lookbehind and negative lookahead assertions */
00672 
00673       case OP_ASSERT_NOT:
00674       case OP_ASSERTBACK:
00675       case OP_ASSERTBACK_NOT:
00676       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
00677       tcode += 1 + LINK_SIZE;
00678       break;
00679 
00680       /* Skip over an option setting, changing the caseless flag */
00681 
00682       case OP_OPT:
00683       caseless = (tcode[1] & PCRE_CASELESS) != 0;
00684       tcode += 2;
00685       break;
00686 
00687       /* BRAZERO does the bracket, but carries on. */
00688 
00689       case OP_BRAZERO:
00690       case OP_BRAMINZERO:
00691       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
00692         return SSB_FAIL;
00693 /* =========================================================================
00694       See the comment at the head of this function concerning the next line,
00695       which was an old fudge for the benefit of OS/2.
00696       dummy = 1;
00697   ========================================================================= */
00698       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
00699       tcode += 1 + LINK_SIZE;
00700       break;
00701 
00702       /* SKIPZERO skips the bracket. */
00703 
00704       case OP_SKIPZERO:
00705       tcode++;
00706       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
00707       tcode += 1 + LINK_SIZE;
00708       break;
00709 
00710       /* Single-char * or ? sets the bit and tries the next item */
00711 
00712       case OP_STAR:
00713       case OP_MINSTAR:
00714       case OP_POSSTAR:
00715       case OP_QUERY:
00716       case OP_MINQUERY:
00717       case OP_POSQUERY:
00718       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
00719       break;
00720 
00721       /* Single-char upto sets the bit and tries the next */
00722 
00723       case OP_UPTO:
00724       case OP_MINUPTO:
00725       case OP_POSUPTO:
00726       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
00727       break;
00728 
00729       /* At least one single char sets the bit and stops */
00730 
00731       case OP_EXACT:       /* Fall through */
00732       tcode += 2;
00733 
00734       case OP_CHAR:
00735       case OP_CHARNC:
00736       case OP_PLUS:
00737       case OP_MINPLUS:
00738       case OP_POSPLUS:
00739       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
00740       try_next = FALSE;
00741       break;
00742 
00743       /* Special spacing and line-terminating items. These recognize specific
00744       lists of characters. The difference between VSPACE and ANYNL is that the
00745       latter can match the two-character CRLF sequence, but that is not
00746       relevant for finding the first character, so their code here is
00747       identical. */
00748 
00749       case OP_HSPACE:
00750       SET_BIT(0x09);
00751       SET_BIT(0x20);
00752       if (utf8)
00753         {
00754         SET_BIT(0xC2);  /* For U+00A0 */
00755         SET_BIT(0xE1);  /* For U+1680, U+180E */
00756         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
00757         SET_BIT(0xE3);  /* For U+3000 */
00758         }
00759       else SET_BIT(0xA0);
00760       try_next = FALSE;
00761       break;
00762 
00763       case OP_ANYNL:
00764       case OP_VSPACE:
00765       SET_BIT(0x0A);
00766       SET_BIT(0x0B);
00767       SET_BIT(0x0C);
00768       SET_BIT(0x0D);
00769       if (utf8)
00770         {
00771         SET_BIT(0xC2);  /* For U+0085 */
00772         SET_BIT(0xE2);  /* For U+2028, U+2029 */
00773         }
00774       else SET_BIT(0x85);
00775       try_next = FALSE;
00776       break;
00777 
00778       /* Single character types set the bits and stop. Note that if PCRE_UCP
00779       is set, we do not see these op codes because \d etc are converted to
00780       properties. Therefore, these apply in the case when only characters less
00781       than 256 are recognized to match the types. */
00782 
00783       case OP_NOT_DIGIT:
00784       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
00785       try_next = FALSE;
00786       break;
00787 
00788       case OP_DIGIT:
00789       set_type_bits(start_bits, cbit_digit, table_limit, cd);
00790       try_next = FALSE;
00791       break;
00792 
00793       /* The cbit_space table has vertical tab as whitespace; we have to
00794       ensure it is set as not whitespace. */
00795 
00796       case OP_NOT_WHITESPACE:
00797       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
00798       start_bits[1] |= 0x08;
00799       try_next = FALSE;
00800       break;
00801 
00802       /* The cbit_space table has vertical tab as whitespace; we have to
00803       not set it from the table. */
00804 
00805       case OP_WHITESPACE:
00806       c = start_bits[1];    /* Save in case it was already set */
00807       set_type_bits(start_bits, cbit_space, table_limit, cd);
00808       start_bits[1] = (start_bits[1] & ~0x08) | c;
00809       try_next = FALSE;
00810       break;
00811 
00812       case OP_NOT_WORDCHAR:
00813       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
00814       try_next = FALSE;
00815       break;
00816 
00817       case OP_WORDCHAR:
00818       set_type_bits(start_bits, cbit_word, table_limit, cd);
00819       try_next = FALSE;
00820       break;
00821 
00822       /* One or more character type fudges the pointer and restarts, knowing
00823       it will hit a single character type and stop there. */
00824 
00825       case OP_TYPEPLUS:
00826       case OP_TYPEMINPLUS:
00827       case OP_TYPEPOSPLUS:
00828       tcode++;
00829       break;
00830 
00831       case OP_TYPEEXACT:
00832       tcode += 3;
00833       break;
00834 
00835       /* Zero or more repeats of character types set the bits and then
00836       try again. */
00837 
00838       case OP_TYPEUPTO:
00839       case OP_TYPEMINUPTO:
00840       case OP_TYPEPOSUPTO:
00841       tcode += 2;               /* Fall through */
00842 
00843       case OP_TYPESTAR:
00844       case OP_TYPEMINSTAR:
00845       case OP_TYPEPOSSTAR:
00846       case OP_TYPEQUERY:
00847       case OP_TYPEMINQUERY:
00848       case OP_TYPEPOSQUERY:
00849       switch(tcode[1])
00850         {
00851         default:
00852         case OP_ANY:
00853         case OP_ALLANY:
00854         return SSB_FAIL;
00855 
00856         case OP_HSPACE:
00857         SET_BIT(0x09);
00858         SET_BIT(0x20);
00859         if (utf8)
00860           {
00861           SET_BIT(0xC2);  /* For U+00A0 */
00862           SET_BIT(0xE1);  /* For U+1680, U+180E */
00863           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
00864           SET_BIT(0xE3);  /* For U+3000 */
00865           }
00866         else SET_BIT(0xA0);
00867         break;
00868 
00869         case OP_ANYNL:
00870         case OP_VSPACE:
00871         SET_BIT(0x0A);
00872         SET_BIT(0x0B);
00873         SET_BIT(0x0C);
00874         SET_BIT(0x0D);
00875         if (utf8)
00876           {
00877           SET_BIT(0xC2);  /* For U+0085 */
00878           SET_BIT(0xE2);  /* For U+2028, U+2029 */
00879           }
00880         else SET_BIT(0x85);
00881         break;
00882 
00883         case OP_NOT_DIGIT:
00884         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
00885         break;
00886 
00887         case OP_DIGIT:
00888         set_type_bits(start_bits, cbit_digit, table_limit, cd);
00889         break;
00890 
00891         /* The cbit_space table has vertical tab as whitespace; we have to
00892         ensure it gets set as not whitespace. */
00893 
00894         case OP_NOT_WHITESPACE:
00895         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
00896         start_bits[1] |= 0x08;
00897         break;
00898 
00899         /* The cbit_space table has vertical tab as whitespace; we have to
00900         avoid setting it. */
00901 
00902         case OP_WHITESPACE:
00903         c = start_bits[1];    /* Save in case it was already set */
00904         set_type_bits(start_bits, cbit_space, table_limit, cd);
00905         start_bits[1] = (start_bits[1] & ~0x08) | c;
00906         break;
00907 
00908         case OP_NOT_WORDCHAR:
00909         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
00910         break;
00911 
00912         case OP_WORDCHAR:
00913         set_type_bits(start_bits, cbit_word, table_limit, cd);
00914         break;
00915         }
00916 
00917       tcode += 2;
00918       break;
00919 
00920       /* Character class where all the information is in a bit map: set the
00921       bits and either carry on or not, according to the repeat count. If it was
00922       a negative class, and we are operating with UTF-8 characters, any byte
00923       with a value >= 0xc4 is a potentially valid starter because it starts a
00924       character with a value > 255. */
00925 
00926       case OP_NCLASS:
00927 #ifdef SUPPORT_UTF8
00928       if (utf8)
00929         {
00930         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
00931         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
00932         }
00933 #endif
00934       /* Fall through */
00935 
00936       case OP_CLASS:
00937         {
00938         tcode++;
00939 
00940         /* In UTF-8 mode, the bits in a bit map correspond to character
00941         values, not to byte values. However, the bit map we are constructing is
00942         for byte values. So we have to do a conversion for characters whose
00943         value is > 127. In fact, there are only two possible starting bytes for
00944         characters in the range 128 - 255. */
00945 
00946 #ifdef SUPPORT_UTF8
00947         if (utf8)
00948           {
00949           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
00950           for (c = 128; c < 256; c++)
00951             {
00952             if ((tcode[c/8] && (1 << (c&7))) != 0)
00953               {
00954               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
00955               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
00956               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
00957               }
00958             }
00959           }
00960 
00961         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
00962 
00963         else
00964 #endif
00965           {
00966           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
00967           }
00968 
00969         /* Advance past the bit map, and act on what follows */
00970 
00971         tcode += 32;
00972         switch (*tcode)
00973           {
00974           case OP_CRSTAR:
00975           case OP_CRMINSTAR:
00976           case OP_CRQUERY:
00977           case OP_CRMINQUERY:
00978           tcode++;
00979           break;
00980 
00981           case OP_CRRANGE:
00982           case OP_CRMINRANGE:
00983           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
00984             else try_next = FALSE;
00985           break;
00986 
00987           default:
00988           try_next = FALSE;
00989           break;
00990           }
00991         }
00992       break; /* End of bitmap class handling */
00993 
00994       }      /* End of switch */
00995     }        /* End of try_next loop */
00996 
00997   code += GET(code, 1);   /* Advance to next branch */
00998   }
00999 while (*code == OP_ALT);
01000 return yield;
01001 }
01002 
01003 
01004 
01005 /*************************************************
01006 *          Study a compiled expression           *
01007 *************************************************/
01008 
01009 /* This function is handed a compiled expression that it must study to produce
01010 information that will speed up the matching. It returns a pcre_extra block
01011 which then gets handed back to pcre_exec().
01012 
01013 Arguments:
01014   re        points to the compiled expression
01015   options   contains option bits
01016   errorptr  points to where to place error messages;
01017             set NULL unless error
01018 
01019 Returns:    pointer to a pcre_extra block, with study_data filled in and the
01020               appropriate flags set;
01021             NULL on error or if no optimization possible
01022 */
01023 
01024 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
01025 pcre_study(const pcre *external_re, int options, const char **errorptr)
01026 {
01027 int min;
01028 BOOL bits_set = FALSE;
01029 uschar start_bits[32];
01030 pcre_extra *extra;
01031 pcre_study_data *study;
01032 const uschar *tables;
01033 uschar *code;
01034 compile_data compile_block;
01035 const real_pcre *re = (const real_pcre *)external_re;
01036 
01037 *errorptr = NULL;
01038 
01039 if (re == NULL || re->magic_number != MAGIC_NUMBER)
01040   {
01041   *errorptr = "argument is not a compiled regular expression";
01042   return NULL;
01043   }
01044 
01045 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
01046   {
01047   *errorptr = "unknown or incorrect option bit(s) set";
01048   return NULL;
01049   }
01050 
01051 code = (uschar *)re + re->name_table_offset +
01052   (re->name_count * re->name_entry_size);
01053 
01054 /* For an anchored pattern, or an unanchored pattern that has a first char, or
01055 a multiline pattern that matches only at "line starts", there is no point in
01056 seeking a list of starting bytes. */
01057 
01058 if ((re->options & PCRE_ANCHORED) == 0 &&
01059     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
01060   {
01061   /* Set the character tables in the block that is passed around */
01062 
01063   tables = re->tables;
01064   if (tables == NULL)
01065     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
01066     (void *)(&tables));
01067 
01068   compile_block.lcc = tables + lcc_offset;
01069   compile_block.fcc = tables + fcc_offset;
01070   compile_block.cbits = tables + cbits_offset;
01071   compile_block.ctypes = tables + ctypes_offset;
01072 
01073   /* See if we can find a fixed set of initial characters for the pattern. */
01074 
01075   memset(start_bits, 0, 32 * sizeof(uschar));
01076   bits_set = set_start_bits(code, start_bits,
01077     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
01078     &compile_block) == SSB_DONE;
01079   }
01080 
01081 /* Find the minimum length of subject string. */
01082 
01083 min = find_minlength(code, code, re->options);
01084 
01085 /* Return NULL if no optimization is possible. */
01086 
01087 if (!bits_set && min < 0) return NULL;
01088 
01089 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
01090 the latter, which is pointed to by the former, which may also get additional
01091 data set later by the calling program. At the moment, the size of
01092 pcre_study_data is fixed. We nevertheless save it in a field for returning via
01093 the pcre_fullinfo() function so that if it becomes variable in the future, we
01094 don't have to change that code. */
01095 
01096 extra = (pcre_extra *)(pcre_malloc)
01097   (sizeof(pcre_extra) + sizeof(pcre_study_data));
01098 
01099 if (extra == NULL)
01100   {
01101   *errorptr = "failed to get memory";
01102   return NULL;
01103   }
01104 
01105 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
01106 extra->flags = PCRE_EXTRA_STUDY_DATA;
01107 extra->study_data = study;
01108 
01109 study->size = sizeof(pcre_study_data);
01110 study->flags = 0;
01111 
01112 if (bits_set)
01113   {
01114   study->flags |= PCRE_STUDY_MAPPED;
01115   memcpy(study->start_bits, start_bits, sizeof(start_bits));
01116   }
01117 
01118 if (min >= 0)
01119   {
01120   study->flags |= PCRE_STUDY_MINLEN;
01121   study->minlength = min;
01122   }
01123 
01124 return extra;
01125 }
01126 
01127 /* End of pcre_study.c */