Back to index

php5  5.3.10
pcre_compile.c
Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2010 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_compile(), along with
00042 supporting internal functions that are not used by other modules. */
00043 
00044 
00045 #include "config.h"
00046 
00047 #define NLBLOCK cd             /* Block containing newline information */
00048 #define PSSTART start_pattern  /* Field containing processed string start */
00049 #define PSEND   end_pattern    /* Field containing processed string end */
00050 
00051 #include "pcre_internal.h"
00052 
00053 
00054 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
00055 also used by pcretest. PCRE_DEBUG is not defined when building a production
00056 library. */
00057 
00058 #ifdef PCRE_DEBUG
00059 #include "pcre_printint.src"
00060 #endif
00061 
00062 
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8 within that byte). Both arguments
and the whole expansion are parenthesized so that expression arguments, for
example SETBIT(map, c + 1), select the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
00066 
00067 /* Maximum length value to check against when making sure that the integer that
00068 holds the compiled pattern length does not overflow. We make it a bit less than
00069 INT_MAX to allow for adding in group terminating bytes, so that we don't have
00070 to check them every time. */
00071 
00072 #define OFLOW_MAX (INT_MAX - 20)
00073 
00074 
00075 /*************************************************
00076 *      Code parameters and static tables         *
00077 *************************************************/
00078 
00079 /* This value specifies the size of stack workspace that is used during the
00080 first pre-compile phase that determines how much memory is required. The regex
00081 is partly compiled into this space, but the compiled parts are discarded as
00082 soon as they can be, so that hopefully there will never be an overrun. The code
00083 does, however, check for an overrun. The largest amount I've seen used is 218,
00084 so this number is very generous.
00085 
00086 The same workspace is used during the second, actual compile phase for
00087 remembering forward references to groups so that they can be filled in at the
00088 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
00089 is 4 there is plenty of room. */
00090 
00091 #define COMPILE_WORK_SIZE (4096)
00092 
00093 /* The overrun tests check for a slightly smaller size so that they detect the
00094 overrun before it actually does run off the end of the data block. */
00095 
00096 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
00097 
00098 
00099 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00100 are simple data values; negative values are for special things like \d and so
00101 on. Zero means further processing is needed (for things like \x), or the escape
00102 is invalid. */
00103 
00104 #ifndef EBCDIC
00105 
00106 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
00107 in UTF-8 mode. */
00108 
00109 static const short int escapes[] = {                       /* indexed by (c - CHAR_0); two entries per row */
00110      0,                       0,                           /* 0  1 - digits always need further processing */
00111      0,                       0,                           /* 2  3 */
00112      0,                       0,                           /* 4  5 */
00113      0,                       0,                           /* 6  7 */
00114      0,                       0,                           /* 8  9 */
00115      CHAR_COLON,              CHAR_SEMICOLON,              /* :  ; - punctuation maps to itself (a literal) */
00116      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,            /* <  = */
00117      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,          /* >  ? */
00118      CHAR_COMMERCIAL_AT,      -ESC_A,                      /* @  A */
00119      -ESC_B,                  -ESC_C,                      /* B  C */
00120      -ESC_D,                  -ESC_E,                      /* D  E */
00121      0,                       -ESC_G,                      /* F  G */
00122      -ESC_H,                  0,                           /* H  I */
00123      0,                       -ESC_K,                      /* J  K */
00124      0,                       0,                           /* L  M */
00125      -ESC_N,                  0,                           /* N  O */
00126      -ESC_P,                  -ESC_Q,                      /* P  Q */
00127      -ESC_R,                  -ESC_S,                      /* R  S */
00128      0,                       0,                           /* T  U */
00129      -ESC_V,                  -ESC_W,                      /* V  W */
00130      -ESC_X,                  0,                           /* X  Y */
00131      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,    /* Z  [ */
00132      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,   /* backslash  ] */
00133      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,             /* ^  _ */
00134      CHAR_GRAVE_ACCENT,       7,                           /* `  a - 7 is the data value returned for the a escape */
00135      -ESC_b,                  0,                           /* b  c */
00136      -ESC_d,                  ESC_e,                       /* d  e - positive ESC_e is a plain data value */
00137      ESC_f,                   0,                           /* f  g - g is handled by the switch in check_escape */
00138      -ESC_h,                  0,                           /* h  i */
00139      0,                       -ESC_k,                      /* j  k */
00140      0,                       0,                           /* l  m */
00141      ESC_n,                   0,                           /* n  o */
00142      -ESC_p,                  0,                           /* p  q */
00143      ESC_r,                   -ESC_s,                      /* r  s */
00144      ESC_tee,                 0,                           /* t  u */
00145      -ESC_v,                  -ESC_w,                      /* v  w */
00146      0,                       0,                           /* x  y */
00147      -ESC_z                                                /* z - last entry; the table stops at CHAR_z */
00148 };
00149 
00150 #else
00151 
00152 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
00153 
00154 static const short int escapes[] = {
00155 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
00156 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
00157 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
00158 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
00159 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
00160 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
00161 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
00162 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
00163 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
00164 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
00165 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
00166 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
00167 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
00168 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
00169 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
00170 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
00171 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
00172 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
00173 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
00174 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
00175 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
00176 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
00177 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
00178 };
00179 #endif
00180 
00181 
00182 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
00183 searched linearly. Put all the names into a single string, in order to reduce
00184 the number of relocations when a shared library is dynamically linked. The
00185 string is built from string macros so that it works in UTF-8 mode on EBCDIC
00186 platforms. */
00187 
00188 typedef struct verbitem {
00189   int   len;                 /* Length of verb name */
00190   int   op;                  /* Op when no arg, or -1 if arg mandatory */
00191   int   op_arg;              /* Op when arg present, or -1 if not allowed */
00192 } verbitem;
00193 
00194 static const char verbnames[] =
00195   "\0"                       /* Empty name is a shorthand for MARK */
00196   STRING_MARK0
00197   STRING_ACCEPT0
00198   STRING_COMMIT0
00199   STRING_F0
00200   STRING_FAIL0
00201   STRING_PRUNE0
00202   STRING_SKIP0
00203   STRING_THEN;
00204 
00205 static const verbitem verbs[] = {      /* one entry per name in verbnames; the lengths match that string's order */
00206   { 0, -1,        OP_MARK },            /* empty name: shorthand for MARK, argument mandatory */
00207   { 4, -1,        OP_MARK },            /* MARK: argument mandatory */
00208   { 6, OP_ACCEPT, -1 },                 /* ACCEPT: no argument allowed */
00209   { 6, OP_COMMIT, -1 },                 /* COMMIT: no argument allowed */
00210   { 1, OP_FAIL,   -1 },                 /* F: one-letter name, same op as FAIL */
00211   { 4, OP_FAIL,   -1 },                 /* FAIL: no argument allowed */
00212   { 5, OP_PRUNE,  OP_PRUNE_ARG },       /* PRUNE: argument optional */
00213   { 4, OP_SKIP,   OP_SKIP_ARG  },       /* SKIP: argument optional */
00214   { 4, OP_THEN,   OP_THEN_ARG  }        /* THEN: argument optional */
00215 };
00216 
00217 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
00218 
00219 
00220 /* Tables of names of POSIX character classes and their lengths. The names are
00221 now all in a single string, to reduce the number of relocations when a shared
00222 library is dynamically loaded. The list of lengths is terminated by a zero
00223 length entry. The first three must be alpha, lower, upper, as this is assumed
00224 for handling case independence. */
00225 
00226 static const char posix_names[] =
00227   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
00228   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
00229   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
00230   STRING_word0  STRING_xdigit;
00231 
00232 static const uschar posix_name_lengths[] = {
00233   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
00234 
00235 /* Table of class bit maps for each POSIX class. Each class is formed from a
00236 base map, with an optional addition or removal of another map. Then, for some
00237 classes, there is some additional tweaking: for [:blank:] the vertical space
00238 characters are removed, and for [:alpha:] and [:alnum:] the underscore
00239 character is removed. The triples in the table consist of the base map offset,
00240 second map offset or -1 if no second map, and a non-negative value for map
00241 addition or a negative value for map subtraction (if there are two maps). The
00242 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
00243 remove vertical space characters, 2 => remove underscore. */
00244 
00245 static const int posix_class_maps[] = {
00246   cbit_word,  cbit_digit, -2,             /* alpha */
00247   cbit_lower, -1,          0,             /* lower */
00248   cbit_upper, -1,          0,             /* upper */
00249   cbit_word,  -1,          2,             /* alnum - word without underscore */
00250   cbit_print, cbit_cntrl,  0,             /* ascii */
00251   cbit_space, -1,          1,             /* blank - a GNU extension */
00252   cbit_cntrl, -1,          0,             /* cntrl */
00253   cbit_digit, -1,          0,             /* digit */
00254   cbit_graph, -1,          0,             /* graph */
00255   cbit_print, -1,          0,             /* print */
00256   cbit_punct, -1,          0,             /* punct */
00257   cbit_space, -1,          0,             /* space */
00258   cbit_word,  -1,          0,             /* word - a Perl extension */
00259   cbit_xdigit,-1,          0              /* xdigit */
00260 };
00261 
00262 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
00263 substitutes must be in the order of the names, defined above, and there are
00264 both positive and negative cases. NULL means no substitute. */
00265 
00266 #ifdef SUPPORT_UCP
00267 static const uschar *substitutes[] = {
00268   (uschar *)"\\P{Nd}",    /* \D */
00269   (uschar *)"\\p{Nd}",    /* \d */
00270   (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
00271   (uschar *)"\\p{Xsp}",   /* \s */
00272   (uschar *)"\\P{Xwd}",   /* \W */
00273   (uschar *)"\\p{Xwd}"    /* \w */
00274 };
00275 
00276 static const uschar *posix_substitutes[] = {
00277   (uschar *)"\\p{L}",     /* alpha */
00278   (uschar *)"\\p{Ll}",    /* lower */
00279   (uschar *)"\\p{Lu}",    /* upper */
00280   (uschar *)"\\p{Xan}",   /* alnum */
00281   NULL,                   /* ascii */
00282   (uschar *)"\\h",        /* blank */
00283   NULL,                   /* cntrl */
00284   (uschar *)"\\p{Nd}",    /* digit */
00285   NULL,                   /* graph */
00286   NULL,                   /* print */
00287   NULL,                   /* punct */
00288   (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
00289   (uschar *)"\\p{Xwd}",   /* word */
00290   NULL,                   /* xdigit */
00291   /* Negated cases */
00292   (uschar *)"\\P{L}",     /* ^alpha */
00293   (uschar *)"\\P{Ll}",    /* ^lower */
00294   (uschar *)"\\P{Lu}",    /* ^upper */
00295   (uschar *)"\\P{Xan}",   /* ^alnum */
00296   NULL,                   /* ^ascii */
00297   (uschar *)"\\H",        /* ^blank */
00298   NULL,                   /* ^cntrl */
00299   (uschar *)"\\P{Nd}",    /* ^digit */
00300   NULL,                   /* ^graph */
00301   NULL,                   /* ^print */
00302   NULL,                   /* ^punct */
00303   (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
00304   (uschar *)"\\P{Xwd}",   /* ^word */
00305   NULL                    /* ^xdigit */
00306 };
00307 #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
00308 #endif
00309 
00310 #define STRING(a)  # a
00311 #define XSTRING(s) STRING(s)
00312 
00313 /* The texts of compile-time error messages. These are "char *" because they
00314 are passed to the outside world. Do not ever re-use any error number, because
00315 they are documented. Always add a new error instead. Messages marked DEAD below
00316 are no longer used. This used to be a table of strings, but in order to reduce
00317 the number of relocations needed when a shared library is loaded dynamically,
00318 it is now one long string. We cannot use a table of offsets, because the
00319 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
00320 simply count through to the one we want - this isn't a performance issue
00321 because these strings are used only when there is a compilation error.
00322 
00323 Each substring ends with \0 to insert a null character. This includes the final
00324 substring, so that the whole string ends with \0\0, which can be detected when
00325 counting through. */
00326 
00327 static const char error_texts[] =      /* message n is the n-th zero-terminated substring, counting from 0 */
00328   "no error\0"
00329   "\\ at end of pattern\0"
00330   "\\c at end of pattern\0"
00331   "unrecognized character follows \\\0"
00332   "numbers out of order in {} quantifier\0"
00333   /* 5 */
00334   "number too big in {} quantifier\0"
00335   "missing terminating ] for character class\0"
00336   "invalid escape sequence in character class\0"
00337   "range out of order in character class\0"
00338   "nothing to repeat\0"
00339   /* 10 */
00340   "operand of unlimited repeat could match the empty string\0"  /* NOTE(review): presumably a DEAD (unused) message; marker missing in this listing - confirm */
00341   "internal error: unexpected repeat\0"
00342   "unrecognized character after (? or (?-\0"
00343   "POSIX named classes are supported only within a class\0"
00344   "missing )\0"
00345   /* 15 */
00346   "reference to non-existent subpattern\0"
00347   "erroffset passed as NULL\0"
00348   "unknown option bit(s) set\0"
00349   "missing ) after comment\0"
00350   "parentheses nested too deeply\0"  /* NOTE(review): presumably a DEAD (unused) message; marker missing in this listing - confirm */
00351   /* 20 */
00352   "regular expression is too large\0"
00353   "failed to get memory\0"
00354   "unmatched parentheses\0"
00355   "internal error: code overflow\0"
00356   "unrecognized character after (?<\0"
00357   /* 25 */
00358   "lookbehind assertion is not fixed length\0"
00359   "malformed number or name after (?(\0"
00360   "conditional group contains more than two branches\0"
00361   "assertion expected after (?(\0"
00362   "(?R or (?[+-]digits must be followed by )\0"
00363   /* 30 */
00364   "unknown POSIX class name\0"
00365   "POSIX collating elements are not supported\0"
00366   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
00367   "spare error\0"  /* NOTE(review): placeholder that keeps the numbering stable; presumably DEAD - confirm */
00368   "character value in \\x{...} sequence is too large\0"
00369   /* 35 */
00370   "invalid condition (?(0)\0"
00371   "\\C not allowed in lookbehind assertion\0"
00372   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
00373   "number after (?C is > 255\0"
00374   "closing ) for (?C expected\0"
00375   /* 40 */
00376   "recursive call could loop indefinitely\0"
00377   "unrecognized character after (?P\0"
00378   "syntax error in subpattern name (missing terminator)\0"
00379   "two named subpatterns have the same name\0"
00380   "invalid UTF-8 string\0"
00381   /* 45 */
00382   "support for \\P, \\p, and \\X has not been compiled\0"
00383   "malformed \\P or \\p sequence\0"
00384   "unknown property name after \\P or \\p\0"
00385   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
00386   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
00387   /* 50 */
00388   "repeated subpattern is too long\0"    /* NOTE(review): presumably a DEAD (unused) message; marker missing in this listing - confirm */
00389   "octal value is greater than \\377 (not in UTF-8 mode)\0"
00390   "internal error: overran compiling workspace\0"
00391   "internal error: previously-checked referenced subpattern not found\0"
00392   "DEFINE group contains more than one branch\0"
00393   /* 55 */
00394   "repeating a DEFINE group is not allowed\0"
00395   "inconsistent NEWLINE options\0"
00396   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
00397   "a numbered reference must not be zero\0"
00398   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
00399   /* 60 */
00400   "(*VERB) not recognized\0"
00401   "number is too big\0"
00402   "subpattern name expected\0"
00403   "digit expected after (?+\0"
00404   "] is an invalid data character in JavaScript compatibility mode\0"
00405   /* 65 */
00406   "different names for subpatterns of the same number are not allowed\0"
00407   "(*MARK) must have an argument\0"
00408   "this version of PCRE is not compiled with PCRE_UCP support\0"
00409   "\\c must be followed by an ASCII character\0"
00410   ;  /* the literal's implicit terminator supplies the second zero of the closing pair */
00411 
00412 /* Table to identify digits and hex digits. This is used when compiling
00413 patterns. Note that the tables in chartables are dependent on the locale, and
00414 may mark arbitrary characters as digits - but the PCRE compiling code expects
00415 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
00416 a private table here. It costs 256 bytes, but it is a lot faster than doing
00417 character value tests (at least in some simple cases I timed), and in some
00418 applications one wants PCRE to compile efficiently as well as match
00419 efficiently.
00420 
00421 For convenience, we use the same bit definitions as in chartables:
00422 
00423   0x04   decimal digit
00424   0x08   hexadecimal digit
00425 
00426 Then we can use ctype_digit and ctype_xdigit in the code. */
00427 
00428 #ifndef EBCDIC
00429 
00430 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
00431 UTF-8 mode. */
00432 
00433 static const unsigned char digitab[] =
00434   {
00435   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
00436   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
00437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
00438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00439   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
00440   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
00441   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
00442   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
00443   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
00444   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
00445   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
00446   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
00447   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
00448   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
00449   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
00450   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
00451   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
00452   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
00453   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
00454   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
00455   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
00456   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
00457   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
00458   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00459   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
00460   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
00461   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
00462   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
00463   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
00464   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
00465   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
00466   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
00467 
00468 #else
00469 
00470 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
00471 
00472 static const unsigned char digitab[] =
00473   {
00474   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
00475   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
00476   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
00477   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
00478   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
00479   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
00480   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
00481   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
00482   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
00483   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
00484   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
00485   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
00486   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
00487   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
00488   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
00489   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
00490   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
00491   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
00492   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
00493   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
00494   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
00495   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
00496   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
00497   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
00498   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
00499   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
00500   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
00501   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
00502   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
00503   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
00504   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
00505   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
00506 
00507 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
00508   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
00509   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
00510   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
00511   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00512   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
00513   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
00514   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
00515   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
00516   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
00517   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
00518   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
00519   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
00520   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
00521   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
00522   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
00523   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
00524   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
00525   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
00526   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
00527   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
00528   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
00529   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
00530   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
00531   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00532   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
00533   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
00534   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
00535   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
00536   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
00537   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
00538   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
00539   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
00540 #endif
00541 
00542 
00543 /* Definition to allow mutual recursion */
00544 
00545 static BOOL
00546   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
00547     int *, int *, branch_chain *, compile_data *, int *);
00548 
00549 
00550 
00551 /*************************************************
00552 *            Find an error text                  *
00553 *************************************************/
00554 
00555 /* The error texts are now all in one long string, to save on relocations. As
00556 some of the text is of unknown length, we can't use a table of offsets.
00557 Instead, just count through the strings. This is not a performance issue
00558 because it happens only when there has been a compilation error.
00559 
00560 Argument:   the error number
00561 Returns:    pointer to the error string
00562 */
00563 
00564 static const char *
00565 find_error_text(int n)
00566 {
00567 const char *s = error_texts;
00568 for (; n > 0; n--)
00569   {
00570   while (*s++ != 0) {};
00571   if (*s == 0) return "Error text not found (please report)";
00572   }
00573 return s;
00574 }
00575 
00576 
00577 /*************************************************
00578 *            Handle escapes                      *
00579 *************************************************/
00580 
00581 /* This function is called when a \ has been encountered. It either returns a
00582 positive value for a simple escape such as \n, or a negative value which
00583 encodes one of the more complicated things such as \d. A backreference to group
00584 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
00585 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
00586 ptr is pointing at the \. On exit, it is on the final character of the escape
00587 sequence.
00588 
00589 Arguments:
00590   ptrptr         points to the pattern position pointer
00591   errorcodeptr   points to the errorcode variable
00592   bracount       number of previous extracting brackets
00593   options        the options bits
00594   isclass        TRUE if inside a character class
00595 
00596 Returns:         zero or positive => a data character
00597                  negative => a special escape sequence
00598                  on error, errorcodeptr is set
00599 */
00600 
00601 static int
00602 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
00603   int options, BOOL isclass)
00604 {
00605 BOOL utf8 = (options & PCRE_UTF8) != 0;
00606 const uschar *ptr = *ptrptr + 1;
00607 int c, i;
00608 
00609 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
00610 ptr--;                            /* Set pointer back to the last byte */
00611 
00612 /* If backslash is at the end of the pattern, it's an error. */
00613 
00614 if (c == 0) *errorcodeptr = ERR1;
00615 
00616 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
00617 in a table. A non-zero result is something that can be returned immediately.
00618 Otherwise further processing may be required. */
00619 
00620 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00621 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
00622 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
00623 
00624 #else           /* EBCDIC coding */
00625 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
00626 else if ((i = escapes[c - 0x48]) != 0)  c = i;
00627 #endif
00628 
00629 /* Escapes that need further processing, or are illegal. */
00630 
00631 else
00632   {
00633   const uschar *oldptr;
00634   BOOL braced, negated;
00635 
00636   switch (c)
00637     {
00638     /* A number of Perl escapes are not handled by PCRE. We give an explicit
00639     error. */
00640 
00641     case CHAR_l:
00642     case CHAR_L:
00643     case CHAR_u:
00644     case CHAR_U:
00645     *errorcodeptr = ERR37;
00646     break;
00647 
00648     /* \g must be followed by one of a number of specific things:
00649 
00650     (1) A number, either plain or braced. If positive, it is an absolute
00651     backreference. If negative, it is a relative backreference. This is a Perl
00652     5.10 feature.
00653 
00654     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
00655     is part of Perl's movement towards a unified syntax for back references. As
00656     this is synonymous with \k{name}, we fudge it up by pretending it really
00657     was \k.
00658 
00659     (3) For Oniguruma compatibility we also support \g followed by a name or a
00660     number either in angle brackets or in single quotes. However, these are
00661     (possibly recursive) subroutine calls, _not_ backreferences. Just return
00662     the -ESC_g code (cf \k). */
00663 
00664     case CHAR_g:
00665     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
00666       {
00667       c = -ESC_g;
00668       break;
00669       }
00670 
00671     /* Handle the Perl-compatible cases */
00672 
00673     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
00674       {
00675       const uschar *p;
00676       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
00677         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
00678       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
00679         {
00680         c = -ESC_k;
00681         break;
00682         }
00683       braced = TRUE;
00684       ptr++;
00685       }
00686     else braced = FALSE;
00687 
00688     if (ptr[1] == CHAR_MINUS)
00689       {
00690       negated = TRUE;
00691       ptr++;
00692       }
00693     else negated = FALSE;
00694 
00695     c = 0;
00696     while ((digitab[ptr[1]] & ctype_digit) != 0)
00697       c = c * 10 + *(++ptr) - CHAR_0;
00698 
00699     if (c < 0)   /* Integer overflow */
00700       {
00701       *errorcodeptr = ERR61;
00702       break;
00703       }
00704 
00705     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
00706       {
00707       *errorcodeptr = ERR57;
00708       break;
00709       }
00710 
00711     if (c == 0)
00712       {
00713       *errorcodeptr = ERR58;
00714       break;
00715       }
00716 
00717     if (negated)
00718       {
00719       if (c > bracount)
00720         {
00721         *errorcodeptr = ERR15;
00722         break;
00723         }
00724       c = bracount - (c - 1);
00725       }
00726 
00727     c = -(ESC_REF + c);
00728     break;
00729 
00730     /* The handling of escape sequences consisting of a string of digits
00731     starting with one that is not zero is not straightforward. By experiment,
00732     the way Perl works seems to be as follows:
00733 
00734     Outside a character class, the digits are read as a decimal number. If the
00735     number is less than 10, or if there are that many previous extracting
00736     left brackets, then it is a back reference. Otherwise, up to three octal
00737     digits are read to form an escaped byte. Thus \123 is likely to be octal
00738     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
00739     value is greater than 377, the least significant 8 bits are taken. Inside a
00740     character class, \ followed by a digit is always an octal number. */
00741 
00742     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
00743     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
00744 
00745     if (!isclass)
00746       {
00747       oldptr = ptr;
00748       c -= CHAR_0;
00749       while ((digitab[ptr[1]] & ctype_digit) != 0)
00750         c = c * 10 + *(++ptr) - CHAR_0;
00751       if (c < 0)    /* Integer overflow */
00752         {
00753         *errorcodeptr = ERR61;
00754         break;
00755         }
00756       if (c < 10 || c <= bracount)
00757         {
00758         c = -(ESC_REF + c);
00759         break;
00760         }
00761       ptr = oldptr;      /* Put the pointer back and fall through */
00762       }
00763 
00764     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00765     generates a binary zero byte and treats the digit as a following literal.
00766     Thus we have to pull back the pointer by one. */
00767 
00768     if ((c = *ptr) >= CHAR_8)
00769       {
00770       ptr--;
00771       c = 0;
00772       break;
00773       }
00774 
00775     /* \0 always starts an octal number, but we may drop through to here with a
00776     larger first octal digit. The original code used just to take the least
00777     significant 8 bits of octal numbers (I think this is what early Perls used
00778     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
00779     than 3 octal digits. */
00780 
00781     case CHAR_0:
00782     c -= CHAR_0;
00783     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
00784         c = c * 8 + *(++ptr) - CHAR_0;
00785     if (!utf8 && c > 255) *errorcodeptr = ERR51;
00786     break;
00787 
00788     /* \x is complicated. \x{ddd} is a character number which can be greater
00789     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
00790     treated as a data character. */
00791 
00792     case CHAR_x:
00793     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
00794       {
00795       const uschar *pt = ptr + 2;
00796       int count = 0;
00797 
00798       c = 0;
00799       while ((digitab[*pt] & ctype_xdigit) != 0)
00800         {
00801         register int cc = *pt++;
00802         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
00803         count++;
00804 
00805 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00806         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
00807         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
00808 #else           /* EBCDIC coding */
00809         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
00810         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
00811 #endif
00812         }
00813 
00814       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
00815         {
00816         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
00817         ptr = pt;
00818         break;
00819         }
00820 
00821       /* If the sequence of hex digits does not end with '}', then we don't
00822       recognize this construct; fall through to the normal \x handling. */
00823       }
00824 
00825     /* Read just a single-byte hex-defined char */
00826 
00827     c = 0;
00828     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
00829       {
00830       int cc;                                  /* Some compilers don't like */
00831       cc = *(++ptr);                           /* ++ in initializers */
00832 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00833       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
00834       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
00835 #else           /* EBCDIC coding */
00836       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
00837       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
00838 #endif
00839       }
00840     break;
00841 
00842     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
00843     An error is given if the byte following \c is not an ASCII character. This
00844     coding is ASCII-specific, but then the whole concept of \cx is
00845     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
00846 
00847     case CHAR_c:
00848     c = *(++ptr);
00849     if (c == 0)
00850       {
00851       *errorcodeptr = ERR2;
00852       break;
00853       }
00854 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
00855     if (c > 127)  /* Excludes all non-ASCII in either mode */
00856       {
00857       *errorcodeptr = ERR68;
00858       break;
00859       }
00860     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
00861     c ^= 0x40;
00862 #else             /* EBCDIC coding */
00863     if (c >= CHAR_a && c <= CHAR_z) c += 64;
00864     c ^= 0xC0;
00865 #endif
00866     break;
00867 
00868     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00869     other alphanumeric following \ is an error if PCRE_EXTRA was set;
00870     otherwise, for Perl compatibility, it is a literal. This code looks a bit
00871     odd, but there used to be some cases other than the default, and there may
00872     be again in future, so I haven't "optimized" it. */
00873 
00874     default:
00875     if ((options & PCRE_EXTRA) != 0) switch(c)
00876       {
00877       default:
00878       *errorcodeptr = ERR3;
00879       break;
00880       }
00881     break;
00882     }
00883   }
00884 
00885 /* Perl supports \N{name} for character names, as well as plain \N for "not
00886 newline". PCRE does not support \N{name}. */
00887 
00888 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
00889   *errorcodeptr = ERR37;
00890 
00891 /* If PCRE_UCP is set, we change the values for \d etc. */
00892 
00893 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
00894   c -= (ESC_DU - ESC_D);
00895 
00896 /* Set the pointer to the final character before returning. */
00897 
00898 *ptrptr = ptr;
00899 return c;
00900 }
00901 
00902 
00903 
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the Unicode property name that follows \P or \p and look it up in the
table of known property names. On entry, *ptrptr points at the P or p. The name
is either a single character or a string enclosed in {}, optionally preceded
by ^ inside the braces to request negation. On exit, *ptrptr points at the
last character that was consumed (the final character of the escape on
success, or the place where scanning stopped on failure).

Arguments:
  ptrptr         points to the pattern position pointer (updated)
  negptr         points to a boolean that is set TRUE if ^ was present,
                   otherwise FALSE; it is not written if the pattern ends
                   immediately after the P or p
  dptr           points to an int that receives the detailed property value
                   when the name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and to ERR47 for an unknown name

Returns:         the type field of the matching _pcre_utt entry,
                   or -1 if the sequence is malformed or the name is unknown
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch;
int len = 0;
int lo, hi;

ch = *(++ptr);
if (ch == 0) goto BAD_SYNTAX;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: an optional ^ followed by the name and a closing brace.
The name must fit in the buffer with room for the terminating zero. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    ptr++;
    *negptr = TRUE;
    }

  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto BAD_SYNTAX;   /* Name too long */
    ch = *(++ptr);
    if (ch == 0) goto BAD_SYNTAX;                        /* Pattern ended */
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  name[len] = 0;
  }

/* The pointer does not move again, whether or not the lookup succeeds. */

*ptrptr = ptr;

/* Binary chop over the table of property names. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The syntax was fine, but the name is not in the table. */

*errorcodeptr = ERR47;
return -1;

/* The escape sequence itself is malformed. */

BAD_SYNTAX:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
00993 
00994 
00995 
00996 
00997 /*************************************************
00998 *            Check for counted repeat            *
00999 *************************************************/
01000 
01001 /* This function is called when a '{' is encountered in a place where it might
01002 start a quantifier. It looks ahead to see if it really is a quantifier or not.
01003 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
01004 where the ddds are digits.
01005 
01006 Arguments:
01007   p         pointer to the first char after '{'
01008 
01009 Returns:    TRUE or FALSE
01010 */
01011 
01012 static BOOL
01013 is_counted_repeat(const uschar *p)
01014 {
01015 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
01016 while ((digitab[*p] & ctype_digit) != 0) p++;
01017 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
01018 
01019 if (*p++ != CHAR_COMMA) return FALSE;
01020 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
01021 
01022 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
01023 while ((digitab[*p] & ctype_digit) != 0) p++;
01024 
01025 return (*p == CHAR_RIGHT_CURLY_BRACKET);
01026 }
01027 
01028 
01029 
01030 /*************************************************
01031 *         Read repeat counts                     *
01032 *************************************************/
01033 
01034 /* Read an item of the form {n,m} and return the values. This is called only
01035 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
01036 so the syntax is guaranteed to be correct, but we need to check the values.
01037 
01038 Arguments:
01039   p              pointer to first char after '{'
01040   minp           pointer to int for min
01041   maxp           pointer to int for max
01042                  returned as -1 if no max
01043   errorcodeptr   points to error code variable
01044 
01045 Returns:         pointer to '}' on success;
01046                  current ptr on error, with errorcodeptr set non-zero
01047 */
01048 
01049 static const uschar *
01050 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
01051 {
01052 int min = 0;
01053 int max = -1;
01054 
01055 /* Read the minimum value and do a paranoid check: a negative value indicates
01056 an integer overflow. */
01057 
01058 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
01059 if (min < 0 || min > 65535)
01060   {
01061   *errorcodeptr = ERR5;
01062   return p;
01063   }
01064 
01065 /* Read the maximum value if there is one, and again do a paranoid on its size.
01066 Also, max must not be less than min. */
01067 
01068 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
01069   {
01070   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
01071     {
01072     max = 0;
01073     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
01074     if (max < 0 || max > 65535)
01075       {
01076       *errorcodeptr = ERR5;
01077       return p;
01078       }
01079     if (max < min)
01080       {
01081       *errorcodeptr = ERR4;
01082       return p;
01083       }
01084     }
01085   }
01086 
01087 /* Fill in the required variables, and pass back the pointer to the terminating
01088 '}'. */
01089 
01090 *minp = min;
01091 *maxp = max;
01092 return p;
01093 }
01094 
01095 
01096 
01097 /*************************************************
01098 *  Subroutine for finding forward reference      *
01099 *************************************************/
01100 
01101 /* This recursive function is called only from find_parens() below. The
01102 top-level call starts at the beginning of the pattern. All other calls must
01103 start at a parenthesis. It scans along a pattern's text looking for capturing
01104 subpatterns, and counting them. If it finds a named pattern that matches the
01105 name it is given, it returns its number. Alternatively, if the name is NULL, it
01106 returns when it reaches a given numbered subpattern. Recursion is used to keep
01107 track of subpatterns that reset the capturing group numbers - the (?| feature.
01108 
01109 This function was originally called only from the second pass, in which we know
01110 that if (?< or (?' or (?P< is encountered, the name will be correctly
01111 terminated because that is checked in the first pass. There is now one call to
01112 this function in the first pass, to check for a recursive back reference by
01113 name (so that we can make the whole group atomic). In this case, we need check
01114 only up to the current position in the pattern, and that is still OK because
01115 and previous occurrences will have been checked. To make this work, the test
01116 for "end of pattern" is a check against cd->end_pattern in the main loop,
01117 instead of looking for a binary zero. This means that the special first-pass
01118 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
01119 processing items within the loop are OK, because afterwards the main loop will
01120 terminate.)
01121 
01122 Arguments:
01123   ptrptr       address of the current character pointer (updated)
01124   cd           compile background data
01125   name         name to seek, or NULL if seeking a numbered subpattern
01126   lorn         name length, or subpattern number if name is NULL
01127   xmode        TRUE if we are in /x mode
01128   utf8         TRUE if we are in UTF-8 mode
01129   count        pointer to the current capturing subpattern number (updated)
01130 
01131 Returns:       the number of the named subpattern, or -1 if not found
01132 */
01133 
01134 static int
01135 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
01136   BOOL xmode, BOOL utf8, int *count)
01137 {
01138 uschar *ptr = *ptrptr;
01139 int start_count = *count;
01140 int hwm_count = start_count;
01141 BOOL dup_parens = FALSE;
01142 
01143 /* If the first character is a parenthesis, check on the type of group we are
01144 dealing with. The very first call may not start with a parenthesis. */
01145 
01146 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
01147   {
01148   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
01149 
01150   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
01151 
01152   /* Handle a normal, unnamed capturing parenthesis. */
01153 
01154   else if (ptr[1] != CHAR_QUESTION_MARK)
01155     {
01156     *count += 1;
01157     if (name == NULL && *count == lorn) return *count;
01158     ptr++;
01159     }
01160 
01161   /* All cases now have (? at the start. Remember when we are in a group
01162   where the parenthesis numbers are duplicated. */
01163 
01164   else if (ptr[2] == CHAR_VERTICAL_LINE)
01165     {
01166     ptr += 3;
01167     dup_parens = TRUE;
01168     }
01169 
01170   /* Handle comments; all characters are allowed until a ket is reached. */
01171 
01172   else if (ptr[2] == CHAR_NUMBER_SIGN)
01173     {
01174     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
01175     goto FAIL_EXIT;
01176     }
01177 
01178   /* Handle a condition. If it is an assertion, just carry on so that it
01179   is processed as normal. If not, skip to the closing parenthesis of the
01180   condition (there can't be any nested parens). */
01181 
01182   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
01183     {
01184     ptr += 2;
01185     if (ptr[1] != CHAR_QUESTION_MARK)
01186       {
01187       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
01188       if (*ptr != 0) ptr++;
01189       }
01190     }
01191 
01192   /* Start with (? but not a condition. */
01193 
01194   else
01195     {
01196     ptr += 2;
01197     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
01198 
01199     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
01200 
01201     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
01202         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
01203       {
01204       int term;
01205       const uschar *thisname;
01206       *count += 1;
01207       if (name == NULL && *count == lorn) return *count;
01208       term = *ptr++;
01209       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
01210       thisname = ptr;
01211       while (*ptr != term) ptr++;
01212       if (name != NULL && lorn == ptr - thisname &&
01213           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
01214         return *count;
01215       term++;
01216       }
01217     }
01218   }
01219 
01220 /* Past any initial parenthesis handling, scan for parentheses or vertical
01221 bars. Stop if we get to cd->end_pattern. Note that this is important for the
01222 first-pass call when this value is temporarily adjusted to stop at the current
01223 position. So DO NOT change this to a test for binary zero. */
01224 
01225 for (; ptr < cd->end_pattern; ptr++)
01226   {
01227   /* Skip over backslashed characters and also entire \Q...\E */
01228 
01229   if (*ptr == CHAR_BACKSLASH)
01230     {
01231     if (*(++ptr) == 0) goto FAIL_EXIT;
01232     if (*ptr == CHAR_Q) for (;;)
01233       {
01234       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
01235       if (*ptr == 0) goto FAIL_EXIT;
01236       if (*(++ptr) == CHAR_E) break;
01237       }
01238     continue;
01239     }
01240 
01241   /* Skip over character classes; this logic must be similar to the way they
01242   are handled for real. If the first character is '^', skip it. Also, if the
01243   first few characters (either before or after ^) are \Q\E or \E we skip them
01244   too. This makes for compatibility with Perl. Note the use of STR macros to
01245   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
01246 
01247   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
01248     {
01249     BOOL negate_class = FALSE;
01250     for (;;)
01251       {
01252       if (ptr[1] == CHAR_BACKSLASH)
01253         {
01254         if (ptr[2] == CHAR_E)
01255           ptr+= 2;
01256         else if (strncmp((const char *)ptr+2,
01257                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
01258           ptr += 4;
01259         else
01260           break;
01261         }
01262       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
01263         {
01264         negate_class = TRUE;
01265         ptr++;
01266         }
01267       else break;
01268       }
01269 
01270     /* If the next character is ']', it is a data character that must be
01271     skipped, except in JavaScript compatibility mode. */
01272 
01273     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
01274         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
01275       ptr++;
01276 
01277     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
01278       {
01279       if (*ptr == 0) return -1;
01280       if (*ptr == CHAR_BACKSLASH)
01281         {
01282         if (*(++ptr) == 0) goto FAIL_EXIT;
01283         if (*ptr == CHAR_Q) for (;;)
01284           {
01285           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
01286           if (*ptr == 0) goto FAIL_EXIT;
01287           if (*(++ptr) == CHAR_E) break;
01288           }
01289         continue;
01290         }
01291       }
01292     continue;
01293     }
01294 
01295   /* Skip comments in /x mode */
01296 
01297   if (xmode && *ptr == CHAR_NUMBER_SIGN)
01298     {
01299     ptr++;
01300     while (*ptr != 0)
01301       {
01302       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
01303       ptr++;
01304 #ifdef SUPPORT_UTF8
01305       if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
01306 #endif
01307       }
01308     if (*ptr == 0) goto FAIL_EXIT;
01309     continue;
01310     }
01311 
01312   /* Check for the special metacharacters */
01313 
01314   if (*ptr == CHAR_LEFT_PARENTHESIS)
01315     {
01316     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
01317     if (rc > 0) return rc;
01318     if (*ptr == 0) goto FAIL_EXIT;
01319     }
01320 
01321   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
01322     {
01323     if (dup_parens && *count < hwm_count) *count = hwm_count;
01324     goto FAIL_EXIT;
01325     }
01326 
01327   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
01328     {
01329     if (*count > hwm_count) hwm_count = *count;
01330     *count = start_count;
01331     }
01332   }
01333 
01334 FAIL_EXIT:
01335 *ptrptr = ptr;
01336 return -1;
01337 }
01338 
01339 
01340 
01341 
01342 /*************************************************
01343 *       Find forward referenced subpattern       *
01344 *************************************************/
01345 
01346 /* This function scans along a pattern's text looking for capturing
01347 subpatterns, and counting them. If it finds a named pattern that matches the
01348 name it is given, it returns its number. Alternatively, if the name is NULL, it
01349 returns when it reaches a given numbered subpattern. This is used for forward
01350 references to subpatterns. We used to be able to start this scan from the
01351 current compiling point, using the current count value from cd->bracount, and
01352 do it all in a single loop, but the addition of the possibility of duplicate
01353 subpattern numbers means that we have to scan from the very start, in order to
01354 take account of such duplicates, and to use a recursive function to keep track
01355 of the different types of group.
01356 
01357 Arguments:
01358   cd           compile background data
01359   name         name to seek, or NULL if seeking a numbered subpattern
01360   lorn         name length, or subpattern number if name is NULL
01361   xmode        TRUE if we are in /x mode
01362   utf8         TRUE if we are in UTF-8 mode
01363 
01364 Returns:       the number of the found subpattern, or -1 if not found
01365 */
01366 
01367 static int
01368 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
01369   BOOL utf8)
01370 {
01371 uschar *ptr = (uschar *)cd->start_pattern;
01372 int count = 0;
01373 int rc;
01374 
01375 /* If the pattern does not start with an opening parenthesis, the first call
01376 to find_parens_sub() will scan right to the end (if necessary). However, if it
01377 does start with a parenthesis, find_parens_sub() will return when it hits the
01378 matching closing parens. That is why we have to have a loop. */
01379 
01380 for (;;)
01381   {
01382   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
01383   if (rc > 0 || *ptr++ == 0) break;
01384   }
01385 
01386 return rc;
01387 }
01388 
01389 
01390 
01391 
01392 /*************************************************
01393 *      Find first significant op code            *
01394 *************************************************/
01395 
01396 /* This is called by several functions that scan a compiled expression looking
01397 for a fixed first character, or an anchoring op code etc. It skips over things
01398 that do not influence this. For some calls, a change of option is important.
01399 For some calls, it makes sense to skip negative forward and all backward
01400 assertions, and also the \b assertion; for others it does not.
01401 
01402 Arguments:
01403   code         pointer to the start of the group
01404   options      pointer to external options
01405   optbit       the option bit whose changing is significant, or
01406                  zero if none are
01407   skipassert   TRUE if certain assertions are to be skipped
01408 
01409 Returns:       pointer to the first significant opcode
01410 */
01411 
01412 static const uschar*
01413 first_significant_code(const uschar *code, int *options, int optbit,
01414   BOOL skipassert)
01415 {
01416 for (;;)
01417   {
01418   switch ((int)*code)
01419     {
01420     case OP_OPT:
01421     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01422       *options = (int)code[1];
01423     code += 2;
01424     break;
01425 
01426     case OP_ASSERT_NOT:
01427     case OP_ASSERTBACK:
01428     case OP_ASSERTBACK_NOT:
01429     if (!skipassert) return code;
01430     do code += GET(code, 1); while (*code == OP_ALT);
01431     code += _pcre_OP_lengths[*code];
01432     break;
01433 
01434     case OP_WORD_BOUNDARY:
01435     case OP_NOT_WORD_BOUNDARY:
01436     if (!skipassert) return code;
01437     /* Fall through */
01438 
01439     case OP_CALLOUT:
01440     case OP_CREF:
01441     case OP_NCREF:
01442     case OP_RREF:
01443     case OP_NRREF:
01444     case OP_DEF:
01445     code += _pcre_OP_lengths[*code];
01446     break;
01447 
01448     default:
01449     return code;
01450     }
01451   }
01452 /* Control never reaches here */
01453 }
01454 
01455 
01456 
01457 
01458 /*************************************************
01459 *        Find the fixed length of a branch       *
01460 *************************************************/
01461 
01462 /* Scan a branch and compute the fixed length of subject that will match it,
01463 if the length is fixed. This is needed for dealing with backward assertions.
01464 In UTF8 mode, the result is in characters rather than bytes. The branch is
01465 temporarily terminated with OP_END when this function is called.
01466 
01467 This function is called when a backward assertion is encountered, so that if it
01468 fails, the error message can point to the correct place in the pattern.
01469 However, we cannot do this when the assertion contains subroutine calls,
01470 because they can be forward references. We solve this by remembering this case
01471 and doing the check at the end; a flag specifies which mode we are running in.
01472 
01473 Arguments:
01474   code     points to the start of the pattern (the bracket)
01475   options  the compiling options
01476   atend    TRUE if called when the pattern is complete
01477   cd       the "compile data" structure
01478 
01479 Returns:   the fixed length,
01480              or -1 if there is no fixed length,
01481              or -2 if \C was encountered
01482              or -3 if an OP_RECURSE item was encountered and atend is FALSE
01483 */
01484 
01485 static int
01486 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
01487 {
01488 int length = -1;
01489 
01490 register int branchlength = 0;
01491 register uschar *cc = code + 1 + LINK_SIZE;
01492 
01493 /* Scan along the opcodes for this branch. If we get to the end of the
01494 branch, check the length against that of the other branches. */
01495 
01496 for (;;)
01497   {
01498   int d;
01499   uschar *ce, *cs;
01500   register int op = *cc;
01501   switch (op)
01502     {
01503     case OP_CBRA:
01504     case OP_BRA:
01505     case OP_ONCE:
01506     case OP_COND:
01507     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
01508     if (d < 0) return d;
01509     branchlength += d;
01510     do cc += GET(cc, 1); while (*cc == OP_ALT);
01511     cc += 1 + LINK_SIZE;
01512     break;
01513 
01514     /* Reached end of a branch; if it's a ket it is the end of a nested
01515     call. If it's ALT it is an alternation in a nested call. If it is
01516     END it's the end of the outer call. All can be handled by the same code. */
01517 
01518     case OP_ALT:
01519     case OP_KET:
01520     case OP_KETRMAX:
01521     case OP_KETRMIN:
01522     case OP_END:
01523     if (length < 0) length = branchlength;
01524       else if (length != branchlength) return -1;
01525     if (*cc != OP_ALT) return length;
01526     cc += 1 + LINK_SIZE;
01527     branchlength = 0;
01528     break;
01529 
01530     /* A true recursion implies not fixed length, but a subroutine call may
01531     be OK. If the subroutine is a forward reference, we can't deal with
01532     it until the end of the pattern, so return -3. */
01533 
01534     case OP_RECURSE:
01535     if (!atend) return -3;
01536     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
01537     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
01538     if (cc > cs && cc < ce) return -1;                /* Recursion */
01539     d = find_fixedlength(cs + 2, options, atend, cd);
01540     if (d < 0) return d;
01541     branchlength += d;
01542     cc += 1 + LINK_SIZE;
01543     break;
01544 
01545     /* Skip over assertive subpatterns */
01546 
01547     case OP_ASSERT:
01548     case OP_ASSERT_NOT:
01549     case OP_ASSERTBACK:
01550     case OP_ASSERTBACK_NOT:
01551     do cc += GET(cc, 1); while (*cc == OP_ALT);
01552     /* Fall through */
01553 
01554     /* Skip over things that don't match chars */
01555 
01556     case OP_REVERSE:
01557     case OP_CREF:
01558     case OP_NCREF:
01559     case OP_RREF:
01560     case OP_NRREF:
01561     case OP_DEF:
01562     case OP_OPT:
01563     case OP_CALLOUT:
01564     case OP_SOD:
01565     case OP_SOM:
01566     case OP_SET_SOM:
01567     case OP_EOD:
01568     case OP_EODN:
01569     case OP_CIRC:
01570     case OP_DOLL:
01571     case OP_NOT_WORD_BOUNDARY:
01572     case OP_WORD_BOUNDARY:
01573     cc += _pcre_OP_lengths[*cc];
01574     break;
01575 
01576     /* Handle literal characters */
01577 
01578     case OP_CHAR:
01579     case OP_CHARNC:
01580     case OP_NOT:
01581     branchlength++;
01582     cc += 2;
01583 #ifdef SUPPORT_UTF8
01584     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
01585       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
01586 #endif
01587     break;
01588 
01589     /* Handle exact repetitions. The count is already in characters, but we
01590     need to skip over a multibyte character in UTF8 mode.  */
01591 
01592     case OP_EXACT:
01593     branchlength += GET2(cc,1);
01594     cc += 4;
01595 #ifdef SUPPORT_UTF8
01596     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
01597       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
01598 #endif
01599     break;
01600 
01601     case OP_TYPEEXACT:
01602     branchlength += GET2(cc,1);
01603     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
01604     cc += 4;
01605     break;
01606 
01607     /* Handle single-char matchers */
01608 
01609     case OP_PROP:
01610     case OP_NOTPROP:
01611     cc += 2;
01612     /* Fall through */
01613 
01614     case OP_NOT_DIGIT:
01615     case OP_DIGIT:
01616     case OP_NOT_WHITESPACE:
01617     case OP_WHITESPACE:
01618     case OP_NOT_WORDCHAR:
01619     case OP_WORDCHAR:
01620     case OP_ANY:
01621     case OP_ALLANY:
01622     branchlength++;
01623     cc++;
01624     break;
01625 
01626     /* The single-byte matcher isn't allowed */
01627 
01628     case OP_ANYBYTE:
01629     return -2;
01630 
01631     /* Check a class for variable quantification */
01632 
01633 #ifdef SUPPORT_UTF8
01634     case OP_XCLASS:
01635     cc += GET(cc, 1) - 33;
01636     /* Fall through */
01637 #endif
01638 
01639     case OP_CLASS:
01640     case OP_NCLASS:
01641     cc += 33;
01642 
01643     switch (*cc)
01644       {
01645       case OP_CRSTAR:
01646       case OP_CRMINSTAR:
01647       case OP_CRQUERY:
01648       case OP_CRMINQUERY:
01649       return -1;
01650 
01651       case OP_CRRANGE:
01652       case OP_CRMINRANGE:
01653       if (GET2(cc,1) != GET2(cc,3)) return -1;
01654       branchlength += GET2(cc,1);
01655       cc += 5;
01656       break;
01657 
01658       default:
01659       branchlength++;
01660       }
01661     break;
01662 
01663     /* Anything else is variable length */
01664 
01665     default:
01666     return -1;
01667     }
01668   }
01669 /* Control never gets here */
01670 }
01671 
01672 
01673 
01674 
01675 /*************************************************
01676 *    Scan compiled regex for specific bracket    *
01677 *************************************************/
01678 
01679 /* This little function scans through a compiled pattern until it finds a
01680 capturing bracket with the given number, or, if the number is negative, an
01681 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
01682 so that it can be called from pcre_study() when finding the minimum matching
01683 length.
01684 
01685 Arguments:
01686   code        points to start of expression
01687   utf8        TRUE in UTF-8 mode
01688   number      the required bracket number or negative to find a lookbehind
01689 
01690 Returns:      pointer to the opcode for the bracket, or NULL if not found
01691 */
01692 
01693 const uschar *
01694 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
01695 {
01696 for (;;)
01697   {
01698   register int c = *code;
01699   if (c == OP_END) return NULL;
01700 
01701   /* XCLASS is used for classes that cannot be represented just by a bit
01702   map. This includes negated single high-valued characters. The length in
01703   the table is zero; the actual length is stored in the compiled code. */
01704 
01705   if (c == OP_XCLASS) code += GET(code, 1);
01706 
01707   /* Handle recursion */
01708 
01709   else if (c == OP_REVERSE)
01710     {
01711     if (number < 0) return (uschar *)code;
01712     code += _pcre_OP_lengths[c];
01713     }
01714 
01715   /* Handle capturing bracket */
01716 
01717   else if (c == OP_CBRA)
01718     {
01719     int n = GET2(code, 1+LINK_SIZE);
01720     if (n == number) return (uschar *)code;
01721     code += _pcre_OP_lengths[c];
01722     }
01723 
01724   /* Otherwise, we can get the item's length from the table, except that for
01725   repeated character types, we have to test for \p and \P, which have an extra
01726   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
01727   must add in its length. */
01728 
01729   else
01730     {
01731     switch(c)
01732       {
01733       case OP_TYPESTAR:
01734       case OP_TYPEMINSTAR:
01735       case OP_TYPEPLUS:
01736       case OP_TYPEMINPLUS:
01737       case OP_TYPEQUERY:
01738       case OP_TYPEMINQUERY:
01739       case OP_TYPEPOSSTAR:
01740       case OP_TYPEPOSPLUS:
01741       case OP_TYPEPOSQUERY:
01742       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01743       break;
01744 
01745       case OP_TYPEUPTO:
01746       case OP_TYPEMINUPTO:
01747       case OP_TYPEEXACT:
01748       case OP_TYPEPOSUPTO:
01749       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01750       break;
01751 
01752       case OP_MARK:
01753       case OP_PRUNE_ARG:
01754       case OP_SKIP_ARG:
01755       code += code[1];
01756       break;
01757 
01758       case OP_THEN_ARG:
01759       code += code[1+LINK_SIZE];
01760       break;
01761       }
01762 
01763     /* Add in the fixed length from the table */
01764 
01765     code += _pcre_OP_lengths[c];
01766 
01767   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
01768   a multi-byte character. The length in the table is a minimum, so we have to
01769   arrange to skip the extra bytes. */
01770 
01771 #ifdef SUPPORT_UTF8
01772     if (utf8) switch(c)
01773       {
01774       case OP_CHAR:
01775       case OP_CHARNC:
01776       case OP_EXACT:
01777       case OP_UPTO:
01778       case OP_MINUPTO:
01779       case OP_POSUPTO:
01780       case OP_STAR:
01781       case OP_MINSTAR:
01782       case OP_POSSTAR:
01783       case OP_PLUS:
01784       case OP_MINPLUS:
01785       case OP_POSPLUS:
01786       case OP_QUERY:
01787       case OP_MINQUERY:
01788       case OP_POSQUERY:
01789       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01790       break;
01791       }
01792 #else
01793     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01794 #endif
01795     }
01796   }
01797 }
01798 
01799 
01800 
01801 /*************************************************
01802 *   Scan compiled regex for recursion reference  *
01803 *************************************************/
01804 
01805 /* This little function scans through a compiled pattern until it finds an
01806 instance of OP_RECURSE.
01807 
01808 Arguments:
01809   code        points to start of expression
01810   utf8        TRUE in UTF-8 mode
01811 
01812 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
01813 */
01814 
01815 static const uschar *
01816 find_recurse(const uschar *code, BOOL utf8)
01817 {
01818 for (;;)
01819   {
01820   register int c = *code;
01821   if (c == OP_END) return NULL;
01822   if (c == OP_RECURSE) return code;
01823 
01824   /* XCLASS is used for classes that cannot be represented just by a bit
01825   map. This includes negated single high-valued characters. The length in
01826   the table is zero; the actual length is stored in the compiled code. */
01827 
01828   if (c == OP_XCLASS) code += GET(code, 1);
01829 
01830   /* Otherwise, we can get the item's length from the table, except that for
01831   repeated character types, we have to test for \p and \P, which have an extra
01832   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
01833   must add in its length. */
01834 
01835   else
01836     {
01837     switch(c)
01838       {
01839       case OP_TYPESTAR:
01840       case OP_TYPEMINSTAR:
01841       case OP_TYPEPLUS:
01842       case OP_TYPEMINPLUS:
01843       case OP_TYPEQUERY:
01844       case OP_TYPEMINQUERY:
01845       case OP_TYPEPOSSTAR:
01846       case OP_TYPEPOSPLUS:
01847       case OP_TYPEPOSQUERY:
01848       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01849       break;
01850 
01851       case OP_TYPEPOSUPTO:
01852       case OP_TYPEUPTO:
01853       case OP_TYPEMINUPTO:
01854       case OP_TYPEEXACT:
01855       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01856       break;
01857 
01858       case OP_MARK:
01859       case OP_PRUNE_ARG:
01860       case OP_SKIP_ARG:
01861       code += code[1];
01862       break;
01863 
01864       case OP_THEN_ARG:
01865       code += code[1+LINK_SIZE];
01866       break;
01867       }
01868 
01869     /* Add in the fixed length from the table */
01870 
01871     code += _pcre_OP_lengths[c];
01872 
01873     /* In UTF-8 mode, opcodes that are followed by a character may be followed
01874     by a multi-byte character. The length in the table is a minimum, so we have
01875     to arrange to skip the extra bytes. */
01876 
01877 #ifdef SUPPORT_UTF8
01878     if (utf8) switch(c)
01879       {
01880       case OP_CHAR:
01881       case OP_CHARNC:
01882       case OP_EXACT:
01883       case OP_UPTO:
01884       case OP_MINUPTO:
01885       case OP_POSUPTO:
01886       case OP_STAR:
01887       case OP_MINSTAR:
01888       case OP_POSSTAR:
01889       case OP_PLUS:
01890       case OP_MINPLUS:
01891       case OP_POSPLUS:
01892       case OP_QUERY:
01893       case OP_MINQUERY:
01894       case OP_POSQUERY:
01895       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01896       break;
01897       }
01898 #else
01899     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01900 #endif
01901     }
01902   }
01903 }
01904 
01905 
01906 
01907 /*************************************************
01908 *    Scan compiled branch for non-emptiness      *
01909 *************************************************/
01910 
01911 /* This function scans through a branch of a compiled pattern to see whether it
01912 can match the empty string or not. It is called from could_be_empty()
01913 below and from compile_branch() when checking for an unlimited repeat of a
01914 group that can match nothing. Note that first_significant_code() skips over
01915 backward and negative forward assertions when its final argument is TRUE. If we
01916 hit an unclosed bracket, we return "empty" - this means we've struck an inner
01917 bracket whose current branch will already have been scanned.
01918
      The opcode that "code" points at on entry is not itself examined: scanning
      begins at the item after it. The scan stops with TRUE when "code" reaches
      "endcode" or when an end-of-branch opcode (OP_KET, OP_KETRMAX, OP_KETRMIN,
      OP_ALT) is met, and with FALSE as soon as an item that has to match at least
      one character is found. Nested groups and the targets of OP_RECURSE are
      scanned by calling this function again.

01919 Arguments:
01920   code        points to start of search
01921   endcode     points to where to stop
01922   utf8        TRUE if in UTF8 mode
01923   cd          contains pointers to tables etc.
01924
01925 Returns:      TRUE if what is matched could be empty
01926 */
01927
01928 static BOOL
01929 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
01930   compile_data *cd)
01931 {
01932 register int c;   /* Opcode of the current item; also read by the loop increment */
      /* The initializer steps over the opcode at "code"; the increment steps over
      the item whose opcode is in c, using its table length. Any branch below that
      moves "code" and then uses "continue" therefore has to leave c describing the
      item that "code" points at. */
01933 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
01934      code < endcode;
01935      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
01936   {
01937   const uschar *ccode;   /* Points at whatever follows a character class */
01938
01939   c = *code;
01940
01941   /* Skip over forward assertions; the other assertions are skipped by
01942   first_significant_code() with a TRUE final argument. The do-loop follows the
        links through every alternative; c is then reloaded so that the loop
        increment steps over the opcode that ends the assertion. */
01943
01944   if (c == OP_ASSERT)
01945     {
01946     do code += GET(code, 1); while (*code == OP_ALT);
01947     c = *code;
01948     continue;
01949     }
01950
01951   /* Groups with zero repeats can of course be empty; skip them. First move
        past the zero-repeat opcode itself, then past all the alternatives of the
        group that follows it. */
01952
01953   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
01954     {
01955     code += _pcre_OP_lengths[c];
01956     do code += GET(code, 1); while (*code == OP_ALT);
01957     c = *code;
01958     continue;
01959     }
01960
01961   /* For a recursion/subroutine call, if its end has been reached, which
01962   implies a subroutine call, we can scan it. The called group starts at the
        offset stored in the RECURSE item; a zero link in that group means it has
        not been closed yet. Each alternative of the called group is scanned until
        one that could be empty is found. "code" is not moved here and c is still
        OP_RECURSE, so the loop increment steps over the RECURSE item. */
01963
01964   if (c == OP_RECURSE)
01965     {
01966     BOOL empty_branch = FALSE;
01967     const uschar *scode = cd->start_code + GET(code, 1);
01968     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
01969     do
01970       {
01971       if (could_be_empty_branch(scode, endcode, utf8, cd))
01972         {
01973         empty_branch = TRUE;
01974         break;
01975         }
01976       scode += GET(scode, 1);
01977       }
01978     while (*scode == OP_ALT);
01979     if (!empty_branch) return FALSE;  /* All branches are non-empty */
01980     continue;
01981     }
01982
01983   /* For other groups, scan the branches. */
01984
01985   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
01986     {
01987     BOOL empty_branch;
01988     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
01989
01990     /* If a conditional group has only one branch, there is a second, implied,
01991     empty branch, so just skip over the conditional, because it could be empty.
01992     Otherwise, scan the individual branches of the group. */
01993
01994     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
01995       code += GET(code, 1);
01996     else
01997       {
01998       empty_branch = FALSE;
            /* Once one alternative is known to be empty the remaining ones are
            no longer scanned, but "code" still has to be walked to the end of
            the group. */
01999       do
02000         {
02001         if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
02002           empty_branch = TRUE;
02003         code += GET(code, 1);
02004         }
02005       while (*code == OP_ALT);
02006       if (!empty_branch) return FALSE;   /* All branches are non-empty */
02007       }
02008
02009     c = *code;   /* Opcode that closes the group; the increment steps over it */
02010     continue;
02011     }
02012
02013   /* Handle the other opcodes */
02014
02015   switch (c)
02016     {
02017     /* Check for quantifiers after a class. XCLASS is used for classes that
02018     cannot be represented just by a bit map. This includes negated single
02019     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
02020     actual length is stored in the compiled code, so we must update "code"
02021     here. */
02022
02023 #ifdef SUPPORT_UTF8
02024     case OP_XCLASS:
02025     ccode = code += GET(code, 1);   /* Both now point just past the XCLASS item */
02026     goto CHECK_CLASS_REPEAT;
02027 #endif
02028
          /* For these two "code" is left alone; only ccode is set to the item
          after the class. NOTE(review): 33 is presumably the opcode byte plus a
          32-byte bit map - confirm against the class layout. */
02029     case OP_CLASS:
02030     case OP_NCLASS:
02031     ccode = code + 33;
02032
02033 #ifdef SUPPORT_UTF8
02034     CHECK_CLASS_REPEAT:
02035 #endif
02036
02037     switch (*ccode)
02038       {
02039       case OP_CRSTAR:            /* These could be empty; continue */
02040       case OP_CRMINSTAR:
02041       case OP_CRQUERY:
02042       case OP_CRMINQUERY:
02043       break;
02044
02045       default:                   /* Non-repeat => class must match */
02046       case OP_CRPLUS:            /* These repeats aren't empty */
02047       case OP_CRMINPLUS:
02048       return FALSE;
02049
02050       case OP_CRRANGE:
02051       case OP_CRMINRANGE:
02052       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
02053       break;
02054       }
02055     break;
02056
02057     /* Opcodes that must match a character */
02058
02059     case OP_PROP:
02060     case OP_NOTPROP:
02061     case OP_EXTUNI:
02062     case OP_NOT_DIGIT:
02063     case OP_DIGIT:
02064     case OP_NOT_WHITESPACE:
02065     case OP_WHITESPACE:
02066     case OP_NOT_WORDCHAR:
02067     case OP_WORDCHAR:
02068     case OP_ANY:
02069     case OP_ALLANY:
02070     case OP_ANYBYTE:
02071     case OP_CHAR:
02072     case OP_CHARNC:
02073     case OP_NOT:
02074     case OP_PLUS:
02075     case OP_MINPLUS:
02076     case OP_POSPLUS:
02077     case OP_EXACT:
02078     case OP_NOTPLUS:
02079     case OP_NOTMINPLUS:
02080     case OP_NOTPOSPLUS:
02081     case OP_NOTEXACT:
02082     case OP_TYPEPLUS:
02083     case OP_TYPEMINPLUS:
02084     case OP_TYPEPOSPLUS:
02085     case OP_TYPEEXACT:
02086     return FALSE;
02087
02088     /* These are going to continue, as they may be empty, but we have to
02089     fudge the length for the \p and \P cases. The extra 2 added to "code" is on
          top of the table length that the loop increment adds. */
02090
02091     case OP_TYPESTAR:
02092     case OP_TYPEMINSTAR:
02093     case OP_TYPEPOSSTAR:
02094     case OP_TYPEQUERY:
02095     case OP_TYPEMINQUERY:
02096     case OP_TYPEPOSQUERY:
02097     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
02098     break;
02099
02100     /* Same for these; the type is tested at code[3] rather than code[1] */
02101
02102     case OP_TYPEUPTO:
02103     case OP_TYPEMINUPTO:
02104     case OP_TYPEPOSUPTO:
02105     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
02106     break;
02107
02108     /* End of branch */
02109
02110     case OP_KET:
02111     case OP_KETRMAX:
02112     case OP_KETRMIN:
02113     case OP_ALT:
02114     return TRUE;
02115
02116     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
02117     MINUPTO, and POSUPTO may be followed by a multibyte character. When the
          character's first byte is 0xc0 or above, the extra length looked up in
          _pcre_utf8_table4 is added to "code" in addition to the table length. */
02118
02119 #ifdef SUPPORT_UTF8
02120     case OP_STAR:
02121     case OP_MINSTAR:
02122     case OP_POSSTAR:
02123     case OP_QUERY:
02124     case OP_MINQUERY:
02125     case OP_POSQUERY:
02126     if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
02127     break;
02128
02129     case OP_UPTO:
02130     case OP_MINUPTO:
02131     case OP_POSUPTO:
02132     if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
02133     break;
02134 #endif
02135
02136     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
02137     string. The amount added is read from the item itself. */
02138
02139     case OP_MARK:
02140     case OP_PRUNE_ARG:
02141     case OP_SKIP_ARG:
02142     code += code[1];
02143     break;
02144
02145     case OP_THEN_ARG:
02146     code += code[1+LINK_SIZE];
02147     break;
02148
02149     /* None of the remaining opcodes are required to match a character. */
02150
02151     default:
02152     break;
02153     }
02154   }
02155
      /* Reached endcode without finding anything that has to match a character */
02156 return TRUE;
02157 }
02158 
02159 
02160 
02161 /*************************************************
02162 *    Scan compiled regex for non-emptiness       *
02163 *************************************************/
02164 
02165 /* This function is called to check for left recursive calls. We want to check
02166 the current branch of the current pattern to see if it could match the empty
02167 string. If it could, we must look outwards for branches at other levels,
02168 stopping when we pass beyond the bracket which is the subject of the recursion.
02169 
02170 Arguments:
02171   code        points to start of the recursion
02172   endcode     points to where to stop (current RECURSE item)
02173   bcptr       points to the chain of current (unclosed) branch starts
02174   utf8        TRUE if in UTF-8 mode
02175   cd          pointers to tables etc
02176 
02177 Returns:      TRUE if what is matched could be empty
02178 */
02179 
02180 static BOOL
02181 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
02182   BOOL utf8, compile_data *cd)
02183 {
02184 while (bcptr != NULL && bcptr->current_branch >= code)
02185   {
02186   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
02187     return FALSE;
02188   bcptr = bcptr->outer;
02189   }
02190 return TRUE;
02191 }
02192 
02193 
02194 
02195 /*************************************************
02196 *           Check for POSIX class syntax         *
02197 *************************************************/
02198 
02199 /* This function is called when the sequence "[:" or "[." or "[=" is
02200 encountered in a character class. It checks whether this is followed by a
02201 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
02202 reach an unescaped ']' without the special preceding character, return FALSE.
02203 
02204 Originally, this function only recognized a sequence of letters between the
02205 terminators, but it seems that Perl recognizes any sequence of characters,
02206 though of course unknown POSIX names are subsequently rejected. Perl gives an
02207 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
02208 didn't consider this to be a POSIX class. Likewise for [:1234:].
02209 
02210 The problem in trying to be exactly like Perl is in the handling of escapes. We
02211 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
02212 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
02213 below handles the special case of \], but does not try to do any other escape
02214 processing. This makes it different from Perl for cases such as [:l\ower:]
02215 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
02216 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
02217 I think.
02218 
02219 Arguments:
02220   ptr      pointer to the initial [
02221   endptr   where to return the end pointer
02222 
02223 Returns:   TRUE or FALSE
02224 */
02225 
02226 static BOOL
02227 check_posix_syntax(const uschar *ptr, const uschar **endptr)
02228 {
02229 int terminator;          /* Don't combine these lines; the Solaris cc */
02230 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
02231 for (++ptr; *ptr != 0; ptr++)
02232   {
02233   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
02234     {
02235     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
02236     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
02237       {
02238       *endptr = ptr;
02239       return TRUE;
02240       }
02241     }
02242   }
02243 return FALSE;
02244 }
02245 
02246 
02247 
02248 
02249 /*************************************************
02250 *          Check POSIX class name                *
02251 *************************************************/
02252 
02253 /* This function is called to check the name given in a POSIX-style class entry
02254 such as [:alnum:].
02255 
02256 Arguments:
02257   ptr        points to the first letter
02258   len        the length of the name
02259 
02260 Returns:     a value representing the name, or -1 if unknown
02261 */
02262 
02263 static int
02264 check_posix_name(const uschar *ptr, int len)
02265 {
02266 const char *pn = posix_names;
02267 register int yield = 0;
02268 while (posix_name_lengths[yield] != 0)
02269   {
02270   if (len == posix_name_lengths[yield] &&
02271     strncmp((const char *)ptr, pn, len) == 0) return yield;
02272   pn += posix_name_lengths[yield] + 1;
02273   yield++;
02274   }
02275 return -1;
02276 }
02277 
02278 
02279 /*************************************************
02280 *    Adjust OP_RECURSE items in repeated group   *
02281 *************************************************/
02282 
02283 /* OP_RECURSE items contain an offset from the start of the regex to the group
02284 that is referenced. This means that groups can be replicated for fixed
02285 repetition simply by copying (because the recursion is allowed to refer to
02286 earlier groups that are outside the current group). However, when a group is
02287 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
02288 inserted before it, after it has been compiled. This means that any OP_RECURSE
02289 items within it that refer to the group itself or any contained groups have to
02290 have their offsets adjusted. That one of the jobs of this function. Before it
02291 is called, the partially compiled regex must be temporarily terminated with
02292 OP_END.
02293 
02294 This function has been extended with the possibility of forward references for
02295 recursions and subroutine calls. It must also check the list of such references
02296 for the group we are dealing with. If it finds that one of the recursions in
02297 the current group is on this list, it adjusts the offset in the list, not the
02298 value in the reference (which is a group number).
02299 
02300 Arguments:
02301   group      points to the start of the group
02302   adjust     the amount by which the group is to be moved
02303   utf8       TRUE in UTF-8 mode
02304   cd         contains pointers to tables etc.
02305   save_hwm   the hwm forward reference pointer at the start of the group
02306 
02307 Returns:     nothing
02308 */
02309 
02310 static void
02311 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
02312   uschar *save_hwm)
02313 {
02314 uschar *ptr = group;
02315 
02316 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
02317   {
02318   int offset;
02319   uschar *hc;
02320 
02321   /* See if this recursion is on the forward reference list. If so, adjust the
02322   reference. */
02323 
02324   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
02325     {
02326     offset = GET(hc, 0);
02327     if (cd->start_code + offset == ptr + 1)
02328       {
02329       PUT(hc, 0, offset + adjust);
02330       break;
02331       }
02332     }
02333 
02334   /* Otherwise, adjust the recursion offset if it's after the start of this
02335   group. */
02336 
02337   if (hc >= cd->hwm)
02338     {
02339     offset = GET(ptr, 1);
02340     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
02341     }
02342 
02343   ptr += 1 + LINK_SIZE;
02344   }
02345 }
02346 
02347 
02348 
02349 /*************************************************
02350 *        Insert an automatic callout point       *
02351 *************************************************/
02352 
02353 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
02354 callout points before each pattern item.
02355 
02356 Arguments:
02357   code           current code pointer
02358   ptr            current pattern pointer
02359   cd             pointers to tables etc
02360 
02361 Returns:         new code pointer
02362 */
02363 
02364 static uschar *
02365 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
02366 {
02367 *code++ = OP_CALLOUT;
02368 *code++ = 255;
02369 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
02370 PUT(code, LINK_SIZE, 0);                       /* Default length */
02371 return code + 2*LINK_SIZE;
02372 }
02373 
02374 
02375 
02376 /*************************************************
02377 *         Complete a callout item                *
02378 *************************************************/
02379 
02380 /* A callout item contains the length of the next item in the pattern, which
02381 we can't fill in till after we have reached the relevant point. This is used
02382 for both automatic and manual callouts.
02383 
02384 Arguments:
02385   previous_callout   points to previous callout item
02386   ptr                current pattern pointer
02387   cd                 pointers to tables etc
02388 
02389 Returns:             nothing
02390 */
02391 
02392 static void
02393 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
02394 {
02395 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
02396 PUT(previous_callout, 2 + LINK_SIZE, length);
02397 }
02398 
02399 
02400 
02401 #ifdef SUPPORT_UCP
02402 /*************************************************
02403 *           Get othercase range                  *
02404 *************************************************/
02405 
02406 /* This function is passed the start and end of a class range, in UTF-8 mode
02407 with UCP support. It searches up the characters, looking for internal ranges of
02408 characters in the "other" case. Each call returns the next one, updating the
02409 start address.
02410 
02411 Arguments:
02412   cptr        points to starting character value; updated
02413   d           end value
02414   ocptr       where to put start of othercase range
02415   odptr       where to put end of othercase range
02416 
02417 Yield:        TRUE when range returned; FALSE when no more
02418 */
02419 
02420 static BOOL
02421 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
02422   unsigned int *odptr)
02423 {
02424 unsigned int c, othercase, next;
02425 
02426 for (c = *cptr; c <= d; c++)
02427   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
02428 
02429 if (c > d) return FALSE;
02430 
02431 *ocptr = othercase;
02432 next = othercase + 1;
02433 
02434 for (++c; c <= d; c++)
02435   {
02436   if (UCD_OTHERCASE(c) != next) break;
02437   next++;
02438   }
02439 
02440 *odptr = next - 1;
02441 *cptr = c;
02442 
02443 return TRUE;
02444 }
02445 
02446 
02447 
02448 /*************************************************
02449 *        Check a character and a property        *
02450 *************************************************/
02451 
02452 /* This function is called by check_auto_possessive() when a property item
02453 is adjacent to a fixed character.
02454 
02455 Arguments:
02456   c            the character
02457   ptype        the property type
02458   pdata        the data for the type
02459   negated      TRUE if it's a negated property (\P or \p{^)
02460 
02461 Returns:       TRUE if auto-possessifying is OK
02462 */
02463 
02464 static BOOL
02465 check_char_prop(int c, int ptype, int pdata, BOOL negated)
02466 {
02467 const ucd_record *prop = GET_UCD(c);
02468 switch(ptype)
02469   {
02470   case PT_LAMP:
02471   return (prop->chartype == ucp_Lu ||
02472           prop->chartype == ucp_Ll ||
02473           prop->chartype == ucp_Lt) == negated;
02474 
02475   case PT_GC:
02476   return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
02477 
02478   case PT_PC:
02479   return (pdata == prop->chartype) == negated;
02480 
02481   case PT_SC:
02482   return (pdata == prop->script) == negated;
02483 
02484   /* These are specials */
02485 
02486   case PT_ALNUM:
02487   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
02488           _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
02489 
02490   case PT_SPACE:    /* Perl space */
02491   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
02492           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
02493           == negated;
02494 
02495   case PT_PXSPACE:  /* POSIX space */
02496   return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
02497           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
02498           c == CHAR_FF || c == CHAR_CR)
02499           == negated;
02500 
02501   case PT_WORD:
02502   return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
02503           _pcre_ucp_gentype[prop->chartype] == ucp_N ||
02504           c == CHAR_UNDERSCORE) == negated;
02505   }
02506 return FALSE;
02507 }
02508 #endif  /* SUPPORT_UCP */
02509 
02510 
02511 
02512 /*************************************************
02513 *     Check if auto-possessifying is possible    *
02514 *************************************************/
02515 
02516 /* This function is called for unlimited repeats of certain items, to see
02517 whether the next thing could possibly match the repeated item. If not, it makes
02518 sense to automatically possessify the repeated item.
02519 
02520 Arguments:
02521   previous      pointer to the repeated opcode
02522   utf8          TRUE in UTF-8 mode
02523   ptr           next character in pattern
02524   options       options bits
02525   cd            contains pointers to tables etc.
02526 
02527 Returns:        TRUE if possessifying is wanted
02528 */
02529 
02530 static BOOL
02531 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
02532   int options, compile_data *cd)
02533 {
02534 int c, next;
02535 int op_code = *previous++;
02536 
02537 /* Skip whitespace and comments in extended mode */
02538 
02539 if ((options & PCRE_EXTENDED) != 0)
02540   {
02541   for (;;)
02542     {
02543     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02544     if (*ptr == CHAR_NUMBER_SIGN)
02545       {
02546       ptr++;
02547       while (*ptr != 0)
02548         {
02549         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02550         ptr++;
02551 #ifdef SUPPORT_UTF8
02552         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
02553 #endif
02554         }
02555       }
02556     else break;
02557     }
02558   }
02559 
02560 /* If the next item is one that we can handle, get its value. A non-negative
02561 value is a character, a negative value is an escape value. */
02562 
02563 if (*ptr == CHAR_BACKSLASH)
02564   {
02565   int temperrorcode = 0;
02566   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
02567   if (temperrorcode != 0) return FALSE;
02568   ptr++;    /* Point after the escape sequence */
02569   }
02570 
02571 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
02572   {
02573 #ifdef SUPPORT_UTF8
02574   if (utf8) { GETCHARINC(next, ptr); } else
02575 #endif
02576   next = *ptr++;
02577   }
02578 
02579 else return FALSE;
02580 
02581 /* Skip whitespace and comments in extended mode */
02582 
02583 if ((options & PCRE_EXTENDED) != 0)
02584   {
02585   for (;;)
02586     {
02587     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02588     if (*ptr == CHAR_NUMBER_SIGN)
02589       {
02590       ptr++;
02591       while (*ptr != 0)
02592         {
02593         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02594         ptr++;
02595 #ifdef SUPPORT_UTF8
02596         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
02597 #endif
02598         }
02599       }
02600     else break;
02601     }
02602   }
02603 
02604 /* If the next thing is itself optional, we have to give up. */
02605 
02606 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
02607   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
02608     return FALSE;
02609 
02610 /* Now compare the next item with the previous opcode. First, handle cases when
02611 the next item is a character. */
02612 
02613 if (next >= 0) switch(op_code)
02614   {
02615   case OP_CHAR:
02616 #ifdef SUPPORT_UTF8
02617   GETCHARTEST(c, previous);
02618 #else
02619   c = *previous;
02620 #endif
02621   return c != next;
02622 
02623   /* For CHARNC (caseless character) we must check the other case. If we have
02624   Unicode property support, we can use it to test the other case of
02625   high-valued characters. */
02626 
02627   case OP_CHARNC:
02628 #ifdef SUPPORT_UTF8
02629   GETCHARTEST(c, previous);
02630 #else
02631   c = *previous;
02632 #endif
02633   if (c == next) return FALSE;
02634 #ifdef SUPPORT_UTF8
02635   if (utf8)
02636     {
02637     unsigned int othercase;
02638     if (next < 128) othercase = cd->fcc[next]; else
02639 #ifdef SUPPORT_UCP
02640     othercase = UCD_OTHERCASE((unsigned int)next);
02641 #else
02642     othercase = NOTACHAR;
02643 #endif
02644     return (unsigned int)c != othercase;
02645     }
02646   else
02647 #endif  /* SUPPORT_UTF8 */
02648   return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
02649 
02650   /* For OP_NOT, its data is always a single-byte character. */
02651 
02652   case OP_NOT:
02653   if ((c = *previous) == next) return TRUE;
02654   if ((options & PCRE_CASELESS) == 0) return FALSE;
02655 #ifdef SUPPORT_UTF8
02656   if (utf8)
02657     {
02658     unsigned int othercase;
02659     if (next < 128) othercase = cd->fcc[next]; else
02660 #ifdef SUPPORT_UCP
02661     othercase = UCD_OTHERCASE(next);
02662 #else
02663     othercase = NOTACHAR;
02664 #endif
02665     return (unsigned int)c == othercase;
02666     }
02667   else
02668 #endif  /* SUPPORT_UTF8 */
02669   return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
02670 
02671   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
02672   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
02673 
02674   case OP_DIGIT:
02675   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
02676 
02677   case OP_NOT_DIGIT:
02678   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
02679 
02680   case OP_WHITESPACE:
02681   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
02682 
02683   case OP_NOT_WHITESPACE:
02684   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
02685 
02686   case OP_WORDCHAR:
02687   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
02688 
02689   case OP_NOT_WORDCHAR:
02690   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
02691 
02692   case OP_HSPACE:
02693   case OP_NOT_HSPACE:
02694   switch(next)
02695     {
02696     case 0x09:
02697     case 0x20:
02698     case 0xa0:
02699     case 0x1680:
02700     case 0x180e:
02701     case 0x2000:
02702     case 0x2001:
02703     case 0x2002:
02704     case 0x2003:
02705     case 0x2004:
02706     case 0x2005:
02707     case 0x2006:
02708     case 0x2007:
02709     case 0x2008:
02710     case 0x2009:
02711     case 0x200A:
02712     case 0x202f:
02713     case 0x205f:
02714     case 0x3000:
02715     return op_code == OP_NOT_HSPACE;
02716     default:
02717     return op_code != OP_NOT_HSPACE;
02718     }
02719 
02720   case OP_ANYNL:
02721   case OP_VSPACE:
02722   case OP_NOT_VSPACE:
02723   switch(next)
02724     {
02725     case 0x0a:
02726     case 0x0b:
02727     case 0x0c:
02728     case 0x0d:
02729     case 0x85:
02730     case 0x2028:
02731     case 0x2029:
02732     return op_code == OP_NOT_VSPACE;
02733     default:
02734     return op_code != OP_NOT_VSPACE;
02735     }
02736 
02737 #ifdef SUPPORT_UCP
02738   case OP_PROP:
02739   return check_char_prop(next, previous[0], previous[1], FALSE);
02740 
02741   case OP_NOTPROP:
02742   return check_char_prop(next, previous[0], previous[1], TRUE);
02743 #endif
02744 
02745   default:
02746   return FALSE;
02747   }
02748 
02749 
02750 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
02751 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
02752 generated only when PCRE_UCP is *not* set, that is, when only ASCII
02753 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
02754 replaced by OP_PROP codes when PCRE_UCP is set. */
02755 
02756 switch(op_code)
02757   {
02758   case OP_CHAR:
02759   case OP_CHARNC:
02760 #ifdef SUPPORT_UTF8
02761   GETCHARTEST(c, previous);
02762 #else
02763   c = *previous;
02764 #endif
02765   switch(-next)
02766     {
02767     case ESC_d:
02768     return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
02769 
02770     case ESC_D:
02771     return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
02772 
02773     case ESC_s:
02774     return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
02775 
02776     case ESC_S:
02777     return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
02778 
02779     case ESC_w:
02780     return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
02781 
02782     case ESC_W:
02783     return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
02784 
02785     case ESC_h:
02786     case ESC_H:
02787     switch(c)
02788       {
02789       case 0x09:
02790       case 0x20:
02791       case 0xa0:
02792       case 0x1680:
02793       case 0x180e:
02794       case 0x2000:
02795       case 0x2001:
02796       case 0x2002:
02797       case 0x2003:
02798       case 0x2004:
02799       case 0x2005:
02800       case 0x2006:
02801       case 0x2007:
02802       case 0x2008:
02803       case 0x2009:
02804       case 0x200A:
02805       case 0x202f:
02806       case 0x205f:
02807       case 0x3000:
02808       return -next != ESC_h;
02809       default:
02810       return -next == ESC_h;
02811       }
02812 
02813     case ESC_v:
02814     case ESC_V:
02815     switch(c)
02816       {
02817       case 0x0a:
02818       case 0x0b:
02819       case 0x0c:
02820       case 0x0d:
02821       case 0x85:
02822       case 0x2028:
02823       case 0x2029:
02824       return -next != ESC_v;
02825       default:
02826       return -next == ESC_v;
02827       }
02828 
02829     /* When PCRE_UCP is set, these values get generated for \d etc. Find
02830     their substitutions and process them. The result will always be either
02831     -ESC_p or -ESC_P. Then fall through to process those values. */
02832 
02833 #ifdef SUPPORT_UCP
02834     case ESC_du:
02835     case ESC_DU:
02836     case ESC_wu:
02837     case ESC_WU:
02838     case ESC_su:
02839     case ESC_SU:
02840       {
02841       int temperrorcode = 0;
02842       ptr = substitutes[-next - ESC_DU];
02843       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
02844       if (temperrorcode != 0) return FALSE;
02845       ptr++;    /* For compatibility */
02846       }
02847     /* Fall through */
02848 
02849     case ESC_p:
02850     case ESC_P:
02851       {
02852       int ptype, pdata, errorcodeptr;
02853       BOOL negated;
02854 
02855       ptr--;      /* Make ptr point at the p or P */
02856       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
02857       if (ptype < 0) return FALSE;
02858       ptr++;      /* Point past the final curly ket */
02859 
02860       /* If the property item is optional, we have to give up. (When generated
02861       from \d etc by PCRE_UCP, this test will have been applied much earlier,
02862       to the original \d etc. At this point, ptr will point to a zero byte. */
02863 
02864       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
02865         strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
02866           return FALSE;
02867 
02868       /* Do the property check. */
02869 
02870       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
02871       }
02872 #endif
02873 
02874     default:
02875     return FALSE;
02876     }
02877 
02878   /* In principle, support for Unicode properties should be integrated here as
02879   well. It means re-organizing the above code so as to get hold of the property
02880   values before switching on the op-code. However, I wonder how many patterns
02881   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
02882   these op-codes are never generated.) */
02883 
02884   case OP_DIGIT:
02885   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
02886          next == -ESC_h || next == -ESC_v || next == -ESC_R;
02887 
02888   case OP_NOT_DIGIT:
02889   return next == -ESC_d;
02890 
02891   case OP_WHITESPACE:
02892   return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
02893 
02894   case OP_NOT_WHITESPACE:
02895   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
02896 
02897   case OP_HSPACE:
02898   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
02899          next == -ESC_w || next == -ESC_v || next == -ESC_R;
02900 
02901   case OP_NOT_HSPACE:
02902   return next == -ESC_h;
02903 
02904   /* Can't have \S in here because VT matches \S (Perl anomaly) */
02905   case OP_ANYNL:
02906   case OP_VSPACE:
02907   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
02908 
02909   case OP_NOT_VSPACE:
02910   return next == -ESC_v || next == -ESC_R;
02911 
02912   case OP_WORDCHAR:
02913   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
02914          next == -ESC_v || next == -ESC_R;
02915 
02916   case OP_NOT_WORDCHAR:
02917   return next == -ESC_w || next == -ESC_d;
02918 
02919   default:
02920   return FALSE;
02921   }
02922 
02923 /* Control does not reach here */
02924 }
02925 
02926 
02927 
02928 /*************************************************
02929 *           Compile one branch                   *
02930 *************************************************/
02931 
02932 /* Scan the pattern, compiling it into a vector. If the options are
02933 changed during the branch, the pointer is used to change the external options
02934 bits. This function is used during the pre-compile phase when we are trying
02935 to find out the amount of memory needed, as well as during the real compile
02936 phase. The value of lengthptr distinguishes the two phases.
02937 
02938 Arguments:
02939   optionsptr     pointer to the option bits
02940   codeptr        points to the pointer to the current code point
02941   ptrptr         points to the current pattern pointer
02942   errorcodeptr   points to error code variable
02943   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
02944   reqbyteptr     set to the last literal character required, else < 0
02945   bcptr          points to current branch chain
02946   cd             contains pointers to tables etc.
02947   lengthptr      NULL during the real compile phase
02948                  points to length accumulator during pre-compile phase
02949 
02950 Returns:         TRUE on success
02951                  FALSE, with *errorcodeptr set non-zero on error
02952 */
02953 
02954 static BOOL
02955 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
02956   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
02957   compile_data *cd, int *lengthptr)
02958 {
02959 int repeat_type, op_type;
02960 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
02961 int bravalue = 0;
02962 int greedy_default, greedy_non_default;
02963 int firstbyte, reqbyte;
02964 int zeroreqbyte, zerofirstbyte;
02965 int req_caseopt, reqvary, tempreqvary;
02966 int options = *optionsptr;
02967 int after_manual_callout = 0;
02968 int length_prevgroup = 0;
02969 register int c;
02970 register uschar *code = *codeptr;
02971 uschar *last_code = code;
02972 uschar *orig_code = code;
02973 uschar *tempcode;
02974 BOOL inescq = FALSE;
02975 BOOL groupsetfirstbyte = FALSE;
02976 const uschar *ptr = *ptrptr;
02977 const uschar *tempptr;
02978 const uschar *nestptr = NULL;
02979 uschar *previous = NULL;
02980 uschar *previous_callout = NULL;
02981 uschar *save_hwm = NULL;
02982 uschar classbits[32];
02983 
02984 #ifdef SUPPORT_UTF8
02985 BOOL class_utf8;
02986 BOOL utf8 = (options & PCRE_UTF8) != 0;
02987 uschar *class_utf8data;
02988 uschar *class_utf8data_base;
02989 uschar utf8_char[6];
02990 #else
02991 BOOL utf8 = FALSE;
02992 uschar *utf8_char = NULL;
02993 #endif
02994 
02995 #ifdef PCRE_DEBUG
02996 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
02997 #endif
02998 
02999 /* Set up the default and non-default settings for greediness */
03000 
03001 greedy_default = ((options & PCRE_UNGREEDY) != 0);
03002 greedy_non_default = greedy_default ^ 1;
03003 
03004 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
03005 matching encountered yet". It gets changed to REQ_NONE if we hit something that
03006 matches a non-fixed char first char; reqbyte just remains unset if we never
03007 find one.
03008 
03009 When we hit a repeat whose minimum is zero, we may have to adjust these values
03010 to take the zero repeat into account. This is implemented by setting them to
03011 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
03012 item types that can be repeated set these backoff variables appropriately. */
03013 
03014 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
03015 
03016 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
03017 according to the current setting of the caseless flag. REQ_CASELESS is a bit
03018 value > 255. It is added into the firstbyte or reqbyte variables to record the
03019 case status of the value. This is used only for ASCII characters. */
03020 
03021 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
03022 
03023 /* Switch on next character until the end of the branch */
03024 
03025 for (;; ptr++)
03026   {
03027   BOOL negate_class;
03028   BOOL should_flip_negation;
03029   BOOL possessive_quantifier;
03030   BOOL is_quantifier;
03031   BOOL is_recurse;
03032   BOOL reset_bracount;
03033   int class_charcount;
03034   int class_lastchar;
03035   int newoptions;
03036   int recno;
03037   int refsign;
03038   int skipbytes;
03039   int subreqbyte;
03040   int subfirstbyte;
03041   int terminator;
03042   int mclength;
03043   uschar mcbuffer[8];
03044 
03045   /* Get next byte in the pattern */
03046 
03047   c = *ptr;
03048 
03049   /* If we are at the end of a nested substitution, revert to the outer level
03050   string. Nesting only happens one level deep. */
03051 
03052   if (c == 0 && nestptr != NULL)
03053     {
03054     ptr = nestptr;
03055     nestptr = NULL;
03056     c = *ptr;
03057     }
03058 
03059   /* If we are in the pre-compile phase, accumulate the length used for the
03060   previous cycle of this loop. */
03061 
03062   if (lengthptr != NULL)
03063     {
03064 #ifdef PCRE_DEBUG
03065     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
03066 #endif
03067     if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
03068       {
03069       *errorcodeptr = ERR52;
03070       goto FAILED;
03071       }
03072 
03073     /* There is at least one situation where code goes backwards: this is the
03074     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
03075     the class is simply eliminated. However, it is created first, so we have to
03076     allow memory for it. Therefore, don't ever reduce the length at this point.
03077     */
03078 
03079     if (code < last_code) code = last_code;
03080 
03081     /* Paranoid check for integer overflow */
03082 
03083     if (OFLOW_MAX - *lengthptr < code - last_code)
03084       {
03085       *errorcodeptr = ERR20;
03086       goto FAILED;
03087       }
03088 
03089     *lengthptr += (int)(code - last_code);
03090     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
03091 
03092     /* If "previous" is set and it is not at the start of the work space, move
03093     it back to there, in order to avoid filling up the work space. Otherwise,
03094     if "previous" is NULL, reset the current code pointer to the start. */
03095 
03096     if (previous != NULL)
03097       {
03098       if (previous > orig_code)
03099         {
03100         memmove(orig_code, previous, code - previous);
03101         code -= previous - orig_code;
03102         previous = orig_code;
03103         }
03104       }
03105     else code = orig_code;
03106 
03107     /* Remember where this code item starts so we can pick up the length
03108     next time round. */
03109 
03110     last_code = code;
03111     }
03112 
03113   /* In the real compile phase, just check the workspace used by the forward
03114   reference list. */
03115 
03116   else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
03117     {
03118     *errorcodeptr = ERR52;
03119     goto FAILED;
03120     }
03121 
03122   /* If in \Q...\E, check for the end; if not, we have a literal */
03123 
03124   if (inescq && c != 0)
03125     {
03126     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
03127       {
03128       inescq = FALSE;
03129       ptr++;
03130       continue;
03131       }
03132     else
03133       {
03134       if (previous_callout != NULL)
03135         {
03136         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
03137           complete_callout(previous_callout, ptr, cd);
03138         previous_callout = NULL;
03139         }
03140       if ((options & PCRE_AUTO_CALLOUT) != 0)
03141         {
03142         previous_callout = code;
03143         code = auto_callout(code, ptr, cd);
03144         }
03145       goto NORMAL_CHAR;
03146       }
03147     }
03148 
03149   /* Fill in length of a previous callout, except when the next thing is
03150   a quantifier. */
03151 
03152   is_quantifier =
03153     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
03154     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
03155 
03156   if (!is_quantifier && previous_callout != NULL &&
03157        after_manual_callout-- <= 0)
03158     {
03159     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
03160       complete_callout(previous_callout, ptr, cd);
03161     previous_callout = NULL;
03162     }
03163 
03164   /* In extended mode, skip white space and comments */
03165 
03166   if ((options & PCRE_EXTENDED) != 0)
03167     {
03168     if ((cd->ctypes[c] & ctype_space) != 0) continue;
03169     if (c == CHAR_NUMBER_SIGN)
03170       {
03171       ptr++;
03172       while (*ptr != 0)
03173         {
03174         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
03175         ptr++;
03176 #ifdef SUPPORT_UTF8
03177         if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
03178 #endif
03179         }
03180       if (*ptr != 0) continue;
03181 
03182       /* Else fall through to handle end of string */
03183       c = 0;
03184       }
03185     }
03186 
03187   /* No auto callout for quantifiers. */
03188 
03189   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
03190     {
03191     previous_callout = code;
03192     code = auto_callout(code, ptr, cd);
03193     }
03194 
03195   switch(c)
03196     {
03197     /* ===================================================================*/
03198     case 0:                        /* The branch terminates at string end */
03199     case CHAR_VERTICAL_LINE:       /* or | or ) */
03200     case CHAR_RIGHT_PARENTHESIS:
03201     *firstbyteptr = firstbyte;
03202     *reqbyteptr = reqbyte;
03203     *codeptr = code;
03204     *ptrptr = ptr;
03205     if (lengthptr != NULL)
03206       {
03207       if (OFLOW_MAX - *lengthptr < code - last_code)
03208         {
03209         *errorcodeptr = ERR20;
03210         goto FAILED;
03211         }
03212       *lengthptr += (int)(code - last_code);   /* To include callout length */
03213       DPRINTF((">> end branch\n"));
03214       }
03215     return TRUE;
03216 
03217 
03218     /* ===================================================================*/
03219     /* Handle single-character metacharacters. In multiline mode, ^ disables
03220     the setting of any following char as a first character. */
03221 
03222     case CHAR_CIRCUMFLEX_ACCENT:
03223     if ((options & PCRE_MULTILINE) != 0)
03224       {
03225       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03226       }
03227     previous = NULL;
03228     *code++ = OP_CIRC;
03229     break;
03230 
03231     case CHAR_DOLLAR_SIGN:
03232     previous = NULL;
03233     *code++ = OP_DOLL;
03234     break;
03235 
03236     /* There can never be a first char if '.' is first, whatever happens about
03237     repeats. The value of reqbyte doesn't change either. */
03238 
03239     case CHAR_DOT:
03240     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03241     zerofirstbyte = firstbyte;
03242     zeroreqbyte = reqbyte;
03243     previous = code;
03244     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
03245     break;
03246 
03247 
03248     /* ===================================================================*/
03249     /* Character classes. If the included characters are all < 256, we build a
03250     32-byte bitmap of the permitted characters, except in the special case
03251     where there is only one such character. For negated classes, we build the
03252     map as usual, then invert it at the end. However, we use a different opcode
03253     so that data characters > 255 can be handled correctly.
03254 
03255     If the class contains characters outside the 0-255 range, a different
03256     opcode is compiled. It may optionally have a bit map for characters < 256,
03257     but those above are explicitly listed afterwards. A flag byte tells
03258     whether the bitmap is present, and whether this is a negated class or not.
03259 
03260     In JavaScript compatibility mode, an isolated ']' causes an error. In
03261     default (Perl) mode, it is treated as a data character. */
03262 
03263     case CHAR_RIGHT_SQUARE_BRACKET:
03264     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
03265       {
03266       *errorcodeptr = ERR64;
03267       goto FAILED;
03268       }
03269     goto NORMAL_CHAR;
03270 
03271     case CHAR_LEFT_SQUARE_BRACKET:
03272     previous = code;
03273 
03274     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
03275     they are encountered at the top level, so we'll do that too. */
03276 
03277     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
03278          ptr[1] == CHAR_EQUALS_SIGN) &&
03279         check_posix_syntax(ptr, &tempptr))
03280       {
03281       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
03282       goto FAILED;
03283       }
03284 
03285     /* If the first character is '^', set the negation flag and skip it. Also,
03286     if the first few characters (either before or after ^) are \Q\E or \E we
03287     skip them too. This makes for compatibility with Perl. */
03288 
03289     negate_class = FALSE;
03290     for (;;)
03291       {
03292       c = *(++ptr);
03293       if (c == CHAR_BACKSLASH)
03294         {
03295         if (ptr[1] == CHAR_E)
03296           ptr++;
03297         else if (strncmp((const char *)ptr+1,
03298                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
03299           ptr += 3;
03300         else
03301           break;
03302         }
03303       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
03304         negate_class = TRUE;
03305       else break;
03306       }
03307 
03308     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
03309     an initial ']' is taken as a data character -- the code below handles
03310     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
03311     [^] must match any character, so generate OP_ALLANY. */
03312 
03313     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
03314         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
03315       {
03316       *code++ = negate_class? OP_ALLANY : OP_FAIL;
03317       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03318       zerofirstbyte = firstbyte;
03319       break;
03320       }
03321 
03322     /* If a class contains a negative special such as \S, we need to flip the
03323     negation flag at the end, so that support for characters > 255 works
03324     correctly (they are all included in the class). */
03325 
03326     should_flip_negation = FALSE;
03327 
03328     /* Keep a count of chars with values < 256 so that we can optimize the case
03329     of just a single character (as long as it's < 256). However, For higher
03330     valued UTF-8 characters, we don't yet do any optimization. */
03331 
03332     class_charcount = 0;
03333     class_lastchar = -1;
03334 
03335     /* Initialize the 32-char bit map to all zeros. We build the map in a
03336     temporary bit of memory, in case the class contains only 1 character (less
03337     than 256), because in that case the compiled code doesn't use the bit map.
03338     */
03339 
03340     memset(classbits, 0, 32 * sizeof(uschar));
03341 
03342 #ifdef SUPPORT_UTF8
03343     class_utf8 = FALSE;                       /* No chars >= 256 */
03344     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
03345     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
03346 #endif
03347 
03348     /* Process characters until ] is reached. By writing this as a "do" it
03349     means that an initial ] is taken as a data character. At the start of the
03350     loop, c contains the first byte of the character. */
03351 
03352     if (c != 0) do
03353       {
03354       const uschar *oldptr;
03355 
03356 #ifdef SUPPORT_UTF8
03357       if (utf8 && c > 127)
03358         {                           /* Braces are required because the */
03359         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
03360         }
03361 
03362       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
03363       data and reset the pointer. This is so that very large classes that
03364       contain a zillion UTF-8 characters no longer overwrite the work space
03365       (which is on the stack). */
03366 
03367       if (lengthptr != NULL)
03368         {
03369         *lengthptr += class_utf8data - class_utf8data_base;
03370         class_utf8data = class_utf8data_base;
03371         }
03372 
03373 #endif
03374 
03375       /* Inside \Q...\E everything is literal except \E */
03376 
03377       if (inescq)
03378         {
03379         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
03380           {
03381           inescq = FALSE;                   /* Reset literal state */
03382           ptr++;                            /* Skip the 'E' */
03383           continue;                         /* Carry on with next */
03384           }
03385         goto CHECK_RANGE;                   /* Could be range if \E follows */
03386         }
03387 
03388       /* Handle POSIX class names. Perl allows a negation extension of the
03389       form [:^name:]. A square bracket that doesn't match the syntax is
03390       treated as a literal. We also recognize the POSIX constructions
03391       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
03392       5.6 and 5.8 do. */
03393 
03394       if (c == CHAR_LEFT_SQUARE_BRACKET &&
03395           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
03396            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
03397         {
03398         BOOL local_negate = FALSE;
03399         int posix_class, taboffset, tabopt;
03400         register const uschar *cbits = cd->cbits;
03401         uschar pbits[32];
03402 
03403         if (ptr[1] != CHAR_COLON)
03404           {
03405           *errorcodeptr = ERR31;
03406           goto FAILED;
03407           }
03408 
03409         ptr += 2;
03410         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
03411           {
03412           local_negate = TRUE;
03413           should_flip_negation = TRUE;  /* Note negative special */
03414           ptr++;
03415           }
03416 
03417         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
03418         if (posix_class < 0)
03419           {
03420           *errorcodeptr = ERR30;
03421           goto FAILED;
03422           }
03423 
03424         /* If matching is caseless, upper and lower are converted to
03425         alpha. This relies on the fact that the class table starts with
03426         alpha, lower, upper as the first 3 entries. */
03427 
03428         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
03429           posix_class = 0;
03430 
03431         /* When PCRE_UCP is set, some of the POSIX classes are converted to
03432         different escape sequences that use Unicode properties. */
03433 
03434 #ifdef SUPPORT_UCP
03435         if ((options & PCRE_UCP) != 0)
03436           {
03437           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
03438           if (posix_substitutes[pc] != NULL)
03439             {
03440             nestptr = tempptr + 1;
03441             ptr = posix_substitutes[pc] - 1;
03442             continue;
03443             }
03444           }
03445 #endif
03446         /* In the non-UCP case, we build the bit map for the POSIX class in a
03447         chunk of local store because we may be adding and subtracting from it,
03448         and we don't want to subtract bits that may be in the main map already.
03449         At the end we or the result into the bit map that is being built. */
03450 
03451         posix_class *= 3;
03452 
03453         /* Copy in the first table (always present) */
03454 
03455         memcpy(pbits, cbits + posix_class_maps[posix_class],
03456           32 * sizeof(uschar));
03457 
03458         /* If there is a second table, add or remove it as required. */
03459 
03460         taboffset = posix_class_maps[posix_class + 1];
03461         tabopt = posix_class_maps[posix_class + 2];
03462 
03463         if (taboffset >= 0)
03464           {
03465           if (tabopt >= 0)
03466             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
03467           else
03468             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
03469           }
03470 
03471         /* Now see if we need to remove any special characters. An option
03472         value of 1 removes vertical space and 2 removes underscore. */
03473 
03474         if (tabopt < 0) tabopt = -tabopt;
03475         if (tabopt == 1) pbits[1] &= ~0x3c;
03476           else if (tabopt == 2) pbits[11] &= 0x7f;
03477 
03478         /* Add the POSIX table or its complement into the main table that is
03479         being built and we are done. */
03480 
03481         if (local_negate)
03482           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
03483         else
03484           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
03485 
03486         ptr = tempptr + 1;
03487         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
03488         continue;    /* End of POSIX syntax handling */
03489         }
03490 
03491       /* Backslash may introduce a single character, or it may introduce one
03492       of the specials, which just set a flag. The sequence \b is a special
03493       case. Inside a class (and only there) it is treated as backspace. We
03494       assume that other escapes have more than one character in them, so set
03495       class_charcount bigger than one. Unrecognized escapes fall through and
03496       are either treated as literal characters (by default), or are faulted if
03497       PCRE_EXTRA is set. */
03498 
03499       if (c == CHAR_BACKSLASH)
03500         {
03501         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
03502         if (*errorcodeptr != 0) goto FAILED;
03503 
03504         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
03505         else if (-c == ESC_Q)            /* Handle start of quoted string */
03506           {
03507           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
03508             {
03509             ptr += 2; /* avoid empty string */
03510             }
03511           else inescq = TRUE;
03512           continue;
03513           }
03514         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
03515 
03516         if (c < 0)
03517           {
03518           register const uschar *cbits = cd->cbits;
03519           class_charcount += 2;     /* Greater than 1 is what matters */
03520 
03521           switch (-c)
03522             {
03523 #ifdef SUPPORT_UCP
03524             case ESC_du:     /* These are the values given for \d etc */
03525             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
03526             case ESC_wu:     /* escape sequence with an appropriate \p */
03527             case ESC_WU:     /* or \P to test Unicode properties instead */
03528             case ESC_su:     /* of the default ASCII testing. */
03529             case ESC_SU:
03530             nestptr = ptr;
03531             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
03532             class_charcount -= 2;                /* Undo! */
03533             continue;
03534 #endif
03535             case ESC_d:
03536             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
03537             continue;
03538 
03539             case ESC_D:
03540             should_flip_negation = TRUE;
03541             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
03542             continue;
03543 
03544             case ESC_w:
03545             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
03546             continue;
03547 
03548             case ESC_W:
03549             should_flip_negation = TRUE;
03550             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
03551             continue;
03552 
03553             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
03554             if it was previously set by something earlier in the character
03555             class. */
03556 
03557             case ESC_s:
03558             classbits[0] |= cbits[cbit_space];
03559             classbits[1] |= cbits[cbit_space+1] & ~0x08;
03560             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
03561             continue;
03562 
03563             case ESC_S:
03564             should_flip_negation = TRUE;
03565             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
03566             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
03567             continue;
03568 
03569             case ESC_h:
03570             SETBIT(classbits, 0x09); /* HT */
03571             SETBIT(classbits, 0x20); /* SPACE */
03572             SETBIT(classbits, 0xa0); /* NBSP */
03573 #ifdef SUPPORT_UTF8
03574             if (utf8)
03575               {
03576               class_utf8 = TRUE;
03577               *class_utf8data++ = XCL_SINGLE;
03578               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
03579               *class_utf8data++ = XCL_SINGLE;
03580               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
03581               *class_utf8data++ = XCL_RANGE;
03582               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
03583               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
03584               *class_utf8data++ = XCL_SINGLE;
03585               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
03586               *class_utf8data++ = XCL_SINGLE;
03587               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
03588               *class_utf8data++ = XCL_SINGLE;
03589               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
03590               }
03591 #endif
03592             continue;
03593 
03594             case ESC_H:
03595             for (c = 0; c < 32; c++)
03596               {
03597               int x = 0xff;
03598               switch (c)
03599                 {
03600                 case 0x09/8: x ^= 1 << (0x09%8); break;
03601                 case 0x20/8: x ^= 1 << (0x20%8); break;
03602                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
03603                 default: break;
03604                 }
03605               classbits[c] |= x;
03606               }
03607 
03608 #ifdef SUPPORT_UTF8
03609             if (utf8)
03610               {
03611               class_utf8 = TRUE;
03612               *class_utf8data++ = XCL_RANGE;
03613               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03614               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
03615               *class_utf8data++ = XCL_RANGE;
03616               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
03617               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
03618               *class_utf8data++ = XCL_RANGE;
03619               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
03620               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
03621               *class_utf8data++ = XCL_RANGE;
03622               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
03623               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
03624               *class_utf8data++ = XCL_RANGE;
03625               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
03626               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
03627               *class_utf8data++ = XCL_RANGE;
03628               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
03629               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
03630               *class_utf8data++ = XCL_RANGE;
03631               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
03632               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03633               }
03634 #endif
03635             continue;
03636 
03637             case ESC_v:
03638             SETBIT(classbits, 0x0a); /* LF */
03639             SETBIT(classbits, 0x0b); /* VT */
03640             SETBIT(classbits, 0x0c); /* FF */
03641             SETBIT(classbits, 0x0d); /* CR */
03642             SETBIT(classbits, 0x85); /* NEL */
03643 #ifdef SUPPORT_UTF8
03644             if (utf8)
03645               {
03646               class_utf8 = TRUE;
03647               *class_utf8data++ = XCL_RANGE;
03648               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
03649               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03650               }
03651 #endif
03652             continue;
03653 
03654             case ESC_V:
03655             for (c = 0; c < 32; c++)
03656               {
03657               int x = 0xff;
03658               switch (c)
03659                 {
03660                 case 0x0a/8: x ^= 1 << (0x0a%8);
03661                              x ^= 1 << (0x0b%8);
03662                              x ^= 1 << (0x0c%8);
03663                              x ^= 1 << (0x0d%8);
03664                              break;
03665                 case 0x85/8: x ^= 1 << (0x85%8); break;
03666                 default: break;
03667                 }
03668               classbits[c] |= x;
03669               }
03670 
03671 #ifdef SUPPORT_UTF8
03672             if (utf8)
03673               {
03674               class_utf8 = TRUE;
03675               *class_utf8data++ = XCL_RANGE;
03676               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03677               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
03678               *class_utf8data++ = XCL_RANGE;
03679               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03680               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03681               }
03682 #endif
03683             continue;
03684 
03685 #ifdef SUPPORT_UCP
03686             case ESC_p:
03687             case ESC_P:
03688               {
03689               BOOL negated;
03690               int pdata;
03691               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
03692               if (ptype < 0) goto FAILED;
03693               class_utf8 = TRUE;
03694               *class_utf8data++ = ((-c == ESC_p) != negated)?
03695                 XCL_PROP : XCL_NOTPROP;
03696               *class_utf8data++ = ptype;
03697               *class_utf8data++ = pdata;
03698               class_charcount -= 2;   /* Not a < 256 character */
03699               continue;
03700               }
03701 #endif
03702             /* Unrecognized escapes are faulted if PCRE is running in its
03703             strict mode. By default, for compatibility with Perl, they are
03704             treated as literals. */
03705 
03706             default:
03707             if ((options & PCRE_EXTRA) != 0)
03708               {
03709               *errorcodeptr = ERR7;
03710               goto FAILED;
03711               }
03712             class_charcount -= 2;  /* Undo the default count from above */
03713             c = *ptr;              /* Get the final character and fall through */
03714             break;
03715             }
03716           }
03717 
03718         /* Fall through if we have a single character (c >= 0). This may be
03719         greater than 256 in UTF-8 mode. */
03720 
03721         }   /* End of backslash handling */
03722 
03723       /* A single character may be followed by '-' to form a range. However,
03724       Perl does not permit ']' to be the end of the range. A '-' character
03725       at the end is treated as a literal. Perl ignores orphaned \E sequences
03726       entirely. The code for handling \Q and \E is messy. */
03727 
03728       CHECK_RANGE:
03729       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
03730         {
03731         inescq = FALSE;
03732         ptr += 2;
03733         }
03734 
03735       oldptr = ptr;
03736 
03737       /* Remember \r or \n */
03738 
03739       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
03740 
03741       /* Check for range */
03742 
03743       if (!inescq && ptr[1] == CHAR_MINUS)
03744         {
03745         int d;
03746         ptr += 2;
03747         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
03748 
03749         /* If we hit \Q (not followed by \E) at this point, go into escaped
03750         mode. */
03751 
03752         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
03753           {
03754           ptr += 2;
03755           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
03756             { ptr += 2; continue; }
03757           inescq = TRUE;
03758           break;
03759           }
03760 
03761         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
03762           {
03763           ptr = oldptr;
03764           goto LONE_SINGLE_CHARACTER;
03765           }
03766 
03767 #ifdef SUPPORT_UTF8
03768         if (utf8)
03769           {                           /* Braces are required because the */
03770           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
03771           }
03772         else
03773 #endif
03774         d = *ptr;  /* Not UTF-8 mode */
03775 
03776         /* The second part of a range can be a single-character escape, but
03777         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
03778         in such circumstances. */
03779 
03780         if (!inescq && d == CHAR_BACKSLASH)
03781           {
03782           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
03783           if (*errorcodeptr != 0) goto FAILED;
03784 
03785           /* \b is backspace; any other special means the '-' was literal */
03786 
03787           if (d < 0)
03788             {
03789             if (d == -ESC_b) d = CHAR_BS; else
03790               {
03791               ptr = oldptr;
03792               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03793               }
03794             }
03795           }
03796 
03797         /* Check that the two values are in the correct order. Optimize
03798         one-character ranges */
03799 
03800         if (d < c)
03801           {
03802           *errorcodeptr = ERR8;
03803           goto FAILED;
03804           }
03805 
03806         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03807 
03808         /* Remember \r or \n */
03809 
03810         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
03811 
03812         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
03813         matching, we have to use an XCLASS with extra data items. Caseless
03814         matching for characters > 127 is available only if UCP support is
03815         available. */
03816 
03817 #ifdef SUPPORT_UTF8
03818         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
03819           {
03820           class_utf8 = TRUE;
03821 
03822           /* With UCP support, we can find the other case equivalents of
03823           the relevant characters. There may be several ranges. Optimize how
03824           they fit with the basic range. */
03825 
03826 #ifdef SUPPORT_UCP
03827           if ((options & PCRE_CASELESS) != 0)
03828             {
03829             unsigned int occ, ocd;
03830             unsigned int cc = c;
03831             unsigned int origd = d;
03832             while (get_othercase_range(&cc, origd, &occ, &ocd))
03833               {
03834               if (occ >= (unsigned int)c &&
03835                   ocd <= (unsigned int)d)
03836                 continue;                          /* Skip embedded ranges */
03837 
03838               if (occ < (unsigned int)c  &&
03839                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
03840                 {                                  /* if there is overlap,   */
03841                 c = occ;                           /* noting that if occ < c */
03842                 continue;                          /* we can't have ocd > d  */
03843                 }                                  /* because a subrange is  */
03844               if (ocd > (unsigned int)d &&
03845                   occ <= (unsigned int)d + 1)      /* always shorter than    */
03846                 {                                  /* the basic range.       */
03847                 d = ocd;
03848                 continue;
03849                 }
03850 
03851               if (occ == ocd)
03852                 {
03853                 *class_utf8data++ = XCL_SINGLE;
03854                 }
03855               else
03856                 {
03857                 *class_utf8data++ = XCL_RANGE;
03858                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
03859                 }
03860               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
03861               }
03862             }
03863 #endif  /* SUPPORT_UCP */
03864 
03865           /* Now record the original range, possibly modified for UCP caseless
03866           overlapping ranges. */
03867 
03868           *class_utf8data++ = XCL_RANGE;
03869           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03870           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
03871 
03872           /* With UCP support, we are done. Without UCP support, there is no
03873           caseless matching for UTF-8 characters > 127; we can use the bit map
03874           for the smaller ones. */
03875 
03876 #ifdef SUPPORT_UCP
03877           continue;    /* With next character in the class */
03878 #else
03879           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
03880 
03881           /* Adjust upper limit and fall through to set up the map */
03882 
03883           d = 127;
03884 
03885 #endif  /* SUPPORT_UCP */
03886           }
03887 #endif  /* SUPPORT_UTF8 */
03888 
03889         /* We use the bit map for all cases when not in UTF-8 mode; else
03890         ranges that lie entirely within 0-127 when there is UCP support; else
03891         for partial ranges without UCP support. */
03892 
03893         class_charcount += d - c + 1;
03894         class_lastchar = d;
03895 
03896         /* We can save a bit of time by skipping this in the pre-compile. */
03897 
03898         if (lengthptr == NULL) for (; c <= d; c++)
03899           {
03900           classbits[c/8] |= (1 << (c&7));
03901           if ((options & PCRE_CASELESS) != 0)
03902             {
03903             int uc = cd->fcc[c];           /* flip case */
03904             classbits[uc/8] |= (1 << (uc&7));
03905             }
03906           }
03907 
03908         continue;   /* Go get the next char in the class */
03909         }
03910 
03911       /* Handle a lone single character - we can get here for a normal
03912       non-escape char, or after \ that introduces a single character or for an
03913       apparent range that isn't. */
03914 
03915       LONE_SINGLE_CHARACTER:
03916 
03917       /* Handle a character that cannot go in the bit map */
03918 
03919 #ifdef SUPPORT_UTF8
03920       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
03921         {
03922         class_utf8 = TRUE;
03923         *class_utf8data++ = XCL_SINGLE;
03924         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03925 
03926 #ifdef SUPPORT_UCP
03927         if ((options & PCRE_CASELESS) != 0)
03928           {
03929           unsigned int othercase;
03930           if ((othercase = UCD_OTHERCASE(c)) != c)
03931             {
03932             *class_utf8data++ = XCL_SINGLE;
03933             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
03934             }
03935           }
03936 #endif  /* SUPPORT_UCP */
03937 
03938         }
03939       else
03940 #endif  /* SUPPORT_UTF8 */
03941 
03942       /* Handle a single-byte character */
03943         {
03944         classbits[c/8] |= (1 << (c&7));
03945         if ((options & PCRE_CASELESS) != 0)
03946           {
03947           c = cd->fcc[c];   /* flip case */
03948           classbits[c/8] |= (1 << (c&7));
03949           }
03950         class_charcount++;
03951         class_lastchar = c;
03952         }
03953       }
03954 
03955     /* Loop until ']' reached. This "while" is the end of the "do" far above.
03956     If we are at the end of an internal nested string, revert to the outer
03957     string. */
03958 
03959     while (((c = *(++ptr)) != 0 ||
03960            (nestptr != NULL &&
03961              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
03962            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
03963 
03964     /* Check for missing terminating ']' */
03965 
03966     if (c == 0)
03967       {
03968       *errorcodeptr = ERR6;
03969       goto FAILED;
03970       }
03971 
03972     /* If class_charcount is 1, we saw precisely one character whose value is
03973     less than 256. As long as there were no characters >= 128 and there was no
03974     use of \p or \P, in other words, no use of any XCLASS features, we can
03975     optimize.
03976 
03977     In UTF-8 mode, we can optimize the negative case only if there were no
03978     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
03979     operate on single-bytes only. This is an historical hangover. Maybe one day
03980     we can tidy these opcodes to handle multi-byte characters.
03981 
03982     The optimization throws away the bit map. We turn the item into a
03983     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
03984     that OP_NOT does not support multibyte characters. In the positive case, it
03985     can cause firstbyte to be set. Otherwise, there can be no first char if
03986     this item is first, whatever repeat count may follow. In the case of
03987     reqbyte, save the previous value for reinstating. */
03988 
03989 #ifdef SUPPORT_UTF8
03990     if (class_charcount == 1 && !class_utf8 &&
03991       (!utf8 || !negate_class || class_lastchar < 128))
03992 #else
03993     if (class_charcount == 1)
03994 #endif
03995       {
03996       zeroreqbyte = reqbyte;
03997 
03998       /* The OP_NOT opcode works on one-byte characters only. */
03999 
04000       if (negate_class)
04001         {
04002         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
04003         zerofirstbyte = firstbyte;
04004         *code++ = OP_NOT;
04005         *code++ = class_lastchar;
04006         break;
04007         }
04008 
04009       /* For a single, positive character, get the value into mcbuffer, and
04010       then we can handle this with the normal one-character code. */
04011 
04012 #ifdef SUPPORT_UTF8
04013       if (utf8 && class_lastchar > 127)
04014         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
04015       else
04016 #endif
04017         {
04018         mcbuffer[0] = class_lastchar;
04019         mclength = 1;
04020         }
04021       goto ONE_CHAR;
04022       }       /* End of 1-char optimization */
04023 
04024     /* The general case - not the one-char optimization. If this is the first
04025     thing in the branch, there can be no first char setting, whatever the
04026     repeat count. Any reqbyte setting must remain unchanged after any kind of
04027     repeat. */
04028 
04029     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
04030     zerofirstbyte = firstbyte;
04031     zeroreqbyte = reqbyte;
04032 
04033     /* If there are characters with values > 255, we have to compile an
04034     extended class, with its own opcode, unless there was a negated special
04035     such as \S in the class, and PCRE_UCP is not set, because in that case all
04036     characters > 255 are in the class, so any that were explicitly given as
04037     well can be ignored. If (when there are explicit characters > 255 that must
04038     be listed) there are no characters < 256, we can omit the bitmap in the
04039     actual compiled code. */
04040 
04041 #ifdef SUPPORT_UTF8
04042     if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
04043       {
04044       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
04045       *code++ = OP_XCLASS;
04046       code += LINK_SIZE;
04047       *code = negate_class? XCL_NOT : 0;
04048 
04049       /* If the map is required, move up the extra data to make room for it;
04050       otherwise just move the code pointer to the end of the extra data. */
04051 
04052       if (class_charcount > 0)
04053         {
04054         *code++ |= XCL_MAP;
04055         memmove(code + 32, code, class_utf8data - code);
04056         memcpy(code, classbits, 32);
04057         code = class_utf8data + 32;
04058         }
04059       else code = class_utf8data;
04060 
04061       /* Now fill in the complete length of the item */
04062 
04063       PUT(previous, 1, code - previous);
04064       break;   /* End of class handling */
04065       }
04066 #endif
04067 
04068     /* If there are no characters > 255, or they are all to be included or
04069     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
04070     whole class was negated and whether there were negative specials such as \S
04071     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
04072     negating it if necessary. */
04073 
04074     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
04075     if (negate_class)
04076       {
04077       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
04078         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
04079       }
04080     else
04081       {
04082       memcpy(code, classbits, 32);
04083       }
04084     code += 32;
04085     break;
04086 
04087 
04088     /* ===================================================================*/
04089     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
04090     has been tested above. */
04091 
04092     case CHAR_LEFT_CURLY_BRACKET:
04093     if (!is_quantifier) goto NORMAL_CHAR;
04094     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
04095     if (*errorcodeptr != 0) goto FAILED;
04096     goto REPEAT;
04097 
04098     case CHAR_ASTERISK:
04099     repeat_min = 0;
04100     repeat_max = -1;
04101     goto REPEAT;
04102 
04103     case CHAR_PLUS:
04104     repeat_min = 1;
04105     repeat_max = -1;
04106     goto REPEAT;
04107 
04108     case CHAR_QUESTION_MARK:
04109     repeat_min = 0;
04110     repeat_max = 1;
04111 
04112     REPEAT:
04113     if (previous == NULL)
04114       {
04115       *errorcodeptr = ERR9;
04116       goto FAILED;
04117       }
04118 
04119     if (repeat_min == 0)
04120       {
04121       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
04122       reqbyte = zeroreqbyte;        /* Ditto */
04123       }
04124 
04125     /* Remember whether this is a variable length repeat */
04126 
04127     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
04128 
04129     op_type = 0;                    /* Default single-char op codes */
04130     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
04131 
04132     /* Save start of previous item, in case we have to move it up to make space
04133     for an inserted OP_ONCE for the additional '+' extension. */
04134 
04135     tempcode = previous;
04136 
04137     /* If the next character is '+', we have a possessive quantifier. This
04138     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
04139     If the next character is '?' this is a minimizing repeat, by default,
04140     but if PCRE_UNGREEDY is set, it works the other way round. We change the
04141     repeat type to the non-default. */
04142 
04143     if (ptr[1] == CHAR_PLUS)
04144       {
04145       repeat_type = 0;                  /* Force greedy */
04146       possessive_quantifier = TRUE;
04147       ptr++;
04148       }
04149     else if (ptr[1] == CHAR_QUESTION_MARK)
04150       {
04151       repeat_type = greedy_non_default;
04152       ptr++;
04153       }
04154     else repeat_type = greedy_default;
04155 
04156     /* If previous was a character match, abolish the item and generate a
04157     repeat item instead. If a char item has a minimum of more than one, ensure
04158     that it is set in reqbyte - it might not be if a sequence such as x{3} is
04159     the first thing in a branch because the x will have gone into firstbyte
04160     instead.  */
04161 
04162     if (*previous == OP_CHAR || *previous == OP_CHARNC)
04163       {
04164       /* Deal with UTF-8 characters that take up more than one byte. It's
04165       easier to write this out separately than try to macrify it. Use c to
04166       hold the length of the character in bytes, plus 0x80 to flag that it's a
04167       length rather than a small character. */
04168 
04169 #ifdef SUPPORT_UTF8
04170       if (utf8 && (code[-1] & 0x80) != 0)
04171         {
04172         uschar *lastchar = code - 1;
04173         while((*lastchar & 0xc0) == 0x80) lastchar--;
04174         c = code - lastchar;            /* Length of UTF-8 character */
04175         memcpy(utf8_char, lastchar, c); /* Save the char */
04176         c |= 0x80;                      /* Flag c as a length */
04177         }
04178       else
04179 #endif
04180 
04181       /* Handle the case of a single byte - either with no UTF8 support, or
04182       with UTF-8 disabled, or for a UTF-8 character < 128. */
04183 
04184         {
04185         c = code[-1];
04186         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
04187         }
04188 
04189       /* If the repetition is unlimited, it pays to see if the next thing on
04190       the line is something that cannot possibly match this character. If so,
04191       automatically possessifying this item gains some performance in the case
04192       where the match fails. */
04193 
04194       if (!possessive_quantifier &&
04195           repeat_max < 0 &&
04196           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
04197         {
04198         repeat_type = 0;    /* Force greedy */
04199         possessive_quantifier = TRUE;
04200         }
04201 
04202       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
04203       }
04204 
04205     /* If previous was a single negated character ([^a] or similar), we use
04206     one of the special opcodes, replacing it. The code is shared with single-
04207     character repeats by setting op_type to add a suitable offset into
04208     repeat_type. We can also test for auto-possessification. OP_NOT is
04209     currently used only for single-byte chars. */
04210 
04211     else if (*previous == OP_NOT)
04212       {
04213       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
04214       c = previous[1];
04215       if (!possessive_quantifier &&
04216           repeat_max < 0 &&
04217           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
04218         {
04219         repeat_type = 0;    /* Force greedy */
04220         possessive_quantifier = TRUE;
04221         }
04222       goto OUTPUT_SINGLE_REPEAT;
04223       }
04224 
04225     /* If previous was a character type match (\d or similar), abolish it and
04226     create a suitable repeat item. The code is shared with single-character
04227     repeats by setting op_type to add a suitable offset into repeat_type. Note
04228     that the Unicode property types will be present only when SUPPORT_UCP is
04229     defined, but we don't wrap the little bits of code here because it just
04230     makes it horribly messy. */
04231 
04232     else if (*previous < OP_EODN)
04233       {
04234       uschar *oldcode;
04235       int prop_type, prop_value;
04236       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
04237       c = *previous;
04238 
04239       if (!possessive_quantifier &&
04240           repeat_max < 0 &&
04241           check_auto_possessive(previous, utf8, ptr + 1, options, cd))
04242         {
04243         repeat_type = 0;    /* Force greedy */
04244         possessive_quantifier = TRUE;
04245         }
04246 
04247       OUTPUT_SINGLE_REPEAT:
04248       if (*previous == OP_PROP || *previous == OP_NOTPROP)
04249         {
04250         prop_type = previous[1];
04251         prop_value = previous[2];
04252         }
04253       else prop_type = prop_value = -1;
04254 
04255       oldcode = code;
04256       code = previous;                  /* Usually overwrite previous item */
04257 
04258       /* If the maximum is zero then the minimum must also be zero; Perl allows
04259       this case, so we do too - by simply omitting the item altogether. */
04260 
04261       if (repeat_max == 0) goto END_REPEAT;
04262 
04263       /*--------------------------------------------------------------------*/
04264       /* This code is obsolete from release 8.00; the restriction was finally
04265       removed: */
04266 
04267       /* All real repeats make it impossible to handle partial matching (maybe
04268       one day we will be able to remove this restriction). */
04269 
04270       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
04271       /*--------------------------------------------------------------------*/
04272 
04273       /* Combine the op_type with the repeat_type */
04274 
04275       repeat_type += op_type;
04276 
04277       /* A minimum of zero is handled either as the special case * or ?, or as
04278       an UPTO, with the maximum given. */
04279 
04280       if (repeat_min == 0)
04281         {
04282         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
04283           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
04284         else
04285           {
04286           *code++ = OP_UPTO + repeat_type;
04287           PUT2INC(code, 0, repeat_max);
04288           }
04289         }
04290 
04291       /* A repeat minimum of 1 is optimized into some special cases. If the
04292       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
04293       left in place and, if the maximum is greater than 1, we use OP_UPTO with
04294       one less than the maximum. */
04295 
04296       else if (repeat_min == 1)
04297         {
04298         if (repeat_max == -1)
04299           *code++ = OP_PLUS + repeat_type;
04300         else
04301           {
04302           code = oldcode;                 /* leave previous item in place */
04303           if (repeat_max == 1) goto END_REPEAT;
04304           *code++ = OP_UPTO + repeat_type;
04305           PUT2INC(code, 0, repeat_max - 1);
04306           }
04307         }
04308 
04309       /* The case {n,n} is just an EXACT, while the general case {n,m} is
04310       handled as an EXACT followed by an UPTO. */
04311 
04312       else
04313         {
04314         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
04315         PUT2INC(code, 0, repeat_min);
04316 
04317         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
04318         we have to insert the character for the previous code. For a repeated
04319         Unicode property match, there are two extra bytes that define the
04320         required property. In UTF-8 mode, long characters have their length in
04321         c, with the 0x80 bit as a flag. */
04322 
04323         if (repeat_max < 0)
04324           {
04325 #ifdef SUPPORT_UTF8
04326           if (utf8 && c >= 128)
04327             {
04328             memcpy(code, utf8_char, c & 7);
04329             code += c & 7;
04330             }
04331           else
04332 #endif
04333             {
04334             *code++ = c;
04335             if (prop_type >= 0)
04336               {
04337               *code++ = prop_type;
04338               *code++ = prop_value;
04339               }
04340             }
04341           *code++ = OP_STAR + repeat_type;
04342           }
04343 
04344         /* Else insert an UPTO if the max is greater than the min, again
04345         preceded by the character, for the previously inserted code. If the
04346         UPTO is just for 1 instance, we can use QUERY instead. */
04347 
04348         else if (repeat_max != repeat_min)
04349           {
04350 #ifdef SUPPORT_UTF8
04351           if (utf8 && c >= 128)
04352             {
04353             memcpy(code, utf8_char, c & 7);
04354             code += c & 7;
04355             }
04356           else
04357 #endif
04358           *code++ = c;
04359           if (prop_type >= 0)
04360             {
04361             *code++ = prop_type;
04362             *code++ = prop_value;
04363             }
04364           repeat_max -= repeat_min;
04365 
04366           if (repeat_max == 1)
04367             {
04368             *code++ = OP_QUERY + repeat_type;
04369             }
04370           else
04371             {
04372             *code++ = OP_UPTO + repeat_type;
04373             PUT2INC(code, 0, repeat_max);
04374             }
04375           }
04376         }
04377 
04378       /* The character or character type itself comes last in all cases. */
04379 
04380 #ifdef SUPPORT_UTF8
04381       if (utf8 && c >= 128)
04382         {
04383         memcpy(code, utf8_char, c & 7);
04384         code += c & 7;
04385         }
04386       else
04387 #endif
04388       *code++ = c;
04389 
04390       /* For a repeated Unicode property match, there are two extra bytes that
04391       define the required property. */
04392 
04393 #ifdef SUPPORT_UCP
04394       if (prop_type >= 0)
04395         {
04396         *code++ = prop_type;
04397         *code++ = prop_value;
04398         }
04399 #endif
04400       }
04401 
04402     /* If previous was a character class or a back reference, we put the repeat
04403     stuff after it, but just skip the item if the repeat was {0,0}. */
04404 
04405     else if (*previous == OP_CLASS ||
04406              *previous == OP_NCLASS ||
04407 #ifdef SUPPORT_UTF8
04408              *previous == OP_XCLASS ||
04409 #endif
04410              *previous == OP_REF)
04411       {
04412       if (repeat_max == 0)
04413         {
04414         code = previous;
04415         goto END_REPEAT;
04416         }
04417 
04418       /*--------------------------------------------------------------------*/
04419       /* This code is obsolete from release 8.00; the restriction was finally
04420       removed: */
04421 
04422       /* All real repeats make it impossible to handle partial matching (maybe
04423       one day we will be able to remove this restriction). */
04424 
04425       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
04426       /*--------------------------------------------------------------------*/
04427 
04428       if (repeat_min == 0 && repeat_max == -1)
04429         *code++ = OP_CRSTAR + repeat_type;
04430       else if (repeat_min == 1 && repeat_max == -1)
04431         *code++ = OP_CRPLUS + repeat_type;
04432       else if (repeat_min == 0 && repeat_max == 1)
04433         *code++ = OP_CRQUERY + repeat_type;
04434       else
04435         {
04436         *code++ = OP_CRRANGE + repeat_type;
04437         PUT2INC(code, 0, repeat_min);
04438         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
04439         PUT2INC(code, 0, repeat_max);
04440         }
04441       }
04442 
04443     /* If previous was a bracket group, we may have to replicate it in certain
04444     cases. */
04445 
04446     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
04447              *previous == OP_ONCE || *previous == OP_COND)
04448       {
04449       register int i;
04450       int ketoffset = 0;
04451       int len = (int)(code - previous);
04452       uschar *bralink = NULL;
04453 
04454       /* Repeating a DEFINE group is pointless */
04455 
04456       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
04457         {
04458         *errorcodeptr = ERR55;
04459         goto FAILED;
04460         }
04461 
04462       /* If the maximum repeat count is unlimited, find the end of the bracket
04463       by scanning through from the start, and compute the offset back to it
04464       from the current code pointer. There may be an OP_OPT setting following
04465       the final KET, so we can't find the end just by going back from the code
04466       pointer. */
04467 
04468       if (repeat_max == -1)
04469         {
04470         register uschar *ket = previous;
04471         do ket += GET(ket, 1); while (*ket != OP_KET);
04472         ketoffset = (int)(code - ket);
04473         }
04474 
04475       /* The case of a zero minimum is special because of the need to stick
04476       OP_BRAZERO in front of it, and because the group appears once in the
04477       data, whereas in other cases it appears the minimum number of times. For
04478       this reason, it is simplest to treat this case separately, as otherwise
04479       the code gets far too messy. There are several special subcases when the
04480       minimum is zero. */
04481 
04482       if (repeat_min == 0)
04483         {
04484         /* If the maximum is also zero, we used to just omit the group from the
04485         output altogether, like this:
04486 
04487         ** if (repeat_max == 0)
04488         **   {
04489         **   code = previous;
04490         **   goto END_REPEAT;
04491         **   }
04492 
04493         However, that fails when a group is referenced as a subroutine from
04494         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
04495         so that it is skipped on execution. As we don't have a list of which
04496         groups are referenced, we cannot do this selectively.
04497 
04498         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
04499         and do no more at this point. However, we do need to adjust any
04500         OP_RECURSE calls inside the group that refer to the group itself or any
04501         internal or forward referenced group, because the offset is from the
04502         start of the whole regex. Temporarily terminate the pattern while doing
04503         this. */
04504 
04505         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
04506           {
04507           *code = OP_END;
04508           adjust_recurse(previous, 1, utf8, cd, save_hwm);
04509           memmove(previous+1, previous, len);
04510           code++;
04511           if (repeat_max == 0)
04512             {
04513             *previous++ = OP_SKIPZERO;
04514             goto END_REPEAT;
04515             }
04516           *previous++ = OP_BRAZERO + repeat_type;
04517           }
04518 
04519         /* If the maximum is greater than 1 and limited, we have to replicate
04520         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
04521         The first one has to be handled carefully because it's the original
04522         copy, which has to be moved up. The remainder can be handled by code
04523         that is common with the non-zero minimum case below. We have to
04524         adjust the value of repeat_max, since one less copy is required. Once
04525         again, we may have to adjust any OP_RECURSE calls inside the group. */
04526 
04527         else
04528           {
04529           int offset;
04530           *code = OP_END;
04531           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
04532           memmove(previous + 2 + LINK_SIZE, previous, len);
04533           code += 2 + LINK_SIZE;
04534           *previous++ = OP_BRAZERO + repeat_type;
04535           *previous++ = OP_BRA;
04536 
04537           /* We chain together the bracket offset fields that have to be
04538           filled in later when the ends of the brackets are reached. */
04539 
04540           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
04541           bralink = previous;
04542           PUTINC(previous, 0, offset);
04543           }
04544 
04545         repeat_max--;
04546         }
04547 
04548       /* If the minimum is greater than zero, replicate the group as many
04549       times as necessary, and adjust the maximum to the number of subsequent
04550       copies that we need. If we set a first char from the group, and didn't
04551       set a required char, copy the latter from the former. If there are any
04552       forward reference subroutine calls in the group, there will be entries on
04553       the workspace list; replicate these with an appropriate increment. */
04554 
04555       else
04556         {
04557         if (repeat_min > 1)
04558           {
04559           /* In the pre-compile phase, we don't actually do the replication. We
04560           just adjust the length as if we had. Do some paranoid checks for
04561           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
04562           integer type when available, otherwise double. */
04563 
04564           if (lengthptr != NULL)
04565             {
04566             int delta = (repeat_min - 1)*length_prevgroup;
04567             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
04568                   (INT64_OR_DOUBLE)length_prevgroup >
04569                     (INT64_OR_DOUBLE)INT_MAX ||
04570                 OFLOW_MAX - *lengthptr < delta)
04571               {
04572               *errorcodeptr = ERR20;
04573               goto FAILED;
04574               }
04575             *lengthptr += delta;
04576             }
04577 
04578           /* This is compiling for real */
04579 
04580           else
04581             {
04582             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
04583             for (i = 1; i < repeat_min; i++)
04584               {
04585               uschar *hc;
04586               uschar *this_hwm = cd->hwm;
04587               memcpy(code, previous, len);
04588               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04589                 {
04590                 PUT(cd->hwm, 0, GET(hc, 0) + len);
04591                 cd->hwm += LINK_SIZE;
04592                 }
04593               save_hwm = this_hwm;
04594               code += len;
04595               }
04596             }
04597           }
04598 
04599         if (repeat_max > 0) repeat_max -= repeat_min;
04600         }
04601 
04602       /* This code is common to both the zero and non-zero minimum cases. If
04603       the maximum is limited, it replicates the group in a nested fashion,
04604       remembering the bracket starts on a stack. In the case of a zero minimum,
04605       the first one was set up above. In all cases the repeat_max now specifies
04606       the number of additional copies needed. Again, we must remember to
04607       replicate entries on the forward reference list. */
04608 
04609       if (repeat_max >= 0)
04610         {
04611         /* In the pre-compile phase, we don't actually do the replication. We
04612         just adjust the length as if we had. For each repetition we must add 1
04613         to the length for BRAZERO and for all but the last repetition we must
04614         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
04615         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
04616         a 64-bit integer type when available, otherwise double. */
04617 
04618         if (lengthptr != NULL && repeat_max > 0)
04619           {
04620           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
04621                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
04622           if ((INT64_OR_DOUBLE)repeat_max *
04623                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
04624                   > (INT64_OR_DOUBLE)INT_MAX ||
04625               OFLOW_MAX - *lengthptr < delta)
04626             {
04627             *errorcodeptr = ERR20;
04628             goto FAILED;
04629             }
04630           *lengthptr += delta;
04631           }
04632 
04633         /* This is compiling for real */
04634 
04635         else for (i = repeat_max - 1; i >= 0; i--)
04636           {
04637           uschar *hc;
04638           uschar *this_hwm = cd->hwm;
04639 
04640           *code++ = OP_BRAZERO + repeat_type;
04641 
04642           /* All but the final copy start a new nesting, maintaining the
04643           chain of brackets outstanding. */
04644 
04645           if (i != 0)
04646             {
04647             int offset;
04648             *code++ = OP_BRA;
04649             offset = (bralink == NULL)? 0 : (int)(code - bralink);
04650             bralink = code;
04651             PUTINC(code, 0, offset);
04652             }
04653 
04654           memcpy(code, previous, len);
04655           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04656             {
04657             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
04658             cd->hwm += LINK_SIZE;
04659             }
04660           save_hwm = this_hwm;
04661           code += len;
04662           }
04663 
04664         /* Now chain through the pending brackets, and fill in their length
04665         fields (which are holding the chain links pro tem). */
04666 
04667         while (bralink != NULL)
04668           {
04669           int oldlinkoffset;
04670           int offset = (int)(code - bralink + 1);
04671           uschar *bra = code - offset;
04672           oldlinkoffset = GET(bra, 1);
04673           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
04674           *code++ = OP_KET;
04675           PUTINC(code, 0, offset);
04676           PUT(bra, 1, offset);
04677           }
04678         }
04679 
04680       /* If the maximum is unlimited, set a repeater in the final copy. We
04681       can't just offset backwards from the current code point, because we
04682       don't know if there's been an options resetting after the ket. The
04683       correct offset was computed above.
04684 
04685       Then, when we are doing the actual compile phase, check to see whether
04686       this group is a non-atomic one that could match an empty string. If so,
04687       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
04688       that runtime checking can be done. [This check is also applied to
04689       atomic groups at runtime, but in a different way.] */
04690 
04691       else
04692         {
04693         uschar *ketcode = code - ketoffset;
04694         uschar *bracode = ketcode - GET(ketcode, 1);
04695         *ketcode = OP_KETRMAX + repeat_type;
04696         if (lengthptr == NULL && *bracode != OP_ONCE)
04697           {
04698           uschar *scode = bracode;
04699           do
04700             {
04701             if (could_be_empty_branch(scode, ketcode, utf8, cd))
04702               {
04703               *bracode += OP_SBRA - OP_BRA;
04704               break;
04705               }
04706             scode += GET(scode, 1);
04707             }
04708           while (*scode == OP_ALT);
04709           }
04710         }
04711       }
04712 
04713     /* If previous is OP_FAIL, it was generated by an empty class [] in
04714     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
04715     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
04716     error above. We can just ignore the repeat in JS case. */
04717 
04718     else if (*previous == OP_FAIL) goto END_REPEAT;
04719 
04720     /* Else there's some kind of shambles */
04721 
04722     else
04723       {
04724       *errorcodeptr = ERR11;
04725       goto FAILED;
04726       }
04727 
04728     /* If the character following a repeat is '+', or if certain optimization
04729     tests above succeeded, possessive_quantifier is TRUE. For some of the
04730     simpler opcodes, there is a special alternative opcode for this. For
04731     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
04732     The '+' notation is just syntactic sugar, taken from Sun's Java package,
04733     but the special opcodes can optimize it a bit. The repeated item starts at
04734     tempcode, not at previous, which might be the first part of a string whose
04735     (former) last char we repeated.
04736 
04737     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
04738     an 'upto' may follow. We skip over an 'exact' item, and then test the
04739     length of what remains before proceeding. */
04740 
04741     if (possessive_quantifier)
04742       {
04743       int len;
04744 
04745       if (*tempcode == OP_TYPEEXACT)
04746         tempcode += _pcre_OP_lengths[*tempcode] +
04747           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
04748 
04749       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
04750         {
04751         tempcode += _pcre_OP_lengths[*tempcode];
04752 #ifdef SUPPORT_UTF8
04753         if (utf8 && tempcode[-1] >= 0xc0)
04754           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
04755 #endif
04756         }
04757 
04758       len = (int)(code - tempcode);
04759       if (len > 0) switch (*tempcode)
04760         {
04761         case OP_STAR:  *tempcode = OP_POSSTAR; break;
04762         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
04763         case OP_QUERY: *tempcode = OP_POSQUERY; break;
04764         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
04765 
04766         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
04767         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
04768         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
04769         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
04770 
04771         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
04772         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
04773         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
04774         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
04775 
04776         /* Because we are moving code along, we must ensure that any
04777         pending recursive references are updated. */
04778 
04779         default:
04780         *code = OP_END;
04781         adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
04782         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
04783         code += 1 + LINK_SIZE;
04784         len += 1 + LINK_SIZE;
04785         tempcode[0] = OP_ONCE;
04786         *code++ = OP_KET;
04787         PUTINC(code, 0, len);
04788         PUT(tempcode, 1, len);
04789         break;
04790         }
04791       }
04792 
04793     /* In all cases we no longer have a previous item. We also set the
04794     "follows varying string" flag for subsequently encountered reqbytes if
04795     it isn't already set and we have just passed a varying length item. */
04796 
04797     END_REPEAT:
04798     previous = NULL;
04799     cd->req_varyopt |= reqvary;
04800     break;
04801 
04802 
04803     /* ===================================================================*/
04804     /* Start of nested parenthesized sub-expression, or comment or lookahead or
04805     lookbehind or option setting or condition or all the other extended
04806     parenthesis forms.  */
04807 
04808     case CHAR_LEFT_PARENTHESIS:
04809     newoptions = options;
04810     skipbytes = 0;
04811     bravalue = OP_CBRA;
04812     save_hwm = cd->hwm;
04813     reset_bracount = FALSE;
04814 
04815     /* First deal with various "verbs" that can be introduced by '*'. */
04816 
04817     if (*(++ptr) == CHAR_ASTERISK &&
04818          ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
04819       {
04820       int i, namelen;
04821       int arglen = 0;
04822       const char *vn = verbnames;
04823       const uschar *name = ptr + 1;
04824       const uschar *arg = NULL;
04825       previous = NULL;
04826       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
04827       namelen = (int)(ptr - name);
04828 
04829       if (*ptr == CHAR_COLON)
04830         {
04831         arg = ++ptr;
04832         while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
04833           || *ptr == '_') ptr++;
04834         arglen = (int)(ptr - arg);
04835         }
04836 
04837       if (*ptr != CHAR_RIGHT_PARENTHESIS)
04838         {
04839         *errorcodeptr = ERR60;
04840         goto FAILED;
04841         }
04842 
04843       /* Scan the table of verb names */
04844 
04845       for (i = 0; i < verbcount; i++)
04846         {
04847         if (namelen == verbs[i].len &&
04848             strncmp((char *)name, vn, namelen) == 0)
04849           {
04850           /* Check for open captures before ACCEPT */
04851 
04852           if (verbs[i].op == OP_ACCEPT)
04853             {
04854             open_capitem *oc;
04855             cd->had_accept = TRUE;
04856             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
04857               {
04858               *code++ = OP_CLOSE;
04859               PUT2INC(code, 0, oc->number);
04860               }
04861             }
04862 
04863           /* Handle the cases with/without an argument */
04864 
04865           if (arglen == 0)
04866             {
04867             if (verbs[i].op < 0)   /* Argument is mandatory */
04868               {
04869               *errorcodeptr = ERR66;
04870               goto FAILED;
04871               }
04872             *code = verbs[i].op;
04873             if (*code++ == OP_THEN)
04874               {
04875               PUT(code, 0, code - bcptr->current_branch - 1);
04876               code += LINK_SIZE;
04877               }
04878             }
04879 
04880           else
04881             {
04882             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
04883               {
04884               *errorcodeptr = ERR59;
04885               goto FAILED;
04886               }
04887             *code = verbs[i].op_arg;
04888             if (*code++ == OP_THEN_ARG)
04889               {
04890               PUT(code, 0, code - bcptr->current_branch - 1);
04891               code += LINK_SIZE;
04892               }
04893             *code++ = arglen;
04894             memcpy(code, arg, arglen);
04895             code += arglen;
04896             *code++ = 0;
04897             }
04898 
04899           break;  /* Found verb, exit loop */
04900           }
04901 
04902         vn += verbs[i].len + 1;
04903         }
04904 
04905       if (i < verbcount) continue;    /* Successfully handled a verb */
04906       *errorcodeptr = ERR60;          /* Verb not recognized */
04907       goto FAILED;
04908       }
04909 
04910     /* Deal with the extended parentheses; all are introduced by '?', and the
04911     appearance of any of them means that this is not a capturing group. */
04912 
04913     else if (*ptr == CHAR_QUESTION_MARK)
04914       {
04915       int i, set, unset, namelen;
04916       int *optset;
04917       const uschar *name;
04918       uschar *slot;
04919 
04920       switch (*(++ptr))
04921         {
04922         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
04923         ptr++;
04924         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
04925         if (*ptr == 0)
04926           {
04927           *errorcodeptr = ERR18;
04928           goto FAILED;
04929           }
04930         continue;
04931 
04932 
04933         /* ------------------------------------------------------------ */
04934         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
04935         reset_bracount = TRUE;
04936         /* Fall through */
04937 
04938         /* ------------------------------------------------------------ */
04939         case CHAR_COLON:          /* Non-capturing bracket */
04940         bravalue = OP_BRA;
04941         ptr++;
04942         break;
04943 
04944 
04945         /* ------------------------------------------------------------ */
04946         case CHAR_LEFT_PARENTHESIS:
04947         bravalue = OP_COND;       /* Conditional group */
04948 
04949         /* A condition can be an assertion, a number (referring to a numbered
04950         group), a name (referring to a named group), or 'R', referring to
04951         recursion. R<digits> and R&name are also permitted for recursion tests.
04952 
04953         There are several syntaxes for testing a named group: (?(name)) is used
04954         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
04955 
04956         There are two unfortunate ambiguities, caused by history. (a) 'R' can
04957         be the recursive thing or the name 'R' (and similarly for 'R' followed
04958         by digits), and (b) a number could be a name that consists of digits.
04959         In both cases, we look for a name first; if not found, we try the other
04960         cases. */
04961 
04962         /* For conditions that are assertions, check the syntax, and then exit
04963         the switch. This will take control down to where bracketed groups,
04964         including assertions, are processed. */
04965 
04966         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
04967             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
04968           break;
04969 
04970         /* Most other conditions use OP_CREF (a couple change to OP_RREF
04971         below), and all need to skip 3 bytes at the start of the group. */
04972 
04973         code[1+LINK_SIZE] = OP_CREF;
04974         skipbytes = 3;
04975         refsign = -1;
04976 
04977         /* Check for a test for recursion in a named group. */
04978 
04979         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
04980           {
04981           terminator = -1;
04982           ptr += 2;
04983           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
04984           }
04985 
04986         /* Check for a test for a named group's having been set, using the Perl
04987         syntax (?(<name>) or (?('name') */
04988 
04989         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
04990           {
04991           terminator = CHAR_GREATER_THAN_SIGN;
04992           ptr++;
04993           }
04994         else if (ptr[1] == CHAR_APOSTROPHE)
04995           {
04996           terminator = CHAR_APOSTROPHE;
04997           ptr++;
04998           }
04999         else
05000           {
05001           terminator = 0;
05002           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
05003           }
05004 
05005         /* We now expect to read a name; anything else is an error */
05006 
05007         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
05008           {
05009           ptr += 1;  /* To get the right offset */
05010           *errorcodeptr = ERR28;
05011           goto FAILED;
05012           }
05013 
05014         /* Read the name, but also get it as a number if it's all digits */
05015 
05016         recno = 0;
05017         name = ++ptr;
05018         while ((cd->ctypes[*ptr] & ctype_word) != 0)
05019           {
05020           if (recno >= 0)
05021             recno = ((digitab[*ptr] & ctype_digit) != 0)?
05022               recno * 10 + *ptr - CHAR_0 : -1;
05023           ptr++;
05024           }
05025         namelen = (int)(ptr - name);
05026 
05027         if ((terminator > 0 && *ptr++ != terminator) ||
05028             *ptr++ != CHAR_RIGHT_PARENTHESIS)
05029           {
05030           ptr--;      /* Error offset */
05031           *errorcodeptr = ERR26;
05032           goto FAILED;
05033           }
05034 
05035         /* Do no further checking in the pre-compile phase. */
05036 
05037         if (lengthptr != NULL) break;
05038 
05039         /* In the real compile we do the work of looking for the actual
05040         reference. If the string started with "+" or "-" we require the rest to
05041         be digits, in which case recno will be set. */
05042 
05043         if (refsign > 0)
05044           {
05045           if (recno <= 0)
05046             {
05047             *errorcodeptr = ERR58;
05048             goto FAILED;
05049             }
05050           recno = (refsign == CHAR_MINUS)?
05051             cd->bracount - recno + 1 : recno +cd->bracount;
05052           if (recno <= 0 || recno > cd->final_bracount)
05053             {
05054             *errorcodeptr = ERR15;
05055             goto FAILED;
05056             }
05057           PUT2(code, 2+LINK_SIZE, recno);
05058           break;
05059           }
05060 
05061         /* Otherwise (did not start with "+" or "-"), start by looking for the
05062         name. If we find a name, add one to the opcode to change OP_CREF or
05063         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
05064         except they record that the reference was originally to a name. The
05065         information is used to check duplicate names. */
05066 
05067         slot = cd->name_table;
05068         for (i = 0; i < cd->names_found; i++)
05069           {
05070           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
05071           slot += cd->name_entry_size;
05072           }
05073 
05074         /* Found a previous named subpattern */
05075 
05076         if (i < cd->names_found)
05077           {
05078           recno = GET2(slot, 0);
05079           PUT2(code, 2+LINK_SIZE, recno);
05080           code[1+LINK_SIZE]++;
05081           }
05082 
05083         /* Search the pattern for a forward reference */
05084 
05085         else if ((i = find_parens(cd, name, namelen,
05086                         (options & PCRE_EXTENDED) != 0, utf8)) > 0)
05087           {
05088           PUT2(code, 2+LINK_SIZE, i);
05089           code[1+LINK_SIZE]++;
05090           }
05091 
05092         /* If terminator == 0 it means that the name followed directly after
05093         the opening parenthesis [e.g. (?(abc)...] and in this case there are
05094         some further alternatives to try. For the cases where terminator != 0
05095         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
05096         now checked all the possibilities, so give an error. */
05097 
05098         else if (terminator != 0)
05099           {
05100           *errorcodeptr = ERR15;
05101           goto FAILED;
05102           }
05103 
05104         /* Check for (?(R) for recursion. Allow digits after R to specify a
05105         specific group number. */
05106 
05107         else if (*name == CHAR_R)
05108           {
05109           recno = 0;
05110           for (i = 1; i < namelen; i++)
05111             {
05112             if ((digitab[name[i]] & ctype_digit) == 0)
05113               {
05114               *errorcodeptr = ERR15;
05115               goto FAILED;
05116               }
05117             recno = recno * 10 + name[i] - CHAR_0;
05118             }
05119           if (recno == 0) recno = RREF_ANY;
05120           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
05121           PUT2(code, 2+LINK_SIZE, recno);
05122           }
05123 
05124         /* Similarly, check for the (?(DEFINE) "condition", which is always
05125         false. */
05126 
05127         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
05128           {
05129           code[1+LINK_SIZE] = OP_DEF;
05130           skipbytes = 1;
05131           }
05132 
05133         /* Check for the "name" actually being a subpattern number. We are
05134         in the second pass here, so final_bracount is set. */
05135 
05136         else if (recno > 0 && recno <= cd->final_bracount)
05137           {
05138           PUT2(code, 2+LINK_SIZE, recno);
05139           }
05140 
05141         /* Either an unidentified subpattern, or a reference to (?(0) */
05142 
05143         else
05144           {
05145           *errorcodeptr = (recno == 0)? ERR35: ERR15;
05146           goto FAILED;
05147           }
05148         break;
05149 
05150 
05151         /* ------------------------------------------------------------ */
05152         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
05153         bravalue = OP_ASSERT;
05154         ptr++;
05155         break;
05156 
05157 
05158         /* ------------------------------------------------------------ */
05159         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
05160         ptr++;
05161         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
05162           {
05163           *code++ = OP_FAIL;
05164           previous = NULL;
05165           continue;
05166           }
05167         bravalue = OP_ASSERT_NOT;
05168         break;
05169 
05170 
05171         /* ------------------------------------------------------------ */
05172         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
05173         switch (ptr[1])
05174           {
05175           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
05176           bravalue = OP_ASSERTBACK;
05177           ptr += 2;
05178           break;
05179 
05180           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
05181           bravalue = OP_ASSERTBACK_NOT;
05182           ptr += 2;
05183           break;
05184 
05185           default:                /* Could be name define, else bad */
05186           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
05187           ptr++;                  /* Correct offset for error */
05188           *errorcodeptr = ERR24;
05189           goto FAILED;
05190           }
05191         break;
05192 
05193 
05194         /* ------------------------------------------------------------ */
05195         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
05196         bravalue = OP_ONCE;
05197         ptr++;
05198         break;
05199 
05200 
05201         /* ------------------------------------------------------------ */
05202         case CHAR_C:                 /* Callout - may be followed by digits; */
05203         previous_callout = code;  /* Save for later completion */
05204         after_manual_callout = 1; /* Skip one item before completing */
05205         *code++ = OP_CALLOUT;
05206           {
05207           int n = 0;
05208           while ((digitab[*(++ptr)] & ctype_digit) != 0)
05209             n = n * 10 + *ptr - CHAR_0;
05210           if (*ptr != CHAR_RIGHT_PARENTHESIS)
05211             {
05212             *errorcodeptr = ERR39;
05213             goto FAILED;
05214             }
05215           if (n > 255)
05216             {
05217             *errorcodeptr = ERR38;
05218             goto FAILED;
05219             }
05220           *code++ = n;
05221           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
05222           PUT(code, LINK_SIZE, 0);                          /* Default length */
05223           code += 2 * LINK_SIZE;
05224           }
05225         previous = NULL;
05226         continue;
05227 
05228 
05229         /* ------------------------------------------------------------ */
05230         case CHAR_P:              /* Python-style named subpattern handling */
05231         if (*(++ptr) == CHAR_EQUALS_SIGN ||
05232             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
05233           {
05234           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
05235           terminator = CHAR_RIGHT_PARENTHESIS;
05236           goto NAMED_REF_OR_RECURSE;
05237           }
05238         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
05239           {
05240           *errorcodeptr = ERR41;
05241           goto FAILED;
05242           }
05243         /* Fall through to handle (?P< as (?< is handled */
05244 
05245 
05246         /* ------------------------------------------------------------ */
05247         DEFINE_NAME:    /* Come here from (?< handling */
05248         case CHAR_APOSTROPHE:
05249           {
05250           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
05251             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
05252           name = ++ptr;
05253 
05254           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
05255           namelen = (int)(ptr - name);
05256 
05257           /* In the pre-compile phase, just do a syntax check. */
05258 
05259           if (lengthptr != NULL)
05260             {
05261             if (*ptr != terminator)
05262               {
05263               *errorcodeptr = ERR42;
05264               goto FAILED;
05265               }
05266             if (cd->names_found >= MAX_NAME_COUNT)
05267               {
05268               *errorcodeptr = ERR49;
05269               goto FAILED;
05270               }
05271             if (namelen + 3 > cd->name_entry_size)
05272               {
05273               cd->name_entry_size = namelen + 3;
05274               if (namelen > MAX_NAME_SIZE)
05275                 {
05276                 *errorcodeptr = ERR48;
05277                 goto FAILED;
05278                 }
05279               }
05280             }
05281 
05282           /* In the real compile, create the entry in the table, maintaining
05283           alphabetical order. Duplicate names for different numbers are
05284           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
05285           number are always OK. (An existing number can be re-used if (?|
05286           appears in the pattern.) In either event, a duplicate name results in
05287           a duplicate entry in the table, even if the number is the same. This
05288           is because the number of names, and hence the table size, is computed
05289           in the pre-compile, and it affects various numbers and pointers which
05290           would all have to be modified, and the compiled code moved down, if
05291           duplicates with the same number were omitted from the table. This
05292           doesn't seem worth the hassle. However, *different* names for the
05293           same number are not permitted. */
05294 
05295           else
05296             {
05297             BOOL dupname = FALSE;
05298             slot = cd->name_table;
05299 
05300             for (i = 0; i < cd->names_found; i++)
05301               {
05302               int crc = memcmp(name, slot+2, namelen);
05303               if (crc == 0)
05304                 {
05305                 if (slot[2+namelen] == 0)
05306                   {
05307                   if (GET2(slot, 0) != cd->bracount + 1 &&
05308                       (options & PCRE_DUPNAMES) == 0)
05309                     {
05310                     *errorcodeptr = ERR43;
05311                     goto FAILED;
05312                     }
05313                   else dupname = TRUE;
05314                   }
05315                 else crc = -1;      /* Current name is a substring */
05316                 }
05317 
05318               /* Make space in the table and break the loop for an earlier
05319               name. For a duplicate or later name, carry on. We do this for
05320               duplicates so that in the simple case (when (?| is not used) they
05321               are in order of their numbers. */
05322 
05323               if (crc < 0)
05324                 {
05325                 memmove(slot + cd->name_entry_size, slot,
05326                   (cd->names_found - i) * cd->name_entry_size);
05327                 break;
05328                 }
05329 
05330               /* Continue the loop for a later or duplicate name */
05331 
05332               slot += cd->name_entry_size;
05333               }
05334 
05335             /* For non-duplicate names, check for a duplicate number before
05336             adding the new name. */
05337 
05338             if (!dupname)
05339               {
05340               uschar *cslot = cd->name_table;
05341               for (i = 0; i < cd->names_found; i++)
05342                 {
05343                 if (cslot != slot)
05344                   {
05345                   if (GET2(cslot, 0) == cd->bracount + 1)
05346                     {
05347                     *errorcodeptr = ERR65;
05348                     goto FAILED;
05349                     }
05350                   }
05351                 else i--;
05352                 cslot += cd->name_entry_size;
05353                 }
05354               }
05355 
05356             PUT2(slot, 0, cd->bracount + 1);
05357             memcpy(slot + 2, name, namelen);
05358             slot[2+namelen] = 0;
05359             }
05360           }
05361 
05362         /* In both pre-compile and compile, count the number of names we've
05363         encountered. */
05364 
05365         cd->names_found++;
05366         ptr++;                    /* Move past > or ' */
05367         goto NUMBERED_GROUP;
05368 
05369 
05370         /* ------------------------------------------------------------ */
05371         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
05372         terminator = CHAR_RIGHT_PARENTHESIS;
05373         is_recurse = TRUE;
05374         /* Fall through */
05375 
05376         /* We come here from the Python syntax above that handles both
05377         references (?P=name) and recursion (?P>name), as well as falling
05378         through from the Perl recursion syntax (?&name). We also come here from
05379         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
05380         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
05381 
05382         NAMED_REF_OR_RECURSE:
05383         name = ++ptr;
05384         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
05385         namelen = (int)(ptr - name);
05386 
05387         /* In the pre-compile phase, do a syntax check. We used to just set
05388         a dummy reference number, because it was not used in the first pass.
05389         However, with the change of recursive back references to be atomic,
05390         we have to look for the number so that this state can be identified, as
05391         otherwise the incorrect length is computed. If it's not a backwards
05392         reference, the dummy number will do. */
05393 
05394         if (lengthptr != NULL)
05395           {
05396           const uschar *temp;
05397 
05398           if (namelen == 0)
05399             {
05400             *errorcodeptr = ERR62;
05401             goto FAILED;
05402             }
05403           if (*ptr != terminator)
05404             {
05405             *errorcodeptr = ERR42;
05406             goto FAILED;
05407             }
05408           if (namelen > MAX_NAME_SIZE)
05409             {
05410             *errorcodeptr = ERR48;
05411             goto FAILED;
05412             }
05413 
05414           /* The name table does not exist in the first pass, so we cannot
05415           do a simple search as in the code below. Instead, we have to scan the
05416           pattern to find the number. It is important that we scan it only as
05417           far as we have got because the syntax of named subpatterns has not
05418           been checked for the rest of the pattern, and find_parens() assumes
05419           correct syntax. In any case, it's a waste of resources to scan
05420           further. We stop the scan at the current point by temporarily
05421           adjusting the value of cd->end_pattern. */
05422 
05423           temp = cd->end_pattern;
05424           cd->end_pattern = ptr;
05425           recno = find_parens(cd, name, namelen,
05426             (options & PCRE_EXTENDED) != 0, utf8);
05427           cd->end_pattern = temp;
05428           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
05429           }
05430 
05431         /* In the real compile, seek the name in the table. We check the name
05432         first, and then check that we have reached the end of the name in the
05433         table. That way, if the name is longer than any in the table,
05434         the comparison will fail without reading beyond the table entry. */
05435 
05436         else
05437           {
05438           slot = cd->name_table;
05439           for (i = 0; i < cd->names_found; i++)
05440             {
05441             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
05442                 slot[2+namelen] == 0)
05443               break;
05444             slot += cd->name_entry_size;
05445             }
05446 
05447           if (i < cd->names_found)         /* Back reference */
05448             {
05449             recno = GET2(slot, 0);
05450             }
05451           else if ((recno =                /* Forward back reference */
05452                     find_parens(cd, name, namelen,
05453                       (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
05454             {
05455             *errorcodeptr = ERR15;
05456             goto FAILED;
05457             }
05458           }
05459 
05460         /* In both phases, we can now go to the code that handles numerical
05461         recursion or backreferences. */
05462 
05463         if (is_recurse) goto HANDLE_RECURSION;
05464           else goto HANDLE_REFERENCE;
05465 
05466 
05467         /* ------------------------------------------------------------ */
05468         case CHAR_R:              /* Recursion */
05469         ptr++;                    /* Same as (?0)      */
05470         /* Fall through */
05471 
05472 
05473         /* ------------------------------------------------------------ */
05474         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
05475         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
05476         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
05477           {
05478           const uschar *called;
05479           terminator = CHAR_RIGHT_PARENTHESIS;
05480 
05481           /* Come here from the \g<...> and \g'...' code (Oniguruma
05482           compatibility). However, the syntax has been checked to ensure that
05483           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
05484           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
05485           ever be taken. */
05486 
05487           HANDLE_NUMERICAL_RECURSION:
05488 
05489           if ((refsign = *ptr) == CHAR_PLUS)
05490             {
05491             ptr++;
05492             if ((digitab[*ptr] & ctype_digit) == 0)
05493               {
05494               *errorcodeptr = ERR63;
05495               goto FAILED;
05496               }
05497             }
05498           else if (refsign == CHAR_MINUS)
05499             {
05500             if ((digitab[ptr[1]] & ctype_digit) == 0)
05501               goto OTHER_CHAR_AFTER_QUERY;
05502             ptr++;
05503             }
05504 
05505           recno = 0;
05506           while((digitab[*ptr] & ctype_digit) != 0)
05507             recno = recno * 10 + *ptr++ - CHAR_0;
05508 
05509           if (*ptr != terminator)
05510             {
05511             *errorcodeptr = ERR29;
05512             goto FAILED;
05513             }
05514 
05515           if (refsign == CHAR_MINUS)
05516             {
05517             if (recno == 0)
05518               {
05519               *errorcodeptr = ERR58;
05520               goto FAILED;
05521               }
05522             recno = cd->bracount - recno + 1;
05523             if (recno <= 0)
05524               {
05525               *errorcodeptr = ERR15;
05526               goto FAILED;
05527               }
05528             }
05529           else if (refsign == CHAR_PLUS)
05530             {
05531             if (recno == 0)
05532               {
05533               *errorcodeptr = ERR58;
05534               goto FAILED;
05535               }
05536             recno += cd->bracount;
05537             }
05538 
05539           /* Come here from code above that handles a named recursion */
05540 
05541           HANDLE_RECURSION:
05542 
05543           previous = code;
05544           called = cd->start_code;
05545 
05546           /* When we are actually compiling, find the bracket that is being
05547           referenced. Temporarily end the regex in case it doesn't exist before
05548           this point. If we end up with a forward reference, first check that
05549           the bracket does occur later so we can give the error (and position)
05550           now. Then remember this forward reference in the workspace so it can
05551           be filled in at the end. */
05552 
05553           if (lengthptr == NULL)
05554             {
05555             *code = OP_END;
05556             if (recno != 0)
05557               called = _pcre_find_bracket(cd->start_code, utf8, recno);
05558 
05559             /* Forward reference */
05560 
05561             if (called == NULL)
05562               {
05563               if (find_parens(cd, NULL, recno,
05564                     (options & PCRE_EXTENDED) != 0, utf8) < 0)
05565                 {
05566                 *errorcodeptr = ERR15;
05567                 goto FAILED;
05568                 }
05569 
05570               /* Fudge the value of "called" so that when it is inserted as an
05571               offset below, what is actually inserted is the reference number
05572               of the group. */
05573 
05574               called = cd->start_code + recno;
05575               PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
05576               }
05577 
05578             /* If not a forward reference, and the subpattern is still open,
05579             this is a recursive call. We check to see if this is a left
05580             recursion that could loop for ever, and diagnose that case. */
05581 
05582             else if (GET(called, 1) == 0 &&
05583                      could_be_empty(called, code, bcptr, utf8, cd))
05584               {
05585               *errorcodeptr = ERR40;
05586               goto FAILED;
05587               }
05588             }
05589 
05590           /* Insert the recursion/subroutine item, automatically wrapped inside
05591           "once" brackets. Set up a "previous group" length so that a
05592           subsequent quantifier will work. */
05593 
05594           *code = OP_ONCE;
05595           PUT(code, 1, 2 + 2*LINK_SIZE);
05596           code += 1 + LINK_SIZE;
05597 
05598           *code = OP_RECURSE;
05599           PUT(code, 1, (int)(called - cd->start_code));
05600           code += 1 + LINK_SIZE;
05601 
05602           *code = OP_KET;
05603           PUT(code, 1, 2 + 2*LINK_SIZE);
05604           code += 1 + LINK_SIZE;
05605 
05606           length_prevgroup = 3 + 3*LINK_SIZE;
05607           }
05608 
05609         /* Can't determine a first byte now */
05610 
05611         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
05612         continue;
05613 
05614 
05615         /* ------------------------------------------------------------ */
05616         default:              /* Other characters: check option setting */
05617         OTHER_CHAR_AFTER_QUERY:
05618         set = unset = 0;
05619         optset = &set;
05620 
05621         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
05622           {
05623           switch (*ptr++)
05624             {
05625             case CHAR_MINUS: optset = &unset; break;
05626 
05627             case CHAR_J:    /* Record that it changed in the external options */
05628             *optset |= PCRE_DUPNAMES;
05629             cd->external_flags |= PCRE_JCHANGED;
05630             break;
05631 
05632             case CHAR_i: *optset |= PCRE_CASELESS; break;
05633             case CHAR_m: *optset |= PCRE_MULTILINE; break;
05634             case CHAR_s: *optset |= PCRE_DOTALL; break;
05635             case CHAR_x: *optset |= PCRE_EXTENDED; break;
05636             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
05637             case CHAR_X: *optset |= PCRE_EXTRA; break;
05638 
05639             default:  *errorcodeptr = ERR12;
05640                       ptr--;    /* Correct the offset */
05641                       goto FAILED;
05642             }
05643           }
05644 
05645         /* Set up the changed option bits, but don't change anything yet. */
05646 
05647         newoptions = (options | set) & (~unset);
05648 
05649         /* If the options ended with ')' this is not the start of a nested
05650         group with option changes, so the options change at this level. If this
05651         item is right at the start of the pattern, the options can be
05652         abstracted and made external in the pre-compile phase, and ignored in
05653         the compile phase. This can be helpful when matching -- for instance in
05654         caseless checking of required bytes.
05655 
05656         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
05657         definitely *not* at the start of the pattern because something has been
05658         compiled. In the pre-compile phase, however, the code pointer can have
05659         that value after the start, because it gets reset as code is discarded
05660         during the pre-compile. However, this can happen only at top level - if
05661         we are within parentheses, the starting BRA will still be present. At
05662         any parenthesis level, the length value can be used to test if anything
05663         has been compiled at that level. Thus, a test for both these conditions
05664         is necessary to ensure we correctly detect the start of the pattern in
05665         both phases.
05666 
05667         If we are not at the pattern start, compile code to change the ims
05668         options if this setting actually changes any of them, and reset the
05669         greedy defaults and the case value for firstbyte and reqbyte. */
05670 
05671         if (*ptr == CHAR_RIGHT_PARENTHESIS)
05672           {
05673           if (code == cd->start_code + 1 + LINK_SIZE &&
05674                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
05675             {
05676             cd->external_options = newoptions;
05677             }
05678           else
05679             {
05680             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
05681               {
05682               *code++ = OP_OPT;
05683               *code++ = newoptions & PCRE_IMS;
05684               }
05685             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
05686             greedy_non_default = greedy_default ^ 1;
05687             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
05688             }
05689 
05690           /* Change options at this level, and pass them back for use
05691           in subsequent branches. When not at the start of the pattern, this
05692           information is also necessary so that a resetting item can be
05693           compiled at the end of a group (if we are in a group). */
05694 
05695           *optionsptr = options = newoptions;
05696           previous = NULL;       /* This item can't be repeated */
05697           continue;              /* It is complete */
05698           }
05699 
05700         /* If the options ended with ':' we are heading into a nested group
05701         with possible change of options. Such groups are non-capturing and are
05702         not assertions of any kind. All we need to do is skip over the ':';
05703         the newoptions value is handled below. */
05704 
05705         bravalue = OP_BRA;
05706         ptr++;
05707         }     /* End of switch for character following (? */
05708       }       /* End of (? handling */
05709 
05710     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
05711     is set, all unadorned brackets become non-capturing and behave like (?:...)
05712     brackets. */
05713 
05714     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
05715       {
05716       bravalue = OP_BRA;
05717       }
05718 
05719     /* Else we have a capturing group. */
05720 
05721     else
05722       {
05723       NUMBERED_GROUP:
05724       cd->bracount += 1;
05725       PUT2(code, 1+LINK_SIZE, cd->bracount);
05726       skipbytes = 2;
05727       }
05728 
05729     /* Process nested bracketed regex. Assertions may not be repeated, but
05730     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
05731     non-register variable in order to be able to pass its address because some
05732     compilers complain otherwise. Pass in a new setting for the ims options if
05733     they have changed. */
05734 
05735     previous = (bravalue >= OP_ONCE)? code : NULL;
05736     *code = bravalue;
05737     tempcode = code;
05738     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
05739     length_prevgroup = 0;              /* Initialize for pre-compile phase */
05740 
05741     if (!compile_regex(
05742          newoptions,                   /* The complete new option state */
05743          options & PCRE_IMS,           /* The previous ims option state */
05744          &tempcode,                    /* Where to put code (updated) */
05745          &ptr,                         /* Input pointer (updated) */
05746          errorcodeptr,                 /* Where to put an error message */
05747          (bravalue == OP_ASSERTBACK ||
05748           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
05749          reset_bracount,               /* True if (?| group */
05750          skipbytes,                    /* Skip over bracket number */
05751          &subfirstbyte,                /* For possible first char */
05752          &subreqbyte,                  /* For possible last char */
05753          bcptr,                        /* Current branch chain */
05754          cd,                           /* Tables block */
05755          (lengthptr == NULL)? NULL :   /* Actual compile phase */
05756            &length_prevgroup           /* Pre-compile phase */
05757          ))
05758       goto FAILED;
05759 
05760     /* At the end of compiling, code is still pointing to the start of the
05761     group, while tempcode has been updated to point past the end of the group
05762     and any option resetting that may follow it. The pattern pointer (ptr)
05763     is on the bracket. */
05764 
05765     /* If this is a conditional bracket, check that there are no more than
05766     two branches in the group, or just one if it's a DEFINE group. We do this
05767     in the real compile phase, not in the pre-pass, where the whole group may
05768     not be available. */
05769 
05770     if (bravalue == OP_COND && lengthptr == NULL)
05771       {
05772       uschar *tc = code;
05773       int condcount = 0;
05774 
05775       do {
05776          condcount++;
05777          tc += GET(tc,1);
05778          }
05779       while (*tc != OP_KET);
05780 
05781       /* A DEFINE group is never obeyed inline (the "condition" is always
05782       false). It must have only one branch. */
05783 
05784       if (code[LINK_SIZE+1] == OP_DEF)
05785         {
05786         if (condcount > 1)
05787           {
05788           *errorcodeptr = ERR54;
05789           goto FAILED;
05790           }
05791         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
05792         }
05793 
05794       /* A "normal" conditional group. If there is just one branch, we must not
05795       make use of its firstbyte or reqbyte, because this is equivalent to an
05796       empty second branch. */
05797 
05798       else
05799         {
05800         if (condcount > 2)
05801           {
05802           *errorcodeptr = ERR27;
05803           goto FAILED;
05804           }
05805         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
05806         }
05807       }
05808 
05809     /* Error if hit end of pattern */
05810 
05811     if (*ptr != CHAR_RIGHT_PARENTHESIS)
05812       {
05813       *errorcodeptr = ERR14;
05814       goto FAILED;
05815       }
05816 
05817     /* In the pre-compile phase, update the length by the length of the group,
05818     less the brackets at either end. Then reduce the compiled code to just a
05819     set of non-capturing brackets so that it doesn't use much memory if it is
05820     duplicated by a quantifier.*/
05821 
05822     if (lengthptr != NULL)
05823       {
05824       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
05825         {
05826         *errorcodeptr = ERR20;
05827         goto FAILED;
05828         }
05829       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
05830       *code++ = OP_BRA;
05831       PUTINC(code, 0, 1 + LINK_SIZE);
05832       *code++ = OP_KET;
05833       PUTINC(code, 0, 1 + LINK_SIZE);
05834       break;    /* No need to waste time with special character handling */
05835       }
05836 
05837     /* Otherwise update the main code pointer to the end of the group. */
05838 
05839     code = tempcode;
05840 
05841     /* For a DEFINE group, required and first character settings are not
05842     relevant. */
05843 
05844     if (bravalue == OP_DEF) break;
05845 
05846     /* Handle updating of the required and first characters for other types of
05847     group. Update for normal brackets of all kinds, and conditions with two
05848     branches (see code above). If the bracket is followed by a quantifier with
05849     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
05850     zerofirstbyte outside the main loop so that they can be accessed for the
05851     back off. */
05852 
05853     zeroreqbyte = reqbyte;
05854     zerofirstbyte = firstbyte;
05855     groupsetfirstbyte = FALSE;
05856 
05857     if (bravalue >= OP_ONCE)
05858       {
05859       /* If we have not yet set a firstbyte in this branch, take it from the
05860       subpattern, remembering that it was set here so that a repeat of more
05861       than one can replicate it as reqbyte if necessary. If the subpattern has
05862       no firstbyte, set "none" for the whole branch. In both cases, a zero
05863       repeat forces firstbyte to "none". */
05864 
05865       if (firstbyte == REQ_UNSET)
05866         {
05867         if (subfirstbyte >= 0)
05868           {
05869           firstbyte = subfirstbyte;
05870           groupsetfirstbyte = TRUE;
05871           }
05872         else firstbyte = REQ_NONE;
05873         zerofirstbyte = REQ_NONE;
05874         }
05875 
05876       /* If firstbyte was previously set, convert the subpattern's firstbyte
05877       into reqbyte if there wasn't one, using the vary flag that was in
05878       existence beforehand. */
05879 
05880       else if (subfirstbyte >= 0 && subreqbyte < 0)
05881         subreqbyte = subfirstbyte | tempreqvary;
05882 
05883       /* If the subpattern set a required byte (or set a first byte that isn't
05884       really the first byte - see above), set it. */
05885 
05886       if (subreqbyte >= 0) reqbyte = subreqbyte;
05887       }
05888 
05889     /* For a forward assertion, we take the reqbyte, if set. This can be
05890     helpful if the pattern that follows the assertion doesn't set a different
05891     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
05892     for an assertion, however, because it leads to an incorrect effect for patterns
05893     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
05894     of a firstbyte. This is overcome by a scan at the end if there's no
05895     firstbyte, looking for an asserted first char. */
05896 
05897     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
05898     break;     /* End of processing '(' */
05899 
05900 
05901     /* ===================================================================*/
05902     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
05903     are arranged to be the negation of the corresponding OP_values in the
05904     default case when PCRE_UCP is not set. For the back references, the values
05905     are ESC_REF plus the reference number. Only back references and those types
05906     that consume a character may be repeated. We can test for values between
05907     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
05908     ever created. */
05909 
05910     case CHAR_BACKSLASH:
05911     tempptr = ptr;
05912     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
05913     if (*errorcodeptr != 0) goto FAILED;
05914 
05915     if (c < 0)
05916       {
05917       if (-c == ESC_Q)            /* Handle start of quoted string */
05918         {
05919         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
05920           ptr += 2;               /* avoid empty string */
05921             else inescq = TRUE;
05922         continue;
05923         }
05924 
05925       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
05926 
05927       /* For metasequences that actually match a character, we disable the
05928       setting of a first character if it hasn't already been set. */
05929 
05930       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
05931         firstbyte = REQ_NONE;
05932 
05933       /* Set values to reset to if this is followed by a zero repeat. */
05934 
05935       zerofirstbyte = firstbyte;
05936       zeroreqbyte = reqbyte;
05937 
05938       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
05939       is a subroutine call by number (Oniguruma syntax). In fact, the value
05940       -ESC_g is returned only for these cases. So we don't need to check for <
05941       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
05942       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
05943       that is a synonym for a named back reference). */
05944 
05945       if (-c == ESC_g)
05946         {
05947         const uschar *p;
05948         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
05949         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
05950           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
05951 
05952         /* These two statements stop the compiler from warning about possibly
05953         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
05954         fact, because we actually check for a number below, the paths that
05955         would actually be in error are never taken. */
05956 
05957         skipbytes = 0;
05958         reset_bracount = FALSE;
05959 
05960         /* Test for a name */
05961 
05962         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
05963           {
05964           BOOL isnumber = TRUE;
05965           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
05966             {
05967             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
05968             if ((cd->ctypes[*p] & ctype_word) == 0) break;
05969             }
05970           if (*p != terminator)
05971             {
05972             *errorcodeptr = ERR57;
05973             break;
05974             }
05975           if (isnumber)
05976             {
05977             ptr++;
05978             goto HANDLE_NUMERICAL_RECURSION;
05979             }
05980           is_recurse = TRUE;
05981           goto NAMED_REF_OR_RECURSE;
05982           }
05983 
05984         /* Test a signed number in angle brackets or quotes. */
05985 
05986         p = ptr + 2;
05987         while ((digitab[*p] & ctype_digit) != 0) p++;
05988         if (*p != terminator)
05989           {
05990           *errorcodeptr = ERR57;
05991           break;
05992           }
05993         ptr++;
05994         goto HANDLE_NUMERICAL_RECURSION;
05995         }
05996 
05997       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
05998       We also support \k{name} (.NET syntax) */
05999 
06000       if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
06001           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
06002         {
06003         is_recurse = FALSE;
06004         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
06005           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
06006           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
06007         goto NAMED_REF_OR_RECURSE;
06008         }
06009 
06010       /* Back references are handled specially; must disable firstbyte if
06011       not set to cope with cases like (?=(\w+))\1: which would otherwise set
06012       ':' later. */
06013 
06014       if (-c >= ESC_REF)
06015         {
06016         open_capitem *oc;
06017         recno = -c - ESC_REF;
06018 
06019         HANDLE_REFERENCE:    /* Come here from named backref handling */
06020         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
06021         previous = code;
06022         *code++ = OP_REF;
06023         PUT2INC(code, 0, recno);
06024         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
06025         if (recno > cd->top_backref) cd->top_backref = recno;
06026 
06027         /* Check to see if this back reference is recursive, that is, it
06028         is inside the group that it references. A flag is set so that the
06029         group can be made atomic. */
06030 
06031         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
06032           {
06033           if (oc->number == recno)
06034             {
06035             oc->flag = TRUE;
06036             break;
06037             }
06038           }
06039         }
06040 
06041       /* So are Unicode property matches, if supported. */
06042 
06043 #ifdef SUPPORT_UCP
06044       else if (-c == ESC_P || -c == ESC_p)
06045         {
06046         BOOL negated;
06047         int pdata;
06048         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
06049         if (ptype < 0) goto FAILED;
06050         previous = code;
06051         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
06052         *code++ = ptype;
06053         *code++ = pdata;
06054         }
06055 #else
06056 
06057       /* If Unicode properties are not supported, \X, \P, and \p are not
06058       allowed. */
06059 
06060       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
06061         {
06062         *errorcodeptr = ERR45;
06063         goto FAILED;
06064         }
06065 #endif
06066 
06067       /* For the rest (including \X when Unicode properties are supported), we
06068       can obtain the OP value by negating the escape value in the default
06069       situation when PCRE_UCP is not set. When it *is* set, we substitute
06070       Unicode property tests. */
06071 
06072       else
06073         {
06074 #ifdef SUPPORT_UCP
06075         if (-c >= ESC_DU && -c <= ESC_wu)
06076           {
06077           nestptr = ptr + 1;                   /* Where to resume */
06078           ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
06079           }
06080         else
06081 #endif
06082           {
06083           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
06084           *code++ = -c;
06085           }
06086         }
06087       continue;
06088       }
06089 
06090     /* We have a data character whose value is in c. In UTF-8 mode it may have
06091     a value > 127. We set its representation in the length/buffer, and then
06092     handle it as a data character. */
06093 
06094 #ifdef SUPPORT_UTF8
06095     if (utf8 && c > 127)
06096       mclength = _pcre_ord2utf8(c, mcbuffer);
06097     else
06098 #endif
06099 
06100      {
06101      mcbuffer[0] = c;
06102      mclength = 1;
06103      }
06104     goto ONE_CHAR;
06105 
06106 
06107     /* ===================================================================*/
06108     /* Handle a literal character. It is guaranteed not to be whitespace or #
06109     when the extended flag is set. If we are in UTF-8 mode, it may be a
06110     multi-byte literal character. */
06111 
06112     default:
06113     NORMAL_CHAR:
06114     mclength = 1;
06115     mcbuffer[0] = c;
06116 
06117 #ifdef SUPPORT_UTF8
06118     if (utf8 && c >= 0xc0)
06119       {
06120       while ((ptr[1] & 0xc0) == 0x80)
06121         mcbuffer[mclength++] = *(++ptr);
06122       }
06123 #endif
06124 
06125     /* At this point we have the character's bytes in mcbuffer, and the length
06126     in mclength. When not in UTF-8 mode, the length is always 1. */
06127 
06128     ONE_CHAR:
06129     previous = code;
06130     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
06131     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
06132 
06133     /* Remember if \r or \n were seen */
06134 
06135     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
06136       cd->external_flags |= PCRE_HASCRORLF;
06137 
06138     /* Set the first and required bytes appropriately. If no previous first
06139     byte, set it from this character, but revert to none on a zero repeat.
06140     Otherwise, leave the firstbyte value alone, and don't change it on a zero
06141     repeat. */
06142 
06143     if (firstbyte == REQ_UNSET)
06144       {
06145       zerofirstbyte = REQ_NONE;
06146       zeroreqbyte = reqbyte;
06147 
06148       /* If the character is more than one byte long, we can set firstbyte
06149       only if it is not to be matched caselessly. */
06150 
06151       if (mclength == 1 || req_caseopt == 0)
06152         {
06153         firstbyte = mcbuffer[0] | req_caseopt;
06154         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
06155         }
06156       else firstbyte = reqbyte = REQ_NONE;
06157       }
06158 
06159     /* firstbyte was previously set; we can set reqbyte only if the length is
06160     1 or the matching is caseful. */
06161 
06162     else
06163       {
06164       zerofirstbyte = firstbyte;
06165       zeroreqbyte = reqbyte;
06166       if (mclength == 1 || req_caseopt == 0)
06167         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
06168       }
06169 
06170     break;            /* End of literal character handling */
06171     }
06172   }                   /* end of big loop */
06173 
06174 
06175 /* Control never reaches here by falling through, only by a goto for all the
06176 error states. Pass back the position in the pattern so that it can be displayed
06177 to the user for diagnosing the error. */
06178 
06179 FAILED:
06180 *ptrptr = ptr;
06181 return FALSE;
06182 }
06183 
06184 
06185 
06186 
06187 /*************************************************
06188 *     Compile sequence of alternatives           *
06189 *************************************************/
06190 
06191 /* On entry, ptr is pointing past the bracket character, but on return it
06192 points to the closing bracket, or vertical bar, or end of string. The code
06193 variable is pointing at the byte into which the BRA operator has been stored.
06194 If the ims options are changed at the start (for a (?ims: group) or during any
06195 branch, we need to insert an OP_OPT item at the start of every following branch
06196 to ensure they get set correctly at run time, and also pass the new options
06197 into every subsequent branch compile.
06198 
06199 This function is used during the pre-compile phase when we are trying to find
06200 out the amount of memory needed, as well as during the real compile phase. The
06201 value of lengthptr distinguishes the two phases.
06202 
06203 Arguments:
06204   options        option bits, including any changes for this subpattern
06205   oldims         previous settings of ims option bits
06206   codeptr        -> the address of the current code pointer; updated on success
06207   ptrptr         -> the address of the current pattern pointer; updated on return
06208   errorcodeptr   -> pointer to error code variable
06209   lookbehind     TRUE if this is a lookbehind assertion
06210   reset_bracount TRUE to reset the count for each branch
06211   skipbytes      skip this many bytes at start (for brackets and OP_COND)
06212   firstbyteptr   place to put the first required character, or a negative number
06213   reqbyteptr     place to put the last required character, or a negative number
06214   bcptr          pointer to the chain of currently open branches
06215   cd             points to the data block with tables pointers etc.
06216   lengthptr      NULL during the real compile phase
06217                  points to length accumulator during pre-compile phase; the group's length is added to it on success
06218 
06219 Returns:         TRUE on success; FALSE on error (error code in *errorcodeptr, pattern position in *ptrptr)
06220 */
06221 
06222 static BOOL
06223 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
06224   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
06225   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
06226   int *lengthptr)
06227 {
06228 const uschar *ptr = *ptrptr;
06229 uschar *code = *codeptr;
06230 uschar *last_branch = code;      /* start of the most recent branch: the bracket, then each OP_ALT */
06231 uschar *start_bracket = code;    /* the opening bracket; base for the KET offset */
06232 uschar *reverse_count = NULL;    /* -> link field of the current branch's OP_REVERSE (lookbehind only) */
06233 open_capitem capitem;            /* this group's open_caps entry; linked only if *code == OP_CBRA */
06234 int capnumber = 0;               /* capture number; stays 0 unless *code == OP_CBRA */
06235 int firstbyte, reqbyte;          /* values for the whole group, merged across branches */
06236 int branchfirstbyte, branchreqbyte;   /* values reported by compile_branch for one branch */
06237 int length;                      /* local length accumulator; only added to *lengthptr when that is non-NULL */
06238 int orig_bracount;               /* cd->bracount at entry, restored per branch when reset_bracount */
06239 int max_bracount;                /* highest cd->bracount seen after any branch */
06240 int old_external_options = cd->external_options;   /* snapshot to detect a change made by a branch */
06241 branch_chain bc;                 /* this group's link in the chain of open branches */
06242 
06243 bc.outer = bcptr;
06244 bc.current_branch = code;
06245 
06246 firstbyte = reqbyte = REQ_UNSET;
06247 
06248 /* Accumulate the length for use in the pre-compile phase. Start with the
06249 length of the BRA and KET and any extra bytes that are required at the
06250 beginning. We accumulate in a local variable to save frequent testing of
06251 lengthptr for NULL. We cannot do this by looking at the value of code at the
06252 start and end of each alternative, because compiled items are discarded during
06253 the pre-compile phase so that the work space is not exceeded. */
06254 
06255 length = 2 + 2*LINK_SIZE + skipbytes;
06256 
06257 /* WARNING: If the above line is changed for any reason, you must also change
06258 the code that abstracts option settings at the start of the pattern and makes
06259 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
06260 pre-compile phase to find out whether anything has yet been compiled or not. */
06261 
06262 /* If this is a capturing subpattern, add to the chain of open capturing items
06263 so that we can detect them if (*ACCEPT) is encountered. This is also used to
06264 detect groups that contain recursive back references to themselves. */
06265 
06266 if (*code == OP_CBRA)
06267   {
06268   capnumber = GET2(code, 1 + LINK_SIZE);
06269   capitem.number = capnumber;
06270   capitem.next = cd->open_caps;
06271   capitem.flag = FALSE;
06272   cd->open_caps = &capitem;   /* NOTE(review): capitem is a local and the FALSE returns below do not unlink it; presumably harmless because compilation is abandoned - confirm */
06273   }
06274 
06275 /* Offset is set zero to mark that this bracket is still open */
06276 
06277 PUT(code, 1, 0);
06278 code += 1 + LINK_SIZE + skipbytes;   /* step past the bracket opcode, its link field and skipbytes */
06279 
06280 /* Loop for each alternative branch */
06281 
06282 orig_bracount = max_bracount = cd->bracount;
06283 for (;;)
06284   {
06285   /* For a (?| group, reset the capturing bracket count so that each branch
06286   uses the same numbers. */
06287 
06288   if (reset_bracount) cd->bracount = orig_bracount;
06289 
06290   /* Handle a change of ims options at the start of the branch */
06291 
06292   if ((options & PCRE_IMS) != oldims)
06293     {
06294     *code++ = OP_OPT;
06295     *code++ = options & PCRE_IMS;
06296     length += 2;
06297     }
06298 
06299   /* Set up dummy OP_REVERSE if lookbehind assertion */
06300 
06301   if (lookbehind)
06302     {
06303     *code++ = OP_REVERSE;
06304     reverse_count = code;   /* remembered so the fixed length can be filled in after the branch is compiled */
06305     PUTINC(code, 0, 0);
06306     length += 1 + LINK_SIZE;
06307     }
06308 
06309   /* Now compile the branch; in the pre-compile phase its length gets added
06310   into the length. In the real compile phase NULL is passed instead of &length. */
06311 
06312   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
06313         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
06314     {
06315     *ptrptr = ptr;
06316     return FALSE;
06317     }
06318 
06319   /* If the external options have changed during this branch, it means that we
06320   are at the top level, and a leading option setting has been encountered. We
06321   need to re-set the original option values to take account of this so that,
06322   during the pre-compile phase, we know to allow for a re-set at the start of
06323   subsequent branches. */
06324 
06325   if (old_external_options != cd->external_options)
06326     oldims = cd->external_options & PCRE_IMS;
06327 
06328   /* Keep the highest bracket count in case (?| was used and some branch
06329   has fewer than the rest. */
06330 
06331   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
06332 
06333   /* In the real compile phase, there is some post-processing to be done. */
06334 
06335   if (lengthptr == NULL)
06336     {
06337     /* If this is the first branch, the firstbyte and reqbyte values for the
06338     branch become the values for the regex. */
06339 
06340     if (*last_branch != OP_ALT)   /* last_branch is still at the bracket, so no OP_ALT has been written yet */
06341       {
06342       firstbyte = branchfirstbyte;
06343       reqbyte = branchreqbyte;
06344       }
06345 
06346     /* If this is not the first branch, the first char and reqbyte have to
06347     match the values from all the previous branches, except that if the
06348     previous value for reqbyte didn't have REQ_VARY set, it can still match,
06349     and we set REQ_VARY for the regex. */
06350 
06351     else
06352       {
06353       /* If we previously had a firstbyte, but it doesn't match the new branch,
06354       we have to abandon the firstbyte for the regex, but if there was
06355       previously no reqbyte, it takes on the value of the old firstbyte. */
06356 
06357       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
06358         {
06359         if (reqbyte < 0) reqbyte = firstbyte;
06360         firstbyte = REQ_NONE;
06361         }
06362 
06363       /* If we (now or from before) have no firstbyte, a firstbyte from the
06364       branch becomes a reqbyte if there isn't a branch reqbyte. */
06365 
06366       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
06367           branchreqbyte = branchfirstbyte;
06368 
06369       /* Now ensure that the reqbytes match */
06370 
06371       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
06372         reqbyte = REQ_NONE;
06373       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
06374       }
06375 
06376     /* If lookbehind, check that this branch matches a fixed-length string, and
06377     put the length into the OP_REVERSE item. Temporarily mark the end of the
06378     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
06379     because there may be forward references that we can't check here. Set a
06380     flag to cause another lookbehind check at the end. Why not do it all at the
06381     end? Because common, erroneous checks are picked up here and the offset of
06382     the problem can be shown. */
06383 
06384     if (lookbehind)
06385       {
06386       int fixed_length;
06387       *code = OP_END;   /* code is not advanced, so the OP_ALT or OP_KET written below overwrites this */
06388       fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
06389       DPRINTF(("fixed length = %d\n", fixed_length));
06390       if (fixed_length == -3)
06391         {
06392         cd->check_lookbehind = TRUE;
06393         }
06394       else if (fixed_length < 0)
06395         {
06396         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
06397         *ptrptr = ptr;
06398         return FALSE;
06399         }
06400       else { PUT(reverse_count, 0, fixed_length); }
06401       }
06402     }
06403 
06404   /* Reached end of expression, either ')' or end of pattern. In the real
06405   compile phase, go back through the alternative branches and reverse the chain
06406   of offsets, with the field in the BRA item now becoming an offset to the
06407   first alternative. If there are no alternatives, it points to the end of the
06408   group. The length in the terminating ket is always the length of the whole
06409   bracketed item. If any of the ims options were changed inside the group,
06410   compile a resetting op-code following, except at the very end of the pattern.
06411   Return leaving the pointer at the terminating char. */
06412 
06413   if (*ptr != CHAR_VERTICAL_LINE)
06414     {
06415     if (lengthptr == NULL)
06416       {
06417       int branch_length = (int)(code - last_branch);
06418       do
06419         {
06420         int prev_length = GET(last_branch, 1);   /* backward distance stored while the group was open; 0 at the bracket */
06421         PUT(last_branch, 1, branch_length);      /* replace it with the forward distance to the next OP_ALT or the KET */
06422         branch_length = prev_length;
06423         last_branch -= branch_length;            /* step back to the start of the previous branch */
06424         }
06425       while (branch_length > 0);                 /* the bracket's zero link ends the walk */
06426       }
06427 
06428     /* Fill in the ket */
06429 
06430     *code = OP_KET;
06431     PUT(code, 1, (int)(code - start_bracket));
06432     code += 1 + LINK_SIZE;
06433 
06434     /* If it was a capturing subpattern, check to see if it contained any
06435     recursive back references. If so, we must wrap it in atomic brackets.
06436     In any event, remove the block from the chain. */
06437 
06438     if (capnumber > 0)
06439       {
06440       if (cd->open_caps->flag)   /* presumably cd->open_caps is &capitem again here, nested groups having unlinked themselves - confirm */
06441         {
06442         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
06443           code - start_bracket);   /* shift the whole group up to make room for an OP_ONCE header */
06444         *start_bracket = OP_ONCE;
06445         code += 1 + LINK_SIZE;
06446         PUT(start_bracket, 1, (int)(code - start_bracket));
06447         *code = OP_KET;
06448         PUT(code, 1, (int)(code - start_bracket));
06449         code += 1 + LINK_SIZE;
06450         length += 2 + 2*LINK_SIZE;   /* account for the added OP_ONCE and its KET */
06451         }
06452       cd->open_caps = cd->open_caps->next;
06453       }
06454 
06455     /* Reset options if needed. */
06456 
06457     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
06458       {
06459       *code++ = OP_OPT;
06460       *code++ = oldims;
06461       length += 2;
06462       }
06463 
06464     /* Retain the highest bracket number, in case resetting was used. */
06465 
06466     cd->bracount = max_bracount;
06467 
06468     /* Set values to pass back */
06469 
06470     *codeptr = code;
06471     *ptrptr = ptr;
06472     *firstbyteptr = firstbyte;
06473     *reqbyteptr = reqbyte;
06474     if (lengthptr != NULL)
06475       {
06476       if (OFLOW_MAX - *lengthptr < length)   /* checked before the addition so the sum cannot exceed OFLOW_MAX */
06477         {
06478         *errorcodeptr = ERR20;
06479         return FALSE;
06480         }
06481       *lengthptr += length;
06482       }
06483     return TRUE;
06484     }
06485 
06486   /* Another branch follows. In the pre-compile phase, we can move the code
06487   pointer back to where it was for the start of the first branch. (That is,
06488   pretend that each branch is the only one.)
06489 
06490   In the real compile phase, insert an ALT node. Its length field points back
06491   to the previous branch while the bracket remains open. At the end the chain
06492   is reversed. It's done like this so that the start of the bracket has a
06493   zero offset until it is closed, making it possible to detect recursion. */
06494 
06495   if (lengthptr != NULL)
06496     {
06497     code = *codeptr + 1 + LINK_SIZE + skipbytes;   /* the same position code had after the bracket header at entry */
06498     length += 1 + LINK_SIZE;
06499     }
06500   else
06501     {
06502     *code = OP_ALT;
06503     PUT(code, 1, (int)(code - last_branch));
06504     bc.current_branch = last_branch = code;
06505     code += 1 + LINK_SIZE;
06506     }
06507 
06508   ptr++;   /* step past the vertical bar */
06509   }
06510 /* Control never reaches here */
06511 }
06512 
06513 
06514 
06515 
06516 /*************************************************
06517 *          Check for anchored expression         *
06518 *************************************************/
06519 
06520 /* Try to find out if this is an anchored regular expression. Consider each
06521 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
06522 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
06523 it's anchored. However, if this is a multiline pattern, then only OP_SOD
06524 counts, since OP_CIRC can match in the middle.
06525 
06526 We can also consider a regex to be anchored if OP_SOM starts all its branches.
06527 This is the code for \G, which means "match at start of match position, taking
06528 into account the match offset".
06529 
06530 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
06531 because that will try the rest of the pattern at all possible matching points,
06532 so there is no point trying again.... er ....
06533 
06534 .... except when the .* appears inside capturing parentheses, and there is a
06535 subsequent back reference to those parentheses. We haven't enough information
06536 to catch that case precisely.
06537 
06538 At first, the best we could do was to detect when .* was in capturing brackets
06539 and the highest back reference was greater than or equal to that level.
06540 However, by keeping a bitmap of the first 31 back references, we can catch some
06541 of the more common cases more precisely.
06542 
06543 Arguments:
06544   code           points to start of expression (the bracket)
06545   options        points to the options setting (read for PCRE_MULTILINE; also passed to first_significant_code)
06546   bracket_map    a bitmap of which brackets we are inside while testing; this
06547                   handles up to substring 31; after that we just have to take
06548                   the less precise approach (groups 32 and above all use bit 0)
06549   backref_map    the back reference bitmap
06550 
06551 Returns:     TRUE if every alternative branch is anchored, otherwise FALSE
06552 */
06553 
06554 static BOOL
06555 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
06556   unsigned int backref_map)
06557 {
06558 do {
06559    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
06560      options, PCRE_MULTILINE, FALSE);   /* start looking just after the bracket or OP_ALT item itself */
06561    register int op = *scode;
06562 
06563    /* Non-capturing brackets */
06564 
06565    if (op == OP_BRA)
06566      {
06567      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
06568      }
06569 
06570    /* Capturing brackets: add this group to the map passed down. The shift uses an unsigned constant because n may be 31, and shifting a signed 1 into the sign bit is undefined. */
06571 
06572    else if (op == OP_CBRA)
06573      {
06574      int n = GET2(scode, 1+LINK_SIZE);
06575      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
06576      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
06577      }
06578 
06579    /* Other brackets */
06580 
06581    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
06582      {
06583      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
06584      }
06585 
06586    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
06587    it isn't in brackets that are or may be referenced. */
06588 
06589    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
06590              op == OP_TYPEPOSSTAR))
06591      {
06592      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
06593        return FALSE;
06594      }
06595 
06596    /* Check for explicit anchoring: OP_SOD and OP_SOM always count; OP_CIRC counts only when PCRE_MULTILINE is not set. */
06597 
06598    else if (op != OP_SOD && op != OP_SOM &&
06599            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
06600      return FALSE;
06601    code += GET(code, 1);   /* follow the link to the next OP_ALT, or to the item that closes the group */
06602    }
06603 while (*code == OP_ALT);   /* Loop for each alternative */
06604 return TRUE;
06605 }
06606 
06607 
06608 
06609 /*************************************************
06610 *         Check for starting with ^ or .*        *
06611 *************************************************/
06612 
06613 /* This is called to find out if every branch starts with ^ or .* so that
06614 "first char" processing can be done to speed things up in multiline
06615 matching and for non-DOTALL patterns that start with .* (which must start at
06616 the beginning or after \n). As in the case of is_anchored() (see above), we
06617 have to take account of back references to capturing brackets that contain .*
06618 because in that case we can't make the assumption.
06619 
06620 Arguments:
06621   code           points to start of expression (the bracket)
06622   bracket_map    a bitmap of which brackets we are inside while testing; this
06623                   handles up to substring 31; after that we just have to take
06624                   the less precise approach
06625   backref_map    the back reference bitmap
06626 
06627 Returns:         TRUE or FALSE
06628 */
06629 
06630 static BOOL
06631 is_startline(const uschar *code, unsigned int bracket_map,
06632   unsigned int backref_map)
06633 {
06634 do {
06635    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
06636      NULL, 0, FALSE);
06637    register int op = *scode;
06638 
06639    /* If we are at the start of a conditional assertion group, *both* the
06640    conditional assertion *and* what follows the condition must satisfy the test
06641    for start of line. Other kinds of condition fail. Note that there may be an
06642    auto-callout at the start of a condition. */
06643 
06644    if (op == OP_COND)
06645      {
06646      scode += 1 + LINK_SIZE;
06647      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
06648      switch (*scode)
06649        {
06650        case OP_CREF:
06651        case OP_NCREF:
06652        case OP_RREF:
06653        case OP_NRREF:
06654        case OP_DEF:
06655        return FALSE;
06656 
06657        default:     /* Assertion */
06658        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06659        do scode += GET(scode, 1); while (*scode == OP_ALT);
06660        scode += 1 + LINK_SIZE;
06661        break;
06662        }
06663      scode = first_significant_code(scode, NULL, 0, FALSE);
06664      op = *scode;
06665      }
06666 
06667    /* Non-capturing brackets */
06668 
06669    if (op == OP_BRA)
06670      {
06671      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06672      }
06673 
06674    /* Capturing brackets */
06675 
06676    else if (op == OP_CBRA)
06677      {
06678      int n = GET2(scode, 1+LINK_SIZE);
06679      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
06680      if (!is_startline(scode, new_map, backref_map)) return FALSE;
06681      }
06682 
06683    /* Other brackets */
06684 
06685    else if (op == OP_ASSERT || op == OP_ONCE)
06686      {
06687      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06688      }
06689 
06690    /* .* means "start at start or after \n" if it isn't in brackets that
06691    may be referenced. */
06692 
06693    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
06694      {
06695      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
06696      }
06697 
06698    /* Check for explicit circumflex */
06699 
06700    else if (op != OP_CIRC) return FALSE;
06701 
06702    /* Move on to the next alternative */
06703 
06704    code += GET(code, 1);
06705    }
06706 while (*code == OP_ALT);  /* Loop for each alternative */
06707 return TRUE;
06708 }
06709 
06710 
06711 
06712 /*************************************************
06713 *       Check for asserted fixed first char      *
06714 *************************************************/
06715 
06716 /* During compilation, the "first char" settings from forward assertions are
06717 discarded, because they can cause conflicts with actual literals that follow.
06718 However, if we end up without a first char setting for an unanchored pattern,
06719 it is worth scanning the regex to see if there is an initial asserted first
06720 char. If all branches start with the same asserted char, or with a bracket all
06721 of whose alternatives start with the same asserted char (recurse ad lib), then
06722 we return that char, otherwise -1.
06723 
06724 Arguments:
06725   code       points to start of expression (the bracket)
06726   options    pointer to the options (used to check casing changes)
06727   inassert   TRUE if in an assertion
06728 
06729 Returns:     -1 or the fixed first char
06730 */
06731 
06732 static int
06733 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
06734 {
06735 register int c = -1;
06736 do {
06737    int d;
06738    const uschar *scode =
06739      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
06740    register int op = *scode;
06741 
06742    switch(op)
06743      {
06744      default:
06745      return -1;
06746 
06747      case OP_BRA:
06748      case OP_CBRA:
06749      case OP_ASSERT:
06750      case OP_ONCE:
06751      case OP_COND:
06752      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
06753        return -1;
06754      if (c < 0) c = d; else if (c != d) return -1;
06755      break;
06756 
06757      case OP_EXACT:       /* Fall through */
06758      scode += 2;
06759 
06760      case OP_CHAR:
06761      case OP_CHARNC:
06762      case OP_PLUS:
06763      case OP_MINPLUS:
06764      case OP_POSPLUS:
06765      if (!inassert) return -1;
06766      if (c < 0)
06767        {
06768        c = scode[1];
06769        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
06770        }
06771      else if (c != scode[1]) return -1;
06772      break;
06773      }
06774 
06775    code += GET(code, 1);
06776    }
06777 while (*code == OP_ALT);
06778 return c;
06779 }
06780 
06781 
06782 
06783 /*************************************************
06784 *        Compile a Regular Expression            *
06785 *************************************************/
06786 
06787 /* This function takes a string and returns a pointer to a block of store
06788 holding a compiled version of the expression. The original API for this
06789 function had no error code return variable; it is retained for backwards
06790 compatibility. The new function is given a new name.
06791 
06792 Arguments:
06793   pattern       the regular expression
06794   options       various option bits
06795   errorcodeptr  pointer to error code variable (pcre_compile2() only)
06796                   can be NULL if you don't want a code value
06797   errorptr      pointer to pointer to error text
06798   erroroffset   ptr offset in pattern where error was detected
06799   tables        pointer to character tables or NULL
06800 
06801 Returns:        pointer to compiled data block, or NULL on error,
06802                 with errorptr and erroroffset set
06803 */
06804 
06805 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
06806 pcre_compile(const char *pattern, int options, const char **errorptr,
06807   int *erroroffset, const unsigned char *tables)
06808 {
06809 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
06810 }
06811 
06812 
06813 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
06814 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
06815   const char **errorptr, int *erroroffset, const unsigned char *tables)
06816 {
06817 real_pcre *re;
06818 int length = 1;  /* For final END opcode */
06819 int firstbyte, reqbyte, newline;
06820 int errorcode = 0;
06821 int skipatstart = 0;
06822 BOOL utf8;
06823 size_t size;
06824 uschar *code;
06825 const uschar *codestart;
06826 const uschar *ptr;
06827 compile_data compile_block;
06828 compile_data *cd = &compile_block;
06829 
06830 /* This space is used for "compiling" into during the first phase, when we are
06831 computing the amount of memory that is needed. Compiled items are thrown away
06832 as soon as possible, so that a fairly large buffer should be sufficient for
06833 this purpose. The same space is used in the second phase for remembering where
06834 to fill in forward references to subpatterns. */
06835 
06836 uschar cworkspace[COMPILE_WORK_SIZE];
06837 
06838 /* Set this early so that early errors get offset 0. */
06839 
06840 ptr = (const uschar *)pattern;
06841 
06842 /* We can't pass back an error message if errorptr is NULL; I guess the best we
06843 can do is just return NULL, but we can set a code value if there is a code
06844 pointer. */
06845 
06846 if (errorptr == NULL)
06847   {
06848   if (errorcodeptr != NULL) *errorcodeptr = 99;
06849   return NULL;
06850   }
06851 
06852 *errorptr = NULL;
06853 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
06854 
06855 /* However, we can give a message for this error */
06856 
06857 if (erroroffset == NULL)
06858   {
06859   errorcode = ERR16;
06860   goto PCRE_EARLY_ERROR_RETURN2;
06861   }
06862 
06863 *erroroffset = 0;
06864 
06865 /* Set up pointers to the individual character tables */
06866 
06867 if (tables == NULL) tables = _pcre_default_tables;
06868 cd->lcc = tables + lcc_offset;
06869 cd->fcc = tables + fcc_offset;
06870 cd->cbits = tables + cbits_offset;
06871 cd->ctypes = tables + ctypes_offset;
06872 
06873 /* Check that all undefined public option bits are zero */
06874 
06875 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
06876   {
06877   errorcode = ERR17;
06878   goto PCRE_EARLY_ERROR_RETURN;
06879   }
06880 
06881 /* Check for global one-time settings at the start of the pattern, and remember
06882 the offset for later. */
06883 
06884 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
06885        ptr[skipatstart+1] == CHAR_ASTERISK)
06886   {
06887   int newnl = 0;
06888   int newbsr = 0;
06889 
06890   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
06891     { skipatstart += 7; options |= PCRE_UTF8; continue; }
06892   else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
06893     { skipatstart += 6; options |= PCRE_UCP; continue; }
06894   else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
06895     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
06896 
06897   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
06898     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
06899   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
06900     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
06901   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
06902     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
06903   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
06904     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
06905   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
06906     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
06907 
06908   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
06909     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
06910   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
06911     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
06912 
06913   if (newnl != 0)
06914     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
06915   else if (newbsr != 0)
06916     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
06917   else break;
06918   }
06919 
06920 utf8 = (options & PCRE_UTF8) != 0;
06921 
06922 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
06923 
06924 #ifdef SUPPORT_UTF8
06925 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
06926      (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
06927   {
06928   errorcode = ERR44;
06929   goto PCRE_EARLY_ERROR_RETURN2;
06930   }
06931 #else
06932 if (utf8)
06933   {
06934   errorcode = ERR32;
06935   goto PCRE_EARLY_ERROR_RETURN;
06936   }
06937 #endif
06938 
06939 /* Can't support UCP unless PCRE has been compiled to include the code. */
06940 
06941 #ifndef SUPPORT_UCP
06942 if ((options & PCRE_UCP) != 0)
06943   {
06944   errorcode = ERR67;
06945   goto PCRE_EARLY_ERROR_RETURN;
06946   }
06947 #endif
06948 
06949 /* Check validity of \R options. */
06950 
06951 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
06952   {
06953   case 0:
06954   case PCRE_BSR_ANYCRLF:
06955   case PCRE_BSR_UNICODE:
06956   break;
06957   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06958   }
06959 
06960 /* Handle different types of newline. The three bits give seven cases. The
06961 current code allows for fixed one- or two-byte sequences, plus "any" and
06962 "anycrlf". */
06963 
06964 switch (options & PCRE_NEWLINE_BITS)
06965   {
06966   case 0: newline = NEWLINE; break;   /* Build-time default */
06967   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
06968   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
06969   case PCRE_NEWLINE_CR+
06970        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
06971   case PCRE_NEWLINE_ANY: newline = -1; break;
06972   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
06973   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06974   }
06975 
06976 if (newline == -2)
06977   {
06978   cd->nltype = NLTYPE_ANYCRLF;
06979   }
06980 else if (newline < 0)
06981   {
06982   cd->nltype = NLTYPE_ANY;
06983   }
06984 else
06985   {
06986   cd->nltype = NLTYPE_FIXED;
06987   if (newline > 255)
06988     {
06989     cd->nllen = 2;
06990     cd->nl[0] = (newline >> 8) & 255;
06991     cd->nl[1] = newline & 255;
06992     }
06993   else
06994     {
06995     cd->nllen = 1;
06996     cd->nl[0] = newline;
06997     }
06998   }
06999 
07000 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
07001 references to help in deciding whether (.*) can be treated as anchored or not.
07002 */
07003 
07004 cd->top_backref = 0;
07005 cd->backref_map = 0;
07006 
07007 /* Reflect pattern for debugging output */
07008 
07009 DPRINTF(("------------------------------------------------------------------\n"));
07010 DPRINTF(("%s\n", pattern));
07011 
07012 /* Pretend to compile the pattern while actually just accumulating the length
07013 of memory required. This behaviour is triggered by passing a non-NULL final
07014 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
07015 to compile parts of the pattern into; the compiled code is discarded when it is
07016 no longer needed, so hopefully this workspace will never overflow, though there
07017 is a test for its doing so. */
07018 
07019 cd->bracount = cd->final_bracount = 0;
07020 cd->names_found = 0;
07021 cd->name_entry_size = 0;
07022 cd->name_table = NULL;
07023 cd->start_workspace = cworkspace;
07024 cd->start_code = cworkspace;
07025 cd->hwm = cworkspace;
07026 cd->start_pattern = (const uschar *)pattern;
07027 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
07028 cd->req_varyopt = 0;
07029 cd->external_options = options;
07030 cd->external_flags = 0;
07031 cd->open_caps = NULL;
07032 
07033 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
07034 don't need to look at the result of the function here. The initial options have
07035 been put into the cd block so that they can be changed if an option setting is
07036 found within the regex right at the beginning. Bringing initial option settings
07037 outside can help speed up starting point checks. */
07038 
07039 ptr += skipatstart;
07040 code = cworkspace;
07041 *code = OP_BRA;
07042 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
07043   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
07044   &length);
07045 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
07046 
07047 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
07048   cd->hwm - cworkspace));
07049 
07050 if (length > MAX_PATTERN_SIZE)
07051   {
07052   errorcode = ERR20;
07053   goto PCRE_EARLY_ERROR_RETURN;
07054   }
07055 
07056 /* Compute the size of data block needed and get it, either from malloc or
07057 externally provided function. Integer overflow should no longer be possible
07058 because nowadays we limit the maximum value of cd->names_found and
07059 cd->name_entry_size. */
07060 
07061 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
07062 re = (real_pcre *)(pcre_malloc)(size);
07063 
07064 if (re == NULL)
07065   {
07066   errorcode = ERR21;
07067   goto PCRE_EARLY_ERROR_RETURN;
07068   }
07069 
07070 /* Put in the magic number, and save the sizes, initial options, internal
07071 flags, and character table pointer. NULL is used for the default character
07072 tables. The nullpad field is at the end; it's there to help in the case when a
07073 regex compiled on a system with 4-byte pointers is run on another with 8-byte
07074 pointers. */
07075 
07076 re->magic_number = MAGIC_NUMBER;
07077 re->size = (int)size;
07078 re->options = cd->external_options;
07079 re->flags = cd->external_flags;
07080 re->dummy1 = 0;
07081 re->first_byte = 0;
07082 re->req_byte = 0;
07083 re->name_table_offset = sizeof(real_pcre);
07084 re->name_entry_size = cd->name_entry_size;
07085 re->name_count = cd->names_found;
07086 re->ref_count = 0;
07087 re->tables = (tables == _pcre_default_tables)? NULL : tables;
07088 re->nullpad = NULL;
07089 
07090 /* The starting points of the name/number translation table and of the code are
07091 passed around in the compile data block. The start/end pattern and initial
07092 options are already set from the pre-compile phase, as is the name_entry_size
07093 field. Reset the bracket count and the names_found field. Also reset the hwm
07094 field; this time it's used for remembering forward references to subpatterns.
07095 */
07096 
07097 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
07098 cd->bracount = 0;
07099 cd->names_found = 0;
07100 cd->name_table = (uschar *)re + re->name_table_offset;
07101 codestart = cd->name_table + re->name_entry_size * re->name_count;
07102 cd->start_code = codestart;
07103 cd->hwm = cworkspace;
07104 cd->req_varyopt = 0;
07105 cd->had_accept = FALSE;
07106 cd->check_lookbehind = FALSE;
07107 cd->open_caps = NULL;
07108 
07109 /* Set up a starting, non-extracting bracket, then compile the expression. On
07110 error, errorcode will be set non-zero, so we don't need to look at the result
07111 of the function here. */
07112 
07113 ptr = (const uschar *)pattern + skipatstart;
07114 code = (uschar *)codestart;
07115 *code = OP_BRA;
07116 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
07117   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
07118 re->top_bracket = cd->bracount;
07119 re->top_backref = cd->top_backref;
07120 re->flags = cd->external_flags;
07121 
07122 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
07123 
07124 /* If not reached end of pattern on success, there's an excess bracket. */
07125 
07126 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
07127 
07128 /* Fill in the terminating state and check for disastrous overflow, but
07129 if debugging, leave the test till after things are printed out. */
07130 
07131 *code++ = OP_END;
07132 
07133 #ifndef PCRE_DEBUG
07134 if (code - codestart > length) errorcode = ERR23;
07135 #endif
07136 
07137 /* Fill in any forward references that are required. */
07138 
07139 while (errorcode == 0 && cd->hwm > cworkspace)
07140   {
07141   int offset, recno;
07142   const uschar *groupptr;
07143   cd->hwm -= LINK_SIZE;
07144   offset = GET(cd->hwm, 0);
07145   recno = GET(codestart, offset);
07146   groupptr = _pcre_find_bracket(codestart, utf8, recno);
07147   if (groupptr == NULL) errorcode = ERR53;
07148     else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
07149   }
07150 
07151 /* Give an error if there's back reference to a non-existent capturing
07152 subpattern. */
07153 
07154 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
07155 
07156 /* If there were any lookbehind assertions that contained OP_RECURSE
07157 (recursions or subroutine calls), a flag is set for them to be checked here,
07158 because they may contain forward references. Actual recursions can't be fixed
07159 length, but subroutine calls can. It is done like this so that those without
07160 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
07161 exceptional ones forgo this. We scan the pattern to check that they are fixed
07162 length, and set their lengths. */
07163 
07164 if (cd->check_lookbehind)
07165   {
07166   uschar *cc = (uschar *)codestart;
07167 
07168   /* Loop, searching for OP_REVERSE items, and process those that do not have
07169   their length set. (Actually, it will also re-process any that have a length
07170   of zero, but that is a pathological case, and it does no harm.) When we find
07171   one, we temporarily terminate the branch it is in while we scan it. */
07172 
07173   for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
07174        cc != NULL;
07175        cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
07176     {
07177     if (GET(cc, 1) == 0)
07178       {
07179       int fixed_length;
07180       uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
07181       int end_op = *be;
07182       *be = OP_END;
07183       fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
07184       *be = end_op;
07185       DPRINTF(("fixed length = %d\n", fixed_length));
07186       if (fixed_length < 0)
07187         {
07188         errorcode = (fixed_length == -2)? ERR36 : ERR25;
07189         break;
07190         }
07191       PUT(cc, 1, fixed_length);
07192       }
07193     cc += 1 + LINK_SIZE;
07194     }
07195   }
07196 
07197 /* Failed to compile, or error while post-processing */
07198 
07199 if (errorcode != 0)
07200   {
07201   (pcre_free)(re);
07202   PCRE_EARLY_ERROR_RETURN:
07203   *erroroffset = (int)(ptr - (const uschar *)pattern);
07204   PCRE_EARLY_ERROR_RETURN2:
07205   *errorptr = find_error_text(errorcode);
07206   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
07207   return NULL;
07208   }
07209 
07210 /* If the anchored option was not passed, set the flag if we can determine that
07211 the pattern is anchored by virtue of ^ characters or \A or anything else (such
07212 as starting with .* when DOTALL is set).
07213 
07214 Otherwise, if we know what the first byte has to be, save it, because that
07215 speeds up unanchored matches no end. If not, see if we can set the
07216 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
07217 start with ^. and also when all branches start with .* for non-DOTALL matches.
07218 */
07219 
07220 if ((re->options & PCRE_ANCHORED) == 0)
07221   {
07222   int temp_options = re->options;   /* May get changed during these scans */
07223   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
07224     re->options |= PCRE_ANCHORED;
07225   else
07226     {
07227     if (firstbyte < 0)
07228       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
07229     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
07230       {
07231       int ch = firstbyte & 255;
07232       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
07233          cd->fcc[ch] == ch)? ch : firstbyte;
07234       re->flags |= PCRE_FIRSTSET;
07235       }
07236     else if (is_startline(codestart, 0, cd->backref_map))
07237       re->flags |= PCRE_STARTLINE;
07238     }
07239   }
07240 
07241 /* For an anchored pattern, we use the "required byte" only if it follows a
07242 variable length item in the regex. Remove the caseless flag for non-caseable
07243 bytes. */
07244 
07245 if (reqbyte >= 0 &&
07246      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
07247   {
07248   int ch = reqbyte & 255;
07249   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
07250     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
07251   re->flags |= PCRE_REQCHSET;
07252   }
07253 
07254 /* Print out the compiled data if debugging is enabled. This is never the
07255 case when building a production library. */
07256 
07257 #ifdef PCRE_DEBUG
07258 printf("Length = %d top_bracket = %d top_backref = %d\n",
07259   length, re->top_bracket, re->top_backref);
07260 
07261 printf("Options=%08x\n", re->options);
07262 
07263 if ((re->flags & PCRE_FIRSTSET) != 0)
07264   {
07265   int ch = re->first_byte & 255;
07266   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
07267     "" : " (caseless)";
07268   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
07269     else printf("First char = \\x%02x%s\n", ch, caseless);
07270   }
07271 
07272 if ((re->flags & PCRE_REQCHSET) != 0)
07273   {
07274   int ch = re->req_byte & 255;
07275   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
07276     "" : " (caseless)";
07277   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
07278     else printf("Req char = \\x%02x%s\n", ch, caseless);
07279   }
07280 
07281 pcre_printint(re, stdout, TRUE);
07282 
07283 /* This check is done here in the debugging case so that the code that
07284 was compiled can be seen. */
07285 
07286 if (code - codestart > length)
07287   {
07288   (pcre_free)(re);
07289   *errorptr = find_error_text(ERR23);
07290   *erroroffset = ptr - (uschar *)pattern;
07291   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
07292   return NULL;
07293   }
07294 #endif   /* PCRE_DEBUG */
07295 
07296 return (pcre *)re;
07297 }
07298 
07299 /* End of pcre_compile.c */