Back to index

php5  5.3.10
pcreposix.c
Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2010 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module is a wrapper that provides a POSIX API to the underlying PCRE
00042 functions. */
00043 
00044 
00045 #include "config.h"
00046 
00047 
00048 /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for
00049 compiling these functions. This must come before including pcreposix.h, where
00050 they are set for an application (using these functions) if they have not
00051 previously been set. */
00052 
00053 #if defined(_WIN32) && !defined(PCRE_STATIC)
00054 #  define PCREPOSIX_EXP_DECL extern __declspec(dllexport)
00055 #  define PCREPOSIX_EXP_DEFN __declspec(dllexport)
00056 #endif
00057 
00058 /* We include pcre.h before pcre_internal.h so that the PCRE library functions
00059 are declared as "import" for Windows by defining PCRE_EXP_DECL as "import".
00060 This is needed even though pcre_internal.h itself includes pcre.h, because it
00061 does so after it has set PCRE_EXP_DECL to "export" if it is not already set. */
00062 
00063 #include "pcre.h"
00064 #include "pcre_internal.h"
00065 #include "pcreposix.h"
00066 
00067 
00068 /* Table to translate PCRE compile time error codes into POSIX error codes. */
00069 
00070 static const int eint[] = {
00071   0,           /* no error */
00072   REG_EESCAPE, /* \ at end of pattern */
00073   REG_EESCAPE, /* \c at end of pattern */
00074   REG_EESCAPE, /* unrecognized character follows \ */
00075   REG_BADBR,   /* numbers out of order in {} quantifier */
00076   /* 5 */
00077   REG_BADBR,   /* number too big in {} quantifier */
00078   REG_EBRACK,  /* missing terminating ] for character class */
00079   REG_ECTYPE,  /* invalid escape sequence in character class */
00080   REG_ERANGE,  /* range out of order in character class */
00081   REG_BADRPT,  /* nothing to repeat */
00082   /* 10 */
00083   REG_BADRPT,  /* operand of unlimited repeat could match the empty string */
00084   REG_ASSERT,  /* internal error: unexpected repeat */
00085   REG_BADPAT,  /* unrecognized character after (? */
00086   REG_BADPAT,  /* POSIX named classes are supported only within a class */
00087   REG_EPAREN,  /* missing ) */
00088   /* 15 */
00089   REG_ESUBREG, /* reference to non-existent subpattern */
00090   REG_INVARG,  /* erroffset passed as NULL */
00091   REG_INVARG,  /* unknown option bit(s) set */
00092   REG_EPAREN,  /* missing ) after comment */
00093   REG_ESIZE,   /* parentheses nested too deeply */
00094   /* 20 */
00095   REG_ESIZE,   /* regular expression too large */
00096   REG_ESPACE,  /* failed to get memory */
00097   REG_EPAREN,  /* unmatched parentheses */
00098   REG_ASSERT,  /* internal error: code overflow */
00099   REG_BADPAT,  /* unrecognized character after (?< */
00100   /* 25 */
00101   REG_BADPAT,  /* lookbehind assertion is not fixed length */
00102   REG_BADPAT,  /* malformed number or name after (?( */
00103   REG_BADPAT,  /* conditional group contains more than two branches */
00104   REG_BADPAT,  /* assertion expected after (?( */
00105   REG_BADPAT,  /* (?R or (?[+-]digits must be followed by ) */
00106   /* 30 */
00107   REG_ECTYPE,  /* unknown POSIX class name */
00108   REG_BADPAT,  /* POSIX collating elements are not supported */
00109   REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UTF8 support */
00110   REG_BADPAT,  /* spare error */
00111   REG_BADPAT,  /* character value in \x{...} sequence is too large */
00112   /* 35 */
00113   REG_BADPAT,  /* invalid condition (?(0) */
00114   REG_BADPAT,  /* \C not allowed in lookbehind assertion */
00115   REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */
00116   REG_BADPAT,  /* number after (?C is > 255 */
00117   REG_BADPAT,  /* closing ) for (?C expected */
00118   /* 40 */
00119   REG_BADPAT,  /* recursive call could loop indefinitely */
00120   REG_BADPAT,  /* unrecognized character after (?P */
00121   REG_BADPAT,  /* syntax error in subpattern name (missing terminator) */
00122   REG_BADPAT,  /* two named subpatterns have the same name */
00123   REG_BADPAT,  /* invalid UTF-8 string */
00124   /* 45 */
00125   REG_BADPAT,  /* support for \P, \p, and \X has not been compiled */
00126   REG_BADPAT,  /* malformed \P or \p sequence */
00127   REG_BADPAT,  /* unknown property name after \P or \p */
00128   REG_BADPAT,  /* subpattern name is too long (maximum 32 characters) */
00129   REG_BADPAT,  /* too many named subpatterns (maximum 10,000) */
00130   /* 50 */
00131   REG_BADPAT,  /* repeated subpattern is too long */
00132   REG_BADPAT,  /* octal value is greater than \377 (not in UTF-8 mode) */
00133   REG_BADPAT,  /* internal error: overran compiling workspace */
00134   REG_BADPAT,  /* internal error: previously-checked referenced subpattern not found */
00135   REG_BADPAT,  /* DEFINE group contains more than one branch */
00136   /* 55 */
00137   REG_BADPAT,  /* repeating a DEFINE group is not allowed */
00138   REG_INVARG,  /* inconsistent NEWLINE options */
00139   REG_BADPAT,  /* \g is not followed followed by an (optionally braced) non-zero number */
00140   REG_BADPAT,  /* a numbered reference must not be zero */
00141   REG_BADPAT,  /* an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT) */
00142   /* 60 */
00143   REG_BADPAT,  /* (*VERB) not recognized */
00144   REG_BADPAT,  /* number is too big */
00145   REG_BADPAT,  /* subpattern name expected */
00146   REG_BADPAT,  /* digit expected after (?+ */
00147   REG_BADPAT,  /* ] is an invalid data character in JavaScript compatibility mode */
00148   /* 65 */
00149   REG_BADPAT,  /* different names for subpatterns of the same number are not allowed */
00150   REG_BADPAT,  /* (*MARK) must have an argument */
00151   REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UCP support */
00152   REG_BADPAT,  /* \c must be followed by an ASCII character */
00153 };
00154 
00155 /* Table of texts corresponding to POSIX error codes */
00156 
/* Message texts used by regerror(), indexed by POSIX error code. The number
in each comment is the entry's index in this array; the name beside it is the
POSIX error code the text belongs to. Index 0 is a placeholder that holds an
empty string. */

static const char *const pstring[] = {
  /*  0  (dummy)      */  "",
  /*  1  REG_ASSERT   */  "internal error",
  /*  2  REG_BADBR    */  "invalid repeat counts in {}",
  /*  3  REG_BADPAT   */  "pattern error",
  /*  4  REG_BADRPT   */  "? * + invalid",
  /*  5  REG_EBRACE   */  "unbalanced {}",
  /*  6  REG_EBRACK   */  "unbalanced []",
  /*  7  REG_ECOLLATE */  "collation error - not relevant",
  /*  8  REG_ECTYPE   */  "bad class",
  /*  9  REG_EESCAPE  */  "bad escape sequence",
  /* 10  REG_EMPTY    */  "empty expression",
  /* 11  REG_EPAREN   */  "unbalanced ()",
  /* 12  REG_ERANGE   */  "bad range inside []",
  /* 13  REG_ESIZE    */  "expression too big",
  /* 14  REG_ESPACE   */  "failed to get memory",
  /* 15  REG_ESUBREG  */  "bad back reference",
  /* 16  REG_INVARG   */  "bad argument",
  /* 17  REG_NOMATCH  */  "match failed"
};
00177 
00178 
00179 
00180 
00181 /*************************************************
00182 *          Translate error code to string        *
00183 *************************************************/
00184 
00185 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION
00186 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
00187 {
00188 const char *message, *addmessage;
00189 size_t length, addlength;
00190 
00191 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
00192   "unknown error code" : pstring[errcode];
00193 length = strlen(message) + 1;
00194 
00195 addmessage = " at offset ";
00196 addlength = (preg != NULL && (int)preg->re_erroffset != -1)?
00197   strlen(addmessage) + 6 : 0;
00198 
00199 if (errbuf_size > 0)
00200   {
00201   if (addlength > 0 && errbuf_size >= length + addlength)
00202     sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
00203   else
00204     {
00205     strncpy(errbuf, message, errbuf_size - 1);
00206     errbuf[errbuf_size-1] = 0;
00207     }
00208   }
00209 
00210 return length + addlength;
00211 }
00212 
00213 
00214 
00215 
00216 /*************************************************
00217 *           Free store held by a regex           *
00218 *************************************************/
00219 
00220 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
00221 regfree(regex_t *preg)
00222 {
00223 (pcre_free)(preg->re_pcre);
00224 }
00225 
00226 
00227 
00228 
00229 /*************************************************
00230 *            Compile a regular expression        *
00231 *************************************************/
00232 
00233 /*
00234 Arguments:
00235   preg        points to a structure for recording the compiled expression
00236   pattern     the pattern to compile
00237   cflags      compilation flags
00238 
00239 Returns:      0 on success
00240               various non-zero codes on failure
00241 */
00242 
00243 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00244 regcomp(regex_t *preg, const char *pattern, int cflags)
00245 {
00246 const char *errorptr;
00247 int erroffset;
00248 int errorcode;
00249 int options = 0;
00250 
00251 if ((cflags & REG_ICASE) != 0)    options |= PCRE_CASELESS;
00252 if ((cflags & REG_NEWLINE) != 0)  options |= PCRE_MULTILINE;
00253 if ((cflags & REG_DOTALL) != 0)   options |= PCRE_DOTALL;
00254 if ((cflags & REG_NOSUB) != 0)    options |= PCRE_NO_AUTO_CAPTURE;
00255 if ((cflags & REG_UTF8) != 0)     options |= PCRE_UTF8;
00256 if ((cflags & REG_UCP) != 0)      options |= PCRE_UCP;
00257 if ((cflags & REG_UNGREEDY) != 0) options |= PCRE_UNGREEDY;
00258 
00259 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr,
00260   &erroffset, NULL);
00261 preg->re_erroffset = erroffset;
00262 
00263 /* Safety: if the error code is too big for the translation vector (which
00264 should not happen, but we all make mistakes), return REG_BADPAT. */
00265 
00266 if (preg->re_pcre == NULL)
00267   {
00268   return (errorcode < sizeof(eint)/sizeof(const int))?
00269     eint[errorcode] : REG_BADPAT;
00270   }
00271 
00272 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL);
00273 return 0;
00274 }
00275 
00276 
00277 
00278 
00279 /*************************************************
00280 *              Match a regular expression        *
00281 *************************************************/
00282 
00283 /* Unfortunately, PCRE requires 3 ints of working space for each captured
00284 substring, so we have to get and release working store instead of just using
00285 the POSIX structures as was done in earlier releases when PCRE needed only 2
00286 ints. However, if the number of possible capturing brackets is small, use a
00287 block of store on the stack, to reduce the use of malloc/free. The threshold is
00288 in a macro that can be changed at configure time.
00289 
00290 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will
00291 be set. When this is the case, the nmatch and pmatch arguments are ignored, and
00292 the only result is yes/no/error. */
00293 
00294 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00295 regexec(const regex_t *preg, const char *string, size_t nmatch,
00296   regmatch_t pmatch[], int eflags)
00297 {
00298 int rc, so, eo;
00299 int options = 0;
00300 int *ovector = NULL;
00301 int small_ovector[POSIX_MALLOC_THRESHOLD * 3];
00302 BOOL allocated_ovector = FALSE;
00303 BOOL nosub =
00304   (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0;
00305 
00306 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
00307 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
00308 if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE_NOTEMPTY;
00309 
00310 ((regex_t *)preg)->re_erroffset = (size_t)(-1);  /* Only has meaning after compile */
00311 
00312 /* When no string data is being returned, or no vector has been passed in which
00313 to put it, ensure that nmatch is zero. Otherwise, ensure the vector for holding
00314 the return data is large enough. */
00315 
00316 if (nosub || pmatch == NULL) nmatch = 0;
00317 
00318 else if (nmatch > 0)
00319   {
00320   if (nmatch <= POSIX_MALLOC_THRESHOLD)
00321     {
00322     ovector = &(small_ovector[0]);
00323     }
00324   else
00325     {
00326     if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE;
00327     ovector = (int *)malloc(sizeof(int) * nmatch * 3);
00328     if (ovector == NULL) return REG_ESPACE;
00329     allocated_ovector = TRUE;
00330     }
00331   }
00332 
00333 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
00334 The man page from OS X says "REG_STARTEND affects only the location of the
00335 string, not how it is matched". That is why the "so" value is used to bump the
00336 start location rather than being passed as a PCRE "starting offset". */
00337 
00338 if ((eflags & REG_STARTEND) != 0)
00339   {
00340   so = pmatch[0].rm_so;
00341   eo = pmatch[0].rm_eo;
00342   }
00343 else
00344   {
00345   so = 0;
00346   eo = (int)strlen(string);
00347   }
00348 
00349 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so),
00350   0, options, ovector, (int)(nmatch * 3));
00351 
00352 if (rc == 0) rc = (int)nmatch;    /* All captured slots were filled in */
00353 
00354 /* Successful match */
00355 
00356 if (rc >= 0)
00357   {
00358   size_t i;
00359   if (!nosub)
00360     {
00361     for (i = 0; i < (size_t)rc; i++)
00362       {
00363       pmatch[i].rm_so = ovector[i*2];
00364       pmatch[i].rm_eo = ovector[i*2+1];
00365       }
00366     if (allocated_ovector) free(ovector);
00367     for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
00368     }
00369   return 0;
00370   }
00371 
00372 /* Unsuccessful match */
00373 
00374 if (allocated_ovector) free(ovector);
00375 switch(rc)
00376   {
00377 /* ========================================================================== */
00378   /* These cases are never obeyed. This is a fudge that causes a compile-time
00379   error if the vector eint, which is indexed by compile-time error number, is
00380   not the correct length. It seems to be the only way to do such a check at
00381   compile time, as the sizeof() operator does not work in the C preprocessor.
00382   As all the PCRE_ERROR_xxx values are negative, we can use 0 and 1. */
00383 
00384   case 0:
00385   case (sizeof(eint)/sizeof(int) == ERRCOUNT):
00386   return REG_ASSERT;
00387 /* ========================================================================== */
00388 
00389   case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
00390   case PCRE_ERROR_NULL: return REG_INVARG;
00391   case PCRE_ERROR_BADOPTION: return REG_INVARG;
00392   case PCRE_ERROR_BADMAGIC: return REG_INVARG;
00393   case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
00394   case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
00395   case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
00396   case PCRE_ERROR_BADUTF8: return REG_INVARG;
00397   case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
00398   default: return REG_ASSERT;
00399   }
00400 }
00401 
00402 /* End of pcreposix.c */