Back to index

php5  5.3.10
regex2.h
Go to the documentation of this file.
00001 /*
00002  * First, the stuff that ends up in the outside-world include file
00003  = #ifdef WIN32
00004  = #define API_EXPORT(type)    __declspec(dllexport) type __stdcall
00005  = #else
00006  = #define API_EXPORT(type)    type
00007  = #endif
00008  =
00009  = typedef off_t regoff_t;
00010  = typedef struct {
00011  =     int re_magic;
00012  =     size_t re_nsub;             // number of parenthesized subexpressions
00013  =     const unsigned char *re_endp;      // end pointer for REG_PEND
00014  =     struct re_guts *re_g;       // none of your business :-)
00015  = } regex_t;
00016  = typedef struct {
00017  =     regoff_t rm_so;             // start of match
00018  =     regoff_t rm_eo;             // end of match
00019  = } regmatch_t;
00020  */
00021 /*
00022  * internals of regex_t
00023  */
00024 #define       MAGIC1 ((('r'^0200)<<8) | 'e')       /* 'r' with bit 0200 flipped, shifted up 8 bits, OR'ed with 'e'; presumably the value expected in regex_t.re_magic -- TODO confirm */
00025 
00026 /*
00027  * The internal representation is a *strip*, a sequence of
00028  * operators ending with an endmarker.  (Some terminology etc. is a
00029  * historical relic of earlier versions which used multiple strips.)
00030  * Certain oddities in the representation are there to permit running
00031  * the machinery backwards; in particular, any deviation from sequential
00032  * flow must be marked at both its source and its destination.  Some
00033  * fine points:
00034  *
00035  * - OPLUS_ and O_PLUS are *inside* the loop they create.
00036  * - OQUEST_ and O_QUEST are *outside* the bypass they create.
00037  * - OCH_ and O_CH are *outside* the multi-way branch they create, while
00038  *   OOR1 and OOR2 are respectively the end and the beginning of one of
00039  *   the branches.  Note that there is an implicit OOR2 following OCH_
00040  *   and an implicit OOR1 preceding O_CH.
00041  *
00042  * In state representations, an operator's bit is on to signify a state
00043  * immediately *preceding* "execution" of that operator.
00044  */
00045 typedef long sop;           /* strip operator: operator code in the OPRMASK bits, operand in the OPDMASK bits */
00046 typedef long sopno;         /* used in struct re_guts for the number of sops and for state positions */
00047 #define       OPRMASK       0x7c000000    /* operator field: bits 26..30 */
00048 #define       OPDMASK       0x03ffffff    /* operand field: low 26 bits */
00049 #define       OPSHIFT       (26)          /* bit position where the operator field starts */
00050 #define       OP(n)  ((n)&OPRMASK)        /* operator part of n, left in place (not shifted down) */
00051 #define       OPND(n)       ((n)&OPDMASK) /* operand part of n */
00052 #define       SOP(op, opnd) ((op)|(opnd)) /* plain OR: opnd is not masked here, so it must already fit in OPDMASK */
00053 /* operators (code<<OPSHIFT)       meaning          operand                */
00054 /*                                                  (back, fwd are offsets) */
00055 #define       OEND   (1<<OPSHIFT)  /* endmarker        -                      */
00056 #define       OCHAR  (2<<OPSHIFT)  /* character        unsigned char          */
00057 #define       OBOL   (3<<OPSHIFT)  /* left anchor      -                      */
00058 #define       OEOL   (4<<OPSHIFT)  /* right anchor     -                      */
00059 #define       OANY   (5<<OPSHIFT)  /* .                -                      */
00060 #define       OANYOF (6<<OPSHIFT)  /* [...]            set number             */
00061 #define       OBACK_ (7<<OPSHIFT)  /* begin \d         paren number           */
00062 #define       O_BACK (8<<OPSHIFT)  /* end \d           paren number           */
00063 #define       OPLUS_ (9<<OPSHIFT)  /* + prefix         fwd to suffix          */
00064 #define       O_PLUS (10<<OPSHIFT) /* + suffix         back to prefix         */
00065 #define       OQUEST_       (11<<OPSHIFT) /* ? prefix         fwd to suffix          */
00066 #define       O_QUEST       (12<<OPSHIFT) /* ? suffix         back to prefix         */
00067 #define       OLPAREN       (13<<OPSHIFT) /* (                fwd to )               */
00068 #define       ORPAREN       (14<<OPSHIFT) /* )                back to (              */
00069 #define       OCH_   (15<<OPSHIFT) /* begin choice     fwd to OOR2            */
00070 #define       OOR1   (16<<OPSHIFT) /* | pt. 1          back to OOR1 or OCH_   */
00071 #define       OOR2   (17<<OPSHIFT) /* | pt. 2          fwd to OOR2 or O_CH    */
00072 #define       O_CH   (18<<OPSHIFT) /* end choice       back to OOR1           */
00073 #define       OBOW   (19<<OPSHIFT) /* begin word       -                      */
00074 #define       OEOW   (20<<OPSHIFT) /* end word         -                      */
00075 
00076 /*
00077  * Structure for [] character-set representation.  Character sets are
00078  * done as bit vectors, grouped 8 to a byte vector for compactness.
00079  * The individual set therefore has both a pointer to the byte vector
00080  * and a mask to pick out the relevant bit of each byte.  A hash code
00081  * simplifies testing whether two sets could be identical.
00082  *
00083  * This will get trickier for multicharacter collating elements.  As
00084  * preliminary hooks for dealing with such things, we also carry along
00085  * a string of multi-character elements, and decide the size of the
00086  * vectors at run time.
00087  */
00088 typedef struct {            /* one [] character set: a byte vector plus the bit that belongs to this set */
00089        uch *ptr;            /* -> uch [csetsize]; uch is declared outside this listing */
00090        uch mask;            /* bit within array: the bit of each ptr[] byte used by this set */
00091        uch hash;            /* hash code; CHadd adds and CHsub subtracts the character value */
00092        size_t smultis;      /* presumably the size of the multis buffer -- TODO confirm */
00093        unsigned char *multis;             /* -> char[smultis]  ab\0cd\0ef\0\0 */
00094 } cset;
00095 /* note that CHadd and CHsub are unsafe (cs and c are expanded more than once, so pass no side effects), and CHIN doesn't yield 0/1 (it yields 0 or the masked bit) */
00096 #define       CHadd(cs, c)  ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))       /* set this set's bit for c, then add c to the hash */
00097 #define       CHsub(cs, c)  ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))      /* clear this set's bit for c, then subtract c from the hash */
00098 #define       CHIN(cs, c)   ((cs)->ptr[(uch)(c)] & (cs)->mask)       /* nonzero when this set's bit is on for c */
00099 #define       MCadd(p, cs, cp)     mcadd(p, cs, cp)     /* regcomp() internal fns */
00100 #define       MCsub(p, cs, cp)     mcsub(p, cs, cp)     /* forwards unchanged to mcsub() */
00101 #define       MCin(p, cs, cp)      mcin(p, cs, cp)      /* forwards unchanged to mcin() */
00102 
00103 /* stuff for character categories */
00104 typedef unsigned char cat_t;       /* element type of re_guts.categories and re_guts.catspace */
00105 
00106 /*
00107  * main compiled-expression structure
00108  */
00109 struct re_guts {
00110        int magic;           /* presumably MAGIC2 for a valid structure -- TODO confirm */
00111 #             define MAGIC2 ((('R'^0200)<<8)|'E')       /* 'R' with bit 0200 flipped, shifted up 8 bits, OR'ed with 'E' */
00112        sop *strip;          /* malloced area for strip */
00113        int csetsize;        /* number of bits in a cset vector */
00114        int ncsets;          /* number of csets in use */
00115        cset *sets;          /* -> cset [ncsets] */
00116        uch *setbits;        /* -> uch[csetsize][ncsets/CHAR_BIT] */
00117        int cflags;          /* copy of regcomp() cflags argument */
00118        sopno nstates;              /* = number of sops */
00119        sopno firststate;    /* the initial OEND (normally 0) */
00120        sopno laststate;     /* the final OEND */
00121        int iflags;          /* internal flags: bit values defined just below */
00122 #             define USEBOL 01     /* used ^ */
00123 #             define USEEOL 02     /* used $ */
00124 #             define BAD    04     /* something wrong */
00125        int nbol;            /* number of ^ used */
00126        int neol;            /* number of $ used */
00127        int ncategories;     /* how many character categories */
00128        cat_t *categories;   /* ->catspace[-UCHAR_MIN] */
00129        unsigned char *must;        /* match must contain this string */
00130        int mlen;            /* length of must */
00131        size_t nsub;         /* copy of re_nsub */
00132        int backrefs;        /* does it use back references? */
00133        sopno nplus;         /* how deep does it nest +s? */
00134        /* catspace must be last: it is declared [1] but used as [NC], so the struct is presumably allocated with extra room after it -- TODO confirm */
00135        cat_t catspace[1];   /* actually [NC]; NC is defined outside this listing */
00136 };
00137 
00138 /* misc utilities */
00139 #define       OUT    (UCHAR_MAX+1) /* a non-character value; UCHAR_MAX comes from <limits.h> */
00140 #define       ISWORD(c)     (isalnum(c) || (c) == '_') /* word-character test; expands c twice. NOTE(review): isalnum() is only defined for unsigned char values and EOF, so c must be in that range (OUT itself is not) -- confirm callers */