Back to index

plt-scheme  4.2.1
schrx.h
Go to the documentation of this file.
00001 
00002 /* The INDIRECT_TO_PROGRAM mode can be useful for debugging
00003    in 3m to ensure that a regexp "program" is not misinterpreted
00004    as a pointer by the conservative collector. (This was a problem
00005    once, anyway.) */
00006 /* #define INDIRECT_TO_PROGRAM */
00007 
00008 typedef long rxpos;
00009 
00010 struct Regwork;
00011 
00012 typedef int (*Scheme_Regexp_Matcher)(struct Regwork *rw);
00013 
00014 typedef struct regexp {
00015   Scheme_Type type;
00016   MZ_HASH_KEY_EX
00017   Scheme_Object *source;
00018   long nsubexp, ncounter, maxlookback;
00019   long regsize;
00020   short flags;
00021   unsigned char *regstart;      /* Infor about required starting bytes */
00022   long regmust;                 /* pointer relative to self to required starting string */
00023   long regmlen;                    /* length of the string at regmust */
00024 #ifdef INDIRECT_TO_PROGRAM
00025   char *program;
00026 #else
00027   char program[1];          /* Unwarranted chumminess with compiler. */
00028 #endif
00029 } regexp;
00030 
00031 #define REGEXP_IS_UTF8 0x01
00032 #define REGEXP_IS_PCRE 0x02
00033 #define REGEXP_ANCH    0x04
00034 #define REGEXP_MUST_CI 0x08
00035 #define REGEXP_JIT     0x10
00036 
00037 #ifdef INDIRECT_TO_PROGRAM
00038 # define N_ITO_DELTA(prog, extra, re) extra
00039 # define N_ITO_SPACE(v) 0
00040 # define ITO(x, y) x
00041 #else
00042 # define N_ITO_DELTA(prog, extra, re) ((prog+extra) - re)
00043 # define N_ITO_SPACE(v) v
00044 # define ITO(x, y) y
00045 #endif
00046 
00047 /* 156 is octal 234: */
00048 #define MAGIC   156
00049 
00050 /*
00051  * The "internal use only" fields in regexp.h are present to pass info from
00052  * compile to execute that permits the execute phase to run lots faster on
00053  * simple cases.  They are:
00054  *
00055  * regstart   bitmap for chars that must begin a match; NULL if none obvious
00056  * reganch    is the match anchored (at beginning-of-line only)?
00057  * regmust    string (pointer into program) that match must include, or NULL
00058  * regmlen    length of regmust string
00059  *
00060  * Regstart and reganch permit very fast decisions on suitable starting points
00061  * for a match, cutting down the work a lot.  Regmust permits fast rejection
00062  * of lines that cannot possibly match.  The regmust tests are costly enough
00063  * that regcomp() supplies a regmust only if the r.e. contains something
00064  * potentially expensive (at present, the only such thing detected is * or +
00065  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
00066  * supplied because the test in regexec() needs it and regcomp() is computing
00067  * it anyway.
00068  */
00069 
00070 /*
00071  * Structure for regexp "program".  This is essentially a linear encoding
00072  * of a nondeterministic finite-state machine (aka syntax charts or
00073  * "railroad normal form" in parsing technology).  Each node is an opcode
00074  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
00075  * all nodes except BRANCH implement concatenation; a "next" pointer with
00076  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
00077  * have one of the subtle syntax dependencies:  an individual BRANCH (as
00078  * opposed to a collection of them) is never concatenated with anything
00079  * because of operator precedence.)  The operand of some types of node is
00080  * a literal string; for others, it is a node leading into a sub-FSM.  In
00081  * particular, the operand of a BRANCH node is the first node of the branch.
00082  * (NB this is *not* a tree structure:  the tail of the branch connects
00083  * to the thing following the set of BRANCHes.)  The opcodes are:
00084  */
00085 
00086 /* definition     number  opnd?   meaning */
00087 #define END       0       /* no   End of program. */
00088 #define BOI       1       /* no   Match "" at beginning of input. */
00089 #define EOI       2       /* no   Match "" at end of input. */
00090 #define ANY       3       /* no   Match any one character. */
00091 #define ANYL      4       /* no   Anything but a linefeed */
00092 #define ANYOF     5       /* bitmap  Match any character in the bitmap. */
00093 #define EXACTLY1  6       /* byte Match the character. */
00094 #define RANGE     7       /* byte,byte  Match any character in this range. */
00095 #define NOTRANGE  8       /* byte,byte  Match any character not in this range. */
00096 #define BRANCH    9       /* node Match this alternative, or the next... */
00097 #define BACK      10      /* no   Match "", "next" ptr points backward. */
00098 #define EXACTLY   11      /* str  Match this string. */
00099 #define EXACTLY_CI 12     /* str  Match this string. */
00100 #define NOTHING   13      /* no   Match empty string. */
00101 #define STAR      14      /* node Match this (simple) thing 0 or more times. */
00102 #define PLUS      15      /* node Match this (simple) thing 1 or more times. */
00103 #define STAR2     16      /* non-greedy star. */
00104 #define PLUS2     17      /* non-greedy plus. */
00105 #define STAR3     18      /* 2 nos  Like STAR, but with numeric quantifiers */
00106 #define STAR4     19      /* 2 nos  non-greedy STAR3 */
00107 #define OPENN     20      /* like OPEN, but with an n >= 50, or n == 0 means (?:...) */
00108 #define CLOSEN    21      /* like CLOSE, but with an n >= 50 */
00109 #define LOOKT     22      /* (?=...) or (?<=...)*/
00110 #define LOOKF     23      /* (?!...) or (?<!...) */
00111 #define LOOKTX    24      /* (?>...) */
00112 #define LOOKBT    25      /* (?<=...)*/
00113 #define LOOKBF    26      /* (?<!...) */
00114 #define LOOKE     27      /* ender for LOOK* */
00115 #define BACKREF   28      /* <n>, to match exactly the result for cluster <n> */
00116 #define BACKREF_CI 29      /* case-insensitive version */
00117 #define COUNTINIT 30 
00118 #define COUNTOVER 31
00119 #define COUNTUNDER 32
00120 #define COUNTBACK 33
00121 #define COUNTBACKFAIL 34
00122 #define SAVECONST 35      /* no no   Save position and count */
00123 #define MAYBECONST 36      /* no no   Save position and count */
00124 #define WORDBOUND 37
00125 #define NOTWORDBOUND 38
00126 #define BOL       39      /* no   Match "" at beginning of line. */
00127 #define EOL       40      /* no   Match "" at end of line. */
00128 #define UNIPROP   41
00129 #define CONDITIONAL 42
00130 #define EXACTLY2  43      /* byte,byte  Match either byte (useful for some CI cases) */
00131 #define OPEN      44      /* no   Mark this point in input as start of #n. */
00132 /*      OPEN+1 is number 1, etc. */
00133 #define CLOSE     78      /* no   Analogous to OPEN. */
00134 
00135 # define OPSTR(o) (o + 2)
00136 # define OPSTRx(o) (o + 1)
00137 # define OPLEN(o, regstr) ((int)(((unsigned char *)regstr)[o] << 8) | (((unsigned char *)regstr)[o+1]))
00138 # define OPRNGS(o, regstr) ((int)(((unsigned char *)regstr)[o]))
00139 
00140 /*
00141  * Opcode notes:
00142  *
00143  * BRANCH     The set of branches constituting a single choice are hooked
00144  *            together with their "next" pointers, since precedence prevents
00145  *            anything being concatenated to any individual branch.  The
00146  *            "next" pointer of the last BRANCH in a choice points to the
00147  *            thing following the whole choice.  This is also where the
00148  *            final "next" pointer of each individual branch points; each
00149  *            branch starts with the operand node of a BRANCH node.
00150  *
00151  * BACK              Normal "next" pointers all implicitly point forward; BACK
00152  *            exists to make loop structures possible.
00153  *
00154  * STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
00155  *            BRANCH structures using BACK.  Simple cases (one character
00156  *            per match) are implemented with STAR and PLUS for speed
00157  *            and to minimize recursive plunges.
00158  *
00159  * OPEN,CLOSE ...are numbered at compile time.
00160  */
00161 
00162 /*
00163  * A node is one char of opcode followed by two chars of "next" pointer.
00164  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
00165  * value is a positive offset from the opcode of the node containing it.
00166  * An operand, if any, simply follows the node.  (Note that much of the
00167  * code generation knows about this implicit relationship.)
00168  *
00169  * Using two bytes for the "next" pointer is vast overkill for most things,
00170  * but allows patterns to get big without disasters.
00171  */
00172 #define       OP(p, regstr) (regstr[p])
00173 #define       NEXT(p, regstr)      (((((unsigned char *)regstr)[(p)+1]&255)<<8) + (((unsigned char *)regstr)[(p)+2]&255))
00174 #define       OPERAND(p)    ((p) + 3)
00175 #define       OPERAND2(p)   ((p) + 5)
00176 #define       OPERAND3(p)   ((p) + 7)
00177 #define       OPERAND4(p)   ((p) + 9)
00178 
00179 /*
00180  * See regmagic.h for one further detail of program structure.
00181  */
00182 
00183 
00184 /*
00185  * Utility definitions.
00186  */
00187 #define       UCHAR(v) ((unsigned char)(v))
00188 
00189 #define       ISMULT(c, parse_flags)  ((c) == '*' || (c) == '+' || (c) == '?' || ((parse_flags & PARSE_PCRE) && ((c) == '{')))
00190 #define       META      "^$.[()|?+*\\"
00191 #define       PCRE_META  "^$.[()|?+*\\{}]"
00192 
00193 /*
00194  * Flags to be passed up and down.
00195  */
00196 #define       HASWIDTH       0x01  /* Known never to match null string. */
00197 #define       SIMPLE        0x02   /* Simple enough to be STAR/PLUS operand. */
00198 #define       SPSTART              0x04   /* Starts with * or +. */
00199 #define       SPFIXED              0x08   /* Always matches a particular length */
00200 #define NEEDSAVECONST  0x10     /* Fixed-size thing inside (), lift out save in case of repeat. */
00201 #define SPNOTHING      0x20     /* Unconditionally matches nothing. */
00202 #define       WORST         0      /* Worst case. */
00203 
00204 /* Parser flags: */
00205 #define PARSE_CASE_SENS    0x1
00206 #define PARSE_PCRE         0x2
00207 #define PARSE_SINGLE_LINE  0x4
00208 
00209 #define rx_tolower(c) (((c >= 'A') && (c <= 'Z')) ? (c + ('a' - 'A')) : c)
00210 #define rx_toupper(c) (((c >= 'a') && (c <= 'z')) ? (c - ('a' - 'A')) : c)
00211 #define rx_isword(c) (((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9')) || (c == '_'))
00212 
00213 /*
00214  * Work variables for regtry().
00215  */
00216 typedef struct Regwork {
00217   MZTAG_IF_REQUIRED
00218   char *str;              /* copy of regstr; used only to protect before thread swaps */
00219   char *instr;
00220   Scheme_Object *port;
00221   Scheme_Object *unless_evt;
00222   short nonblock, aborted;
00223   rxpos instr_size;       /* For port reads */
00224   rxpos input_maxend;     /* For port reads */
00225   rxpos input, input_end, input_start; /* String-input pointer. */
00226   rxpos boi, bol;      /* Beginning of input/line, for ^ check. */
00227   rxpos *startp;       /* Pointer to startp array. */
00228   rxpos *maybep;       /* Pointer to tentative startp array. */
00229   rxpos *endp;                /* Ditto for endp. */
00230   int *counters;          /* For {} counters */
00231   Scheme_Object *peekskip;
00232 } Regwork;