Back to index

plt-scheme  4.2.1
regexp.c
Go to the documentation of this file.
00001 /*
00002  * @(#)regexp.c      1.3 of 18 April 87
00003  * Revised for PLT MzScheme, 1995-2001
00004  * Copyright (c) 2004-2009 PLT Scheme Inc.
00005  *
00006  *     Copyright (c) 1986 by University of Toronto.
00007  *     Written by Henry Spencer.  Not derived from licensed software.
00008  *
00009  *     Permission is granted to anyone to use this software for any
00010  *     purpose on any computer system, and to redistribute it freely,
00011  *     subject to the following restrictions:
00012  *
00013  *     1. The author is not responsible for the consequences of use of
00014  *            this software, no matter how awful, even if they arise
00015  *            from defects in it.
00016  *
00017  *     2. The origin of this software must not be misrepresented, either
00018  *            by explicit claim or by omission.
00019  *
00020  *     3. Altered versions must be plainly marked as such, and must not
00021  *            be misrepresented as being the original software.
00022  *
00023  * Beware that some of this code is subtly aware of the way operator
00024  * precedence is structured in regular expressions.  Serious changes in
00025  * regular-expression syntax might require a total rethink.
00026  *
00027  * Notable changes for MzScheme:
00028  *   Removed hardwired limits on parenthesis nesting
00029  *   Changed to index-based instead of pointer-based (better for GC)
00030  *   Added non-greedy operators *?, +?, and ??
00031  *   Added (?:...) grouping without reporting the group match
00032  *   Added (?=...), (?!...), (?<=...), and (?<!...) lookahead and lookback
00033  *   Added \n backreferences
00034  *   Added numeric quantifiers
00035  *   Added case-insensitive and multi-line modes
00036  *   Added MzScheme glue
00037  *
00038  * from Vladimir Tsyshevsky:
00039  *  additional optional parameter `offset' in `regexp-match'
00040  *  and `regexp-match-positions'
00041  */
00042 
00043 #include "schpriv.h"
00044 #include "schmach.h"
00045 #include "schgencat.h"
00046 #include "schrx.h"
00047 
00048 #include <stdio.h>
00049 #include <string.h>
00050 
00051 #ifdef SIXTY_FOUR_BIT_INTEGERS
00052 # define BIGGEST_RXPOS 0x7FFFFFFFFFFFFFFF
00053 #else
00054 # define BIGGEST_RXPOS 0x7FFFFFFF
00055 #endif
00056 
00057 # define rOP(o) OP(o, regstr)
00058 # define rNEXT(o) NEXT(o, regstr)
00059 # define rOPLEN(o) OPLEN(o, regstr)
00060 # define rOPRNGS(o) OPRNGS(o, regstr)
00061 # define NEXT_OP(scan) (scan + rNEXT(scan))
00062 
00063 static regexp *regcomp(char *, rxpos, int, int);
00064 /* static int regexec(regexp *, char *, int, int, rxpos *, rxpos * ...); */
00065 
00066 /*
00067  * Global work variables for regcomp().
00068  */
00069 static THREAD_LOCAL char *regstr;
00070 static THREAD_LOCAL char *regparsestr;
00071 static THREAD_LOCAL int regmatchmin;
00072 static THREAD_LOCAL int regmatchmax;
00073 static THREAD_LOCAL int regmaxbackposn;
00074 static THREAD_LOCAL int regsavepos;
00075 
00076 static THREAD_LOCAL Scheme_Hash_Table *regbackknown; /* known/assumed backreference [non-]empty */
00077 static THREAD_LOCAL Scheme_Hash_Table *regbackdepends; /* backreferences required to be non-empty for the current to be non-empty */
00078 
00079 static THREAD_LOCAL rxpos regparse;
00080 static THREAD_LOCAL rxpos regparse_end; /* Input-scan pointer. */
00081 static THREAD_LOCAL int regnpar;             /* () count. */
00082 static THREAD_LOCAL int regncounter;          /* {} count */
00083 static THREAD_LOCAL rxpos regcode;           /* Code-emit pointer, if less than regcodesize */
00084 static THREAD_LOCAL rxpos regcodesize;
00085 static THREAD_LOCAL rxpos regcodemax;
00086 static THREAD_LOCAL long regmaxlookback;
00087 
00088 /* caches to avoid gc */
00089 static THREAD_LOCAL long rx_buffer_size;
00090 static THREAD_LOCAL rxpos *startp_buffer_cache;
00091 static THREAD_LOCAL rxpos *endp_buffer_cache;
00092 static THREAD_LOCAL rxpos *maybep_buffer_cache;
00093 
00094 /*
00095  * Forward declarations for regcomp()'s friends.
00096  */
00097 static rxpos reg(int, int *, int, int, int);
00098 static rxpos regbranch(int *, int, int);
00099 static rxpos regpiece(int *, int, int);
00100 static rxpos regatom(int *, int, int);
00101 static rxpos regranges(int parse_flags, int at_start);
00102 static rxpos regunicode(int invert);
00103 static int regdigit();
00104 static rxpos regnode(char);
00105 static void regarg(int);
00106 static rxpos regnext(rxpos);
00107 static void regc(char);
00108 static void reginsert(char, rxpos);
00109 static rxpos reginsertwithop(char, rxpos, int);
00110 static rxpos reginsertwithopop(char, rxpos, int, int);
00111 static void regtail(rxpos, rxpos);
00112 static void regoptail(rxpos, rxpos);
00113 static int regstrcspn(char *, char *, char *);
00114 static unsigned char *extract_regstart(rxpos scan, int *_anch);
00115 
00116 static int check_and_propagate_depends(void);
00117 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src);
00118 
00119 #define       FAIL(m)       { regcomperror(m); return 0; }
00120 
00121 static void
00122 regerror(char *s)
00123 {
00124   scheme_raise_exn(MZEXN_FAIL_CONTRACT,
00125                  "regexp: %s", s);
00126 }
00127 
00128 const char *failure_msg_for_read;
00129 
00130 static void
00131 regcomperror(char *s)
00132 {
00133   if (failure_msg_for_read) {
00134     failure_msg_for_read = s;
00135     scheme_longjmp(scheme_error_buf, 1);
00136   } else
00137     regerror(s);
00138 }
00139 
00140 /*
00141  - regcomp - compile a regular expression into internal code
00142  *
00143  * We can't allocate space until we know how big the compiled form will be,
00144  * but we can't compile it (and thus know how big it is) until we've got a
00145  * place to put the code.  So we cheat:  we compile it twice, once with code
00146  * generation turned off and size counting turned on, and once "for real".
00147  * This also means that we don't allocate space until we are sure that the
00148  * thing really will compile successfully, and we never have to move the
00149  * code and thus invalidate pointers into it.  (Note that it has to be in
00150  * one piece because free() must be able to free it all.)
00151  *
00152  * Beware that the optimization-preparation code in here knows about some
00153  * of the structure of the compiled regexp.
00154  */
00155 static regexp *
00156 regcomp(char *expstr, rxpos exp, int explen, int pcre)
00157 {
00158   regexp *r;
00159   rxpos scan, next;
00160   rxpos longest;
00161   int len, longest_is_ci;
00162   int flags;
00163 
00164   /* First pass: determine size, legality. */
00165   regstr = NULL;
00166   regparsestr = expstr;
00167   regparse = exp;
00168   regparse_end = exp + explen;
00169   regnpar = 1;
00170   regncounter = 0;
00171   regmaxlookback = 0;
00172   regcode = 1;
00173   regcodesize = 0;
00174   regcodemax = 0;
00175   regmaxbackposn = 0;
00176   regbackknown = NULL;
00177   regbackdepends = NULL;
00178   regc(MAGIC);
00179   if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
00180     FAIL("unknown regexp failure");
00181   }
00182   
00183   /* Small enough for pointer-storage convention? */
00184   if (regcodemax >= 32767L)        /* Probably could be 65535L. */
00185     FAIL("regexp too big");
00186 
00187   if (regmaxbackposn >= regnpar)
00188     FAIL("backreference number is larger than the highest-numbered cluster");
00189   
00190   /* Allocate space. */
00191   r = (regexp *)scheme_malloc_tagged(sizeof(regexp) + N_ITO_SPACE((unsigned)regcodemax));
00192   r->type = scheme_regexp_type;
00193   
00194 #ifdef INDIRECT_TO_PROGRAM
00195   r->program = (char *)scheme_malloc_atomic((unsigned)regcodemax + 1);
00196 #endif
00197   
00198   r->regsize = regcodemax;
00199 
00200   r->nsubexp = regnpar;
00201   r->ncounter = regncounter;
00202   r->maxlookback = regmaxlookback;
00203   
00204   /* Second pass: emit code. */
00205   regparse = exp;
00206   regparse_end = exp + explen;
00207   regnpar = 1;
00208   regncounter = 0;
00209   regcodesize = regcodemax;
00210 #ifdef INDIRECT_TO_PROGRAM
00211   regstr = r->program;
00212   regcode = 0;
00213 #else
00214   regstr = (char *)r;
00215   regcode = (char *)r->program - (char *)r;
00216 #endif
00217   regcodesize += regcode;
00218   regcodemax = 0;
00219   regbackknown = NULL;
00220   regbackdepends = NULL;
00221   regc(MAGIC);
00222   if (reg(0, &flags, 0, 0, PARSE_CASE_SENS | PARSE_SINGLE_LINE | (pcre ? PARSE_PCRE : 0)) == 0) {
00223     FAIL("unknown regexp failure (late)");
00224   }
00225 
00226   if (regcode >= regcodesize) {
00227     FAIL("wrote too far");
00228   }
00229 
00230   /* Dig out information for optimizations. */
00231   r->regstart = NULL;       /* Worst-case defaults. */
00232   r->regmust = -1;
00233   r->regmlen = 0;
00234   scan = N_ITO_DELTA(r->program, 1, (char *)r);    /* First BRANCH. */
00235   {
00236     unsigned char *rs;
00237     int anch = 0;
00238     rs = extract_regstart(scan, &anch);
00239     r->regstart = rs;
00240     if (anch)
00241       r->flags |= REGEXP_ANCH;
00242 
00243   }
00244   next = regnext(scan);
00245   if (rOP(next) == END) {   /* Only one top-level choice. */
00246     scan = OPERAND(scan);
00247     /*
00248      * If there's something expensive in the r.e., find the
00249      * longest literal string that must appear and make it the
00250      * regmust.  Resolve ties in favor of later strings, since
00251      * the regstart check works with the beginning of the r.e.
00252      * and avoiding duplication strengthens checking.  Not a
00253      * strong reason, but sufficient in the absence of others.
00254      */
00255     if (flags&SPSTART) {
00256       int prev_op = 0;
00257       longest = 0;
00258       longest_is_ci = 0;
00259       len = 0;
00260       for (; scan != 0; scan = regnext(scan)) {
00261         int mscan = scan;
00262         while (1) {
00263           int mop;
00264           mop = rOP(mscan);
00265           if (((mop == EXACTLY) || (mop == EXACTLY_CI))
00266               && rOPLEN(OPERAND(mscan)) >= len) {
00267             /* Skip regmust if it contains a null character: */
00268             rxpos ls = OPSTR(OPERAND(mscan));
00269             int ll = rOPLEN(OPERAND(mscan)), i;
00270             for (i = 0; i < ll; i++) {
00271               if (!regstr[ls + i])
00272                 break;
00273             }
00274             if (i >= ll) {
00275               longest = ls;
00276               len = ll;
00277               longest_is_ci = (rOP(mscan) == EXACTLY_CI);
00278             }
00279             break;
00280           } else if ((mop == EXACTLY1) && (1 >= len)) {
00281             /* Skip if it's a null character */
00282             if (regstr[OPERAND(mscan)]) {
00283               longest = OPERAND(mscan);
00284               len = 1;
00285               longest_is_ci = 0;
00286             }
00287             break;
00288           } else if ((mop == BRANCH) && (prev_op != BRANCH)) {
00289             int mnext;
00290             mnext = NEXT_OP(mscan);
00291             if (rOP(mnext) != BRANCH) {
00292               /* A branch with only one choice */
00293               mscan = OPERAND(mscan);
00294             } else
00295               break;
00296           } else
00297             break;
00298         }
00299         prev_op = rOP(scan);
00300       }
00301       if (longest) {
00302        r->regmust = longest;
00303        if (longest_is_ci)
00304          r->flags |= REGEXP_MUST_CI;
00305        r->regmlen = len;
00306       }
00307     }
00308   }
00309 
00310 #if 0
00311   if (regcode > r->regsize + sizeof(regexp))
00312     scheme_signal_error("regexp too large!");
00313 #endif
00314   
00315   return(r);
00316 }
00317 
00318 static unsigned char *map_create(unsigned char *map)
00319 {
00320   if (!map) {
00321     map = (unsigned char *)scheme_malloc_atomic(32);
00322     memset(map, 0, 32);
00323   }
00324   return map;
00325 }
00326 
00327 static unsigned char *map_start(unsigned char *map, int c)
00328 {
00329   map = map_create(map);
00330   map[c >> 3] |= ((unsigned char)1 << (c & 0x7));
00331   return map;
00332 }
00333 
00334 static unsigned char *map_copy(unsigned char *map, char *s, int pos)
00335 {
00336   map = map_create(map);
00337   memcpy(map, s XFORM_OK_PLUS pos, 32);
00338   return map;
00339 }
00340 
00341 static unsigned char *map_range(unsigned char *map, char *s, int pos, int invert)
00342 {
00343   int rs, re;
00344 
00345   rs = UCHAR(s[pos++]);
00346   re = UCHAR(s[pos++]);
00347 
00348   if (!invert) {
00349     while (rs <= re) {
00350       map = map_start(map, rs);
00351       rs++;
00352     }
00353   } else {
00354     while (rs > 0) {
00355       map = map_start(map, rs - 1);
00356       --rs;
00357     }
00358     while (re < 255) {
00359       map = map_start(map, re + 1);
00360       re++;
00361     }
00362   }
00363 
00364   return map;
00365 }
00366 
00367 static unsigned char *extract_regstart(rxpos scan, int *_anch)
00368 {
00369   rxpos next;
00370   int retry, the_op;
00371   unsigned char *map = NULL;
00372 
00373   do {
00374     retry = 0;
00375     
00376     the_op = rOP(scan);
00377     switch (the_op) {
00378     case BOL:
00379     case EOL:
00380     case NOTHING:
00381     case SAVECONST:
00382     case MAYBECONST:
00383     case COUNTINIT:
00384     case COUNTOVER:
00385     case COUNTUNDER:
00386       /* We can ignore zero-length things when finding starting info */
00387       scan = regnext(scan);
00388       retry = 1;
00389       break;
00390     case LOOKT:
00391     case LOOKF:
00392     case LOOKBT:
00393     case LOOKBF:
00394       /* Zero-length, but continuation in an unusual place */
00395       scan += rOPLEN(OPERAND(scan));
00396       scan = regnext(scan);
00397       retry = 1;
00398       break;
00399     case LOOKTX:
00400       scan = regnext(scan);
00401       retry = 1;
00402       break;
00403     case PLUS:
00404     case PLUS2:
00405       scan = OPERAND(scan);
00406       retry = 1;
00407       break;
00408     case STAR3:
00409     case STAR4:
00410       if (rOPLEN(OPERAND(scan)) > 0) {
00411        scan = OPERAND3(scan);
00412        retry = 1;
00413       }
00414       break;
00415     case EXACTLY:
00416       map = map_start(map, UCHAR(regstr[OPSTR(OPERAND(scan))]));
00417       break;
00418     case EXACTLY_CI:
00419       {
00420        int c = UCHAR(regstr[OPSTR(OPERAND(scan))]);
00421        map = map_start(map, c);
00422        map = map_start(map, rx_toupper(c));
00423       }
00424       break;
00425     case ANYOF:
00426       map = map_copy(map, regstr, OPERAND(scan));
00427       break;
00428     case EXACTLY1:
00429       map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
00430       break;
00431     case EXACTLY2:
00432       map = map_start(map, UCHAR(regstr[OPERAND(scan)]));
00433       map = map_start(map, UCHAR(regstr[OPERAND(scan)+1]));
00434       break;
00435     case RANGE:
00436       map = map_range(map, regstr, OPERAND(scan), 0);
00437       break;
00438     case NOTRANGE:
00439       map = map_range(map, regstr, OPERAND(scan), 1);
00440       break;
00441     case BOI:
00442       if (_anch)
00443        *_anch = 1;
00444       break;
00445     case BRANCH:
00446       next = regnext(scan);
00447       if (!next || (rOP(next) == END) || (rOP(next) == LOOKE)) {
00448        /* Only one branch */
00449        scan = OPERAND(scan);
00450        retry = 1;
00451       }
00452       break;
00453     default:
00454       if ((the_op == OPENN) || (the_op >= OPEN && the_op < CLOSE)) {
00455        scan = NEXT_OP(scan);
00456        retry = 1;
00457       }
00458       break;
00459     }
00460   } while (retry);
00461 
00462   return map;
00463 }
00464 
00465 #ifdef DO_STACK_CHECK
00466 
00467 static Scheme_Object *reg_k(void)
00468 {
00469   Scheme_Thread *p = scheme_current_thread;
00470   int *flagp = (int *)p->ku.k.p1;
00471   int res;
00472 
00473   p->ku.k.p1 = NULL;
00474 
00475   res = reg(p->ku.k.i1, flagp, p->ku.k.i2, p->ku.k.i3, p->ku.k.i4);
00476 
00477   return scheme_make_integer(res);
00478 }
00479 
00480 #endif
00481 
00482 /*
00483    - reg - regular expression, i.e. main body or parenthesized thing
00484    *
00485    * Caller must absorb opening parenthesis.
00486    *
00487    * Combining parenthesis handling with the base level of regular expression
00488    * is a trifle forced, but the need to tie the tails of the branches to what
00489    * follows makes it hard to avoid.
00490    */
00491 static rxpos
00492 reg(int paren, int *flagp, int paren_set, int lookahead, int parse_flags)
00493 {
00494   rxpos ret;
00495   rxpos br;
00496   rxpos ender;
00497   int parno = 0;
00498   int flags, matchmin, matchmax, brcount;
00499   Scheme_Hash_Table *backdepends;
00500 
00501 #ifdef DO_STACK_CHECK
00502   {
00503 # include "mzstkchk.h"
00504     {
00505       Scheme_Thread *p = scheme_current_thread;
00506       Scheme_Object *ov;
00507       p->ku.k.i1 = paren;
00508       p->ku.k.p1 = (void *)flagp;
00509       p->ku.k.i2 = paren_set;
00510       p->ku.k.i3 = lookahead;
00511       p->ku.k.i4 = parse_flags;
00512       ov = scheme_handle_stack_overflow(reg_k);
00513       return SCHEME_INT_VAL(ov);
00514     }
00515   }
00516 #endif
00517 
00518   *flagp = HASWIDTH;        /* Tentatively. */
00519 
00520   /* Make an OPEN node, if parenthesized. */
00521   if (paren) {
00522     if (lookahead) {
00523       parno = 0;
00524       ret = regnode(lookahead);
00525       regarg(0); /* space for LOOKE pointer */
00526       if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
00527        regarg(0); /* space for min count */
00528        regarg(0); /* space for max count */
00529       }
00530     } else if (paren_set) {
00531       parno = regnpar;
00532       regnpar++;
00533       if (OPEN + parno >= CLOSE) {
00534        ret = regnode(OPENN);
00535        regarg(parno);
00536       } else {
00537        ret = regnode(OPEN+parno);
00538       }
00539     } else
00540       ret = 0;
00541   } else
00542     ret = 0;
00543 
00544   /* Pick up the branches, linking them together. */
00545   br = regbranch(&flags, parse_flags, 0);
00546   if (br == 0)
00547     FAIL("branch failed!?");
00548   if (ret != 0)
00549     regtail(ret, br);              /* OPEN -> first. */
00550   else
00551     ret = br;
00552   if (!(flags&HASWIDTH)) {
00553     *flagp &= ~HASWIDTH;
00554     backdepends = NULL;
00555   } else if (regbackdepends) {
00556     backdepends = regbackdepends;
00557     regbackdepends = NULL;
00558   } else
00559     backdepends = NULL;
00560   *flagp |= flags&(SPSTART|SPFIXED);
00561   matchmin = regmatchmin;
00562   matchmax = regmatchmax;
00563   brcount = 1;
00564   while (regparsestr[regparse] == '|') {
00565     brcount++;
00566     regparse++;
00567     br = regbranch(&flags, parse_flags, 0);
00568     if (br == 0)
00569       FAIL("next branch failed!?");
00570     regtail(ret, br);              /* BRANCH -> BRANCH. */
00571     if (!(flags&HASWIDTH))
00572       *flagp &= ~HASWIDTH;
00573     else if ((*flagp) & HASWIDTH) {
00574       if (regbackdepends) {
00575        if (backdepends)
00576          merge_tables(backdepends, regbackdepends);
00577        else
00578          backdepends = regbackdepends;
00579        regbackdepends = NULL;
00580       } else
00581        backdepends = NULL;
00582     }
00583     *flagp |= flags&SPSTART;
00584     if (!(flags&SPFIXED))
00585       *flagp &= ~SPFIXED;
00586     else {
00587       if (regmatchmin < matchmin)
00588        matchmin = regmatchmin;
00589       if (regmatchmax > matchmax)
00590        matchmax = regmatchmax;
00591     }
00592   }
00593   regbackdepends = backdepends;
00594   regmatchmin = matchmin;
00595   regmatchmax = matchmax;
00596 
00597   if (paren && paren_set) {
00598     Scheme_Object *assumed;
00599 
00600     if (!regbackknown)
00601       regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
00602     assumed = scheme_hash_get(regbackknown, scheme_make_integer(parno));
00603 
00604     if (!((*flagp) & HASWIDTH)) {
00605       if (assumed && !SCHEME_FALSEP(assumed)) {
00606        FAIL("`*', `+', or `{...,}' operand can be empty due to backreference");
00607       }
00608       scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_false);
00609     } else {
00610       if (!backdepends)
00611        scheme_hash_set(regbackknown, scheme_make_integer(parno), scheme_true);
00612       else {
00613        if (assumed) {
00614          check_and_propagate_depends();
00615        } else
00616          scheme_hash_set(regbackknown, scheme_make_integer(parno), (Scheme_Object *)backdepends);
00617       }
00618     }
00619   }
00620 
00621   if ((brcount == 1)
00622       && paren 
00623       && (!paren_set || ((flags & SPFIXED) 
00624                       && (regmatchmin == regmatchmax)
00625                       && (regmatchmax < 0x7FFFF)))
00626       && !lookahead) {
00627     /* Simplify to just the single branch: */
00628     if (br + 3 < regcodesize) {
00629       int top;
00630       if (regcode <= regcodesize)
00631        top = regcode;
00632       else
00633        top = regcodesize;
00634       memmove(regstr + ret, regstr + br + 3, top - (br + 3));
00635     }
00636     *flagp = flags;
00637     regcode -= (br + 3 - ret);
00638     if (paren_set) {
00639       /* Collude with regpiece: */
00640       *flagp |= NEEDSAVECONST;
00641       *flagp &= ~SPNOTHING;
00642       regsavepos = parno;
00643     }
00644   } else {
00645     if (lookahead) {
00646       if ((lookahead == LOOKBT) || (lookahead == LOOKBF)) {
00647        if (!((*flagp) & SPFIXED))
00648          FAIL("lookbehind pattern does not match a bounded byte width");
00649        if (matchmax > 0x7FFF)
00650          FAIL("lookbehind match is potentially too long (more than 32767 bytes)");
00651        if (matchmax > regmaxlookback)
00652          regmaxlookback = matchmax;
00653        if (ret + 8 < regcodesize) {
00654          regstr[ret + 5] = (matchmin >> 8);
00655          regstr[ret + 6] = (matchmin & 255);
00656          regstr[ret + 7] = (matchmax >> 8);
00657          regstr[ret + 8] = (matchmax & 255);
00658        }
00659       }
00660     }
00661 
00662     /* Make a closing node, and hook it on the end. */
00663     if (paren) {
00664       if (lookahead) {
00665        ender = regnode(LOOKE);
00666        if (ret + 4 < regcodesize) {
00667          int delta = (ender - ret);
00668          regstr[ret + 3] = (delta >> 8);
00669          regstr[ret + 4] = (delta & 255);
00670        }
00671       } else if (paren_set) {
00672        if (OPEN + parno >= CLOSE) {
00673          ender = regcode;
00674          regarg(parno);
00675          reginsert(CLOSEN, ender);
00676        } else
00677          ender = regnode(CLOSE+parno);
00678       } else {
00679        ender = regnode(NOTHING);
00680       }
00681     } else {
00682       ender = regnode(END); 
00683     }
00684     regtail(ret, ender);
00685 
00686     /* Hook the tails of the branches to the closing node. */
00687     if (regcodesize) {
00688       for (br = ret; br != 0; br = regnext(br)) {
00689        regoptail(br, ender);
00690       }
00691     }
00692   }
00693 
00694   /* Check for proper termination. */
00695   if (paren && regparsestr[regparse++] != ')') {
00696     FAIL("missing closing parenthesis in pattern");
00697   } else if (!paren && regparse != regparse_end) {
00698     if (regparsestr[regparse] == ')') {
00699       FAIL("extra closing parenthesis in pattern");
00700     } else
00701       FAIL("junk on end");  /* "Can't happen". */
00702     /* NOTREACHED */
00703   }
00704 
00705   return ret;
00706 }
00707 
00708 /*
00709    - regbranch - one alternative of an | operator
00710    *
00711    * Implements the concatenation operator.
00712    */
00713 static rxpos
00714 regbranch(int *flagp, int parse_flags, int without_branch_node)
00715 {
00716   rxpos ret;
00717   rxpos chain, latest;
00718   int flags = 0, matchmin = 0, matchmax = 0, pcount = 0, save_flags;
00719 
00720   *flagp = (WORST|SPFIXED);        /* Tentatively. */
00721 
00722   if (!without_branch_node)
00723     ret = regnode(BRANCH);
00724   else
00725     ret = 0;
00726   chain = 0;
00727   while (regparse != regparse_end 
00728         && regparsestr[regparse] != '|' 
00729         && regparsestr[regparse] != ')') {
00730     save_flags = flags;
00731     latest = regpiece(&flags, parse_flags, !chain && !without_branch_node);
00732     if (latest == 0)
00733       FAIL("piece failed!?");
00734     if (flags & SPNOTHING) {
00735       /* no need to match nothing */
00736       regcode = latest; /* throw away dead code */
00737       flags = save_flags; /* in case all but the first is discarded */
00738     } else {
00739       pcount++;
00740       *flagp |= flags&HASWIDTH;
00741       if (chain == 0) {            /* First piece. */
00742        *flagp |= flags&SPSTART;
00743        if (without_branch_node)
00744          ret = latest;
00745       } else
00746        regtail(chain, latest);
00747       if (!(flags&SPFIXED))
00748        *flagp &= ~SPFIXED;
00749       matchmin += regmatchmin;
00750       matchmax += regmatchmax;
00751       if (matchmax > 0x7FFF)
00752        matchmax = 0x10000;
00753       chain = latest;
00754     }
00755   }
00756   regmatchmin = matchmin;
00757   regmatchmax = matchmax;
00758   if (chain == 0) {  /* Loop ran zero times. */
00759     latest = regnode(NOTHING);
00760     if (without_branch_node)
00761       ret = latest;
00762     *flagp = SIMPLE|SPNOTHING|SPFIXED;
00763     regmatchmin = regmatchmax = 0;
00764   }
00765 
00766   if (pcount == 1) {
00767     *flagp = flags; /* BRANCH will be deleted if simplicity is relevant */
00768   }
00769 
00770   return(ret);
00771 }
00772 
00773 /*
00774    - regpiece - something followed by possible [*+?]
00775    *
00776    * Note that the branching code sequences used for ? and the general cases
00777    * of * and + are somewhat optimized:  they use the same NOTHING node as
00778    * both the endmarker for their branch list and the body of the last branch.
00779    * It might seem that this node could be dispensed with entirely, but the
00780    * endmarker role is not redundant.
00781    */
00782 static rxpos 
00783 regpiece(int *flagp, int parse_flags, int at_start)
00784 {
00785   rxpos ret;
00786   char op;
00787   rxpos next;
00788   int flags, greedy;
00789   int minreps = 0, maxreps = 0, counter;
00790   int origsavepos, origmatchmin, origmatchmax;
00791 
00792   ret = regatom(&flags, parse_flags, at_start);
00793   if (ret == 0)
00794     FAIL("atom failed!?");
00795 
00796   origsavepos = regsavepos;
00797   origmatchmin = regmatchmin;
00798   origmatchmax = regmatchmax;
00799 
00800   op = regparsestr[regparse];
00801   if (!ISMULT(op, parse_flags)) {
00802     *flagp = (flags & ~NEEDSAVECONST);
00803   } else {
00804     if (op == '{') {
00805       int ch, maxspec = 0;
00806       minreps = maxreps = 0;
00807       regparse++;
00808       do {
00809        ch = regparsestr[regparse];
00810        if ((ch >= '0') && (ch <= '9')) {
00811          minreps = (minreps * 10) + (ch - '0');
00812          if (minreps > 0x7FFF)
00813            FAIL("minimum repetition count too large");
00814          regparse++;
00815        } else if (ch == ',' || ch == '}')
00816          break;
00817        else {
00818          FAIL("expected digit, comma, or `}' to end repetition specification started with `{'");
00819        }
00820       } while (1);
00821       if (ch == ',') {
00822        regparse++;
00823        do {
00824          ch = regparsestr[regparse];
00825          if ((ch >= '0') && (ch <= '9')) {
00826            maxspec = 1;
00827            maxreps = (maxreps * 10) + (ch - '0');
00828            if (maxreps > 0x7FFF)
00829              FAIL("maximum repetition count too large");
00830            regparse++;
00831          } else if (ch == '}')
00832            break;
00833          else {
00834            FAIL("expected digit or `}' to end repetition specification started with `{'");
00835          }
00836        } while (1);
00837       } else {
00838        maxspec = 1;
00839        maxreps = minreps;
00840       }
00841       if (maxspec && (maxreps < minreps)) {
00842        FAIL("maximum repetition count is less than maximum repetition count");
00843       }
00844       if (maxspec && !maxreps) {
00845        /* Match 0 instances */
00846        regparse++;
00847        if (regparsestr[regparse] == '?')
00848          regparse++; /* non-greedy */
00849        if (ISMULT(regparsestr[regparse], parse_flags))
00850          FAIL("nested `*', `?', `+', or `{...}' in pattern");
00851        regcode = ret; /* throw away dead code */
00852        *flagp = SPFIXED|SPNOTHING;
00853        regmatchmin = regmatchmax = 0;
00854        return regnode(NOTHING);
00855       }
00856       op = '*';
00857       if (maxreps || minreps)
00858        counter = regncounter++;
00859       else
00860        counter = 0;
00861     } else
00862       counter = 0;
00863 
00864     if (!(flags&HASWIDTH) && (op != '?')) {
00865       FAIL("`*', `+', or `{...}' operand could be empty");
00866     }
00867 
00868     if (regbackdepends) {
00869       /* Operand has width only if the indicated backreferences have width. */
00870       check_and_propagate_depends();
00871       /* Assumptions are registered, so we no longer need these backdepends: */
00872       regbackdepends = NULL;
00873     }
00874 
00875     if (maxreps || minreps) {
00876       if (minreps > 0)
00877        *flagp = HASWIDTH;
00878       if ((flags & SPFIXED) && maxreps) {
00879        regmatchmin = (origmatchmin * minreps);
00880        regmatchmax = (origmatchmax * maxreps);
00881        if (regmatchmax > 0x7FFF)
00882          regmatchmax = 0x10000;
00883        *flagp |= SPFIXED;
00884       }
00885     } else
00886       *flagp = (op != '+') ? WORST : HASWIDTH;
00887     *flagp |= SPSTART;
00888     if ((op == '?') && (flags & SPFIXED)) {
00889       *flagp |= SPFIXED;
00890       regmatchmin = 0;
00891     }
00892 
00893     if (regparsestr[regparse+1] == '?') {
00894       greedy = 0;
00895       regparse++;
00896     } else
00897       greedy = 1;
00898 
00899     if (op == '*' && (flags&SIMPLE)) {
00900       if (!minreps && !maxreps)
00901        reginsert(greedy ? STAR : STAR2, ret);
00902       else
00903        reginsertwithopop(greedy ? STAR3 : STAR4, ret, minreps, maxreps);
00904     } else if (op == '*' && greedy) {
00905       /* Emit x* as (x&|), where & means "self".
00906         If minreps or maxreps, also insert counter-managing
00907         nodes. This counter detects empty matches, too.
00908         The code is a little difficult to read because it often
00909         uses reginsert, which puts nodes before existing nodes.
00910         So, you almost have to read it backward. */
00911       rxpos br, nothing;
00912       if (minreps || maxreps) {
00913        /* Increment iteration counter, and fail if it's
00914           already at the max: */
00915        rxpos x;
00916        x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
00917        regtail(ret, x);
00918       }
00919       reginsert(BRANCH, ret);      /* Either x */
00920       if (minreps || maxreps) {
00921        /* Initialize the iteration counter on entry: */
00922        br = reginsertwithop(COUNTINIT, ret, counter);
00923        regtail(ret, br);
00924       } else
00925        br = ret;
00926       regoptail(br, regnode(BACK)); /* and loop */
00927       regoptail(br, br);    /* back */
00928       regtail(br, regnode(BRANCH)); /* or */
00929       nothing = regnode(NOTHING);
00930       if (minreps) {
00931        /* Fail to match if the counter isn't big enough, yet: */
00932        rxpos n;
00933        n = reginsertwithopop(COUNTOVER, nothing, counter, minreps);
00934        regtail(nothing, n);
00935       }
00936       if (minreps || maxreps) {
00937        /* We incremented the counter for an x match, but now
00938           we're backtracking, so decrement it: */
00939        rxpos n;
00940        n = reginsertwithop(COUNTBACK, nothing, counter);
00941        regtail(nothing, n);
00942       }
00943       regtail(br, nothing); /* null. */
00944     } else if (op == '*') {
00945       /* Emit x*? as (|x&), where & means "self".
00946         With a counter, we need (|(x|-)&), where - reverts
00947         the iteration count and fails. */
00948       rxpos br, nothing, x, next_to_x;
00949       if (minreps || maxreps) {
00950        /* Increment iteration counter, and fail if it's
00951           already at the max: */
00952        rxpos fail;
00953        x = reginsertwithopop(COUNTUNDER, ret, counter, maxreps);
00954        regtail(ret, x);
00955 
00956        fail = regnode(BRANCH);
00957        regnode(COUNTBACKFAIL);
00958        regarg(counter);
00959        reginsert(BRANCH, ret);
00960        fail += 3;
00961        regtail(ret, fail);
00962        x += 3;
00963       } else
00964        x = ret;
00965       reginsert(BRANCH, ret);  /* = next */
00966       next = ret;
00967       next_to_x = (x - next) + 3;
00968       reginsert(NOTHING, ret); /* = nothing */
00969       next += 3;
00970       nothing = ret;
00971       if (minreps) {
00972        /* Fail to match if the counter isn't big enough, yet: */
00973        nothing = reginsertwithopop(COUNTOVER, ret, counter, minreps);
00974        regtail(ret, nothing); /* chain countover -> nothing */
00975        next += (nothing - ret);
00976       }
00977       reginsert(BRANCH, ret); /* b3 */
00978       next += 3;
00979       nothing += 3;
00980       if (minreps || maxreps) {
00981        /* Initialize the iteration counter on entry: */
00982        br = reginsertwithop(COUNTINIT, ret, counter);
00983        regtail(ret, br); /* chain countinit to b3 */
00984        next += (br - ret);
00985        nothing += (br - ret);
00986       } else
00987        br = ret;
00988       regtail(br, next); /* chain b3 to next */
00989       x = next + next_to_x;
00990       regtail(x, regnode(BACK)); /* loop */
00991       regtail(x, br); /* back. */
00992       regtail(next, regnode(BACK)); /* chain next to nothing */
00993       regtail(next, nothing);
00994     } else if (op == '+' && (flags&SIMPLE))
00995       reginsert(greedy ? PLUS : PLUS2, ret);
00996     else if (op == '+' && greedy) {
00997       /* Emit x+ as x(&|), where & means "self". */
00998       next = regnode(BRANCH);      /* Either */
00999       regtail(ret, next);
01000       regtail(regnode(BACK), ret); /* loop back */
01001       regtail(next, regnode(BRANCH)); /* or */
01002       regtail(ret, regnode(NOTHING)); /* null. */
01003     } else if (op == '+') {
01004       /* Emit x+? as x(|&), where & means "self". */
01005       next = regnode(BRANCH);      /* Either */
01006       regtail(ret, next);
01007       regnode(NOTHING); /* op */
01008       regtail(next, regnode(BRANCH)); /* or */
01009       regtail(regnode(BACK), ret); /* loop back. */
01010       regtail(next, regnode(BACK));
01011       regtail(next, next + 3);
01012     } else if (op == '?' && greedy) {
01013       /* Emit x? as (x|) */
01014       reginsert(BRANCH, ret);      /* Either x */
01015       regtail(ret, regnode(BRANCH)); /* or */
01016       next = regnode(NOTHING);     /* null. */
01017       regtail(ret, next);
01018       regoptail(ret, next);
01019     } else if (op == '?') {
01020       /* Emit x?? as (|x) */
01021       reginsert(BRANCH, ret);  /* will be next... */
01022       reginsert(NOTHING, ret);
01023       reginsert(BRANCH, ret);
01024       regtail(ret, ret + 6);
01025       next = regnode(BACK);
01026       regtail(ret + 6, next);
01027       regoptail(ret + 6, next);
01028       regoptail(ret + 6, ret + 3);
01029     }
01030     regparse++;
01031     if (ISMULT(regparsestr[regparse], parse_flags))
01032       FAIL("nested `*', `?', `+', or `{...}' in pattern");
01033   }
01034 
01035   if (flags & NEEDSAVECONST) {
01036     rxpos sv;
01037     sv = regnode(SAVECONST);
01038     regarg(origsavepos);
01039     regarg(origmatchmax);
01040     regtail(ret, sv);
01041     if (origmatchmax) {
01042       sv = reginsertwithop(MAYBECONST, ret, origsavepos);
01043       regtail(ret, sv);
01044     }
01045     *flagp &= ~SIMPLE;
01046   }
01047 
01048   return(ret);
01049 }
01050 
01051 /*
01052    - regatom - the lowest level
01053    *
01054    * Optimization:  gobbles an entire sequence of ordinary characters so that
01055    * it can turn them into a single node, which is smaller to store and
01056    * faster to run.  Backslashed characters are exceptions, each becoming a
01057    * separate node; the code is simpler that way and it's not worth fixing.
01058  */
01059 static rxpos 
01060 regatom(int *flagp, int parse_flags, int at_start)
01061 {
01062   rxpos ret;
01063   int flags;
01064 
01065   *flagp = (WORST|SPFIXED);        /* Tentatively. */
01066   regmatchmin = regmatchmax = 1;
01067 
01068   switch (regparsestr[regparse++]) {
01069   case '^':
01070     if (parse_flags & PARSE_SINGLE_LINE)
01071       ret = regnode(BOI);
01072     else
01073       ret = regnode(BOL);
01074     regmatchmin = regmatchmax = 0;
01075     break;
01076   case '$':
01077     if (parse_flags & PARSE_SINGLE_LINE)
01078       ret = regnode(EOI);
01079     else
01080       ret = regnode(EOL);
01081     regmatchmin = regmatchmax = 0;
01082     break;
01083   case '.':
01084     --regparse;
01085     ret = regranges(parse_flags, at_start);
01086     *flagp |= HASWIDTH|SIMPLE;
01087     break;
01088   case '[': 
01089     --regparse;
01090     ret = regranges(parse_flags, at_start);
01091     *flagp |= HASWIDTH|SIMPLE;
01092     break;
01093   case '(':
01094     {
01095       if (regparsestr[regparse] == '?') {
01096        int moded = 0;
01097 
01098        while (1) {
01099          if (regparsestr[regparse+1] == 'i') {
01100            parse_flags &= ~PARSE_CASE_SENS;
01101            regparse++;
01102            moded = 1;
01103          } else if (regparsestr[regparse+1] == 'm') {
01104            parse_flags &= ~PARSE_SINGLE_LINE;
01105            regparse++;
01106            moded = 1;
01107          } else if (regparsestr[regparse+1] == 's') {
01108            parse_flags |= PARSE_SINGLE_LINE;
01109            regparse++;
01110            moded = 1;
01111          } else if ((regparsestr[regparse+1] == '-') 
01112                    && (regparsestr[regparse+2] == 'i')) {
01113            parse_flags |= PARSE_CASE_SENS;
01114            regparse += 2;
01115            moded = 1;
01116          } else if ((regparsestr[regparse+1] == '-') 
01117                    && (regparsestr[regparse+2] == 'm')) {
01118            parse_flags |= PARSE_SINGLE_LINE;
01119            regparse += 2;
01120            moded = 1;
01121          } else if ((regparsestr[regparse+1] == '-') 
01122                    && (regparsestr[regparse+2] == 's')) {
01123            parse_flags &= ~PARSE_SINGLE_LINE;
01124            regparse += 2;
01125            moded = 1;
01126          } else {
01127            break;
01128          }
01129        }
01130 
01131        if (regparsestr[regparse+1] == ':') {
01132          regparse += 2;
01133          ret = reg(1, &flags, 0, 0, parse_flags);
01134          *flagp = flags;
01135        } else if (moded) {
01136          FAIL("expected `:' or another mode after `(?' and a mode sequence (where a mode is `i', `-i', `m', `-m', `s', or `-s')");
01137        } else if (regparsestr[regparse+1] == '(') {
01138          /* Conditional */
01139          if (((regparsestr[regparse+2] >= '0')
01140               && (regparsestr[regparse+2] <= '9'))
01141              || ((regparsestr[regparse+2] == '?')
01142                 && ((regparsestr[regparse+3] == '=')
01143                     || (regparsestr[regparse+3] == '!')
01144                     || (regparsestr[regparse+3] == '<')))) {
01145            rxpos test, tbr, fbr, ender;
01146            int flags, matchmin, matchmax;
01147            Scheme_Hash_Table *backdepends;
01148 
01149            regparse++;
01150            ret = regnode(CONDITIONAL);
01151            regarg(0); /* space for then */
01152            regarg(0); /* space for else */
01153            if (regparsestr[regparse+1] != '?') {
01154              int posn;
01155              regparse++;
01156              posn = regdigit();
01157              test = regnode(BACKREF);
01158              regarg(posn);
01159              if (regparsestr[regparse] == ')') {
01160               regparse++;
01161              } else {
01162               FAIL("expected `)' after `(?(' followed by a digit");
01163              }
01164            } else {
01165              test = regatom(&flags, parse_flags, 1);
01166            }
01167            if (test != OPERAND3(ret)) {
01168              FAIL("test went to wrong place!?");
01169            }
01170            regtail(test, regnode(END));
01171            if (regparsestr[regparse] == ')') {
01172              FAIL("expected an expression after test in `(?(...))'");
01173            }
01174 
01175            regbackdepends = NULL;
01176            *flagp |= HASWIDTH; /* tentatively */
01177 
01178            tbr = regbranch(&flags, parse_flags, 1);
01179 
01180            if (!(flags&HASWIDTH)) {
01181              *flagp &= ~HASWIDTH;
01182              backdepends = NULL;
01183            } else if (regbackdepends) {
01184              backdepends = regbackdepends;
01185              regbackdepends = NULL;
01186            } else
01187              backdepends = NULL;
01188              
01189            if (!(flags & SPFIXED))
01190              *flagp &= ~SPFIXED;
01191            matchmin = regmatchmin;
01192            matchmax = regmatchmax;
01193 
01194            if (regparsestr[regparse] == ')') {
01195              fbr = regnode(NOTHING);
01196              *flagp &= ~HASWIDTH;
01197              matchmin = 0;
01198            } else if (regparsestr[regparse] != '|') {
01199              FAIL("expected `)' or `|' after first branch of `(?(...)...)'");
01200            } else {
01201              regparse++;
01202              fbr = regbranch(&flags, parse_flags, 1);
01203              if (regparsestr[regparse] != ')') {
01204               FAIL("expected `)' to close `(?(...)...' after second branch");
01205              }
01206 
01207              if (!(flags&HASWIDTH)) {
01208               *flagp &= ~HASWIDTH;
01209               backdepends = NULL;
01210              } else if (regbackdepends) {
01211               if (backdepends)
01212                 merge_tables(backdepends, regbackdepends);
01213               else
01214                 backdepends = regbackdepends;
01215              }
01216              
01217              if (!(flags & SPFIXED))
01218               *flagp &= ~SPFIXED;
01219              else {
01220               if (regmatchmin < matchmin)
01221                 matchmin = regmatchmin;
01222               if (regmatchmax > matchmax)
01223                 matchmax = regmatchmax;
01224              }
01225            }
01226 
01227            regmatchmax = matchmax;
01228            regmatchmin = matchmin;
01229            regbackdepends = backdepends;
01230 
01231            if (OPERAND2(ret) + 1 < regcodesize) {
01232              int delta;
01233              delta = tbr - ret;
01234              regstr[OPERAND(ret)] = delta >> 8;
01235              regstr[OPERAND(ret)+1] = delta & 255;
01236              delta = fbr - ret;
01237              regstr[OPERAND2(ret)] = delta >> 8;
01238              regstr[OPERAND2(ret)+1] = delta & 255;
01239            }
01240            ender = regnode(NOTHING);
01241            regtail(tbr, ender);
01242            regtail(fbr, ender);
01243            regtail(ret, ender);
01244            regparse++;
01245          } else
01246            FAIL("expected `(?=', `(?!', `(?<', or digit after `(?('");
01247        } else if (regparsestr[regparse+1] == '>') {
01248          regparse += 2;
01249          ret = reg(1, &flags, 0, LOOKTX, parse_flags);
01250          *flagp = flags;
01251        } else {
01252          if (regparsestr[regparse+1] == '=') {
01253            regparse += 2;
01254            ret = reg(1, &flags, 0, LOOKT, parse_flags);
01255          } else if (regparsestr[regparse+1] == '!') {
01256            regparse += 2;
01257            ret = reg(1, &flags, 0, LOOKF, parse_flags);
01258          } else if ((regparsestr[regparse+1] == '<')
01259                    && (regparsestr[regparse+2] == '=')) {
01260            regparse += 3;
01261            ret = reg(1, &flags, 0, LOOKBT, parse_flags);
01262          } else if ((regparsestr[regparse+1] == '<')
01263                    && (regparsestr[regparse+2] == '!')) {
01264            regparse += 3;
01265            ret = reg(1, &flags, 0, LOOKBF, parse_flags);
01266          } else {
01267            FAIL("expected `:', `=', `!', `<=', `<!', `i', `-i', `m', `-m', `s', or `-s' after `(?'");
01268          }
01269          regmatchmin = regmatchmax = 0;
01270          *flagp = SPFIXED;
01271          regbackdepends = NULL;
01272        }
01273       } else {
01274        ret = reg(1, &flags, 1, 0, parse_flags);
01275        if (flags & NEEDSAVECONST) {
01276          *flagp = flags;
01277        } else {
01278          *flagp |= flags&(HASWIDTH|SPSTART);
01279          if (!(flags&SPFIXED))
01280            *flagp &= ~SPFIXED;
01281        }
01282       }
01283       /* otherwise, regmatchmin/regmatchmax is set */
01284       if (ret == 0)
01285        FAIL("cluster failed!?");
01286     }
01287     break;
01288   case '|':
01289   case ')':
01290     FAIL("internal urp");   /* Supposed to be caught earlier. */
01291     break;
01292   case '?':
01293     FAIL("`?' follows nothing in pattern");
01294     break;
01295   case '+':
01296     FAIL("`+' follows nothing in pattern");
01297     break;
01298   case '*':
01299     FAIL("`*' follows nothing in pattern");
01300     break;
01301   case '\\':
01302     {
01303       int c;
01304       if (regparse == regparse_end)
01305        FAIL("trailing backslash in pattern");
01306       c = regparsestr[regparse++];
01307       if ((parse_flags & PARSE_PCRE) && (c == 'b')) {
01308        ret = regnode(WORDBOUND);
01309        regmatchmin = regmatchmax = 0;
01310        if (!regmaxlookback)
01311          regmaxlookback = 1;
01312       } else if ((parse_flags & PARSE_PCRE) && (c == 'B')) {
01313        ret = regnode(NOTWORDBOUND);
01314        regmatchmin = regmatchmax = 0;
01315        if (!regmaxlookback)
01316          regmaxlookback = 1;
01317       } else if ((parse_flags & PARSE_PCRE) && (c == 'p')) {
01318        ret = regunicode(0);
01319        regmatchmax = MAX_UTF8_CHAR_BYTES;
01320        *flagp |= HASWIDTH;
01321       } else if ((parse_flags & PARSE_PCRE) && (c == 'P')) {
01322        ret = regunicode(1);
01323        regmatchmax = MAX_UTF8_CHAR_BYTES;
01324        *flagp |= HASWIDTH;
01325       } else if ((parse_flags & PARSE_PCRE) && (c >= '0') && (c <= '9')) {
01326        int posn;
01327        --regparse;
01328        posn = regdigit();
01329        if (parse_flags & PARSE_CASE_SENS)
01330          ret = regnode(BACKREF);
01331        else
01332          ret = regnode(BACKREF_CI);
01333        regarg(posn);
01334        *flagp &= ~SPFIXED;
01335        /* Set HASWIDTH flag: */
01336        {
01337          Scheme_Object *f;
01338          if (regbackknown)
01339            f = scheme_hash_get(regbackknown, scheme_make_integer(posn));
01340          else
01341            f = NULL;
01342          if (f) {
01343            if (SCHEME_TRUEP(f))
01344              *flagp |= HASWIDTH;
01345          } else {
01346            *flagp |= HASWIDTH;
01347            if (!regbackdepends)
01348              regbackdepends = scheme_make_hash_table(SCHEME_hash_ptr);
01349            scheme_hash_set(regbackdepends, scheme_make_integer(posn), scheme_true);
01350          }
01351        }
01352       } else {
01353        regparse -= 2;
01354        ret = regranges(parse_flags, at_start);
01355        *flagp |= HASWIDTH|SIMPLE;
01356       }
01357     }
01358     break;
01359   default:
01360     {
01361       int len, ilen, c;
01362       char ender;
01363 
01364       regparse--;
01365 
01366       if (parse_flags & PARSE_PCRE) {
01367        if (regparsestr[regparse] == '{')
01368          FAIL("`{' follows nothing in pattern");
01369        if (regparsestr[regparse] == '}')
01370          FAIL("unmatched `}' in pattern");
01371        if (regparsestr[regparse] == ']')
01372          FAIL("unmatched `]' in pattern");
01373       }
01374 
01375       for (len = ilen = 0; regparse + ilen < regparse_end; len++, ilen++) {
01376        if (regparsestr[regparse + ilen] == '\\') {
01377          if (regparse + ilen + 1 >= regparse_end)
01378            break;
01379          c = regparsestr[regparse + ilen + 1];
01380          if (((c >= 'a') && (c <= 'z'))
01381              || ((c >= 'A') && (c <= 'Z'))
01382              || ((c >= '0') && (c <= '9')))
01383            break;
01384          ilen++;
01385        } else if (regstrcspn(regparsestr + regparse + ilen, regparsestr + regparse + ilen + 1,
01386                            (parse_flags & PARSE_PCRE) ? PCRE_META : META) < 1)
01387          break;
01388       }
01389       if (len <= 0)
01390        FAIL("internal disaster");
01391 
01392       if ((len == 1) && at_start) {
01393        /* Maybe convert "x|y" to "[xy]", etc.: */
01394        ret = regranges(parse_flags, at_start);
01395        *flagp |= HASWIDTH|SIMPLE;
01396       } else {
01397        if (!(parse_flags & PARSE_CASE_SENS)) {
01398          /* Need case insensitivity? */
01399          int i;
01400          for (i = 0; i < ilen; i++) {
01401            c = regparsestr[regparse + i];
01402            if ((rx_toupper(c) != c)
01403               || (rx_tolower(c) != c)) {
01404              break;
01405            }
01406          }
01407          if (i >= ilen)
01408            parse_flags |= PARSE_CASE_SENS;
01409        }
01410 
01411        ender = regparsestr[regparse+ilen];
01412        if (len > 1 && ISMULT(ender, parse_flags)) {
01413          /* Back off from ?+* operand. */
01414          len--;
01415          ilen--;
01416          if (regparsestr[regparse + ilen] == '\\')
01417            --ilen;
01418        }
01419        *flagp |= HASWIDTH;
01420        if (len == 1)
01421          *flagp |= SIMPLE;
01422        regmatchmin = regmatchmax = len;
01423        ret = regnode((parse_flags & PARSE_CASE_SENS) ? EXACTLY : EXACTLY_CI);
01424        regarg(len);
01425        while (len > 0) {
01426          c = regparsestr[regparse++];
01427          if (c == '\\')
01428            c = regparsestr[regparse++];
01429          if (!(parse_flags & PARSE_CASE_SENS))
01430            c = rx_tolower(c);
01431          regc(c);
01432          len--;
01433        }
01434       }
01435     }
01436     break;
01437   }
01438 
01439   if (!ret)
01440     FAIL("failed!?");
01441        
01442   return ret;
01443 }
01444 
01445 static int regcharclass(int c, char *map)
01446 {
01447   switch(c) {
01448   case 'd':
01449     for (c = 0; c < 10; c++) {
01450       map['0' + c] = 1;
01451     }
01452     break;
01453   case 'D':
01454     for (c = 0; c < '0'; c++) {
01455       map[c] = 1;
01456     }
01457     for (c = '9' + 1; c < 256; c++) {
01458       map[c] = 1;
01459     }
01460     break;
01461   case 'w':
01462     for (c = 0; c < 26; c++) {
01463       map['a' + c] = 1;
01464       map['A' + c] = 1;
01465     }
01466     for (c = 0; c < 10; c++) {
01467       map['0' + c] = 1;
01468     }
01469     map['_'] = 1;
01470     break;
01471   case 'W':
01472     for (c = 0; c < '0'; c++) {
01473       map[c] = 1;
01474     }
01475     for (c = '9' + 1; c < 'A'; c++) {
01476       map[c] = 1;
01477     }
01478     for (c = 'Z' + 1; c < '_'; c++) {
01479       map[c] = 1;
01480     }
01481     for (c = 'z' + 1; c < 256; c++) {
01482       map[c] = 1;
01483     }
01484     break;
01485   case 's':
01486     map['\t'] = 1;
01487     map['\n'] = 1;
01488     map['\f'] = 1;
01489     map['\r'] = 1;
01490     map[' '] = 1;
01491     break;
01492   case 'S':
01493     for (c = 0; c < 256; c++) {
01494       switch (c) {
01495       case '\t':
01496       case '\n':
01497       case '\f':
01498       case '\r':
01499       case ' ':
01500        break;
01501       default:
01502        map[c] = 1;
01503        break;
01504       }
01505     }
01506     break;
01507   default:
01508     if (((c >= 'a') && (c <= 'z'))
01509        || ((c >= 'A') && (c <= 'Z'))) {
01510       FAIL("illegal alphabetic escape");
01511     }
01512     map[c] = 1;
01513     break;
01514   }
01515 
01516   return 1;
01517 }
01518 
01519 static int is_posix_char_class(char *str, int pos, int len, char *map)
01520 {
01521   int c;
01522 
01523   if (pos + 8 <= len) {
01524     if (!scheme_strncmp(":alnum:]", str XFORM_OK_PLUS pos, 8)) {
01525       if (map) {
01526         regcharclass('d', map);
01527         for (c = 'a'; c <= 'z'; c++) {
01528           map[c] = 1;
01529           map[c - ('a' - 'A')] = 1;
01530         }
01531       }
01532       return 1;
01533     } else if (!scheme_strncmp(":alpha:]", str XFORM_OK_PLUS pos, 8)) {
01534       if (map) {
01535         for (c = 'a'; c <= 'z'; c++) {
01536           map[c] = 1;
01537           map[c - ('a' - 'A')] = 1;
01538         }
01539       }
01540       return 1;
01541     } else if (!scheme_strncmp(":ascii:]", str XFORM_OK_PLUS pos, 8)) {
01542       if (map) {
01543         for (c = 0; c <= 127; c++) {
01544           map[c] = 1;
01545         }
01546       }
01547       return 1;
01548     } else if (!scheme_strncmp(":blank:]", str XFORM_OK_PLUS pos, 8)) {
01549       if (map) {
01550         map[' '] = 1;
01551         map['\t'] = 1;
01552       }
01553       return 1;
01554     } else if (!scheme_strncmp(":cntrl:]", str XFORM_OK_PLUS pos, 8)) {
01555       if (map) {
01556         for (c = 0; c <= 31; c++) {
01557           map[c] = 1;
01558         }
01559       }
01560       return 1;
01561     } else if (!scheme_strncmp(":digit:]", str XFORM_OK_PLUS pos, 8)) {
01562       if (map) {
01563         regcharclass('d', map);
01564       }
01565       return 1;
01566     } else if (!scheme_strncmp(":graph:]", str XFORM_OK_PLUS pos, 8)) {
01567       if (map) {
01568         for (c = 0; c <= 127; c++) {
01569           if (scheme_isgraphic(c))
01570             map[c] = 1;
01571         }
01572       }
01573       return 1;
01574     } else if (!scheme_strncmp(":lower:]", str XFORM_OK_PLUS pos, 8)) {
01575       if (map) {
01576         for (c = 'a'; c <= 'z'; c++) {
01577           map[c] = 1;
01578         }
01579       }
01580       return 1;
01581     } else if (!scheme_strncmp(":print:]", str XFORM_OK_PLUS pos, 8)) {
01582       if (map) {
01583         for (c = 0; c <= 127; c++) {
01584           if (scheme_isgraphic(c))
01585             map[c] = 1;
01586         }
01587         map[' '] = 1;
01588         map['\t'] = 1;
01589       }
01590       return 1;
01591     } else if (!scheme_strncmp(":space:]", str XFORM_OK_PLUS pos, 8)) {
01592       if (map) {
01593         regcharclass('s', map);
01594       }
01595       return 1;
01596     } else if (!scheme_strncmp(":upper:]", str XFORM_OK_PLUS pos, 8)) {
01597       if (map) {
01598         for (c = 'A'; c <= 'Z'; c++) {
01599           map[c] = 1;
01600         }
01601       }
01602       return 1;
01603     }
01604   }
01605   
01606   if ((pos + 7 <= len) 
01607       && !scheme_strncmp(":word:]", str XFORM_OK_PLUS pos, 7)) {
01608     if (map) {
01609       regcharclass('w', map);
01610     }
01611     return 1;
01612   } 
01613 
01614   if ((pos + 9 <= len)
01615       && !scheme_strncmp(":xdigit:]", str XFORM_OK_PLUS pos, 9)) {
01616     if (map) {
01617       regcharclass('d', map);
01618       for (c = 'a'; c <= 'f'; c++) {
01619         map[c] = 1;
01620         map[c - ('a' - 'A')] = 1;
01621       }
01622     }
01623     return 1;
01624   }
01625 
01626   return 0;
01627 }
01628 
01629 static int is_posix_char_class_in_unicode(mzchar *str, int pos, int len, char *map)
01630 {
01631   int ulen;
01632   int i;
01633   char buf[10];
01634 
01635   if (pos + 7 > len)
01636     return 0;
01637 
01638   ulen = len - pos;
01639   if (ulen > 9)
01640     ulen = 9;
01641 
01642   for (i = 0; i < ulen; i++) {
01643     if (str[pos + i] > 127)
01644       return 0;
01645     buf[i] = (char)str[pos + i];
01646   }
01647 
01648   return is_posix_char_class(buf, 0, ulen, map);
01649 }
01650 
01651 static char *regrange(int parse_flags, char *map)
01652 /* [ is already consumed; result is an array of 256 bytes of included chars */
01653 {
01654   int xclass, c;
01655   int classend, can_range = 0;
01656   int exclude = 0;
01657 
01658   if (regparsestr[regparse] == '^') { /* Complement of range. */
01659     exclude = 1;
01660     regparse++;
01661   }
01662 
01663   if (regparsestr[regparse] == ']' || regparsestr[regparse] == '-') {
01664     c = regparsestr[regparse];
01665     map[c] = 1;
01666     regparse++;
01667   }
01668   while (regparse != regparse_end && regparsestr[regparse] != ']') {
01669     if (regparsestr[regparse] == '-') {
01670       regparse++;
01671       if (regparsestr[regparse] == ']' || regparse == regparse_end) {
01672        map['-'] = 1;
01673       } else {
01674        if (!can_range) {
01675          FAIL("misplaced hypen within square brackets in pattern");
01676        } else {
01677          xclass = UCHAR(regparsestr[regparse-2])+1;
01678          classend = UCHAR(regparsestr[regparse]);
01679          if (classend == '-') {
01680            FAIL("misplaced hypen within square brackets in pattern");
01681          }
01682          if ((classend == '\\') && (parse_flags & PARSE_PCRE)) {
01683            if (regparse+1 == regparse_end) {
01684              FAIL("escaping backslash at end pattern (within square brackets)");
01685            }
01686            regparse++;
01687            classend = UCHAR(regparsestr[regparse]);
01688            if (((classend >= 'a') && (classend <= 'z'))
01689               || ((classend >= 'A') && (classend <= 'Z'))) {
01690              FAIL("misplaced hypen within square brackets in pattern");
01691            }
01692          }
01693          if (xclass > classend+1)
01694            FAIL("invalid range within square brackets in pattern");
01695          for (; xclass <= classend; xclass++) {
01696            c = xclass;
01697            map[c] = 1;
01698            if (!(parse_flags & PARSE_CASE_SENS)) {
01699              c = rx_toupper(c);
01700              map[c] = 1;
01701              c = rx_tolower(c);
01702              map[c] = 1;
01703            }
01704          }
01705          regparse++;
01706        }
01707       }
01708       can_range = 0;
01709     } else if ((regparsestr[regparse] == '\\') && (parse_flags & PARSE_PCRE)) {
01710       c = UCHAR(regparsestr[regparse + 1]);
01711       if (((c >= 'a') && (c <= 'z'))
01712          || ((c >= 'A') && (c <= 'Z'))) {
01713        regcharclass(c, map);
01714        can_range = 0;
01715       } else {
01716        map[c] = 1;
01717        can_range = 1;
01718       }
01719       regparse += 2;
01720     } else if ((regparsestr[regparse] == '[') 
01721               && (parse_flags & PARSE_PCRE)
01722               && (regparsestr[regparse+1] == ':')
01723               && is_posix_char_class(regparsestr, regparse + 1, regparse_end, map)) {
01724       regparse += 2;
01725       while (regparsestr[regparse] != ']') {
01726        regparse++;
01727       }
01728       regparse++;
01729       can_range = 0;
01730     } else {
01731       c = UCHAR(regparsestr[regparse++]);
01732       map[c] = 1;
01733       if (!(parse_flags & PARSE_CASE_SENS)) {
01734        c = rx_tolower(c); 
01735        map[c] = 1;
01736        c = rx_toupper(c); 
01737        map[c] = 1;
01738       }
01739       can_range = 1;
01740     }
01741   }
01742 
01743   if (exclude) {
01744     for (c = 0; c < 256; c++) {
01745       map[c] = !map[c];
01746     }
01747   }
01748 
01749   if (regparsestr[regparse] != ']')
01750     FAIL("missing closing square bracket in pattern");
01751   regparse++;
01752 
01753   return map;
01754 }
01755 
01756 static rxpos
01757 regranges(int parse_flags, int at_start)
01758 {
01759   int c;
01760   rxpos ret, save_regparse = 0;
01761   int count, all_ci, num_ci, off_ranges, on_ranges, now_on, last_on, prev_last_on, use_ci;
01762   char *new_map = NULL, *accum_map = NULL;
01763 
01764   count = 0;
01765   while (1) {
01766     /* This loop can end up parsing a range and not using the result,
01767        so that the range is parsed twice. That's ok, because there's
01768        no nesting (and therefore no exponential explosion). */
01769 
01770     if (!new_map)
01771       new_map = (char *)scheme_malloc_atomic(256);
01772     memset(new_map, 0, 256);
01773 
01774     if (regparsestr[regparse] == '\\'
01775        && (regparse + 1 < regparse_end)) {
01776       /* <char> */
01777       c = UCHAR(regparsestr[++regparse]);
01778       if (parse_flags & PARSE_PCRE) {
01779        if ((c >= '0') && (c <= '9'))
01780          break;
01781        if (((c >= 'a') && (c <= 'z'))
01782            || ((c >= 'A') && (c <= 'Z'))) {
01783           if ((c == 'p') || (c == 'P')) {
01784             /* unicode char class; give up */
01785             break;
01786           }
01787          regcharclass(regparsestr[regparse], new_map);
01788           
01789        } else
01790          new_map[c] = 1;
01791       } else
01792        new_map[c] = 1;
01793       regparse++;
01794     } else if (regstrcspn(regparsestr + regparse, regparsestr + regparse + 1,
01795                        (parse_flags & PARSE_PCRE) ? PCRE_META : META)) {
01796       /* <char> */
01797       c = UCHAR(regparsestr[regparse]);
01798       new_map[c] = 1;
01799       if (!(parse_flags & PARSE_CASE_SENS))  {
01800        c = rx_tolower(c);
01801        new_map[c] = 1;
01802        c = rx_toupper(c);
01803        new_map[c] = 1;
01804       }
01805       regparse++;
01806     } else if (regparsestr[regparse] == '.') {
01807       /* . */
01808       for (c = 0; c < 256; c++) {
01809        new_map[c] = 1;
01810       }
01811       if (!(parse_flags & PARSE_SINGLE_LINE))
01812        new_map['\n'] = 0;
01813       regparse++;
01814     } else if (regparsestr[regparse] == '[') {
01815       /* [...] */
01816       regparse++;
01817       regrange(parse_flags, new_map);
01818     } else
01819       break;
01820 
01821     /* If the most recently parsed range is not 
01822        continued by a branch or the end of a sub-sequence,
01823        then abandon it, because it actually belongs
01824        with a new sequence. */
01825     if (accum_map
01826        && (regparse < regparse_end)
01827        && (regparsestr[regparse] != '|')
01828        && (regparsestr[regparse] != ')'))
01829       break;
01830     
01831     /* We'll keep it. Merge char maps so far: */
01832     if (accum_map) {
01833       for (c = 0; c < 256; c++) {
01834        accum_map[c] |= new_map[c];
01835       }
01836     } else {
01837       accum_map = new_map;
01838       new_map = NULL;
01839     }
01840     save_regparse = regparse;
01841     
01842     /* If we're at the end, or if we can only do one, then we're done. */
01843     if (!at_start
01844        || (regparsestr[regparse] != '|')
01845        || (regparse >= regparse_end)
01846        || (regparsestr[regparse] == ')'))
01847       break;
01848 
01849     regparse++;
01850     if (regparse == regparse_end)
01851       break;
01852   }
01853 
01854   regparse = save_regparse;
01855 
01856   if (!accum_map)
01857     FAIL("should have found one range!");
01858 
01859   use_ci = 0;
01860   while (1) {
01861     /* Collect stats to pick the best run-time implementation for a range.
01862        We may do this twice if we decide to use a _CI variant. */
01863     count = 0;
01864     num_ci = 0;
01865     all_ci = 1;
01866     on_ranges = 0;
01867     off_ranges = 0;
01868     now_on = 0;
01869     last_on = -1;
01870     prev_last_on = -1;
01871     for (c = 0; c < 256; c++) {
01872       if (accum_map[c]) {
01873        if (now_on < 0)
01874          off_ranges++;
01875        now_on = 1;
01876        count++;
01877        prev_last_on = last_on;
01878        last_on = c;
01879 
01880        if (c != rx_tolower(c)) {
01881          if (accum_map[rx_tolower(c)] != accum_map[c])
01882            all_ci = 0;
01883          num_ci++;
01884        } else if (c != rx_toupper(c)) {
01885          if (accum_map[rx_toupper(c)] != accum_map[c])
01886            all_ci = 0;
01887          num_ci++;
01888        }
01889       } else {
01890        if (now_on > 0)
01891          on_ranges++;
01892        now_on = -1;
01893       }
01894     }
01895     if (now_on > 0)
01896       on_ranges++;
01897     else
01898       off_ranges++;
01899 
01900     /* Pick the best run-time implementation for a range. */
01901     if (count == 256) {
01902       return regnode(ANY);
01903     } else if ((count == 255) && !accum_map['\n']) {
01904       return regnode(ANYL);
01905     } else if (count == 1) {
01906       ret = regnode(EXACTLY1);
01907       regc(last_on);
01908       return ret;
01909     } else if (count == 2) {
01910       ret = regnode(EXACTLY2);
01911       regc(last_on);
01912       regc(prev_last_on);
01913       return ret;
01914     } else if ((on_ranges == 1)
01915               ||  (off_ranges == 1)) {
01916       int rs = 255, re = 255, on;
01917 
01918       if (on_ranges == 1)
01919        on = 1;
01920       else
01921        on = 0;
01922 
01923       for (c = 0; c < 256; c++) {
01924        if (!!accum_map[c] == on) {
01925          rs = c;
01926          break;
01927        }
01928       }
01929       for (c++; c < 256; c++) {
01930        if (!accum_map[c] == on) {
01931          re = c - 1;
01932          break;
01933        }
01934       }
01935 
01936       if (on)
01937        ret = regnode(RANGE);
01938       else
01939        ret = regnode(NOTRANGE);
01940       regc(rs);
01941       regc(re);
01942       return ret;
01943     } else {
01944       rxpos a;
01945 
01946       ret = regnode(ANYOF);
01947       a = regcode;
01948       for (c = 0; c < 32; c++) {
01949        regc(0);
01950       }
01951 
01952       if (regcode <= regcodesize) {
01953        for (c = 0; c < 256; c++) {
01954          if (accum_map[c]) {
01955            regstr[a + (c >> 3)] |= (1 << (c & 0x7));
01956          }
01957        }
01958       }
01959 
01960       return ret;
01961     }
01962   }
01963 }
01964 
01965 static char *prop_names[] = { "Cn",
01966                            "Cc",
01967                            "Cf",
01968                            "Cs",
01969                            "Co",
01970                            "Ll",
01971                            "Lu",
01972                            "Lt",
01973                            "Lm",
01974                            "Lo",
01975                            "Nd",
01976                            "Nl",
01977                            "No",
01978                            "Ps",
01979                            "Pe",
01980                            "Pi",
01981                            "Pf",
01982                            "Pc",
01983                            "Pd",
01984                            "Po",
01985                            "Mn",
01986                            "Mc",
01987                            "Me",
01988                            "Sc",
01989                            "Sk",
01990                            "Sm",
01991                            "So",
01992                            "Zl",
01993                            "Zp",
01994                            "Zs",
01995                            NULL};
01996 
01997 static rxpos
01998 regunicode(int negate)
01999 {
02000   rxpos ret;
02001   int len, bottom, top, i;
02002 
02003   if (regparsestr[regparse] != '{') {
02004     FAIL("expected { after \\p or \\P");
02005   }
02006   regparse++;
02007   if (regparsestr[regparse] == '^') {
02008     negate = !negate;
02009     regparse++;
02010   }
02011   
02012   len = 0;
02013   while ((regparsestr[regparse + len] != '}')
02014         && (regparse + len < regparse_end)) {
02015     len++;
02016   }
02017 
02018   if (regparse + len >= regparse_end) {
02019     FAIL("missing } to close \\p{ or \\P{");
02020   }
02021 
02022   bottom = top = -1;
02023   if (len == 2) {
02024     for (i = 0; prop_names[i]; i++) {
02025       if ((regparsestr[regparse] == prop_names[i][0])
02026          && (regparsestr[regparse+1] == prop_names[i][1])) {
02027        bottom = top = i;
02028        break;
02029       }
02030     }
02031     if (bottom == -1) {
02032       if ((regparsestr[regparse] == 'L')
02033          && (regparsestr[regparse+1] == '&')) {
02034        bottom = mzu_Ll;
02035        top = mzu_Lm;
02036       }
02037     }
02038   } else if (len == 1) {
02039     if (regparsestr[regparse] == '.') {
02040       bottom = 0;
02041       top = mzu_LAST;
02042     } else {
02043       for (i = 0; prop_names[i]; i++) {
02044        if (regparsestr[regparse] == prop_names[i][0]) {
02045          bottom = i;
02046          while (prop_names[i+1]) {
02047            if (regparsestr[regparse] != prop_names[i+1][0])
02048              break;
02049            i++;
02050          }
02051          top = i;
02052          break;
02053        }
02054       }
02055     }
02056   }
02057 
02058   if (bottom < 0) {
02059     FAIL("unrecognized property name in \\p{} or \\P{}");
02060   }
02061 
02062   regparse += len + 1;
02063 
02064   ret = regnode(UNIPROP);
02065   /* This encoding accomodates up to 63 categories: */
02066   regarg((negate << 13) | (bottom << 6) | top);
02067 
02068   return ret;
02069 }
02070 
02071 static int regdigit()
02072 {
02073   int posn, c;
02074   c = regparsestr[regparse++];
02075   posn = c - '0';
02076   while (regparse < regparse_end) {
02077     c = regparsestr[regparse];
02078     if ((c >= '0') && (c <= '9')) {
02079       posn = (posn * 10) + (c - '0');
02080       if (posn > 0x7FFF)
02081        FAIL("backreference number is too large");
02082       regparse++;
02083     } else
02084       break;
02085   }
02086   if (posn > regmaxbackposn)
02087     regmaxbackposn = posn;
02088   return posn;
02089 }
02090 
02091 /*
02092    - regnode - emit a node
02093    */
02094 static rxpos                /* Location. */
02095 regnode(char op)
02096 {
02097   rxpos ret;
02098   rxpos ptr;
02099 
02100   ret = regcode;
02101   if (regcode + 3 >= regcodesize) {
02102     regcode += 3;
02103     if (regcode > regcodemax)
02104       regcodemax = regcode;
02105     return ret;
02106   }
02107 
02108   ptr = ret;
02109   regstr[ptr++] = op;
02110   regstr[ptr++] = '\0';            /* Null "next" pointer. */
02111   regstr[ptr++] = '\0';
02112   regcode = ptr;
02113 
02114   if (regcode > regcodemax)
02115     regcodemax = regcode;
02116     
02117   return ret;
02118 }
02119 
02120 /*
02121    - regc - emit (if appropriate) a byte of code
02122    */
02123 static void
02124 regc(char b)
02125 {
02126   if (regcode + 1 < regcodesize)
02127     regstr[regcode] = b;
02128   regcode++;
02129   if (regcode > regcodemax)
02130     regcodemax = regcode;
02131 }
02132 
02133 static void
02134 regarg(int v)
02135 {
02136   regc(v >> 8);
02137   regc(v & 255);
02138 }
02139 
02140 /*
02141    - reginsert - insert an operator in front of already-emitted operand
02142    *
02143    * Means relocating the operand.
02144    */
02145 static void
02146 regshift(int amt, rxpos opnd)
02147 {
02148   if (regcode + amt < regcodesize) {
02149     memmove(regstr XFORM_OK_PLUS opnd + amt, 
02150            regstr XFORM_OK_PLUS opnd, 
02151            regcode - opnd);
02152   }
02153   regcode += amt;
02154   if (regcode > regcodemax)
02155     regcodemax = regcode;
02156 }
02157 
02158 static void
02159 reginsert(char op, rxpos opnd)
02160 {
02161   regshift(3, opnd);
02162 
02163   if (opnd + 3 >= regcodesize) {
02164     return;
02165   }
02166 
02167   regstr[opnd++] = op;
02168   regstr[opnd++] = '\0'; /* tail */
02169   regstr[opnd++] = '\0';
02170 }
02171 
02172 static rxpos
02173 reginsertwithop(char op, rxpos opnd, int arg)
02174 {
02175   regshift(5, opnd);
02176 
02177   if (opnd + 5 >= regcodesize) {
02178     return opnd + 5;
02179   }
02180 
02181   regstr[opnd++] = op;
02182   regstr[opnd++] = '\0'; /* tail */
02183   regstr[opnd++] = '\0';
02184   regstr[opnd++] = (arg >> 8);
02185   regstr[opnd++] = (arg & 255);
02186 
02187   return opnd;
02188 }
02189 
02190 static rxpos
02191 reginsertwithopop(char op, rxpos opnd, int arg, int arg2)
02192 {
02193   regshift(7, opnd);
02194 
02195   if (opnd + 7 >= regcodesize) {
02196     return opnd + 7;
02197   }
02198 
02199   regstr[opnd++] = op;
02200   regstr[opnd++] = '\0'; /* tail */
02201   regstr[opnd++] = '\0';
02202   regstr[opnd++] = (arg >> 8);
02203   regstr[opnd++] = (arg & 255);
02204   regstr[opnd++] = (arg2 >> 8);
02205   regstr[opnd++] = (arg2 & 255);
02206 
02207   return opnd;
02208 }
02209 
02210 /*
02211    - regtail - set the next-pointer at the end of a node chain
02212    */
02213 static void
02214 regtail(rxpos p, rxpos val)
02215 {
02216   rxpos scan;
02217   rxpos temp;
02218   int offset;
02219 
02220   /* Find last node. */
02221   scan = p;
02222   for (;;) {
02223     if (scan + 2 >= regcodesize) {
02224       return;
02225     }
02226     temp = regnext(scan);
02227     if (temp == 0)
02228       break;
02229     scan = temp;
02230   }
02231 
02232   if (scan + 2 >= regcodesize) {
02233     return;
02234   }
02235   
02236   if (rOP(scan) == BACK)
02237     offset = scan - val;
02238   else
02239     offset = val - scan;
02240   regstr[scan+1] = (offset>>8)&255;
02241   regstr[scan+2] = offset&255;
02242 }
02243 
02244 /*
02245    - regoptail - regtail on operand of first argument; nop if operandless
02246    */
02247 static void
02248 regoptail(rxpos p, rxpos val)
02249 {
02250   /* "Operandless" and "op != BRANCH" are synonymous in practice. */
02251   if (p == 0 || (p >= regcodesize) || rOP(p) != BRANCH) {
02252     return;
02253   }
02254   regtail(OPERAND(p), val);
02255 }
02256 
02257 static int merge_tables(Scheme_Hash_Table *dest, Scheme_Hash_Table *src)
02258 {
02259   int i;
02260 
02261   for (i = src->size; i--; ) {
02262     if (src->vals[i]) {
02263       scheme_hash_set(dest, src->keys[i], src->vals[i]);
02264     }
02265   }
02266 
02267   return 1;
02268 }
02269 
02270 static int check_and_propagate_depends(void)
02271 {
02272   int i, j;
02273   Scheme_Hash_Table *backdepends = regbackdepends, *ht, *next_ht = NULL;
02274   Scheme_Object *v;
02275   
02276   while (backdepends) {
02277     for (i = backdepends->size; i--; ) {
02278       if (backdepends->vals[i]) {
02279        if (regbackknown)
02280          v = scheme_hash_get(regbackknown, backdepends->keys[i]);
02281        else
02282          v = NULL;
02283        if (v) {
02284          /* Check assumption: */
02285          if (SCHEME_FALSEP(v)) {
02286            FAIL("*, +, or {...,} operand could be empty (via empty backreference)");
02287          }
02288          if (SCHEME_HASHTP(v)) {
02289            /* Check/propagate assumption. The fixpoint direction is 
02290               determined by assuming "true" whil erecursively checking. */
02291            scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
02292            if (!next_ht)
02293              next_ht = scheme_make_hash_table(SCHEME_hash_ptr);
02294            ht = (Scheme_Hash_Table *)v;
02295            for (j = ht->size; j--; ) {
02296              if (ht->vals[j]) {
02297               scheme_hash_set(next_ht, ht->keys[j], ht->vals[j]);
02298              }
02299            }
02300          }
02301        } else {
02302          /* Add assumption */
02303          if (!regbackknown)
02304            regbackknown = scheme_make_hash_table(SCHEME_hash_ptr);
02305          scheme_hash_set(regbackknown, backdepends->keys[i], scheme_true);
02306        }
02307       }
02308     }
02309     backdepends = next_ht;
02310     next_ht = NULL;
02311   }
02312 
02313   return 1;
02314 }
02315 
02316 static MZ_INLINE rxpos l_strchr(char *str, rxpos a, int l, int c)
02317 {
02318   int i;
02319 
02320   for (i = 0; i < l; i++) {
02321     if (str[a + i] == c)
02322       return a + i;
02323   }
02324 
02325   return -1;
02326 }
02327 
02328 static MZ_INLINE rxpos l_strchr_ci(char *str, rxpos a, int l, int c)
02329 {
02330   int i, ch;
02331 
02332   for (i = 0; i < l; i++) {
02333     ch = str[a + i];
02334     ch = rx_tolower(ch);
02335     if (ch == c)
02336       return a + i;
02337   }
02338 
02339   return -1;
02340 }
02341 
02342 static MZ_INLINE int in_ranges(char *str, rxpos a, int l, int c)
02343 {
02344   int i;
02345 
02346   l *= 2;
02347 
02348   for (i = 0; i < l; i += 2) {
02349     if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
02350       return 1;
02351   }
02352 
02353   return 0;
02354 }
02355 
02356 static MZ_INLINE int in_ranges_ci(char *str, rxpos a, int l, int c)
02357 {
02358   int i;
02359 
02360   l *= 2;
02361 
02362   c = rx_tolower(c);
02363 
02364   for (i = 0; i < l; i += 2) {
02365     if ((UCHAR(str[a + i]) <= c) && (UCHAR(str[a + i + 1]) >= c))
02366       return 1;
02367   }
02368 
02369   return 0;
02370 }
02371 
02372 /*
02373  * regexec and friends
02374  */
02375 
02376 /*
02377  * Forwards.
02378  */
02379 static int regtry(regexp *, char *, int, int, rxpos *, rxpos *, rxpos *, int *, Regwork *rw, rxpos, int, int, int);
02380 static int regtry_port(regexp *, Scheme_Object *, Scheme_Object *, int nonblock,
02381                      rxpos *, rxpos *, rxpos *, int *,
02382                      char **, rxpos *, rxpos *, rxpos, Scheme_Object*, Scheme_Object*, rxpos, int, int,
02383                      int);
02384 static int regmatch(Regwork *rw, rxpos);
02385 static int regrepeat(Regwork *rw, rxpos, int);
02386 
02387 #ifdef DEBUG
02388 int regnarrate = 0;
02389 void regdump();
02390 static char *regprop();
02391 #endif
02392 
02393 #define REGPORT_FLUSH_THRESHOLD 256
02394 
02395 /*
02396    - regexec - match a regexp against a string
02397    */
02398 static int
02399 regexec(const char *who,
02400        regexp *prog, char *string, 
02401        /* used only for strings: */
02402        int stringpos, int stringlen, 
02403        /* Always used: */
02404        rxpos *startp, rxpos *maybep, rxpos *endp,
02405        Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
02406        /* Used only when port is non-NULL: */
02407        char **stringp, int peek, int get_offsets,
02408        Scheme_Object *discard_oport, 
02409        Scheme_Object *portstart, Scheme_Object *portend, Scheme_Object **_dropped)
02410 {
02411   int spos;
02412   int *counters;
02413   Scheme_Object *dropped = NULL, *peekskip = NULL; /* used for ports, only */
02414  
02415   /* Check validity of program. */
02416   if (UCHAR(prog->program[0]) != MAGIC) {
02417     regerror("corrupted program");
02418     return(0);
02419   }
02420 
02421   /* If there is a "must appear" string, look for it. */
02422   if (!port && (prog->regmust >= 0)) {
02423     spos = stringpos;
02424     while (1) {
02425       int i, l = prog->regmlen, ch, pos;
02426       GC_CAN_IGNORE char *p;
02427 
02428       if ((spos - stringpos) + l <= stringlen) {
02429        if (prog->flags & REGEXP_MUST_CI)
02430          pos = l_strchr_ci(string, spos, stringlen - (spos - stringpos) - (l - 1), 
02431                          (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
02432        else
02433          pos = l_strchr(string, spos, stringlen - (spos - stringpos) - (l - 1), 
02434                       (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust)[0]);
02435        if (pos == -1)
02436          return 0; /* Not present. */
02437       } else
02438        return 0; /* Not present, since there's not enough room left. */
02439 
02440       /* ASSUMING NO GC HERE! */
02441       p = (ITO(prog->program, (char *)prog) XFORM_OK_PLUS prog->regmust);
02442       if (prog->flags & REGEXP_MUST_CI) {
02443        for (i = 0; i < l; i++) {
02444          ch = string[pos + i];
02445          ch = rx_tolower(ch);
02446          if (ch != p[i])
02447            break;
02448        }
02449       } else {
02450        for (i = 0; i < l; i++) {
02451          if (string[pos + i] != p[i])
02452            break;
02453        }
02454       }
02455       if (i >= l)
02456        break; /* Found it. */
02457       spos = pos + 1;
02458     }
02459   }
02460 
02461   if (prog->ncounter) {
02462     counters = (int *)scheme_malloc_atomic(sizeof(int) * prog->ncounter);
02463   } else
02464     counters = NULL;
02465 
02466   if (port) {
02467     if (peek) {
02468       peekskip = portstart;
02469       dropped = portstart;
02470       /* Make sure that's there's not an EOF before peekskip: */
02471       if (!SAME_OBJ(peekskip, scheme_make_integer(0))) {
02472         char tmp[1];
02473         long got;
02474         got = scheme_get_byte_string_unless("regexp-match", port, 
02475                                             tmp, 0, 1, 1,
02476                                             1, scheme_bin_minus(peekskip, scheme_make_integer(1)),
02477                                             unless_evt);
02478         if (got == EOF) {
02479           /* Hit EOF before peekstart, so cannot match */
02480           return 0;
02481         }
02482       }
02483     } else {
02484       /* In non-peek port mode, skip over portstart chars: */
02485       long amt, got;
02486 
02487       if (SCHEME_INTP(portstart)) {
02488        amt = SCHEME_INT_VAL(portstart);
02489        if (amt > 4096)
02490          amt = 4096;
02491       } else
02492        amt = 4096;
02493 
02494       dropped = scheme_make_integer(0);
02495        
02496       if (amt) {
02497        char *drain;
02498 
02499        drain = (char *)scheme_malloc_atomic(amt);
02500 
02501        do {
02502          got = scheme_get_byte_string(who, port, drain, 0, amt, 0, 0, 0);
02503          if (got != EOF) {
02504            Scheme_Object *delta;
02505            
02506            if (discard_oport)
02507              scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
02508            
02509            dropped = scheme_bin_plus(dropped, scheme_make_integer(got));
02510            delta = scheme_bin_minus(portstart, dropped);
02511            if (scheme_bin_gt(scheme_make_integer(amt), delta))
02512              amt = SCHEME_INT_VAL(delta);
02513          }
02514        } while ((got != EOF) && amt);
02515        if (amt)
02516          return 0; /* can't skip far enough, so it fails */
02517       }
02518     }
02519 
02520     if (portend)
02521       portend = scheme_bin_minus(portend, dropped);
02522   }
02523 
02524   /* Simplest case:  anchored match need be tried only once. */
02525   if (prog->flags & REGEXP_ANCH) {
02526     if (port) {
02527       rxpos len = 0, space = 0;
02528 
02529       *stringp = NULL;
02530       if (regtry_port(prog, port, unless_evt, nonblock, 
02531                     startp, maybep, endp, counters, stringp, &len, &space, 0, 
02532                     portend, peekskip, 0, 1, 1, 0)) {
02533        if (!peek) {
02534          /* Need to consume matched chars: */
02535          char *drain;
02536          long got;
02537 
02538          if (discard_oport && *startp)
02539            scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
02540 
02541          if (get_offsets)
02542            drain = *stringp;
02543          else
02544            /* Allocate fresh in case we get different results from previous peek: */
02545            drain = (char *)scheme_malloc_atomic(*endp);
02546          got = scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
02547        }
02548 
02549        *_dropped = dropped;
02550 
02551        return 1;
02552       } else {
02553        if (!peek) {
02554          /* Need to consume all chars, up to portend */
02555          char *drain;
02556          long got;
02557          
02558          if (portend && SCHEME_INTP(portend) && SCHEME_INT_VAL(portend) < 4096) {
02559            got = SCHEME_INT_VAL(portend);
02560          } else
02561            got = 4096;
02562 
02563          drain = (char *)scheme_malloc_atomic(got);
02564 
02565          while ((got = scheme_get_byte_string(who, port, drain, 0, got, 0, 0, 0)) != EOF) {
02566            if (discard_oport)
02567              scheme_put_byte_string(who, discard_oport, drain, 0, got, 0);
02568 
02569            if (portend) {
02570              portend = scheme_bin_minus(portend, scheme_make_integer(got));
02571              if (SCHEME_INTP(portend)) {
02572               got = SCHEME_INT_VAL(portend);
02573               if (!got)
02574                 break;
02575               else if (got > 4096)
02576                 got = 4096;
02577              }
02578            } else
02579              got = 4096;
02580          }
02581        }
02582        return 0;
02583       }
02584     } else
02585       return regtry(prog, string, stringpos, stringlen, startp, maybep, endp, counters, 0, 
02586                   stringpos, 1, 1, 0);
02587   }
02588 
02589   /* Messy cases:  unanchored match. */
02590   spos = stringpos;
02591   if (port) {
02592     int at_line_start = 1;
02593     rxpos len = 0, skip = 0, space = 0;
02594     *stringp = NULL;
02595 
02596     do {
02597       int discard = skip - prog->maxlookback;
02598       if (discard >= REGPORT_FLUSH_THRESHOLD) {
02599        if (!peek) {
02600          if (discard_oport)
02601            scheme_put_byte_string(who, discard_oport, *stringp, 0, discard, 0);
02602            
02603          scheme_get_byte_string(who, port, *stringp, 0, discard, 0, 0, 0);
02604 
02605          if (portend)
02606            portend = scheme_bin_minus(portend, scheme_make_integer(discard));
02607        } else {
02608          peekskip = scheme_bin_plus(peekskip, scheme_make_integer(discard));
02609        }
02610 
02611        dropped = scheme_bin_plus(dropped, scheme_make_integer(discard));
02612 
02613        len -= discard;
02614        skip -= discard;
02615        memmove(*stringp, *stringp + discard, len);
02616       }
02617 
02618       if (regtry_port(prog, port, unless_evt, nonblock,
02619                     startp, maybep, endp, counters, stringp, &len, &space, skip, 
02620                     portend, peekskip, 0, !space, at_line_start, 1)) {
02621        if (!peek) {
02622          char *drain;
02623 
02624          if (discard_oport && *startp)
02625            scheme_put_byte_string(who, discard_oport, *stringp, 0, *startp, 0);
02626 
02627          if (get_offsets)
02628            drain = *stringp;
02629          else
02630            /* Allocate fresh in case we get different results from previous peek: */
02631            drain = (char *)scheme_malloc_atomic(*endp);
02632 
02633          scheme_get_byte_string(who, port, drain, 0, *endp, 0, 0, 0);
02634        }
02635 
02636        *_dropped = dropped;
02637 
02638        return 1;
02639       }
02640       at_line_start = ((skip < len) && ((*stringp)[skip] == '\n'));
02641       skip++;
02642     } while (len >= skip);
02643 
02644     if (!peek) {
02645       /* If we get here, there must be `len' leftover characters in the port,
02646         and `*stringp' must hold the characters: */
02647       if (len > 0) {
02648        if (discard_oport)
02649          scheme_put_byte_string(who, discard_oport, *stringp, 0, len, 0);
02650        scheme_get_byte_string(who, port, *stringp, 0, len, 0, 0, 0);
02651       }
02652     }
02653   } else {
02654     if (regtry(prog, string, spos, stringlen - (spos - stringpos), 
02655               startp, maybep, endp, counters,
02656               0, stringpos, 1, 1, 1))
02657       return 1;
02658   }
02659 
02660   /* Failure. */
02661   return 0;
02662 }
02663 
02664 /*
02665    - regtry - try match at specific point
02666    */
02667 static int                  /* 0 failure, 1 success */
02668 regtry(regexp *prog, char *string, int stringpos, int stringlen, 
02669        rxpos *startp, rxpos *maybep, rxpos *endp, int *counters,
02670        Regwork *rw, rxpos stringorigin, int atstart, int atlinestart,
02671        int unanchored)
02672 {
02673   int i;
02674   Regwork _rw;
02675 
02676   if (!rw) {
02677     rw = &_rw;
02678     rw->port = NULL;
02679   }
02680   rw->instr = string;
02681   rw->input = stringpos;
02682   rw->input_end = stringpos + stringlen;
02683   rw->input_start = stringorigin;
02684   rw->startp = startp;
02685   rw->maybep = maybep;
02686   rw->endp = endp;
02687   rw->counters = counters;
02688   if (atstart)
02689     rw->boi = stringpos;
02690   else
02691     rw->boi = -1;
02692   if (atlinestart)
02693     rw->bol = stringpos;
02694   else
02695     rw->bol = -1;
02696 
02697   for (i = prog->nsubexp; i--; ) {
02698     startp[i] = -1;
02699     endp[i] = -1;
02700   }
02701 
02702 #ifdef INDIRECT_TO_PROGRAM
02703   regstr = prog->program;
02704 #else
02705   regstr = (char *)prog;
02706 #endif
02707 
02708   while (1) {
02709     int found;
02710 
02711     found = regmatch(rw, N_ITO_DELTA(prog->program, 1, (char *)prog));
02712 
02713     if (found) {
02714       startp[0] = stringpos;
02715       endp[0] = rw->input;
02716       return 1;
02717     } else if (unanchored) {
02718       if (!stringlen)
02719        return 0;
02720       stringpos++;
02721       --stringlen;
02722       if (prog->regstart) {
02723        unsigned char *rs = prog->regstart;
02724        int c;
02725        while (1) {
02726          if (!stringlen)
02727            return 0;
02728          c = UCHAR(string[stringpos]);
02729          if (rs[c >> 3] & (1 << (c & 0x7)))
02730            break;
02731          stringpos++;
02732          --stringlen;
02733        }
02734       }
02735       if (string[stringpos - 1] == '\n')
02736        rw->bol = stringpos;
02737       else
02738        rw->bol = -1;
02739       rw->boi = -1;
02740       rw->input = stringpos;      
02741       for (i = prog->nsubexp; i--; ) {
02742        startp[i] = -1;
02743        endp[i] = -1;
02744       }
02745       /* try again... */
02746     } else
02747       return 0;
02748   }
02749 }
02750 
02751 #define NEED_INPUT(rw, v, n) if (rw->port && (((v) + (n)) > rw->input_end)) read_more_from_regport(rw, (v) + (n))
02752 
02753 static void read_more_from_regport(Regwork *rw, rxpos need_total)
02754      /* Called when we're about to look past our read-ahead */
02755 {
02756   long got;
02757   Scheme_Object *peekskip;
02758 
02759   /* limit reading by rw->input_maxend: */
02760   if (need_total > rw->input_maxend) {
02761     need_total = rw->input_maxend;
02762     if (need_total <= rw->input_end) {
02763       rw->port = NULL; /* turn off further port reading */
02764       return;
02765     }
02766   }
02767 
02768   if (rw->instr_size < need_total) {
02769     char *naya;
02770     long size = rw->instr_size;
02771     
02772     size = size * 2;
02773     if (size < need_total)
02774       size += need_total;
02775     if (size < 16)
02776       size = 16;
02777 
02778     naya = (char *)scheme_malloc_atomic(size);
02779     memcpy(naya, rw->instr, rw->input_end);
02780 
02781     rw->instr = naya;
02782     rw->instr_size = size;
02783   }
02784 
02785   rw->str = regstr; /* get_string can swap threads */
02786 
02787   if (rw->input_maxend < rw->instr_size)
02788     got = rw->input_maxend - rw->input_end;
02789   else
02790     got = rw->instr_size - rw->input_end;
02791   
02792   if (rw->peekskip)
02793     peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
02794   else
02795     peekskip = scheme_make_integer(rw->input_end);
02796 
02797   /* Fill as much of our buffer as possible: */
02798   got = scheme_get_byte_string_unless("regexp-match", rw->port, 
02799                                   rw->instr, rw->input_end, got,
02800                                   (rw->nonblock
02801                                    ? 2   /* non-blocking read, as much as possible */
02802                                    : 1), /* read at least one char, and as much as possible */
02803                                   1, peekskip,
02804                                   rw->unless_evt);
02805 
02806   regstr = rw->str;
02807 
02808   if (got < 1) {
02809     /* EOF, special, or 0-due-to-unless/nonblock */
02810     if (!got)
02811       rw->aborted = 1;
02812     rw->port = NULL; /* turn off further port reading */
02813     rw->unless_evt = NULL;
02814   } else {
02815     rw->input_end += got;
02816 
02817     /* Non-blocking read got enough? If not, try again in blocking mode: */
02818     if (need_total > rw->input_end) {
02819       if (rw->nonblock) {
02820        rw->port = NULL; /* turn off further port reading */
02821        rw->unless_evt = NULL;
02822        rw->aborted = 1;
02823       } else {
02824        if (rw->peekskip)
02825          peekskip = scheme_bin_plus(scheme_make_integer(rw->input_end), rw->peekskip);
02826        else
02827          peekskip = scheme_make_integer(rw->input_end);
02828 
02829        rw->str = regstr; /* get_string can swap threads */
02830        got = scheme_get_byte_string_unless("regexp-match", rw->port, 
02831                                        rw->instr, rw->input_end, need_total - rw->input_end,
02832                                        0, /* blocking mode */
02833                                        1, peekskip,
02834                                        rw->unless_evt);
02835        regstr = rw->str;
02836       
02837        if (got == EOF) {
02838          rw->port = NULL; /* turn off further port reading */
02839          rw->unless_evt = NULL;
02840        } else
02841          rw->input_end += got;
02842       }
02843     }
02844   }
02845 }
02846 
02847 /*
02848    - regtry - try match in a port
02849    */
02850 static int
02851 regtry_port(regexp *prog, Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
02852            rxpos *startp, rxpos *maybep, rxpos *endp, int *counters,
02853            char **work_string, rxpos *len, rxpos *size, rxpos skip, 
02854            Scheme_Object *maxlen, Scheme_Object *peekskip, 
02855            rxpos origin, int atstart, int atlinestart,
02856            int read_at_least_one)
02857 {
02858   int m;
02859   Regwork rw;
02860 
02861   rw.port = port;
02862   rw.unless_evt = unless_evt;
02863   rw.nonblock = (short)nonblock;
02864   rw.aborted = 0;
02865   rw.instr_size = *size;
02866   if (maxlen && SCHEME_INTP(maxlen))
02867     rw.input_maxend = SCHEME_INT_VAL(maxlen);
02868   else
02869     rw.input_maxend = BIGGEST_RXPOS;
02870   rw.peekskip = peekskip;
02871 
02872   m = regtry(prog, *work_string, skip, (*len) - skip, 
02873             startp, maybep, endp, counters,
02874             &rw, origin, atstart, atlinestart, 0);
02875 
02876   if (read_at_least_one
02877       && !rw.aborted
02878       && (rw.input_end == skip)
02879       && rw.port) {
02880     read_more_from_regport(&rw, rw.input_end + 1);
02881   }
02882   
02883   *work_string = rw.instr;
02884   *len = rw.input_end;
02885   *size = rw.instr_size;
02886 
02887   if (rw.aborted)
02888     return 0;
02889   else
02890     return m;
02891 }
02892 
02893 #ifdef DO_STACK_CHECK
02894 
02895 static Scheme_Object *regmatch_k(void)
02896 {
02897   Scheme_Thread *p = scheme_current_thread;
02898   Regwork *rw = (Regwork *)p->ku.k.p1;
02899   int res;
02900 
02901   p->ku.k.p1 = NULL;
02902 
02903   regstr = rw->str; /* in case of thread swap */
02904  
02905   res = regmatch(rw, p->ku.k.i1);
02906 
02907   return (res ? scheme_true : scheme_false);
02908 }
02909 
02910 #endif
02911 
02912 /*
02913    - regmatch - main matching routine
02914    *
02915    * Conceptually the strategy is simple:  check to see whether the current
02916    * node matches, call self recursively to see whether the rest matches,
02917    * and then act accordingly.  In practice we make some effort to avoid
02918    * recursion, in particular by going through "ordinary" nodes (that don't
02919    * need to know whether the rest of the match failed) by a loop instead of
02920    * by recursion.
02921    */
02922 static int                  /* 0 failure, 1 success */
02923 regmatch(Regwork *rw, rxpos prog)
02924 {
02925   rxpos scan;        /* Current node. */
02926   rxpos is;          /* Input string pos */
02927   int the_op;
02928 
02929 #ifdef DO_STACK_CHECK
02930   {
02931 # include "mzstkchk.h"
02932     {
02933       Scheme_Thread *p = scheme_current_thread;
02934       Regwork *rw2;
02935       Scheme_Object *res;
02936 
02937       /* rw is likely be stack allocated, so copy out to
02938         the heap and then copy result back in on return. */
02939       rw2 = MALLOC_ONE_RT(Regwork);
02940       memcpy(rw2, rw, sizeof(Regwork));
02941 #ifdef MZTAG_REQUIRED
02942       rw2->type = scheme_rt_regwork;
02943 #endif
02944 
02945       rw2->str = regstr; /* in case of thread swap */
02946       p->ku.k.p1 = rw2;
02947       p->ku.k.i1 = prog;
02948       res = scheme_handle_stack_overflow(regmatch_k);
02949 
02950       memcpy(rw, rw2, sizeof(Regwork));
02951 
02952       return SCHEME_TRUEP(res);
02953     }
02954   }
02955 #endif
02956 
02957   if (DECREMENT_FUEL(scheme_fuel_counter, 1) <= 0) { 
02958     char *rs;
02959     rs = regstr;
02960     scheme_out_of_fuel();
02961     regstr = rs;
02962   }
02963 
02964   is = rw->input;
02965   scan = prog;
02966   while (scan != 0) {
02967     the_op = rOP(scan);
02968     switch (the_op) {
02969     case BOI:
02970       if (is != rw->boi)
02971        return(0);
02972       scan = NEXT_OP(scan);
02973       break;
02974     case EOI:
02975       NEED_INPUT(rw, is, 1);
02976       if (is != rw->input_end)
02977        return(0);
02978       scan = NEXT_OP(scan);
02979       break;
02980     case BOL:
02981       if ((is != rw->bol)
02982          && ((is <= rw->input_start)
02983              || (rw->instr[is - 1] != '\n')))
02984        return(0);
02985       scan = NEXT_OP(scan);
02986       break;
02987     case EOL:
02988       NEED_INPUT(rw, is, 1);
02989       if (is != rw->input_end) {
02990        if (rw->instr[is] != '\n')
02991          return(0);
02992       }
02993       scan = NEXT_OP(scan);
02994       break;
02995     case ANY:
02996       NEED_INPUT(rw, is, 1);
02997       if (is == rw->input_end)
02998        return(0);
02999       is++;
03000       scan = NEXT_OP(scan);
03001       break;
03002     case ANYL:
03003       NEED_INPUT(rw, is, 1);
03004       if (is == rw->input_end)
03005        return(0);
03006       if (rw->instr[is] == '\n')
03007        return 0;
03008       is++;
03009       scan = NEXT_OP(scan);
03010       break;
03011     case EXACTLY:
03012       {
03013        int len, i;
03014        rxpos opnd;
03015 
03016        opnd = OPSTR(OPERAND(scan));
03017        len = rOPLEN(OPERAND(scan));
03018        if (rw->port) {
03019          /* Like the other branch, but demand chars one at a time, as
03020             we need them */
03021          for (i = 0; i < len; i++) {
03022            NEED_INPUT(rw, is + i, 1);
03023            if (is + i >= rw->input_end)
03024              return 0;
03025            if (regstr[opnd+i] != rw->instr[is+i])
03026              return 0;
03027          }
03028        } else {
03029          if (len > rw->input_end - is)
03030            return 0;
03031          for (i = 0; i < len; i++) {
03032            if (regstr[opnd+i] != rw->instr[is+i])
03033              return 0;
03034          }
03035        }
03036        is += len;
03037       }
03038       scan = NEXT_OP(scan);
03039       break;
03040     case EXACTLY_CI:
03041       {
03042        int len, i;
03043        char c;
03044        rxpos opnd;
03045 
03046        opnd = OPSTR(OPERAND(scan));
03047        len = rOPLEN(OPERAND(scan));
03048        if (rw->port) {
03049          /* Like the other branch, but demand chars one at a time, as
03050             we need them */
03051          for (i = 0; i < len; i++) {
03052            NEED_INPUT(rw, is + i, 1);
03053            if (is + i >= rw->input_end)
03054              return 0;
03055            c = rw->instr[is+i];
03056            c = rx_tolower(c);
03057            if (regstr[opnd+i] != c)
03058              return 0;
03059          }
03060        } else {
03061          if (len > rw->input_end - is)
03062            return 0;
03063          for (i = 0; i < len; i++) {
03064            c = rw->instr[is+i];
03065            c = rx_tolower(c);
03066            if (regstr[opnd+i] != c)
03067              return 0;
03068          }
03069        }
03070        is += len;
03071       }
03072       scan = NEXT_OP(scan);
03073       break;
03074     case ANYOF:
03075       {
03076        int c;
03077        NEED_INPUT(rw, is, 1);
03078        if (is == rw->input_end)
03079          return 0;
03080        c = UCHAR(rw->instr[is]);
03081        if (!(regstr[OPERAND(scan) + (c >> 3)] & (1 << (c & 0x7))))
03082          return(0);
03083        is++;
03084        scan = NEXT_OP(scan);
03085       }
03086       break;
03087     case EXACTLY1:
03088       NEED_INPUT(rw, is, 1);
03089       if (is == rw->input_end)
03090        return 0;
03091       if (rw->instr[is] != regstr[OPERAND(scan)])
03092        return 0;
03093       is++;
03094       scan = NEXT_OP(scan);
03095       break;
03096     case EXACTLY2:
03097       NEED_INPUT(rw, is, 1);
03098       if (is == rw->input_end)
03099        return 0;
03100       if (rw->instr[is] != regstr[OPERAND(scan)])
03101         if (rw->instr[is] != regstr[OPERAND(scan)+1])
03102           return 0;
03103       is++;
03104       scan = NEXT_OP(scan);
03105       break;
03106     case RANGE:
03107       {
03108        int c;
03109        NEED_INPUT(rw, is, 1);
03110        if (is == rw->input_end)
03111          return 0;
03112        c = UCHAR(rw->instr[is]);
03113        if ((c < UCHAR(regstr[OPERAND(scan)]))
03114            || (c > UCHAR(regstr[OPERAND(scan)+1])))
03115          return(0);
03116        is++;
03117        scan = NEXT_OP(scan);
03118       }
03119       break;
03120     case NOTRANGE:
03121       {
03122        int c;
03123        NEED_INPUT(rw, is, 1);
03124        if (is == rw->input_end)
03125          return 0;
03126        c = UCHAR(rw->instr[is]);
03127        if ((c >= UCHAR(regstr[OPERAND(scan)]))
03128            && (c <= UCHAR(regstr[OPERAND(scan)+1])))
03129          return(0);
03130        is++;
03131        scan = NEXT_OP(scan);
03132       }
03133       break;
03134     case NOTHING:
03135       scan = NEXT_OP(scan);
03136       break;
03137     case BACK:
03138       scan = scan - rNEXT(scan);
03139       break;
03140     case BRANCH:
03141       {
03142         rxpos delta;
03143        rxpos next;  /* Next node. */
03144 
03145        next = NEXT_OP(scan);
03146 
03147         if (rOP(next) != BRANCH) /* No choice. */
03148           scan = OPERAND(scan);    /* Avoid recursion. */
03149         else {
03150           do {
03151             rw->input = is;
03152             if (regmatch(rw, OPERAND(scan)))
03153               return(1);
03154            scan = next;
03155            delta = rNEXT(scan);
03156             if (!delta)
03157              break;
03158             next = scan + delta;
03159           } while (rOP(next) == BRANCH);
03160          scan = OPERAND(scan);
03161         }
03162       }
03163       break;
03164     case STAR:
03165     case PLUS:
03166     case STAR2:
03167     case PLUS2:
03168     case STAR3:
03169     case STAR4:
03170       {
03171        char nextch;
03172        int no;
03173        rxpos save, body;
03174        int min, maxc;
03175        int nongreedy = (the_op == STAR2 || the_op == PLUS2 || the_op == STAR4);
03176        rxpos next;  /* Next node. */
03177        
03178        /*
03179         * Lookahead to avoid useless match attempts
03180         * when we know what character comes next.
03181         */
03182        nextch = '\0';
03183        next = NEXT_OP(scan);
03184        if (rOP(next) == EXACTLY)
03185          nextch = regstr[OPSTR(OPERAND(next))];
03186        if ((the_op == STAR3) || (the_op == STAR4)) {
03187          min = rOPLEN(OPERAND(scan));
03188          maxc = rOPLEN(OPERAND2(scan));
03189          body = OPERAND3(scan);
03190        } else {
03191          body = OPERAND(scan);
03192          min = ((the_op == STAR) || (the_op == STAR2)) ? 0 : 1;
03193          maxc = 0;
03194        }
03195        save = is;
03196 
03197        rw->input = is;
03198        if (nongreedy && rw->port) {
03199          /* Get at least one, but then don't
03200             let regrepeat pull in arbitrary code: */
03201          Scheme_Object *saveport;
03202          NEED_INPUT(rw, save, 1);
03203          saveport = rw->port;
03204          rw->port = NULL;
03205          no = regrepeat(rw, body, maxc);
03206          rw->port = saveport;
03207          nongreedy = 2;
03208        } else
03209          no = regrepeat(rw, body, maxc);
03210 
03211        if (!nongreedy) {
03212          if (nextch)
03213            NEED_INPUT(rw, save + no, 1);
03214          while (no >= min) {
03215            /* If it could work, try it. */
03216            if (nextch == '\0' || ((save + no < rw->input_end)
03217                                && (rw->instr[save + no] == nextch))) {
03218              rw->input = is + no;
03219              if (regmatch(rw, next))
03220               return(1);
03221            }
03222            /* Couldn't or didn't -- back up. */
03223            no--;
03224          }
03225        } else {
03226          int i;
03227          for (i = min; i <= no; i++) {
03228            /* If it could work, try it. */
03229            if (nextch)
03230              NEED_INPUT(rw, save + i, 1);
03231            if (nextch == '\0' || ((save+i < rw->input_end)
03232                                && (rw->instr[save+i] == nextch))) {
03233              rw->input = save + i;
03234              if (regmatch(rw, next)) {
03235               return(1);
03236              }
03237            }
03238 
03239            if ((i == no) && (nongreedy == 2)) {
03240              /* Maybe regrepeat can match more if we let it read from
03241                the port. */
03242              if ((rw->input_end - save) > no) {
03243               /* We have pulled-in chars to try. */
03244               int moreno;
03245               Scheme_Object *saveport;
03246 
03247               saveport = rw->port;
03248               rw->port = NULL;
03249               is = save + no;
03250               rw->input = is;
03251               moreno = regrepeat(rw, body, maxc ? maxc - no : 0);
03252               rw->port = saveport;
03253 
03254               if (!moreno)
03255                 nongreedy = 1;
03256               else
03257                 no += moreno;
03258              }
03259            }
03260          }
03261        }
03262        return(0);
03263       }
03264       break;
03265     case END:
03266     case LOOKE:
03267       rw->input = is;
03268       return(1);            /* Success! */
03269       break;
03270     case BACKREF:
03271       {
03272        int no, len, start, i;
03273        no = rOPLEN(OPERAND(scan));
03274        if (rw->endp[no] == -1)
03275          return 0;
03276 
03277        start = rw->startp[no];
03278        len = rw->endp[no] - start;
03279 
03280        if (rw->port) {
03281          /* Like the other branch, but demand chars one at a time, as
03282             we need them */
03283          for (i = 0; i < len; i++) {
03284            NEED_INPUT(rw, is + i, 1);
03285            if (is + i >= rw->input_end)
03286              return 0;
03287            if (rw->instr[start+i] != rw->instr[is+i])
03288              return 0;
03289          }
03290        } else {
03291          if (len > rw->input_end - is)
03292            return 0;
03293          for (i = 0; i < len; i++) {
03294            if (rw->instr[start+i] != rw->instr[is+i])
03295              return 0;
03296          }
03297        }
03298        is += len;
03299        scan = NEXT_OP(scan);
03300        break;
03301       }
03302     case BACKREF_CI:
03303       {
03304        int no, len, start, i, c1, c2;
03305        no = rOPLEN(OPERAND(scan));
03306        if (rw->endp[no] == -1)
03307          return 0;
03308 
03309        start = rw->startp[no];
03310        len = rw->endp[no] - start;
03311 
03312        if (rw->port) {
03313          /* Like the other branch, but demand chars one at a time, as
03314             we need them */
03315          for (i = 0; i < len; i++) {
03316            NEED_INPUT(rw, is + i, 1);
03317            if (is + i >= rw->input_end)
03318              return 0;
03319            c1 = rw->instr[start+i];
03320            c1 = rx_tolower(c1);
03321            c2 = rw->instr[is+i];
03322            c2 = rx_tolower(c2);
03323            if (c1 != c2)
03324              return 0;
03325          }
03326        } else {
03327          if (len > rw->input_end - is)
03328            return 0;
03329          for (i = 0; i < len; i++) {
03330            c1 = rw->instr[start+i];
03331            c1 = rx_tolower(c1);
03332            c2 = rw->instr[is+i];
03333            c2 = rx_tolower(c2);
03334            if (c1 != c2)
03335              return 0;
03336          }
03337        }
03338        is += len;
03339        scan = NEXT_OP(scan);
03340        break;
03341       }
03342     case LOOKT:
03343     case LOOKF:
03344     case LOOKTX:
03345     case LOOKBT:
03346     case LOOKBF:
03347       {
03348        int t, no, no_start, no_end;
03349        rxpos save, next;
03350        next = NEXT_OP(scan);
03351        t = ((the_op != LOOKF) && (the_op != LOOKBF));
03352        if ((the_op == LOOKBT)  || (the_op == LOOKBF)) {
03353          no_start = rOPLEN(OPERAND2(scan));
03354          no_end = rOPLEN(OPERAND3(scan));
03355        } else
03356          no_start = no_end = 0;
03357        save = is;
03358        if (no_end) {
03359          for (no = no_start; no <= no_end; no++) {
03360            if (is - rw->input_start >= no) {
03361              rw->input = save - no;
03362              if (regmatch(rw, next)) {
03363               if (is == save) {
03364                 /* Match */
03365                 if (!t) return 0;
03366                 break;
03367               }
03368              }
03369            } else {
03370              no = no_end + 1;
03371              break;
03372            }
03373          }
03374          if (no > no_end) {
03375            /* No matches */
03376            if (t) return 0;
03377          }
03378        } else {
03379          rw->input = is;
03380          if (regmatch(rw, next)) {
03381            if (!t) return 0;
03382          } else {
03383            if (t) return 0;
03384          }
03385          if (the_op == LOOKTX)
03386            is = rw->input;
03387        }
03388        scan = scan + rOPLEN(OPERAND(scan));
03389        scan = NEXT_OP(scan);
03390       }
03391       break;
03392     case COUNTINIT:
03393       {
03394        int no;
03395        no = rOPLEN(OPERAND(scan));
03396        rw->counters[no] = 0;
03397        scan = NEXT_OP(scan);
03398       }
03399       break;
03400     case COUNTBACK:
03401       {
03402        int no;
03403        no = rOPLEN(OPERAND(scan));
03404        rw->counters[no] -= 1;
03405        scan = NEXT_OP(scan);
03406       }
03407       break;
03408     case COUNTBACKFAIL:
03409       {
03410        int no;
03411        no = rOPLEN(OPERAND(scan));
03412        rw->counters[no] -= 1;
03413        return 0;
03414       }
03415       break;
03416     case COUNTUNDER:
03417       {
03418        int no, maxreps;
03419        no = rOPLEN(OPERAND(scan));
03420        maxreps = rOPLEN(OPERAND2(scan));
03421        rw->counters[no]++;
03422        if (maxreps && (rw->counters[no] > maxreps))
03423          return 0;
03424        scan = NEXT_OP(scan);
03425       }
03426       break;
03427     case COUNTOVER:
03428       {
03429        int no, minreps;
03430        no = rOPLEN(OPERAND(scan));
03431        minreps = rOPLEN(OPERAND2(scan));
03432        if (rw->counters[no] < minreps)
03433          return 0;
03434        scan = NEXT_OP(scan);
03435       }
03436       break;
03437     case SAVECONST:
03438       {
03439        int no, len;
03440        no = rOPLEN(OPERAND(scan));
03441        len = rOPLEN(OPERAND2(scan));
03442        /* Check that the match happened more than 0 times: */
03443        if (!len || (is > rw->maybep[no])) {
03444          rw->startp[no] = is - len;
03445          rw->endp[no] = is;
03446        } else {
03447          rw->startp[no] = -1;
03448          rw->endp[no] = -1;
03449        }
03450        scan = NEXT_OP(scan);
03451       }
03452       break;
03453     case MAYBECONST:
03454       {
03455        int no;
03456        no = rOPLEN(OPERAND(scan));
03457        rw->maybep[no] = is;
03458        scan = NEXT_OP(scan);
03459       }
03460       break;
03461     case WORDBOUND:
03462       {
03463         int c, w1, w2;
03464        NEED_INPUT(rw, is, 1);
03465        if (is > rw->input_start) {
03466           c = rw->instr[is - 1];
03467           w1 = rx_isword(c);
03468         } else
03469           w1 = 0;
03470         if (is < rw->input_end) {
03471           c = rw->instr[is];
03472           w2 = rx_isword(c);
03473         } else
03474           w2 = 0;
03475         if (w1 == w2) return 0;
03476        scan = NEXT_OP(scan);
03477       }
03478       break;
03479     case NOTWORDBOUND:
03480       {
03481         int c, w1, w2;
03482        NEED_INPUT(rw, is, 1);
03483        if (is > rw->input_start) {
03484           c = rw->instr[is - 1];
03485           w1 = rx_isword(c);
03486         } else
03487           w1 = 0;
03488         if (is < rw->input_end) {
03489           c = rw->instr[is];
03490           w2 = rx_isword(c);
03491         } else
03492           w2 = 0;
03493         if (w1 != w2) return 0;
03494        scan = NEXT_OP(scan);
03495       }
03496       break;
03497     case UNIPROP:
03498       {
03499        unsigned char buf[MAX_UTF8_CHAR_BYTES];
03500        mzchar us[1];
03501        int c, data;
03502        int v, pos;
03503        int negate, bottom, top;
03504        
03505        data = rOPLEN(OPERAND(scan));
03506 
03507        negate = data >> 13;
03508        bottom = (data >> 6) & 0x3F;
03509        top = data & 0x3F;
03510        
03511        NEED_INPUT(rw, rw->input, 1);
03512        if (rw->input < rw->input_end) {
03513          c = UCHAR(rw->instr[rw->input]);
03514          if (c < 128) {
03515            v = c;
03516            pos = 1;
03517          } else {
03518            pos = 1;
03519            buf[0] = c;
03520            while (1) {
03521              v = scheme_utf8_decode_prefix(buf, pos, us, 0);
03522              if (v == 1) {
03523               v = us[0];
03524               break;
03525              } else if (v < -1)
03526               return 0;
03527              NEED_INPUT(rw, rw->input, pos+1);
03528              if (rw->input + pos < rw->input_end) {
03529               buf[pos] = rw->instr[rw->input + pos];
03530               pos++;
03531              } else
03532               return 0;
03533            }
03534          }
03535        } else
03536          return 0;
03537   
03538        is += pos;
03539 
03540        v = scheme_general_category(v);
03541        
03542        if (negate) {
03543          if ((v >= bottom) && (v <= top))
03544            return 0;
03545        } else {
03546          if ((v < bottom) || (v > top))
03547            return 0;
03548        }
03549        
03550        scan = NEXT_OP(scan);
03551       }
03552       break;
03553     case CONDITIONAL:
03554       {
03555        rxpos test = OPERAND3(scan);
03556        int t;
03557 
03558        if (rOP(test) == BACKREF) {
03559          int no;
03560          no = rOPLEN(OPERAND(test));
03561          t = (rw->endp[no] > -1);
03562        } else {
03563          rw->input = is;
03564          t = regmatch(rw, test);
03565        }
03566 
03567        if (t)
03568          scan = scan + rOPLEN(OPERAND(scan));
03569        else
03570          scan = scan + rOPLEN(OPERAND2(scan));
03571       }
03572       break;
03573     default:
03574       {
03575        int isopen;
03576        int no;
03577 
03578        switch (the_op) {
03579        case OPENN:
03580          isopen = 1;
03581          no = rOPLEN(OPERAND(scan));
03582          if (!no)
03583            no = -1; /* => don't set in result array */
03584          break;
03585        case CLOSEN:
03586          isopen = 0;
03587          no = rOPLEN(OPERAND(scan));
03588          if (!no)
03589            no = -1; /* => don't set in result array */
03590          break;
03591        default:
03592          if (the_op < CLOSE) {
03593            isopen = 1;
03594            no = the_op - OPEN;
03595          } else {
03596            isopen = 0;
03597            no = the_op - CLOSE;
03598          }
03599        }
03600 
03601        if (no < 0) {
03602          /* No need to recur */
03603          scan = NEXT_OP(scan);
03604        } else {
03605          rxpos next;
03606 
03607          next = NEXT_OP(scan);
03608          rw->input = is;
03609           
03610          if (isopen) {
03611            int oldmaybe;
03612            oldmaybe = rw->maybep[no];
03613            rw->maybep[no] = is;
03614            if (regmatch(rw, next)) 
03615              return(1);
03616            else {
03617              rw->maybep[no] = oldmaybe;
03618              return(0);
03619            }
03620          } else {
03621            int oldstart, oldend;
03622 
03623            oldstart = rw->startp[no];
03624            oldend = rw->endp[no];
03625            rw->startp[no] = rw->maybep[no];
03626            rw->endp[no] = is;
03627 
03628            if (regmatch(rw, next)) {
03629              return(1);
03630            } else {
03631              rw->startp[no] = oldstart;
03632              rw->endp[no] = oldend;
03633               return(0);
03634            }
03635          }
03636        }
03637       }
03638       break;
03639     }
03640   }
03641 
03642   /*
03643    * We get here only if there's trouble -- normally "case END" is
03644    * the terminating point.
03645    */
03646   regerror("corrupted pointers");
03647   return(0);
03648 }
03649 
03650 /*
03651    - regrepeat - repeatedly match something simple, report how many
03652    */
03653 static int
03654 regrepeat(Regwork *rw, rxpos p, int maxc)
03655 {
03656   int count = 0;
03657   rxpos scan;
03658   rxpos opnd;
03659 
03660   scan = rw->input;
03661   opnd = OPERAND(p);
03662   switch (rOP(p)) {
03663   case ANY:
03664     if (rw->port) {
03665       if (maxc) {
03666        while (rw->port && (rw->input_end < scan + maxc)) {
03667          read_more_from_regport(rw, scan + maxc);
03668        }
03669       } else {
03670        /* need all port input: */
03671        while (rw->port) {
03672          read_more_from_regport(rw, rw->input_end + 4096);
03673        }
03674       }
03675     }
03676     count = rw->input_end - scan;
03677     if (maxc && (count > maxc))
03678       count = maxc;
03679     scan += count;
03680     break;
03681   case ANYL:
03682     {
03683       NEED_INPUT(rw, scan, 1);
03684       while (scan != rw->input_end
03685             && (rw->instr[scan] != '\n')) {
03686        count++;
03687        scan++;
03688        if (maxc) { maxc--; if (!maxc) break; }
03689        NEED_INPUT(rw, scan, 1);
03690       }
03691     }
03692     break;
03693   case EXACTLY:
03694     {
03695       rxpos opnd2 = OPSTR(opnd);
03696       NEED_INPUT(rw, scan, 1);
03697       while (scan != rw->input_end
03698             && (regstr[opnd2] == rw->instr[scan])) {
03699        count++;
03700        scan++;
03701        if (maxc) { maxc--; if (!maxc) break; }
03702        NEED_INPUT(rw, scan, 1);
03703       }
03704     }
03705     break;
03706   case EXACTLY_CI:
03707     {
03708       char c;
03709       rxpos opnd2 = OPSTR(opnd);
03710       NEED_INPUT(rw, scan, 1);
03711       while (scan != rw->input_end) {
03712        c = rw->instr[scan];
03713        c = rx_tolower(c);
03714        if (regstr[opnd2] != c)
03715          break;
03716        count++;
03717        scan++;
03718        if (maxc) { maxc--; if (!maxc) break; }
03719        NEED_INPUT(rw, scan, 1);
03720       }
03721     }
03722     break;
03723   case ANYOF:
03724     {
03725       int c;
03726       rxpos init = scan;
03727       if (rw->port || maxc) {
03728        /* Slow but general version */
03729        NEED_INPUT(rw, scan, 1);
03730        while (scan != rw->input_end) {
03731          c = UCHAR(rw->instr[scan]);
03732          if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
03733            break;
03734          scan++;
03735          if (maxc) { maxc--; if (!maxc) break; }
03736          NEED_INPUT(rw, scan, 1);
03737        }
03738       } else {
03739        /* Fast version */
03740        int e = rw->input_end;
03741        while (scan != e) {
03742          c = UCHAR(rw->instr[scan]);
03743          if (!(regstr[opnd + (c >> 3)] & (1 << (c & 0x7))))
03744            break;
03745          scan++;
03746        }
03747       }
03748       count = scan - init;
03749     }
03750     break;
03751   case EXACTLY1:
03752     {
03753       rxpos init = scan;
03754       char c;
03755       c = regstr[opnd];
03756       if (rw->port || maxc) {
03757        /* Slow but general version */
03758        NEED_INPUT(rw, scan, 1);
03759        while ((scan != rw->input_end)
03760               && (rw->instr[scan] == c)) {
03761          scan++;
03762          if (maxc) { maxc--; if (!maxc) break; }
03763          NEED_INPUT(rw, scan, 1);
03764        }
03765       } else {
03766        /* Fast version */
03767        int e = rw->input_end;
03768        while ((scan != e)
03769               && (rw->instr[scan] == c)) {
03770          scan++;
03771        }
03772       }
03773       count = scan - init;
03774     }
03775     break;
03776   case EXACTLY2:
03777     {
03778       rxpos init = scan;
03779       char c1, c2;
03780       c1 = regstr[opnd];
03781       c2 = regstr[opnd+1];
03782       if (rw->port || maxc) {
03783        /* Slow but general version */
03784        NEED_INPUT(rw, scan, 1);
03785        while ((scan != rw->input_end)
03786               && ((rw->instr[scan] == c1)
03787                    || (rw->instr[scan] == c2))) {
03788          scan++;
03789          if (maxc) { maxc--; if (!maxc) break; }
03790          NEED_INPUT(rw, scan, 1);
03791        }
03792       } else {
03793        /* Fast version */
03794        int e = rw->input_end;
03795        while ((scan != e)
03796               && ((rw->instr[scan] == c1)
03797                    || (rw->instr[scan] == c2))) {
03798          scan++;
03799        }
03800       }
03801       count = scan - init;
03802     }
03803     break;
03804   case RANGE:
03805     {
03806       rxpos init = scan;
03807       int c, sr, er;
03808       NEED_INPUT(rw, scan, 1);
03809       sr = UCHAR(regstr[opnd]);
03810       er = UCHAR(regstr[opnd + 1]);
03811       if (rw->port || maxc) {
03812        /* Slow but general version */
03813        while (scan != rw->input_end) {
03814          c = UCHAR(rw->instr[scan]);
03815          if ((c < sr) || (c > er))
03816            break;
03817          scan++;
03818          if (maxc) { maxc--; if (!maxc) break; }
03819          NEED_INPUT(rw, scan, 1);
03820        }
03821       } else {
03822        /* Fast version */
03823        int e = rw->input_end;
03824        while (scan != e) {
03825          c = UCHAR(rw->instr[scan]);
03826          if ((c < sr) || (c > er))
03827            break;
03828          scan++;
03829        }
03830       }
03831       count = scan - init;
03832     }
03833     break;
03834   case NOTRANGE:
03835     {
03836       rxpos init = scan;
03837       int c, sr, er;
03838       NEED_INPUT(rw, scan, 1);
03839       sr = UCHAR(regstr[opnd]);
03840       er = UCHAR(regstr[opnd + 1]);
03841       if (rw->port || maxc) {
03842        /* Slow but general version */
03843        while (scan != rw->input_end) {
03844          c = UCHAR(rw->instr[scan]);
03845          if ((c >= sr) && (c <= er))
03846            break;
03847          scan++;
03848          if (maxc) { maxc--; if (!maxc) break; }
03849          NEED_INPUT(rw, scan, 1);
03850        }
03851       } else {
03852        /* Fast version */
03853        int e = rw->input_end;
03854        while (scan != e) {
03855          c = UCHAR(rw->instr[scan]);
03856          if ((c >= sr) && (c <= er))
03857            break;
03858          scan++;
03859        }
03860       }
03861       count = scan - init;
03862     }
03863     break;
03864   default:                  /* Oh dear.  Called inappropriately. */
03865     regerror("internal foulup");
03866     count = 0;                     /* Best compromise. */
03867     break;
03868   }
03869   rw->input = scan;
03870 
03871   return(count);
03872 }
03873 
03874 /*
03875    - regnext - dig the "next" pointer out of a node
03876    */
03877 static rxpos
03878 regnext(rxpos p)
03879 {
03880   int offset;
03881 
03882   if (p + 2 >= regcodesize)
03883     return 0;
03884 
03885   offset = rNEXT(p);
03886   if (offset == 0)
03887     return 0;
03888 
03889   if (rOP(p) == BACK)
03890     return (p-offset);
03891   else
03892     return (p+offset);
03893 }
03894 
03895 /*
03896  * strcspn - find length of initial segment of s1 consisting entirely
03897  * of characters not from s2
03898  */
03899 
03900 static int
03901 regstrcspn(char *s1, char *e1, char *s2)
03902 {
03903   char *scan1;
03904   char *scan2;
03905   int count;
03906 
03907   count = 0;
03908   for (scan1 = s1; scan1 != e1; scan1++) {
03909     for (scan2 = s2; *scan2 != '\0';) { /* ++ moved down. */
03910       if (*scan1 == *scan2++)
03911        return(count);
03912     }
03913     count++;
03914   }
03915   return(count);
03916 }
03917 
03918 #ifndef strncpy
03919   extern char *strncpy();
03920 #endif
03921 
03922 /*
03923    - regsub - perform substitutions after a regexp match
03924    */
03925 static 
03926 char *regsub(regexp *prog, char *src, int sourcelen, long *lenout, char *insrc, 
03927              rxpos *startp, rxpos *endp)
03928 {
03929   char *dest;
03930   char c;
03931   long no;
03932   long len;
03933   long destalloc, destlen, srcpos;
03934        
03935   destalloc = 2 * sourcelen;
03936   destlen = 0;
03937   dest = (char *)scheme_malloc_atomic(destalloc + 1);
03938   
03939   srcpos = 0;
03940   while (srcpos < sourcelen) {
03941     c = src[srcpos++];
03942     if (c == '&')
03943       no = 0;
03944     else if (c == '\\') {
03945       if (src[srcpos] == '\\' || src[srcpos] == '&')
03946        no = -1;
03947       else if (src[srcpos] == '$') {
03948        no = prog->nsubexp + 1; /* Gives the empty string */
03949        srcpos++;
03950       } else {
03951        no = 0;
03952        while ('0' <= src[srcpos] && src[srcpos] <= '9') {
03953          no = (no * 10) + (src[srcpos++] - '0');
03954        }
03955       }
03956     } else
03957       no = -1;
03958 
03959 
03960     if (no < 0) {           /* Ordinary character. */
03961       if (c == '\\' && (src[srcpos] == '\\' || src[srcpos] == '&'))
03962        c = src[srcpos++];
03963       if (destlen + 1 >= destalloc) {
03964        char *old = dest;
03965        destalloc *= 2;
03966        dest = (char *)scheme_malloc_atomic(destalloc + 1);
03967        memcpy(dest, old, destlen);
03968       }
03969       dest[destlen++] = c;
03970     } else if (no >= prog->nsubexp) {
03971       /* Number too big; prentend it's the empty string */
03972     } else if (startp[no] != -1 && endp[no] != -1) {
03973       len = endp[no] - startp[no];
03974       if (len + destlen >= destalloc) {
03975        char *old = dest;
03976        destalloc = 2 * destalloc + len + destlen;
03977        dest = (char *)scheme_malloc_atomic(destalloc + 1);
03978        memcpy(dest, old, destlen);
03979       }
03980       memcpy(dest + destlen, insrc + startp[no], len);
03981       destlen += len;
03982     }
03983   }
03984   dest[destlen] = '\0';
03985 
03986   if (lenout)
03987     *lenout = destlen;
03988 
03989   return dest;
03990 }
03991 
03992 /************************************************************/
03993 /*              UTF-8 -> per-byte translation               */
03994 /* Translate a UTF-8-encode regexp over the language of     */
03995 /* unicode code points into a per-byte regexp that matches  */
03996 /* equivalent portionals of a UTF-8-encoded sequences of    */
03997 /* code points.                                             */
03998 /************************************************************/
03999 
04000 #ifdef MZ_XFORM
04001 START_XFORM_SKIP;
04002 #endif
04003 #include "../gc2/my_qsort.c"
04004 #ifdef MZ_XFORM
04005 END_XFORM_SKIP;
04006 #endif
04007 
04008 static int compare_ranges(const void *a, const void *b)
04009 {
04010   unsigned int av, bv;
04011   av = *(unsigned int *)a;
04012   bv = *(unsigned int *)b;
04013   if (av == bv)
04014     return 0;
04015   else if (av < bv)
04016     return -1;
04017   else
04018     return 1;
04019 }
04020 
04021 /* For allocating the traslated string, as we go. When writing an
04022    original char (or something that takes its place), there's always
04023    space, but call make_room() before adding new content. */
04024 typedef struct {
04025   int i;         /* number of original chars written */
04026   int orig_len;  /* original length */
04027   int size;      /* allocated size */
04028 } RoomState;
04029 
04030 static unsigned char *make_room(unsigned char *r, int j, int need_extra, RoomState *rs)
04031 {
04032   int nrs;
04033   unsigned char *nr;
04034 
04035   if ((rs->size - j - (rs->orig_len - rs->i)) < need_extra) {
04036     nrs = ((rs->size) * 2) + need_extra;
04037     nr = (unsigned char *)scheme_malloc_atomic(nrs+1);
04038     memcpy(nr, r, j);
04039     r = nr;
04040     rs->size = nrs;
04041   }
04042 
04043   return r;
04044 }
04045 
04046 static unsigned char *add_byte_range(const unsigned char *lo, const unsigned char *hi, int count,
04047                                  unsigned char *r, int *_j, RoomState *rs,
04048                                  /* did_alt => no need to start with "|" */
04049                                  int did_alt, 
04050                                  /* wrap_alts => wrap "(?:...)" around multiple alts */
04051                                  int wrap_alts)
04052      /* Adds alternatives for matching valid UTF-8 encodings lo
04053        through hi lexicographically. See add_range to get started. */
04054 {
04055   int same_chars, j, i;
04056   const unsigned char *lowest = (unsigned char *)"\200\200\200\200\200";
04057   const unsigned char *highest = (unsigned char *)"\277\277\277\277\277";
04058   unsigned char p, q;
04059 
04060   /* Look for a common prefix: */
04061   for (same_chars = 0; same_chars < count; same_chars++) {
04062     if (lo[same_chars] != hi[same_chars])
04063       break;
04064   }
04065 
04066   j = *_j;
04067 
04068   /* Match exactly the part that's the same for hi and lo */
04069   if (same_chars) {
04070     r = make_room(r, j, 4 + same_chars, rs);
04071     if (!did_alt) {
04072       r[j++] = '|';
04073       did_alt = 1;
04074     }
04075     for (i = 0; i < same_chars; i++) {
04076       r[j++] = lo[i];
04077     }
04078   }
04079 
04080   if (same_chars < count) {
04081     /* We have something like nxxxx to mxxxx where n < m.
04082        Find p such that p >= n and p0000 >= nxxxx, and
04083        find q such that q0000 <= mxxxx */
04084     int choices = 0;
04085 
04086     /* If the xxxxs in nxxxx are 0, then p is n,
04087        otherwise it's n + 1 */
04088     for (i = same_chars + 1; i < count; i++) {
04089       if (lo[i] != 128)
04090        break;
04091     }
04092     if (i == count)
04093       p = lo[same_chars];
04094     else {
04095       p = lo[same_chars] + 1;
04096       choices++;
04097     }
04098 
04099     /* If the xxxxs in mxxxx are 0, then q is m,
04100        otherwise it's m - 1 */
04101     for (i = same_chars + 1; i < count; i++) {
04102       if (hi[i] != 191)
04103        break;
04104     }
04105     if (i == count)
04106       q = hi[same_chars];
04107     else {
04108       q = hi[same_chars] - 1;
04109       choices++;
04110     }
04111 
04112     if (p <= q)
04113       choices++;
04114 
04115     if ((wrap_alts || same_chars) && (choices > 1)) {
04116       r = make_room(r, j, 4, rs);
04117       if (!did_alt) {
04118        r[j++] = '|';
04119        did_alt = 1;
04120       }
04121       r[j++] = '(';
04122       r[j++] = '?';
04123       r[j++] = ':';
04124     }
04125 
04126     /* Fill out [nxxxx, p0000) */
04127     if (p > lo[same_chars]) {
04128       r = make_room(r, j, 2, rs);
04129       if (!did_alt) {
04130        r[j++] = '|';
04131        did_alt = 1;
04132       }
04133       r[j++] = lo[same_chars];
04134       *_j = j;
04135       r = add_byte_range(lo XFORM_OK_PLUS same_chars + 1, highest, count - same_chars - 1,
04136                       r, _j, rs, 1, 1);
04137       j = *_j;
04138       p = lo[same_chars] + 1;
04139       did_alt = 0;
04140     }
04141     
04142     /* Fill out [m0000, mxxxx] */
04143     if (q < hi[same_chars]) {
04144       r = make_room(r, j, 2, rs);
04145       if (!did_alt) {
04146        r[j++] = '|';
04147        did_alt = 1;
04148       }
04149       r[j++] = hi[same_chars];
04150       *_j = j;
04151       r = add_byte_range(lowest, hi  XFORM_OK_PLUS same_chars + 1, count - same_chars - 1,
04152                       r, _j, rs, 1, 1);
04153       j = *_j;
04154       did_alt = 0;
04155 
04156       q = hi[same_chars] - 1;
04157     }
04158     
04159     /* Fill out [p0000,m0000) */
04160     if (p <= q) {
04161       /* Make the alternative that lets the initial digit vary,
04162         since there's room between the lo and hi leading digit */
04163       const char *any_str = "[\200-\277]";
04164       const int any_len = 5;
04165 
04166       r = make_room(r, j, 6 + ((count - same_chars - 1) * any_len), rs);
04167       if (!did_alt) {
04168        r[j++] = '|';
04169        did_alt = 1;
04170       }
04171       if (p == q) {
04172        r[j++] = p;
04173       } else {
04174        r[j++] = '[';
04175        r[j++] = p;
04176        r[j++] = '-';
04177        r[j++] = q;
04178        r[j++] = ']';
04179       }
04180       for (i = same_chars + 1; i < count; i++) {
04181        memcpy(r + j, any_str, any_len);
04182        j += any_len;
04183       }
04184     }
04185 
04186     if ((wrap_alts || same_chars) && (choices > 1)) {
04187       /* Close out the grouping */
04188       r = make_room(r, j, 1, rs);
04189       r[j++] = ')';
04190     } 
04191   }
04192 
04193   *_j = j;
04194   return r;
04195 }
04196 
04197 static unsigned char *add_range(unsigned char *r, int *_j, RoomState *rs,
04198                             unsigned int start, unsigned int end, int did_alt)
04199 {
04200   unsigned int top;
04201   int count;
04202   unsigned char lo[6], hi[6];
04203 
04204   /* If this range spans different-sized encodings, split it up
04205      with a recursive call. */
04206   if (start <= 0x7FF) {
04207     top = 0x7FF;
04208     count = 2;
04209   } else if (start <= 0xFFFF) {
04210     top = 0xFFFF;
04211     count = 3;
04212   } else if (start <= 0x1FFFFF) {
04213     top = 0x1FFFFF;
04214     count = 4;
04215   } else if (start <= 0x3FFFFFF) {
04216     top = 0x3FFFFFF;
04217     count = 5;
04218   } else {
04219     top = 0x7FFFFFFF;
04220     count = 6;
04221   }
04222 
04223   if (end > top) {
04224     r = add_range(r, _j, rs, top + 1, end, did_alt);
04225     end = top;
04226     did_alt = 0;
04227   }
04228 
04229   /* At this point, the situation is much like creating a
04230      regexp to match decimal digits. If we wanted to match the
04231      range 28 to 75 (inclusive), we'd need three parts:
04232 
04233           2[8-9]|[3-6][0-9]|7[0-5]
04234 
04235      It gets more complex with three digits, say 
04236      128 to 715:
04237 
04238        12[8-9]|1[3-6][0-9]|[2-6][0-9][0-9]|7[0-0][0-9]|71[0-5]
04239 
04240      but you get the idea. Note that any_str takes the place of
04241      [0-9].
04242 
04243      This same idea works with UTF-8 "digits", so first encode
04244      our code-point numbers in UTF-8: */
04245 
04246   scheme_utf8_encode_all(&start, 1, lo);
04247   scheme_utf8_encode_all(&end, 1, hi);
04248 
04249   return add_byte_range(lo, hi, count, r, _j, rs, did_alt, 0);
04250 }
04251 
04252 static int need_ci_alternates(unsigned char *s, int delta, int len)
04253 {
04254   mzchar us[1], c;
04255 
04256   scheme_utf8_decode(s, delta, len, us, 0, 1, NULL, 0, 0);
04257   c = us[0];
04258 
04259   return ((c != scheme_toupper(c))
04260          || (c != scheme_tolower(c))
04261          || (c != scheme_tofold(c))
04262          || (c != scheme_totitle(c)));
04263 }
04264 
04265 static int translate(unsigned char *s, int len, char **result, int pcre)
04266 {
04267   int j, parse_flags = PARSE_CASE_SENS | PARSE_SINGLE_LINE;
04268   RoomState rs;
04269   unsigned char *r;
04270   Scheme_Object *parse_params = NULL;
04271 
04272   rs.orig_len = len;
04273   rs.size = len;
04274   
04275   r = (unsigned char *)scheme_malloc_atomic(rs.size + 1);
04276 
04277   /* We need to translate if the pattern contains any use of ".", if
04278      there's a big character in a range, if there's a not-range, or if
04279      there's a big character before '+', '*', or '?'. */
04280 
04281   for (rs.i = j = 0; rs.i < len;) {
04282     if (s[rs.i] == '[') {
04283       int k = rs.i + 1, saw_big = 0;
04284       int not_mode = 0;
04285 
04286       /* First, check whether we need to translate this particular
04287         range. */
04288 
04289       /* Caret at start is special: */
04290       if ((k < len) && (s[k] == '^')) {
04291        not_mode = 1;
04292        k++;
04293       }
04294       /* Close bracket start is special: */
04295       if ((k < len) && (s[k] == ']'))
04296        k++;
04297       while ((k < len) && (s[k] != ']')) {
04298        if (s[k] > 127)
04299          saw_big = 1;
04300        else if (pcre && (s[k] == '\\') && (k + 1 < len))
04301          k++;
04302         else if (pcre 
04303                  && (s[k] == '[') 
04304                  && (k + 1 < len)
04305                  && (s[k+1] == ':')
04306                  && is_posix_char_class((char *)s, k + 1, len, NULL)) {
04307           while (s[k] != ']') {
04308             k++;
04309           }
04310         }
04311        k++;
04312       }
04313       if ((k >= len) || (!saw_big && !not_mode)) {
04314        /* No translation necessary. */
04315        while (rs.i <= k) {
04316          r[j++] = s[rs.i++];
04317        }
04318       } else {
04319        /* Need to translate. */
04320        char *simple_on;
04321        Scheme_Object *ranges;
04322        unsigned int *us, *range_array;
04323        int ulen, on_count, range_len, rp, p;
04324 
04325        ulen = scheme_utf8_decode(s, rs.i + 1, k, NULL, 0, -1, NULL, 0, 0);
04326        us = (unsigned int *)scheme_malloc_atomic(ulen * sizeof(unsigned int));
04327        scheme_utf8_decode(s, rs.i + 1, k, us, 0, -1, NULL, 0, 0);
04328 
04329        /* The simple_on array lists ASCII chars to (not) find
04330           for the match */
04331        simple_on = (char *)scheme_malloc_atomic(128);
04332        memset(simple_on, 0, 128);
04333        /* The ranges list is pairs of larger ranges */
04334        ranges = scheme_null;
04335        
04336        p = 0;
04337        if (not_mode)
04338          p++;
04339        if (us[p] == '-') {
04340          simple_on['-'] = 1;
04341          p++;
04342        }
04343 
04344        while (p < ulen) {
04345          if (((p + 2) < ulen)
04346              && us[p+1] == '-'
04347              && (!pcre || ((us[p] != '\\') && (us[p+2] != '\\')))) {
04348            int beg = us[p], end = us[p+2];
04349            if (end == '-') {
04350              FAIL("misplaced hypen within square brackets in pattern");
04351            }
04352            if (end < beg) {
04353              /* Bad regexp */
04354              FAIL("invalid range within square brackets in pattern");       
04355            }
04356              
04357            if ((beg > 127) || (end > 127)) {
04358              /* A big-char range */
04359              ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg),
04360                                                   scheme_make_integer_value_from_unsigned(end)),
04361                                    ranges);
04362              if (!(parse_flags & PARSE_CASE_SENS)) {
04363               /* Try to build up parallel ranges, though they may
04364                  not turn out to be parallel. If the ranges overlap,
04365                  we'll clean them up in the final sort-and-merge
04366                  pass for the whole ranges list. */
04367               int c, beg2, end2, c2, mode;
04368               for (mode = 0; mode < 4; mode++) {
04369                 for (c = beg; c <= end; c++) {
04370                   switch (mode) {
04371                   case 0:
04372                     beg2 = scheme_tofold(c);
04373                     break;
04374                   case 1:
04375                     beg2 = scheme_tolower(c);
04376                     break;
04377                   case 2:
04378                     beg2 = scheme_toupper(c);
04379                     break;
04380                   case 3:
04381                   default:
04382                     beg2 = scheme_totitle(c);
04383                     break;
04384                   }
04385                   if (c != beg2) {
04386                     end2 = beg2;
04387                     for (; c <= end; c++) {
04388                      switch (mode) {
04389                      case 0:
04390                        c2 = scheme_tofold(c);
04391                        break;
04392                      case 1:
04393                        c2 = scheme_tolower(c);
04394                        break;
04395                      case 2:
04396                        c2 = scheme_toupper(c);
04397                        break;
04398                      case 3:
04399                      default:
04400                        c2 = scheme_totitle(c);
04401                        break;
04402                      }
04403                      if ((c2 == c) || (c2 != end2 + 1))
04404                        break;
04405                     }
04406                     ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(beg2),
04407                                                          scheme_make_integer_value_from_unsigned(end2)),
04408                                           ranges);
04409                   }
04410                 }
04411               }
04412              }
04413            } else {
04414              /* Small range */
04415              int w;
04416              for (w = beg; w <= end; w++) {
04417               simple_on[w] = 1;
04418              }
04419            }
04420            p += 3;
04421          } else if (pcre && (us[p] == '\\')) {
04422            if ((p + 1) < ulen) {
04423              int c = us[p + 1];
04424              if (((c >= 'a') && (c <= 'z'))
04425                 || ((c >= 'A') && (c <= 'Z'))) {
04426               regcharclass(c, simple_on);
04427               p += 2;
04428              } else if (c < 128) {
04429               simple_on[c] = 1;
04430               p += 2;
04431              } else {
04432               /* Let next iteration handle it.
04433                  (There's no danger of using it as a meta-character.) */
04434                 p++;
04435              }
04436            } else
04437              FAIL("trailing \\ in pattern");
04438          } else if (us[p] > 127) {
04439            int c = us[p];
04440            ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c),
04441                                                  scheme_make_integer_value_from_unsigned(c)),
04442                                   ranges);
04443            if (!(parse_flags & PARSE_CASE_SENS)) {
04444              int mode, c2;
04445              for (mode = 0; mode < 4; mode++) {
04446               switch (mode) {
04447               case 0:
04448                 c2 = scheme_tofold(c);
04449                 break;
04450               case 1:
04451                 c2 = scheme_tolower(c);
04452                 break;
04453               case 2:
04454                 c2 = scheme_toupper(c);
04455                 break;
04456               case 3:
04457               default:
04458                 c2 = scheme_totitle(c);
04459                 break;
04460               }
04461               if (c2 != c) {
04462                 ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer_value_from_unsigned(c2),
04463                                                       scheme_make_integer_value_from_unsigned(c2)),
04464                                        ranges);
04465               }
04466              }
04467            }
04468            p++;
04469           } else if (pcre
04470                      && (us[p] == '[')
04471                      && ((p + 1) < ulen)
04472                      && (us[p+1] == ':')
04473                      && is_posix_char_class_in_unicode(us, p + 1, ulen, simple_on)) {
04474             while (us[p] != ']') {
04475               p++;
04476             }
04477             p++;
04478          } else {
04479            if (((p + 1) < ulen) && (us[p] == '-')) {
04480              FAIL("misplaced hypen within square brackets in pattern");
04481              return 0;
04482            }
04483            simple_on[us[p]] = 1;
04484            p++;
04485          }
04486        }
04487 
04488        /* Turn the ranges list into an array */
04489        range_len = scheme_list_length(ranges);
04490        range_array = (unsigned int *)scheme_malloc_atomic(2 * range_len * sizeof(unsigned int));
04491        for (rp = 0; SCHEME_PAIRP(ranges); ranges = SCHEME_CDR(ranges), rp += 2) {
04492          unsigned long hi, lo;
04493          scheme_get_unsigned_int_val(SCHEME_CAAR(ranges), &lo);
04494          scheme_get_unsigned_int_val(SCHEME_CDR(SCHEME_CAR(ranges)), &hi);
04495          range_array[rp] = (unsigned int)lo;
04496          range_array[rp+1] = (unsigned int)hi;
04497        }
04498        range_len *= 2;
04499        /* Sort the ranges by the starting index. */
04500        my_qsort(range_array, range_len >> 1, 2 * sizeof(unsigned int), compare_ranges);
04501        
04502        /* If a range starts below 128, fill in the simple array */
04503        for (rp = 0; rp < range_len; rp += 2) {
04504          if (range_array[rp] < 128) {
04505            for (p = range_array[rp]; p < 128; p++) {
04506              simple_on[p] = 1;
04507            }
04508            range_array[rp] = 128;
04509          }
04510        }
04511        
04512        if (!(parse_flags & PARSE_CASE_SENS)) {
04513          for (p = 'a'; p <= 'z'; p++) {
04514            if (simple_on[p])
04515              simple_on[rx_toupper(p)] = 1;
04516            if (simple_on[rx_toupper(p)])
04517              simple_on[p] = 1;
04518          }
04519        }
04520 
04521        /* Count simples that are on */
04522        on_count = 0;
04523        for (p = 0; p < 128; p++) {
04524          if (simple_on[p])
04525            on_count++;
04526        }
04527 
04528        if (not_mode) {
04529          /* "Not" mode. We produce something in regular mode */
04530          /* Start with "(?:[...]|" for simples. */
04531          unsigned int last_end;
04532          int did_alt;
04533          r = make_room(r, j, 6 + (128 - on_count) + ((pcre && !simple_on['\\']) ? 1 : 0), &rs);
04534          r[j++] = '(';
04535          r[j++] = '?';
04536          r[j++] = ':';
04537          if (on_count < 128) {
04538            if (!on_count) {
04539              r[j++] = '[';
04540              r[j++] = 0;
04541              r[j++] = '-';
04542              r[j++] = 127;
04543              r[j++] = ']';
04544            } else {
04545              r[j++] = '[';
04546              if (!simple_on[']'])
04547               r[j++] = ']';
04548              for (p = 0; p < 128; p++) {
04549               if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
04550                 if (!simple_on[p])
04551                   r[j++] = p;
04552              }
04553              if (pcre && !simple_on['\\']) {
04554               r[j++] = '\\';
04555               r[j++] = '\\';
04556              }
04557              if (!simple_on['-'])
04558               r[j++] = '-';
04559              r[j++] = ']';
04560            }
04561            did_alt = 0;
04562          } else
04563            did_alt = 1;
04564          last_end = 128;
04565          for (rp = 0; rp < range_len; rp += 2) {
04566            if (range_array[rp] > last_end) {
04567              r = add_range(r, &j, &rs, last_end, range_array[rp] - 1, did_alt);
04568              did_alt = 0;
04569            }
04570            if ((range_array[rp + 1] + 1) > last_end)
04571              last_end = range_array[rp + 1] + 1;
04572          }
04573          if (last_end <= 0x10FFFF) {
04574            if (last_end < 0xD800) {
04575              r = add_range(r, &j, &rs, last_end, 0xD7FF, did_alt);
04576              did_alt = 0;
04577              r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, did_alt);
04578            } else {
04579              r = add_range(r, &j, &rs, last_end, 0x10FFFF, did_alt);
04580              did_alt = 0;
04581            }
04582          }
04583          r = make_room(r, j, 1, &rs);
04584          r[j++] = ')';
04585        } else {
04586          /* Normal mode */
04587          /* Start with "(?:[...]|" for simples. */
04588          int p, did_alt;
04589          r = make_room(r, j, 5 + on_count + ((pcre && simple_on['\\']) ? 1 : 0), &rs);
04590          r[j++] = '(';
04591          r[j++] = '?';
04592          r[j++] = ':';
04593          if (on_count) {
04594            if (on_count == 128) {
04595              r[j++] = '[';
04596              r[j++] = 0;
04597              r[j++] = '-';
04598              r[j++] = 127;
04599              r[j++] = ']';
04600            } else {
04601              r[j++] = '[';
04602              if (simple_on[']'])
04603               r[j++] = ']';
04604              for (p = 0; p < 128; p++) {
04605               if ((p != '-') && (p != ']') && (!pcre || (p != '\\')))
04606                 if (simple_on[p])
04607                   r[j++] = p;
04608              }
04609              if (pcre && simple_on['\\']) {
04610               r[j++] = '\\';
04611               r[j++] = '\\';
04612              }
04613              if (simple_on['-'])
04614               r[j++] = '-';
04615              r[j++] = ']';
04616            }
04617            did_alt = 0;
04618          } else
04619            did_alt = 1;
04620          for (rp = 0; rp < range_len; rp += 2) {
04621            r = add_range(r, &j, &rs, range_array[rp], range_array[rp+1], did_alt);
04622            did_alt = 0;
04623          }
04624          r = make_room(r, j, 1, &rs);
04625          r[j++] = ')';
04626        }
04627       }
04628       rs.i = k + 1;
04629     } else if (s[rs.i] == '\\') {
04630       /* Skip over next char, possibly big: */
04631       r[j++] = s[rs.i++];
04632       if ((rs.i < len)
04633          && (s[rs.i] > 127)) {
04634        r[j++] = s[rs.i++];
04635        while ((rs.i < len) && ((s[rs.i] & 0xC0) == 0x80)) {
04636          r[j++] = s[rs.i++];
04637        }
04638       } else
04639        r[j++] = s[rs.i++];
04640     } else if ((s[rs.i] == '.')
04641               && (!pcre
04642                  || (rs.i < 3)
04643                  || (s[rs.i-1] != '{')
04644                  || ((s[rs.i-2] == 'p')
04645                      && (s[rs.i-2] == 'P'))
04646                  || (s[rs.i-3] != '\\'))) {
04647       /* "." has to be expanded. */
04648       r = make_room(r, j, (parse_flags & PARSE_SINGLE_LINE) ? 9 : 8, &rs);
04649       r[j++] = '(';
04650       r[j++] = '?';
04651       r[j++] = ':';
04652       r[j++] = '[';
04653       r[j++] = '\00';
04654       r[j++] = '-';
04655       if (!(parse_flags & PARSE_SINGLE_LINE)) {
04656        r[j++] = '\n' - 1;
04657        r[j++] = '\n' + 1;
04658        r[j++] = '-';
04659       }
04660       r[j++] = '\177';
04661       r[j++] = ']';
04662       r = add_range(r, &j, &rs, 128, 0xD7FF, 0);
04663       r = add_range(r, &j, &rs, 0xE000, 0x10FFFF, 0);
04664       r = make_room(r, j, 1, &rs);
04665       r[j++] = ')';
04666       rs.i++;
04667     } else if (s[rs.i] > 127) {
04668       int k = rs.i + 1;
04669       /* Look for *, +, or ? after this big char */
04670       while ((k < len) && ((s[k] & 0xC0) == 0x80)) {
04671        k++;
04672       }
04673       if ((k < len) && ((s[k] == '+')
04674                      || (s[k] == '*')
04675                      || (s[k] == '?')
04676                      || (!(parse_flags & PARSE_CASE_SENS)
04677                          && need_ci_alternates(s, rs.i, k)))) {
04678        /* Need to translate; wrap char in (?: ...) */
04679        int orig_i;
04680        r = make_room(r, j, 4, &rs);
04681        r[j++] = '(';
04682        r[j++] = '?';
04683        r[j++] = ':';
04684        orig_i = rs.i;
04685        while (rs.i < k) {
04686          r[j++] = s[rs.i++];
04687        }
04688        if (!(parse_flags & PARSE_CASE_SENS)) {
04689          /* Add alternates for different cases: */
04690          mzchar us[1], c0, c1, wrote[4];
04691          int clen, ci, num_wrote = 1, mode;
04692          unsigned char s2[MAX_UTF8_CHAR_BYTES];
04693 
04694          scheme_utf8_decode(s, orig_i, k, us, 0, 1, NULL, 0, 0);
04695          c0 = us[0];
04696          wrote[0] = c0;
04697          for (mode = 0; mode < 4; mode++) {
04698            switch (mode) {
04699            case 0:
04700              c1 = scheme_tofold(c0);
04701              break;
04702            case 1:
04703              c1 = scheme_tolower(c0);
04704              break;
04705            case 2:
04706              c1 = scheme_toupper(c0);
04707              break;
04708            case 3:
04709            default:
04710              c1 = scheme_totitle(c0);
04711              break;
04712            }
04713            for (ci = 0; ci < num_wrote; ci++) {
04714              if (c1 == wrote[ci])
04715               break;
04716            }
04717            if (ci >= num_wrote) {
04718              wrote[num_wrote++] = c1;
04719              us[0] = c1;
04720              clen = scheme_utf8_encode(us, 0, 1, s2, 0, 0);
04721              r = make_room(r, j, clen + 1, &rs);
04722              r[j++] = '|';
04723              for (ci = 0; ci < clen; ci++) {
04724               r[j++] = s2[ci];
04725              }
04726            }
04727          }
04728        }
04729        r[j++] = ')';
04730       } else {
04731        /* No translation. */
04732        while (rs.i < k) {
04733          r[j++] = s[rs.i++];
04734        }
04735       }
04736     } else {
04737       /* The translation needs to know about case-insensitive
04738         and single-line modes, so track parens: */
04739       if (s[rs.i] == '(') {
04740        int old_flags = parse_flags;
04741        if ((rs.i + 1 < len) && (s[rs.i + 1] == '?')) {
04742          int k;
04743          for (k = rs.i + 2; k < len; k++) {
04744            if ((s[k] == ':')
04745               || (s[k] == '<')
04746               || (s[k] == '>')
04747               || (s[k] == '=')
04748               || (s[k] == '!'))
04749              break;
04750            if (s[k] == 'i') {
04751              parse_flags &= ~PARSE_CASE_SENS;
04752            } else if (s[k] == 's') {
04753              parse_flags |= PARSE_SINGLE_LINE;
04754            } else if (s[k] == 'm') {
04755              parse_flags &= ~PARSE_SINGLE_LINE;
04756            } else if (s[k] == '-') {
04757              if (k + 1 < len) {
04758               k++;
04759               if (s[k] == 'i') {
04760                 parse_flags |= PARSE_CASE_SENS;
04761               } else if (s[k] == 's') {
04762                 parse_flags &= ~PARSE_SINGLE_LINE;
04763               } else if (s[k] == 'm') {
04764                 parse_flags |= PARSE_SINGLE_LINE;
04765               }
04766              }
04767            }
04768          }
04769        }
04770        if (parse_params || (parse_flags != old_flags)) {
04771          parse_params = scheme_make_raw_pair(scheme_make_integer(old_flags),
04772                                          parse_params);
04773        }
04774       } else if (s[rs.i] == ')') {
04775        if (parse_params) {
04776          parse_flags = SCHEME_INT_VAL(SCHEME_CAR(parse_params));
04777          parse_params = SCHEME_CDR(parse_params);
04778        }
04779       }
04780       r[j++] = s[rs.i++];
04781     }
04782   }
04783 
04784   r[j] = 0;
04785   *result = (char *)r;
04786   return j;
04787 }
04788 
04789 /************************************************************/
04790 /*                   Scheme front end                       */
04791 /************************************************************/
04792 
04793 int scheme_is_pregexp(Scheme_Object *o)
04794 {
04795   return !!(((regexp *)o)->flags & REGEXP_IS_PCRE);
04796 }
04797 
04798 static Scheme_Object *do_make_regexp(const char *who, int is_byte, int pcre, int argc, Scheme_Object *argv[])
04799 {
04800   Scheme_Object *re, *bs;
04801   char *s;
04802   int slen;
04803 
04804   if (is_byte) {
04805     if (!SCHEME_BYTE_STRINGP(argv[0]))
04806       scheme_wrong_type(who, "byte string", 0, argc, argv);
04807     bs = argv[0];
04808   } else {
04809     if (!SCHEME_CHAR_STRINGP(argv[0]))
04810       scheme_wrong_type(who, "string", 0, argc, argv);
04811     bs = scheme_char_string_to_byte_string(argv[0]);
04812   }
04813 
04814   s = SCHEME_BYTE_STR_VAL(bs);
04815   slen = SCHEME_BYTE_STRTAG_VAL(bs);
04816 
04817   if (!is_byte) {
04818     slen = translate((unsigned char *)s, slen, &s, pcre);
04819 #if 0
04820     /* Debugging, to see the translated regexp: */
04821     {
04822       char *cp;
04823       int i;
04824       cp = (char *)scheme_malloc_atomic(slen + 1);
04825       memcpy(cp, s, slen + 1);
04826       for (i = 0; i < slen; i++) {
04827        if (!cp[i]) cp[i] = '0';
04828       }
04829       printf("%d %s\n", slen, scheme_write_to_string(scheme_make_byte_string(cp), 0));
04830     }
04831 #endif
04832   }
04833 
04834   re = (Scheme_Object *)regcomp(s, 0, slen, pcre);
04835 
04836   if (!is_byte)
04837     ((regexp *)re)->flags |= REGEXP_IS_UTF8;
04838   if (pcre)
04839     ((regexp *)re)->flags |= REGEXP_IS_PCRE;
04840 
04841   if (SCHEME_IMMUTABLEP(argv[0]))
04842     ((regexp *)re)->source = argv[0];
04843   else if (is_byte) {
04844     Scheme_Object *src;
04845     src = scheme_make_immutable_sized_byte_string(SCHEME_BYTE_STR_VAL(argv[0]), 
04846                                             SCHEME_BYTE_STRTAG_VAL(argv[0]), 
04847                                             1);
04848     ((regexp *)re)->source = src;
04849   } else {
04850     Scheme_Object *src;
04851     src = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(argv[0]), 
04852                                             SCHEME_CHAR_STRTAG_VAL(argv[0]), 
04853                                             1);
04854     ((regexp *)re)->source = src;
04855   }
04856 
04857   {
04858     Scheme_Object *b;
04859     b = scheme_get_param(scheme_current_config(), MZCONFIG_USE_JIT);
04860     if (SCHEME_TRUEP(b))
04861       ((regexp *)re)->flags |= REGEXP_JIT;
04862   }
04863   
04864   return re;
04865 }
04866 
04867 static Scheme_Object *make_regexp(int argc, Scheme_Object *argv[])
04868 {
04869   return do_make_regexp("byte-regexp", 1, 0, argc, argv);
04870 }
04871 
04872 static Scheme_Object *make_utf8_regexp(int argc, Scheme_Object *argv[])
04873 {
04874   return do_make_regexp("regexp", 0, 0, argc, argv);
04875 }
04876 
04877 static Scheme_Object *make_pregexp(int argc, Scheme_Object *argv[])
04878 {
04879   return do_make_regexp("byte-pregexp", 1, 1, argc, argv);
04880 }
04881 
04882 static Scheme_Object *make_utf8_pregexp(int argc, Scheme_Object *argv[])
04883 {
04884   return do_make_regexp("pregexp", 0, 1, argc, argv);
04885 }
04886 
04887 Scheme_Object *scheme_make_regexp(Scheme_Object *str, int is_byte, int pcre, int * volatile result_is_err_string)
04888 {
04889   mz_jmp_buf * volatile save, newbuf;
04890   Scheme_Object * volatile result;
04891 
04892   *result_is_err_string = 0;
04893 
04894   /* we rely on single-threaded, non-blocking regexp compilation: */
04895   save = scheme_current_thread->error_buf;
04896   scheme_current_thread->error_buf = &newbuf;
04897   failure_msg_for_read = "yes";
04898   if (!scheme_setjmp(newbuf)) {
04899     if (is_byte) {
04900       if (pcre)
04901        result = make_pregexp(1, &str);
04902       else
04903        result = make_regexp(1, &str);
04904     } else {
04905       if (pcre)
04906        result = make_utf8_pregexp(1, &str);
04907       else
04908        result = make_utf8_regexp(1, &str);
04909     }
04910   } else {
04911     result = (Scheme_Object *)failure_msg_for_read;
04912     *result_is_err_string = 1;
04913   }
04914 
04915   failure_msg_for_read = NULL;
04916   scheme_current_thread->error_buf = save;
04917   return result;
04918 }
04919 
04920 static regexp *regcomp_object(Scheme_Object *str)
04921 {
04922   if (SCHEME_BYTE_STRINGP(str))
04923     return (regexp *)make_regexp(1, &str);
04924   else
04925     return (regexp *)make_utf8_regexp(1, &str);
04926 }
04927 
04928 void scheme_clear_rx_buffers(void)
04929 {
04930   startp_buffer_cache = NULL;
04931   endp_buffer_cache = NULL;
04932   maybep_buffer_cache = NULL;
04933 }
04934 
04935 static Scheme_Object *gen_compare(char *name, int pos, 
04936                               int argc, Scheme_Object *argv[],
04937                               int peek, int nonblock)
04938 {
04939   regexp *r;
04940   char *full_s;
04941   rxpos *startp, *maybep, *endp;
04942   int offset = 0, orig_offset, endset, m, was_non_byte;
04943   Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
04944   
04945   if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
04946       && !SCHEME_BYTE_STRINGP(argv[0])
04947       && !SCHEME_CHAR_STRINGP(argv[0]))
04948     scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv);
04949   if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
04950       && !SCHEME_INPUT_PORTP(argv[1]))
04951     scheme_wrong_type(name, peek ? "input-port" : "string, byte string, or input port", 1, argc, argv);
04952   
04953   if (SCHEME_CHAR_STRINGP(argv[1])) {
04954     iport = NULL;
04955     endset = SCHEME_CHAR_STRLEN_VAL(argv[1]);
04956   } else if (SCHEME_INPUT_PORTP(argv[1])) {
04957     iport = argv[1];
04958     endset = -2;
04959   } else {
04960     iport = NULL;
04961     endset = SCHEME_BYTE_STRLEN_VAL(argv[1]);
04962   }
04963 
04964   if (argc > 2) {
04965     int len = endset;
04966 
04967     offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
04968 
04969     if (!iport && (offset > len)) {
04970       scheme_out_of_string_range(name, "offset ", argv[2], argv[1], 0, len);
04971       return NULL;
04972     } else if (offset < 0) {
04973       /* argument was a bignum */
04974       offset = 0x7FFFFFFF;
04975     }
04976     startv = argv[2];
04977       
04978     if (argc > 3) {
04979       if (!SCHEME_FALSEP(argv[3])) {
04980        endset = scheme_extract_index(name, 3, argc, argv, len + 1, 1);
04981        
04982        if (iport) {
04983          if (endset < 0) {
04984            /* argument was a bignum */
04985            endset = 0x7FFFFFFF;
04986          }
04987          /* compare numbers */
04988          if (scheme_bin_lt(argv[3], argv[2])) {
04989            scheme_raise_exn(MZEXN_FAIL_CONTRACT,
04990                           "%s: ending index %V is smaller than starting index %V for port",
04991                           name, argv[3], argv[2]);
04992            return NULL;
04993          }
04994        } else if (endset < offset || endset > len) {
04995          scheme_out_of_string_range(name, "ending ", argv[3], argv[1], offset, len);
04996          return NULL;
04997        }
04998        endv = argv[3];
04999       }
05000       
05001       if (argc > 4) {
05002        if (peek) {
05003          if (!SCHEME_FALSEP(argv[4])) {
05004            unless_evt = argv[4];
05005            if (!SAME_TYPE(SCHEME_TYPE(unless_evt), scheme_progress_evt_type)) {
05006              scheme_wrong_type(name, "progress evt or #f", 4, argc, argv);
05007              return NULL;
05008            }
05009            if (!iport) {
05010              scheme_arg_mismatch(name, 
05011                               "progress evt cannot be used with string input: ",
05012                               unless_evt);
05013            } else if (!SAME_OBJ(iport, SCHEME_PTR1_VAL(unless_evt))) {
05014              scheme_arg_mismatch(name,
05015                               "evt is not a progress evt for the given port:",
05016                               unless_evt);
05017              return NULL;
05018            }
05019          }
05020        } else {
05021          if (SCHEME_TRUEP(argv[4])) {
05022            if (!SCHEME_OUTPUT_PORTP(argv[4]))
05023              scheme_wrong_type(name, "output port or #f", 4, argc, argv);
05024            oport = argv[4];
05025          }
05026        }
05027       }
05028     }
05029   }
05030 
05031   if (iport && !startv)
05032     startv = scheme_make_integer(0);
05033 
05034   if (SCHEME_BYTE_STRINGP(argv[0])
05035       || SCHEME_CHAR_STRINGP(argv[0]))
05036     r = regcomp_object(argv[0]);
05037   else
05038     r = (regexp *)argv[0];
05039 
05040   was_non_byte = 0;
05041   orig_offset = 0; /* extra offset */
05042   if (!iport) {
05043     if (SCHEME_BYTE_STRINGP(argv[1]))
05044       full_s = SCHEME_BYTE_STR_VAL(argv[1]);
05045     else {
05046       /* Extract substring and UTF-8 encode: */
05047       int blen;
05048       blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
05049                             NULL, 0,
05050                             0 /* not UTF-16 */);
05051       full_s = (char *)scheme_malloc_atomic(blen);
05052       scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
05053                       (unsigned char *)full_s, 0,
05054                       0 /* not UTF-16 */);
05055       orig_offset = offset;
05056       offset = 0;
05057       endset = blen;
05058       if (r->flags & REGEXP_IS_UTF8)
05059        was_non_byte = 1;
05060       else {
05061        /* Convert orig_offset into encoded bytes */
05062        orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), 0, orig_offset,
05063                                     NULL, 0,
05064                                     0);
05065       }
05066     }
05067   } else
05068     full_s = NULL;
05069 
05070   if (startp_buffer_cache && (r->nsubexp <= rx_buffer_size)) {
05071     startp = startp_buffer_cache;
05072     maybep = maybep_buffer_cache;
05073     endp = endp_buffer_cache;
05074     startp_buffer_cache = NULL;
05075   } else {
05076     startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05077     maybep = NULL;
05078     endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05079   }
05080   if ((r->nsubexp > 1) && !maybep)
05081     maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05082 
05083   dropped = scheme_make_integer(0);
05084 
05085   m = regexec(name, r, full_s, offset, endset - offset, startp, maybep, endp,
05086              iport, unless_evt, nonblock,
05087              &full_s, peek, pos, oport, 
05088              startv, endv, &dropped);
05089 
05090   if (m) {
05091     int i;
05092     Scheme_Object *l = scheme_null, *rs;
05093 
05094     if (oport && !iport)
05095       scheme_put_byte_string(name, oport, full_s, 0, *startp, 0);
05096 
05097     if (pos > 1) {
05098       /* pos == 2 => just get true or false */
05099       dropped = scheme_true;
05100     } else {
05101       for (i = r->nsubexp; i--; ) {
05102        if (startp[i] != -1) {
05103          if (pos) {
05104            Scheme_Object *startpd, *endpd;
05105 
05106            if (was_non_byte) {
05107              /* Need to figure out how startpd and endpd correspond to
05108                code points. Note that the input regexp matches only
05109                unicode chars, so the start and end points can't be in
05110                the middle of encoded characters. */
05111              int uspd, uepd;
05112              uspd = scheme_utf8_decode((const unsigned char *)full_s, offset, startp[i],
05113                                    NULL, 0, -1,
05114                                    NULL, 0, 0);
05115              uspd += orig_offset;
05116              startpd = scheme_make_integer(uspd);
05117              uepd = scheme_utf8_decode((const unsigned char *)full_s, startp[i], endp[i],
05118                                    NULL, 0, -1,
05119                                    NULL, 0, 0);
05120              uepd += uspd;
05121              endpd = scheme_make_integer(uepd);
05122            } else {
05123              int v;
05124              v = startp[i] + orig_offset;
05125              startpd = scheme_make_integer(v);
05126              v = endp[i] + orig_offset;
05127              endpd = scheme_make_integer(v);
05128            
05129              if (iport) {
05130               /* Increment by drop count: */
05131               startpd = scheme_bin_plus(startpd, dropped);
05132               endpd = scheme_bin_plus(endpd, dropped);
05133              }
05134            }
05135          
05136            l = scheme_make_pair(scheme_make_pair(startpd, endpd),
05137                              l);
05138          } else {
05139            long len;
05140            len = endp[i] - startp[i];
05141            if (was_non_byte) {
05142              rs = scheme_make_sized_offset_utf8_string(full_s, startp[i], len);
05143            } else {
05144              rs = scheme_make_sized_offset_byte_string(full_s, startp[i], len, 1);
05145            }
05146            l = scheme_make_pair(rs, l);
05147          }
05148        } else
05149          l = scheme_make_pair(scheme_false, l);
05150       }
05151       dropped = l;
05152     }
05153   } else {
05154     if (oport && !iport)
05155       scheme_put_byte_string(name, oport, full_s, 0, endset, 0);
05156 
05157     dropped = scheme_false;
05158   }
05159   
05160   if (!startp_buffer_cache || (r->nsubexp > rx_buffer_size)) {
05161     rx_buffer_size = r->nsubexp;
05162     startp_buffer_cache = startp;
05163     maybep_buffer_cache = maybep;
05164     endp_buffer_cache = endp;
05165   } else if (maybep && !maybep_buffer_cache && (r->nsubexp == rx_buffer_size)) {
05166     maybep_buffer_cache = maybep;
05167   }
05168   
05169   return dropped;
05170 }
05171 
05172 static Scheme_Object *compare(int argc, Scheme_Object *argv[])
05173 {
05174   return gen_compare("regexp-match", 0, argc, argv, 0, 0);
05175 }
05176 
05177 static Scheme_Object *positions(int argc, Scheme_Object *argv[])
05178 {
05179   return gen_compare("regexp-match-positions", 1, argc, argv, 0, 0);
05180 }
05181 
05182 static Scheme_Object *compare_bool(int argc, Scheme_Object *argv[])
05183 {
05184   return gen_compare("regexp-match?", 2, argc, argv, 0, 0);
05185 }
05186 
05187 static Scheme_Object *compare_peek(int argc, Scheme_Object *argv[])
05188 {
05189   return gen_compare("regexp-match-peek", 0, argc, argv, 1, 0);
05190 }
05191 
05192 static Scheme_Object *positions_peek(int argc, Scheme_Object *argv[])
05193 {
05194   return gen_compare("regexp-match-peek-positions", 1, argc, argv, 1, 0);
05195 }
05196 
05197 static Scheme_Object *compare_peek_nonblock(int argc, Scheme_Object *argv[])
05198 {
05199   return gen_compare("regexp-match-peek-immediate", 0, argc, argv, 1, 1);
05200 }
05201 
05202 static Scheme_Object *positions_peek_nonblock(int argc, Scheme_Object *argv[])
05203 {
05204   return gen_compare("regexp-match-peek-positions-immediate", 1, argc, argv, 1, 1);
05205 }
05206 
05207 static char *build_call_name(const char *n)
05208 {
05209   char *m;
05210   int l;
05211   l = strlen(n);
05212   m = (char *)scheme_malloc_atomic(l + 42);
05213   memcpy(m, n, l);
05214   strcpy(m XFORM_OK_PLUS l, " (calling given filter procedure)");
05215   return m;
05216 }
05217 
05218 static int initial_char_len(unsigned char *source, long start, long end)
05219 {
05220   long i;
05221 
05222   for (i = start + 1; i <= end; i++) {
05223     if (scheme_utf8_decode_count(source, start, i, NULL, 1, 1)) {
05224       return i - start;
05225     }
05226   }
05227 
05228   return 1;
05229 }
05230 
05231 static Scheme_Object *gen_replace(const char *name, int argc, Scheme_Object *argv[], int all)
05232 {
05233   Scheme_Object *orig;
05234   regexp *r;
05235   char *source, *prefix = NULL, *deststr;
05236   rxpos *startp, *maybep, *endp;
05237   int prefix_len = 0, sourcelen, srcoffset = 0, was_non_byte, destlen;
05238 
05239   if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
05240       && !SCHEME_BYTE_STRINGP(argv[0])
05241       && !SCHEME_CHAR_STRINGP(argv[0]))
05242     scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv);
05243   if (!SCHEME_BYTE_STRINGP(argv[1])
05244       && !SCHEME_CHAR_STRINGP(argv[1]))
05245     scheme_wrong_type(name, "string or byte string", 1, argc, argv);
05246   if (!SCHEME_BYTE_STRINGP(argv[2])
05247       && !SCHEME_CHAR_STRINGP(argv[2])
05248       && !SCHEME_PROCP(argv[2]))
05249     scheme_wrong_type(name, "string, byte string, or procedure", 2, argc, argv);
05250 
05251   if (SCHEME_BYTE_STRINGP(argv[2])) {
05252     if (SCHEME_CHAR_STRINGP(argv[0])
05253        || ((SCHEME_TYPE(argv[0]) == scheme_regexp_type)
05254            && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))) {
05255       if (SCHEME_CHAR_STRINGP(argv[1])) {
05256        scheme_arg_mismatch(name, "cannot replace a string with a byte string: ",
05257                          argv[2]);
05258       }
05259     }
05260   }
05261 
05262   if (SCHEME_BYTE_STRINGP(argv[0])
05263       || SCHEME_CHAR_STRINGP(argv[0]))
05264     r = regcomp_object(argv[0]);
05265   else
05266     r = (regexp *)argv[0];
05267 
05268   if (SCHEME_PROCP(argv[2])) {
05269     if (!scheme_check_proc_arity(NULL, r->nsubexp, 2, argc, argv)) {
05270       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
05271                      "%s: regexp produces %d matches: %V; procedure does not accept %d arguments: %V",
05272                      name, 
05273                      r->nsubexp, (Scheme_Object *)r,
05274                      r->nsubexp, argv[2]);
05275     }
05276   }
05277 
05278   if (SCHEME_CHAR_STRINGP(argv[1])) {
05279     orig = scheme_char_string_to_byte_string(argv[1]);
05280     if (r->flags & REGEXP_IS_UTF8)
05281       was_non_byte = 1;
05282     else
05283       was_non_byte = 0;
05284   } else {
05285     orig = argv[1];
05286     was_non_byte = 0;
05287   }
05288   source = SCHEME_BYTE_STR_VAL(orig);
05289   sourcelen = SCHEME_BYTE_STRTAG_VAL(orig);
05290   deststr = NULL;
05291   destlen = 0;
05292 
05293   startp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05294   if (r->nsubexp > 1)
05295     maybep = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05296   else
05297     maybep = NULL;
05298   endp = MALLOC_N_ATOMIC(rxpos, r->nsubexp);
05299 
05300   while (1) {
05301     int m;
05302 
05303     do {
05304       m = regexec(name, r, source, srcoffset, sourcelen - srcoffset, startp, maybep, endp,
05305                   NULL, NULL, 0,
05306                   NULL, 0, 0, NULL, NULL, NULL, NULL);
05307 
05308       if (m && all && (startp[0] == endp[0])) {
05309         if (!startp[0] && sourcelen) {
05310           int amt;
05311 
05312           if (was_non_byte)
05313             amt = initial_char_len((unsigned char *)source, 0, sourcelen);
05314           else
05315             amt = 1;
05316           
05317           prefix = scheme_malloc_atomic(amt + 1);
05318           prefix_len = amt;
05319           memcpy(prefix, source, amt);
05320           srcoffset += amt;
05321           /* try again */
05322         } else {
05323           /* if it's the end of the input, the match should fail */
05324           if (startp[0] == sourcelen)
05325             m = 0;
05326           break;
05327         }
05328       } else
05329         break;
05330     } while (1);
05331 
05332     if (m) {
05333       char *insert;
05334       long len, end, startpd, endpd;
05335 
05336       if (SCHEME_PROCP(argv[2])) {
05337         int i;
05338         Scheme_Object *m, **args, *quick_args[5];
05339 
05340        if (r->nsubexp <= 5) {
05341          args = quick_args;
05342        } else {
05343          args = MALLOC_N(Scheme_Object*, r->nsubexp);
05344        }
05345 
05346         for (i = r->nsubexp; i--; ) {
05347           if (startp[i] == -1) {
05348             args[i] = scheme_false;
05349           } else {
05350             long len;
05351             len = endp[i] - startp[i];
05352             if (was_non_byte) {
05353              m = scheme_make_sized_offset_utf8_string(source, startp[i], len);
05354               args[i] = m;
05355             } else {
05356              m = scheme_make_sized_offset_byte_string(source, startp[i], len, 1);
05357               args[i] = m;
05358             }
05359           }
05360         }
05361 
05362         m = _scheme_apply(argv[2], r->nsubexp, args);
05363 
05364        if (!was_non_byte) {
05365           if (!SCHEME_BYTE_STRINGP(m)) {
05366            args[0] = m;
05367            scheme_wrong_type(build_call_name(name), "byte string", -1, -1, args);
05368          }
05369          insert = SCHEME_BYTE_STR_VAL(m);
05370           len = SCHEME_BYTE_STRLEN_VAL(m);
05371         } else {
05372          if (!SCHEME_CHAR_STRINGP(m)) {
05373            args[0] = m;
05374            scheme_wrong_type(build_call_name(name), "string", -1, -1, args);
05375          }
05376           len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
05377                                    SCHEME_CHAR_STRLEN_VAL(m),
05378                                    NULL, 0, 0 /* not UTF-16 */);
05379           insert = (char *)scheme_malloc_atomic(len);
05380           scheme_utf8_encode(SCHEME_CHAR_STR_VAL(m), 0,
05381                              SCHEME_CHAR_STRLEN_VAL(m),
05382                              (unsigned char *)insert, 0, 0 /* not UTF-16 */);
05383         }
05384       } else {
05385        if (!deststr) {
05386          if (SCHEME_CHAR_STRINGP(argv[2])) {
05387            Scheme_Object *bs;
05388            bs = scheme_char_string_to_byte_string(argv[2]);
05389            deststr = SCHEME_BYTE_STR_VAL(bs);
05390            destlen = SCHEME_BYTE_STRTAG_VAL(bs);
05391          } else {
05392            deststr = SCHEME_BYTE_STR_VAL(argv[2]);
05393            destlen = SCHEME_BYTE_STRTAG_VAL(argv[2]);
05394          }
05395        }
05396        insert = regsub(r, deststr, destlen, &len, source, startp, endp);
05397       }
05398       
05399       end = sourcelen;
05400       
05401       startpd = startp[0];
05402       endpd = endp[0];
05403 
05404       if (!startpd && (endpd == end) && !prefix) {
05405        if (was_non_byte)
05406          return scheme_make_sized_utf8_string(insert, len);
05407        else
05408          return scheme_make_sized_byte_string(insert, len, 0);
05409       } else if (!all) {
05410        char *result;
05411        long total;
05412        
05413        total = len + (startpd - srcoffset) + (end - endpd);
05414        
05415        result = (char *)scheme_malloc_atomic(total + 1);
05416        memcpy(result, source + srcoffset, startpd - srcoffset);
05417        memcpy(result + (startpd - srcoffset), insert, len);
05418        memcpy(result + (startpd - srcoffset) + len, source + endpd, (end - endpd) + 1);
05419        
05420        if (was_non_byte)
05421          return scheme_make_sized_utf8_string(result, total);
05422        else
05423          return scheme_make_sized_byte_string(result, total, 0);
05424       } else {
05425        char *naya;
05426        long total;
05427         int more;
05428 
05429         if (startpd == endpd)  {
05430           if (was_non_byte)
05431             more = initial_char_len((unsigned char *)source, startpd, sourcelen);
05432           else
05433             more = 1;
05434         } else
05435           more = 0;
05436 
05437        total = len + prefix_len + (startpd - srcoffset);
05438        
05439        naya = (char *)scheme_malloc_atomic(total + more + 1);
05440        memcpy(naya, prefix, prefix_len);
05441        memcpy(naya + prefix_len, source + srcoffset, startpd - srcoffset);
05442        memcpy(naya + prefix_len + (startpd - srcoffset), insert, len);
05443         if (more) {
05444           memcpy(naya + prefix_len + (endpd - srcoffset) + len, source + startpd, more);
05445           total += more;
05446         }
05447 
05448        prefix = naya;
05449        prefix_len = total;
05450 
05451        srcoffset = endpd + more;        
05452       }
05453     } else if (!prefix) {
05454       if (was_non_byte)
05455        return argv[1];
05456       else
05457        return orig;
05458     } else {
05459       char *result;
05460       long total, slen;
05461       
05462       slen = sourcelen - srcoffset;
05463       total = prefix_len + slen;
05464       
05465       result = (char *)scheme_malloc_atomic(total + 1);
05466       memcpy(result, prefix, prefix_len);
05467       memcpy(result + prefix_len, source + srcoffset, slen);
05468       result[prefix_len + slen] = 0;
05469       
05470       if (was_non_byte)
05471        return scheme_make_sized_utf8_string(result, total);
05472       else
05473        return scheme_make_sized_byte_string(result, total, 0);
05474     }
05475 
05476     SCHEME_USE_FUEL(1);
05477   }
05478 }
05479 
05480 static Scheme_Object *replace(int argc, Scheme_Object *argv[])
05481 {
05482   return gen_replace("regexp-replace", argc, argv, 0);
05483 }
05484 
05485 static Scheme_Object *replace_star(int argc, Scheme_Object *argv[])
05486 {
05487   return gen_replace("regexp-replace*", argc, argv, 1);
05488 }
05489 
05490 static Scheme_Object *regexp_p(int argc, Scheme_Object *argv[])
05491 {
05492   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type) 
05493           && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
05494          ? scheme_true 
05495          : scheme_false);
05496 }
05497 
05498 static Scheme_Object *byte_regexp_p(int argc, Scheme_Object *argv[])
05499 {
05500   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type) 
05501           && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8))
05502          ? scheme_true 
05503          : scheme_false);
05504 }
05505 
05506 static Scheme_Object *pregexp_p(int argc, Scheme_Object *argv[])
05507 {
05508   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type) 
05509           && (((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
05510           && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
05511          ? scheme_true 
05512          : scheme_false);
05513 }
05514 
05515 static Scheme_Object *byte_pregexp_p(int argc, Scheme_Object *argv[])
05516 {
05517   return (((SCHEME_TYPE(argv[0]) == scheme_regexp_type) 
05518           && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)
05519           && (((regexp *)argv[0])->flags & REGEXP_IS_PCRE))
05520          ? scheme_true 
05521          : scheme_false);
05522 }
05523 
05524 Scheme_Object *scheme_regexp_source(Scheme_Object *re)
05525 {
05526   return ((regexp *)re)->source;
05527 }
05528 
05529 int scheme_regexp_is_byte(Scheme_Object *re)
05530 {
05531   return !(((regexp *)re)->flags & REGEXP_IS_UTF8);
05532 }
05533 
05534 #ifdef MZ_PRECISE_GC
05535 START_XFORM_SKIP;
05536 #define MARKS_FOR_REGEXP_C
05537 #include "mzmark.c"
05538 END_XFORM_SKIP;
05539 #endif
05540 
05541 void scheme_regexp_initialize(Scheme_Env *env)
05542 {
05543 #ifdef MZ_PRECISE_GC
05544   GC_REG_TRAV(scheme_regexp_type, mark_regexp);
05545   GC_REG_TRAV(scheme_rt_regwork, mark_regwork);
05546 #endif
05547 
05548   GLOBAL_PRIM_W_ARITY("byte-regexp",                           make_regexp,             1, 1, env);
05549   GLOBAL_PRIM_W_ARITY("regexp",                                make_utf8_regexp,        1, 1, env);
05550   GLOBAL_PRIM_W_ARITY("byte-pregexp",                          make_pregexp,            1, 1, env);
05551   GLOBAL_PRIM_W_ARITY("pregexp",                               make_utf8_pregexp,       1, 1, env);
05552   GLOBAL_PRIM_W_ARITY("regexp-match",                          compare,                 2, 5, env);
05553   GLOBAL_PRIM_W_ARITY("regexp-match-positions",                positions,               2, 5, env);
05554   GLOBAL_PRIM_W_ARITY("regexp-match?",                         compare_bool,            2, 5, env);
05555   GLOBAL_PRIM_W_ARITY("regexp-match-peek",                     compare_peek,            2, 5, env);
05556   GLOBAL_PRIM_W_ARITY("regexp-match-peek-positions",           positions_peek,          2, 5, env);
05557   GLOBAL_PRIM_W_ARITY("regexp-match-peek-immediate",           compare_peek_nonblock,   2, 5, env);
05558   GLOBAL_PRIM_W_ARITY("regexp-match-peek-positions-immediate", positions_peek_nonblock, 2, 5, env);
05559   GLOBAL_PRIM_W_ARITY("regexp-replace",                        replace,                 3, 3, env);
05560   GLOBAL_PRIM_W_ARITY("regexp-replace*",                       replace_star,            3, 3, env);
05561 
05562   GLOBAL_FOLDING_PRIM("regexp?",                               regexp_p,        1, 1, 1, env);
05563   GLOBAL_FOLDING_PRIM("byte-regexp?",                          byte_regexp_p,   1, 1, 1, env);
05564   GLOBAL_FOLDING_PRIM("pregexp?",                              pregexp_p,       1, 1, 1, env);
05565   GLOBAL_FOLDING_PRIM("byte-pregexp?",                         byte_pregexp_p,  1, 1, 1, env);
05566 }
05567 
05568 void scheme_init_regexp_places()
05569 {
05570   REGISTER_SO(regparsestr);
05571   REGISTER_SO(regstr);
05572   REGISTER_SO(regbackknown);
05573   REGISTER_SO(regbackdepends);
05574 }