Back to index

lightning-sunbird  0.9+nobinonly
xmltok_impl.c
Go to the documentation of this file.
00001 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
00002    See the file COPYING for copying permission.
00003 */
00004 
/* Default used when no IS_INVALID_CHAR has been defined before this point:
   every multi-byte sequence is treated as valid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
00008 
/* Switch case for the lead byte of an n-byte sequence.
   Relies on `enc` and `end` being in scope where the macro is expanded.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end,
   returns XML_TOK_INVALID (storing ptr through nextTokPtr) when
   IS_INVALID_CHAR rejects the sequence, and otherwise steps ptr over the
   n bytes and leaves the enclosing switch. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < (n)) { \
        return XML_TOK_PARTIAL_CHAR; \
      } else if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += (n); \
      break;
00019 
/* Switch cases shared by scanners of free-form text: multi-byte lead bytes
   are checked and skipped by INVALID_LEAD_CASE, while BT_NONXML, BT_MALFORM
   and BT_TRAIL byte types are rejected with XML_TOK_INVALID and ptr stored
   through nextTokPtr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
00029 
/* Switch case for the lead byte of an n-byte sequence inside a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end.
   Returns XML_TOK_INVALID (storing ptr through nextTokPtr) when the
   sequence is rejected by IS_INVALID_CHAR or is not a name character;
   otherwise steps ptr over the n bytes and leaves the enclosing switch.
   The IS_INVALID_CHAR test comes first so that a malformed sequence is
   never accepted merely because IS_NAME_CHAR happens to say yes to it. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
00040 
/* Switch cases accepting one name character at ptr.
   A BT_NONASCII character must additionally satisfy IS_NAME_CHAR_MINBPC,
   otherwise XML_TOK_INVALID is returned with ptr stored through
   nextTokPtr.  Single-unit name characters advance ptr by MINBPC(enc);
   multi-byte lead bytes are delegated to CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_MINUS: \
  case BT_NAME: \
  case BT_DIGIT: \
  case BT_HEX: \
  case BT_NMSTRT: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
00057 
/* Switch case for the lead byte of an n-byte sequence at the start of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain
   before end.  Returns XML_TOK_INVALID (storing ptr through nextTokPtr)
   when the sequence is rejected by IS_INVALID_CHAR or is not a name-start
   character; otherwise steps ptr over the n bytes and leaves the
   enclosing switch.  The IS_INVALID_CHAR test comes first so that a
   malformed sequence is never accepted merely because IS_NMSTRT_CHAR
   happens to say yes to it. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
00068 
/* Switch cases accepting one name-start character at ptr.
   A BT_NONASCII character must additionally satisfy IS_NMSTRT_CHAR_MINBPC,
   otherwise XML_TOK_INVALID is returned with ptr stored through
   nextTokPtr.  BT_NMSTRT and BT_HEX characters advance ptr by MINBPC(enc);
   multi-byte lead bytes are delegated to CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_HEX: \
  case BT_NMSTRT: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
00082 
/* Default used when no PREFIX has been defined before this point:
   function names are used as written. */
#if !defined(PREFIX)
#define PREFIX(name) name
#endif
00086 
00087 /* ptr points to character following "<!-" */
00088 
00089 static int PTRCALL
00090 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
00091                     const char *end, const char **nextTokPtr)
00092 {
00093   if (ptr != end) {
00094     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
00095       *nextTokPtr = ptr;
00096       return XML_TOK_INVALID;
00097     }
00098     ptr += MINBPC(enc);
00099     while (ptr != end) {
00100       switch (BYTE_TYPE(enc, ptr)) {
00101       INVALID_CASES(ptr, nextTokPtr)
00102       case BT_MINUS:
00103         if ((ptr += MINBPC(enc)) == end)
00104           return XML_TOK_PARTIAL;
00105         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
00106           if ((ptr += MINBPC(enc)) == end)
00107             return XML_TOK_PARTIAL;
00108           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00109             *nextTokPtr = ptr;
00110             return XML_TOK_INVALID;
00111           }
00112           *nextTokPtr = ptr + MINBPC(enc);
00113           return XML_TOK_COMMENT;
00114         }
00115         break;
00116       default:
00117         ptr += MINBPC(enc);
00118         break;
00119       }
00120     }
00121   }
00122   return XML_TOK_PARTIAL;
00123 }
00124 
00125 /* ptr points to character following "<!" */
00126 
00127 static int PTRCALL
00128 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
00129                  const char *end, const char **nextTokPtr)
00130 {
00131   if (ptr == end)
00132     return XML_TOK_PARTIAL;
00133   switch (BYTE_TYPE(enc, ptr)) {
00134   case BT_MINUS:
00135     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00136   case BT_LSQB:
00137     *nextTokPtr = ptr + MINBPC(enc);
00138     return XML_TOK_COND_SECT_OPEN;
00139   case BT_NMSTRT:
00140   case BT_HEX:
00141     ptr += MINBPC(enc);
00142     break;
00143   default:
00144     *nextTokPtr = ptr;
00145     return XML_TOK_INVALID;
00146   }
00147   while (ptr != end) {
00148     switch (BYTE_TYPE(enc, ptr)) {
00149     case BT_PERCNT:
00150       if (ptr + MINBPC(enc) == end)
00151         return XML_TOK_PARTIAL;
00152       /* don't allow <!ENTITY% foo "whatever"> */
00153       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
00154       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
00155         *nextTokPtr = ptr;
00156         return XML_TOK_INVALID;
00157       }
00158       /* fall through */
00159     case BT_S: case BT_CR: case BT_LF:
00160       *nextTokPtr = ptr;
00161       return XML_TOK_DECL_OPEN;
00162     case BT_NMSTRT:
00163     case BT_HEX:
00164       ptr += MINBPC(enc);
00165       break;
00166     default:
00167       *nextTokPtr = ptr;
00168       return XML_TOK_INVALID;
00169     }
00170   }
00171   return XML_TOK_PARTIAL;
00172 }
00173 
00174 static int PTRCALL
00175 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
00176                       const char *end, int *tokPtr)
00177 {
00178   int upper = 0;
00179   *tokPtr = XML_TOK_PI;
00180   if (end - ptr != MINBPC(enc)*3)
00181     return 1;
00182   switch (BYTE_TO_ASCII(enc, ptr)) {
00183   case ASCII_x:
00184     break;
00185   case ASCII_X:
00186     upper = 1;
00187     break;
00188   default:
00189     return 1;
00190   }
00191   ptr += MINBPC(enc);
00192   switch (BYTE_TO_ASCII(enc, ptr)) {
00193   case ASCII_m:
00194     break;
00195   case ASCII_M:
00196     upper = 1;
00197     break;
00198   default:
00199     return 1;
00200   }
00201   ptr += MINBPC(enc);
00202   switch (BYTE_TO_ASCII(enc, ptr)) {
00203   case ASCII_l:
00204     break;
00205   case ASCII_L:
00206     upper = 1;
00207     break;
00208   default:
00209     return 1;
00210   }
00211   if (upper)
00212     return 0;
00213   *tokPtr = XML_TOK_XML_DECL;
00214   return 1;
00215 }
00216 
00217 /* ptr points to character following "<?" */
00218 
00219 static int PTRCALL
00220 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
00221                const char *end, const char **nextTokPtr)
00222 {
00223   int tok;
00224   const char *target = ptr;
00225   if (ptr == end)
00226     return XML_TOK_PARTIAL;
00227   switch (BYTE_TYPE(enc, ptr)) {
00228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00229   default:
00230     *nextTokPtr = ptr;
00231     return XML_TOK_INVALID;
00232   }
00233   while (ptr != end) {
00234     switch (BYTE_TYPE(enc, ptr)) {
00235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00236     case BT_S: case BT_CR: case BT_LF:
00237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00238         *nextTokPtr = ptr;
00239         return XML_TOK_INVALID;
00240       }
00241       ptr += MINBPC(enc);
00242       while (ptr != end) {
00243         switch (BYTE_TYPE(enc, ptr)) {
00244         INVALID_CASES(ptr, nextTokPtr)
00245         case BT_QUEST:
00246           ptr += MINBPC(enc);
00247           if (ptr == end)
00248             return XML_TOK_PARTIAL;
00249           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00250             *nextTokPtr = ptr + MINBPC(enc);
00251             return tok;
00252           }
00253           break;
00254         default:
00255           ptr += MINBPC(enc);
00256           break;
00257         }
00258       }
00259       return XML_TOK_PARTIAL;
00260     case BT_QUEST:
00261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00262         *nextTokPtr = ptr;
00263         return XML_TOK_INVALID;
00264       }
00265       ptr += MINBPC(enc);
00266       if (ptr == end)
00267         return XML_TOK_PARTIAL;
00268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00269         *nextTokPtr = ptr + MINBPC(enc);
00270         return tok;
00271       }
00272       /* fall through */
00273     default:
00274       *nextTokPtr = ptr;
00275       return XML_TOK_INVALID;
00276     }
00277   }
00278   return XML_TOK_PARTIAL;
00279 }
00280 
00281 static int PTRCALL
00282 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
00283                          const char *end, const char **nextTokPtr)
00284 {
00285   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
00286                                      ASCII_T, ASCII_A, ASCII_LSQB };
00287   int i;
00288   /* CDATA[ */
00289   if (end - ptr < 6 * MINBPC(enc))
00290     return XML_TOK_PARTIAL;
00291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
00292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
00293       *nextTokPtr = ptr;
00294       return XML_TOK_INVALID;
00295     }
00296   }
00297   *nextTokPtr = ptr;
00298   return XML_TOK_CDATA_SECT_OPEN;
00299 }
00300 
00301 static int PTRCALL
00302 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
00303                         const char *end, const char **nextTokPtr)
00304 {
00305   if (ptr == end)
00306     return XML_TOK_NONE;
00307   if (MINBPC(enc) > 1) {
00308     size_t n = end - ptr;
00309     if (n & (MINBPC(enc) - 1)) {
00310       n &= ~(MINBPC(enc) - 1);
00311       if (n == 0)
00312         return XML_TOK_PARTIAL;
00313       end = ptr + n;
00314     }
00315   }
00316   switch (BYTE_TYPE(enc, ptr)) {
00317   case BT_RSQB:
00318     ptr += MINBPC(enc);
00319     if (ptr == end)
00320       return XML_TOK_PARTIAL;
00321     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
00322       break;
00323     ptr += MINBPC(enc);
00324     if (ptr == end)
00325       return XML_TOK_PARTIAL;
00326     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00327       ptr -= MINBPC(enc);
00328       break;
00329     }
00330     *nextTokPtr = ptr + MINBPC(enc);
00331     return XML_TOK_CDATA_SECT_CLOSE;
00332   case BT_CR:
00333     ptr += MINBPC(enc);
00334     if (ptr == end)
00335       return XML_TOK_PARTIAL;
00336     if (BYTE_TYPE(enc, ptr) == BT_LF)
00337       ptr += MINBPC(enc);
00338     *nextTokPtr = ptr;
00339     return XML_TOK_DATA_NEWLINE;
00340   case BT_LF:
00341     *nextTokPtr = ptr + MINBPC(enc);
00342     return XML_TOK_DATA_NEWLINE;
00343   INVALID_CASES(ptr, nextTokPtr)
00344   default:
00345     ptr += MINBPC(enc);
00346     break;
00347   }
00348   while (ptr != end) {
00349     switch (BYTE_TYPE(enc, ptr)) {
00350 #define LEAD_CASE(n) \
00351     case BT_LEAD ## n: \
00352       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00353         *nextTokPtr = ptr; \
00354         return XML_TOK_DATA_CHARS; \
00355       } \
00356       ptr += n; \
00357       break;
00358     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00359 #undef LEAD_CASE
00360     case BT_NONXML:
00361     case BT_MALFORM:
00362     case BT_TRAIL:
00363     case BT_CR:
00364     case BT_LF:
00365     case BT_RSQB:
00366       *nextTokPtr = ptr;
00367       return XML_TOK_DATA_CHARS;
00368     default:
00369       ptr += MINBPC(enc);
00370       break;
00371     }
00372   }
00373   *nextTokPtr = ptr;
00374   return XML_TOK_DATA_CHARS;
00375 }
00376 
00377 /* ptr points to character following "</" */
00378 
00379 static int PTRCALL
00380 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
00381                    const char *end, const char **nextTokPtr)
00382 {
00383   if (ptr == end)
00384     return XML_TOK_PARTIAL;
00385   switch (BYTE_TYPE(enc, ptr)) {
00386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00387   default:
00388     *nextTokPtr = ptr;
00389     return XML_TOK_INVALID;
00390   }
00391   while (ptr != end) {
00392     switch (BYTE_TYPE(enc, ptr)) {
00393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00394     case BT_S: case BT_CR: case BT_LF:
00395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00396         switch (BYTE_TYPE(enc, ptr)) {
00397         case BT_S: case BT_CR: case BT_LF:
00398           break;
00399         case BT_GT:
00400           *nextTokPtr = ptr + MINBPC(enc);
00401           return XML_TOK_END_TAG;
00402         default:
00403           *nextTokPtr = ptr;
00404           return XML_TOK_INVALID;
00405         }
00406       }
00407       return XML_TOK_PARTIAL;
00408 #ifdef XML_NS
00409     case BT_COLON:
00410       /* no need to check qname syntax here,
00411          since end-tag must match exactly */
00412       ptr += MINBPC(enc);
00413       break;
00414 #endif
00415     case BT_GT:
00416       *nextTokPtr = ptr + MINBPC(enc);
00417       return XML_TOK_END_TAG;
00418     default:
00419       *nextTokPtr = ptr;
00420       return XML_TOK_INVALID;
00421     }
00422   }
00423   return XML_TOK_PARTIAL;
00424 }
00425 
00426 /* ptr points to character following "&#X" */
00427 
00428 static int PTRCALL
00429 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
00430                        const char *end, const char **nextTokPtr)
00431 {
00432   if (ptr != end) {
00433     switch (BYTE_TYPE(enc, ptr)) {
00434     case BT_DIGIT:
00435     case BT_HEX:
00436       break;
00437     default:
00438       *nextTokPtr = ptr;
00439       return XML_TOK_INVALID;
00440     }
00441     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00442       switch (BYTE_TYPE(enc, ptr)) {
00443       case BT_DIGIT:
00444       case BT_HEX:
00445         break;
00446       case BT_SEMI:
00447         *nextTokPtr = ptr + MINBPC(enc);
00448         return XML_TOK_CHAR_REF;
00449       default:
00450         *nextTokPtr = ptr;
00451         return XML_TOK_INVALID;
00452       }
00453     }
00454   }
00455   return XML_TOK_PARTIAL;
00456 }
00457 
00458 /* ptr points to character following "&#" */
00459 
00460 static int PTRCALL
00461 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
00462                     const char *end, const char **nextTokPtr)
00463 {
00464   if (ptr != end) {
00465     if (CHAR_MATCHES(enc, ptr, ASCII_x))
00466       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00467     switch (BYTE_TYPE(enc, ptr)) {
00468     case BT_DIGIT:
00469       break;
00470     default:
00471       *nextTokPtr = ptr;
00472       return XML_TOK_INVALID;
00473     }
00474     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00475       switch (BYTE_TYPE(enc, ptr)) {
00476       case BT_DIGIT:
00477         break;
00478       case BT_SEMI:
00479         *nextTokPtr = ptr + MINBPC(enc);
00480         return XML_TOK_CHAR_REF;
00481       default:
00482         *nextTokPtr = ptr;
00483         return XML_TOK_INVALID;
00484       }
00485     }
00486   }
00487   return XML_TOK_PARTIAL;
00488 }
00489 
00490 /* ptr points to character following "&" */
00491 
00492 static int PTRCALL
00493 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
00494                 const char **nextTokPtr)
00495 {
00496   if (ptr == end)
00497     return XML_TOK_PARTIAL;
00498   switch (BYTE_TYPE(enc, ptr)) {
00499   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00500   case BT_NUM:
00501     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00502   default:
00503     *nextTokPtr = ptr;
00504     return XML_TOK_INVALID;
00505   }
00506   while (ptr != end) {
00507     switch (BYTE_TYPE(enc, ptr)) {
00508     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00509     case BT_SEMI:
00510       *nextTokPtr = ptr + MINBPC(enc);
00511       return XML_TOK_ENTITY_REF;
00512     default:
00513       *nextTokPtr = ptr;
00514       return XML_TOK_INVALID;
00515     }
00516   }
00517   return XML_TOK_PARTIAL;
00518 }
00519 
00520 /* Scans the attributes of a start-tag or empty-element tag.
   ptr points to character following first character of attribute name.
   Returns XML_TOK_START_TAG_WITH_ATTS with *nextTokPtr just past '>',
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS with *nextTokPtr just past "/>",
   XML_TOK_INVALID with *nextTokPtr at the offending character,
   XML_TOK_PARTIAL when the input ends first (XML_TOK_PARTIAL_CHAR when it
   ends inside a multi-byte character), or the non-positive token that
   scanRef returned for a reference inside an attribute value. */
00521 
00522 static int PTRCALL
00523 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
00524                  const char **nextTokPtr)
00525 {
00526 #ifdef XML_NS
00527   int hadColon = 0;  /* set once the current attribute name has a ':' */
00528 #endif
/* Each pass of this loop consumes one more character of the current
   attribute name; the BT_EQUALS case then consumes the whole value and
   whatever follows it. */
00529   while (ptr != end) {
00530     switch (BYTE_TYPE(enc, ptr)) {
00531     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00532 #ifdef XML_NS
00533     case BT_COLON:
/* At most one ':' per attribute name, and it must be followed by a
   name-start character. */
00534       if (hadColon) {
00535         *nextTokPtr = ptr;
00536         return XML_TOK_INVALID;
00537       }
00538       hadColon = 1;
00539       ptr += MINBPC(enc);
00540       if (ptr == end)
00541         return XML_TOK_PARTIAL;
00542       switch (BYTE_TYPE(enc, ptr)) {
00543       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00544       default:
00545         *nextTokPtr = ptr;
00546         return XML_TOK_INVALID;
00547       }
00548       break;
00549 #endif
00550     case BT_S: case BT_CR: case BT_LF:
/* White space after the name: only more white space or '=' may follow. */
00551       for (;;) {
00552         int t;
00553 
00554         ptr += MINBPC(enc);
00555         if (ptr == end)
00556           return XML_TOK_PARTIAL;
00557         t = BYTE_TYPE(enc, ptr);
00558         if (t == BT_EQUALS)
00559           break;
00560         switch (t) {
00561         case BT_S:
00562         case BT_LF:
00563         case BT_CR:
00564           break;
00565         default:
00566           *nextTokPtr = ptr;
00567           return XML_TOK_INVALID;
00568         }
00569       }
00570     /* fall through */
00571     case BT_EQUALS:
00572       {
00573         int open;  /* byte type of the opening quote (BT_QUOT or BT_APOS) */
00574 #ifdef XML_NS
00575         hadColon = 0;
00576 #endif
/* Skip white space between '=' and the opening quote. */
00577         for (;;) {
00578           ptr += MINBPC(enc);
00579           if (ptr == end)
00580             return XML_TOK_PARTIAL;
00581           open = BYTE_TYPE(enc, ptr);
00582           if (open == BT_QUOT || open == BT_APOS)
00583             break;
00584           switch (open) {
00585           case BT_S:
00586           case BT_LF:
00587           case BT_CR:
00588             break;
00589           default:
00590             *nextTokPtr = ptr;
00591             return XML_TOK_INVALID;
00592           }
00593         }
00594         ptr += MINBPC(enc);
00595         /* in attribute value */
00596         for (;;) {
00597           int t;
00598           if (ptr == end)
00599             return XML_TOK_PARTIAL;
00600           t = BYTE_TYPE(enc, ptr);
00601           if (t == open)
00602             break;
00603           switch (t) {
00604           INVALID_CASES(ptr, nextTokPtr)
00605           case BT_AMP:
00606             {
/* scanRef writes the position following the reference into ptr itself,
   because &ptr is passed as its nextTokPtr argument. */
00607               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
00608               if (tok <= 0) {
00609                 if (tok == XML_TOK_INVALID)
00610                   *nextTokPtr = ptr;
00611                 return tok;
00612               }
00613               break;
00614             }
00615           case BT_LT:
00616             *nextTokPtr = ptr;
00617             return XML_TOK_INVALID;
00618           default:
00619             ptr += MINBPC(enc);
00620             break;
00621           }
00622         }
/* Step over the closing quote; it must be followed by white space, '/'
   or '>'. */
00623         ptr += MINBPC(enc);
00624         if (ptr == end)
00625           return XML_TOK_PARTIAL;
00626         switch (BYTE_TYPE(enc, ptr)) {
00627         case BT_S:
00628         case BT_CR:
00629         case BT_LF:
00630           break;
00631         case BT_SOL:
00632           goto sol;
00633         case BT_GT:
00634           goto gt;
00635         default:
00636           *nextTokPtr = ptr;
00637           return XML_TOK_INVALID;
00638         }
00639         /* ptr points to the white space character following the closing quote */
00640         for (;;) {
00641           ptr += MINBPC(enc);
00642           if (ptr == end)
00643             return XML_TOK_PARTIAL;
00644           switch (BYTE_TYPE(enc, ptr)) {
00645           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00646           case BT_S: case BT_CR: case BT_LF:
00647             continue;
00648           case BT_GT:
00649           gt:
00650             *nextTokPtr = ptr + MINBPC(enc);
00651             return XML_TOK_START_TAG_WITH_ATTS;
00652           case BT_SOL:
00653           sol:
00654             ptr += MINBPC(enc);
00655             if (ptr == end)
00656               return XML_TOK_PARTIAL;
00657             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00658               *nextTokPtr = ptr;
00659               return XML_TOK_INVALID;
00660             }
00661             *nextTokPtr = ptr + MINBPC(enc);
00662             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
00663           default:
00664             *nextTokPtr = ptr;
00665             return XML_TOK_INVALID;
00666           }
/* Reached only when CHECK_NMSTRT_CASES accepted the first character of the
   next attribute name: leave this loop, then the outer switch, so the
   outer while loop scans the rest of that name. */
00667           break;
00668         }
00669         break;
00670       }
00671     default:
00672       *nextTokPtr = ptr;
00673       return XML_TOK_INVALID;
00674     }
00675   }
00676   return XML_TOK_PARTIAL;
00677 }
00678 
00679 /* ptr points to character following "<" */
00680 
00681 static int PTRCALL
00682 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
00683                const char **nextTokPtr)
00684 {
00685 #ifdef XML_NS
00686   int hadColon;
00687 #endif
00688   if (ptr == end)
00689     return XML_TOK_PARTIAL;
00690   switch (BYTE_TYPE(enc, ptr)) {
00691   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00692   case BT_EXCL:
00693     if ((ptr += MINBPC(enc)) == end)
00694       return XML_TOK_PARTIAL;
00695     switch (BYTE_TYPE(enc, ptr)) {
00696     case BT_MINUS:
00697       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00698     case BT_LSQB:
00699       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
00700                                       end, nextTokPtr);
00701     }
00702     *nextTokPtr = ptr;
00703     return XML_TOK_INVALID;
00704   case BT_QUEST:
00705     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00706   case BT_SOL:
00707     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00708   default:
00709     *nextTokPtr = ptr;
00710     return XML_TOK_INVALID;
00711   }
00712 #ifdef XML_NS
00713   hadColon = 0;
00714 #endif
00715   /* we have a start-tag */
00716   while (ptr != end) {
00717     switch (BYTE_TYPE(enc, ptr)) {
00718     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00719 #ifdef XML_NS
00720     case BT_COLON:
00721       if (hadColon) {
00722         *nextTokPtr = ptr;
00723         return XML_TOK_INVALID;
00724       }
00725       hadColon = 1;
00726       ptr += MINBPC(enc);
00727       if (ptr == end)
00728         return XML_TOK_PARTIAL;
00729       switch (BYTE_TYPE(enc, ptr)) {
00730       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00731       default:
00732         *nextTokPtr = ptr;
00733         return XML_TOK_INVALID;
00734       }
00735       break;
00736 #endif
00737     case BT_S: case BT_CR: case BT_LF:
00738       {
00739         ptr += MINBPC(enc);
00740         while (ptr != end) {
00741           switch (BYTE_TYPE(enc, ptr)) {
00742           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00743           case BT_GT:
00744             goto gt;
00745           case BT_SOL:
00746             goto sol;
00747           case BT_S: case BT_CR: case BT_LF:
00748             ptr += MINBPC(enc);
00749             continue;
00750           default:
00751             *nextTokPtr = ptr;
00752             return XML_TOK_INVALID;
00753           }
00754           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
00755         }
00756         return XML_TOK_PARTIAL;
00757       }
00758     case BT_GT:
00759     gt:
00760       *nextTokPtr = ptr + MINBPC(enc);
00761       return XML_TOK_START_TAG_NO_ATTS;
00762     case BT_SOL:
00763     sol:
00764       ptr += MINBPC(enc);
00765       if (ptr == end)
00766         return XML_TOK_PARTIAL;
00767       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00768         *nextTokPtr = ptr;
00769         return XML_TOK_INVALID;
00770       }
00771       *nextTokPtr = ptr + MINBPC(enc);
00772       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
00773     default:
00774       *nextTokPtr = ptr;
00775       return XML_TOK_INVALID;
00776     }
00777   }
00778   return XML_TOK_PARTIAL;
00779 }
00780 
00781 static int PTRCALL
00782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
00783                    const char **nextTokPtr)
00784 {
00785   if (ptr == end)
00786     return XML_TOK_NONE;
00787   if (MINBPC(enc) > 1) {
00788     size_t n = end - ptr;
00789     if (n & (MINBPC(enc) - 1)) {
00790       n &= ~(MINBPC(enc) - 1);
00791       if (n == 0)
00792         return XML_TOK_PARTIAL;
00793       end = ptr + n;
00794     }
00795   }
00796   switch (BYTE_TYPE(enc, ptr)) {
00797   case BT_LT:
00798     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00799   case BT_AMP:
00800     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00801   case BT_CR:
00802     ptr += MINBPC(enc);
00803     if (ptr == end)
00804       return XML_TOK_TRAILING_CR;
00805     if (BYTE_TYPE(enc, ptr) == BT_LF)
00806       ptr += MINBPC(enc);
00807     *nextTokPtr = ptr;
00808     return XML_TOK_DATA_NEWLINE;
00809   case BT_LF:
00810     *nextTokPtr = ptr + MINBPC(enc);
00811     return XML_TOK_DATA_NEWLINE;
00812   case BT_RSQB:
00813     ptr += MINBPC(enc);
00814     if (ptr == end)
00815       return XML_TOK_TRAILING_RSQB;
00816     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
00817       break;
00818     ptr += MINBPC(enc);
00819     if (ptr == end)
00820       return XML_TOK_TRAILING_RSQB;
00821     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00822       ptr -= MINBPC(enc);
00823       break;
00824     }
00825     *nextTokPtr = ptr;
00826     return XML_TOK_INVALID;
00827   INVALID_CASES(ptr, nextTokPtr)
00828   default:
00829     ptr += MINBPC(enc);
00830     break;
00831   }
00832   while (ptr != end) {
00833     switch (BYTE_TYPE(enc, ptr)) {
00834 #define LEAD_CASE(n) \
00835     case BT_LEAD ## n: \
00836       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00837         *nextTokPtr = ptr; \
00838         return XML_TOK_DATA_CHARS; \
00839       } \
00840       ptr += n; \
00841       break;
00842     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00843 #undef LEAD_CASE
00844     case BT_RSQB:
00845       if (ptr + MINBPC(enc) != end) {
00846          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
00847            ptr += MINBPC(enc);
00848            break;
00849          }
00850          if (ptr + 2*MINBPC(enc) != end) {
00851            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
00852              ptr += MINBPC(enc);
00853              break;
00854            }
00855            *nextTokPtr = ptr + 2*MINBPC(enc);
00856            return XML_TOK_INVALID;
00857          }
00858       }
00859       /* fall through */
00860     case BT_AMP:
00861     case BT_LT:
00862     case BT_NONXML:
00863     case BT_MALFORM:
00864     case BT_TRAIL:
00865     case BT_CR:
00866     case BT_LF:
00867       *nextTokPtr = ptr;
00868       return XML_TOK_DATA_CHARS;
00869     default:
00870       ptr += MINBPC(enc);
00871       break;
00872     }
00873   }
00874   *nextTokPtr = ptr;
00875   return XML_TOK_DATA_CHARS;
00876 }
00877 
00878 /* ptr points to character following "%" */
00879 
00880 static int PTRCALL
00881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
00882                     const char **nextTokPtr)
00883 {
00884   if (ptr == end)
00885     return -XML_TOK_PERCENT;
00886   switch (BYTE_TYPE(enc, ptr)) {
00887   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00888   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
00889     *nextTokPtr = ptr;
00890     return XML_TOK_PERCENT;
00891   default:
00892     *nextTokPtr = ptr;
00893     return XML_TOK_INVALID;
00894   }
00895   while (ptr != end) {
00896     switch (BYTE_TYPE(enc, ptr)) {
00897     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00898     case BT_SEMI:
00899       *nextTokPtr = ptr + MINBPC(enc);
00900       return XML_TOK_PARAM_ENTITY_REF;
00901     default:
00902       *nextTokPtr = ptr;
00903       return XML_TOK_INVALID;
00904     }
00905   }
00906   return XML_TOK_PARTIAL;
00907 }
00908 
00909 static int PTRCALL
00910 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
00911                       const char **nextTokPtr)
00912 {
00913   if (ptr == end)
00914     return XML_TOK_PARTIAL;
00915   switch (BYTE_TYPE(enc, ptr)) {
00916   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00917   default:
00918     *nextTokPtr = ptr;
00919     return XML_TOK_INVALID;
00920   }
00921   while (ptr != end) {
00922     switch (BYTE_TYPE(enc, ptr)) {
00923     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00924     case BT_CR: case BT_LF: case BT_S:
00925     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
00926       *nextTokPtr = ptr;
00927       return XML_TOK_POUND_NAME;
00928     default:
00929       *nextTokPtr = ptr;
00930       return XML_TOK_INVALID;
00931     }
00932   }
00933   return -XML_TOK_POUND_NAME;
00934 }
00935 
00936 static int PTRCALL
00937 PREFIX(scanLit)(int open, const ENCODING *enc,
00938                 const char *ptr, const char *end,
00939                 const char **nextTokPtr)
00940 {
00941   while (ptr != end) {
00942     int t = BYTE_TYPE(enc, ptr);
00943     switch (t) {
00944     INVALID_CASES(ptr, nextTokPtr)
00945     case BT_QUOT:
00946     case BT_APOS:
00947       ptr += MINBPC(enc);
00948       if (t != open)
00949         break;
00950       if (ptr == end)
00951         return -XML_TOK_LITERAL;
00952       *nextTokPtr = ptr;
00953       switch (BYTE_TYPE(enc, ptr)) {
00954       case BT_S: case BT_CR: case BT_LF:
00955       case BT_GT: case BT_PERCNT: case BT_LSQB:
00956         return XML_TOK_LITERAL;
00957       default:
00958         return XML_TOK_INVALID;
00959       }
00960     default:
00961       ptr += MINBPC(enc);
00962       break;
00963     }
00964   }
00965   return XML_TOK_PARTIAL;
00966 }
00967 
/* Prolog tokenizer: scans one token that starts at ptr and does not extend
   past end, and returns its XML_TOK_* code.

   - XML_TOK_NONE is returned when the input is empty.
   - XML_TOK_INVALID is returned with *nextTokPtr at the offending position.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR are returned when the input ends
     in the middle of a token or of a multi-byte character.
   - A negated token code (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
     -XML_TOK_CLOSE_PAREN, or -tok for a name / name token) is returned when
     the input ends at a point where the token is complete so far but could
     still be continued by more input.  Of these paths only the
     -XML_TOK_PROLOG_S one writes *nextTokPtr.
   - On the remaining paths handled here, *nextTokPtr is set to the first
     position after the token.  Literals, declarations, processing
     instructions, '%' and '#' tokens are delegated to the scanLit, scanDecl,
     scanPi, scanPercent and scanPoundName helpers, whose results are
     returned unchanged. */
00968 static int PTRCALL
00969 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
00970                   const char **nextTokPtr)
00971 {
00972   int tok;
00973   if (ptr == end)
00974     return XML_TOK_NONE;
        /* For encodings with more than one byte per unit, ignore trailing
           bytes that do not make up a whole unit. */
00975   if (MINBPC(enc) > 1) {
00976     size_t n = end - ptr;
00977     if (n & (MINBPC(enc) - 1)) {
00978       n &= ~(MINBPC(enc) - 1);
00979       if (n == 0)
00980         return XML_TOK_PARTIAL;
00981       end = ptr + n;
00982     }
00983   }
        /* Dispatch on the first character.  Cases that "break" out of this
           switch have started a name or name token; it is finished by the
           loop that follows the switch. */
00984   switch (BYTE_TYPE(enc, ptr)) {
00985   case BT_QUOT:
00986     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
00987   case BT_APOS:
00988     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
00989   case BT_LT:
00990     {
00991       ptr += MINBPC(enc);
00992       if (ptr == end)
00993         return XML_TOK_PARTIAL;
00994       switch (BYTE_TYPE(enc, ptr)) {
00995       case BT_EXCL:
00996         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00997       case BT_QUEST:
00998         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00999       case BT_NMSTRT:
01000       case BT_HEX:
01001       case BT_NONASCII:
01002       case BT_LEAD2:
01003       case BT_LEAD3:
01004       case BT_LEAD4:
              /* '<' followed by what may be a name: report the start of the
                 instance and leave *nextTokPtr pointing back at the '<'. */
01005         *nextTokPtr = ptr - MINBPC(enc);
01006         return XML_TOK_INSTANCE_START;
01007       }
01008       *nextTokPtr = ptr;
01009       return XML_TOK_INVALID;
01010     }
01011   case BT_CR:
01012     if (ptr + MINBPC(enc) == end) {
01013       *nextTokPtr = end;
01014       /* indicate that this might be part of a CR/LF pair */
01015       return -XML_TOK_PROLOG_S;
01016     }
01017     /* fall through */
01018   case BT_S: case BT_LF:
          /* Consume a run of white space as a single XML_TOK_PROLOG_S. */
01019     for (;;) {
01020       ptr += MINBPC(enc);
01021       if (ptr == end)
01022         break;
01023       switch (BYTE_TYPE(enc, ptr)) {
01024       case BT_S: case BT_LF:
01025         break;
01026       case BT_CR:
01027         /* don't split CR/LF pair */
01028         if (ptr + MINBPC(enc) != end)
01029           break;
01030         /* fall through */
01031       default:
01032         *nextTokPtr = ptr;
01033         return XML_TOK_PROLOG_S;
01034       }
01035     }
01036     *nextTokPtr = ptr;
01037     return XML_TOK_PROLOG_S;
01038   case BT_PERCNT:
01039     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01040   case BT_COMMA:
01041     *nextTokPtr = ptr + MINBPC(enc);
01042     return XML_TOK_COMMA;
01043   case BT_LSQB:
01044     *nextTokPtr = ptr + MINBPC(enc);
01045     return XML_TOK_OPEN_BRACKET;
01046   case BT_RSQB:
          /* A single ']' is XML_TOK_CLOSE_BRACKET; "]]>" is
             XML_TOK_COND_SECT_CLOSE.  "]]" at the very end of the input is
             XML_TOK_PARTIAL because a '>' might still follow. */
01047     ptr += MINBPC(enc);
01048     if (ptr == end)
01049       return -XML_TOK_CLOSE_BRACKET;
01050     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
01051       if (ptr + MINBPC(enc) == end)
01052         return XML_TOK_PARTIAL;
01053       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
01054         *nextTokPtr = ptr + 2*MINBPC(enc);
01055         return XML_TOK_COND_SECT_CLOSE;
01056       }
01057     }
01058     *nextTokPtr = ptr;
01059     return XML_TOK_CLOSE_BRACKET;
01060   case BT_LPAR:
01061     *nextTokPtr = ptr + MINBPC(enc);
01062     return XML_TOK_OPEN_PAREN;
01063   case BT_RPAR:
          /* ')' optionally followed by '*', '?' or '+' forms one token.  A
             plain ')' is accepted only before white space, '>', ',', '|' or
             another ')'; any other follower is invalid. */
01064     ptr += MINBPC(enc);
01065     if (ptr == end)
01066       return -XML_TOK_CLOSE_PAREN;
01067     switch (BYTE_TYPE(enc, ptr)) {
01068     case BT_AST:
01069       *nextTokPtr = ptr + MINBPC(enc);
01070       return XML_TOK_CLOSE_PAREN_ASTERISK;
01071     case BT_QUEST:
01072       *nextTokPtr = ptr + MINBPC(enc);
01073       return XML_TOK_CLOSE_PAREN_QUESTION;
01074     case BT_PLUS:
01075       *nextTokPtr = ptr + MINBPC(enc);
01076       return XML_TOK_CLOSE_PAREN_PLUS;
01077     case BT_CR: case BT_LF: case BT_S:
01078     case BT_GT: case BT_COMMA: case BT_VERBAR:
01079     case BT_RPAR:
01080       *nextTokPtr = ptr;
01081       return XML_TOK_CLOSE_PAREN;
01082     }
01083     *nextTokPtr = ptr;
01084     return XML_TOK_INVALID;
01085   case BT_VERBAR:
01086     *nextTokPtr = ptr + MINBPC(enc);
01087     return XML_TOK_OR;
01088   case BT_GT:
01089     *nextTokPtr = ptr + MINBPC(enc);
01090     return XML_TOK_DECL_CLOSE;
01091   case BT_NUM:
01092     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
        /* Multi-byte first character: all n bytes must be present (otherwise
           XML_TOK_PARTIAL_CHAR).  A name-start character begins an
           XML_TOK_NAME, any other name character begins an XML_TOK_NMTOKEN,
           and anything else is invalid. */
01093 #define LEAD_CASE(n) \
01094   case BT_LEAD ## n: \
01095     if (end - ptr < n) \
01096       return XML_TOK_PARTIAL_CHAR; \
01097     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
01098       ptr += n; \
01099       tok = XML_TOK_NAME; \
01100       break; \
01101     } \
01102     if (IS_NAME_CHAR(enc, ptr, n)) { \
01103       ptr += n; \
01104       tok = XML_TOK_NMTOKEN; \
01105       break; \
01106     } \
01107     *nextTokPtr = ptr; \
01108     return XML_TOK_INVALID;
01109     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01110 #undef LEAD_CASE
01111   case BT_NMSTRT:
01112   case BT_HEX:
01113     tok = XML_TOK_NAME;
01114     ptr += MINBPC(enc);
01115     break;
01116   case BT_DIGIT:
01117   case BT_NAME:
01118   case BT_MINUS:
01119 #ifdef XML_NS
01120   case BT_COLON:
01121 #endif
01122     tok = XML_TOK_NMTOKEN;
01123     ptr += MINBPC(enc);
01124     break;
01125   case BT_NONASCII:
01126     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
01127       ptr += MINBPC(enc);
01128       tok = XML_TOK_NAME;
01129       break;
01130     }
01131     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
01132       ptr += MINBPC(enc);
01133       tok = XML_TOK_NMTOKEN;
01134       break;
01135     }
01136     /* fall through */
01137   default:
01138     *nextTokPtr = ptr;
01139     return XML_TOK_INVALID;
01140   }
        /* Only reached after a first name character has been consumed and
           tok has been set: consume the rest of the name or name token. */
01141   while (ptr != end) {
01142     switch (BYTE_TYPE(enc, ptr)) {
01143     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
          /* These characters end the token without being consumed. */
01144     case BT_GT: case BT_RPAR: case BT_COMMA:
01145     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
01146     case BT_S: case BT_CR: case BT_LF:
01147       *nextTokPtr = ptr;
01148       return tok;
01149 #ifdef XML_NS
          /* The first ':' inside a name turns it into a prefixed name,
             provided a name character follows; otherwise, and on a second
             ':', the token is downgraded to a name token. */
01150     case BT_COLON:
01151       ptr += MINBPC(enc);
01152       switch (tok) {
01153       case XML_TOK_NAME:
01154         if (ptr == end)
01155           return XML_TOK_PARTIAL;
01156         tok = XML_TOK_PREFIXED_NAME;
01157         switch (BYTE_TYPE(enc, ptr)) {
01158         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
01159         default:
01160           tok = XML_TOK_NMTOKEN;
01161           break;
01162         }
01163         break;
01164       case XML_TOK_PREFIXED_NAME:
01165         tok = XML_TOK_NMTOKEN;
01166         break;
01167       }
01168       break;
01169 #endif
          /* A '+', '*' or '?' suffix is consumed as part of the token; it is
             rejected when the token so far is a name token. */
01170     case BT_PLUS:
01171       if (tok == XML_TOK_NMTOKEN)  {
01172         *nextTokPtr = ptr;
01173         return XML_TOK_INVALID;
01174       }
01175       *nextTokPtr = ptr + MINBPC(enc);
01176       return XML_TOK_NAME_PLUS;
01177     case BT_AST:
01178       if (tok == XML_TOK_NMTOKEN)  {
01179         *nextTokPtr = ptr;
01180         return XML_TOK_INVALID;
01181       }
01182       *nextTokPtr = ptr + MINBPC(enc);
01183       return XML_TOK_NAME_ASTERISK;
01184     case BT_QUEST:
01185       if (tok == XML_TOK_NMTOKEN)  {
01186         *nextTokPtr = ptr;
01187         return XML_TOK_INVALID;
01188       }
01189       *nextTokPtr = ptr + MINBPC(enc);
01190       return XML_TOK_NAME_QUESTION;
01191     default:
01192       *nextTokPtr = ptr;
01193       return XML_TOK_INVALID;
01194     }
01195   }
        /* Input ended while still inside the name: report the token type
           negated, since more name characters may follow. */
01196   return -tok;
01197 }
01198 
01199 static int PTRCALL
01200 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
01201                           const char *end, const char **nextTokPtr)
01202 {
01203   const char *start;
01204   if (ptr == end)
01205     return XML_TOK_NONE;
01206   start = ptr;
01207   while (ptr != end) {
01208     switch (BYTE_TYPE(enc, ptr)) {
01209 #define LEAD_CASE(n) \
01210     case BT_LEAD ## n: ptr += n; break;
01211     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01212 #undef LEAD_CASE
01213     case BT_AMP:
01214       if (ptr == start)
01215         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01216       *nextTokPtr = ptr;
01217       return XML_TOK_DATA_CHARS;
01218     case BT_LT:
01219       /* this is for inside entity references */
01220       *nextTokPtr = ptr;
01221       return XML_TOK_INVALID;
01222     case BT_LF:
01223       if (ptr == start) {
01224         *nextTokPtr = ptr + MINBPC(enc);
01225         return XML_TOK_DATA_NEWLINE;
01226       }
01227       *nextTokPtr = ptr;
01228       return XML_TOK_DATA_CHARS;
01229     case BT_CR:
01230       if (ptr == start) {
01231         ptr += MINBPC(enc);
01232         if (ptr == end)
01233           return XML_TOK_TRAILING_CR;
01234         if (BYTE_TYPE(enc, ptr) == BT_LF)
01235           ptr += MINBPC(enc);
01236         *nextTokPtr = ptr;
01237         return XML_TOK_DATA_NEWLINE;
01238       }
01239       *nextTokPtr = ptr;
01240       return XML_TOK_DATA_CHARS;
01241     case BT_S:
01242       if (ptr == start) {
01243         *nextTokPtr = ptr + MINBPC(enc);
01244         return XML_TOK_ATTRIBUTE_VALUE_S;
01245       }
01246       *nextTokPtr = ptr;
01247       return XML_TOK_DATA_CHARS;
01248     default:
01249       ptr += MINBPC(enc);
01250       break;
01251     }
01252   }
01253   *nextTokPtr = ptr;
01254   return XML_TOK_DATA_CHARS;
01255 }
01256 
01257 static int PTRCALL
01258 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
01259                        const char *end, const char **nextTokPtr)
01260 {
01261   const char *start;
01262   if (ptr == end)
01263     return XML_TOK_NONE;
01264   start = ptr;
01265   while (ptr != end) {
01266     switch (BYTE_TYPE(enc, ptr)) {
01267 #define LEAD_CASE(n) \
01268     case BT_LEAD ## n: ptr += n; break;
01269     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01270 #undef LEAD_CASE
01271     case BT_AMP:
01272       if (ptr == start)
01273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01274       *nextTokPtr = ptr;
01275       return XML_TOK_DATA_CHARS;
01276     case BT_PERCNT:
01277       if (ptr == start) {
01278         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
01279                                        end, nextTokPtr);
01280         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
01281       }
01282       *nextTokPtr = ptr;
01283       return XML_TOK_DATA_CHARS;
01284     case BT_LF:
01285       if (ptr == start) {
01286         *nextTokPtr = ptr + MINBPC(enc);
01287         return XML_TOK_DATA_NEWLINE;
01288       }
01289       *nextTokPtr = ptr;
01290       return XML_TOK_DATA_CHARS;
01291     case BT_CR:
01292       if (ptr == start) {
01293         ptr += MINBPC(enc);
01294         if (ptr == end)
01295           return XML_TOK_TRAILING_CR;
01296         if (BYTE_TYPE(enc, ptr) == BT_LF)
01297           ptr += MINBPC(enc);
01298         *nextTokPtr = ptr;
01299         return XML_TOK_DATA_NEWLINE;
01300       }
01301       *nextTokPtr = ptr;
01302       return XML_TOK_DATA_CHARS;
01303     default:
01304       ptr += MINBPC(enc);
01305       break;
01306     }
01307   }
01308   *nextTokPtr = ptr;
01309   return XML_TOK_DATA_CHARS;
01310 }
01311 
01312 #ifdef XML_DTD
01313 
01314 static int PTRCALL
01315 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
01316                          const char *end, const char **nextTokPtr)
01317 {
01318   int level = 0;
01319   if (MINBPC(enc) > 1) {
01320     size_t n = end - ptr;
01321     if (n & (MINBPC(enc) - 1)) {
01322       n &= ~(MINBPC(enc) - 1);
01323       end = ptr + n;
01324     }
01325   }
01326   while (ptr != end) {
01327     switch (BYTE_TYPE(enc, ptr)) {
01328     INVALID_CASES(ptr, nextTokPtr)
01329     case BT_LT:
01330       if ((ptr += MINBPC(enc)) == end)
01331         return XML_TOK_PARTIAL;
01332       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
01333         if ((ptr += MINBPC(enc)) == end)
01334           return XML_TOK_PARTIAL;
01335         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
01336           ++level;
01337           ptr += MINBPC(enc);
01338         }
01339       }
01340       break;
01341     case BT_RSQB:
01342       if ((ptr += MINBPC(enc)) == end)
01343         return XML_TOK_PARTIAL;
01344       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
01345         if ((ptr += MINBPC(enc)) == end)
01346           return XML_TOK_PARTIAL;
01347         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
01348           ptr += MINBPC(enc);
01349           if (level == 0) {
01350             *nextTokPtr = ptr;
01351             return XML_TOK_IGNORE_SECT;
01352           }
01353           --level;
01354         }
01355       }
01356       break;
01357     default:
01358       ptr += MINBPC(enc);
01359       break;
01360     }
01361   }
01362   return XML_TOK_PARTIAL;
01363 }
01364 
01365 #endif /* XML_DTD */
01366 
01367 static int PTRCALL
01368 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
01369                    const char **badPtr)
01370 {
01371   ptr += MINBPC(enc);
01372   end -= MINBPC(enc);
01373   for (; ptr != end; ptr += MINBPC(enc)) {
01374     switch (BYTE_TYPE(enc, ptr)) {
01375     case BT_DIGIT:
01376     case BT_HEX:
01377     case BT_MINUS:
01378     case BT_APOS:
01379     case BT_LPAR:
01380     case BT_RPAR:
01381     case BT_PLUS:
01382     case BT_COMMA:
01383     case BT_SOL:
01384     case BT_EQUALS:
01385     case BT_QUEST:
01386     case BT_CR:
01387     case BT_LF:
01388     case BT_SEMI:
01389     case BT_EXCL:
01390     case BT_AST:
01391     case BT_PERCNT:
01392     case BT_NUM:
01393 #ifdef XML_NS
01394     case BT_COLON:
01395 #endif
01396       break;
01397     case BT_S:
01398       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
01399         *badPtr = ptr;
01400         return 0;
01401       }
01402       break;
01403     case BT_NAME:
01404     case BT_NMSTRT:
01405       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
01406         break;
01407     default:
01408       switch (BYTE_TO_ASCII(enc, ptr)) {
01409       case 0x24: /* $ */
01410       case 0x40: /* @ */
01411         break;
01412       default:
01413         *badPtr = ptr;
01414         return 0;
01415       }
01416       break;
01417     }
01418   }
01419   return 1;
01420 }
01421 
01422 /* This must only be called for a well-formed start-tag or empty
01423    element tag.  Returns the number of attributes.  Pointers to the
01424    first attsMax attributes are stored in atts.
01425 */
01426 
01427 static int PTRCALL
01428 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
01429                 int attsMax, ATTRIBUTE *atts)
01430 {
01431   enum { other, inName, inValue } state = inName;
01432   int nAtts = 0;
01433   int open = 0; /* defined when state == inValue;
01434                    initialization just to shut up compilers */
01435 
01436   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
01437     switch (BYTE_TYPE(enc, ptr)) {
01438 #define START_NAME \
01439       if (state == other) { \
01440         if (nAtts < attsMax) { \
01441           atts[nAtts].name = ptr; \
01442           atts[nAtts].normalized = 1; \
01443         } \
01444         state = inName; \
01445       }
01446 #define LEAD_CASE(n) \
01447     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
01448     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01449 #undef LEAD_CASE
01450     case BT_NONASCII:
01451     case BT_NMSTRT:
01452     case BT_HEX:
01453       START_NAME
01454       break;
01455 #undef START_NAME
01456     case BT_QUOT:
01457       if (state != inValue) {
01458         if (nAtts < attsMax)
01459           atts[nAtts].valuePtr = ptr + MINBPC(enc);
01460         state = inValue;
01461         open = BT_QUOT;
01462       }
01463       else if (open == BT_QUOT) {
01464         state = other;
01465         if (nAtts < attsMax)
01466           atts[nAtts].valueEnd = ptr;
01467         nAtts++;
01468       }
01469       break;
01470     case BT_APOS:
01471       if (state != inValue) {
01472         if (nAtts < attsMax)
01473           atts[nAtts].valuePtr = ptr + MINBPC(enc);
01474         state = inValue;
01475         open = BT_APOS;
01476       }
01477       else if (open == BT_APOS) {
01478         state = other;
01479         if (nAtts < attsMax)
01480           atts[nAtts].valueEnd = ptr;
01481         nAtts++;
01482       }
01483       break;
01484     case BT_AMP:
01485       if (nAtts < attsMax)
01486         atts[nAtts].normalized = 0;
01487       break;
01488     case BT_S:
01489       if (state == inName)
01490         state = other;
01491       else if (state == inValue
01492                && nAtts < attsMax
01493                && atts[nAtts].normalized
01494                && (ptr == atts[nAtts].valuePtr
01495                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
01496                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
01497                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
01498         atts[nAtts].normalized = 0;
01499       break;
01500     case BT_CR: case BT_LF:
01501       /* This case ensures that the first attribute name is counted
01502          Apart from that we could just change state on the quote. */
01503       if (state == inName)
01504         state = other;
01505       else if (state == inValue && nAtts < attsMax)
01506         atts[nAtts].normalized = 0;
01507       break;
01508     case BT_GT:
01509     case BT_SOL:
01510       if (state != inValue)
01511         return nAtts;
01512       break;
01513     default:
01514       break;
01515     }
01516   }
01517   /* not reached */
01518 }
01519 
01520 static int PTRFASTCALL
01521 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
01522 {
01523   int result = 0;
01524   /* skip &# */
01525   ptr += 2*MINBPC(enc);
01526   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
01527     for (ptr += MINBPC(enc);
01528          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
01529          ptr += MINBPC(enc)) {
01530       int c = BYTE_TO_ASCII(enc, ptr);
01531       switch (c) {
01532       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
01533       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
01534         result <<= 4;
01535         result |= (c - ASCII_0);
01536         break;
01537       case ASCII_A: case ASCII_B: case ASCII_C:
01538       case ASCII_D: case ASCII_E: case ASCII_F:
01539         result <<= 4;
01540         result += 10 + (c - ASCII_A);
01541         break;
01542       case ASCII_a: case ASCII_b: case ASCII_c:
01543       case ASCII_d: case ASCII_e: case ASCII_f:
01544         result <<= 4;
01545         result += 10 + (c - ASCII_a);
01546         break;
01547       }
01548       if (result >= 0x110000)
01549         return -1;
01550     }
01551   }
01552   else {
01553     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
01554       int c = BYTE_TO_ASCII(enc, ptr);
01555       result *= 10;
01556       result += (c - ASCII_0);
01557       if (result >= 0x110000)
01558         return -1;
01559     }
01560   }
01561   return checkCharRefNumber(result);
01562 }
01563 
01564 static int PTRCALL
01565 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
01566                              const char *end)
01567 {
01568   switch ((end - ptr)/MINBPC(enc)) {
01569   case 2:
01570     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
01571       switch (BYTE_TO_ASCII(enc, ptr)) {
01572       case ASCII_l:
01573         return ASCII_LT;
01574       case ASCII_g:
01575         return ASCII_GT;
01576       }
01577     }
01578     break;
01579   case 3:
01580     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
01581       ptr += MINBPC(enc);
01582       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
01583         ptr += MINBPC(enc);
01584         if (CHAR_MATCHES(enc, ptr, ASCII_p))
01585           return ASCII_AMP;
01586       }
01587     }
01588     break;
01589   case 4:
01590     switch (BYTE_TO_ASCII(enc, ptr)) {
01591     case ASCII_q:
01592       ptr += MINBPC(enc);
01593       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
01594         ptr += MINBPC(enc);
01595         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
01596           ptr += MINBPC(enc);
01597           if (CHAR_MATCHES(enc, ptr, ASCII_t))
01598             return ASCII_QUOT;
01599         }
01600       }
01601       break;
01602     case ASCII_a:
01603       ptr += MINBPC(enc);
01604       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
01605         ptr += MINBPC(enc);
01606         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
01607           ptr += MINBPC(enc);
01608           if (CHAR_MATCHES(enc, ptr, ASCII_s))
01609             return ASCII_APOS;
01610         }
01611       }
01612       break;
01613     }
01614   }
01615   return 0;
01616 }
01617 
01618 static int PTRCALL
01619 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
01620 {
01621   for (;;) {
01622     switch (BYTE_TYPE(enc, ptr1)) {
01623 #define LEAD_CASE(n) \
01624     case BT_LEAD ## n: \
01625       if (*ptr1++ != *ptr2++) \
01626         return 0;
01627     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
01628 #undef LEAD_CASE
01629       /* fall through */
01630       if (*ptr1++ != *ptr2++)
01631         return 0;
01632       break;
01633     case BT_NONASCII:
01634     case BT_NMSTRT:
01635 #ifdef XML_NS
01636     case BT_COLON:
01637 #endif
01638     case BT_HEX:
01639     case BT_DIGIT:
01640     case BT_NAME:
01641     case BT_MINUS:
01642       if (*ptr2++ != *ptr1++)
01643         return 0;
01644       if (MINBPC(enc) > 1) {
01645         if (*ptr2++ != *ptr1++)
01646           return 0;
01647         if (MINBPC(enc) > 2) {
01648           if (*ptr2++ != *ptr1++)
01649             return 0;
01650           if (MINBPC(enc) > 3) {
01651             if (*ptr2++ != *ptr1++)
01652               return 0;
01653           }
01654         }
01655       }
01656       break;
01657     default:
01658       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
01659         return 1;
01660       switch (BYTE_TYPE(enc, ptr2)) {
01661       case BT_LEAD2:
01662       case BT_LEAD3:
01663       case BT_LEAD4:
01664       case BT_NONASCII:
01665       case BT_NMSTRT:
01666 #ifdef XML_NS
01667       case BT_COLON:
01668 #endif
01669       case BT_HEX:
01670       case BT_DIGIT:
01671       case BT_NAME:
01672       case BT_MINUS:
01673         return 0;
01674       default:
01675         return 1;
01676       }
01677     }
01678   }
01679   /* not reached */
01680 }
01681 
01682 static int PTRCALL
01683 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
01684                          const char *end1, const char *ptr2)
01685 {
01686   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
01687     if (ptr1 == end1)
01688       return 0;
01689     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
01690       return 0;
01691   }
01692   return ptr1 == end1;
01693 }
01694 
01695 static int PTRFASTCALL
01696 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
01697 {
01698   const char *start = ptr;
01699   for (;;) {
01700     switch (BYTE_TYPE(enc, ptr)) {
01701 #define LEAD_CASE(n) \
01702     case BT_LEAD ## n: ptr += n; break;
01703     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01704 #undef LEAD_CASE
01705     case BT_NONASCII:
01706     case BT_NMSTRT:
01707 #ifdef XML_NS
01708     case BT_COLON:
01709 #endif
01710     case BT_HEX:
01711     case BT_DIGIT:
01712     case BT_NAME:
01713     case BT_MINUS:
01714       ptr += MINBPC(enc);
01715       break;
01716     default:
01717       return ptr - start;
01718     }
01719   }
01720 }
01721 
01722 static const char * PTRFASTCALL
01723 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
01724 {
01725   for (;;) {
01726     switch (BYTE_TYPE(enc, ptr)) {
01727     case BT_LF:
01728     case BT_CR:
01729     case BT_S:
01730       ptr += MINBPC(enc);
01731       break;
01732     default:
01733       return ptr;
01734     }
01735   }
01736 }
01737 
01738 static void PTRCALL
01739 PREFIX(updatePosition)(const ENCODING *enc,
01740                        const char *ptr,
01741                        const char *end,
01742                        POSITION *pos)
01743 {
01744   while (ptr != end) {
01745     switch (BYTE_TYPE(enc, ptr)) {
01746 #define LEAD_CASE(n) \
01747     case BT_LEAD ## n: \
01748       ptr += n; \
01749       break;
01750     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01751 #undef LEAD_CASE
01752     case BT_LF:
01753       pos->columnNumber = (unsigned)-1;
01754       pos->lineNumber++;
01755       ptr += MINBPC(enc);
01756       break;
01757     case BT_CR:
01758       pos->lineNumber++;
01759       ptr += MINBPC(enc);
01760       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
01761         ptr += MINBPC(enc);
01762       pos->columnNumber = (unsigned)-1;
01763       break;
01764     default:
01765       ptr += MINBPC(enc);
01766       break;
01767     }
01768     pos->columnNumber++;
01769   }
01770 }
01771 
/* Remove the case-generating helper macros so they do not leak past this
   file.  INVALID_CASES, CHECK_NAME_CASE(S) and CHECK_NMSTRT_CASE are
   defined at the top of this file; DO_LEAD_CASE and MULTIBYTE_CASES are not
   defined in the part of the file reviewed here.
   NOTE(review): presumably this lets the file be #included again with a
   different PREFIX -- confirm against the including file. */
01772 #undef DO_LEAD_CASE
01773 #undef MULTIBYTE_CASES
01774 #undef INVALID_CASES
01775 #undef CHECK_NAME_CASE
01776 #undef CHECK_NAME_CASES
01777 #undef CHECK_NMSTRT_CASE
01778 #undef CHECK_NMSTRT_CASES
01779