Back to index

lightning-sunbird  0.9+nobinonly
xmltok.c
Go to the documentation of this file.
00001 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
00002    See the file COPYING for copying permission.
00003 */
00004 
00005 #ifdef COMPILED_FROM_DSP
00006 #include "winconfig.h"
00007 #elif defined(MACOS_CLASSIC)
00008 #include "macconfig.h"
00009 #else
00010 #ifdef HAVE_EXPAT_CONFIG_H
00011 #include <expat_config.h>
00012 #endif
00013 #endif /* ndef COMPILED_FROM_DSP */
00014 
00015 #include "internal.h"
00016 #include "xmltok.h"
00017 #include "nametab.h"
00018 
/* Optional extra entry for the first brace group of VTABLE1: in XML_DTD
   builds it appends PREFIX(ignoreSectionTok); otherwise it expands to
   nothing. */
00019 #ifdef XML_DTD
00020 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
00021 #else
00022 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
00023 #endif
00024 
/* Initializer fragment listing the per-encoding functions produced for the
   current PREFIX.  PREFIX is defined further down in this file, so the
   names are resolved at the point where VTABLE1 is expanded, not here. */
00025 #define VTABLE1 \
00026   { PREFIX(prologTok), PREFIX(contentTok), \
00027     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
00028   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00029   PREFIX(sameName), \
00030   PREFIX(nameMatchesAscii), \
00031   PREFIX(nameLength), \
00032   PREFIX(skipS), \
00033   PREFIX(getAtts), \
00034   PREFIX(charRefNumber), \
00035   PREFIX(predefinedEntityName), \
00036   PREFIX(updatePosition), \
00037   PREFIX(isPublicId)
00038 
/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
00039 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00040 
/* Test the namingBitmap bit for the UCS-2 character whose high byte is hi
   and low byte is lo.  pages[hi] selects a group of eight bitmap words,
   the top 3 bits of lo pick the word inside the group and the bottom 5
   bits pick the bit.  The mask is built from an unsigned 1 so that bit 31
   can be tested without shifting into the sign bit of an int.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the bitmap, any other length yields 0.  p is viewed as unsigned
   bytes so that the shifts above never see a negative value.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
00073 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro expects p to point at unsigned bytes and evaluates to
   non-zero when the sequence starting at p is invalid.  The argument is
   fully parenthesized so that pointer expressions such as (buf + 1) are
   dereferenced as a whole.
*/

/* 2-byte sequence: lead byte must be >= 0xC2 (no overlong forms) and the
   trail byte must lie in 0x80..0xBF. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: checks the last byte (rejecting EF,BF,BE and EF,BF,BF),
   then the second byte with the tighter ranges required after E0
   (>= 0xA0) and ED (<= 0x9F). */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: bytes 3 and 4 must be plain trail bytes; the second
   byte has the tighter ranges required after F0 (>= 0x90) and F4
   (<= 0x8F). */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
00116 
00117 static int PTRFASTCALL
00118 isNever(const ENCODING *enc, const char *p)
00119 {
00120   return 0;
00121 }
00122 
00123 static int PTRFASTCALL
00124 utf8_isName2(const ENCODING *enc, const char *p)
00125 {
00126   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00127 }
00128 
00129 static int PTRFASTCALL
00130 utf8_isName3(const ENCODING *enc, const char *p)
00131 {
00132   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00133 }
00134 
00135 #define utf8_isName4 isNever
00136 
00137 static int PTRFASTCALL
00138 utf8_isNmstrt2(const ENCODING *enc, const char *p)
00139 {
00140   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00141 }
00142 
00143 static int PTRFASTCALL
00144 utf8_isNmstrt3(const ENCODING *enc, const char *p)
00145 {
00146   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00147 }
00148 
00149 #define utf8_isNmstrt4 isNever
00150 
00151 static int PTRFASTCALL
00152 utf8_isInvalid2(const ENCODING *enc, const char *p)
00153 {
00154   return UTF8_INVALID2((const unsigned char *)p);
00155 }
00156 
00157 static int PTRFASTCALL
00158 utf8_isInvalid3(const ENCODING *enc, const char *p)
00159 {
00160   return UTF8_INVALID3((const unsigned char *)p);
00161 }
00162 
00163 static int PTRFASTCALL
00164 utf8_isInvalid4(const ENCODING *enc, const char *p)
00165 {
00166   return UTF8_INVALID4((const unsigned char *)p);
00167 }
00168 
/* An ENCODING extended with a per-byte classification table and a set of
   character-class callbacks.  Instances in this file are filled with
   positional initializers, so the member order is significant. */
00169 struct normal_encoding {
/* Base object; placed first so a pointer to it can be cast back to the
   enclosing struct (see AS_NORMAL_ENCODING). */
00170   ENCODING enc;
/* Byte type, indexed by the byte value as an unsigned char
   (see SB_BYTE_TYPE). */
00171   unsigned char type[256];
/* Extra indirection used only in XML_MIN_SIZE builds. */
00172 #ifdef XML_MIN_SIZE
00173   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
00174   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
00175   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
00176   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
00177   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
00178 #endif /* XML_MIN_SIZE */
/* Name / name-start / invalid-sequence predicates for 2, 3 and 4 byte
   characters; reached through IS_NAME_CHAR, IS_NMSTRT_CHAR and
   IS_INVALID_CHAR. */
00179   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
00180   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
00181   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
00182   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
00183   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
00184   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
00185   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
00186   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
00187   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
00188 };
00189 
/* View an ENCODING pointer as the struct normal_encoding that embeds it. */
00190 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
00191 
/* Initializer fragment for the XML_MIN_SIZE-only members of
   struct normal_encoding, using functions named with the prefix E.  Note
   the trailing comma: it is meant to be followed by further initializers
   or the closing brace.  Expands to nothing in regular builds. */
00192 #ifdef XML_MIN_SIZE
00193 
00194 #define STANDARD_VTABLE(E) \
00195  E ## byteType, \
00196  E ## isNameMin, \
00197  E ## isNmstrtMin, \
00198  E ## byteToAscii, \
00199  E ## charMatches,
00200 
00201 #else
00202 
00203 #define STANDARD_VTABLE(E) /* as nothing */
00204 
00205 #endif
00206 
/* Initializer fragment for the nine isName / isNmstrt / isInvalid members,
   using functions named with the prefix E. */
00207 #define NORMAL_VTABLE(E) \
00208  E ## isName2, \
00209  E ## isName3, \
00210  E ## isName4, \
00211  E ## isNmstrt2, \
00212  E ## isNmstrt3, \
00213  E ## isNmstrt4, \
00214  E ## isInvalid2, \
00215  E ## isInvalid3, \
00216  E ## isInvalid4
00217 
/* Forward declaration; the definition is not in this part of the file. */
00218 static int FASTCALL checkCharRefNumber(int);
00219 
00220 #include "xmltok_impl.h"
00221 #include "ascii.h"
00222 
/* Parameter macros for the first inclusion of xmltok_impl.c (the
   "normal_" instantiation for one-byte-minimum encodings). */

/* In XML_MIN_SIZE builds the single-byte encodings fill the isNameMin and
   isNmstrtMin slots with isNever. */
00223 #ifdef XML_MIN_SIZE
00224 #define sb_isNameMin isNever
00225 #define sb_isNmstrtMin isNever
00226 #endif
00227 
00228 #ifdef XML_MIN_SIZE
00229 #define MINBPC(enc) ((enc)->minBytesPerChar)
00230 #else
00231 /* minimum bytes per character */
00232 #define MINBPC(enc) 1
00233 #endif
00234 
/* Classify the byte at p by table lookup; the byte is read as unsigned so
   values >= 0x80 index the upper half of type[]. */
00235 #define SB_BYTE_TYPE(enc, p) \
00236   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00237 
/* BYTE_TYPE: direct table lookup normally, a call through the byteType
   pointer in XML_MIN_SIZE builds. */
00238 #ifdef XML_MIN_SIZE
00239 static int PTRFASTCALL
00240 sb_byteType(const ENCODING *enc, const char *p)
00241 {
00242   return SB_BYTE_TYPE(enc, p);
00243 }
00244 #define BYTE_TYPE(enc, p) \
00245  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
00246 #else
00247 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00248 #endif
00249 
/* BYTE_TO_ASCII: for single-byte encodings this is simply the char at p. */
00250 #ifdef XML_MIN_SIZE
00251 #define BYTE_TO_ASCII(enc, p) \
00252  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
00253 static int PTRFASTCALL
00254 sb_byteToAscii(const ENCODING *enc, const char *p)
00255 {
00256   return *p;
00257 }
00258 #else
00259 #define BYTE_TO_ASCII(enc, p) (*(p))
00260 #endif
00261 
/* Multi-byte character predicates: n (2, 3 or 4) is pasted onto the member
   name, so IS_NAME_CHAR(enc, p, 2) calls the isName2 pointer, and so on. */
00262 #define IS_NAME_CHAR(enc, p, n) \
00263  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
00264 #define IS_NMSTRT_CHAR(enc, p, n) \
00265  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
00266 #define IS_INVALID_CHAR(enc, p, n) \
00267  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
00268 
/* Predicates for a character of exactly MINBPC bytes: constant 0 here,
   or a call through the struct in XML_MIN_SIZE builds. */
00269 #ifdef XML_MIN_SIZE
00270 #define IS_NAME_CHAR_MINBPC(enc, p) \
00271  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
00272 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00273  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
00274 #else
00275 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00276 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00277 #endif
00278 
/* CHAR_MATCHES: compare the character at p with the ASCII character c. */
00279 #ifdef XML_MIN_SIZE
00280 #define CHAR_MATCHES(enc, p, c) \
00281  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
00282 static int PTRCALL
00283 sb_charMatches(const ENCODING *enc, const char *p, int c)
00284 {
00285   return *p == c;
00286 }
00287 #else
00288 /* c is an ASCII character */
00289 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
00290 #endif
00291 
/* Instantiate the tokenizer template with every function named normal_*,
   then drop the parameter macros so a later inclusion can redefine them. */
00292 #define PREFIX(ident) normal_ ## ident
00293 #include "xmltok_impl.c"
00294 
00295 #undef MINBPC
00296 #undef BYTE_TYPE
00297 #undef BYTE_TO_ASCII
00298 #undef CHAR_MATCHES
00299 #undef IS_NAME_CHAR
00300 #undef IS_NAME_CHAR_MINBPC
00301 #undef IS_NMSTRT_CHAR
00302 #undef IS_NMSTRT_CHAR_MINBPC
00303 #undef IS_INVALID_CHAR
00304 
/* UTF8_cvalN is the value of the masked first byte of an N-byte UTF-8
   sequence, i.e. the marker bits OR-ed into the lead byte when encoding. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xC0,
  UTF8_cval3 = 0xE0,
  UTF8_cval4 = 0xF0
};
00311 
00312 static void PTRCALL
00313 utf8_toUtf8(const ENCODING *enc,
00314             const char **fromP, const char *fromLim,
00315             char **toP, const char *toLim)
00316 {
00317   char *to;
00318   const char *from;
00319   if (fromLim - *fromP > toLim - *toP) {
00320     /* Avoid copying partial characters. */
00321     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00322       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00323         break;
00324   }
00325   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00326     *to = *from;
00327   *fromP = from;
00328   *toP = to;
00329 }
00330 
00331 static void PTRCALL
00332 utf8_toUtf16(const ENCODING *enc,
00333              const char **fromP, const char *fromLim,
00334              unsigned short **toP, const unsigned short *toLim)
00335 {
00336   unsigned short *to = *toP;
00337   const char *from = *fromP;
00338   while (from != fromLim && to != toLim) {
00339     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00340     case BT_LEAD2:
00341       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
00342       from += 2;
00343       break;
00344     case BT_LEAD3:
00345       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
00346                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
00347       from += 3;
00348       break;
00349     case BT_LEAD4:
00350       {
00351         unsigned long n;
00352         if (to + 1 == toLim)
00353           goto after;
00354         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
00355             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00356         n -= 0x10000;
00357         to[0] = (unsigned short)((n >> 10) | 0xD800);
00358         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00359         to += 2;
00360         from += 4;
00361       }
00362       break;
00363     default:
00364       *to++ = *from++;
00365       break;
00366     }
00367   }
00368 after:
00369   *fromP = from;
00370   *toP = to;
00371 }
00372 
/* UTF-8 encoding objects.  Each combines VTABLE1 with the UTF-8
   converters, a byte-type table assembled from two included headers, and
   the utf8_ predicate functions.
   NOTE(review): the three integers after the converters are presumably
   the remaining ENCODING members (minBytesPerChar and two flags) - confirm
   against xmltok.h.
   The "_ns" variants exist only in XML_NS builds.  The non-"_ns" variants
   define BT_COLON as BT_NMSTRT while the ASCII table is included, so any
   table entry written as BT_COLON becomes BT_NMSTRT there.
   The "internal_" variants take the ASCII half from iasciitab.h instead of
   asciitab.h. */
00373 #ifdef XML_NS
00374 static const struct normal_encoding utf8_encoding_ns = {
00375   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00376   {
00377 #include "asciitab.h"
00378 #include "utf8tab.h"
00379   },
00380   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00381 };
00382 #endif
00383 
00384 static const struct normal_encoding utf8_encoding = {
00385   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00386   {
00387 #define BT_COLON BT_NMSTRT
00388 #include "asciitab.h"
00389 #undef BT_COLON
00390 #include "utf8tab.h"
00391   },
00392   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00393 };
00394 
00395 #ifdef XML_NS
00396 
00397 static const struct normal_encoding internal_utf8_encoding_ns = {
00398   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00399   {
00400 #include "iasciitab.h"
00401 #include "utf8tab.h"
00402   },
00403   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00404 };
00405 
00406 #endif
00407 
00408 static const struct normal_encoding internal_utf8_encoding = {
00409   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00410   {
00411 #define BT_COLON BT_NMSTRT
00412 #include "iasciitab.h"
00413 #undef BT_COLON
00414 #include "utf8tab.h"
00415   },
00416   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00417 };
00418 
00419 static void PTRCALL
00420 latin1_toUtf8(const ENCODING *enc,
00421               const char **fromP, const char *fromLim,
00422               char **toP, const char *toLim)
00423 {
00424   for (;;) {
00425     unsigned char c;
00426     if (*fromP == fromLim)
00427       break;
00428     c = (unsigned char)**fromP;
00429     if (c & 0x80) {
00430       if (toLim - *toP < 2)
00431         break;
00432       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
00433       *(*toP)++ = (char)((c & 0x3f) | 0x80);
00434       (*fromP)++;
00435     }
00436     else {
00437       if (*toP == toLim)
00438         break;
00439       *(*toP)++ = *(*fromP)++;
00440     }
00441   }
00442 }
00443 
00444 static void PTRCALL
00445 latin1_toUtf16(const ENCODING *enc,
00446                const char **fromP, const char *fromLim,
00447                unsigned short **toP, const unsigned short *toLim)
00448 {
00449   while (*fromP != fromLim && *toP != toLim)
00450     *(*toP)++ = (unsigned char)*(*fromP)++;
00451 }
00452 
/* Latin-1 encoding objects: VTABLE1 plus the Latin-1 converters and a
   byte-type table built from asciitab.h and latin1tab.h.  Only
   STANDARD_VTABLE is supplied, so the isName / isNmstrt / isInvalid
   pointers are left zero-initialized.
   The "_ns" variant exists only in XML_NS builds; the other variant
   defines BT_COLON as BT_NMSTRT while the ASCII table is included. */
00453 #ifdef XML_NS
00454 
00455 static const struct normal_encoding latin1_encoding_ns = {
00456   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00457   {
00458 #include "asciitab.h"
00459 #include "latin1tab.h"
00460   },
00461   STANDARD_VTABLE(sb_)
00462 };
00463 
00464 #endif
00465 
00466 static const struct normal_encoding latin1_encoding = {
00467   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00468   {
00469 #define BT_COLON BT_NMSTRT
00470 #include "asciitab.h"
00471 #undef BT_COLON
00472 #include "latin1tab.h"
00473   },
00474   STANDARD_VTABLE(sb_)
00475 };
00476 
00477 static void PTRCALL
00478 ascii_toUtf8(const ENCODING *enc,
00479              const char **fromP, const char *fromLim,
00480              char **toP, const char *toLim)
00481 {
00482   while (*fromP != fromLim && *toP != toLim)
00483     *(*toP)++ = *(*fromP)++;
00484 }
00485 
/* US-ASCII encoding objects: VTABLE1 plus ascii_toUtf8 and the Latin-1
   UTF-16 converter.  Only the ASCII half of the byte-type table is
   supplied; the remaining entries are zero-initialized, which the comment
   in the initializer identifies as BT_NONXML.
   The "_ns" variant exists only in XML_NS builds; the other variant
   defines BT_COLON as BT_NMSTRT while the ASCII table is included. */
00486 #ifdef XML_NS
00487 
00488 static const struct normal_encoding ascii_encoding_ns = {
00489   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00490   {
00491 #include "asciitab.h"
00492 /* BT_NONXML == 0 */
00493   },
00494   STANDARD_VTABLE(sb_)
00495 };
00496 
00497 #endif
00498 
00499 static const struct normal_encoding ascii_encoding = {
00500   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00501   {
00502 #define BT_COLON BT_NMSTRT
00503 #include "asciitab.h"
00504 #undef BT_COLON
00505 /* BT_NONXML == 0 */
00506   },
00507   STANDARD_VTABLE(sb_)
00508 };
00509 
00510 static int PTRFASTCALL
00511 unicode_byte_type(char hi, char lo)
00512 {
00513   switch ((unsigned char)hi) {
00514   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00515     return BT_LEAD4;
00516   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00517     return BT_TRAIL;
00518   case 0xFF:
00519     switch ((unsigned char)lo) {
00520     case 0xFF:
00521     case 0xFE:
00522       return BT_NONXML;
00523     }
00524     break;
00525   }
00526   return BT_NONASCII;
00527 }
00528 
/* Define E##toUtf8, which converts UTF-16 in [*fromP, fromLim) to UTF-8 in
   [*toP, toLim).  GET_LO / GET_HI must be defined at the point of
   expansion to fetch the low and high byte of the 16-bit unit at a
   pointer; that is what makes the result little- or big-endian.

   The generated function stops, leaving *fromP at the first unconverted
   unit, when the next character does not fit in the output or when a
   high surrogate is not followed by another unit inside the input.  A
   trailing odd byte is never consumed.  *toP is advanced as bytes are
   written.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static void  PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* Only whole 16-bit units can be converted: shrink to an even length. */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      /* U+0080..U+07FF: 11 bits divided 5, 6 amongst 2 bytes */ \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* High surrogate: needs 4 output bytes and the following unit. */ \
      if (toLim -  *toP < 4 || fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
00591 
/* Define E##toUtf16, which copies 16-bit units from the byte range
   [*fromP, fromLim) into [*toP, toLim), assembling each unit from GET_HI
   and GET_LO (which must be defined at the point of expansion).  Copying
   stops when either range is exhausted.  When the input is larger than the
   output and the last input unit has a high byte in D8..DF, the input
   limit is first pulled back by one unit.
   NOTE(review): the loop ends on *fromP != fromLim, so it relies on the
   input length being even - confirm callers guarantee this. */
00592 #define DEFINE_UTF16_TO_UTF16(E) \
00593 static void  PTRCALL \
00594 E ## toUtf16(const ENCODING *enc, \
00595              const char **fromP, const char *fromLim, \
00596              unsigned short **toP, const unsigned short *toLim) \
00597 { \
00598   /* Avoid copying first half only of surrogate */ \
00599   if (fromLim - *fromP > ((toLim - *toP) << 1) \
00600       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
00601     fromLim -= 2; \
00602   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
00603     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
00604 }
00605 
/* Little-endian byte accessors: the low byte is stored first.  SET2 stores
   a 16-bit value, GET_LO / GET_HI read the two halves.  They are defined
   only for the duration of the two expansions below, which generate
   little2_toUtf8 and little2_toUtf16. */
00606 #define SET2(ptr, ch) \
00607   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00608 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00609 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00610 
00611 DEFINE_UTF16_TO_UTF8(little2_)
00612 DEFINE_UTF16_TO_UTF16(little2_)
00613 
00614 #undef SET2
00615 #undef GET_LO
00616 #undef GET_HI
00617 
/* Big-endian byte accessors: the high byte is stored first.  The
   expansions below generate big2_toUtf8 and big2_toUtf16. */
00618 #define SET2(ptr, ch) \
00619   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00620 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00621 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00622 
00623 DEFINE_UTF16_TO_UTF8(big2_)
00624 DEFINE_UTF16_TO_UTF16(big2_)
00625 
00626 #undef SET2
00627 #undef GET_LO
00628 #undef GET_HI
00629 
/* Character helpers for little-endian UTF-16: (p)[0] is the low byte and
   (p)[1] the high byte of the unit at p.  All macro arguments are
   parenthesized so that expressions may be passed safely. */

/* Byte type of the unit at p: table lookup when the high byte is zero,
   otherwise classification by unicode_byte_type. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start tests through the UCS-2 naming bitmap. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
00640 
/* Two ways of providing the little-endian UTF-16 tokenizer.
   XML_MIN_SIZE: only thin function wrappers around the LITTLE2_* macros
   are defined here (they are referenced by STANDARD_VTABLE(little2_)),
   and VTABLE is redefined to pair VTABLE1 with the little2 converters.
   Otherwise: xmltok_impl.c is included again with PREFIX set to little2_
   and the parameter macros mapped to the LITTLE2_* versions. */
00641 #ifdef XML_MIN_SIZE
00642 
00643 static int PTRFASTCALL
00644 little2_byteType(const ENCODING *enc, const char *p)
00645 {
00646   return LITTLE2_BYTE_TYPE(enc, p);
00647 }
00648 
00649 static int PTRFASTCALL
00650 little2_byteToAscii(const ENCODING *enc, const char *p)
00651 {
00652   return LITTLE2_BYTE_TO_ASCII(enc, p);
00653 }
00654 
00655 static int PTRCALL
00656 little2_charMatches(const ENCODING *enc, const char *p, int c)
00657 {
00658   return LITTLE2_CHAR_MATCHES(enc, p, c);
00659 }
00660 
00661 static int PTRFASTCALL
00662 little2_isNameMin(const ENCODING *enc, const char *p)
00663 {
00664   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00665 }
00666 
00667 static int PTRFASTCALL
00668 little2_isNmstrtMin(const ENCODING *enc, const char *p)
00669 {
00670   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00671 }
00672 
00673 #undef VTABLE
00674 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00675 
00676 #else /* not XML_MIN_SIZE */
00677 
00678 #undef PREFIX
00679 #define PREFIX(ident) little2_ ## ident
00680 #define MINBPC(enc) 2
00681 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00682 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00683 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00684 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
/* Every character is exactly one MINBPC unit as far as the tokenizer is
   concerned, so the multi-byte predicates are constant 0. */
00685 #define IS_NAME_CHAR(enc, p, n) 0
00686 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00687 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00688 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00689 
00690 #include "xmltok_impl.c"
00691 
00692 #undef MINBPC
00693 #undef BYTE_TYPE
00694 #undef BYTE_TO_ASCII
00695 #undef CHAR_MATCHES
00696 #undef IS_NAME_CHAR
00697 #undef IS_NAME_CHAR_MINBPC
00698 #undef IS_NMSTRT_CHAR
00699 #undef IS_NMSTRT_CHAR_MINBPC
00700 #undef IS_INVALID_CHAR
00701 
00702 #endif /* not XML_MIN_SIZE */
00703 
/* Little-endian UTF-16 encoding objects.  The byte-type table covers the
   units whose high byte is zero (ASCII half plus latin1tab.h); other
   units are classified by unicode_byte_type.
   NOTE(review): the values following VTABLE are presumably
   minBytesPerChar (2) and two flags, the last of which is 1 only when
   BYTEORDER is 1234 - confirm the member meanings against xmltok.h.
   The "_ns" variants exist only in XML_NS builds; the other variants
   define BT_COLON as BT_NMSTRT while the ASCII table is included. */
00704 #ifdef XML_NS
00705 
00706 static const struct normal_encoding little2_encoding_ns = {
00707   { VTABLE, 2, 0,
00708 #if BYTEORDER == 1234
00709     1
00710 #else
00711     0
00712 #endif
00713   },
00714   {
00715 #include "asciitab.h"
00716 #include "latin1tab.h"
00717   },
00718   STANDARD_VTABLE(little2_)
00719 };
00720 
00721 #endif
00722 
00723 static const struct normal_encoding little2_encoding = {
00724   { VTABLE, 2, 0,
00725 #if BYTEORDER == 1234
00726     1
00727 #else
00728     0
00729 #endif
00730   },
00731   {
00732 #define BT_COLON BT_NMSTRT
00733 #include "asciitab.h"
00734 #undef BT_COLON
00735 #include "latin1tab.h"
00736   },
00737   STANDARD_VTABLE(little2_)
00738 };
00739 
/* The "internal_" little-endian objects are compiled unless BYTEORDER is
   4321; they use iasciitab.h for the ASCII half of the table. */
00740 #if BYTEORDER != 4321
00741 
00742 #ifdef XML_NS
00743 
00744 static const struct normal_encoding internal_little2_encoding_ns = {
00745   { VTABLE, 2, 0, 1 },
00746   {
00747 #include "iasciitab.h"
00748 #include "latin1tab.h"
00749   },
00750   STANDARD_VTABLE(little2_)
00751 };
00752 
00753 #endif
00754 
00755 static const struct normal_encoding internal_little2_encoding = {
00756   { VTABLE, 2, 0, 1 },
00757   {
00758 #define BT_COLON BT_NMSTRT
00759 #include "iasciitab.h"
00760 #undef BT_COLON
00761 #include "latin1tab.h"
00762   },
00763   STANDARD_VTABLE(little2_)
00764 };
00765 
00766 #endif
00767 
00768 
/* Character helpers for big-endian UTF-16: (p)[0] is the high byte and
   (p)[1] the low byte of the unit at p.  All macro arguments are
   parenthesized so that expressions may be passed safely. */

/* Byte type of the unit at p: table lookup when the high byte is zero,
   otherwise classification by unicode_byte_type. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name / name-start tests through the UCS-2 naming bitmap. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
00779 
/* Two ways of providing the big-endian UTF-16 tokenizer.
   XML_MIN_SIZE: only thin function wrappers around the BIG2_* macros are
   defined here (they are referenced by STANDARD_VTABLE(big2_)), and
   VTABLE is redefined to pair VTABLE1 with the big2 converters.
   Otherwise: xmltok_impl.c is included again with PREFIX set to big2_ and
   the parameter macros mapped to the BIG2_* versions. */
00780 #ifdef XML_MIN_SIZE
00781 
00782 static int PTRFASTCALL
00783 big2_byteType(const ENCODING *enc, const char *p)
00784 {
00785   return BIG2_BYTE_TYPE(enc, p);
00786 }
00787 
00788 static int PTRFASTCALL
00789 big2_byteToAscii(const ENCODING *enc, const char *p)
00790 {
00791   return BIG2_BYTE_TO_ASCII(enc, p);
00792 }
00793 
00794 static int PTRCALL
00795 big2_charMatches(const ENCODING *enc, const char *p, int c)
00796 {
00797   return BIG2_CHAR_MATCHES(enc, p, c);
00798 }
00799 
00800 static int PTRFASTCALL
00801 big2_isNameMin(const ENCODING *enc, const char *p)
00802 {
00803   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00804 }
00805 
00806 static int PTRFASTCALL
00807 big2_isNmstrtMin(const ENCODING *enc, const char *p)
00808 {
00809   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00810 }
00811 
00812 #undef VTABLE
00813 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00814 
00815 #else /* not XML_MIN_SIZE */
00816 
00817 #undef PREFIX
00818 #define PREFIX(ident) big2_ ## ident
00819 #define MINBPC(enc) 2
00820 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00821 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00822 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00823 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
/* Every character is exactly one MINBPC unit as far as the tokenizer is
   concerned, so the multi-byte predicates are constant 0. */
00824 #define IS_NAME_CHAR(enc, p, n) 0
00825 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00826 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00827 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00828 
00829 #include "xmltok_impl.c"
00830 
00831 #undef MINBPC
00832 #undef BYTE_TYPE
00833 #undef BYTE_TO_ASCII
00834 #undef CHAR_MATCHES
00835 #undef IS_NAME_CHAR
00836 #undef IS_NAME_CHAR_MINBPC
00837 #undef IS_NMSTRT_CHAR
00838 #undef IS_NMSTRT_CHAR_MINBPC
00839 #undef IS_INVALID_CHAR
00840 
00841 #endif /* not XML_MIN_SIZE */
00842 
/* Big-endian UTF-16 encoding objects, mirroring the little-endian ones.
   NOTE(review): the values following VTABLE are presumably
   minBytesPerChar (2) and two flags, the last of which is 1 only when
   BYTEORDER is 4321 - confirm the member meanings against xmltok.h.
   The "_ns" variants exist only in XML_NS builds; the other variants
   define BT_COLON as BT_NMSTRT while the ASCII table is included. */
00843 #ifdef XML_NS
00844 
00845 static const struct normal_encoding big2_encoding_ns = {
00846   { VTABLE, 2, 0,
00847 #if BYTEORDER == 4321
00848   1
00849 #else
00850   0
00851 #endif
00852   },
00853   {
00854 #include "asciitab.h"
00855 #include "latin1tab.h"
00856   },
00857   STANDARD_VTABLE(big2_)
00858 };
00859 
00860 #endif
00861 
00862 static const struct normal_encoding big2_encoding = {
00863   { VTABLE, 2, 0,
00864 #if BYTEORDER == 4321
00865   1
00866 #else
00867   0
00868 #endif
00869   },
00870   {
00871 #define BT_COLON BT_NMSTRT
00872 #include "asciitab.h"
00873 #undef BT_COLON
00874 #include "latin1tab.h"
00875   },
00876   STANDARD_VTABLE(big2_)
00877 };
00878 
/* The "internal_" big-endian objects are compiled unless BYTEORDER is
   1234; they use iasciitab.h for the ASCII half of the table. */
00879 #if BYTEORDER != 1234
00880 
00881 #ifdef XML_NS
00882 
00883 static const struct normal_encoding internal_big2_encoding_ns = {
00884   { VTABLE, 2, 0, 1 },
00885   {
00886 #include "iasciitab.h"
00887 #include "latin1tab.h"
00888   },
00889   STANDARD_VTABLE(big2_)
00890 };
00891 
00892 #endif
00893 
00894 static const struct normal_encoding internal_big2_encoding = {
00895   { VTABLE, 2, 0, 1 },
00896   {
00897 #define BT_COLON BT_NMSTRT
00898 #include "iasciitab.h"
00899 #undef BT_COLON
00900 #include "latin1tab.h"
00901   },
00902   STANDARD_VTABLE(big2_)
00903 };
00904 
00905 #endif
00906 
/* No more template instantiations follow in this part of the file. */
00907 #undef PREFIX
00908 
00909 static int FASTCALL
00910 streqci(const char *s1, const char *s2)
00911 {
00912   for (;;) {
00913     char c1 = *s1++;
00914     char c2 = *s2++;
00915     if (ASCII_a <= c1 && c1 <= ASCII_z)
00916       c1 += ASCII_A - ASCII_a;
00917     if (ASCII_a <= c2 && c2 <= ASCII_z)
00918       c2 += ASCII_A - ASCII_a;
00919     if (c1 != c2)
00920       return 0;
00921     if (!c1)
00922       break;
00923   }
00924   return 1;
00925 }
00926 
00927 static void PTRCALL
00928 initUpdatePosition(const ENCODING *enc, const char *ptr,
00929                    const char *end, POSITION *pos)
00930 {
00931   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00932 }
00933 
00934 static int
00935 toAscii(const ENCODING *enc, const char *ptr, const char *end)
00936 {
00937   char buf[1];
00938   char *p = buf;
00939   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00940   if (p == buf)
00941     return -1;
00942   else
00943     return buf[0];
00944 }
00945 
00946 static int FASTCALL
00947 isSpace(int c)
00948 {
00949   switch (c) {
00950   case 0x20:
00951   case 0xD:
00952   case 0xA:
00953   case 0x9:
00954     return 1;
00955   }
00956   return 0;
00957 }
00958 
00959 /* Return 1 if there's just optional white space or there's an S
00960    followed by name=val.
00961 */
/* Scans [ptr, end) one minBytesPerChar step at a time, looking at each
   character through toAscii.

   On success with a pseudo-attribute: *namePtr / *nameEndPtr delimit the
   name, *valPtr points at the first character after the opening quote and
   *nextTokPtr points just past the closing quote.
   On success with nothing but white space (or an empty range): *namePtr is
   set to NULL and the other outputs are left untouched.
   On failure: returns 0 with *nextTokPtr at the offending position. */
00962 static int
00963 parsePseudoAttribute(const ENCODING *enc,
00964                      const char *ptr,
00965                      const char *end,
00966                      const char **namePtr,
00967                      const char **nameEndPtr,
00968                      const char **valPtr,
00969                      const char **nextTokPtr)
00970 {
00971   int c;
00972   char open;
00973   if (ptr == end) {
00974     *namePtr = NULL;
00975     return 1;
00976   }
/* Anything that follows must be introduced by white space. */
00977   if (!isSpace(toAscii(enc, ptr, end))) {
00978     *nextTokPtr = ptr;
00979     return 0;
00980   }
00981   do {
00982     ptr += enc->minBytesPerChar;
00983   } while (isSpace(toAscii(enc, ptr, end)));
00984   if (ptr == end) {
00985     *namePtr = NULL;
00986     return 1;
00987   }
/* The name runs up to '=' or to white space that is followed by '='. */
00988   *namePtr = ptr;
00989   for (;;) {
00990     c = toAscii(enc, ptr, end);
00991     if (c == -1) {
00992       *nextTokPtr = ptr;
00993       return 0;
00994     }
00995     if (c == ASCII_EQUALS) {
00996       *nameEndPtr = ptr;
00997       break;
00998     }
00999     if (isSpace(c)) {
01000       *nameEndPtr = ptr;
01001       do {
01002         ptr += enc->minBytesPerChar;
01003       } while (isSpace(c = toAscii(enc, ptr, end)));
01004       if (c != ASCII_EQUALS) {
01005         *nextTokPtr = ptr;
01006         return 0;
01007       }
01008       break;
01009     }
01010     ptr += enc->minBytesPerChar;
01011   }
/* ptr is on the '='; if it is also where the name began, the name is
   empty. */
01012   if (ptr == *namePtr) {
01013     *nextTokPtr = ptr;
01014     return 0;
01015   }
/* Step over '=' and any white space; a quote character must follow. */
01016   ptr += enc->minBytesPerChar;
01017   c = toAscii(enc, ptr, end);
01018   while (isSpace(c)) {
01019     ptr += enc->minBytesPerChar;
01020     c = toAscii(enc, ptr, end);
01021   }
01022   if (c != ASCII_QUOT && c != ASCII_APOS) {
01023     *nextTokPtr = ptr;
01024     return 0;
01025   }
01026   open = (char)c;
01027   ptr += enc->minBytesPerChar;
01028   *valPtr = ptr;
/* The value may contain only ASCII letters, digits, '.', '-' and '_' and
   ends at the quote character that opened it. */
01029   for (;; ptr += enc->minBytesPerChar) {
01030     c = toAscii(enc, ptr, end);
01031     if (c == open)
01032       break;
01033     if (!(ASCII_a <= c && c <= ASCII_z)
01034         && !(ASCII_A <= c && c <= ASCII_Z)
01035         && !(ASCII_0 <= c && c <= ASCII_9)
01036         && c != ASCII_PERIOD
01037         && c != ASCII_MINUS
01038         && c != ASCII_UNDERSCORE) {
01039       *nextTokPtr = ptr;
01040       return 0;
01041     }
01042   }
01043   *nextTokPtr = ptr + enc->minBytesPerChar;
01044   return 1;
01045 }
01046 
/* Keywords used when parsing an XML declaration, spelled with the ASCII_*
   constants rather than character literals and NUL-terminated. */
01047 static const char KW_version[] = {
01048   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
01049 };
01050 
01051 static const char KW_encoding[] = {
01052   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
01053 };
01054 
01055 static const char KW_standalone[] = {
01056   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
01057   ASCII_n, ASCII_e, '\0'
01058 };
01059 
01060 static const char KW_yes[] = {
01061   ASCII_y, ASCII_e, ASCII_s,  '\0'
01062 };
01063 
01064 static const char KW_no[] = {
01065   ASCII_n, ASCII_o,  '\0'
01066 };
01067 
01068 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
/* The version string "1.0"; doParseXmlDecl compares the version value
   against it. */
01069 static const char KW_XML_1_0[] = {
01070   ASCII_1, ASCII_PERIOD, ASCII_0, '\0'
01071 };
01072 /* END MOZILLA CHANGE */
01073 
01074 static int
01075 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01076                                                  const char *,
01077                                                  const char *),
01078                int isGeneralTextEntity,
01079                const ENCODING *enc,
01080                const char *ptr,
01081                const char *end,
01082                const char **badPtr,
01083                const char **versionPtr,
01084                const char **versionEndPtr,
01085                const char **encodingName,
01086                const ENCODING **encoding,
01087                int *standalone)
01088 {
01089   const char *val = NULL;
01090   const char *name = NULL;
01091   const char *nameEnd = NULL;
01092   ptr += 5 * enc->minBytesPerChar;
01093   end -= 2 * enc->minBytesPerChar;
01094   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
01095       || !name) {
01096     *badPtr = ptr;
01097     return 0;
01098   }
01099   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
01100     if (!isGeneralTextEntity) {
01101       *badPtr = name;
01102       return 0;
01103     }
01104   }
01105   else {
01106     if (versionPtr)
01107       *versionPtr = val;
01108     if (versionEndPtr)
01109       *versionEndPtr = ptr;
01110 /* BEGIN MOZILLA CHANGE (http://bugzilla.mozilla.org/show_bug.cgi?id=62157) */
01111      /* Anything else but a version="1.0" is invalid for us, until we support later versions. */
01112      if (!XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_XML_1_0)) {
01113        *badPtr = val;
01114        return 0;
01115      }
01116 /* END MOZILLA CHANGE */
01117     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01118       *badPtr = ptr;
01119       return 0;
01120     }
01121     if (!name) {
01122       if (isGeneralTextEntity) {
01123         /* a TextDecl must have an EncodingDecl */
01124         *badPtr = ptr;
01125         return 0;
01126       }
01127       return 1;
01128     }
01129   }
01130   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
01131     int c = toAscii(enc, val, end);
01132     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
01133       *badPtr = val;
01134       return 0;
01135     }
01136     if (encodingName)
01137       *encodingName = val;
01138     if (encoding)
01139       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01140     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01141       *badPtr = ptr;
01142       return 0;
01143     }
01144     if (!name)
01145       return 1;
01146   }
01147   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
01148       || isGeneralTextEntity) {
01149     *badPtr = name;
01150     return 0;
01151   }
01152   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
01153     if (standalone)
01154       *standalone = 1;
01155   }
01156   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
01157     if (standalone)
01158       *standalone = 0;
01159   }
01160   else {
01161     *badPtr = val;
01162     return 0;
01163   }
01164   while (isSpace(toAscii(enc, ptr, end)))
01165     ptr += enc->minBytesPerChar;
01166   if (ptr != end) {
01167     *badPtr = ptr;
01168     return 0;
01169   }
01170   return 1;
01171 }
01172 
01173 static int FASTCALL
01174 checkCharRefNumber(int result)
01175 {
01176   switch (result >> 8) {
01177   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01178   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01179     return -1;
01180   case 0:
01181     if (latin1_encoding.type[result] == BT_NONXML)
01182       return -1;
01183     break;
01184   case 0xFF:
01185     if (result == 0xFFFE || result == 0xFFFF)
01186       return -1;
01187     break;
01188   }
01189   return result;
01190 }
01191 
01192 int FASTCALL
01193 XmlUtf8Encode(int c, char *buf)
01194 {
01195   enum {
01196     /* minN is minimum legal resulting value for N byte sequence */
01197     min2 = 0x80,
01198     min3 = 0x800,
01199     min4 = 0x10000
01200   };
01201 
01202   if (c < 0)
01203     return 0;
01204   if (c < min2) {
01205     buf[0] = (char)(c | UTF8_cval1);
01206     return 1;
01207   }
01208   if (c < min3) {
01209     buf[0] = (char)((c >> 6) | UTF8_cval2);
01210     buf[1] = (char)((c & 0x3f) | 0x80);
01211     return 2;
01212   }
01213   if (c < min4) {
01214     buf[0] = (char)((c >> 12) | UTF8_cval3);
01215     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
01216     buf[2] = (char)((c & 0x3f) | 0x80);
01217     return 3;
01218   }
01219   if (c < 0x110000) {
01220     buf[0] = (char)((c >> 18) | UTF8_cval4);
01221     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
01222     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
01223     buf[3] = (char)((c & 0x3f) | 0x80);
01224     return 4;
01225   }
01226   return 0;
01227 }
01228 
01229 int FASTCALL
01230 XmlUtf16Encode(int charNum, unsigned short *buf)
01231 {
01232   if (charNum < 0)
01233     return 0;
01234   if (charNum < 0x10000) {
01235     buf[0] = (unsigned short)charNum;
01236     return 1;
01237   }
01238   if (charNum < 0x110000) {
01239     charNum -= 0x10000;
01240     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
01241     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
01242     return 2;
01243   }
01244   return 0;
01245 }
01246 
01247 struct unknown_encoding {
01248   struct normal_encoding normal;
01249   int (*convert)(void *userData, const char *p);
01250   void *userData;
01251   unsigned short utf16[256];
01252   char utf8[256][4];
01253 };
01254 
01255 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
01256 
01257 int
01258 XmlSizeOfUnknownEncoding(void)
01259 {
01260   return sizeof(struct unknown_encoding);
01261 }
01262 
01263 static int PTRFASTCALL
01264 unknown_isName(const ENCODING *enc, const char *p)
01265 {
01266   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01267   int c = uenc->convert(uenc->userData, p);
01268   if (c & ~0xFFFF)
01269     return 0;
01270   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01271 }
01272 
01273 static int PTRFASTCALL
01274 unknown_isNmstrt(const ENCODING *enc, const char *p)
01275 {
01276   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01277   int c = uenc->convert(uenc->userData, p);
01278   if (c & ~0xFFFF)
01279     return 0;
01280   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01281 }
01282 
01283 static int PTRFASTCALL
01284 unknown_isInvalid(const ENCODING *enc, const char *p)
01285 {
01286   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01287   int c = uenc->convert(uenc->userData, p);
01288   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01289 }
01290 
01291 static void PTRCALL
01292 unknown_toUtf8(const ENCODING *enc,
01293                const char **fromP, const char *fromLim,
01294                char **toP, const char *toLim)
01295 {
01296   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01297   char buf[XML_UTF8_ENCODE_MAX];
01298   for (;;) {
01299     const char *utf8;
01300     int n;
01301     if (*fromP == fromLim)
01302       break;
01303     utf8 = uenc->utf8[(unsigned char)**fromP];
01304     n = *utf8++;
01305     if (n == 0) {
01306       int c = uenc->convert(uenc->userData, *fromP);
01307       n = XmlUtf8Encode(c, buf);
01308       if (n > toLim - *toP)
01309         break;
01310       utf8 = buf;
01311       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01312                  - (BT_LEAD2 - 2));
01313     }
01314     else {
01315       if (n > toLim - *toP)
01316         break;
01317       (*fromP)++;
01318     }
01319     do {
01320       *(*toP)++ = *utf8++;
01321     } while (--n != 0);
01322   }
01323 }
01324 
01325 static void PTRCALL
01326 unknown_toUtf16(const ENCODING *enc,
01327                 const char **fromP, const char *fromLim,
01328                 unsigned short **toP, const unsigned short *toLim)
01329 {
01330   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01331   while (*fromP != fromLim && *toP != toLim) {
01332     unsigned short c = uenc->utf16[(unsigned char)**fromP];
01333     if (c == 0) {
01334       c = (unsigned short)
01335           uenc->convert(uenc->userData, *fromP);
01336       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01337                  - (BT_LEAD2 - 2));
01338     }
01339     else
01340       (*fromP)++;
01341     *(*toP)++ = c;
01342   }
01343 }
01344 
01345 ENCODING *
01346 XmlInitUnknownEncoding(void *mem,
01347                        int *table,
01348                        CONVERTER convert, 
01349                        void *userData)
01350 {
01351   int i;
01352   struct unknown_encoding *e = (struct unknown_encoding *)mem;
01353   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
01354     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01355   for (i = 0; i < 128; i++)
01356     if (latin1_encoding.type[i] != BT_OTHER
01357         && latin1_encoding.type[i] != BT_NONXML
01358         && table[i] != i)
01359       return 0;
01360   for (i = 0; i < 256; i++) {
01361     int c = table[i];
01362     if (c == -1) {
01363       e->normal.type[i] = BT_MALFORM;
01364       /* This shouldn't really get used. */
01365       e->utf16[i] = 0xFFFF;
01366       e->utf8[i][0] = 1;
01367       e->utf8[i][1] = 0;
01368     }
01369     else if (c < 0) {
01370       if (c < -4)
01371         return 0;
01372       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
01373       e->utf8[i][0] = 0;
01374       e->utf16[i] = 0;
01375     }
01376     else if (c < 0x80) {
01377       if (latin1_encoding.type[c] != BT_OTHER
01378           && latin1_encoding.type[c] != BT_NONXML
01379           && c != i)
01380         return 0;
01381       e->normal.type[i] = latin1_encoding.type[c];
01382       e->utf8[i][0] = 1;
01383       e->utf8[i][1] = (char)c;
01384       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
01385     }
01386     else if (checkCharRefNumber(c) < 0) {
01387       e->normal.type[i] = BT_NONXML;
01388       /* This shouldn't really get used. */
01389       e->utf16[i] = 0xFFFF;
01390       e->utf8[i][0] = 1;
01391       e->utf8[i][1] = 0;
01392     }
01393     else {
01394       if (c > 0xFFFF)
01395         return 0;
01396       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01397         e->normal.type[i] = BT_NMSTRT;
01398       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01399         e->normal.type[i] = BT_NAME;
01400       else
01401         e->normal.type[i] = BT_OTHER;
01402       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01403       e->utf16[i] = (unsigned short)c;
01404     }
01405   }
01406   e->userData = userData;
01407   e->convert = convert;
01408   if (convert) {
01409     e->normal.isName2 = unknown_isName;
01410     e->normal.isName3 = unknown_isName;
01411     e->normal.isName4 = unknown_isName;
01412     e->normal.isNmstrt2 = unknown_isNmstrt;
01413     e->normal.isNmstrt3 = unknown_isNmstrt;
01414     e->normal.isNmstrt4 = unknown_isNmstrt;
01415     e->normal.isInvalid2 = unknown_isInvalid;
01416     e->normal.isInvalid3 = unknown_isInvalid;
01417     e->normal.isInvalid4 = unknown_isInvalid;
01418   }
01419   e->normal.enc.utf8Convert = unknown_toUtf8;
01420   e->normal.enc.utf16Convert = unknown_toUtf16;
01421   return &(e->normal.enc);
01422 }
01423 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed.
*/
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
01437 
01438 static const char KW_ISO_8859_1[] = {
01439   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
01440   ASCII_MINUS, ASCII_1, '\0'
01441 };
01442 static const char KW_US_ASCII[] = {
01443   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
01444   '\0'
01445 };
01446 static const char KW_UTF_8[] =  {
01447   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
01448 };
01449 static const char KW_UTF_16[] = {
01450   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
01451 };
01452 static const char KW_UTF_16BE[] = {
01453   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
01454   '\0'
01455 };
01456 static const char KW_UTF_16LE[] = {
01457   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
01458   '\0'
01459 };
01460 
01461 static int FASTCALL
01462 getEncodingIndex(const char *name)
01463 {
01464   static const char *encodingNames[] = {
01465     KW_ISO_8859_1,
01466     KW_US_ASCII,
01467     KW_UTF_8,
01468     KW_UTF_16,
01469     KW_UTF_16BE,
01470     KW_UTF_16LE,
01471   };
01472   int i;
01473   if (name == NULL)
01474     return NO_ENC;
01475   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
01476     if (streqci(name, encodingNames[i]))
01477       return i;
01478   return UNKNOWN_ENC;
01479 }
01480 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

/* Read the stored encoding index back as an int. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store index i, narrowed to char.  The argument is parenthesized so
   that the cast applies to the whole expression passed in. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
01487 
01488 /* This is what detects the encoding.  encodingTable maps from
01489    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
01490    the external (protocol) specified encoding; state is
01491    XML_CONTENT_STATE if we're parsing an external text entity, and
01492    XML_PROLOG_STATE otherwise.
01493 */
01494 
01495 
01496 static int
01497 initScan(const ENCODING **encodingTable,
01498          const INIT_ENCODING *enc,
01499          int state,
01500          const char *ptr,
01501          const char *end,
01502          const char **nextTokPtr)
01503 {
01504   const ENCODING **encPtr;
01505 
01506   if (ptr == end)
01507     return XML_TOK_NONE;
01508   encPtr = enc->encPtr;
01509   if (ptr + 1 == end) {
01510     /* only a single byte available for auto-detection */
01511 #ifndef XML_DTD /* FIXME */
01512     /* a well-formed document entity must have more than one byte */
01513     if (state != XML_CONTENT_STATE)
01514       return XML_TOK_PARTIAL;
01515 #endif
01516     /* so we're parsing an external text entity... */
01517     /* if UTF-16 was externally specified, then we need at least 2 bytes */
01518     switch (INIT_ENC_INDEX(enc)) {
01519     case UTF_16_ENC:
01520     case UTF_16LE_ENC:
01521     case UTF_16BE_ENC:
01522       return XML_TOK_PARTIAL;
01523     }
01524     switch ((unsigned char)*ptr) {
01525     case 0xFE:
01526     case 0xFF:
01527     case 0xEF: /* possibly first byte of UTF-8 BOM */
01528       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01529           && state == XML_CONTENT_STATE)
01530         break;
01531       /* fall through */
01532     case 0x00:
01533     case 0x3C:
01534       return XML_TOK_PARTIAL;
01535     }
01536   }
01537   else {
01538     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01539     case 0xFEFF:
01540       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01541           && state == XML_CONTENT_STATE)
01542         break;
01543       *nextTokPtr = ptr + 2;
01544       *encPtr = encodingTable[UTF_16BE_ENC];
01545       return XML_TOK_BOM;
01546     /* 00 3C is handled in the default case */
01547     case 0x3C00:
01548       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01549            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01550           && state == XML_CONTENT_STATE)
01551         break;
01552       *encPtr = encodingTable[UTF_16LE_ENC];
01553       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01554     case 0xFFFE:
01555       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01556           && state == XML_CONTENT_STATE)
01557         break;
01558       *nextTokPtr = ptr + 2;
01559       *encPtr = encodingTable[UTF_16LE_ENC];
01560       return XML_TOK_BOM;
01561     case 0xEFBB:
01562       /* Maybe a UTF-8 BOM (EF BB BF) */
01563       /* If there's an explicitly specified (external) encoding
01564          of ISO-8859-1 or some flavour of UTF-16
01565          and this is an external text entity,
01566          don't look for the BOM,
01567          because it might be a legal data.
01568       */
01569       if (state == XML_CONTENT_STATE) {
01570         int e = INIT_ENC_INDEX(enc);
01571         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
01572             || e == UTF_16LE_ENC || e == UTF_16_ENC)
01573           break;
01574       }
01575       if (ptr + 2 == end)
01576         return XML_TOK_PARTIAL;
01577       if ((unsigned char)ptr[2] == 0xBF) {
01578         *nextTokPtr = ptr + 3;
01579         *encPtr = encodingTable[UTF_8_ENC];
01580         return XML_TOK_BOM;
01581       }
01582       break;
01583     default:
01584       if (ptr[0] == '\0') {
01585         /* 0 isn't a legal data character. Furthermore a document
01586            entity can only start with ASCII characters.  So the only
01587            way this can fail to be big-endian UTF-16 if it it's an
01588            external parsed general entity that's labelled as
01589            UTF-16LE.
01590         */
01591         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01592           break;
01593         *encPtr = encodingTable[UTF_16BE_ENC];
01594         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01595       }
01596       else if (ptr[1] == '\0') {
01597         /* We could recover here in the case:
01598             - parsing an external entity
01599             - second byte is 0
01600             - no externally specified encoding
01601             - no encoding declaration
01602            by assuming UTF-16LE.  But we don't, because this would mean when
01603            presented just with a single byte, we couldn't reliably determine
01604            whether we needed further bytes.
01605         */
01606         if (state == XML_CONTENT_STATE)
01607           break;
01608         *encPtr = encodingTable[UTF_16LE_ENC];
01609         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01610       }
01611       break;
01612     }
01613   }
01614   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01615   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01616 }
01617 
01618 
01619 #define NS(x) x
01620 #define ns(x) x
01621 #include "xmltok_ns.c"
01622 #undef NS
01623 #undef ns
01624 
01625 #ifdef XML_NS
01626 
01627 #define NS(x) x ## NS
01628 #define ns(x) x ## _ns
01629 
01630 #include "xmltok_ns.c"
01631 
01632 #undef NS
01633 #undef ns
01634 
01635 ENCODING *
01636 XmlInitUnknownEncodingNS(void *mem,
01637                          int *table,
01638                          CONVERTER convert, 
01639                          void *userData)
01640 {
01641   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01642   if (enc)
01643     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
01644   return enc;
01645 }
01646 
01647 #endif /* XML_NS */
01648 
01649 /* BEGIN MOZILLA CHANGE (Mozilla extensions for QName checking) */
01650 #ifdef MOZILLA_CLIENT
01651 #include "moz_extensions.c"
01652 #endif /* MOZILLA_CLIENT */
01653 /* END MOZILLA CHANGE */