Back to index

python3.2  3.2.2
_codecs_iso2022.c
Go to the documentation of this file.
00001 /*
00002  * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
00003  *
00004  * Written by Hye-Shik Chang <perky@FreeBSD.org>
00005  */
00006 
00007 #define USING_IMPORTED_MAPS
00008 #define USING_BINARY_PAIR_SEARCH
00009 #define EXTERN_JISX0213_PAIR
00010 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
00011 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
00012 
00013 #include "cjkcodecs.h"
00014 #include "alg_jisx0201.h"
00015 #include "emu_jisx0213_2000.h"
00016 #include "mappings_jisx0213_pair.h"
00017 
00018 /* STATE
00019 
00020    state->c[0-3]
00021 
00022     00000000
00023     ||^^^^^|
00024     |+-----+----  G0-3 Character Set
00025     +-----------  Is G0-3 double byte?
00026 
00027    state->c[4]
00028 
00029     00000000
00030           ||
00031           |+----  Locked-Shift?
00032           +-----  ESC Throughout
00033 */
00034 
/* Control characters given special treatment by the codec below. */
00035 #define ESC                     0x1B
00036 #define SO                      0x0E
00037 #define SI                      0x0F
00038 #define LF                      0x0A
00039 
/* Upper bound on the number of bytes scanned when looking for the end of
 * an escape sequence. */
00040 #define MAX_ESCSEQLEN           16
00041 
/* Single-byte character sets: the value is a plain ASCII letter, which the
 * encoder also emits as the last byte of the designation escape. */
00042 #define CHARSET_ISO8859_1       'A'
00043 #define CHARSET_ASCII           'B'
00044 #define CHARSET_ISO8859_7       'F'
00045 #define CHARSET_JISX0201_K      'I'
00046 #define CHARSET_JISX0201_R      'J'
00047 
/* Double-byte character sets: the same kind of letter with CHARSET_DBCS
 * or'ed in.  CHARSET_DBCS is defined further down; that is fine because
 * macros are only expanded where they are used. */
00048 #define CHARSET_GB2312          ('A'|CHARSET_DBCS)
00049 #define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
00050 #define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
00051 #define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
00052 #define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
00053 #define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
00054 #define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
00055 #define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
00056 #define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
00057 #define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
00058 #define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
00059 
/* ESCMARK() strips the double-byte bit again, leaving the letter that is
 * written into the escape sequence. */
00060 #define CHARSET_DBCS            0x80
00061 #define ESCMARK(mark)           ((mark) & 0x7f)
00062 
/* IS_ESCEND: byte that terminates an escape sequence ('@' or 'A'..'Z').
 * IS_ISO2022ESC: second byte of the escape sequences handled here. */
00063 #define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
00064 #define IS_ISO2022ESC(c2) \
00065         ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
00066          (c2) == '.' || (c2) == '&')
00067     /* this is not a complete list of ISO-2022 escape sequence headers.
00068      * but, it's enough to implement CJK instances of iso-2022. */
00069 
/* Sentinel results of the per-charset encoder/decoder callbacks. */
00070 #define MAP_UNMAPPABLE          0xFFFF
00071 #define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
00072 
/* Bits kept in state->c[4]. */
00073 #define F_SHIFTED               0x01
00074 #define F_ESCTHROUGHOUT         0x02
00075 
/* Accessors for state->c[0..3] (the charset designated to G0..G3).  They
 * expect a variable named `state` in scope.  The setter supplies its own
 * trailing semicolon, so it is invoked without one. */
00076 #define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);
00077 #define STATE_GETG(dn)          ((state)->c[dn])
00078 
00079 #define STATE_G0                STATE_GETG(0)
00080 #define STATE_G1                STATE_GETG(1)
00081 #define STATE_G2                STATE_GETG(2)
00082 #define STATE_G3                STATE_GETG(3)
00083 #define STATE_SETG0(v)          STATE_SETG(0, v)
00084 #define STATE_SETG1(v)          STATE_SETG(1, v)
00085 #define STATE_SETG2(v)          STATE_SETG(2, v)
00086 #define STATE_SETG3(v)          STATE_SETG(3, v)
00087 
/* Accessors for the flag byte state->c[4].  The mutating macros also carry
 * their own trailing semicolon. */
00088 #define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
00089 #define STATE_GETFLAG(f)        ((state)->c[4] & (f))
00090 #define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
00091 #define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;
00092 
/* View of the opaque `config` pointer as this codec's configuration; these
 * expect a variable named `config` in scope. */
00093 #define ISO2022_CONFIG          ((const struct iso2022_config *)config)
00094 #define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
00095 #define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)
00096 
00097 /* iso2022_config.flags */
00098 #define NO_SHIFT                0x01
00099 #define USE_G2                  0x02
00100 #define USE_JISX0208_EXT        0x04
00101 
00102 /*-*- internal data structures -*-*/
00103 
/* Callback types stored in every designation table entry:
 *   init   - returns 0 on success, non-zero on failure
 *   decode - returns the decoded value or MAP_UNMAPPABLE
 *   encode - returns the encoded value, MAP_UNMAPPABLE or
 *            MAP_MULTIPLE_AVAIL; *length is in/out */
00104 typedef int (*iso2022_init_func)(void);
00105 typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
00106 typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);
00107 
/* One character set a codec can designate.  Tables of these end with an
 * entry whose mark is 0. */
00108 struct iso2022_designation {
00109     unsigned char mark;             /* CHARSET_* value; 0 ends the table */
00110     unsigned char plane;            /* 0 = G0, 1 = G1 (as used by the encoder) */
00111     unsigned char width;            /* bytes per character: 1 or 2 */
00112     iso2022_init_func initializer;  /* may be NULL */
00113     iso2022_decode_func decoder;
00114     iso2022_encode_func encoder;
00115 };
00116 
/* Per-encoding configuration, reached through the codec's `config`. */
00117 struct iso2022_config {
00118     int flags;                      /* NO_SHIFT | USE_G2 | USE_JISX0208_EXT */
00119     const struct iso2022_designation *designations; /* non-ascii desigs */
00120 };
00121 
00122 /*-*- iso-2022 codec implementation -*-*/
00123 
00124 CODEC_INIT(iso2022)
00125 {
00126     const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
00127     for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
00128         if (desig->initializer != NULL && desig->initializer() != 0)
00129             return -1;
00130     return 0;
00131 }
00132 
00133 ENCODER_INIT(iso2022)
00134 {
00135     STATE_CLEARFLAGS()
00136     STATE_SETG0(CHARSET_ASCII)
00137     STATE_SETG1(CHARSET_ASCII)
00138     return 0;
00139 }
00140 
00141 ENCODER_RESET(iso2022)
00142 {
00143     if (STATE_GETFLAG(F_SHIFTED)) {
00144         WRITE1(SI)
00145         NEXT_OUT(1)
00146         STATE_CLEARFLAG(F_SHIFTED)
00147     }
00148     if (STATE_G0 != CHARSET_ASCII) {
00149         WRITE3(ESC, '(', 'B')
00150         NEXT_OUT(3)
00151         STATE_SETG0(CHARSET_ASCII)
00152     }
00153     return 0;
00154 }
00155 
/*
 * Encode Unicode input as an ISO-2022 byte stream.
 *
 * For every input character the loop either takes the ASCII fast path or
 * asks each configured designation's encoder, in table order, until one
 * maps it.  It then emits the shift/designation sequences needed to make
 * that character set active, followed by the encoded byte(s).
 *
 * Returns 0 when all input has been consumed, 1 when no configured
 * designation can encode the current character, MBERR_TOOFEW when a
 * possible two-character combination needs one more input unit and the
 * caller is not flushing, and MBERR_INTERNAL for a designation on a plane
 * other than G0/G1.  The WRITE* macros are defined outside this file;
 * presumably they return early when the output buffer is too small.
 */
00156 ENCODER(iso2022)
00157 {
00158     while (inleft > 0) {
00159         const struct iso2022_designation *dsg;
00160         DBCHAR encoded;
00161         ucs4_t c = **inbuf;
00162         Py_ssize_t insize;
00163 
          /* ASCII fast path: make sure G0 is ASCII and no shift is in
           * effect, then copy the byte through. */
00164         if (c < 0x80) {
00165             if (STATE_G0 != CHARSET_ASCII) {
00166                 WRITE3(ESC, '(', 'B')
00167                 STATE_SETG0(CHARSET_ASCII)
00168                 NEXT_OUT(3)
00169             }
00170             if (STATE_GETFLAG(F_SHIFTED)) {
00171                 WRITE1(SI)
00172                 STATE_CLEARFLAG(F_SHIFTED)
00173                 NEXT_OUT(1)
00174             }
00175             WRITE1((unsigned char)c)
00176             NEXT(1, 1)
00177             continue;
00178         }
00179 
00180         DECODE_SURROGATE(c)
00181         insize = GET_INSIZE(c);
00182 
          /* Try the designations in table order; the first encoder that
           * returns something other than MAP_UNMAPPABLE wins. */
00183         encoded = MAP_UNMAPPABLE;
00184         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
00185             Py_ssize_t length = 1;
00186             encoded = dsg->encoder(&c, &length);
00187             if (encoded == MAP_MULTIPLE_AVAIL) {
00188                 /* this implementation won't work for pair
00189                  * of non-bmp characters. */
                  /* The encoder wants to see the following character
                   * too.  With fewer than two units left, either wait
                   * for more input or, when flushing, retry with
                   * length -1. */
00190                 if (inleft < 2) {
00191                     if (!(flags & MBENC_FLUSH))
00192                         return MBERR_TOOFEW;
00193                     length = -1;
00194                 }
00195                 else
00196                     length = 2;
00197 #if Py_UNICODE_SIZE == 2
00198                 if (length == 2) {
00199                     ucs4_t u4in[2];
00200                     u4in[0] = (ucs4_t)IN1;
00201                     u4in[1] = (ucs4_t)IN2;
00202                     encoded = dsg->encoder(u4in, &length);
00203                 } else
00204                     encoded = dsg->encoder(&c, &length);
00205 #else
00206                 encoded = dsg->encoder(&c, &length);
00207 #endif
                  /* The encoder reports through `length` how many input
                   * units it actually consumed. */
00208                 if (encoded != MAP_UNMAPPABLE) {
00209                     insize = length;
00210                     break;
00211                 }
00212             }
00213             else if (encoded != MAP_UNMAPPABLE)
00214                 break;
00215         }
00216 
          /* Reached the terminating entry: nothing can encode c. */
00217         if (!dsg->mark)
00218             return 1;
00219         assert(dsg->width == 1 || dsg->width == 2);
00220 
          /* Make the chosen character set the active one. */
00221         switch (dsg->plane) {
00222         case 0: /* G0 */
00223             if (STATE_GETFLAG(F_SHIFTED)) {
00224                 WRITE1(SI)
00225                 STATE_CLEARFLAG(F_SHIFTED)
00226                 NEXT_OUT(1)
00227             }
00228             if (STATE_G0 != dsg->mark) {
                  /* Single-byte sets use ESC ( F, JIS X 0208 uses the
                   * three-byte form ESC $ F, and every other double-byte
                   * set uses ESC $ ( F. */
00229                 if (dsg->width == 1) {
00230                     WRITE3(ESC, '(', ESCMARK(dsg->mark))
00231                     STATE_SETG0(dsg->mark)
00232                     NEXT_OUT(3)
00233                 }
00234                 else if (dsg->mark == CHARSET_JISX0208) {
00235                     WRITE3(ESC, '$', ESCMARK(dsg->mark))
00236                     STATE_SETG0(dsg->mark)
00237                     NEXT_OUT(3)
00238                 }
00239                 else {
00240                     WRITE4(ESC, '$', '(',
00241                         ESCMARK(dsg->mark))
00242                     STATE_SETG0(dsg->mark)
00243                     NEXT_OUT(4)
00244                 }
00245             }
00246             break;
00247         case 1: /* G1 */
              /* Designate to G1 if needed (ESC ) F or ESC $ ) F), then
               * shift out so that G1 is the set being written. */
00248             if (STATE_G1 != dsg->mark) {
00249                 if (dsg->width == 1) {
00250                     WRITE3(ESC, ')', ESCMARK(dsg->mark))
00251                     STATE_SETG1(dsg->mark)
00252                     NEXT_OUT(3)
00253                 }
00254                 else {
00255                     WRITE4(ESC, '$', ')',
00256                         ESCMARK(dsg->mark))
00257                     STATE_SETG1(dsg->mark)
00258                     NEXT_OUT(4)
00259                 }
00260             }
00261             if (!STATE_GETFLAG(F_SHIFTED)) {
00262                 WRITE1(SO)
00263                 STATE_SETFLAG(F_SHIFTED)
00264                 NEXT_OUT(1)
00265             }
00266             break;
00267         default: /* G2 and G3 is not supported: no encoding in
00268                   * CJKCodecs are using them yet */
00269             return MBERR_INTERNAL;
00270         }
00271 
          /* Emit the encoded character (high byte first for two-byte
           * sets) and consume the input it was produced from. */
00272         if (dsg->width == 1) {
00273             WRITE1((unsigned char)encoded)
00274             NEXT_OUT(1)
00275         }
00276         else {
00277             WRITE2(encoded >> 8, encoded & 0xff)
00278             NEXT_OUT(2)
00279         }
00280         NEXT_IN(insize)
00281     }
00282 
00283     return 0;
00284 }
00285 
00286 DECODER_INIT(iso2022)
00287 {
00288     STATE_CLEARFLAGS()
00289     STATE_SETG0(CHARSET_ASCII)
00290     STATE_SETG1(CHARSET_ASCII)
00291     STATE_SETG2(CHARSET_ASCII)
00292     return 0;
00293 }
00294 
00295 DECODER_RESET(iso2022)
00296 {
00297     STATE_SETG0(CHARSET_ASCII)
00298     STATE_CLEARFLAG(F_SHIFTED)
00299     return 0;
00300 }
00301 
00302 static Py_ssize_t
00303 iso2022processesc(const void *config, MultibyteCodec_State *state,
00304                   const unsigned char **inbuf, Py_ssize_t *inleft)
00305 {
00306     unsigned char charset, designation;
00307     Py_ssize_t i, esclen;
00308 
00309     for (i = 1;i < MAX_ESCSEQLEN;i++) {
00310         if (i >= *inleft)
00311             return MBERR_TOOFEW;
00312         if (IS_ESCEND((*inbuf)[i])) {
00313             esclen = i + 1;
00314             break;
00315         }
00316         else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
00317                  (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
00318             i += 2;
00319     }
00320 
00321     if (i >= MAX_ESCSEQLEN)
00322         return 1; /* unterminated escape sequence */
00323 
00324     switch (esclen) {
00325     case 3:
00326         if (IN2 == '$') {
00327             charset = IN3 | CHARSET_DBCS;
00328             designation = 0;
00329         }
00330         else {
00331             charset = IN3;
00332             if (IN2 == '(') designation = 0;
00333             else if (IN2 == ')') designation = 1;
00334             else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
00335                 designation = 2;
00336             else return 3;
00337         }
00338         break;
00339     case 4:
00340         if (IN2 != '$')
00341             return 4;
00342 
00343         charset = IN4 | CHARSET_DBCS;
00344         if (IN3 == '(') designation = 0;
00345         else if (IN3 == ')') designation = 1;
00346         else return 4;
00347         break;
00348     case 6: /* designation with prefix */
00349         if (CONFIG_ISSET(USE_JISX0208_EXT) &&
00350             (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
00351             (*inbuf)[5] == 'B') {
00352             charset = 'B' | CHARSET_DBCS;
00353             designation = 0;
00354         }
00355         else
00356             return 6;
00357         break;
00358     default:
00359         return esclen;
00360     }
00361 
00362     /* raise error when the charset is not designated for this encoding */
00363     if (charset != CHARSET_ASCII) {
00364         const struct iso2022_designation *dsg;
00365 
00366         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
00367             if (dsg->mark == charset)
00368                 break;
00369         if (!dsg->mark)
00370             return esclen;
00371     }
00372 
00373     STATE_SETG(designation, charset)
00374     *inleft -= esclen;
00375     (*inbuf) += esclen;
00376     return 0;
00377 }
00378 
/*
 * Decode one ISO-8859-7 byte value `c` into `assi`.
 *
 * Values below 0xa0, and the values in 0xa0..0xbf selected by the first bit
 * mask, map to themselves; the values in 0xb4..0xfe selected by the second
 * bit mask (all of 0xd4..0xfe) map to 0x02d0 + c; 0xa1, 0xa2 and 0xaf have
 * dedicated code points.
 *
 * The expansion is an if / else-if chain without a final else, so the
 * caller appends its own `else` branch for unmapped values.  `c` is
 * evaluated several times and must therefore be free of side effects.
 *
 * The bit tests are done in unsigned long: the shift count reaches 31, and
 * shifting a signed 1L that far is undefined where long is 32 bits wide.
 */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
00389 
00390 static Py_ssize_t
00391 iso2022processg2(const void *config, MultibyteCodec_State *state,
00392                  const unsigned char **inbuf, Py_ssize_t *inleft,
00393                  Py_UNICODE **outbuf, Py_ssize_t *outleft)
00394 {
00395     /* not written to use encoder, decoder functions because only few
00396      * encodings use G2 designations in CJKCodecs */
00397     if (STATE_G2 == CHARSET_ISO8859_1) {
00398         if (IN3 < 0x80)
00399             OUT1(IN3 + 0x80)
00400         else
00401             return 3;
00402     }
00403     else if (STATE_G2 == CHARSET_ISO8859_7) {
00404         ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
00405         else return 3;
00406     }
00407     else if (STATE_G2 == CHARSET_ASCII) {
00408         if (IN3 & 0x80) return 3;
00409         else **outbuf = IN3;
00410     }
00411     else
00412         return MBERR_INTERNAL;
00413 
00414     (*inbuf) += 3;
00415     *inleft -= 3;
00416     (*outbuf) += 1;
00417     *outleft -= 1;
00418     return 0;
00419 }
00420 
/*
 * Decode an ISO-2022 byte stream into Unicode.
 *
 * Each iteration looks at one input byte: escape sequences update the
 * designation state, SI/SO switch between G0 and G1, and any other byte
 * is decoded with the currently active character set.
 *
 * Returns 0 when all input has been consumed.  A positive return value is
 * produced for a byte >= 0x80 (1), for an undecodable character (the
 * width of the active character set), or is passed on from the escape
 * and G2 helpers; negative values from those helpers are passed on too.
 * REQUIRE_INBUF and the WRITE* macros are defined outside this file;
 * presumably they return early when input or output space is short.
 */
00421 DECODER(iso2022)
00422 {
      /* Remembers the designation found by the last table lookup. */
00423     const struct iso2022_designation *dsgcache = NULL;
00424 
00425     while (inleft > 0) {
00426         unsigned char c = IN1;
00427         Py_ssize_t err;
00428 
00429         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
00430             /* ESC throughout mode:
00431              * for non-iso2022 escape sequences */
              /* Bytes are copied through until one that ends an escape
               * sequence has been copied. */
00432             WRITE1(c) /* assume as ISO-8859-1 */
00433             NEXT(1, 1)
00434             if (IS_ESCEND(c)) {
00435                 STATE_CLEARFLAG(F_ESCTHROUGHOUT)
00436             }
00437             continue;
00438         }
00439 
00440         switch (c) {
00441         case ESC:
00442             REQUIRE_INBUF(2)
              /* Three cases: a designation sequence, a single shift to
               * G2 (only with USE_G2), or an unrecognised escape that is
               * passed through verbatim. */
00443             if (IS_ISO2022ESC(IN2)) {
00444                 err = iso2022processesc(config, state,
00445                                         inbuf, &inleft);
00446                 if (err != 0)
00447                     return err;
00448             }
00449             else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
00450                 REQUIRE_INBUF(3)
00451                 err = iso2022processg2(config, state,
00452                     inbuf, &inleft, outbuf, &outleft);
00453                 if (err != 0)
00454                     return err;
00455             }
00456             else {
00457                 WRITE1(ESC)
00458                 STATE_SETFLAG(F_ESCTHROUGHOUT)
00459                 NEXT(1, 1)
00460             }
00461             break;
00462         case SI:
              /* With NO_SHIFT the byte is ordinary data. */
00463             if (CONFIG_ISSET(NO_SHIFT))
00464                 goto bypass;
00465             STATE_CLEARFLAG(F_SHIFTED)
00466             NEXT_IN(1)
00467             break;
00468         case SO:
00469             if (CONFIG_ISSET(NO_SHIFT))
00470                 goto bypass;
00471             STATE_SETFLAG(F_SHIFTED)
00472             NEXT_IN(1)
00473             break;
00474         case LF:
              /* A line feed also ends shifted mode. */
00475             STATE_CLEARFLAG(F_SHIFTED)
00476             WRITE1(LF)
00477             NEXT(1, 1)
00478             break;
00479         default:
00480             if (c < 0x20) /* C0 */
00481                 goto bypass;
00482             else if (c >= 0x80)
00483                 return 1;
00484             else {
00485                 const struct iso2022_designation *dsg;
00486                 unsigned char charset;
00487                 ucs4_t decoded;
00488 
                  /* The shift state selects between G1 and G0. */
00489                 if (STATE_GETFLAG(F_SHIFTED))
00490                     charset = STATE_G1;
00491                 else
00492                     charset = STATE_G0;
00493 
                  /* `bypass` is also the target of the gotos above:
                   * the byte is copied to the output unchanged. */
00494                 if (charset == CHARSET_ASCII) {
00495 bypass:                                 WRITE1(c)
00496                                         NEXT(1, 1)
00497                                         break;
00498                                 }
00499 
                  /* Find the designation for the active charset, reusing
                   * the cached entry when it still matches.  Outside
                   * Py_DEBUG builds the scan has no end-of-table test,
                   * so it relies on the charset being in the table. */
00500                                 if (dsgcache != NULL &&
00501                                     dsgcache->mark == charset)
00502                                         dsg = dsgcache;
00503                                 else {
00504                                         for (dsg = CONFIG_DESIGNATIONS;
00505                                              dsg->mark != charset
00506 #ifdef Py_DEBUG
00507                                                 && dsg->mark != '\0'
00508 #endif
00509                                              ;dsg++)
00510                                                 /* noop */;
00511                                         assert(dsg->mark != '\0');
00512                                         dsgcache = dsg;
00513                                 }
00514 
00515                                 REQUIRE_INBUF(dsg->width)
00516                                 decoded = dsg->decoder(*inbuf);
00517                                 if (decoded == MAP_UNMAPPABLE)
00518                                         return dsg->width;
00519 
                  /* Three result ranges: below 0x10000 is one code
                   * unit, below 0x30000 goes through WRITEUCS4, and
                   * anything larger holds two code units packed into
                   * the high and low 16 bits. */
00520                                 if (decoded < 0x10000) {
00521                                         WRITE1(decoded)
00522                                         NEXT_OUT(1)
00523                                 }
00524                                 else if (decoded < 0x30000) {
00525                                         WRITEUCS4(decoded)
00526                                 }
00527                                 else { /* JIS X 0213 pairs */
00528                     WRITE2(decoded >> 16, decoded & 0xffff)
00529                     NEXT_OUT(2)
00530                 }
00531                 NEXT_IN(dsg->width)
00532             }
00533             break;
00534         }
00535     }
00536     return 0;
00537 }
00538 
00539 /*-*- mapping table holders -*-*/
00540 
/* Each use declares a file-local pointer named <enc>_encmap or
 * <enc>_decmap, initially NULL.  The *_init() functions below hand the
 * address of these pointers to IMPORT_MAP.  Both macros end in their own
 * semicolon, so they are invoked without one. */
00541 #define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
00542 #define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
00543 
00544 /* kr */
00545 ENCMAP(cp949)
00546 DECMAP(ksx1001)
00547 
00548 /* jp */
00549 ENCMAP(jisxcommon)
00550 DECMAP(jisx0208)
00551 DECMAP(jisx0212)
00552 ENCMAP(jisx0213_bmp)
00553 DECMAP(jisx0213_1_bmp)
00554 DECMAP(jisx0213_2_bmp)
00555 ENCMAP(jisx0213_emp)
00556 DECMAP(jisx0213_1_emp)
00557 DECMAP(jisx0213_2_emp)
00558 
00559 /* cn */
00560 ENCMAP(gbcommon)
00561 DECMAP(gb2312)
00562 
00563 /* tw */
00564 
00565 /*-*- mapping access functions -*-*/
00566 
00567 static int
00568 ksx1001_init(void)
00569 {
00570     static int initialized = 0;
00571 
00572     if (!initialized && (
00573                     IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
00574                     IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
00575         return -1;
00576     initialized = 1;
00577     return 0;
00578 }
00579 
00580 static ucs4_t
00581 ksx1001_decoder(const unsigned char *data)
00582 {
00583     ucs4_t u;
00584     TRYMAP_DEC(ksx1001, u, data[0], data[1])
00585         return u;
00586     else
00587         return MAP_UNMAPPABLE;
00588 }
00589 
00590 static DBCHAR
00591 ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
00592 {
00593     DBCHAR coded;
00594     assert(*length == 1);
00595     if (*data < 0x10000) {
00596         TRYMAP_ENC(cp949, coded, *data)
00597             if (!(coded & 0x8000))
00598                 return coded;
00599     }
00600     return MAP_UNMAPPABLE;
00601 }
00602 
00603 static int
00604 jisx0208_init(void)
00605 {
00606     static int initialized = 0;
00607 
00608     if (!initialized && (
00609                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
00610                     IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
00611         return -1;
00612     initialized = 1;
00613     return 0;
00614 }
00615 
00616 static ucs4_t
00617 jisx0208_decoder(const unsigned char *data)
00618 {
00619     ucs4_t u;
00620     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
00621         return 0xff3c;
00622     else TRYMAP_DEC(jisx0208, u, data[0], data[1])
00623         return u;
00624     else
00625         return MAP_UNMAPPABLE;
00626 }
00627 
00628 static DBCHAR
00629 jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
00630 {
00631     DBCHAR coded;
00632     assert(*length == 1);
00633     if (*data < 0x10000) {
00634         if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
00635             return 0x2140;
00636         else TRYMAP_ENC(jisxcommon, coded, *data) {
00637             if (!(coded & 0x8000))
00638                 return coded;
00639         }
00640     }
00641     return MAP_UNMAPPABLE;
00642 }
00643 
00644 static int
00645 jisx0212_init(void)
00646 {
00647     static int initialized = 0;
00648 
00649     if (!initialized && (
00650                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
00651                     IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
00652         return -1;
00653     initialized = 1;
00654     return 0;
00655 }
00656 
00657 static ucs4_t
00658 jisx0212_decoder(const unsigned char *data)
00659 {
00660     ucs4_t u;
00661     TRYMAP_DEC(jisx0212, u, data[0], data[1])
00662         return u;
00663     else
00664         return MAP_UNMAPPABLE;
00665 }
00666 
00667 static DBCHAR
00668 jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
00669 {
00670     DBCHAR coded;
00671     assert(*length == 1);
00672     if (*data < 0x10000) {
00673         TRYMAP_ENC(jisxcommon, coded, *data) {
00674             if (coded & 0x8000)
00675                 return coded & 0x7fff;
00676         }
00677     }
00678     return MAP_UNMAPPABLE;
00679 }
00680 
00681 static int
00682 jisx0213_init(void)
00683 {
00684     static int initialized = 0;
00685 
00686     if (!initialized && (
00687                     jisx0208_init() ||
00688                     IMPORT_MAP(jp, jisx0213_bmp,
00689                                &jisx0213_bmp_encmap, NULL) ||
00690                     IMPORT_MAP(jp, jisx0213_1_bmp,
00691                                NULL, &jisx0213_1_bmp_decmap) ||
00692                     IMPORT_MAP(jp, jisx0213_2_bmp,
00693                                NULL, &jisx0213_2_bmp_decmap) ||
00694                     IMPORT_MAP(jp, jisx0213_emp,
00695                                &jisx0213_emp_encmap, NULL) ||
00696                     IMPORT_MAP(jp, jisx0213_1_emp,
00697                                NULL, &jisx0213_1_emp_decmap) ||
00698                     IMPORT_MAP(jp, jisx0213_2_emp,
00699                                NULL, &jisx0213_2_emp_decmap) ||
00700                     IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
00701                                &jisx0213_pair_decmap)))
00702         return -1;
00703     initialized = 1;
00704     return 0;
00705 }
00706 
00707 #define config ((void *)2000)
00708 static ucs4_t
00709 jisx0213_2000_1_decoder(const unsigned char *data)
00710 {
00711     ucs4_t u;
00712     EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
00713     else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
00714         return 0xff3c;
00715     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
00716     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
00717     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
00718         u |= 0x20000;
00719     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
00720     else
00721         return MAP_UNMAPPABLE;
00722     return u;
00723 }
00724 
00725 static ucs4_t
00726 jisx0213_2000_2_decoder(const unsigned char *data)
00727 {
00728     ucs4_t u;
00729     EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
00730     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
00731     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
00732         u |= 0x20000;
00733     else
00734         return MAP_UNMAPPABLE;
00735     return u;
00736 }
00737 #undef config
00738 
00739 static ucs4_t
00740 jisx0213_2004_1_decoder(const unsigned char *data)
00741 {
00742     ucs4_t u;
00743     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
00744         return 0xff3c;
00745     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
00746     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
00747     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
00748         u |= 0x20000;
00749     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
00750     else
00751         return MAP_UNMAPPABLE;
00752     return u;
00753 }
00754 
00755 static ucs4_t
00756 jisx0213_2004_2_decoder(const unsigned char *data)
00757 {
00758     ucs4_t u;
00759     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
00760     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
00761         u |= 0x20000;
00762     else
00763         return MAP_UNMAPPABLE;
00764     return u;
00765 }
00766 
00767 static DBCHAR
00768 jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
00769 {
00770     DBCHAR coded;
00771 
00772     switch (*length) {
00773     case 1: /* first character */
00774         if (*data >= 0x10000) {
00775             if ((*data) >> 16 == 0x20000 >> 16) {
00776                 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
00777                 else TRYMAP_ENC(jisx0213_emp, coded,
00778                                 (*data) & 0xffff)
00779                     return coded;
00780             }
00781             return MAP_UNMAPPABLE;
00782         }
00783 
00784         EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
00785         else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
00786             if (coded == MULTIC)
00787                 return MAP_MULTIPLE_AVAIL;
00788         }
00789         else TRYMAP_ENC(jisxcommon, coded, *data) {
00790             if (coded & 0x8000)
00791                 return MAP_UNMAPPABLE;
00792         }
00793         else
00794             return MAP_UNMAPPABLE;
00795         return coded;
00796     case 2: /* second character of unicode pair */
00797         coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
00798                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
00799         if (coded == DBCINV) {
00800             *length = 1;
00801             coded = find_pairencmap((ucs2_t)data[0], 0,
00802                       jisx0213_pair_encmap, JISX0213_ENCPAIRS);
00803             if (coded == DBCINV)
00804                 return MAP_UNMAPPABLE;
00805         }
00806         else
00807             return coded;
00808     case -1: /* flush unterminated */
00809         *length = 1;
00810         coded = find_pairencmap((ucs2_t)data[0], 0,
00811                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
00812         if (coded == DBCINV)
00813             return MAP_UNMAPPABLE;
00814         else
00815             return coded;
00816     default:
00817         return MAP_UNMAPPABLE;
00818     }
00819 }
00820 
00821 static DBCHAR
00822 jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
00823 {
00824     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
00825     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
00826         return coded;
00827     else if (coded & 0x8000)
00828         return MAP_UNMAPPABLE;
00829     else
00830         return coded;
00831 }
00832 
00833 static DBCHAR
00834 jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
00835 {
00836     DBCHAR coded;
00837     Py_ssize_t ilength = *length;
00838 
00839     coded = jisx0213_encoder(data, length, (void *)2000);
00840     switch (ilength) {
00841     case 1:
00842         if (coded == MAP_MULTIPLE_AVAIL)
00843             return MAP_MULTIPLE_AVAIL;
00844         else
00845             return MAP_UNMAPPABLE;
00846     case 2:
00847         if (*length != 2)
00848             return MAP_UNMAPPABLE;
00849         else
00850             return coded;
00851     default:
00852         return MAP_UNMAPPABLE;
00853     }
00854 }
00855 
00856 static DBCHAR
00857 jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
00858 {
00859     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
00860     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
00861         return coded;
00862     else if (coded & 0x8000)
00863         return coded & 0x7fff;
00864     else
00865         return MAP_UNMAPPABLE;
00866 }
00867 
00868 static DBCHAR
00869 jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
00870 {
00871     DBCHAR coded = jisx0213_encoder(data, length, NULL);
00872     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
00873         return coded;
00874     else if (coded & 0x8000)
00875         return MAP_UNMAPPABLE;
00876     else
00877         return coded;
00878 }
00879 
00880 static DBCHAR
00881 jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
00882 {
00883     DBCHAR coded;
00884     Py_ssize_t ilength = *length;
00885 
00886     coded = jisx0213_encoder(data, length, NULL);
00887     switch (ilength) {
00888     case 1:
00889         if (coded == MAP_MULTIPLE_AVAIL)
00890             return MAP_MULTIPLE_AVAIL;
00891         else
00892             return MAP_UNMAPPABLE;
00893     case 2:
00894         if (*length != 2)
00895             return MAP_UNMAPPABLE;
00896         else
00897             return coded;
00898     default:
00899         return MAP_UNMAPPABLE;
00900     }
00901 }
00902 
00903 static DBCHAR
00904 jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
00905 {
00906     DBCHAR coded = jisx0213_encoder(data, length, NULL);
00907     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
00908         return coded;
00909     else if (coded & 0x8000)
00910         return coded & 0x7fff;
00911     else
00912         return MAP_UNMAPPABLE;
00913 }
00914 
00915 static ucs4_t
00916 jisx0201_r_decoder(const unsigned char *data)
00917 {
00918     ucs4_t u;
00919     JISX0201_R_DECODE(*data, u)
00920     else return MAP_UNMAPPABLE;
00921     return u;
00922 }
00923 
00924 static DBCHAR
00925 jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
00926 {
00927     DBCHAR coded;
00928     JISX0201_R_ENCODE(*data, coded)
00929     else return MAP_UNMAPPABLE;
00930     return coded;
00931 }
00932 
00933 static ucs4_t
00934 jisx0201_k_decoder(const unsigned char *data)
00935 {
00936     ucs4_t u;
00937     JISX0201_K_DECODE(*data ^ 0x80, u)
00938     else return MAP_UNMAPPABLE;
00939     return u;
00940 }
00941 
00942 static DBCHAR
00943 jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
00944 {
00945     DBCHAR coded;
00946     JISX0201_K_ENCODE(*data, coded)
00947     else return MAP_UNMAPPABLE;
00948     return coded - 0x80;
00949 }
00950 
00951 static int
00952 gb2312_init(void)
00953 {
00954     static int initialized = 0;
00955 
00956     if (!initialized && (
00957                     IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
00958                     IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
00959         return -1;
00960     initialized = 1;
00961     return 0;
00962 }
00963 
00964 static ucs4_t
00965 gb2312_decoder(const unsigned char *data)
00966 {
00967     ucs4_t u;
00968     TRYMAP_DEC(gb2312, u, data[0], data[1])
00969         return u;
00970     else
00971         return MAP_UNMAPPABLE;
00972 }
00973 
00974 static DBCHAR
00975 gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
00976 {
00977     DBCHAR coded;
00978     assert(*length == 1);
00979     if (*data < 0x10000) {
00980         TRYMAP_ENC(gbcommon, coded, *data) {
00981             if (!(coded & 0x8000))
00982                 return coded;
00983         }
00984     }
00985     return MAP_UNMAPPABLE;
00986 }
00987 
00988 
00989 static ucs4_t
00990 dummy_decoder(const unsigned char *data)
00991 {
00992     return MAP_UNMAPPABLE;
00993 }
00994 
00995 static DBCHAR
00996 dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
00997 {
00998     return MAP_UNMAPPABLE;
00999 }
01000 
01001 /*-*- registry tables -*-*/
01002 
/*
 * Designation initializers.
 *
 * Each REGISTRY_* macro expands to a brace initializer with six fields,
 * in this order:
 *   1. charset mark (one of the CHARSET_* constants),
 *   2. a small integer, 0..2 here (presumably the G0-G3 slot the set is
 *      designated to -- confirm against struct iso2022_designation),
 *   3. 1 or 2 (presumably the number of bytes per character -- confirm),
 *   4. an init function, or NULL when nothing has to be loaded,
 *   5. the decoder function,
 *   6. the encoder function.
 */

/* KS X 1001: same functions, second field 0 vs. 1. */
#define REGISTRY_KSX1001_G0                                             \
    { CHARSET_KSX1001, 0, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                             \
    { CHARSET_KSX1001, 1, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }

/* JIS X 0201 Roman / Katakana: no init function. */
#define REGISTRY_JISX0201_R                                             \
    { CHARSET_JISX0201_R, 0, 1,                                         \
      NULL, jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                             \
    { CHARSET_JISX0201_K, 0, 1,                                         \
      NULL, jisx0201_k_decoder, jisx0201_k_encoder }

/* JIS X 0208: two charset marks sharing one set of functions. */
#define REGISTRY_JISX0208                                               \
    { CHARSET_JISX0208, 0, 2,                                           \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                             \
    { CHARSET_JISX0208_O, 0, 2,                                         \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }

#define REGISTRY_JISX0212                                               \
    { CHARSET_JISX0212, 0, 2,                                           \
      jisx0212_init, jisx0212_decoder, jisx0212_encoder }

/* JIS X 0213:2000.  The _PAIRONLY entry differs only in its encoder. */
#define REGISTRY_JISX0213_2000_1                                        \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                               \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

/*
 * JIS X 0213:2004.  The _PAIRONLY entry differs only in its encoder.
 * Plane 2 uses the same CHARSET_JISX0213_2 mark as the 2000 variant.
 */
#define REGISTRY_JISX0213_2004_1                                        \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                               \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

#define REGISTRY_GB2312                                                 \
    { CHARSET_GB2312, 0, 2,                                             \
      gb2312_init, gb2312_decoder, gb2312_encoder }

/* CNS 11643 planes 1 and 2: second field 1 and 2 respectively. */
#define REGISTRY_CNS11643_1                                             \
    { CHARSET_CNS11643_1, 1, 2,                                         \
      cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                             \
    { CHARSET_CNS11643_2, 2, 2,                                         \
      cns11643_init, cns11643_2_decoder, cns11643_2_encoder }

/* ISO 8859 sets: wired to the dummy functions, which never map. */
#define REGISTRY_ISO8859_1                                              \
    { CHARSET_ISO8859_1, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                              \
    { CHARSET_ISO8859_7, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }

/* Terminator for designation arrays: first field is 0. */
#define REGISTRY_SENTINEL       { 0, }
/*
 * CONFIGDEF(variant, flags) defines
 *     static const struct iso2022_config iso2022_<variant>_config
 * initialized with `flags` followed by the previously defined
 * iso2022_<variant>_designations array.  The expansion already ends in
 * a semicolon, so call sites do not add one.
 */
#define CONFIGDEF(variant, flags)                                       \
    static const struct iso2022_config iso2022_##variant##_config = {   \
        flags, iso2022_##variant##_designations                         \
    };
01066 
01067 static const struct iso2022_designation iso2022_kr_designations[] = {
01068     REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
01069 };
01070 CONFIGDEF(kr, 0)
01071 
01072 static const struct iso2022_designation iso2022_jp_designations[] = {
01073     REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
01074     REGISTRY_SENTINEL
01075 };
01076 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
01077 
01078 static const struct iso2022_designation iso2022_jp_1_designations[] = {
01079     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
01080     REGISTRY_JISX0208_O, REGISTRY_SENTINEL
01081 };
01082 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
01083 
01084 static const struct iso2022_designation iso2022_jp_2_designations[] = {
01085     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
01086     REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
01087     REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
01088 };
01089 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
01090 
01091 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
01092     REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
01093     REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
01094 };
01095 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
01096 
01097 static const struct iso2022_designation iso2022_jp_3_designations[] = {
01098     REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
01099     REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
01100 };
01101 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
01102 
01103 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
01104     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
01105     REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
01106 };
01107 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
01108 
01109 
/*
 * Mapping list for this module.  It is deliberately left empty: the
 * file defines USING_IMPORTED_MAPS and, as gb2312_init() shows, obtains
 * its maps through IMPORT_MAP instead of defining tables here.
 */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST
01113 
/*
 * ISO2022_CODEC(name) expands to one codec-list entry followed by a
 * comma: the string "iso2022_<name>", the address of the matching
 * iso2022_<name>_config object, iso2022_codec_init, and the method set
 * produced by _STATEFUL_METHODS(iso2022).
 */
#define ISO2022_CODEC(name) {                   \
    "iso2022_" #name,                           \
    &iso2022_##name##_config,                   \
    iso2022_codec_init,                         \
    _STATEFUL_METHODS(iso2022)                  \
},
01120 
/*
 * Codecs provided by this module.  Each entry pairs the name
 * "iso2022_<variant>" with the iso2022_<variant>_config object defined
 * above via CONFIGDEF; every variant listed here therefore needs a
 * matching designation table and CONFIGDEF.
 */
BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST
01130 
/*
 * Module boilerplate for "iso2022".  NOTE(review): the macro is not
 * defined in this file (presumably it comes from cjkcodecs.h and emits
 * the extension-module definition and init function) -- confirm there.
 */
I_AM_A_MODULE_FOR(iso2022)