Back to index

python3.2  3.2.2
cjkcodecs.h
Go to the documentation of this file.
00001 /*
00002  * cjkcodecs.h: common header for cjkcodecs
00003  *
00004  * Written by Hye-Shik Chang <perky@FreeBSD.org>
00005  */
00006 
00007 #ifndef _CJKCODECS_H_
00008 #define _CJKCODECS_H_
00009 
00010 #define PY_SSIZE_T_CLEAN
00011 #include "Python.h"
00012 #include "multibytecodec.h"
00013 
00014 
00015 /* A Unicode "undefined" code point.  _TRYMAP_DEC treats this value in a
 * decode table as "no mapping for this input". */
00016 #define UNIINV  0xFFFE
00017 
00018 /* Internal-use DBCS code points which aren't used by any charsets.
 *   NOCHAR - value in an encode table that _TRYMAP_ENC treats as
 *            "no mapping for this code point".
 *   MULTIC - not referenced in this header; presumably marks an entry that
 *            needs multi-character handling (TODO confirm against users).
 *   DBCINV - returned by find_pairencmap() when no entry matches. */
00019 #define NOCHAR  0xFFFF
00020 #define MULTIC  0xFFFE
00021 #define DBCINV  0xFFFD
00022 
00023 /* Shorter macros to save source size of mapping tables.
 * NOTE(review): single-letter macros are visible to every file that
 * includes this header. */
00024 #define U UNIINV
00025 #define N NOCHAR
00026 #define M MULTIC
00027 #define D DBCINV
00028 
00029 struct dbcs_index {
00030     const ucs2_t *map;
00031     unsigned char bottom, top;
00032 };
00033 typedef struct dbcs_index decode_map;
00034 
00035 struct widedbcs_index {
00036     const ucs4_t *map;
00037     unsigned char bottom, top;
00038 };
00039 typedef struct widedbcs_index widedecode_map;
00040 
00041 struct unim_index {
00042     const DBCHAR *map;
00043     unsigned char bottom, top;
00044 };
00045 typedef struct unim_index encode_map;
00046 
/*
 * Encode-table slice whose `map` is a plain byte array.
 *
 * Same bottom/top convention as struct unim_index.  _TRYMAP_ENC_MPLANE
 * reads three consecutive bytes per covered value from a table with these
 * field names, starting at map[(v - bottom) * 3].
 */
struct unim_index_bytebased {
    const unsigned char *map;   /* byte entries for bottom..top, or NULL */
    unsigned char bottom;       /* lowest value covered by map */
    unsigned char top;          /* highest value covered by map */
};
00051 
/* A named charset mapping: the encode and/or decode table published by a
 * codec module.  register_maps() wraps each entry in a capsule and
 * importmap() reads encmap/decmap back out of it.  The MAPPING_* macros
 * initialize entries positionally, so the field order must not change;
 * either table pointer may be NULL (MAPPING_ENCONLY / MAPPING_DECONLY). */
00052 struct dbcs_map {
00053     const char *charset;
00054     const struct unim_index *encmap;
00055     const struct dbcs_index *decmap;
00056 };
00057 
/* Entry of a table searched by find_pairencmap(): `uniseq` is the search
 * key, built there as (body << 16) | modifier from two ucs2_t values, and
 * `code` is the DBCHAR returned on a match. */
00058 struct pair_encodemap {
00059     ucs4_t uniseq;
00060     DBCHAR code;
00061 };
00062 
/* Tentative definitions of the per-module lookup tables.  The
 * END_CODECS_LIST and END_MAPPINGS_LIST macros emit the initialized
 * definitions.  getcodec() walks codec_list and register_maps() walks
 * mapping_list, each up to an entry whose name is the empty string. */
00063 static const MultibyteCodec *codec_list;
00064 static const struct dbcs_map *mapping_list;
00065 
/* CODEC_INIT(encoding): emits the signature of <encoding>_codec_init(config);
 * the function body is written right after the macro invocation.
 * CODEC_STATELESS_WINIT() stores this function in the codec table. */
00066 #define CODEC_INIT(encoding)                                            \
00067     static int encoding##_codec_init(const void *config)
00068 
/* Signature generators for the encoder entry points of <encoding>
 * (<encoding>_encode_init, <encoding>_encode, <encoding>_encode_reset);
 * each invocation is followed by the function body.  The parameter names
 * are part of the contract: the buffer helper macros in this header
 * (NEXT_IN, NEXT_OUT, REQUIRE_INBUF, REQUIRE_OUTBUF, IN1, OUT1, ...) refer
 * to inbuf, inleft, outbuf and outleft by name. */
00069 #define ENCODER_INIT(encoding)                                          \
00070     static int encoding##_encode_init(                                  \
00071         MultibyteCodec_State *state, const void *config)
00072 #define ENCODER(encoding)                                               \
00073     static Py_ssize_t encoding##_encode(                                \
00074         MultibyteCodec_State *state, const void *config,                \
00075         const Py_UNICODE **inbuf, Py_ssize_t inleft,                    \
00076         unsigned char **outbuf, Py_ssize_t outleft, int flags)
00077 #define ENCODER_RESET(encoding)                                         \
00078     static Py_ssize_t encoding##_encode_reset(                          \
00079         MultibyteCodec_State *state, const void *config,                \
00080         unsigned char **outbuf, Py_ssize_t outleft)
00081 
/* Signature generators for the decoder entry points of <encoding>
 * (<encoding>_decode_init, <encoding>_decode, <encoding>_decode_reset);
 * each invocation is followed by the function body.  Compared with the
 * encoder signatures the buffer types are swapped: input is bytes
 * (const unsigned char **inbuf), output is Py_UNICODE **outbuf.  The
 * parameter names are relied on by the buffer helper macros in this
 * header. */
00082 #define DECODER_INIT(encoding)                                          \
00083     static int encoding##_decode_init(                                  \
00084         MultibyteCodec_State *state, const void *config)
00085 #define DECODER(encoding)                                               \
00086     static Py_ssize_t encoding##_decode(                                \
00087         MultibyteCodec_State *state, const void *config,                \
00088         const unsigned char **inbuf, Py_ssize_t inleft,                 \
00089         Py_UNICODE **outbuf, Py_ssize_t outleft)
00090 #define DECODER_RESET(encoding)                                         \
00091     static Py_ssize_t encoding##_decode_reset(                          \
00092         MultibyteCodec_State *state, const void *config)
00093 
/* UCS4INVALID(code): when Py_UNICODE is 4 bytes wide, makes the enclosing
 * function return 1 if `code` is above 0xFFFF; otherwise it expands to a
 * no-op statement.  The expansion is a complete, already-terminated `if`
 * statement, so do not use it as the unbraced body of another if/else. */
00094 #if Py_UNICODE_SIZE == 4
00095 #define UCS4INVALID(code)       \
00096     if ((code) > 0xFFFF)        \
00097     return 1;
00098 #else
00099 #define UCS4INVALID(code)       \
00100     if (0) ;
00101 #endif
00102 
/* Cursor helpers: advance *inbuf / *outbuf by the given number of units
 * and decrease the matching inleft / outleft counter.  Variables with
 * exactly those names must be in scope (the ENCODER/DECODER signatures
 * provide them).  Each macro expands to several complete statements that
 * already end in ';', so use it only inside a braced block. */
00103 #define NEXT_IN(i)                              \
00104     (*inbuf) += (i);                            \
00105     (inleft) -= (i);
00106 #define NEXT_OUT(o)                             \
00107     (*outbuf) += (o);                           \
00108     (outleft) -= (o);
00109 #define NEXT(i, o)                              \
00110     NEXT_IN(i) NEXT_OUT(o)
00111 
/* Bail-out guards: return MBERR_TOOFEW / MBERR_TOOSMALL from the enclosing
 * function when fewer than n units remain in the input / output buffer
 * (inleft / outleft).  The MBERR_* codes are not defined in this header
 * (presumably they come from multibytecodec.h).  Each expansion is a
 * complete `if` statement; keep it inside a braced block. */
00112 #define REQUIRE_INBUF(n)                        \
00113     if (inleft < (n))                           \
00114         return MBERR_TOOFEW;
00115 #define REQUIRE_OUTBUF(n)                       \
00116     if (outleft < (n))                          \
00117         return MBERR_TOOSMALL;
00118 
/* INn: the n-th unit (counting from 1) at the current input position.
 * Reading does not consume input and performs no bounds check. */
00119 #define IN1 ((*inbuf)[0])
00120 #define IN2 ((*inbuf)[1])
00121 #define IN3 ((*inbuf)[2])
00122 #define IN4 ((*inbuf)[3])
00123 
/* OUTn(c): store c into the n-th unit (counting from 1) at the current
 * output position.  No space check, no cursor advance; the expansion
 * already ends in ';'. */
00124 #define OUT1(c) ((*outbuf)[0]) = (c);
00125 #define OUT2(c) ((*outbuf)[1]) = (c);
00126 #define OUT3(c) ((*outbuf)[2]) = (c);
00127 #define OUT4(c) ((*outbuf)[3]) = (c);
00128 
/* WRITEn(c1..cn): first require n free output units (REQUIRE_OUTBUF makes
 * the enclosing function return MBERR_TOOSMALL otherwise), then store the
 * n values at the current output position.  The output cursor is NOT
 * advanced; follow up with NEXT_OUT/NEXT.  Each expansion is a sequence of
 * complete statements, so use it only inside a braced block. */
00129 #define WRITE1(c1)              \
00130     REQUIRE_OUTBUF(1)           \
00131     (*outbuf)[0] = (c1);
00132 #define WRITE2(c1, c2)          \
00133     REQUIRE_OUTBUF(2)           \
00134     (*outbuf)[0] = (c1);        \
00135     (*outbuf)[1] = (c2);
00136 #define WRITE3(c1, c2, c3)      \
00137     REQUIRE_OUTBUF(3)           \
00138     (*outbuf)[0] = (c1);        \
00139     (*outbuf)[1] = (c2);        \
00140     (*outbuf)[2] = (c3);
00141 #define WRITE4(c1, c2, c3, c4)  \
00142     REQUIRE_OUTBUF(4)           \
00143     (*outbuf)[0] = (c1);        \
00144     (*outbuf)[1] = (c2);        \
00145     (*outbuf)[2] = (c3);        \
00146     (*outbuf)[3] = (c4);
00147 
/* WRITEUCS4(c): append one code point to the output and advance the output
 * cursor (unlike WRITEn, NEXT_OUT is included).
 *  - Py_UNICODE_SIZE == 2: c is written as two units, a high surrogate
 *    (0xd800 + top bits) followed by a low surrogate (0xdc00 + low 10
 *    bits) of c - 0x10000.  NOTE(review): nothing here checks that
 *    c >= 0x10000; callers are presumably expected to guarantee it.
 *  - otherwise: c is stored as a single Py_UNICODE unit.
 * Returns MBERR_TOOSMALL from the enclosing function when the output
 * buffer lacks room (via REQUIRE_OUTBUF). */
00148 #if Py_UNICODE_SIZE == 2
00149 # define WRITEUCS4(c)                                           \
00150     REQUIRE_OUTBUF(2)                                           \
00151     (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10);            \
00152     (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff);          \
00153     NEXT_OUT(2)
00154 #else
00155 # define WRITEUCS4(c)                                           \
00156     REQUIRE_OUTBUF(1)                                           \
00157     **outbuf = (Py_UNICODE)(c);                                 \
00158     NEXT_OUT(1)
00159 #endif
00160 
/* Encode-table lookup.
 *   _TRYMAP_ENC(m, assi, val)
 *       Expression.  True when index entry m has a table (map != NULL),
 *       val lies within [bottom, top] and the table value is not NOCHAR.
 *       The table value is assigned to assi as part of the test, so assi
 *       is also overwritten (with NOCHAR) when the entry is unmapped.
 *   TRYMAP_ENC_COND(charset, assi, uni)
 *       The same test against <charset>_encmap, selecting the index entry
 *       with the high byte (uni >> 8) and looking up the low byte.
 *       NOTE(review): uni >> 8 is not range-checked here; presumably
 *       callers ensure uni <= 0xFFFF first -- confirm at call sites.
 *   TRYMAP_ENC(charset, assi, uni)
 *       An `if (...)` header built from TRYMAP_ENC_COND; the caller writes
 *       the controlled statement (and optional else) after it.
 * Macro arguments are expanded more than once; avoid side effects. */
00161 #define _TRYMAP_ENC(m, assi, val)                               \
00162     ((m)->map != NULL && (val) >= (m)->bottom &&                \
00163         (val)<= (m)->top && ((assi) = (m)->map[(val) -          \
00164         (m)->bottom]) != NOCHAR)
00165 #define TRYMAP_ENC_COND(charset, assi, uni)                     \
00166     _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
00167 #define TRYMAP_ENC(charset, assi, uni)                          \
00168     if TRYMAP_ENC_COND(charset, assi, uni)
00169 
/* Decode-table lookup.
 *   _TRYMAP_DEC(m, assi, val)
 *       Expression.  True when index entry m has a table (map != NULL),
 *       val lies within [bottom, top] and the table value is not UNIINV.
 *       The table value is assigned to assi as part of the test, so assi
 *       is also overwritten (with UNIINV) when the entry is unmapped.
 *   TRYMAP_DEC(charset, assi, c1, c2)
 *       An `if (...)` header: selects <charset>_decmap[c1] and looks c2 up
 *       in it; the caller writes the controlled statement after it.
 *       c1 is used as an array index without parentheses or range check.
 * Macro arguments are expanded more than once; avoid side effects. */
00170 #define _TRYMAP_DEC(m, assi, val)                               \
00171     ((m)->map != NULL && (val) >= (m)->bottom &&                \
00172         (val)<= (m)->top && ((assi) = (m)->map[(val) -          \
00173         (m)->bottom]) != UNIINV)
00174 #define TRYMAP_DEC(charset, assi, c1, c2)                       \
00175     if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
00176 
/* Multi-plane lookups.
 *   _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val)
 *       Expression.  The table holds three consecutive entries per covered
 *       value, starting at map[(val - bottom) * 3].  The first is assigned
 *       to assplane and must be non-zero for the lookup to succeed; only
 *       then are the second and third assigned to asshi and asslo (the
 *       `, 1` keeps those assignments from ending the && chain).
 *   TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni)
 *       `if (...)` header using <charset>_encmap indexed by the high byte
 *       of uni, with the low byte as the looked-up value.
 *   TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2)
 *       `if (...)` header: _TRYMAP_DEC on <charset>_decmap[plane][c1].
 * Macro arguments are expanded more than once; avoid side effects. */
00177 #define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val)      \
00178     ((m)->map != NULL && (val) >= (m)->bottom &&                \
00179         (val)<= (m)->top &&                                     \
00180         ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \
00181         (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \
00182         (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1))
00183 #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \
00184     if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \
00185                        assplane, asshi, asslo, (uni) & 0xff)
00186 #define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2)         \
00187     if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2)
00188 
/* Surrogate handling for encoder input.
 *   DECODE_SURROGATE(c)
 *       Py_UNICODE_SIZE == 2: if c is a high surrogate (0xD800-0xDBFF),
 *       require two input units (REQUIRE_INBUF returns MBERR_TOOFEW from
 *       the enclosing function otherwise); if the second unit (IN2) is a
 *       low surrogate (0xDC00-0xDFFF), replace c with the combined code
 *       point.  A high surrogate without a following low surrogate leaves
 *       c unchanged.  c must be an assignable variable wide enough to hold
 *       the result and is expanded several times.
 *       Otherwise: expands to an empty block.
 *   GET_INSIZE(c)
 *       Number of input units c occupied: 2 when c > 0xffff on 2-byte
 *       builds, else 1. */
00189 #if Py_UNICODE_SIZE == 2
00190 #define DECODE_SURROGATE(c)                                     \
00191     if (c >> 10 == 0xd800 >> 10) { /* high surrogate */         \
00192         REQUIRE_INBUF(2)                                        \
00193         if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \
00194             c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \
00195             ((ucs4_t)(IN2) - 0xdc00);                           \
00196         }                                                       \
00197     }
00198 #define GET_INSIZE(c)   ((c) > 0xffff ? 2 : 1)
00199 #else
00200 #define DECODE_SURROGATE(c) {;}
00201 #define GET_INSIZE(c)   1
00202 #endif
00203 
/* Builders for the module's charset map table (struct dbcs_map[]).
 * Usage: BEGIN_MAPPINGS_LIST, one MAPPING_* line per charset, then
 * END_MAPPINGS_LIST.
 *   MAPPING_ENCONLY(enc) - entry with <enc>_encmap only (decmap = NULL)
 *   MAPPING_DECONLY(enc) - entry with <enc>_decmap only (encmap = NULL)
 *   MAPPING_ENCDEC(enc)  - entry with both tables
 * The charset name is the stringified macro argument.  END_MAPPINGS_LIST
 * appends the ""-named sentinel that register_maps() stops at and defines
 * mapping_list to point at the array. */
00204 #define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
00205 #define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
00206 #define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
00207 #define MAPPING_ENCDEC(enc) {#enc, (void*)enc##_encmap, (void*)enc##_decmap},
00208 #define END_MAPPINGS_LIST                               \
00209     {"", NULL, NULL} };                                 \
00210     static const struct dbcs_map *mapping_list =        \
00211         (const struct dbcs_map *)_mapping_list;
00212 
/* Builders for the module's codec table (MultibyteCodec[]).
 * Usage: BEGIN_CODECS_LIST, one CODEC_* line per encoding, then
 * END_CODECS_LIST.  Every entry is initialized positionally:
 *   1. the encoding name (stringified argument; read by getcodec() as
 *      codec->encoding),
 *   2. NULL (field meaning not visible in this header -- see
 *      MultibyteCodec in multibytecodec.h),
 *   3. <enc>_codec_init for CODEC_STATELESS_WINIT, NULL otherwise,
 *   4-6. <enc>_encode, <enc>_encode_init, <enc>_encode_reset,
 *   7-9. <enc>_decode, <enc>_decode_init, <enc>_decode_reset,
 * where the *_init / *_reset slots are NULL for stateless codecs.
 * END_CODECS_LIST appends the ""-named sentinel that getcodec() stops at
 * and defines codec_list to point at the array. */
00213 #define BEGIN_CODECS_LIST static const MultibyteCodec _codec_list[] = {
00214 #define _STATEFUL_METHODS(enc)          \
00215     enc##_encode,                       \
00216     enc##_encode_init,                  \
00217     enc##_encode_reset,                 \
00218     enc##_decode,                       \
00219     enc##_decode_init,                  \
00220     enc##_decode_reset,
00221 #define _STATELESS_METHODS(enc)         \
00222     enc##_encode, NULL, NULL,           \
00223     enc##_decode, NULL, NULL,
00224 #define CODEC_STATEFUL(enc) {           \
00225     #enc, NULL, NULL,                   \
00226     _STATEFUL_METHODS(enc)              \
00227 },
00228 #define CODEC_STATELESS(enc) {          \
00229     #enc, NULL, NULL,                   \
00230     _STATELESS_METHODS(enc)             \
00231 },
00232 #define CODEC_STATELESS_WINIT(enc) {    \
00233     #enc, NULL,                         \
00234     enc##_codec_init,                   \
00235     _STATELESS_METHODS(enc)             \
00236 },
00237 #define END_CODECS_LIST                                 \
00238     {"", NULL,} };                                      \
00239     static const MultibyteCodec *codec_list =           \
00240         (const MultibyteCodec *)_codec_list;
00241 
00242 
00243 
00241 
00242 
00243 
00244 static PyObject *
00245 getmultibytecodec(void)
00246 {
00247     static PyObject *cofunc = NULL;
00248 
00249     if (cofunc == NULL) {
00250         PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec");
00251         if (mod == NULL)
00252             return NULL;
00253         cofunc = PyObject_GetAttrString(mod, "__create_codec");
00254         Py_DECREF(mod);
00255     }
00256     return cofunc;
00257 }
00258 
00259 static PyObject *
00260 getcodec(PyObject *self, PyObject *encoding)
00261 {
00262     PyObject *codecobj, *r, *cofunc;
00263     const MultibyteCodec *codec;
00264     const char *enc;
00265 
00266     if (!PyUnicode_Check(encoding)) {
00267         PyErr_SetString(PyExc_TypeError,
00268                         "encoding name must be a string.");
00269         return NULL;
00270     }
00271     enc = _PyUnicode_AsString(encoding);
00272     if (enc == NULL)
00273         return NULL;
00274 
00275     cofunc = getmultibytecodec();
00276     if (cofunc == NULL)
00277         return NULL;
00278 
00279     for (codec = codec_list; codec->encoding[0]; codec++)
00280         if (strcmp(codec->encoding, enc) == 0)
00281             break;
00282 
00283     if (codec->encoding[0] == '\0') {
00284         PyErr_SetString(PyExc_LookupError,
00285                         "no such codec is supported.");
00286         return NULL;
00287     }
00288 
00289     codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL);
00290     if (codecobj == NULL)
00291         return NULL;
00292 
00293     r = PyObject_CallFunctionObjArgs(cofunc, codecobj, NULL);
00294     Py_DECREF(codecobj);
00295 
00296     return r;
00297 }
00298 
/* Method table of the generated codec modules (referenced by the
 * PyModuleDef in I_AM_A_MODULE_FOR): one function, getcodec, taking a
 * single argument (METH_O) and carrying an empty docstring.  The table is
 * terminated by the NULL entry. */
00299 static struct PyMethodDef __methods[] = {
00300     {"getcodec", (PyCFunction)getcodec, METH_O, ""},
00301     {NULL, NULL},
00302 };
00303 
00304 static int
00305 register_maps(PyObject *module)
00306 {
00307     const struct dbcs_map *h;
00308 
00309     for (h = mapping_list; h->charset[0] != '\0'; h++) {
00310         char mhname[256] = "__map_";
00311         int r;
00312         strcpy(mhname + sizeof("__map_") - 1, h->charset);
00313         r = PyModule_AddObject(module, mhname,
00314                         PyCapsule_New((void *)h, PyMultibyteCodec_CAPSULE_NAME, NULL));
00315         if (r == -1)
00316             return -1;
00317     }
00318     return 0;
00319 }
00320 
00321 #ifdef USING_BINARY_PAIR_SEARCH
00322 static DBCHAR
00323 find_pairencmap(ucs2_t body, ucs2_t modifier,
00324                 const struct pair_encodemap *haystack, int haystacksize)
00325 {
00326     int pos, min, max;
00327     ucs4_t value = body << 16 | modifier;
00328 
00329     min = 0;
00330     max = haystacksize;
00331 
00332     for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1)
00333         if (value < haystack[pos].uniseq) {
00334             if (max == pos) break;
00335             else max = pos;
00336         }
00337         else if (value > haystack[pos].uniseq) {
00338             if (min == pos) break;
00339             else min = pos;
00340         }
00341         else
00342             break;
00343 
00344         if (value == haystack[pos].uniseq)
00345             return haystack[pos].code;
00346         else
00347             return DBCINV;
00348 }
00349 #endif
00350 
00351 #ifdef USING_IMPORTED_MAPS
/* IMPORT_MAP(locale, charset, encmap, decmap): fetch the map published as
 * "__map_<charset>" by module "_codecs_<locale>" and store its encode /
 * decode table pointers through encmap / decmap (pass NULL for a table
 * that is not wanted).  Evaluates to importmap()'s result: 0 on success,
 * -1 on failure. */
00352 #define IMPORT_MAP(locale, charset, encmap, decmap) \
00353     importmap("_codecs_" #locale, "__map_" #charset, \
00354               (const void**)encmap, (const void**)decmap)
00355 
00356 static int
00357 importmap(const char *modname, const char *symbol,
00358           const void **encmap, const void **decmap)
00359 {
00360     PyObject *o, *mod;
00361 
00362     mod = PyImport_ImportModule((char *)modname);
00363     if (mod == NULL)
00364         return -1;
00365 
00366     o = PyObject_GetAttrString(mod, (char*)symbol);
00367     if (o == NULL)
00368         goto errorexit;
00369     else if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) {
00370         PyErr_SetString(PyExc_ValueError,
00371                         "map data must be a Capsule.");
00372         goto errorexit;
00373     }
00374     else {
00375         struct dbcs_map *map;
00376         map = PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME);
00377         if (encmap != NULL)
00378             *encmap = map->encmap;
00379         if (decmap != NULL)
00380             *decmap = map->decmap;
00381         Py_DECREF(o);
00382     }
00383 
00384     Py_DECREF(mod);
00385     return 0;
00386 
00387 errorexit:
00388     Py_DECREF(mod);
00389     return -1;
00390 }
00391 #endif
00392 
/*
 * I_AM_A_MODULE_FOR(loc): emit the module definition and the init function
 * PyInit__codecs_<loc> for the extension module "_codecs_<loc>".
 *
 * The module exposes the functions in __methods and, through
 * register_maps(), one "__map_<charset>" attribute per entry of
 * mapping_list.  If registering the maps fails, the half-built module is
 * released and NULL is returned with the exception left set, instead of
 * handing the import machinery a module together with a pending error.
 */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef __module = {                              \
        PyModuleDef_HEAD_INIT,                                          \
        "_codecs_"#loc,                                                 \
        NULL,                                                           \
        0,                                                              \
        __methods,                                                      \
        NULL,                                                           \
        NULL,                                                           \
        NULL,                                                           \
        NULL                                                            \
    };                                                                  \
    PyObject*                                                           \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        PyObject *m = PyModule_Create(&__module);                       \
        if (m != NULL && register_maps(m) < 0) {                        \
            Py_DECREF(m);                                               \
            m = NULL;                                                   \
        }                                                               \
        return m;                                                       \
    }
00413 
00414 #endif