Back to index

php5  5.3.10
zend_multibyte.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | Zend Engine                                                          |
00004    +----------------------------------------------------------------------+
00005    | Copyright (c) 1998-2012 Zend Technologies Ltd. (http://www.zend.com) |
00006    +----------------------------------------------------------------------+
00007    | This source file is subject to version 2.00 of the Zend license,     |
00008    | that is bundled with this package in the file LICENSE, and is        | 
00009    | available through the world-wide-web at                              |
00010    | http://www.zend.com/license/2_00.txt.                                |
00011    | If you did not receive a copy of the Zend license and are unable to  |
00012    | obtain it through the world-wide-web, please send a note to          |
00013    | license@zend.com so we can mail you a copy immediately.              |
00014    +----------------------------------------------------------------------+
00015    | Authors: Masaki Fujimoto <fujimoto@php.net>                          |
00016    |          Rui Hirokawa <hirokawa@php.net>                             |
00017    +----------------------------------------------------------------------+
00018 */
00019 
00020 /* $Id: zend_multibyte.c 321634 2012-01-01 13:15:04Z felipe $ */
00021 
00022 #include "zend.h"
00023 #include "zend_compile.h"
00024 #include "zend_operators.h"
00025 #include "zend_multibyte.h"
00026 
00027 #ifdef ZEND_MULTIBYTE
00028 static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC);
00029 size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
00030 size_t sjis_output_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC);
00031 static char* zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size);
00032 static int zend_multibyte_parse_encoding_list(const char *encoding_list,
00033 size_t encoding_list_size, zend_encoding ***result, size_t *result_size);
00034 static zend_encoding *zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC);
00035 static zend_encoding *zend_multibyte_detect_unicode(TSRMLS_D);
00036 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC);
00037 
00038 /*
00039  * encodings
00040  */
00041 static const char *ucs2_aliases[] = {"ISO-10646-UCS-2", "UCS2" , "UNICODE", NULL};
00042 static zend_encoding encoding_ucs2 = {
00043        NULL,
00044        NULL,
00045        "UCS-2",
00046        (const char *(*)[])&ucs2_aliases,
00047        0
00048 };
00049 
00050 static zend_encoding encoding_ucs2be = {
00051        NULL,
00052        NULL,
00053        "UCS-2BE",
00054        NULL,
00055        0
00056 };
00057 
00058 static zend_encoding encoding_ucs2le = {
00059        NULL,
00060        NULL,
00061        "UCS-2LE",
00062        NULL,
00063        0
00064 };
00065 
00066 static const char *ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NULL};
00067 static zend_encoding encoding_ucs4 = {
00068        NULL,
00069        NULL,
00070        "UCS-4",
00071        (const char *(*)[])&ucs4_aliases,
00072        0
00073 };
00074 
00075 static zend_encoding encoding_ucs4be = {
00076        NULL,
00077        NULL,
00078        "UCS-4BE",
00079        NULL,
00080        0
00081 };
00082 
00083 static zend_encoding encoding_ucs4le = {
00084        NULL,
00085        NULL,
00086        "UCS-4LE",
00087        NULL,
00088        0
00089 };
00090 
00091 static const char *utf32_aliases[] = {"utf32", NULL};
00092 static zend_encoding encoding_utf32 = {
00093        NULL,
00094        NULL,
00095        "UTF-32",
00096        (const char *(*)[])&utf32_aliases,
00097        0
00098 };
00099 
00100 static zend_encoding encoding_utf32be = {
00101        NULL,
00102        NULL,
00103        "UTF-32BE",
00104        NULL,
00105        0
00106 };
00107 
00108 static zend_encoding encoding_utf32le = {
00109        NULL,
00110        NULL,
00111        "UTF-32LE",
00112        NULL,
00113        0
00114 };
00115 
00116 static const char *utf16_aliases[] = {"utf16", NULL};
00117 static zend_encoding encoding_utf16 = {
00118        NULL,
00119        NULL,
00120        "UTF-16",
00121        (const char *(*)[])&utf16_aliases,
00122        0
00123 };
00124 
00125 static zend_encoding encoding_utf16be = {
00126        NULL,
00127        NULL,
00128        "UTF-16BE",
00129        NULL,
00130        0
00131 };
00132 
00133 static zend_encoding encoding_utf16le = {
00134        NULL,
00135        NULL,
00136        "UTF-16LE",
00137        NULL,
00138        0
00139 };
00140 
00141 static const char *utf8_aliases[] = {"utf8", NULL};
00142 static zend_encoding encoding_utf8 = {
00143        NULL,
00144        NULL,
00145        "UTF-8",
00146        (const char *(*)[])&utf8_aliases,
00147        1
00148 };
00149 
00150 static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "cp367", "csASCII", NULL};
00151 static zend_encoding encoding_ascii = {
00152        NULL,
00153        NULL,
00154        "ASCII",
00155        (const char *(*)[])&ascii_aliases,
00156        1
00157 };
00158 
00159 static const char *euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL};
00160 static zend_encoding encoding_euc_jp = {
00161        NULL,
00162        NULL,
00163        "EUC-JP",
00164        (const char *(*)[])&euc_jp_aliases,
00165        1
00166 };
00167 
00168 static const char *sjis_aliases[] = {"x-sjis", "SJIS", "SHIFT-JIS", NULL};
00169 static zend_encoding encoding_sjis = {
00170        sjis_input_filter,
00171        sjis_output_filter,
00172        "Shift_JIS",
00173        (const char *(*)[])&sjis_aliases,
00174        0
00175 };
00176 
00177 static const char *eucjp_win_aliases[] = {"eucJP-open", NULL};
00178 static zend_encoding encoding_eucjp_win = {
00179        NULL,
00180        NULL,
00181        "eucJP-win",
00182        (const char *(*)[])&eucjp_win_aliases,
00183        1
00184 };
00185 
00186 static const char *sjis_win_aliases[] = {"SJIS-open", "MS_Kanji", "Windows-31J", "CP932", NULL};
00187 static zend_encoding encoding_sjis_win = {
00188        /* sjis-filters does not care about diffs of Shift_JIS and CP932 */
00189        sjis_input_filter,
00190        sjis_output_filter,
00191        "SJIS-win",
00192        (const char *(*)[])&sjis_win_aliases,
00193        0
00194 };
00195 
00196 static const char *jis_aliases[] = {"ISO-2022-JP", NULL};
00197 static zend_encoding encoding_jis = {
00198        NULL,
00199        NULL,
00200        "JIS",
00201        (const char *(*)[])&jis_aliases,
00202        0
00203 };
00204 
00205 static const char *euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL};
00206 static zend_encoding encoding_euc_cn = {
00207        NULL,
00208        NULL,
00209        "EUC-CN",
00210        (const char *(*)[])&euc_cn_aliases,
00211        1
00212 };
00213 
00214 static const char *cp936_aliases[] = {"CP-936", NULL};
00215 static zend_encoding encoding_cp936 = {
00216        NULL,
00217        NULL,
00218        "CP936",
00219        (const char *(*)[])&cp936_aliases,
00220        0
00221 };
00222 
00223 static const char *hz_aliases[] = {"HZ-GB-2312", NULL};
00224 static zend_encoding encoding_hz = {
00225        NULL,
00226        NULL,
00227        "HZ",
00228        (const char *(*)[])&hz_aliases,
00229        0
00230 };
00231 
00232 static const char *euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL};
00233 static zend_encoding encoding_euc_tw = {
00234        NULL,
00235        NULL,
00236        "EUC-TW",
00237        (const char *(*)[])&euc_tw_aliases,
00238        1
00239 };
00240 
00241 static const char *big5_aliases[] = {"BIG5", "CN-BIG5", "BIG-FIVE", "BIGFIVE", "CP950", NULL};
00242 static zend_encoding encoding_big5 = {
00243        NULL,
00244        NULL,
00245        "BIG-5",
00246        (const char *(*)[])&big5_aliases,
00247        0
00248 };
00249 
00250 static const char *euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL};
00251 static zend_encoding encoding_euc_kr = {
00252        NULL,
00253        NULL,
00254        "EUC-KR",
00255        (const char *(*)[])&euc_kr_aliases,
00256        1
00257 };
00258 
00259 static const char *uhc_aliases[] = {"CP949", NULL};
00260 static zend_encoding encoding_uhc = {
00261        NULL,
00262        NULL,
00263        "UHC",
00264        (const char *(*)[])&uhc_aliases,
00265        1
00266 };
00267 
00268 static zend_encoding encoding_2022kr = {
00269        NULL,
00270        NULL,
00271        "ISO-2022-KR",
00272        NULL,
00273        0
00274 };
00275 
00276 static const char *cp1252_aliases[] = {"cp1252", NULL};
00277 static zend_encoding encoding_cp1252 = {
00278        NULL,
00279        NULL,
00280        "Windows-1252",
00281        (const char *(*)[])&cp1252_aliases,
00282        1
00283 };
00284 
00285 static const char *iso_8859_1_aliases[] = {"ISO_8859-1", "latin1", NULL};
00286 static zend_encoding encoding_8859_1 = {
00287        NULL,
00288        NULL,
00289        "ISO-8859-1",
00290        (const char *(*)[])&iso_8859_1_aliases,
00291        1
00292 };
00293 
00294 static const char *iso_8859_2_aliases[] = {"ISO_8859-2", "latin2", NULL};
00295 static zend_encoding encoding_8859_2 = {
00296        NULL,
00297        NULL,
00298        "ISO-8859-2",
00299        (const char *(*)[])&iso_8859_2_aliases,
00300        1
00301 };
00302 
00303 static const char *iso_8859_3_aliases[] = {"ISO_8859-3", "latin3", NULL};
00304 static zend_encoding encoding_8859_3 = {
00305        NULL,
00306        NULL,
00307        "ISO-8859-3",
00308        (const char *(*)[])&iso_8859_3_aliases,
00309        1
00310 };
00311 
00312 static const char *iso_8859_4_aliases[] = {"ISO_8859-4", "latin4", NULL};
00313 static zend_encoding encoding_8859_4 = {
00314        NULL,
00315        NULL,
00316        "ISO-8859-4",
00317        (const char *(*)[])&iso_8859_4_aliases,
00318        1
00319 };
00320 
00321 static const char *iso_8859_5_aliases[] = {"ISO_8859-5", "cyrillic", NULL};
00322 static zend_encoding encoding_8859_5 = {
00323        NULL,
00324        NULL,
00325        "ISO-8859-5",
00326        (const char *(*)[])&iso_8859_5_aliases,
00327        1
00328 };
00329 
00330 static const char *iso_8859_6_aliases[] = {"ISO_8859-6", "arabic", NULL};
00331 static zend_encoding encoding_8859_6 = {
00332        NULL,
00333        NULL,
00334        "ISO-8859-6",
00335        (const char *(*)[])&iso_8859_6_aliases,
00336        1
00337 };
00338 
00339 static const char *iso_8859_7_aliases[] = {"ISO_8859-7", "greek", NULL};
00340 static zend_encoding encoding_8859_7 = {
00341        NULL,
00342        NULL,
00343        "ISO-8859-7",
00344        (const char *(*)[])&iso_8859_7_aliases,
00345        1
00346 };
00347 
00348 static const char *iso_8859_8_aliases[] = {"ISO_8859-8", "hebrew", NULL};
00349 static zend_encoding encoding_8859_8 = {
00350        NULL,
00351        NULL,
00352        "ISO-8859-8",
00353        (const char *(*)[])&iso_8859_8_aliases,
00354        1
00355 };
00356 
00357 static const char *iso_8859_9_aliases[] = {"ISO_8859-9", "latin5", NULL};
00358 static zend_encoding encoding_8859_9 = {
00359        NULL,
00360        NULL,
00361        "ISO-8859-9",
00362        (const char *(*)[])&iso_8859_9_aliases,
00363        1
00364 };
00365 
00366 static const char *iso_8859_10_aliases[] = {"ISO_8859-10", "latin6", NULL};
00367 static zend_encoding encoding_8859_10 = {
00368        NULL,
00369        NULL,
00370        "ISO-8859-10",
00371        (const char *(*)[])&iso_8859_10_aliases,
00372        1
00373 };
00374 
00375 static const char *iso_8859_13_aliases[] = {"ISO_8859-13", NULL};
00376 static zend_encoding encoding_8859_13 = {
00377        NULL,
00378        NULL,
00379        "ISO-8859-13",
00380        (const char *(*)[])&iso_8859_13_aliases,
00381        1
00382 };
00383 
00384 static const char *iso_8859_14_aliases[] = {"ISO_8859-14", "latin8", NULL};
00385 static zend_encoding encoding_8859_14 = {
00386        NULL,
00387        NULL,
00388        "ISO-8859-14",
00389        (const char *(*)[])&iso_8859_14_aliases,
00390        1
00391 };
00392 
00393 static const char *iso_8859_15_aliases[] = {"ISO_8859-15", NULL};
00394 static zend_encoding encoding_8859_15 = {
00395        NULL,
00396        NULL,
00397        "ISO-8859-15",
00398        (const char *(*)[])&iso_8859_15_aliases,
00399        1
00400 };
00401 
00402 static const char *cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
00403 static zend_encoding encoding_cp1251 = {
00404        NULL,
00405        NULL,
00406        "Windows-1251",
00407        (const char *(*)[])&cp1251_aliases,
00408        1
00409 };
00410 
00411 static const char *cp866_aliases[] = {"CP866", "CP-866", "IBM-866", NULL};
00412 static zend_encoding encoding_cp866 = {
00413        NULL,
00414        NULL,
00415        "CP866",
00416        (const char *(*)[])&cp866_aliases,
00417        1
00418 };
00419 
00420 static const char *koi8r_aliases[] = {"KOI8-R", "KOI8R", NULL};
00421 static zend_encoding encoding_koi8r = {
00422        NULL,
00423        NULL,
00424        "KOI8-R",
00425        (const char *(*)[])&koi8r_aliases,
00426        1
00427 };
00428 
00429 static const char *koi8u_aliases[] = {"KOI8-U", "KOI8U", NULL};
00430 static zend_encoding encoding_koi8u = {
00431        NULL,
00432        NULL,
00433        "KOI8-U",
00434        (const char *(*)[])&koi8u_aliases,
00435        1
00436 };
00437 
00438 static const char *cp1254_aliases[] = {"cp1254", NULL};
00439 static zend_encoding encoding_cp1254 = {
00440        NULL,
00441        NULL,
00442        "Windows-1254",
00443        (const char *(*)[])&cp1254_aliases,
00444        1
00445 };
00446 
00447 static const char *armscii8_aliases[] = { "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
00448 static zend_encoding encoding_armscii8 = {
00449        NULL,
00450        NULL,
00451        "ArmSCII-8",
00452        (const char *(*)[])&armscii8_aliases,
00453        1
00454 };
00455 
00456 static const char *cp850_aliases[] = {"IBM850", NULL};
00457 static zend_encoding encoding_cp850 = {
00458        NULL,
00459        NULL,
00460        "CP850",
00461        (const char *(*)[])&cp850_aliases,
00462        1
00463 };
00464 
00465 static zend_encoding *zend_encoding_table[] = {
00466        &encoding_ucs4,
00467        &encoding_ucs4be,
00468        &encoding_ucs4le,
00469        &encoding_ucs2,
00470        &encoding_ucs2be,
00471        &encoding_ucs2le,
00472        &encoding_utf32,
00473        &encoding_utf32be,
00474        &encoding_utf32le,
00475        &encoding_utf16,
00476        &encoding_utf16be,
00477        &encoding_utf16le,
00478        &encoding_utf8,
00479        &encoding_ascii,
00480        &encoding_euc_jp,
00481        &encoding_sjis,
00482        &encoding_eucjp_win,
00483        &encoding_sjis_win,
00484        &encoding_jis,
00485        &encoding_cp1252,
00486        &encoding_8859_1,
00487        &encoding_8859_2,
00488        &encoding_8859_3,
00489        &encoding_8859_4,
00490        &encoding_8859_5,
00491        &encoding_8859_6,
00492        &encoding_8859_7,
00493        &encoding_8859_8,
00494        &encoding_8859_9,
00495        &encoding_8859_10,
00496        &encoding_8859_13,
00497        &encoding_8859_14,
00498        &encoding_8859_15,
00499        &encoding_euc_cn,
00500        &encoding_cp936,
00501        &encoding_hz,
00502        &encoding_euc_tw,
00503        &encoding_big5,
00504        &encoding_euc_kr,
00505        &encoding_uhc,
00506        &encoding_2022kr,
00507        &encoding_cp1251,
00508        &encoding_cp866,
00509        &encoding_koi8r,
00510        &encoding_koi8u,
00511        &encoding_armscii8,
00512        &encoding_cp1254,
00513        &encoding_cp850,
00514        NULL
00515 };
00516 
00517 
00518 
00519 ZEND_API int zend_multibyte_set_script_encoding(const char *encoding_list,
00520 size_t encoding_list_size TSRMLS_DC)
00521 {
00522        if (CG(script_encoding_list)) {
00523               efree(CG(script_encoding_list));
00524               CG(script_encoding_list) = NULL;
00525        }
00526        CG(script_encoding_list_size) = 0;
00527 
00528        if (!encoding_list) {
00529               return 0;
00530        }
00531 
00532        zend_multibyte_parse_encoding_list(encoding_list, encoding_list_size, &(CG(script_encoding_list)), &(CG(script_encoding_list_size)));
00533 
00534        return 0;
00535 }
00536 
00537 
00538 ZEND_API int zend_multibyte_set_internal_encoding(const char *encoding_name TSRMLS_DC)
00539 {
00540        CG(internal_encoding) = zend_multibyte_fetch_encoding(encoding_name);
00541        return 0;
00542 }
00543 
00544 ZEND_API int zend_multibyte_set_functions(zend_encoding_detector encoding_detector, zend_encoding_converter encoding_converter, zend_encoding_oddlen encoding_oddlen TSRMLS_DC)
00545 {
00546        CG(encoding_detector) = encoding_detector;
00547        CG(encoding_converter) = encoding_converter;
00548        CG(encoding_oddlen) = encoding_oddlen;
00549        return 0;
00550 }
00551 
00552 
00553 ZEND_API int zend_multibyte_set_filter(zend_encoding *onetime_encoding TSRMLS_DC)
00554 {
00555        LANG_SCNG(script_encoding) = zend_multibyte_find_script_encoding(onetime_encoding TSRMLS_CC);
00556        LANG_SCNG(internal_encoding) = CG(internal_encoding);
00557 
00558        /* judge input/output filter */
00559        LANG_SCNG(input_filter) = NULL;
00560        LANG_SCNG(output_filter) = NULL;
00561 
00562        if (!LANG_SCNG(script_encoding)) {
00563               return 0;
00564        }
00565 
00566        if (!LANG_SCNG(internal_encoding) || LANG_SCNG(script_encoding) == LANG_SCNG(internal_encoding)) {
00567               /* if encoding specfic filters exist, use them */
00568               if (LANG_SCNG(script_encoding)->input_filter && LANG_SCNG(script_encoding)->output_filter) {
00569                      LANG_SCNG(input_filter) = LANG_SCNG(script_encoding)->input_filter;
00570                      LANG_SCNG(output_filter) = LANG_SCNG(script_encoding)->output_filter;
00571                      return 0;
00572               }
00573 
00574               if (!LANG_SCNG(script_encoding)->compatible) {
00575                      /* and if not, work around w/ script_encoding -> utf-8 -> script_encoding conversion */
00576                      LANG_SCNG(internal_encoding) = LANG_SCNG(script_encoding);
00577                      LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
00578                      LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
00579                      return 0;
00580               } else {
00581                      /* nothing to do in this case */
00582                      return 0;
00583               }
00584        }
00585 
00586        /* LANG_SCNG(internal_encoding) cannot be NULL here */
00587        if (LANG_SCNG(internal_encoding)->compatible) {
00588               LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
00589               return 0;
00590        } else if (LANG_SCNG(script_encoding)->compatible) {
00591               LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
00592               return 0;
00593        }
00594 
00595        /* both script and internal encodings are incompatible w/ flex */
00596        LANG_SCNG(input_filter) = zend_multibyte_script_encoding_filter;
00597        LANG_SCNG(output_filter) = zend_multibyte_internal_encoding_filter;
00598 
00599        return 0;
00600 }
00601 
00602 
00603 ZEND_API zend_encoding* zend_multibyte_fetch_encoding(const char *encoding_name)
00604 {
00605        int i, j;
00606        zend_encoding *encoding;
00607 
00608        if (!encoding_name) {
00609               return NULL;
00610        }
00611 
00612        for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
00613               if (zend_binary_strcasecmp(encoding->name, strlen(encoding->name), encoding_name, strlen(encoding_name)) == 0) {
00614                      return encoding;
00615               }
00616        }
00617 
00618        for (i = 0; (encoding = zend_encoding_table[i]) != NULL; i++) {
00619               if (encoding->aliases != NULL) {
00620                      for (j = 0; (*encoding->aliases)[j] != NULL; j++) {
00621                             if (zend_binary_strcasecmp((*encoding->aliases)[j], strlen((*encoding->aliases)[j]), encoding_name, strlen(encoding_name)) == 0) {
00622                                    return encoding;
00623                             }
00624                      }
00625               }
00626        }
00627 
00628        return NULL;
00629 }
00630 
00631 
00632 ZEND_API size_t zend_multibyte_script_encoding_filter(unsigned char **to, size_t
00633 *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
00634 {
00635        const char *name;
00636 
00637        if (LANG_SCNG(internal_encoding) == NULL || LANG_SCNG(internal_encoding)->compatible == 0) {
00638               name = "UTF-8";
00639        } else {
00640               name = LANG_SCNG(internal_encoding)->name;
00641        }
00642 
00643        return zend_multibyte_encoding_filter(to, to_length, name, from, from_length, LANG_SCNG(script_encoding)->name TSRMLS_CC);
00644 }
00645 
00646 ZEND_API size_t zend_multibyte_internal_encoding_filter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length TSRMLS_DC)
00647 {
00648        const char *name;
00649 
00650        if (LANG_SCNG(script_encoding)->compatible == 0) {
00651               name = "UTF-8";
00652        } else {
00653               name = LANG_SCNG(script_encoding)->name;
00654        }
00655 
00656        return zend_multibyte_encoding_filter(to, to_length, LANG_SCNG(internal_encoding)->name, from, from_length, name TSRMLS_CC);
00657 }
00658 
00659 static size_t zend_multibyte_encoding_filter(unsigned char **to, size_t *to_length, const char *to_encoding, const unsigned char *from, size_t from_length, const char *from_encoding TSRMLS_DC)
00660 {
00661        size_t oddlen;
00662 
00663        if (!CG(encoding_converter)) {
00664               return 0;
00665        }
00666 
00667        if (CG(encoding_oddlen)) {
00668               oddlen = CG(encoding_oddlen)(from, from_length, from_encoding TSRMLS_CC);
00669               if (oddlen > 0) {
00670                      from_length -= oddlen;
00671               }
00672        }
00673 
00674        if (CG(encoding_converter)(to, to_length, from, from_length, to_encoding, from_encoding TSRMLS_CC) != 0) {
00675               return 0;
00676        }
00677 
00678        return from_length;
00679 }
00680 
00681 
00682 /*
00683  *     Shift_JIS Input/Output Filter
00684  */
/*
 * Byte classification used by sjis_input_filter(), indexed by the full byte
 * value (256 entries):
 *   1 - handled as a single byte (the filter only consults the table for
 *       bytes >= 0x80, where class 1 covers 0xA0-0xDF)
 *   2 - lead byte of a 2-byte character (0x80-0x9F, 0xE0-0xEF)
 *   any other value (0xF0-0xFF hold 3, 4, 5 or 0) - handled by the filter's
 *       "user defined characters" branch
 */
static const unsigned char table_sjis[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 0, 0, 0
};
00703 
00704 size_t sjis_input_filter(unsigned char **buf, size_t *length, const unsigned char *sjis, size_t sjis_length TSRMLS_DC)
00705 {
00706        const unsigned char *p;
00707        unsigned char *q;
00708        unsigned char  c1, c2;
00709 
00710        *buf = (unsigned char*)emalloc(sjis_length * 3 / 2 + 1);
00711        if (!*buf)
00712               return 0;
00713        *length = 0;
00714 
00715        p = sjis;
00716        q = *buf;
00717 
00718        /* convert [SJIS -> EUC-JP] (for lex scan) -- some other better ways? */
00719        while (*p && (p - sjis) < sjis_length) {
00720               if (!(*p & 0x80)) {
00721                      *q++ = *p++;
00722                      continue;
00723               }
00724 
00725               /* handling 8 bit code */
00726               if (table_sjis[*p] == 1) {
00727                      /* 1 byte kana */
00728                      *q++ = 0x8e;
00729                      *q++ = *p++;
00730                      continue;
00731               }
00732 
00733               if (!*(p+1)) {
00734                      *q++ = *p++;
00735                      break;
00736               }
00737 
00738               if (table_sjis[*p] == 2) {
00739                      /* 2 byte kanji code */
00740                      c1 = *p++;
00741                      if (!*p || (p - sjis) >= sjis_length) {
00742                             break;
00743                      }
00744                      c2 = *p++;
00745                      c1 -= (c1 <= 0x9f) ? 0x71 : 0xb1;
00746                      c1 = (c1 << 1) + 1;
00747                      if (c2 >= 0x9e) {
00748                             c2 -= 0x7e;
00749                             c1++;
00750                      } else if (c2 > 0x7f) {
00751                             c2 -= 0x20;
00752                      } else {
00753                             c2 -= 0x1f;
00754                      }
00755 
00756                      c1 |= 0x80;
00757                      c2 |= 0x80;
00758 
00759                      *q++ = c1;
00760                      *q++ = c2;
00761               } else {
00762                      /*
00763                       * for user defined chars (ATTENTION)
00764                       *
00765                       * THESE ARE NOT CODE FOR CONVERSION! :-P
00766                       * (using *ILLEGALLY* 3byte EUC-JP space)
00767                       *
00768                       * we cannot perfectly (== 1 to 1)  convert these chars to EUC-JP.
00769                       * so, these code are for perfect RESTORING in sjis_output_filter()
00770                       */
00771                      c1 = *p++;
00772                      if (!*p || (p - sjis) >= sjis_length) {
00773                             break;
00774                      }
00775                      c2 = *p++;
00776                      *q++ = 0x8f;
00777                      /*
00778                       * MAP TO (EUC-JP):
00779                       * type A: 0xeba1 - 0xf4fe
00780                       * type B: 0xf5a1 - 0xfefe
00781                       * type C: 0xa1a1 - 0xa6fe
00782                       */
00783                      c1 -= (c1 > 0xf9) ? (0x79+0x71) : (0x0a+0xb1);
00784                      c1 = (c1 << 1) + 1;
00785                      if (c2 >= 0x9e) {
00786                             c2 -= 0x7e;
00787                             c1++;
00788                      } else if (c2 > 0x7f) {
00789                             c2 -= 0x20;
00790                      } else {
00791                             c2 -= 0x1f;
00792                      }
00793                      
00794                      c1 |= 0x80;
00795                      c2 |= 0x80;
00796 
00797                      *q++ = c1;
00798                      *q++ = c2;
00799               }
00800        }
00801        *q = '\0';
00802        *length = q - *buf;
00803 
00804        return *length;
00805 }
00806 
/*
 * Byte classification used by sjis_output_filter(), indexed by the full byte
 * value (256 entries). The filter only tests for class 2, which marks the
 * first byte of a 2-byte code (0xA1-0xFE; 0x8E also holds 2 but the filter
 * handles that byte before consulting the table). 0x8F holds 3; every other
 * entry is 1.
 */
static const unsigned char table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00825 
00826 size_t sjis_output_filter(unsigned char **sjis, size_t *sjis_length, const unsigned char *buf, size_t length TSRMLS_DC)
00827 {
00828        unsigned char c1, c2;
00829        unsigned char *p;
00830        const unsigned char *q;
00831 
00832        if (!sjis || !sjis_length) {
00833               return 0;
00834        }
00835 
00836        /* always Shift_JIS <= EUC-JP */
00837        *sjis = (unsigned char*)emalloc(length+1);
00838        if (!sjis) {
00839               return 0;
00840        }
00841        p = *sjis;
00842        q = buf;
00843 
00844        /* restore converted strings [EUC-JP -> Shift_JIS] */
00845        while (*q && (q - buf) < length) {
00846               if (!(*q & 0x80)) {
00847                      *p++ = *q++;
00848                      continue;
00849               }
00850 
00851               /* hankaku kana */
00852               if (*q == 0x8e) {
00853                      q++;
00854                      if (*q) {
00855                             *p++ = *q++;
00856                      }
00857                      continue;
00858               }
00859 
00860               /* 2 byte kanji code */
00861               if (table_eucjp[*q] == 2) {
00862                      c1 = (*q++ & ~0x80) & 0xff;
00863                      if (*q) {
00864                             c2 = (*q++ & ~0x80) & 0xff;
00865                      } else {
00866                             q--;
00867                             break;
00868                      }
00869 
00870                      c2 += (c1 & 0x01) ? 0x1f : 0x7d;
00871                      if (c2 >= 0x7f) {
00872                             c2++;
00873                      }
00874                      c1 = ((c1 - 0x21) >> 1) + 0x81;
00875                      if (c1 > 0x9f) {
00876                             c1 += 0x40;
00877                      }
00878                      
00879                      *p++ = c1;
00880                      *p++ = c2;
00881                      continue;
00882               }
00883 
00884               if (*q == 0x8f) {
00885                      q++;
00886                      if (*q) {
00887                             c1 = (*q++ & ~0x80) & 0xff;
00888                      } else {
00889                             q--;
00890                             break;
00891                      }
00892                      if (*q) {
00893                             c2 = (*q++ & ~0x80) & 0xff;
00894                      } else {
00895                             q -= 2;
00896                             break;
00897                      }
00898                      
00899                      c2 += (c1 & 0x01) ? 0x1f : 0x7d;
00900                      if (c2 >= 0x7f) {
00901                             c2++;
00902                      }
00903                      c1 = ((c1 - 0x21) >> 1) + 0x81;
00904                      if (c1 > 0x9f) {
00905                             c1 += 0x40;
00906                      }
00907                      
00908                      if (c1 >= 0x81 && c1 <= 0x9f) {
00909                             c1 += 0x79;
00910                      } else {
00911                             c1 += 0x0a;
00912                      }
00913                      
00914                      *p++ = c1;
00915                      *p++ = c2;
00916                      continue;
00917               }
00918 
00919               /* some other chars (may not happen) */
00920               *p++ = *q++;
00921        }
00922        *p = '\0';
00923        *sjis_length = p - *sjis;
00924 
00925        return q-buf; /* return length we actually read */
00926 }
00927 
00928 
00929 static char *zend_multibyte_assemble_encoding_list(zend_encoding **encoding_list, size_t encoding_list_size)
00930 {
00931        int i, list_size = 0;
00932        const char *name;
00933        char *list = NULL;
00934 
00935        if (!encoding_list || !encoding_list_size) {
00936               return NULL;
00937        }
00938 
00939        for (i = 0; i < encoding_list_size; i++) {
00940               name = (*(encoding_list+i))->name;
00941               if (name) {
00942                      list_size += strlen(name) + 1;
00943                      if (!list) {
00944                             list = (char*)emalloc(list_size);
00945                             if (!list) {
00946                                    return NULL;
00947                             }
00948                             *list = '\0';
00949                      } else {
00950                             list = (char*)erealloc(list, list_size);
00951                             if (!list) {
00952                                    return NULL;
00953                             }
00954                             strcat(list, ",");
00955                      }
00956                      strcat(list, name);
00957               }
00958        }
00959        return list;
00960 }
00961 
00962 
00963 static int zend_multibyte_parse_encoding_list(const char *encoding_list,
00964 size_t encoding_list_size, zend_encoding ***result, size_t *result_size)
00965 {
00966        int n, size;
00967        char *p, *p1, *p2, *endp, *tmpstr;
00968        zend_encoding **list, **entry, *encoding;
00969 
00970        list = NULL;
00971        if (encoding_list == NULL || encoding_list_size <= 0) {
00972               return -1;
00973        } else {
00974               /* copy the encoding_list string for work */
00975               tmpstr = (char *)estrndup(encoding_list, encoding_list_size);
00976               if (tmpstr == NULL) {
00977                      return -1;
00978               }
00979               /* count the number of listed encoding names */
00980               endp = tmpstr + encoding_list_size;
00981               n = 1;
00982               p1 = tmpstr;
00983               while ((p2 = zend_memnstr(p1, ",", 1, endp)) != NULL) {
00984                      p1 = p2 + 1;
00985                      n++;
00986               }
00987               size = n;
00988               /* make list */
00989               list = (zend_encoding**)ecalloc(size, sizeof(zend_encoding*));
00990               if (list != NULL) {
00991                      entry = list;
00992                      n = 0;
00993                      p1 = tmpstr;
00994                      do {
00995                             p2 = p = zend_memnstr(p1, ",", 1, endp);
00996                             if (p == NULL) {
00997                                    p = endp;
00998                             }
00999                             *p = '\0';
01000                             /* trim spaces */
01001                             while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
01002                                    p1++;
01003                             }
01004                             p--;
01005                             while (p > p1 && (*p == ' ' || *p == '\t')) {
01006                                    *p = '\0';
01007                                    p--;
01008                             }
01009                             /* convert to the encoding number and check encoding */
01010                             encoding = zend_multibyte_fetch_encoding(p1);
01011                             if (encoding)
01012                             {
01013                                    *entry++ = encoding;
01014                                    n++;
01015                             }
01016                             p1 = p2 + 1;
01017                      } while (n < size && p2 != NULL);
01018                      *result = list;
01019                      *result_size = n;
01020               }
01021               efree(tmpstr);
01022        }
01023 
01024        if (list == NULL) {
01025               return -1;
01026        }
01027 
01028        return 0;
01029 }
01030 
01031 
01032 static zend_encoding* zend_multibyte_find_script_encoding(zend_encoding *onetime_encoding TSRMLS_DC)
01033 {
01034        zend_encoding *script_encoding;
01035        char *name, *list;
01036 
01037        /* onetime_encoding is prior to everything */
01038        if (onetime_encoding != NULL) {
01039               return onetime_encoding;
01040        }
01041 
01042        if (CG(detect_unicode)) {
01043               /* check out bom(byte order mark) and see if containing wchars */
01044               script_encoding = zend_multibyte_detect_unicode(TSRMLS_C);
01045               if (script_encoding != NULL) {
01046                      /* bom or wchar detection is prior to 'script_encoding' option */
01047                      return script_encoding;
01048               }
01049        }
01050 
01051        /* if no script_encoding specified, just leave alone */
01052        if (!CG(script_encoding_list) || !CG(script_encoding_list_size)) {
01053               return NULL;
01054        }
01055 
01056        /* if multiple encodings specified, detect automagically */
01057        if (CG(script_encoding_list_size) > 1 && CG(encoding_detector)) {
01058               list = zend_multibyte_assemble_encoding_list(CG(script_encoding_list),
01059                             CG(script_encoding_list_size));
01060               name = CG(encoding_detector)(LANG_SCNG(script_org), 
01061                             LANG_SCNG(script_org_size), list TSRMLS_CC);
01062               if (list) {
01063                      efree(list);
01064               }
01065               if (name) {
01066                      script_encoding = zend_multibyte_fetch_encoding(name);
01067                      efree(name);
01068               } else {
01069                      script_encoding = NULL;
01070               }
01071               return script_encoding;
01072        }
01073 
01074        return *(CG(script_encoding_list));
01075 }
01076 
01077 
01078 static zend_encoding* zend_multibyte_detect_unicode(TSRMLS_D)
01079 {
01080        zend_encoding *script_encoding = NULL;
01081        int bom_size;
01082        unsigned char *script;
01083 
01084        if (LANG_SCNG(script_org_size) < sizeof(BOM_UTF32_LE)-1) {
01085               return NULL;
01086        }
01087 
01088        /* check out BOM */
01089        if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_BE, sizeof(BOM_UTF32_BE)-1)) {
01090               script_encoding = &encoding_utf32be;
01091               bom_size = sizeof(BOM_UTF32_BE)-1;
01092        } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF32_LE, sizeof(BOM_UTF32_LE)-1)) {
01093               script_encoding = &encoding_utf32le;
01094               bom_size = sizeof(BOM_UTF32_LE)-1;
01095        } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_BE, sizeof(BOM_UTF16_BE)-1)) {
01096               script_encoding = &encoding_utf16be;
01097               bom_size = sizeof(BOM_UTF16_BE)-1;
01098        } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF16_LE, sizeof(BOM_UTF16_LE)-1)) {
01099               script_encoding = &encoding_utf16le;
01100               bom_size = sizeof(BOM_UTF16_LE)-1;
01101        } else if (!memcmp(LANG_SCNG(script_org), BOM_UTF8, sizeof(BOM_UTF8)-1)) {
01102               script_encoding = &encoding_utf8;
01103               bom_size = sizeof(BOM_UTF8)-1;
01104        }
01105 
01106        if (script_encoding) {
01107               /* remove BOM */
01108               script = (unsigned char*)emalloc(LANG_SCNG(script_org_size)+1-bom_size);
01109               memcpy(script, LANG_SCNG(script_org)+bom_size, LANG_SCNG(script_org_size)+1-bom_size);
01110               efree(LANG_SCNG(script_org));
01111               LANG_SCNG(script_org) = script;
01112               LANG_SCNG(script_org_size) -= bom_size;
01113 
01114               return script_encoding;
01115        }
01116 
01117        /* script contains NULL bytes -> auto-detection */
01118        if (memchr(LANG_SCNG(script_org), 0, LANG_SCNG(script_org_size))) {
01119               /* make best effort if BOM is missing */
01120               return zend_multibyte_detect_utf_encoding(LANG_SCNG(script_org), LANG_SCNG(script_org_size) TSRMLS_CC);
01121        }
01122 
01123        return NULL;
01124 }
01125 
01126 static zend_encoding *zend_multibyte_detect_utf_encoding(const unsigned char *script, size_t script_size TSRMLS_DC)
01127 {
01128        const unsigned char *p;
01129        int wchar_size = 2;
01130        int le = 0;
01131 
01132        /* utf-16 or utf-32? */
01133        p = script;
01134        while ((p-script) < script_size) {
01135               p = memchr(p, 0, script_size-(p-script)-2);
01136               if (!p) {
01137                      break;
01138               }
01139               if (*(p+1) == '\0' && *(p+2) == '\0') {
01140                      wchar_size = 4;
01141                      break;
01142               }
01143 
01144               /* searching for UTF-32 specific byte orders, so this will do */
01145               p += 4;
01146        }
01147 
01148        /* BE or LE? */
01149        p = script;
01150        while ((p-script) < script_size) {
01151               if (*p == '\0' && *(p+wchar_size-1) != '\0') {
01152                      /* BE */
01153                      le = 0;
01154                      break;
01155               } else if (*p != '\0' && *(p+wchar_size-1) == '\0') {
01156                      /* LE* */
01157                      le = 1;
01158                      break;
01159               }
01160               p += wchar_size;
01161        }
01162 
01163        if (wchar_size == 2) {
01164               return le ? &encoding_utf16le : &encoding_utf16be;
01165        } else {
01166               return le ? &encoding_utf32le : &encoding_utf32be;
01167        }
01168 
01169        return NULL;
01170 }
01171 #endif /* ZEND_MULTIBYTE */
01172 
01173 /*
01174  * Local variables:
01175  * tab-width: 4
01176  * c-basic-offset: 4
01177  * End:
01178  * vim600: sw=4 ts=4 tw=78
01179  * vim<600: sw=4 ts=4 tw=78
01180  */