Back to index

php5  5.3.10
mbfilter_euc_jp_win.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter_ja.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  * 
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include "mbfilter.h"
00035 #include "mbfilter_euc_jp_win.h"
00036 
00037 #include "unicode_table_cp932_ext.h"
00038 #include "unicode_table_jis.h"
00039 #include "cp932_table.h"
00040 
00041 static int mbfl_filt_ident_eucjp_win(int c, mbfl_identify_filter *filter);
00042 
/*
 * Per-byte length table with 256 entries (index 0x00-0xFF):
 *   0x8E        -> 2
 *   0x8F        -> 3
 *   0xA1 - 0xFE -> 2
 *   all others  -> 1
 * Presumably indexed by the lead byte of a character to obtain its length in
 * bytes; it is handed to the encoding descriptor below.
 */
static const unsigned char mblen_table_eucjp[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00061 
00062 
/* NULL-terminated list of alternative names for eucJP-win. */
static const char *mbfl_encoding_eucjp_win_aliases[] = {
	"eucJP-open",
	"eucJP-ms",
	NULL
};
00065 
00066 const struct mbfl_identify_vtbl vtbl_identify_eucjpwin = {
00067        mbfl_no_encoding_eucjp_win,
00068        mbfl_filt_ident_common_ctor,
00069        mbfl_filt_ident_common_dtor,
00070        mbfl_filt_ident_eucjp_win
00071 };
00072 
00073 const mbfl_encoding mbfl_encoding_eucjp_win = {
00074        mbfl_no_encoding_eucjp_win,
00075        "eucJP-win",
00076        "EUC-JP",
00077        (const char *(*)[])&mbfl_encoding_eucjp_win_aliases,
00078        mblen_table_eucjp,
00079        MBFL_ENCTYPE_MBCS
00080 };
00081 
00082 const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
00083        mbfl_no_encoding_eucjp_win,
00084        mbfl_no_encoding_wchar,
00085        mbfl_filt_conv_common_ctor,
00086        mbfl_filt_conv_common_dtor,
00087        mbfl_filt_conv_eucjpwin_wchar,
00088        mbfl_filt_conv_common_flush
00089 };
00090 
00091 const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
00092        mbfl_no_encoding_wchar,
00093        mbfl_no_encoding_eucjp_win,
00094        mbfl_filt_conv_common_ctor,
00095        mbfl_filt_conv_common_dtor,
00096        mbfl_filt_conv_wchar_eucjpwin,
00097        mbfl_filt_conv_common_flush
00098 };
00099 
/*
 * Evaluate `statement` exactly once; when its value is negative, return -1
 * from the enclosing function.
 */
#define CK(statement)	do { if ((statement) < 0) { return (-1); } } while (0)
00101 
00102 /*
00103  * eucJP-win => wchar
00104  */
00105 int
00106 mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
00107 {
00108        int c1, s, w, n;
00109 
00110        switch (filter->status) {
00111        case 0:
00112               if (c >= 0 && c < 0x80) {   /* latin */
00113                      CK((*filter->output_function)(c, filter->data));
00114               } else if (c > 0xa0 && c < 0xff) { /* CP932 first char */
00115                      filter->status = 1;
00116                      filter->cache = c;
00117               } else if (c == 0x8e) {     /* kana first char */
00118                      filter->status = 2;
00119               } else if (c == 0x8f) {     /* X 0212 first char */
00120                      filter->status = 3;
00121               } else {
00122                      w = c & MBFL_WCSGROUP_MASK;
00123                      w |= MBFL_WCSGROUP_THROUGH;
00124                      CK((*filter->output_function)(w, filter->data));
00125               }
00126               break;
00127 
00128        case 1:       /* got first half */
00129               filter->status = 0;
00130               c1 = filter->cache;
00131               if (c > 0xa0 && c < 0xff) {
00132                      w = 0;
00133                      s = (c1 - 0xa1)*94 + c - 0xa1;
00134                      if (s <= 137) {
00135                             if (s == 31) {
00136                                    w = 0xff3c;                 /* FULLWIDTH REVERSE SOLIDUS */
00137                             } else if (s == 32) {
00138                                    w = 0xff5e;                 /* FULLWIDTH TILDE */
00139                             } else if (s == 33) {
00140                                    w = 0x2225;                 /* PARALLEL TO */
00141                             } else if (s == 60) {
00142                                    w = 0xff0d;                 /* FULLWIDTH HYPHEN-MINUS */
00143                             } else if (s == 80) {
00144                                    w = 0xffe0;                 /* FULLWIDTH CENT SIGN */
00145                             } else if (s == 81) {
00146                                    w = 0xffe1;                 /* FULLWIDTH POUND SIGN */
00147                             } else if (s == 137) {
00148                                    w = 0xffe2;                 /* FULLWIDTH NOT SIGN */
00149                             }
00150                      }
00151                      if (w == 0) {
00152                             if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {           /* vendor ext1 (13ku) */
00153                                    w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
00154                             } else if (s >= 0 && s < jisx0208_ucs_table_size) {            /* X 0208 */
00155                                    w = jisx0208_ucs_table[s];
00156                             } else if (s >= (84 * 94)) {              /* user (85ku - 94ku) */
00157                                    w = s - (84 * 94) + 0xe000;
00158                             }
00159                      }
00160                      if (w <= 0) {
00161                             w = ((c1 & 0x7f) << 8) | (c & 0x7f);
00162                             w &= MBFL_WCSPLANE_MASK;
00163                             w |= MBFL_WCSPLANE_WINCP932;
00164                      }
00165                      CK((*filter->output_function)(w, filter->data));
00166               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00167                      CK((*filter->output_function)(c, filter->data));
00168               } else {
00169                      w = (c1 << 8) | c;
00170                      w &= MBFL_WCSGROUP_MASK;
00171                      w |= MBFL_WCSGROUP_THROUGH;
00172                      CK((*filter->output_function)(w, filter->data));
00173               }
00174               break;
00175 
00176        case 2:       /* got 0x8e, X0201 kana */
00177               filter->status = 0;
00178               if (c > 0xa0 && c < 0xe0) {
00179                      w = 0xfec0 + c;
00180                      CK((*filter->output_function)(w, filter->data));
00181               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00182                      CK((*filter->output_function)(c, filter->data));
00183               } else {
00184                      w = 0x8e00 | c;
00185                      w &= MBFL_WCSGROUP_MASK;
00186                      w |= MBFL_WCSGROUP_THROUGH;
00187                      CK((*filter->output_function)(w, filter->data));
00188               }
00189               break;
00190 
00191        case 3:       /* got 0x8f,  X 0212 first char */
00192               if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00193                      CK((*filter->output_function)(c, filter->data));
00194                      filter->status = 0;
00195               } else {
00196                      filter->status++;
00197                      filter->cache = c;
00198               }
00199               break;
00200        case 4:       /* got 0x8f,  X 0212 second char */
00201               filter->status = 0;
00202               c1 = filter->cache;
00203               if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) {
00204                      s = (c1 - 0xa1)*94 + c - 0xa1;
00205                      if (s >= 0 && s < jisx0212_ucs_table_size) {
00206                             w = jisx0212_ucs_table[s];
00207                             if (w == 0x007e) {
00208                                    w = 0xff5e;          /* FULLWIDTH TILDE */
00209                             }
00210                      } else if (s >= (82*94) && s < (84*94)) { /* vender ext3 (83ku - 84ku) <-> CP932 (115ku -120ku) */
00211                             s = (c1<< 8) | c;
00212                             w = 0;
00213                             n = 0;
00214                             while (n < cp932ext3_eucjp_table_size) {
00215                                    if (s == cp932ext3_eucjp_table[n]) {
00216                                           if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
00217                                                  w = cp932ext3_ucs_table[n];
00218                                           }
00219                                           break;
00220                                    }
00221                                    n++;
00222                             }
00223                      } else if (s >= (84*94)) {         /* user (85ku - 94ku) */
00224                             w = s - (84*94) + (0xe000 + (94*10));
00225                      } else {
00226                             w = 0;
00227                      }
00228                      if (w == 0x00A6) {
00229                             w = 0xFFE4;          /* FULLWIDTH BROKEN BAR */
00230                      }
00231                      if (w <= 0) {
00232                             w = ((c1 & 0x7f) << 8) | (c & 0x7f);
00233                             w &= MBFL_WCSPLANE_MASK;
00234                             w |= MBFL_WCSPLANE_JIS0212;
00235                      }
00236                      CK((*filter->output_function)(w, filter->data));
00237               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00238                      CK((*filter->output_function)(c, filter->data));
00239               } else {
00240                      w = (c1 << 8) | c | 0x8f0000;
00241                      w &= MBFL_WCSGROUP_MASK;
00242                      w |= MBFL_WCSGROUP_THROUGH;
00243                      CK((*filter->output_function)(w, filter->data));
00244               }
00245               break;
00246 
00247        default:
00248               filter->status = 0;
00249               break;
00250        }
00251 
00252        return c;
00253 }
00254 
00255 /*
00256  * wchar => eucJP-win
00257  */
00258 int
00259 mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
00260 {
00261        int c1, c2, s1;
00262 
00263        s1 = 0;
00264        if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
00265               s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
00266        } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
00267               s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
00268        } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
00269               s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
00270        } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
00271               s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
00272        } else if (c >= 0xe000 && c < (0xe000 + 10*94)) {       /* user  (X0208  85ku - 94ku) */
00273               s1 = c - 0xe000;
00274               c1 = s1/94 + 0x75;
00275               c2 = s1%94 + 0x21;
00276               s1 = (c1 << 8) | c2;
00277        } else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) {    /* user  (X0212  85ku - 94ku) */
00278               s1 = c - (0xe000 + 10*94);
00279               c1 = s1/94 + 0xf5;
00280               c2 = s1%94 + 0xa1;
00281               s1 = (c1 << 8) | c2;
00282        }
00283        if (s1 == 0xa2f1) {
00284               s1 = 0x2d62;         /* NUMERO SIGN */
00285        }
00286        if (s1 <= 0) {
00287               c1 = c & ~MBFL_WCSPLANE_MASK;
00288               if (c1 == MBFL_WCSPLANE_WINCP932) {
00289                      s1 = c & MBFL_WCSPLANE_MASK;
00290                      if (s1 >= ((85 + 0x20) << 8)) {    /* 85ku - 120ku */
00291                             s1 = -1;
00292                      }
00293               } else if (c1 == MBFL_WCSPLANE_JIS0208) {
00294                      s1 = c & MBFL_WCSPLANE_MASK;
00295                      if (s1 >= ((85 + 0x20) << 8)) {    /* 85ku - 94ku */
00296                             s1 = -1;
00297                      }
00298               } else if (c1 == MBFL_WCSPLANE_JIS0212) {
00299                      s1 = c & MBFL_WCSPLANE_MASK;
00300                      if (s1 >= ((83 + 0x20) << 8)) {    /* 83ku - 94ku */
00301                             s1 = -1;
00302                      } else {
00303                             s1 |= 0x8080;
00304                      }
00305               } else if (c == 0xa5) {            /* YEN SIGN */
00306                      s1 = 0x216f;                /* FULLWIDTH YEN SIGN */
00307               } else if (c == 0x203e) {   /* OVER LINE */
00308                      s1 = 0x2131;                /* FULLWIDTH MACRON */
00309               } else if (c == 0xff3c) {   /* FULLWIDTH REVERSE SOLIDUS */
00310                      s1 = 0x2140;
00311               } else if (c == 0xff5e) {   /* FULLWIDTH TILDE */
00312                      s1 = 0x2141;
00313               } else if (c == 0x2225) {   /* PARALLEL TO */
00314                      s1 = 0x2142;
00315               } else if (c == 0xff0d) {   /* FULLWIDTH HYPHEN-MINUS */
00316                      s1 = 0x215d;
00317               } else if (c == 0xffe0) {   /* FULLWIDTH CENT SIGN */
00318                      s1 = 0x2171;
00319               } else if (c == 0xffe1) {   /* FULLWIDTH POUND SIGN */
00320                      s1 = 0x2172;
00321               } else if (c == 0xffe2) {   /* FULLWIDTH NOT SIGN */
00322                      s1 = 0x224c;
00323               } else if (c == 0xff5e) {   /* FULLWIDTH TILDE */
00324                      s1 = 0x2141;
00325               } else {
00326                      s1 = -1;
00327                      c1 = 0;
00328                      c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
00329                      while (c1 < c2) {           /* CP932 vendor ext1 (13ku) */
00330                             const int oh = cp932ext1_ucs_table_min / 94;
00331 
00332                             if (c == cp932ext1_ucs_table[c1]) {
00333                                    s1 = ((c1 / 94 + oh + 0x21) << 8) + (c1 % 94 + 0x21);
00334                                    break;
00335                             }
00336                             c1++;
00337                      }
00338                      if (s1 < 0) {
00339                             c1 = 0;
00340                             c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
00341                             while (c1 < c2) {           /* CP932 vendor ext3 (115ku - 119ku) */
00342                                    if (c == cp932ext3_ucs_table[c1]) {
00343                                           if (c1 < cp932ext3_eucjp_table_size) {
00344                                                  s1 = cp932ext3_eucjp_table[c1];
00345                                           }
00346                                           break;
00347                                    }
00348                                    c1++;
00349                             }
00350                      }
00351               }
00352               if (c == 0) {
00353                      s1 = 0;
00354               } else if (s1 <= 0) {
00355                      s1 = -1;
00356               }
00357        }
00358 
00359        if (s1 >= 0) {
00360               if (s1 < 0x80) {     /* latin */
00361                      CK((*filter->output_function)(s1, filter->data));
00362               } else if (s1 < 0x100) {    /* kana */
00363                      CK((*filter->output_function)(0x8e, filter->data));
00364                      CK((*filter->output_function)(s1, filter->data));
00365               } else if (s1 < 0x8080)  {  /* X 0208 */
00366                      CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
00367                      CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
00368               } else {      /* X 0212 */
00369                      CK((*filter->output_function)(0x8f, filter->data));
00370                      CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
00371                      CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
00372               }
00373        } else {
00374               if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00375                      CK(mbfl_filt_conv_illegal_output(c, filter));
00376               }
00377        }
00378 
00379        return c;
00380 }
00381 
00382 static int mbfl_filt_ident_eucjp_win(int c, mbfl_identify_filter *filter)
00383 {
00384        switch (filter->status) {
00385        case  0:      /* latin */
00386               if (c >= 0 && c < 0x80) {   /* ok */
00387                      ;
00388               } else if (c > 0xa0 && c < 0xff) { /* kanji first char */
00389                      filter->status = 1;
00390               } else if (c == 0x8e) {                          /* kana first char */
00391                      filter->status = 2;
00392               } else if (c == 0x8f) {                          /* X 0212 first char */
00393                      filter->status = 3;
00394               } else {                                                /* bad */
00395                      filter->flag = 1;
00396               }
00397               break;
00398 
00399        case  1:      /* got first half */
00400               if (c < 0xa1 || c > 0xfe) {        /* bad */
00401                      filter->flag = 1;
00402               }
00403               filter->status = 0;
00404               break;
00405 
00406        case  2:      /* got 0x8e */
00407               if (c < 0xa1 || c > 0xdf) {        /* bad */
00408                      filter->flag = 1;
00409               }
00410               filter->status = 0;
00411               break;
00412 
00413        case  3:      /* got 0x8f */
00414               if (c < 0xa1 || c > 0xfe) {        /* bad */
00415                      filter->flag = 1;
00416               }
00417               filter->status++;
00418               break;
00419        case  4:      /* got 0x8f */
00420               if (c < 0xa1 || c > 0xfe) {        /* bad */
00421                      filter->flag = 1;
00422               }
00423               filter->status = 0;
00424               break;
00425 
00426        default:
00427               filter->status = 0;
00428               break;
00429        }
00430 
00431        return c;
00432 }
00433 
00434