Back to index

php5  5.3.10
mbfilter_euc_jp.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this files was separated from mbfilter_ja.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  * 
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include "mbfilter.h"
00035 #include "mbfilter_euc_jp.h"
00036 
00037 #include "unicode_table_cp932_ext.h"
00038 #include "unicode_table_jis.h"
00039 
00040 static int mbfl_filt_ident_eucjp(int c, mbfl_identify_filter *filter); /* forward declaration: vtbl_identify_eucjp refers to this function before its definition near the end of the file */
00041 
/*
 * Length table with one entry per byte value 0x00-0xFF (16 rows of 16).
 * Entries are 2 for 0x8E and for 0xA1-0xFE, 3 for 0x8F, and 1 for every
 * other byte, which matches the sequence lengths consumed by
 * mbfl_filt_conv_eucjp_wchar() for those lead bytes.
 */
00042 static const unsigned char mblen_table_eucjp[] = { /* two-byte lead bytes: 0xA1-0xFE */
00043   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
00044   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
00045   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
00046   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
00047   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
00048   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
00049   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
00050   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
00051   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, /* 0x80-0x8F: 0x8E -> 2, 0x8F -> 3 */
00052   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
00053   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xA0-0xAF: 0xA0 itself stays 1 */
00054   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xB0-0xBF */
00055   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
00056   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
00057   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0-0xEF */
00058   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  /* 0xF0-0xFF: 0xFF itself stays 1 */
00059 };
00060 
00061 static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL}; /* NULL-terminated list of alternative names; referenced by mbfl_encoding_euc_jp */
00062 
/*
 * Encoding descriptor for EUC-JP.  The initializer is positional, so the
 * order of the values must follow the field order of mbfl_encoding (declared
 * outside this file).
 */
00063 const mbfl_encoding mbfl_encoding_euc_jp = {
00064        mbfl_no_encoding_euc_jp, /* encoding identifier */
00065        "EUC-JP", /* NOTE(review): presumably the canonical name -- confirm against mbfl_encoding */
00066        "EUC-JP", /* NOTE(review): presumably the MIME name -- confirm against mbfl_encoding */
00067        (const char *(*)[])&mbfl_encoding_euc_jp_aliases, /* pointer to the NULL-terminated alias array */
00068        mblen_table_eucjp, /* per-byte length table defined earlier in this file */
00069        MBFL_ENCTYPE_MBCS /* encoding type flag */
00070 };
00071 
/*
 * Function table for the EUC-JP identify filter: the shared constructor and
 * destructor plus the EUC-JP specific per-byte checker from this file.
 */
00072 const struct mbfl_identify_vtbl vtbl_identify_eucjp = {
00073        mbfl_no_encoding_euc_jp, /* encoding this table belongs to */
00074        mbfl_filt_ident_common_ctor, /* shared constructor (defined elsewhere) */
00075        mbfl_filt_ident_common_dtor, /* shared destructor (defined elsewhere) */
00076        mbfl_filt_ident_eucjp /* per-byte validity check defined in this file */
00077 };
00078 
/*
 * Function table for the EUC-JP => wchar conversion filter.
 * NOTE(review): the two leading identifiers look like (source, destination)
 * encodings -- confirm against the declaration of struct mbfl_convert_vtbl.
 */
00079 const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
00080        mbfl_no_encoding_euc_jp, /* presumably the source encoding */
00081        mbfl_no_encoding_wchar, /* presumably the destination encoding */
00082        mbfl_filt_conv_common_ctor, /* shared constructor (defined elsewhere) */
00083        mbfl_filt_conv_common_dtor, /* shared destructor (defined elsewhere) */
00084        mbfl_filt_conv_eucjp_wchar, /* per-byte converter defined in this file */
00085        mbfl_filt_conv_common_flush /* shared flush routine (defined elsewhere) */
00086 };
00087 
/*
 * Function table for the wchar => EUC-JP conversion filter.
 * NOTE(review): the two leading identifiers look like (source, destination)
 * encodings -- confirm against the declaration of struct mbfl_convert_vtbl.
 */
00088 const struct mbfl_convert_vtbl vtbl_wchar_eucjp = {
00089        mbfl_no_encoding_wchar, /* presumably the source encoding */
00090        mbfl_no_encoding_euc_jp, /* presumably the destination encoding */
00091        mbfl_filt_conv_common_ctor, /* shared constructor (defined elsewhere) */
00092        mbfl_filt_conv_common_dtor, /* shared destructor (defined elsewhere) */
00093        mbfl_filt_conv_wchar_eucjp, /* per-character converter defined in this file */
00094        mbfl_filt_conv_common_flush /* shared flush routine (defined elsewhere) */
00095 };
00096 
/*
 * CK(statement): evaluate `statement` once and, if its value is negative,
 * make the ENCLOSING function return -1 immediately.  The macro hides a
 * `return`, so it may only be used inside functions returning int, and any
 * state that must be updated even on failure has to be written before the
 * CK() call.
 */
00097 #define CK(statement)       do { if ((statement) < 0) return (-1); } while (0)
00098 
00099 /*
00100  * EUC-JP => wchar
00101  */
00102 int
00103 mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter)
00104 {
00105        int c1, s, w;
00106 
00107        switch (filter->status) {
00108        case 0:
00109               if (c >= 0 && c < 0x80) {   /* latin */
00110                      CK((*filter->output_function)(c, filter->data));
00111               } else if (c > 0xa0 && c < 0xff) { /* X 0208 first char */
00112                      filter->status = 1;
00113                      filter->cache = c;
00114               } else if (c == 0x8e) {     /* kana first char */
00115                      filter->status = 2;
00116               } else if (c == 0x8f) {     /* X 0212 first char */
00117                      filter->status = 3;
00118               } else {
00119                      w = c & MBFL_WCSGROUP_MASK;
00120                      w |= MBFL_WCSGROUP_THROUGH;
00121                      CK((*filter->output_function)(w, filter->data));
00122               }
00123               break;
00124 
00125        case 1:       /* got first half */
00126               filter->status = 0;
00127               c1 = filter->cache;
00128               if (c > 0xa0 && c < 0xff) {
00129                      s = (c1 - 0xa1)*94 + c - 0xa1;
00130                      if (s >= 0 && s < jisx0208_ucs_table_size) {
00131                             w = jisx0208_ucs_table[s];
00132                      } else {
00133                             w = 0;
00134                      }
00135                      if (w <= 0) {
00136                             w = ((c1 & 0x7f) << 8) | (c & 0x7f);
00137                             w &= MBFL_WCSPLANE_MASK;
00138                             w |= MBFL_WCSPLANE_JIS0208;
00139                      }
00140                      CK((*filter->output_function)(w, filter->data));
00141               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00142                      CK((*filter->output_function)(c, filter->data));
00143               } else {
00144                      w = (c1 << 8) | c;
00145                      w &= MBFL_WCSGROUP_MASK;
00146                      w |= MBFL_WCSGROUP_THROUGH;
00147                      CK((*filter->output_function)(w, filter->data));
00148               }
00149               break;
00150 
00151        case 2:       /* got 0x8e */
00152               filter->status = 0;
00153               if (c > 0xa0 && c < 0xe0) {
00154                      w = 0xfec0 + c;
00155                      CK((*filter->output_function)(w, filter->data));
00156               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00157                      CK((*filter->output_function)(c, filter->data));
00158               } else {
00159                      w = 0x8e00 | c;
00160                      w &= MBFL_WCSGROUP_MASK;
00161                      w |= MBFL_WCSGROUP_THROUGH;
00162                      CK((*filter->output_function)(w, filter->data));
00163               }
00164               break;
00165 
00166        case 3:       /* got 0x8f,  X 0212 first char */
00167               if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00168                      CK((*filter->output_function)(c, filter->data));
00169                      filter->status = 0;
00170               } else {
00171                      filter->status++;
00172                      filter->cache = c;
00173               }
00174               break;
00175        case 4:       /* got 0x8f,  X 0212 second char */
00176               filter->status = 0;
00177               c1 = filter->cache;
00178               if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) {
00179                      s = (c1 - 0xa1)*94 + c - 0xa1;
00180                      if (s >= 0 && s < jisx0212_ucs_table_size) {
00181                             w = jisx0212_ucs_table[s];
00182                      } else {
00183                             w = 0;
00184                      }
00185                      if (w <= 0) {
00186                             w = ((c1 & 0x7f) << 8) | (c & 0x7f);
00187                             w &= MBFL_WCSPLANE_MASK;
00188                             w |= MBFL_WCSPLANE_JIS0212;
00189                      }
00190                      CK((*filter->output_function)(w, filter->data));
00191               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00192                      CK((*filter->output_function)(c, filter->data));
00193               } else {
00194                      w = (c1 << 8) | c | 0x8f0000;
00195                      w &= MBFL_WCSGROUP_MASK;
00196                      w |= MBFL_WCSGROUP_THROUGH;
00197                      CK((*filter->output_function)(w, filter->data));
00198               }
00199               break;
00200 
00201        default:
00202               filter->status = 0;
00203               break;
00204        }
00205 
00206        return c;
00207 }
00208 
00209 /*
00210  * wchar => EUC-JP
00211  */
00212 int
00213 mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
00214 {
00215        int c1, s;
00216 
00217        s = 0;
00218        if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
00219               s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
00220        } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
00221               s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
00222        } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
00223               s = ucs_i_jis_table[c - ucs_i_jis_table_min];
00224        } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
00225               s = ucs_r_jis_table[c - ucs_r_jis_table_min];
00226        }
00227        if (s <= 0) {
00228               c1 = c & ~MBFL_WCSPLANE_MASK;
00229               if (c1 == MBFL_WCSPLANE_JIS0208) {
00230                      s = c & MBFL_WCSPLANE_MASK;
00231               } else if (c1 == MBFL_WCSPLANE_JIS0212) {
00232                      s = c & MBFL_WCSPLANE_MASK;
00233                      s |= 0x8080;
00234               } else if (c == 0xff3c) {   /* FULLWIDTH REVERSE SOLIDUS */
00235                      s = 0x2140;
00236               } else if (c == 0xff5e) {   /* FULLWIDTH TILDE */
00237                      s = 0x2141;
00238               } else if (c == 0x2225) {   /* PARALLEL TO */
00239                      s = 0x2142;
00240               } else if (c == 0xff0d) {   /* FULLWIDTH HYPHEN-MINUS */
00241                      s = 0x215d;
00242               } else if (c == 0xffe0) {   /* FULLWIDTH CENT SIGN */
00243                      s = 0x2171;
00244               } else if (c == 0xffe1) {   /* FULLWIDTH POUND SIGN */
00245                      s = 0x2172;
00246               } else if (c == 0xffe2) {   /* FULLWIDTH NOT SIGN */
00247                      s = 0x224c;
00248               }
00249               if (c == 0) {
00250                      s = 0;
00251               } else if (s <= 0) {
00252                      s = -1;
00253               }
00254        }
00255        if (s >= 0) {
00256               if (s < 0x80) {      /* latin */
00257                      CK((*filter->output_function)(s, filter->data));
00258               } else if (s < 0x100) {     /* kana */
00259                      CK((*filter->output_function)(0x8e, filter->data));
00260                      CK((*filter->output_function)(s, filter->data));
00261               } else if (s < 0x8080)  {   /* X 0208 */
00262                      CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data));
00263                      CK((*filter->output_function)((s & 0xff) | 0x80, filter->data));
00264               } else {      /* X 0212 */
00265                      CK((*filter->output_function)(0x8f, filter->data));
00266                      CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data));
00267                      CK((*filter->output_function)((s & 0xff) | 0x80, filter->data));
00268               }
00269        } else {
00270               if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00271                      CK(mbfl_filt_conv_illegal_output(c, filter));
00272               }
00273        }
00274 
00275        return c;
00276 }
00277 
00278 static int mbfl_filt_ident_eucjp(int c, mbfl_identify_filter *filter)
00279 {
00280        switch (filter->status) {
00281        case  0:      /* latin */
00282               if (c >= 0 && c < 0x80) {   /* ok */
00283                      ;
00284               } else if (c > 0xa0 && c < 0xff) { /* kanji first char */
00285                      filter->status = 1;
00286               } else if (c == 0x8e) {                          /* kana first char */
00287                      filter->status = 2;
00288               } else if (c == 0x8f) {                          /* X 0212 first char */
00289                      filter->status = 3;
00290               } else {                                                /* bad */
00291                      filter->flag = 1;
00292               }
00293               break;
00294 
00295        case  1:      /* got first half */
00296               if (c < 0xa1 || c > 0xfe) {        /* bad */
00297                      filter->flag = 1;
00298               }
00299               filter->status = 0;
00300               break;
00301 
00302        case  2:      /* got 0x8e */
00303               if (c < 0xa1 || c > 0xdf) {        /* bad */
00304                      filter->flag = 1;
00305               }
00306               filter->status = 0;
00307               break;
00308 
00309        case  3:      /* got 0x8f */
00310               if (c < 0xa1 || c > 0xfe) {        /* bad */
00311                      filter->flag = 1;
00312               }
00313               filter->status++;
00314               break;
00315        case  4:      /* got 0x8f */
00316               if (c < 0xa1 || c > 0xfe) {        /* bad */
00317                      filter->flag = 1;
00318               }
00319               filter->status = 0;
00320               break;
00321 
00322        default:
00323               filter->status = 0;
00324               break;
00325        }
00326 
00327        return c;
00328 }
00329 
00330 
00331