Back to index

php5  5.3.10
mbfilter_cp51932.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter_ja.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  * 
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include "mbfilter.h"
00035 #include "mbfilter_cp51932.h"
00036 
00037 #include "unicode_table_cp932_ext.h"
00038 #include "unicode_table_jis.h"
00039 #include "cp932_table.h"
00040 
00041 static int mbfl_filt_ident_cp51932(int c, mbfl_identify_filter *filter); /* forward declaration: defined later in this file, referenced by vtbl_identify_cp51932 */
00042 
00043 static const unsigned char mblen_table_eucjp[] = { /* 256 entries indexed by byte value 0x00-0xFF: 2 for 0x8E and 0xA1-0xFE, 1 for every other byte (presumably the byte length of a character starting with that byte) */
00044   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
00045   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
00046   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
00047   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
00048   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
00049   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
00050   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
00051   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
00052   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, /* 0x80-0x8F: only 0x8E is 2 */
00053   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
00054   1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xA0-0xAF: 0xA0 is 1 */
00055   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xB0-0xBF */
00056   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
00057   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
00058   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0-0xEF */
00059   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  /* 0xF0-0xFF: 0xFF is 1 */
00060 };
00061 
00062 
00063 static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL}; /* NULL-terminated alias list holding the single alias "cp51932" */
00064 
00065 const struct mbfl_identify_vtbl vtbl_identify_cp51932 = { /* identify-filter vtable for CP51932 */
00066        mbfl_no_encoding_cp51932,    /* encoding id this vtable is registered for */
00067        mbfl_filt_ident_common_ctor, /* common constructor (not CP51932-specific, judging by its name) */
00068        mbfl_filt_ident_common_dtor, /* common destructor (not CP51932-specific, judging by its name) */
00069        mbfl_filt_ident_cp51932      /* per-byte identify routine defined later in this file */
00070 };
00071 
00072 const mbfl_encoding mbfl_encoding_cp51932 = { /* encoding descriptor for CP51932 */
00073        mbfl_no_encoding_cp51932, /* encoding id */
00074        "CP51932", /* presumably the canonical name -- confirm against the mbfl_encoding declaration */
00075        "CP51932", /* presumably the MIME name -- confirm against the mbfl_encoding declaration */
00076        (const char *(*)[])&mbfl_encoding_cp51932_aliases, /* alias list defined earlier in this file */
00077        mblen_table_eucjp, /* 256-entry byte table defined earlier in this file */
00078        MBFL_ENCTYPE_MBCS /* encoding-type flag; name suggests "multi-byte character set" */
00079 };
00080 
00081 const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { /* converter vtable: CP51932 => wchar */
00082        mbfl_no_encoding_cp51932,      /* source encoding id */
00083        mbfl_no_encoding_wchar,        /* destination encoding id */
00084        mbfl_filt_conv_common_ctor,    /* common constructor (not CP51932-specific, judging by its name) */
00085        mbfl_filt_conv_common_dtor,    /* common destructor (not CP51932-specific, judging by its name) */
00086        mbfl_filt_conv_cp51932_wchar,  /* per-byte conversion routine defined later in this file */
00087        mbfl_filt_conv_common_flush    /* common flush routine (not CP51932-specific, judging by its name) */
00088 };
00089 
00090 const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = { /* converter vtable: wchar => CP51932 */
00091        mbfl_no_encoding_wchar,        /* source encoding id */
00092        mbfl_no_encoding_cp51932,      /* destination encoding id */
00093        mbfl_filt_conv_common_ctor,    /* common constructor (not CP51932-specific, judging by its name) */
00094        mbfl_filt_conv_common_dtor,    /* common destructor (not CP51932-specific, judging by its name) */
00095        mbfl_filt_conv_wchar_cp51932,  /* per-character conversion routine defined later in this file */
00096        mbfl_filt_conv_common_flush    /* common flush routine (not CP51932-specific, judging by its name) */
00097 };
00098 
/*
 * CK: evaluate `statement`; when its value is negative, make the
 * enclosing function return -1 immediately.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)

/*
 * sjistoidx: turn a two-byte pair (c1, c2) into a linear index with 188
 * cells per c1 value.  The c1 base is 0xc1 when c1 > 0x9f and 0x81
 * otherwise; the c2 base is 0x41 when c2 > 0x7e and 0x40 otherwise.
 * Each argument is evaluated twice.
 */
#define sjistoidx(c1, c2) \
	(((c1) - (((c1) > 0x9f) ? 0xc1 : 0x81)) * 188 \
	 + (c2) - (((c2) > 0x7e) ? 0x41 : 0x40))

/* idxtoeuc1 / idxtoeuc2: split a linear index into two bytes, 94 cells per row, both offset by 0xa1. */
#define idxtoeuc1(c) (0xa1 + ((c) / 94))
#define idxtoeuc2(c) (0xa1 + ((c) % 94))
00107 
00108 /*
00109  * cp51932 => wchar
00110  */
00111 int
00112 mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter)
00113 {
00114        int c1, s, w;
00115 
00116        switch (filter->status) {
00117        case 0:
00118               if (c >= 0 && c < 0x80) {   /* latin */
00119                      CK((*filter->output_function)(c, filter->data));
00120               } else if (c > 0xa0 && c < 0xff) { /* CP932 first char */
00121                      filter->status = 1;
00122                      filter->cache = c;
00123               } else if (c == 0x8e) {     /* kana first char */
00124                      filter->status = 2;
00125               } else {
00126                      w = c & MBFL_WCSGROUP_MASK;
00127                      w |= MBFL_WCSGROUP_THROUGH;
00128                      CK((*filter->output_function)(w, filter->data));
00129               }
00130               break;
00131 
00132        case 1:       /* got first half */
00133               filter->status = 0;
00134               c1 = filter->cache;
00135               if (c > 0xa0 && c < 0xff) {
00136                      w = 0;
00137                      s = (c1 - 0xa1)*94 + c - 0xa1;
00138                      if (s <= 137) {
00139                             if (s == 31) {
00140                                    w = 0xff3c;                 /* FULLWIDTH REVERSE SOLIDUS */
00141                             } else if (s == 32) {
00142                                    w = 0xff5e;                 /* FULLWIDTH TILDE */
00143                             } else if (s == 33) {
00144                                    w = 0x2225;                 /* PARALLEL TO */
00145                             } else if (s == 60) {
00146                                    w = 0xff0d;                 /* FULLWIDTH HYPHEN-MINUS */
00147                             } else if (s == 80) {
00148                                    w = 0xffe0;                 /* FULLWIDTH CENT SIGN */
00149                             } else if (s == 81) {
00150                                    w = 0xffe1;                 /* FULLWIDTH POUND SIGN */
00151                             } else if (s == 137) {
00152                                    w = 0xffe2;                 /* FULLWIDTH NOT SIGN */
00153                             }
00154                      }
00155                      if (w == 0) {
00156                             if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {           /* vendor ext1 (13ku) */
00157                                    w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
00158                             } else if (s >= 0 && s < jisx0208_ucs_table_size) {            /* X 0208 */
00159                                    w = jisx0208_ucs_table[s];
00160                             } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {           /* vendor ext2 (89ku - 92ku) */
00161                                    w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
00162                             }
00163                      }
00164                      if (w <= 0) {
00165                             w = ((c1 & 0x7f) << 8) | (c & 0x7f);
00166                             w &= MBFL_WCSPLANE_MASK;
00167                             w |= MBFL_WCSPLANE_WINCP932;
00168                      }
00169                      CK((*filter->output_function)(w, filter->data));
00170               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00171                      CK((*filter->output_function)(c, filter->data));
00172               } else {
00173                      w = (c1 << 8) | c;
00174                      w &= MBFL_WCSGROUP_MASK;
00175                      w |= MBFL_WCSGROUP_THROUGH;
00176                      CK((*filter->output_function)(w, filter->data));
00177               }
00178               break;
00179 
00180        case 2:       /* got 0x8e, X0201 kana */
00181               filter->status = 0;
00182               if (c > 0xa0 && c < 0xe0) {
00183                      w = 0xfec0 + c;
00184                      CK((*filter->output_function)(w, filter->data));
00185               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00186                      CK((*filter->output_function)(c, filter->data));
00187               } else {
00188                      w = 0x8e00 | c;
00189                      w &= MBFL_WCSGROUP_MASK;
00190                      w |= MBFL_WCSGROUP_THROUGH;
00191                      CK((*filter->output_function)(w, filter->data));
00192               }
00193               break;
00194 
00195        default:
00196               filter->status = 0;
00197               break;
00198        }
00199 
00200        return c;
00201 }
00202 
00203 /*
00204  * wchar => cp51932
00205  */
00206 int
00207 mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
00208 {
00209        int c1, c2, s1;
00210 
00211        s1 = 0;
00212        if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
00213               s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
00214        } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
00215               s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
00216        } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
00217               s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
00218        } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
00219               s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
00220        }
00221        if (s1 >= 0x8080) s1 = -1; /* we don't support JIS X0213 */
00222        if (s1 <= 0) {
00223               c1 = c & ~MBFL_WCSPLANE_MASK;
00224               if (c1 == MBFL_WCSPLANE_WINCP932) {
00225                      s1 = c & MBFL_WCSPLANE_MASK;
00226                      if (s1 >= ((85 + 0x20) << 8)) {    /* 85ku - 120ku */
00227                             s1 = -1;
00228                      }
00229               } else if (c1 == MBFL_WCSPLANE_JIS0208) {
00230                      s1 = c & MBFL_WCSPLANE_MASK;
00231                      if ((s1 >= ((85 + 0x20) << 8) &&  /* 85ku - 94ku */
00232                              s1 <= ((88 + 0x20) << 8)) ||/* IBM extension */
00233                          (s1 >= ((93 + 0x20) << 8) && /* 89ku - 92ku */
00234                              s1 <= ((94 + 0x20) << 8))) {
00235                             s1 = -1;
00236                      }
00237               } else if (c == 0xa5) {            /* YEN SIGN */
00238                      s1 = 0x005c;                /* YEN SIGN */
00239               } else if (c == 0x203e) {   /* OVER LINE */
00240                      s1 = 0x007e;                /* FULLWIDTH MACRON */
00241               } else if (c == 0xff3c) {   /* FULLWIDTH REVERSE SOLIDUS */
00242                      s1 = 0x2140;
00243               } else if (c == 0xff5e) {   /* FULLWIDTH TILDE */
00244                      s1 = 0x2141;
00245               } else if (c == 0x2225) {   /* PARALLEL TO */
00246                      s1 = 0x2142;
00247               } else if (c == 0xff0d) {   /* FULLWIDTH HYPHEN-MINUS */
00248                      s1 = 0x215d;
00249               } else if (c == 0xffe0) {   /* FULLWIDTH CENT SIGN */
00250                      s1 = 0x2171;
00251               } else if (c == 0xffe1) {   /* FULLWIDTH POUND SIGN */
00252                      s1 = 0x2172;
00253               } else if (c == 0xffe2) {   /* FULLWIDTH NOT SIGN */
00254                      s1 = 0x224c;
00255               } else {
00256                      s1 = -1;
00257                      c1 = 0;
00258                      c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
00259                      while (c1 < c2) {           /* CP932 vendor ext1 (13ku) */
00260                             if (c == cp932ext1_ucs_table[c1]) {
00261                                    s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
00262                                    break;
00263                             }
00264                             c1++;
00265                      }
00266                      if (s1 < 0) {
00267                             c1 = 0;
00268                             c2 = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
00269                             while (c1 < c2) {           /* CP932 vendor ext3 (115ku - 119ku) */
00270                                    if (c == cp932ext2_ucs_table[c1]) {
00271                                      s1 = ((c1/94 + 0x79) << 8) +(c1%94 + 0x21);
00272                                      break;
00273                                    }
00274                                    c1++;
00275                             }
00276                      }
00277               }
00278               if (c == 0) {
00279                      s1 = 0;
00280               } else if (s1 <= 0) {
00281                      s1 = -1;
00282               }
00283        }
00284 
00285        if (s1 >= 0) {
00286               if (s1 < 0x80) {     /* latin */
00287                      CK((*filter->output_function)(s1, filter->data));
00288               } else if (s1 < 0x100) {    /* kana */
00289                      CK((*filter->output_function)(0x8e, filter->data));
00290                      CK((*filter->output_function)(s1, filter->data));
00291               } else if (s1 < 0x8080)  {  /* X 0208 */
00292                      CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
00293                      CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
00294               } else {
00295                 if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00296                   CK(mbfl_filt_conv_illegal_output(c, filter));
00297                 }
00298               }
00299        } else {
00300               if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00301                      CK(mbfl_filt_conv_illegal_output(c, filter));
00302               }
00303        }
00304 
00305        return c;
00306 }
00307 
00308 static int mbfl_filt_ident_cp51932(int c, mbfl_identify_filter *filter)
00309 {
00310        switch (filter->status) {
00311        case  0:      /* latin */
00312               if (c >= 0 && c < 0x80) {   /* ok */
00313                      ;
00314               } else if (c > 0xa0 && c < 0xff) { /* kanji first char */
00315                      filter->status = 1;
00316               } else if (c == 0x8e) {                          /* kana first char */
00317                      filter->status = 2;
00318               } else {                                                /* bad */
00319                      filter->flag = 1;
00320               }
00321               break;
00322 
00323        case  1:      /* got first half */
00324               if (c < 0xa1 || c > 0xfe) {        /* bad */
00325                      filter->flag = 1;
00326               }
00327               filter->status = 0;
00328               break;
00329 
00330        case  2:      /* got 0x8e */
00331               if (c < 0xa1 || c > 0xdf) {        /* bad */
00332                      filter->flag = 1;
00333               }
00334               filter->status = 0;
00335               break;
00336 
00337        default:
00338               filter->status = 0;
00339               break;
00340        }
00341 
00342        return c;
00343 }
00344 
00345