Back to index

php5  5.3.10
mbfilter_cp932.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter_ja.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  *
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include "mbfilter.h"
00035 #include "mbfilter_cp932.h"
00036 
00037 #include "unicode_table_cp932_ext.h"
00038 #include "unicode_table_jis.h"
00039 
/* Forward declaration: the function is defined at the end of this file but is
 * referenced earlier by the vtbl_identify_cp932 initializer. */
00040 static int mbfl_filt_ident_cp932(int c, mbfl_identify_filter *filter);
00041 
/*
 * Character byte length indexed by the value of the first byte.
 * Entries 0x80-0x9f and 0xe0-0xff are 2 (first byte of a two-byte
 * sequence); every other entry is 1.
 */
static const unsigned char mblen_table_sjis[] = {
	/* 0x00-0x1f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20-0x3f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40-0x5f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60-0x7f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80-0x9f: two-byte lead range */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xa0-0xbf */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0-0xdf */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0-0xff: two-byte lead range */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00060 
/* Alternative names for the CP932 encoding; the list is terminated by NULL.
 * Referenced from the mbfl_encoding_cp932 descriptor. */
00061 static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
00062 
/*
 * Encoding descriptor for CP932.
 *
 * The initializer is positional; the mbfl_encoding layout is declared in a
 * header that is not visible here. NOTE(review): judging by the values, the
 * members are presumably: encoding id, canonical name, MIME name, alias list,
 * per-lead-byte length table, and encoding-type flags -- confirm against
 * the mbfl_encoding declaration.
 */
00063 const mbfl_encoding mbfl_encoding_cp932 = {
00064        mbfl_no_encoding_cp932,
00065        "CP932",
00066        "Shift_JIS",
00067        (const char *(*)[])&mbfl_encoding_cp932_aliases,
00068        mblen_table_sjis,
00069        MBFL_ENCTYPE_MBCS
00070 };
00071 
/*
 * Identify-filter table for CP932: pairs the encoding id with the shared
 * constructor/destructor and the CP932-specific byte checker defined in
 * this file (mbfl_filt_ident_cp932).
 * NOTE(review): member order is taken from the positional initializer; the
 * struct declaration is not visible here.
 */
00072 const struct mbfl_identify_vtbl vtbl_identify_cp932 = {
00073        mbfl_no_encoding_cp932,
00074        mbfl_filt_ident_common_ctor,
00075        mbfl_filt_ident_common_dtor,
00076        mbfl_filt_ident_cp932
00077 };
00078 
/*
 * Conversion table for CP932 -> wchar: names the two encoding ids, the shared
 * constructor/destructor/flush routines, and mbfl_filt_conv_cp932_wchar as the
 * per-byte filter function.
 * NOTE(review): presumably the first id is the source encoding and the second
 * the destination -- confirm against the mbfl_convert_vtbl declaration.
 */
00079 const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
00080        mbfl_no_encoding_cp932,
00081        mbfl_no_encoding_wchar,
00082        mbfl_filt_conv_common_ctor,
00083        mbfl_filt_conv_common_dtor,
00084        mbfl_filt_conv_cp932_wchar,
00085        mbfl_filt_conv_common_flush
00086 };
00087 
/*
 * Conversion table for wchar -> CP932: names the two encoding ids, the shared
 * constructor/destructor/flush routines, and mbfl_filt_conv_wchar_cp932 as the
 * per-character filter function.
 * NOTE(review): presumably the first id is the source encoding and the second
 * the destination -- confirm against the mbfl_convert_vtbl declaration.
 */
00088 const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
00089        mbfl_no_encoding_wchar,
00090        mbfl_no_encoding_cp932,
00091        mbfl_filt_conv_common_ctor,
00092        mbfl_filt_conv_common_dtor,
00093        mbfl_filt_conv_wchar_cp932,
00094        mbfl_filt_conv_common_flush
00095 };
00096 
/*
 * CK(statement): evaluate `statement` once and, if the result is negative,
 * make the *enclosing function* return -1. Only usable inside functions
 * that return int.
 */
#define CK(statement)				\
	do {					\
		if ((statement) < 0) {		\
			return (-1);		\
		}				\
	} while (0)
00098 
/*
 * SJIS_ENCODE(c1, c2, s1, s2)
 *
 * Convert a two-byte JIS-style code (c1 = high byte, c2 = low byte, both
 * 0x21-based) into the corresponding Shift_JIS byte pair, stored into the
 * lvalues s1 (lead byte) and s2 (trail byte).
 *
 *   lead  = ((c1 - 1) >> 1) + 0x71   when c1 <  0x5f
 *           ((c1 - 1) >> 1) + 0xb1   when c1 >= 0x5f
 *   trail = c2 + 0x1f (c2 < 0x60) or c2 + 0x20   when c1 is odd
 *           c2 + 0x7e                            when c1 is even
 *
 * Both inputs are read exactly once into block-local temporaries, so
 * arguments with side effects are safe and the outputs may alias the
 * inputs (e.g. SJIS_ENCODE(a, b, a, b) converts in place).
 */
#define SJIS_ENCODE(c1,c2,s1,s2)					\
		do {							\
			int sjis_enc_hi_ = (c1);			\
			int sjis_enc_lo_ = (c2);			\
			(s1) = ((sjis_enc_hi_ - 1) >> 1)		\
				+ ((sjis_enc_hi_ < 0x5f) ? 0x71 : 0xb1);	\
			if (sjis_enc_hi_ & 1) {				\
				/* odd row: trail bytes skip 0x7f */	\
				(s2) = sjis_enc_lo_			\
					+ ((sjis_enc_lo_ < 0x60) ? 0x1f : 0x20);	\
			} else {					\
				(s2) = sjis_enc_lo_ + 0x7e;		\
			}						\
		} while (0)
00119 
/*
 * SJIS_DECODE(c1, c2, s1, s2)
 *
 * Inverse of SJIS_ENCODE: take a Shift_JIS lead byte c1 and trail byte c2
 * and store the two-byte JIS-style code into the lvalues s1 (high byte)
 * and s2 (low byte), both 0x21-based.
 *
 * Lead bytes below 0xa0 are rebased from 0x81, the others from 0xc1; each
 * lead byte covers two consecutive rows, and a trail byte of 0x9f or above
 * selects the second one.
 */
#define SJIS_DECODE(c1,c2,s1,s2)				\
		do {						\
			s1 = c1;				\
			s1 -= (s1 < 0xa0) ? 0x81 : 0xc1;	\
			s1 = (s1 << 1) + 0x21;			\
			s2 = c2;				\
			if (s2 >= 0x9f) {			\
				s1++;				\
				s2 -= 0x7e;			\
			} else if (s2 < 0x7f) {			\
				s2 -= 0x1f;			\
			} else {				\
				s2 -= 0x20;			\
			}					\
		} while (0)
00141 
00142 
00143 /*
00144  * SJIS-win => wchar
00145  */
00146 int
00147 mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
00148 {
00149        int c1, s, s1, s2, w;
00150 
00151        switch (filter->status) {
00152        case 0:
00153               if (c >= 0 && c < 0x80) {   /* latin */
00154                      CK((*filter->output_function)(c, filter->data));
00155               } else if (c > 0xa0 && c < 0xe0) { /* kana */
00156                      CK((*filter->output_function)(0xfec0 + c, filter->data));
00157               } else if (c > 0x80 && c < 0xfd && c != 0xa0) {  /* kanji first char */
00158                      filter->status = 1;
00159                      filter->cache = c;
00160               } else {
00161                      w = c & MBFL_WCSGROUP_MASK;
00162                      w |= MBFL_WCSGROUP_THROUGH;
00163                      CK((*filter->output_function)(w, filter->data));
00164               }
00165               break;
00166 
00167        case 1:              /* kanji second char */
00168               filter->status = 0;
00169               c1 = filter->cache;
00170               if (c >= 0x40 && c <= 0xfc && c != 0x7f) {
00171                      w = 0;
00172                      SJIS_DECODE(c1, c, s1, s2);
00173                      s = (s1 - 0x21)*94 + s2 - 0x21;
00174                      if (s <= 137) {
00175                             if (s == 31) {
00176                                    w = 0xff3c;                 /* FULLWIDTH REVERSE SOLIDUS */
00177                             } else if (s == 32) {
00178                                    w = 0xff5e;                 /* FULLWIDTH TILDE */
00179                             } else if (s == 33) {
00180                                    w = 0x2225;                 /* PARALLEL TO */
00181                             } else if (s == 60) {
00182                                    w = 0xff0d;                 /* FULLWIDTH HYPHEN-MINUS */
00183                             } else if (s == 80) {
00184                                    w = 0xffe0;                 /* FULLWIDTH CENT SIGN */
00185                             } else if (s == 81) {
00186                                    w = 0xffe1;                 /* FULLWIDTH POUND SIGN */
00187                             } else if (s == 137) {
00188                                    w = 0xffe2;                 /* FULLWIDTH NOT SIGN */
00189                             }
00190                      }
00191                      if (w == 0) {
00192                             if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) {           /* vendor ext1 (13ku) */
00193                                    w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
00194                             } else if (s >= 0 && s < jisx0208_ucs_table_size) {            /* X 0208 */
00195                                    w = jisx0208_ucs_table[s];
00196                             } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) {           /* vendor ext2 (89ku - 92ku) */
00197                                    w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min];
00198                             } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {           /* vendor ext3 (115ku - 119ku) */
00199                                    w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
00200                             } else if (s >= (94*94) && s < (114*94)) {              /* user (95ku - 114ku) */
00201                                    w = s - (94*94) + 0xe000;
00202                             }
00203                      }
00204                      if (w <= 0) {
00205                             w = (s1 << 8) | s2;
00206                             w &= MBFL_WCSPLANE_MASK;
00207                             w |= MBFL_WCSPLANE_WINCP932;
00208                      }
00209                      CK((*filter->output_function)(w, filter->data));
00210               } else if ((c >= 0 && c < 0x21) || c == 0x7f) {         /* CTLs */
00211                      CK((*filter->output_function)(c, filter->data));
00212               } else {
00213                      w = (c1 << 8) | c;
00214                      w &= MBFL_WCSGROUP_MASK;
00215                      w |= MBFL_WCSGROUP_THROUGH;
00216                      CK((*filter->output_function)(w, filter->data));
00217               }
00218               break;
00219 
00220        default:
00221               filter->status = 0;
00222               break;
00223        }
00224 
00225        return c;
00226 }
00227 
00228 /*
00229  * wchar => SJIS-win
00230  */
00231 int
00232 mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
00233 {
00234        int c1, c2, s1, s2;
00235 
00236        s1 = 0;
00237        s2 = 0;
00238        if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
00239               s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
00240        } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
00241               s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
00242        } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
00243               s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
00244        } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
00245               s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
00246        } else if (c >= 0xe000 && c < (0xe000 + 20*94)) {       /* user  (95ku - 114ku) */
00247               s1 = c - 0xe000;
00248               c1 = s1/94 + 0x7f;
00249               c2 = s1%94 + 0x21;
00250               s1 = (c1 << 8) | c2;
00251               s2 = 1;
00252        }
00253        if (s1 <= 0) {
00254               c1 = c & ~MBFL_WCSPLANE_MASK;
00255               if (c1 == MBFL_WCSPLANE_WINCP932) {
00256                      s1 = c & MBFL_WCSPLANE_MASK;
00257                      s2 = 1;
00258               } else if (c1 == MBFL_WCSPLANE_JIS0208) {
00259                      s1 = c & MBFL_WCSPLANE_MASK;
00260               } else if (c1 == MBFL_WCSPLANE_JIS0212) {
00261                      s1 = c & MBFL_WCSPLANE_MASK;
00262                      s1 |= 0x8080;
00263               } else if (c == 0xa5) {            /* YEN SIGN */
00264                      s1 = 0x005c;  /* YEN SIGN */
00265               } else if (c == 0x203e) {   /* OVER LINE */
00266                      s1 = 0x007e;  /* FULLWIDTH MACRON */
00267               } else if (c == 0xff3c) {   /* FULLWIDTH REVERSE SOLIDUS */
00268                      s1 = 0x2140;
00269               } else if (c == 0xff5e) {   /* FULLWIDTH TILDE */
00270                      s1 = 0x2141;
00271               } else if (c == 0x2225) {   /* PARALLEL TO */
00272                      s1 = 0x2142;
00273               } else if (c == 0xff0d) {   /* FULLWIDTH HYPHEN-MINUS */
00274                      s1 = 0x215d;
00275               } else if (c == 0xffe0) {   /* FULLWIDTH CENT SIGN */
00276                      s1 = 0x2171;
00277               } else if (c == 0xffe1) {   /* FULLWIDTH POUND SIGN */
00278                      s1 = 0x2172;
00279               } else if (c == 0xffe2) {   /* FULLWIDTH NOT SIGN */
00280                      s1 = 0x224c;
00281               }
00282        }
00283        if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) {    /* not found or X 0212 */
00284               s1 = -1;
00285               c1 = 0;
00286               c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
00287               while (c1 < c2) {           /* CP932 vendor ext1 (13ku) */
00288                      if (c == cp932ext1_ucs_table[c1]) {
00289                             s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
00290                             break;
00291                      }
00292                      c1++;
00293               }
00294               if (s1 <= 0) {
00295                      c1 = 0;
00296                      c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
00297                      while (c1 < c2) {           /* CP932 vendor ext3 (115ku - 119ku) */
00298                             if (c == cp932ext3_ucs_table[c1]) {
00299                                    s1 = ((c1/94 + 0x93) << 8) + (c1%94 + 0x21);
00300                                    break;
00301                             }
00302                             c1++;
00303                      }
00304               }
00305               if (c == 0) {
00306                      s1 = 0;
00307               } else if (s1 <= 0) {
00308                      s1 = -1;
00309               }
00310        }
00311        if (s1 >= 0) {
00312               if (s1 < 0x100) { /* latin or kana */
00313                      CK((*filter->output_function)(s1, filter->data));
00314               } else { /* kanji */
00315                      c1 = (s1 >> 8) & 0xff;
00316                      c2 = s1 & 0xff;
00317                      SJIS_ENCODE(c1, c2, s1, s2);
00318                      CK((*filter->output_function)(s1, filter->data));
00319                      CK((*filter->output_function)(s2, filter->data));
00320               }
00321        } else {
00322               if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00323                      CK(mbfl_filt_conv_illegal_output(c, filter));
00324               }
00325        }
00326 
00327        return c;
00328 }
00329 
00330 static int mbfl_filt_ident_cp932(int c, mbfl_identify_filter *filter)
00331 {
00332        if (filter->status) {              /* kanji second char */
00333               if (c < 0x40 || c > 0xfc || c == 0x7f) {  /* bad */
00334                   filter->flag = 1;
00335               }
00336               filter->status = 0;
00337        } else if (c >= 0 && c < 0x80) {   /* latin  ok */
00338               ;
00339        } else if (c > 0xa0 && c < 0xe0) { /* kana  ok */
00340               ;
00341        } else if (c > 0x80 && c < 0xfd && c != 0xa0) {  /* kanji first char */
00342               filter->status = 1;
00343        } else {                                                /* bad */
00344               filter->flag = 1;
00345        }
00346 
00347        return c;
00348 }
00349 
00350