Back to index

php5  5.3.10
mbfilter_utf8.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter.c
00026  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
00027  * 
00028  */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include "mbfilter.h"
00035 #include "mbfilter_utf8.h"
00036 
00037 static int mbfl_filt_ident_utf8(int c, mbfl_identify_filter *filter);
00038 
/*
 * Per-byte length table with 256 entries, one for each possible byte value.
 * The non-1 entries line up with the lead-byte ranges recognised by
 * mbfl_filt_conv_utf8_wchar() (0xc0-0xdf => 2, 0xe0-0xef => 3, ...).
 * NOTE(review): presumably indexed by the first byte of a sequence to get
 * the sequence length in bytes -- confirm against the users of
 * mbfl_encoding.
 */
static const unsigned char mblen_table_utf8[] = {
	/* 0x00 - 0x7f: every entry is 1 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 */

	/* 0x80 - 0xbf: every entry is 1 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xa0 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xb0 */

	/* 0xc0 - 0xdf: 2 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0 */

	/* 0xe0 - 0xef: 3 */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xe0 */

	/* 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6, 0xfe-0xff: 1 */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1  /* 0xf0 */
};
00057 
/* Alternative spellings of the encoding name; the list ends with NULL. */
static const char *mbfl_encoding_utf8_aliases[] = {
	"utf8",
	NULL
};
00059 
00060 const mbfl_encoding mbfl_encoding_utf8 = {
00061        mbfl_no_encoding_utf8,
00062        "UTF-8",
00063        "UTF-8",
00064        (const char *(*)[])&mbfl_encoding_utf8_aliases,
00065        mblen_table_utf8,
00066        MBFL_ENCTYPE_MBCS
00067 };
00068 
00069 const struct mbfl_identify_vtbl vtbl_identify_utf8 = {
00070        mbfl_no_encoding_utf8,
00071        mbfl_filt_ident_common_ctor,
00072        mbfl_filt_ident_common_dtor,
00073        mbfl_filt_ident_utf8
00074 };
00075 
00076 const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
00077        mbfl_no_encoding_utf8,
00078        mbfl_no_encoding_wchar,
00079        mbfl_filt_conv_common_ctor,
00080        mbfl_filt_conv_common_dtor,
00081        mbfl_filt_conv_utf8_wchar,
00082        mbfl_filt_conv_common_flush
00083 };
00084 
00085 const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
00086        mbfl_no_encoding_wchar,
00087        mbfl_no_encoding_utf8,
00088        mbfl_filt_conv_common_ctor,
00089        mbfl_filt_conv_common_dtor,
00090        mbfl_filt_conv_wchar_utf8,
00091        mbfl_filt_conv_common_flush
00092 };
00093 
/*
 * Evaluate `statement` once; if its value is negative, make the enclosing
 * function return -1.  Otherwise execution continues after the macro.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return -1; \
		} \
	} while (0)
00095 
00096 /*
00097  * UTF-8 => wchar
00098  */
00099 int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
00100 {
00101        int s;
00102 
00103        if (c < 0x80) {
00104               if (c >= 0) {
00105                      CK((*filter->output_function)(c, filter->data));
00106               }
00107               filter->status = 0;
00108        } else if (c < 0xc0) {
00109               int status = filter->status & 0xff;
00110               switch (status) {
00111               case 0x10: /* 2byte code 2nd char */
00112               case 0x21: /* 3byte code 3rd char */
00113               case 0x32: /* 4byte code 4th char */
00114               case 0x43: /* 5byte code 5th char */
00115               case 0x54: /* 6byte code 6th char */
00116                      filter->status = 0;
00117                      s = filter->cache | (c & 0x3f);
00118                      if ((status == 0x10 && s >= 0x80) ||
00119                          (status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 0xdfff)) ||
00120                          (status == 0x32 && s >= 0x10000) ||
00121                          (status == 0x43 && s >= 0x200000) ||
00122                          (status == 0x54 && s >= 0x4000000 && s < MBFL_WCSGROUP_UCS4MAX)) {
00123                             CK((*filter->output_function)(s, filter->data));
00124                      }
00125                      break;
00126               case 0x20: /* 3byte code 2nd char */
00127               case 0x31: /* 4byte code 3rd char */
00128               case 0x42: /* 5byte code 4th char */
00129               case 0x53: /* 6byte code 5th char */
00130                      filter->cache |= ((c & 0x3f) << 6);
00131                      filter->status++;
00132                      break;
00133               case 0x30: /* 4byte code 2nd char */
00134               case 0x41: /* 5byte code 3rd char */
00135               case 0x52: /* 6byte code 4th char */
00136                      filter->cache |= ((c & 0x3f) << 12);
00137                      filter->status++;
00138                      break;
00139               case 0x40: /* 5byte code 2nd char */
00140               case 0x51: /* 6byte code 3rd char */
00141                      filter->cache |= ((c & 0x3f) << 18);
00142                      filter->status++;
00143                      break;
00144               case 0x50: /* 6byte code 2nd char */
00145                      filter->cache |= ((c & 0x3f) << 24);
00146                      filter->status++;
00147                      break;
00148               default:
00149                      filter->status = 0;
00150                      break;
00151               }
00152        } else if (c < 0xe0) { /* 2byte code first char */
00153               filter->status = 0x10;
00154               filter->cache = (c & 0x1f) << 6;
00155        } else if (c < 0xf0) { /* 3byte code first char */
00156               filter->status = 0x20;
00157               filter->cache = (c & 0xf) << 12;
00158        } else if (c < 0xf8) { /* 4byte code first char */
00159               filter->status = 0x30;
00160               filter->cache = (c & 0x7) << 18;
00161        } else if (c < 0xfc) { /* 5byte code first char */
00162               filter->status = 0x40;
00163               filter->cache = (c & 0x3) << 24;
00164        } else if (c < 0xfe)  { /* 6 byte code first char */
00165               filter->status = 0x50;
00166               filter->cache = (c & 0x1) << 30;
00167        } else {
00168               filter->status = 0;
00169               filter->cache = 0;
00170        }
00171 
00172        return c;
00173 }
00174 
00175 /*
00176  * wchar => UTF-8
00177  */
00178 int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
00179 {
00180        if (c >= 0 && c < MBFL_WCSGROUP_UCS4MAX) {
00181               if (c < 0x80) {
00182                      CK((*filter->output_function)(c, filter->data));
00183               } else if (c < 0x800) {
00184                      CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data));
00185                      CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
00186               } else if (c < 0x10000) {
00187                      CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data));
00188                      CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
00189                      CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
00190               } else if (c < 0x200000) {
00191                      CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data));
00192                      CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
00193                      CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
00194                      CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
00195               } else if (c < 0x4000000) {
00196                      CK((*filter->output_function)(((c >> 24) & 0x03) | 0xf8, filter->data));
00197                      CK((*filter->output_function)(((c >> 18) & 0x3f) | 0x80, filter->data));
00198                      CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
00199                      CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
00200                      CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
00201               } else {
00202                      CK((*filter->output_function)(((c >> 30) & 0x01) | 0xfc, filter->data));
00203                      CK((*filter->output_function)(((c >> 24) & 0x3f) | 0x80, filter->data));
00204                      CK((*filter->output_function)(((c >> 18) & 0x3f) | 0x80, filter->data));
00205                      CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data));
00206                      CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data));
00207                      CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data));
00208               }
00209        } else {
00210               if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
00211                      CK(mbfl_filt_conv_illegal_output(c, filter));
00212               }
00213        }
00214 
00215        return c;
00216 }
00217 
00218 static int mbfl_filt_ident_utf8(int c, mbfl_identify_filter *filter)
00219 {
00220        if (c < 0x80) {
00221               if (c < 0) { 
00222                      filter->flag = 1;    /* bad */
00223               } else if (filter->status) {
00224                      filter->flag = 1;    /* bad */
00225               }
00226               filter->status = 0;
00227        } else if (c < 0xc0) {
00228               switch (filter->status) {
00229               case 0x20: /* 3 byte code 2nd char */
00230               case 0x30: /* 4 byte code 2nd char */
00231               case 0x31: /* 4 byte code 3rd char */
00232               case 0x40: /* 5 byte code 2nd char */
00233               case 0x41: /* 5 byte code 3rd char */
00234               case 0x42: /* 5 byte code 4th char */
00235               case 0x50: /* 6 byte code 2nd char */
00236               case 0x51: /* 6 byte code 3rd char */
00237               case 0x52: /* 6 byte code 4th char */
00238               case 0x53: /* 6 byte code 5th char */
00239                      filter->status++;
00240                      break;
00241               case 0x10: /* 2 byte code 2nd char */
00242               case 0x21: /* 3 byte code 3rd char */
00243               case 0x32: /* 4 byte code 4th char */
00244               case 0x43: /* 5 byte code 5th char */
00245               case 0x54: /* 6 byte code 6th char */
00246                      filter->status = 0;
00247                      break;
00248               default:
00249                      filter->flag = 1;    /* bad */
00250                      filter->status = 0;
00251                      break;
00252               }
00253        } else {
00254               if (filter->status) {
00255                      filter->flag = 1;    /* bad */
00256               }
00257               filter->status = 0;
00258               if (c < 0xe0) {                           /* 2 byte code first char */
00259                      filter->status = 0x10;
00260               } else if (c < 0xf0) {             /* 3 byte code 1st char */
00261                      filter->status = 0x20;
00262               } else if (c < 0xf8) {             /* 4 byte code 1st char */
00263                      filter->status = 0x30;
00264               } else if (c < 0xfc) {             /* 5 byte code 1st char */
00265                      filter->status = 0x40;
00266               } else if (c < 0xfe) {             /* 6 byte code 1st char */
00267                      filter->status = 0x50;
00268               } else {
00269                      filter->flag = 1;    /* bad */
00270               }
00271        }
00272 
00273        return c;
00274 }