Back to index

php5  5.3.10
mbfl_convert.c
Go to the documentation of this file.
00001 /*
00002  * "streamable kanji code filter and converter"
00003  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
00004  *
00005  * LICENSE NOTICES
00006  *
00007  * This file is part of "streamable kanji code filter and converter",
00008  * which is distributed under the terms of GNU Lesser General Public 
00009  * License (version 2) as published by the Free Software Foundation.
00010  *
00011  * This software is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with "streamable kanji code filter and converter";
00018  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
00019  * Suite 330, Boston, MA  02111-1307  USA
00020  *
00021  * The author of this file:
00022  *
00023  */
00024 /*
00025  * The source code included in this file was separated from mbfilter.c
00026  * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
00027  * mbfilter.c is included in this package.
00028  *
00029  */
00030 
00031 #ifdef HAVE_CONFIG_H
00032 #include "config.h"
00033 #endif
00034 
00035 #ifdef HAVE_STDDEF_H
00036 #include <stddef.h>
00037 #endif
00038 
00039 #include "mbfl_encoding.h"
00040 #include "mbfl_allocators.h"
00041 #include "mbfl_filter_output.h"
00042 #include "mbfilter_pass.h"
00043 #include "mbfilter_8bit.h"
00044 #include "mbfilter_wchar.h"
00045 
00046 #include "filters/mbfilter_euc_cn.h"
00047 #include "filters/mbfilter_hz.h"
00048 #include "filters/mbfilter_euc_tw.h"
00049 #include "filters/mbfilter_big5.h"
00050 #include "filters/mbfilter_uhc.h"
00051 #include "filters/mbfilter_euc_kr.h"
00052 #include "filters/mbfilter_iso2022_kr.h"
00053 #include "filters/mbfilter_sjis.h"
00054 #include "filters/mbfilter_sjis_open.h"
00055 #include "filters/mbfilter_cp51932.h"
00056 #include "filters/mbfilter_jis.h"
00057 #include "filters/mbfilter_iso2022_jp_ms.h"
00058 #include "filters/mbfilter_euc_jp.h"
00059 #include "filters/mbfilter_euc_jp_win.h"
00060 #include "filters/mbfilter_ascii.h"
00061 #include "filters/mbfilter_koi8r.h"
00062 #include "filters/mbfilter_koi8u.h"
00063 #include "filters/mbfilter_cp866.h"
00064 #include "filters/mbfilter_cp932.h"
00065 #include "filters/mbfilter_cp936.h"
00066 #include "filters/mbfilter_cp1251.h"
00067 #include "filters/mbfilter_cp1252.h"
00068 #include "filters/mbfilter_cp1254.h"
00069 #include "filters/mbfilter_cp5022x.h"
00070 #include "filters/mbfilter_iso8859_1.h"
00071 #include "filters/mbfilter_iso8859_2.h"
00072 #include "filters/mbfilter_iso8859_3.h"
00073 #include "filters/mbfilter_iso8859_4.h"
00074 #include "filters/mbfilter_iso8859_5.h"
00075 #include "filters/mbfilter_iso8859_6.h"
00076 #include "filters/mbfilter_iso8859_7.h"
00077 #include "filters/mbfilter_iso8859_8.h"
00078 #include "filters/mbfilter_iso8859_9.h"
00079 #include "filters/mbfilter_iso8859_10.h"
00080 #include "filters/mbfilter_iso8859_13.h"
00081 #include "filters/mbfilter_iso8859_14.h"
00082 #include "filters/mbfilter_iso8859_15.h"
00083 #include "filters/mbfilter_base64.h"
00084 #include "filters/mbfilter_qprint.h"
00085 #include "filters/mbfilter_uuencode.h"
00086 #include "filters/mbfilter_7bit.h"
00087 #include "filters/mbfilter_utf7.h"
00088 #include "filters/mbfilter_utf7imap.h"
00089 #include "filters/mbfilter_utf8.h"
00090 #include "filters/mbfilter_utf16.h"
00091 #include "filters/mbfilter_utf32.h"
00092 #include "filters/mbfilter_byte2.h"
00093 #include "filters/mbfilter_byte4.h"
00094 #include "filters/mbfilter_ucs4.h"
00095 #include "filters/mbfilter_ucs2.h"
00096 #include "filters/mbfilter_htmlent.h"
00097 #include "filters/mbfilter_armscii8.h"
00098 #include "filters/mbfilter_cp850.h"
00099 
/*
 * Hex digit lookup table: index 0..15 yields the character code of the
 * corresponding upper-case hexadecimal digit ("0123456789ABCDEF").
 * The values are spelled as numeric codes rather than character literals.
 * The table is only ever read, hence const.
 */
static const char mbfl_hexchar_table[] = {
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 0-7 */
	0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46    /* 8-9, A-F */
};
00104 
00105 const struct mbfl_convert_vtbl *mbfl_convert_filter_list[] = {
00106        &vtbl_utf8_wchar,
00107        &vtbl_wchar_utf8,
00108        &vtbl_eucjp_wchar,
00109        &vtbl_wchar_eucjp,
00110        &vtbl_sjis_wchar,
00111        &vtbl_wchar_sjis,
00112        &vtbl_sjis_open_wchar,
00113        &vtbl_wchar_sjis_open,
00114        &vtbl_cp51932_wchar,
00115        &vtbl_wchar_cp51932,
00116        &vtbl_jis_wchar,
00117        &vtbl_wchar_jis,
00118        &vtbl_jis_ms_wchar,
00119        &vtbl_wchar_jis_ms,
00120        &vtbl_2022jp_wchar,
00121        &vtbl_wchar_2022jp,
00122        &vtbl_2022jpms_wchar,
00123        &vtbl_wchar_2022jpms,
00124        &vtbl_eucjpwin_wchar,
00125        &vtbl_wchar_eucjpwin,
00126        &vtbl_cp932_wchar,
00127        &vtbl_wchar_cp932,
00128        &vtbl_euccn_wchar,
00129        &vtbl_wchar_euccn,
00130        &vtbl_cp936_wchar,
00131        &vtbl_wchar_cp936,
00132        &vtbl_hz_wchar,
00133        &vtbl_wchar_hz,
00134        &vtbl_euctw_wchar,
00135        &vtbl_wchar_euctw,
00136        &vtbl_big5_wchar,
00137        &vtbl_wchar_big5,
00138        &vtbl_euckr_wchar,
00139        &vtbl_wchar_euckr,
00140        &vtbl_uhc_wchar,
00141        &vtbl_wchar_uhc,
00142        &vtbl_2022kr_wchar,
00143        &vtbl_wchar_2022kr,
00144        &vtbl_cp1251_wchar,
00145        &vtbl_wchar_cp1251,
00146        &vtbl_cp866_wchar,
00147        &vtbl_wchar_cp866,
00148        &vtbl_koi8r_wchar,
00149        &vtbl_wchar_koi8r,
00150        &vtbl_koi8u_wchar,
00151        &vtbl_wchar_koi8u,
00152        &vtbl_cp1252_wchar,
00153        &vtbl_wchar_cp1252,
00154        &vtbl_cp1254_wchar,
00155        &vtbl_wchar_cp1254,
00156        &vtbl_cp50220_wchar,
00157        &vtbl_wchar_cp50220,
00158        &vtbl_cp50220raw_wchar,
00159        &vtbl_wchar_cp50220raw,
00160        &vtbl_cp50221_wchar,
00161        &vtbl_wchar_cp50221,
00162        &vtbl_cp50222_wchar,
00163        &vtbl_wchar_cp50222,
00164        &vtbl_ascii_wchar,
00165        &vtbl_wchar_ascii,
00166        &vtbl_8859_1_wchar,
00167        &vtbl_wchar_8859_1,
00168        &vtbl_8859_2_wchar,
00169        &vtbl_wchar_8859_2,
00170        &vtbl_8859_3_wchar,
00171        &vtbl_wchar_8859_3,
00172        &vtbl_8859_4_wchar,
00173        &vtbl_wchar_8859_4,
00174        &vtbl_8859_5_wchar,
00175        &vtbl_wchar_8859_5,
00176        &vtbl_8859_6_wchar,
00177        &vtbl_wchar_8859_6,
00178        &vtbl_8859_7_wchar,
00179        &vtbl_wchar_8859_7,
00180        &vtbl_8859_8_wchar,
00181        &vtbl_wchar_8859_8,
00182        &vtbl_8859_9_wchar,
00183        &vtbl_wchar_8859_9,
00184        &vtbl_8859_10_wchar,
00185        &vtbl_wchar_8859_10,
00186        &vtbl_8859_13_wchar,
00187        &vtbl_wchar_8859_13,
00188        &vtbl_8859_14_wchar,
00189        &vtbl_wchar_8859_14,
00190        &vtbl_8859_15_wchar,
00191        &vtbl_wchar_8859_15,
00192        &vtbl_8bit_b64,
00193        &vtbl_b64_8bit,
00194        &vtbl_uuencode_8bit,
00195        &vtbl_wchar_html,
00196        &vtbl_html_wchar,
00197        &vtbl_8bit_qprint,
00198        &vtbl_qprint_8bit,
00199        &vtbl_8bit_7bit,
00200        &vtbl_7bit_8bit,
00201        &vtbl_utf7_wchar,
00202        &vtbl_wchar_utf7,
00203        &vtbl_utf7imap_wchar,
00204        &vtbl_wchar_utf7imap,
00205        &vtbl_utf16_wchar,
00206        &vtbl_wchar_utf16,
00207        &vtbl_utf16be_wchar,
00208        &vtbl_wchar_utf16be,
00209        &vtbl_utf16le_wchar,
00210        &vtbl_wchar_utf16le,
00211        &vtbl_utf32_wchar,
00212        &vtbl_wchar_utf32,
00213        &vtbl_utf32be_wchar,
00214        &vtbl_wchar_utf32be,
00215        &vtbl_utf32le_wchar,
00216        &vtbl_wchar_utf32le,
00217        &vtbl_ucs4_wchar,
00218        &vtbl_wchar_ucs4,
00219        &vtbl_ucs4be_wchar,
00220        &vtbl_wchar_ucs4be,
00221        &vtbl_ucs4le_wchar,
00222        &vtbl_wchar_ucs4le,
00223        &vtbl_ucs2_wchar,
00224        &vtbl_wchar_ucs2,
00225        &vtbl_ucs2be_wchar,
00226        &vtbl_wchar_ucs2be,
00227        &vtbl_ucs2le_wchar,
00228        &vtbl_wchar_ucs2le,
00229        &vtbl_byte4be_wchar,
00230        &vtbl_wchar_byte4be,
00231        &vtbl_byte4le_wchar,
00232        &vtbl_wchar_byte4le,
00233        &vtbl_byte2be_wchar,
00234        &vtbl_wchar_byte2be,
00235        &vtbl_byte2le_wchar,
00236        &vtbl_wchar_byte2le,
00237        &vtbl_armscii8_wchar,
00238        &vtbl_wchar_armscii8,
00239        &vtbl_cp850_wchar,
00240        &vtbl_wchar_cp850,
00241        &vtbl_pass,
00242        NULL
00243 };
00244 
00245 static int
00246 mbfl_convert_filter_common_init(
00247        mbfl_convert_filter *filter,
00248        enum mbfl_no_encoding from,
00249        enum mbfl_no_encoding to,
00250        const struct mbfl_convert_vtbl *vtbl,
00251     int (*output_function)(int, void* ),
00252     int (*flush_function)(void*),
00253     void* data)
00254 {
00255        /* encoding structure */
00256        if ((filter->from = mbfl_no2encoding(from)) == NULL) {
00257               return 1;
00258        }
00259 
00260        if ((filter->to = mbfl_no2encoding(to)) == NULL) {
00261               return 1;
00262        }
00263 
00264        if (output_function != NULL) {
00265               filter->output_function = output_function;
00266        } else {
00267               filter->output_function = mbfl_filter_output_null;
00268        }
00269 
00270        filter->flush_function = flush_function;
00271        filter->data = data;
00272        filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
00273        filter->illegal_substchar = 0x3f;         /* '?' */
00274        filter->num_illegalchar = 0;
00275        filter->filter_ctor = vtbl->filter_ctor;
00276        filter->filter_dtor = vtbl->filter_dtor;
00277        filter->filter_function = vtbl->filter_function;
00278        filter->filter_flush = vtbl->filter_flush;
00279        filter->filter_copy = vtbl->filter_copy;
00280 
00281        (*filter->filter_ctor)(filter);
00282 
00283        return 0;
00284 }
00285 
00286 
00287 mbfl_convert_filter *
00288 mbfl_convert_filter_new(
00289     enum mbfl_no_encoding from,
00290     enum mbfl_no_encoding to,
00291     int (*output_function)(int, void* ),
00292     int (*flush_function)(void*),
00293     void* data)
00294 {
00295        mbfl_convert_filter * filter;
00296        const struct mbfl_convert_vtbl *vtbl;
00297 
00298        vtbl = mbfl_convert_filter_get_vtbl(from, to);
00299 
00300        if (vtbl == NULL) {
00301               vtbl = &vtbl_pass;
00302        }
00303 
00304        /* allocate */
00305        filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
00306        if (filter == NULL) {
00307               return NULL;
00308        }
00309 
00310        if (mbfl_convert_filter_common_init(filter, from, to, vtbl,
00311                      output_function, flush_function, data)) {
00312               mbfl_free(filter);
00313               return NULL;
00314        }
00315 
00316        return filter;
00317 }
00318 
00319 mbfl_convert_filter *
00320 mbfl_convert_filter_new2(
00321        const struct mbfl_convert_vtbl *vtbl,
00322     int (*output_function)(int, void* ),
00323     int (*flush_function)(void*),
00324     void* data)
00325 {
00326        mbfl_convert_filter * filter;
00327 
00328        if (vtbl == NULL) {
00329               vtbl = &vtbl_pass;
00330        }
00331 
00332        /* allocate */
00333        filter = (mbfl_convert_filter *)mbfl_malloc(sizeof(mbfl_convert_filter));
00334        if (filter == NULL) {
00335               return NULL;
00336        }
00337 
00338        if (mbfl_convert_filter_common_init(filter, vtbl->from, vtbl->to, vtbl,
00339                      output_function, flush_function, data)) {
00340               mbfl_free(filter);
00341               return NULL;
00342        }
00343 
00344        return filter;
00345 }
00346 
00347 void
00348 mbfl_convert_filter_delete(mbfl_convert_filter *filter)
00349 {
00350        if (filter) {
00351               (*filter->filter_dtor)(filter);
00352               mbfl_free((void*)filter);
00353        }
00354 }
00355 
00356 int
00357 mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter)
00358 {
00359        return (*filter->filter_function)(c, filter);
00360 }
00361 
00362 int
00363 mbfl_convert_filter_flush(mbfl_convert_filter *filter)
00364 {
00365        (*filter->filter_flush)(filter);
00366        return (filter->flush_function ? (*filter->flush_function)(filter->data) : 0);
00367 }
00368 
00369 void mbfl_convert_filter_reset(mbfl_convert_filter *filter,
00370            enum mbfl_no_encoding from, enum mbfl_no_encoding to)
00371 {
00372        const struct mbfl_convert_vtbl *vtbl;
00373 
00374        /* destruct old filter */
00375        (*filter->filter_dtor)(filter);
00376 
00377        vtbl = mbfl_convert_filter_get_vtbl(from, to);
00378 
00379        if (vtbl == NULL) {
00380               vtbl = &vtbl_pass;
00381        }
00382 
00383        mbfl_convert_filter_common_init(filter, from, to, vtbl,
00384                      filter->output_function, filter->flush_function, filter->data);
00385 }
00386 
00387 void
00388 mbfl_convert_filter_copy(
00389     mbfl_convert_filter *src,
00390     mbfl_convert_filter *dest)
00391 {
00392        if (src->filter_copy != NULL) {
00393               src->filter_copy(src, dest);
00394               return;
00395        }
00396 
00397        *dest = *src;
00398 }
00399 
00400 int mbfl_convert_filter_devcat(mbfl_convert_filter *filter, mbfl_memory_device *src) 
00401 {
00402        int n;
00403        unsigned char *p;
00404 
00405        p = src->buffer;
00406        n = src->pos;
00407        while (n > 0) {
00408               if ((*filter->filter_function)(*p++, filter) < 0) {
00409                      return -1;
00410               }
00411               n--;
00412        }
00413 
00414        return n;
00415 }
00416 
00417 int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char *p)
00418 {
00419        int c;
00420 
00421        while ((c = *p++) != '\0') {
00422               if ((*filter->filter_function)(c, filter) < 0) {
00423                      return -1;
00424               }
00425        }
00426 
00427        return 0;
00428 }
00429 
00430 /* illegal character output function for conv-filter */
00431 int
00432 mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
00433 {
00434        int mode_backup, ret, n, m, r;
00435 
00436        ret = 0;
00437        mode_backup = filter->illegal_mode;
00438        filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
00439        switch (mode_backup) {
00440        case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
00441               ret = (*filter->filter_function)(filter->illegal_substchar, filter);
00442               break;
00443        case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
00444               if (c >= 0) {
00445                      if (c < MBFL_WCSGROUP_UCS4MAX) {   /* unicode */
00446                             ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"U+");
00447                      } else {
00448                             if (c < MBFL_WCSGROUP_WCHARMAX) {
00449                                    m = c & ~MBFL_WCSPLANE_MASK;
00450                                    switch (m) {
00451                                    case MBFL_WCSPLANE_JIS0208:
00452                                           ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS+");
00453                                           break;
00454                                    case MBFL_WCSPLANE_JIS0212:
00455                                           ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"JIS2+");
00456                                           break;
00457                                    case MBFL_WCSPLANE_WINCP932:
00458                                           ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"W932+");
00459                                           break;
00460                                    case MBFL_WCSPLANE_8859_1:
00461                                           ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"I8859_1+");
00462                                           break;
00463                                    default:
00464                                           ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"?+");
00465                                           break;
00466                                    }
00467                                    c &= MBFL_WCSPLANE_MASK;
00468                             } else {
00469                                    ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"BAD+");
00470                                    c &= MBFL_WCSGROUP_MASK;
00471                             }
00472                      }
00473                      if (ret >= 0) {
00474                             m = 0;
00475                             r = 28;
00476                             while (r >= 0) {
00477                                    n = (c >> r) & 0xf;
00478                                    if (n || m) {
00479                                           m = 1;
00480                                           ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
00481                                           if (ret < 0) {
00482                                                  break;
00483                                           }
00484                                    }
00485                                    r -= 4;
00486                             }
00487                             if (m == 0 && ret >= 0) {
00488                                    ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
00489                             }
00490                      }
00491               }
00492               break;
00493        case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
00494               if (c >= 0) {
00495                      if (c < MBFL_WCSGROUP_UCS4MAX) {   /* unicode */
00496                             ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)"&#x");
00497                             if (ret < 0)
00498                                    break;
00499 
00500                             m = 0;
00501                             r = 28;
00502                             while (r >= 0) {
00503                                    n = (c >> r) & 0xf;
00504                                    if (n || m) {
00505                                           m = 1;
00506                                           ret = (*filter->filter_function)(mbfl_hexchar_table[n], filter);
00507                                           if (ret < 0) {
00508                                                  break;
00509                                           }
00510                                    }
00511                                    r -= 4;
00512                             }
00513                             if (ret < 0) {
00514                                    break;
00515                             }
00516                             if (m == 0) {
00517                                    ret = (*filter->filter_function)(mbfl_hexchar_table[0], filter);
00518                             }
00519                             ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
00520                      } else {
00521                             ret = (*filter->filter_function)(filter->illegal_substchar, filter);
00522                      }
00523               }
00524               break;
00525        default:
00526               break;
00527        }
00528        filter->illegal_mode = mode_backup;
00529        filter->num_illegalchar++;
00530 
00531        return ret;
00532 }
00533 
00534 const struct mbfl_convert_vtbl * mbfl_convert_filter_get_vtbl(enum mbfl_no_encoding from, enum mbfl_no_encoding to)
00535 {
00536        const struct mbfl_convert_vtbl *vtbl;
00537        int i;
00538 
00539        if (to == mbfl_no_encoding_base64 ||
00540            to == mbfl_no_encoding_qprint ||
00541            to == mbfl_no_encoding_7bit) {
00542               from = mbfl_no_encoding_8bit;
00543        } else if (from == mbfl_no_encoding_base64 ||
00544                         from == mbfl_no_encoding_qprint ||
00545                         from == mbfl_no_encoding_uuencode) {
00546               to = mbfl_no_encoding_8bit;
00547        }
00548 
00549        i = 0;
00550        while ((vtbl = mbfl_convert_filter_list[i++]) != NULL){
00551               if (vtbl->from == from && vtbl->to == to) {
00552                      return vtbl;
00553               }
00554        }
00555 
00556        return NULL;
00557 }
00558 
00559 /*
00560  * commonly used constructor and destructor
00561  */
00562 void mbfl_filt_conv_common_ctor(mbfl_convert_filter *filter)
00563 {
00564        filter->status = 0;
00565        filter->cache = 0;
00566 }
00567 
00568 int mbfl_filt_conv_common_flush(mbfl_convert_filter *filter)
00569 {
00570        filter->status = 0;
00571        filter->cache = 0;
00572 
00573        if (filter->flush_function != NULL) {
00574               (*filter->flush_function)(filter->data);
00575        }
00576        return 0;
00577 }
00578 
00579 void mbfl_filt_conv_common_dtor(mbfl_convert_filter *filter)
00580 {
00581        filter->status = 0;
00582        filter->cache = 0;
00583 }
00584 
00585