Back to index

php5  5.3.10
sjis.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   sjis.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2005  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Byte length of a character, indexed by the value of its first byte.
 * Entries are 2 for 0x81-0x9f and 0xe0-0xfc, and 1 for every other byte.
 */
static const int EncLen_SJIS[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
00050 
/*
 * Flags, indexed by byte value, telling whether that byte may appear as the
 * second byte of a two-byte character.
 * Entries are 1 for 0x40-0x7e and 0x80-0xfc, and 0 for every other byte.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00069 
/* True when `byte` starts a multi-byte character (its EncLen_SJIS entry exceeds 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Nonzero when SJIS_CAN_BE_TRAIL_TABLE marks `byte` as a possible second byte. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
00072 
00073 static int
00074 sjis_mbc_enc_len(const UChar* p)
00075 {
00076   return EncLen_SJIS[*p];
00077 }
00078 
00079 static int
00080 sjis_code_to_mbclen(OnigCodePoint code)
00081 {
00082   if (code < 256) {
00083     if (EncLen_SJIS[(int )code] == 1)
00084       return 1;
00085     else
00086       return 0;
00087   }
00088   else if (code <= 0xffff) {
00089     return 2;
00090   }
00091   else
00092     return 0;
00093 }
00094 
00095 static OnigCodePoint
00096 sjis_mbc_to_code(const UChar* p, const UChar* end)
00097 {
00098   int c, i, len;
00099   OnigCodePoint n;
00100 
00101   len = enc_len(ONIG_ENCODING_SJIS, p);
00102   c = *p++;
00103   n = c;
00104   if (len == 1) return n;
00105 
00106   for (i = 1; i < len; i++) {
00107     if (p >= end) break;
00108     c = *p++;
00109     n <<= 8;  n += c;
00110   }
00111   return n;
00112 }
00113 
00114 static int
00115 sjis_code_to_mbc(OnigCodePoint code, UChar *buf)
00116 {
00117   UChar *p = buf;
00118 
00119   if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00120   *p++ = (UChar )(code & 0xff);
00121 
00122 #if 0
00123   if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf))
00124     return REGERR_INVALID_WIDE_CHAR_VALUE;
00125 #endif
00126   return p - buf;
00127 }
00128 
00129 static int
00130 sjis_mbc_to_normalize(OnigAmbigType flag,
00131                     const UChar** pp, const UChar* end, UChar* lower)
00132 {
00133   const UChar* p = *pp;
00134 
00135   if (ONIGENC_IS_MBC_ASCII(p)) {
00136     if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
00137       *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00138     }
00139     else {
00140       *lower = *p;
00141     }
00142 
00143     (*pp)++;
00144     return 1;
00145   }
00146   else {
00147     int len = enc_len(ONIG_ENCODING_SJIS, p);
00148 
00149     if (lower != p) {
00150       int i;
00151       for (i = 0; i < len; i++) {
00152        *lower++ = *p++;
00153       }
00154     }
00155     (*pp) += len;
00156     return len; /* return byte length of converted char to lower */
00157   }
00158 }
00159 
00160 static int
00161 sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
00162 {
00163   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
00164                                       
00165 }
00166 
00167 static int
00168 sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype)
00169 {
00170   if (code < 128)
00171     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00172   else {
00173     if ((ctype & (ONIGENC_CTYPE_WORD |
00174                   ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
00175       return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE);
00176     }
00177   }
00178 
00179   return FALSE;
00180 }
00181 
00182 static UChar*
00183 sjis_left_adjust_char_head(const UChar* start, const UChar* s)
00184 {
00185   const UChar *p;
00186   int len;
00187 
00188   if (s <= start) return (UChar* )s;
00189   p = s;
00190 
00191   if (SJIS_ISMB_TRAIL(*p)) {
00192     while (p > start) {
00193       if (! SJIS_ISMB_FIRST(*--p)) {
00194        p++;
00195        break;
00196       }
00197     } 
00198   }
00199   len = enc_len(ONIG_ENCODING_SJIS, p);
00200   if (p + len > s) return (UChar* )p;
00201   p += len;
00202   return (UChar* )(p + ((s - p) & ~1));
00203 }
00204 
00205 static int
00206 sjis_is_allowed_reverse_match(const UChar* s, const UChar* end)
00207 {
00208   const UChar c = *s;
00209   return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
00210 }
00211 
00212 OnigEncodingType OnigEncodingSJIS = {
00213   sjis_mbc_enc_len,
00214   "Shift_JIS",   /* name */
00215   2,             /* max byte length */
00216   1,             /* min byte length */
00217   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
00218   {
00219       (OnigCodePoint )'\\'                       /* esc */
00220     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00221     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00222     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00223     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00224     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00225   },
00226   onigenc_is_mbc_newline_0x0a,
00227   sjis_mbc_to_code,
00228   sjis_code_to_mbclen,
00229   sjis_code_to_mbc,
00230   sjis_mbc_to_normalize,
00231   sjis_is_mbc_ambiguous,
00232   onigenc_ascii_get_all_pair_ambig_codes,
00233   onigenc_nothing_get_all_comp_ambig_codes,
00234   sjis_is_code_ctype,
00235   onigenc_not_support_get_ctype_code_range,
00236   sjis_left_adjust_char_head,
00237   sjis_is_allowed_reverse_match
00238 };