Back to index

php5  5.3.10
euc_jp.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   euc_jp.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2005  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Inclusive byte range tested by eucjp_islead().  In EncLen_EUCJP every
 * byte in this range is marked as starting a two-byte character.
 */
#define EUCJP_MB_BYTE_MIN  0xa1
#define EUCJP_MB_BYTE_MAX  0xfe

/*
 * Non-zero when byte c lies OUTSIDE EUCJP_MB_BYTE_MIN..EUCJP_MB_BYTE_MAX.
 * The difference is narrowed to UChar so that values below the minimum
 * wrap around to large numbers; one comparison therefore checks both
 * bounds (assumes UChar is an unsigned 8-bit type).
 * The argument is evaluated exactly once.
 */
#define eucjp_islead(c) \
  ((UChar )((c) - EUCJP_MB_BYTE_MIN) > (EUCJP_MB_BYTE_MAX - EUCJP_MB_BYTE_MIN))
00033 
/*
 * Encoded length in bytes of a character, indexed by its first byte.
 *   0x8e        -> 2
 *   0x8f        -> 3
 *   0xa1..0xfe  -> 2
 *   all others  -> 1
 * Each row below holds 16 entries; the leading comment gives the index of
 * the first entry in that row.
 */
static const int EncLen_EUCJP[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00052 
00053 static int
00054 eucjp_mbc_enc_len(const UChar* p)
00055 {
00056   return EncLen_EUCJP[*p];
00057 }
00058 
00059 static OnigCodePoint
00060 eucjp_mbc_to_code(const UChar* p, const UChar* end)
00061 {
00062   int c, i, len;
00063   OnigCodePoint n;
00064 
00065   len = enc_len(ONIG_ENCODING_EUC_JP, p);
00066   n = (OnigCodePoint )*p++;
00067   if (len == 1) return n;
00068 
00069   for (i = 1; i < len; i++) {
00070     if (p >= end) break;
00071     c = *p++;
00072     n <<= 8;  n += c;
00073   }
00074   return n;
00075 }
00076 
00077 static int
00078 eucjp_code_to_mbclen(OnigCodePoint code)
00079 {
00080   if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00081   else if ((code & 0xff0000) != 0) return 3;
00082   else if ((code &   0xff00) != 0) return 2;
00083   else return 0;
00084 }
00085 
#if 0
/*
 * Disabled code (never compiled).
 * Returns the most significant occupied byte of code: bits 16..23 if any
 * of them is set, else bits 8..15 if any of them is set, else code itself.
 */
static int
eucjp_code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0)
    return (int )((code >> 16) & 0xff);

  if ((code & 0xff00) != 0)
    return (int )((code >> 8) & 0xff);

  return (int )code;
}
#endif
00104 
00105 static int
00106 eucjp_code_to_mbc(OnigCodePoint code, UChar *buf)
00107 {
00108   UChar *p = buf;
00109 
00110   if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00111   if ((code &   0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00112   *p++ = (UChar )(code & 0xff);
00113 
00114 #if 1
00115   if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf))
00116     return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
00117 #endif  
00118   return p - buf;
00119 }
00120 
00121 static int
00122 eucjp_mbc_to_normalize(OnigAmbigType flag,
00123                      const UChar** pp, const UChar* end, UChar* lower)
00124 {
00125   int len;
00126   const UChar* p = *pp;
00127 
00128   if (ONIGENC_IS_MBC_ASCII(p)) {
00129     if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
00130       *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00131     }
00132     else {
00133       *lower = *p;
00134     }
00135 
00136     (*pp)++;
00137     return 1;
00138   }
00139   else {
00140     len = enc_len(ONIG_ENCODING_EUC_JP, p);
00141     if (lower != p) {
00142       int i;
00143       for (i = 0; i < len; i++) {
00144        *lower++ = *p++;
00145       }
00146     }
00147     (*pp) += len;
00148     return len; /* return byte length of converted char to lower */
00149   }
00150 }
00151 
00152 static int
00153 eucjp_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
00154 {
00155   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end);
00156 }
00157 
00158 static int
00159 eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype)
00160 {
00161   if (code < 128)
00162     return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00163   else {
00164     if ((ctype & (ONIGENC_CTYPE_WORD |
00165                   ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
00166       return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE);
00167     }
00168   }
00169 
00170   return FALSE;
00171 }
00172 
00173 static UChar*
00174 eucjp_left_adjust_char_head(const UChar* start, const UChar* s)
00175 {
00176   /* In this encoding
00177      mb-trail bytes doesn't mix with single bytes.
00178   */
00179   const UChar *p;
00180   int len;
00181 
00182   if (s <= start) return (UChar* )s;
00183   p = s;
00184 
00185   while (!eucjp_islead(*p) && p > start) p--;
00186   len = enc_len(ONIG_ENCODING_EUC_JP, p);
00187   if (p + len > s) return (UChar* )p;
00188   p += len;
00189   return (UChar* )(p + ((s - p) & ~1));
00190 }
00191 
00192 static int
00193 eucjp_is_allowed_reverse_match(const UChar* s, const UChar* end)
00194 {
00195   const UChar c = *s;
00196   if (c <= 0x7e || c == 0x8e || c == 0x8f)
00197     return TRUE;
00198   else
00199     return FALSE;
00200 }
00201 
00202 OnigEncodingType OnigEncodingEUC_JP = {
00203   eucjp_mbc_enc_len,
00204   "EUC-JP",   /* name */
00205   3,          /* max enc length */
00206   1,          /* min enc length */
00207   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
00208   {
00209       (OnigCodePoint )'\\'                       /* esc */
00210     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00211     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00212     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00213     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00214     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00215   },
00216   onigenc_is_mbc_newline_0x0a,
00217   eucjp_mbc_to_code,
00218   eucjp_code_to_mbclen,
00219   eucjp_code_to_mbc,
00220   eucjp_mbc_to_normalize,
00221   eucjp_is_mbc_ambiguous,
00222   onigenc_ascii_get_all_pair_ambig_codes,
00223   onigenc_nothing_get_all_comp_ambig_codes,
00224   eucjp_is_code_ctype,
00225   onigenc_not_support_get_ctype_code_range,
00226   eucjp_left_adjust_char_head,
00227   eucjp_is_allowed_reverse_match
00228 };