Back to index

php5  5.3.10
utf16_le.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   utf16_le.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Surrogate classification, applied to the HIGH-order byte of a 16-bit
 * UTF-16 code unit (for UTF-16LE that is the second byte in memory):
 *   0xd8..0xdb  -> first (lead) unit of a surrogate pair
 *   0xdc..0xdf  -> second (trail) unit of a surrogate pair
 *
 * The argument is parenthesized so that expressions such as `x & 0xff`
 * expand with the intended precedence.  It is still evaluated twice, so do
 * not pass an expression with side effects.
 */
#define UTF16_IS_SURROGATE_FIRST(c)    ((c) >= 0xd8 && (c) <= 0xdb)
#define UTF16_IS_SURROGATE_SECOND(c)   ((c) >= 0xdc && (c) <= 0xdf)
00034 
/*
 * Byte length of a UTF-16 character, indexed by the high-order byte of its
 * first 16-bit unit.  Only the lead-surrogate bytes 0xd8..0xdb map to 4;
 * every other byte value maps to 2.
 */
static const int EncLen_UTF16[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2,  4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2
};
00053 
00054 static int
00055 utf16le_code_to_mbclen(OnigCodePoint code)
00056 {
00057   return (code > 0xffff ? 4 : 2);
00058 }
00059 
00060 static int
00061 utf16le_mbc_enc_len(const UChar* p)
00062 {
00063   return EncLen_UTF16[*(p+1)];
00064 }
00065 
00066 static int
00067 utf16le_is_mbc_newline(const UChar* p, const UChar* end)
00068 {
00069   if (p + 1 < end) {
00070     if (*p == 0x0a && *(p+1) == 0x00)
00071       return 1;
00072 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00073     if ((*p == 0x0d || *p == 0x85) && *(p+1) == 0x00)
00074       return 1;
00075     if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
00076       return 1;
00077 #endif
00078   }
00079   return 0;
00080 }
00081 
00082 static OnigCodePoint
00083 utf16le_mbc_to_code(const UChar* p, const UChar* end)
00084 {
00085   OnigCodePoint code;
00086   UChar c0 = *p;
00087   UChar c1 = *(p+1);
00088 
00089   if (UTF16_IS_SURROGATE_FIRST(c1)) {
00090     code = ((((c1 - 0xd8) << 2) + ((c0  & 0xc0) >> 6) + 1) << 16)
00091          + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)
00092          + p[2];
00093   }
00094   else {
00095     code = c1 * 256 + p[0];
00096   }
00097   return code;
00098 }
00099 
00100 static int
00101 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)
00102 {
00103   UChar* p = buf;
00104 
00105   if (code > 0xffff) {
00106     unsigned int plane, high;
00107 
00108     plane = code >> 16;
00109     high = (code & 0xff00) >> 8;
00110 
00111     *p++ = ((plane & 0x03) << 6) + (high >> 2);
00112     *p++ = (plane >> 2) + 0xd8;
00113     *p++ = (UChar )(code & 0xff);
00114     *p   = (high & 0x02) + 0xdc;
00115     return 4;
00116   }
00117   else {
00118     *p++ = (UChar )(code & 0xff);
00119     *p++ = (UChar )((code & 0xff00) >> 8);
00120     return 2;
00121   }
00122 }
00123 
00124 static int
00125 utf16le_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
00126                          UChar* lower)
00127 {
00128   const UChar* p = *pp;
00129 
00130   if (*(p+1) == 0) {
00131     *(lower+1) = '\0';
00132     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00133         ONIGENC_IS_MBC_ASCII(p)) ||
00134        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00135         !ONIGENC_IS_MBC_ASCII(p))) {
00136       *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
00137     }
00138     else {
00139       *lower = *p;
00140     }
00141     (*pp) += 2;
00142     return 2;  /* return byte length of converted char to lower */
00143   }
00144   else {
00145     int len = EncLen_UTF16[*(p+1)];
00146     if (lower != p) {
00147       int i;
00148       for (i = 0; i < len; i++) {
00149        *lower++ = *p++;
00150       }
00151     }
00152     (*pp) += len;
00153     return len; /* return byte length of converted char to lower */
00154   }
00155 }
00156 
00157 static int
00158 utf16le_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
00159 {
00160   const UChar* p = *pp;
00161 
00162   (*pp) += EncLen_UTF16[*(p+1)];
00163 
00164   if (*(p+1) == 0) {
00165     int c, v;
00166 
00167     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00168         ONIGENC_IS_MBC_ASCII(p)) ||
00169        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00170         !ONIGENC_IS_MBC_ASCII(p))) {
00171       c = *p;
00172       v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c,
00173                        (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
00174       if ((v | ONIGENC_CTYPE_LOWER) != 0) {
00175         /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
00176         if (c >= 0xaa && c <= 0xba)
00177           return FALSE;
00178         else
00179           return TRUE;
00180       }
00181       return (v != 0 ? TRUE : FALSE);
00182     }
00183   }
00184 
00185   return FALSE;
00186 }
00187 
00188 static UChar*
00189 utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
00190 {
00191   if (s <= start) return (UChar* )s;
00192 
00193   if ((s - start) % 2 == 1) {
00194     s--;
00195   }
00196 
00197   if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
00198     s -= 2;
00199 
00200   return (UChar* )s;
00201 }
00202 
00203 OnigEncodingType OnigEncodingUTF16_LE = {
00204   utf16le_mbc_enc_len,
00205   "UTF-16LE",   /* name */
00206   4,            /* max byte length */
00207   2,            /* min byte length */
00208   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
00209    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
00210   {
00211       (OnigCodePoint )'\\'                       /* esc */
00212     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00213     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00214     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00215     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00216     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00217   },
00218   utf16le_is_mbc_newline,
00219   utf16le_mbc_to_code,
00220   utf16le_code_to_mbclen,
00221   utf16le_code_to_mbc,
00222   utf16le_mbc_to_normalize,
00223   utf16le_is_mbc_ambiguous,
00224   onigenc_iso_8859_1_get_all_pair_ambig_codes,
00225   onigenc_ess_tsett_get_all_comp_ambig_codes,
00226   onigenc_unicode_is_code_ctype,
00227   onigenc_unicode_get_ctype_code_range,
00228   utf16le_left_adjust_char_head,
00229   onigenc_always_false_is_allowed_reverse_match
00230 };