Back to index

php5  5.3.10
utf16_be.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   utf16_be.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Surrogate classification on the FIRST (most significant) byte of a
 * UTF-16BE code unit:
 *   0xd8..0xdb -> high (leading) surrogate, i.e. U+D800..U+DBFF
 *   0xdc..0xdf -> low (trailing) surrogate, i.e. U+DC00..U+DFFF
 *
 * The argument is parenthesized so that expression arguments (e.g. `a | b`)
 * are compared as a whole.  Note that the argument is still evaluated twice;
 * do not pass expressions with side effects.
 */
#define UTF16_IS_SURROGATE_FIRST(c)    ((c) >= 0xd8 && (c) <= 0xdb)
#define UTF16_IS_SURROGATE_SECOND(c)   ((c) >= 0xdc && (c) <= 0xdf)
00034 
/*
 * Byte length of a UTF-16BE character, indexed by its first byte.
 * Every entry is 2 except 0xd8..0xdb, which are 4.
 */
static const int EncLen_UTF16[] = {
  /* 0x00 - 0x0f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 - 0x1f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 - 0x2f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 - 0x3f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 - 0x4f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 - 0x5f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 - 0x6f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 - 0x7f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 - 0x8f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 - 0x9f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 - 0xaf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 - 0xbf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 - 0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 - 0xff */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00053 
00054 static int
00055 utf16be_mbc_enc_len(const UChar* p)
00056 {
00057   return EncLen_UTF16[*p];
00058 }
00059 
00060 static int
00061 utf16be_is_mbc_newline(const UChar* p, const UChar* end)
00062 {
00063   if (p + 1 < end) {
00064     if (*(p+1) == 0x0a && *p == 0x00)
00065       return 1;
00066 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00067     if ((*(p+1) == 0x0d || *(p+1) == 0x85) && *p == 0x00)
00068       return 1;
00069     if (*p == 0x20 && (*(p+1) == 0x29 || *(p+1) == 0x28))
00070       return 1;
00071 #endif
00072   }
00073   return 0;
00074 }
00075 
00076 static OnigCodePoint
00077 utf16be_mbc_to_code(const UChar* p, const UChar* end)
00078 {
00079   OnigCodePoint code;
00080 
00081   if (UTF16_IS_SURROGATE_FIRST(*p)) {
00082     code = ((((p[0] - 0xd8) << 2) + ((p[1] & 0xc0) >> 6) + 1) << 16)
00083          + ((((p[1] & 0x3f) << 2) + (p[2] - 0xdc)) << 8)
00084          + p[3];
00085   }
00086   else {
00087     code = p[0] * 256 + p[1];
00088   }
00089   return code;
00090 }
00091 
00092 static int
00093 utf16be_code_to_mbclen(OnigCodePoint code)
00094 {
00095   return (code > 0xffff ? 4 : 2);
00096 }
00097 
00098 static int
00099 utf16be_code_to_mbc(OnigCodePoint code, UChar *buf)
00100 {
00101   UChar* p = buf;
00102 
00103   if (code > 0xffff) {
00104     unsigned int plane, high;
00105 
00106     plane = code >> 16;
00107     *p++ = (plane >> 2) + 0xd8;
00108     high = (code & 0xff00) >> 8;
00109     *p++ = ((plane & 0x03) << 6) + (high >> 2);
00110     *p++ = (high & 0x02) + 0xdc;
00111     *p   = (UChar )(code & 0xff);
00112     return 4;
00113   }
00114   else {
00115     *p++ = (UChar )((code & 0xff00) >> 8);
00116     *p++ = (UChar )(code & 0xff);
00117     return 2;
00118   }
00119 }
00120 
00121 static int
00122 utf16be_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
00123                          UChar* lower)
00124 {
00125   const UChar* p = *pp;
00126 
00127   if (*p == 0) {
00128     p++;
00129     *lower++ = '\0';
00130     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00131         ONIGENC_IS_MBC_ASCII(p)) ||
00132        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00133         !ONIGENC_IS_MBC_ASCII(p))) {
00134       *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
00135     }
00136     else {
00137       *lower = *p;
00138     }
00139 
00140     (*pp) += 2;
00141     return 2;  /* return byte length of converted char to lower */
00142   }
00143   else {
00144     int len;
00145     len = EncLen_UTF16[*p];
00146     if (lower != p) {
00147       int i;
00148       for (i = 0; i < len; i++) {
00149        *lower++ = *p++;
00150       }
00151     }
00152     (*pp) += len;
00153     return len; /* return byte length of converted char to lower */
00154   }
00155 }
00156 
00157 static int
00158 utf16be_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
00159 {
00160   const UChar* p = *pp;
00161 
00162   (*pp) += EncLen_UTF16[*p];
00163 
00164   if (*p == 0) {
00165     int c, v;
00166 
00167     p++;
00168     if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00169         ONIGENC_IS_MBC_ASCII(p)) ||
00170        ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00171         !ONIGENC_IS_MBC_ASCII(p))) {
00172       c = *p;
00173       v = ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(c,
00174              (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
00175 
00176       if ((v | ONIGENC_CTYPE_LOWER) != 0) {
00177         /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
00178         if (c >= 0xaa && c <= 0xba)
00179           return FALSE;
00180         else
00181           return TRUE;
00182       }
00183       return (v != 0 ? TRUE : FALSE);
00184     }
00185   }
00186 
00187   return FALSE;
00188 }
00189 
00190 static UChar*
00191 utf16be_left_adjust_char_head(const UChar* start, const UChar* s)
00192 {
00193   if (s <= start) return (UChar* )s;
00194 
00195   if ((s - start) % 2 == 1) {
00196     s--;
00197   }
00198 
00199   if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
00200     s -= 2;
00201 
00202   return (UChar* )s;
00203 }
00204 
00205 OnigEncodingType OnigEncodingUTF16_BE = {
00206   utf16be_mbc_enc_len,
00207   "UTF-16BE",   /* name */
00208   4,            /* max byte length */
00209   2,            /* min byte length */
00210   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
00211    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
00212   {
00213       (OnigCodePoint )'\\'                       /* esc */
00214     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00215     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00216     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00217     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00218     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00219   },
00220   utf16be_is_mbc_newline,
00221   utf16be_mbc_to_code,
00222   utf16be_code_to_mbclen,
00223   utf16be_code_to_mbc,
00224   utf16be_mbc_to_normalize,
00225   utf16be_is_mbc_ambiguous,
00226   onigenc_iso_8859_1_get_all_pair_ambig_codes,
00227   onigenc_ess_tsett_get_all_comp_ambig_codes,
00228   onigenc_unicode_is_code_ctype,
00229   onigenc_unicode_get_ctype_code_range,
00230   utf16be_left_adjust_char_head,
00231   onigenc_always_false_is_allowed_reverse_match
00232 };