Back to index

php5  5.3.10
iso8859_1.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   iso8859_1.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2006  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Evaluates to 1 when the EncISO_8859_1_CtypeTable entry for `code` has at
 * least one of the bits in `ctype` set, and to 0 otherwise.
 *
 * `code` is used directly as the table index, so the caller must keep it in
 * the range 0..255.  Both arguments are parenthesized so that compound
 * expressions (for example `A | B` as `ctype`) expand with the intended
 * precedence instead of binding to the surrounding `&`.
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  ((EncISO_8859_1_CtypeTable[(code)] & (ctype)) != 0)
00034 
/*
 * Character-type bit masks for ISO-8859-1, one 16-bit entry per byte value.
 * The table is indexed directly by the byte (0x00..0xff); each entry is
 * tested against ctype bit masks (see ENC_IS_ISO_8859_1_CTYPE and the
 * ONIGENC_CTYPE_UPPER / ONIGENC_CTYPE_LOWER test in
 * iso_8859_1_is_mbc_ambiguous).  The trailing comment on each row gives the
 * range of byte values that row covers.
 */
static const unsigned short EncISO_8859_1_CtypeTable[256] = {
  /* ---- 0x00 - 0x7f: ASCII half ---- */
  0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, /* 00-07 */
  0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, /* 08-0f */
  0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, /* 10-17 */
  0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, /* 18-1f */
  0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, /* 20-27 */
  0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, /* 28-2f */
  0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, /* 30-37 */
  0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, /* 38-3f */
  0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, /* 40-47 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 48-4f */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 50-57 */
  0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, /* 58-5f */
  0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, /* 60-67 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 68-6f */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 70-77 */
  0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, /* 78-7f */

  /* ---- 0x80 - 0xff: upper half ---- */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 80-87 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 88-8f */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 90-97 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 98-9f */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, /* a0-a7 */
  0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, /* a8-af */
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, /* b0-b7 */
  0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, /* b8-bf */
  0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, /* c0-c7 */
  0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, /* c8-cf */
  0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, /* d0-d7 */
  0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, /* d8-df */
  0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, /* e0-e7 */
  0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, /* e8-ef */
  0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, /* f0-f7 */
  0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2  /* f8-ff */
};
00069 
00070 static int
00071 iso_8859_1_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* lower)
00072 {
00073   const UChar* p = *pp;
00074 
00075   if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00076        ONIGENC_IS_MBC_ASCII(p)) ||
00077       ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00078        !ONIGENC_IS_MBC_ASCII(p))) {
00079     *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
00080   }
00081   else {
00082     *lower = *p;
00083   }
00084   (*pp)++;
00085   return 1; /* return byte length of converted char to lower */
00086 }
00087 
00088 static int
00089 iso_8859_1_is_mbc_ambiguous(OnigAmbigType flag,
00090                          const UChar** pp, const UChar* end)
00091 {
00092   const UChar* p = *pp;
00093 
00094   (*pp)++;
00095   if (((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 &&
00096        ONIGENC_IS_MBC_ASCII(p)) ||
00097       ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0 &&
00098        !ONIGENC_IS_MBC_ASCII(p))) {
00099     int v = (EncISO_8859_1_CtypeTable[*p] &
00100              (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER));
00101 
00102     if ((v | ONIGENC_CTYPE_LOWER) != 0) {
00103       /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
00104       if (*p == 0xdf || (*p >= 0xaa && *p <= 0xba))
00105         return FALSE;
00106       else
00107         return TRUE;
00108     }
00109 
00110     return (v != 0 ? TRUE : FALSE);
00111   }
00112   return FALSE;
00113 }
00114 
00115 static int
00116 iso_8859_1_is_code_ctype(OnigCodePoint code, unsigned int ctype)
00117 {
00118   if (code < 256)
00119     return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
00120   else
00121     return FALSE;
00122 }
00123 
00124 OnigEncodingType OnigEncodingISO_8859_1 = {
00125   onigenc_single_byte_mbc_enc_len,
00126   "ISO-8859-1",  /* name */
00127   1,             /* max enc length */
00128   1,             /* min enc length */
00129   (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE |
00130    ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE ),
00131   {
00132       (OnigCodePoint )'\\'                       /* esc */
00133     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00134     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00135     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00136     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00137     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00138   },
00139   onigenc_is_mbc_newline_0x0a,
00140   onigenc_single_byte_mbc_to_code,
00141   onigenc_single_byte_code_to_mbclen,
00142   onigenc_single_byte_code_to_mbc,
00143   iso_8859_1_mbc_to_normalize,
00144   iso_8859_1_is_mbc_ambiguous,
00145   onigenc_iso_8859_1_get_all_pair_ambig_codes,
00146   onigenc_ess_tsett_get_all_comp_ambig_codes,
00147   iso_8859_1_is_code_ctype,
00148   onigenc_not_support_get_ctype_code_range,
00149   onigenc_single_byte_left_adjust_char_head,
00150   onigenc_always_true_is_allowed_reverse_match
00151 };