Back to index

php5  5.3.10
gb18030.c
Go to the documentation of this file.
00001 /**********************************************************************
00002   gb18030.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2005  KUBO Takehiro <kubo AT jiubao DOT org>
00006  *                     K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
00007  * All rights reserved.
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  * 1. Redistributions of source code must retain the above copyright
00013  *    notice, this list of conditions and the following disclaimer.
00014  * 2. Redistributions in binary form must reproduce the above copyright
00015  *    notice, this list of conditions and the following disclaimer in the
00016  *    documentation and/or other materials provided with the distribution.
00017  *
00018  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00019  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00022  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00023  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00024  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00025  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00026  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00027  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00028  * SUCH DAMAGE.
00029  */
00030 
00031 #include "regenc.h"
00032 
/*
 * DEBUG_GB18030(arg) - optional trace hook for this file.
 *
 * `arg` is a complete, parenthesized printf argument list, so call sites
 * use double parentheses:  DEBUG_GB18030(("state %d\n", st));
 *
 * With the condition below left at 1 the macro expands to nothing and the
 * argument is never evaluated.  Change it to 0 to route traces to printf.
 * This file does not include <stdio.h> itself, so the tracing branch pulls
 * it in to guarantee that printf is declared.
 */
#if 1
#define DEBUG_GB18030(arg)
#else
#include <stdio.h>
#define DEBUG_GB18030(arg) printf arg
#endif
00038 
/*
 * Byte classes used by GB18030_MAP.  Each class says which positions of a
 * GB18030 character a given byte value may occupy.
 */
enum {
  C1 = 0, /* can only be a one-byte char */
  C2 = 1, /* one-byte char, or 2nd byte of a two-byte char */
  C4 = 2, /* one-byte char, or 2nd / 4th byte of a four-byte char */
  CM = 3  /* 1st byte of a two- or four-byte char, or 2nd byte of a
             two-byte char */
};
00045 
00046 static const char GB18030_MAP[] = {
00047   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
00048   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
00049   C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1,
00050   C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1,
00051   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
00052   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
00053   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2,
00054   C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1,
00055   C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00056   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00057   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00058   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00059   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00060   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00061   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM,
00062   CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1
00063 };
00064 
00065 static int
00066 gb18030_mbc_enc_len(const UChar* p)
00067 {
00068   if (GB18030_MAP[*p] != CM)
00069     return 1;
00070   p++;
00071   if (GB18030_MAP[*p] == C4)
00072     return 4;
00073   if (GB18030_MAP[*p] == C1)
00074     return 1; /* illegal sequence */
00075   return 2;
00076 }
00077 
00078 static OnigCodePoint
00079 gb18030_mbc_to_code(const UChar* p, const UChar* end)
00080 {
00081   return onigenc_mbn_mbc_to_code(ONIG_ENCODING_GB18030, p, end);
00082 }
00083 
00084 static int
00085 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf)
00086 {
00087   return onigenc_mb4_code_to_mbc(ONIG_ENCODING_GB18030, code, buf);
00088 }
00089 
00090 static int
00091 gb18030_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end,
00092                        UChar* lower)
00093 {
00094   return onigenc_mbn_mbc_to_normalize(ONIG_ENCODING_GB18030, flag,
00095                                       pp, end, lower);
00096 }
00097 
00098 static int
00099 gb18030_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
00100 {
00101   return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end);
00102 }
00103 
00104 static int
00105 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
00106 {
00107   return onigenc_mb4_is_code_ctype(ONIG_ENCODING_GB18030, code, ctype);
00108 }
00109 
/*
 * States of the backward scan done by gb18030_left_adjust_char_head().
 *
 * A state name spells out the byte classes seen so far, in memory order
 * (leftmost byte first):
 *   one_X      exactly one byte of class X
 *   odd / even parity of a repeated run
 *   CX         a byte of class C2, C4 or CM
 *   CMC4       the two-byte pattern "CM C4"
 *   C4CM       the two-byte pattern "C4 CM"
 */
enum state {
  S_START,                /* nothing examined yet */
  S_one_C2,               /* C2 */
  S_one_C4,               /* C4 */
  S_one_CM,               /* CM */

  S_odd_CM_one_CX,        /* odd run of CM, then one CX */
  S_even_CM_one_CX,       /* even run of CM, then one CX */

  /* CMC4 : pair of "CM C4" */
  S_one_CMC4,             /* CM C4 */
  S_odd_CMC4,             /* odd number (>1) of CMC4 pairs */
  S_one_C4_odd_CMC4,      /* C4, then odd number of CMC4 pairs */
  S_even_CMC4,            /* even number of CMC4 pairs */
  S_one_C4_even_CMC4,     /* C4, then even number of CMC4 pairs */

  S_odd_CM_odd_CMC4,      /* odd run of extra CM before odd CMC4 pairs */
  S_even_CM_odd_CMC4,     /* even run of extra CM before odd CMC4 pairs */

  S_odd_CM_even_CMC4,     /* odd run of extra CM before even CMC4 pairs */
  S_even_CM_even_CMC4,    /* even run of extra CM before even CMC4 pairs */

  /* C4CM : pair of "C4 CM" */
  S_odd_C4CM,             /* odd number of C4CM pairs */
  S_one_CM_odd_C4CM,      /* CM, then odd number of C4CM pairs */
  S_even_C4CM,            /* even number of C4CM pairs */
  S_one_CM_even_C4CM,     /* CM, then even number of C4CM pairs */

  S_even_CM_odd_C4CM,     /* even run of CM before odd C4CM pairs */
  S_odd_CM_odd_C4CM,      /* odd run of CM before odd C4CM pairs */
  S_even_CM_even_C4CM,    /* even run of CM before even C4CM pairs */
  S_odd_CM_even_C4CM      /* odd run of CM before even C4CM pairs */
};
00143 
00144 static UChar*
00145 gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
00146 {
00147   const UChar *p;
00148   enum state state = S_START;
00149 
00150   DEBUG_GB18030(("----------------\n"));
00151   for (p = s; p >= start; p--) {
00152     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
00153     switch (state) {
00154     case S_START:
00155       switch (GB18030_MAP[*p]) {
00156       case C1:
00157        return (UChar *)s;
00158       case C2:
00159        state = S_one_C2; /* C2 */
00160        break;
00161       case C4:
00162        state = S_one_C4; /* C4 */
00163        break;
00164       case CM:
00165        state = S_one_CM; /* CM */
00166        break;
00167       }
00168       break;
00169     case S_one_C2: /* C2 */
00170       switch (GB18030_MAP[*p]) {
00171       case C1:
00172       case C2:
00173       case C4:
00174        return (UChar *)s;
00175       case CM:
00176        state = S_odd_CM_one_CX; /* CM C2 */
00177        break;
00178       }
00179       break;
00180     case S_one_C4: /* C4 */
00181       switch (GB18030_MAP[*p]) {
00182       case C1:
00183       case C2:
00184       case C4:
00185        return (UChar *)s;
00186       case CM:
00187        state = S_one_CMC4;
00188        break;
00189       }
00190       break;
00191     case S_one_CM: /* CM */
00192       switch (GB18030_MAP[*p]) {
00193       case C1:
00194       case C2:
00195        return (UChar *)s;
00196       case C4:
00197        state = S_odd_C4CM;
00198        break;
00199       case CM:
00200        state = S_odd_CM_one_CX; /* CM CM */
00201        break;
00202       }
00203       break;
00204 
00205     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
00206       switch (GB18030_MAP[*p]) {
00207       case C1:
00208       case C2:
00209       case C4:
00210        return (UChar *)(s - 1);
00211       case CM:
00212        state = S_even_CM_one_CX;
00213        break;
00214       }
00215       break;
00216     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
00217       switch (GB18030_MAP[*p]) {
00218       case C1:
00219       case C2:
00220       case C4:
00221        return (UChar *)s;
00222       case CM:
00223        state = S_odd_CM_one_CX;
00224        break;
00225       }
00226       break;
00227 
00228     case S_one_CMC4: /* CM C4 */
00229       switch (GB18030_MAP[*p]) {
00230       case C1:
00231       case C2:
00232        return (UChar *)(s - 1);
00233       case C4:
00234        state = S_one_C4_odd_CMC4; /* C4 CM C4 */
00235        break;
00236       case CM:
00237        state = S_even_CM_one_CX; /* CM CM C4 */
00238        break;
00239       }
00240       break;
00241     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
00242       switch (GB18030_MAP[*p]) {
00243       case C1:
00244       case C2:
00245        return (UChar *)(s - 1);
00246       case C4:
00247        state = S_one_C4_odd_CMC4;
00248        break;
00249       case CM:
00250        state = S_odd_CM_odd_CMC4;
00251        break;
00252       }
00253       break;
00254     case S_one_C4_odd_CMC4: /* C4 CM C4 */
00255       switch (GB18030_MAP[*p]) {
00256       case C1:
00257       case C2:
00258       case C4:
00259        return (UChar *)(s - 1);
00260       case CM:
00261        state = S_even_CMC4; /* CM C4 CM C4 */
00262        break;
00263       }
00264       break;
00265     case S_even_CMC4: /* CM C4 CM C4 */
00266       switch (GB18030_MAP[*p]) {
00267       case C1:
00268       case C2:
00269        return (UChar *)(s - 3);
00270       case C4:
00271        state = S_one_C4_even_CMC4;
00272        break;
00273       case CM:
00274        state = S_odd_CM_even_CMC4;
00275        break;
00276       }
00277       break;
00278     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
00279       switch (GB18030_MAP[*p]) {
00280       case C1:
00281       case C2:
00282       case C4:
00283        return (UChar *)(s - 3);
00284       case CM:
00285        state = S_odd_CMC4;
00286        break;
00287       }
00288       break;
00289 
00290     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
00291       switch (GB18030_MAP[*p]) {
00292       case C1:
00293       case C2:
00294       case C4:
00295        return (UChar *)(s - 3);
00296       case CM:
00297        state = S_even_CM_odd_CMC4;
00298        break;
00299       }
00300       break;
00301     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
00302       switch (GB18030_MAP[*p]) {
00303       case C1:
00304       case C2:
00305       case C4:
00306        return (UChar *)(s - 1);
00307       case CM:
00308        state = S_odd_CM_odd_CMC4;
00309        break;
00310       }
00311       break;
00312 
00313     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
00314       switch (GB18030_MAP[*p]) {
00315       case C1:
00316       case C2:
00317       case C4:
00318        return (UChar *)(s - 1);
00319       case CM:
00320        state = S_even_CM_even_CMC4;
00321        break;
00322       }
00323       break;
00324     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
00325       switch (GB18030_MAP[*p]) {
00326       case C1:
00327       case C2:
00328       case C4:
00329        return (UChar *)(s - 3);
00330       case CM:
00331        state = S_odd_CM_even_CMC4;
00332        break;
00333       }
00334       break;
00335 
00336     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
00337       switch (GB18030_MAP[*p]) {
00338       case C1:
00339       case C2:
00340       case C4:
00341        return (UChar *)s;
00342       case CM:
00343        state = S_one_CM_odd_C4CM; /* CM C4 CM */
00344        break;
00345       }
00346       break;
00347     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
00348       switch (GB18030_MAP[*p]) {
00349       case C1:
00350       case C2:
00351        return (UChar *)(s - 2); /* |CM C4 CM */
00352       case C4:
00353        state = S_even_C4CM;
00354        break;
00355       case CM:
00356        state = S_even_CM_odd_C4CM;
00357        break;
00358       }
00359       break;
00360     case S_even_C4CM: /* C4 CM C4 CM */
00361       switch (GB18030_MAP[*p]) {
00362       case C1:
00363       case C2:
00364       case C4:
00365        return (UChar *)(s - 2);  /* C4|CM C4 CM */
00366       case CM:
00367        state = S_one_CM_even_C4CM;
00368        break;
00369       }
00370       break;
00371     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
00372       switch (GB18030_MAP[*p]) {
00373       case C1:
00374       case C2:
00375        return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
00376       case C4:
00377        state = S_odd_C4CM;
00378        break;
00379       case CM:
00380        state = S_even_CM_even_C4CM;
00381        break;
00382       }
00383       break;
00384 
00385     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
00386       switch (GB18030_MAP[*p]) {
00387       case C1:
00388       case C2:
00389       case C4:
00390        return (UChar *)(s - 0); /* |CM CM|C4|CM */
00391       case CM:
00392        state = S_odd_CM_odd_C4CM;
00393        break;
00394       }
00395       break;
00396     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
00397       switch (GB18030_MAP[*p]) {
00398       case C1:
00399       case C2:
00400       case C4:
00401        return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
00402       case CM:
00403        state = S_even_CM_odd_C4CM;
00404        break;
00405       }
00406       break;
00407 
00408     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
00409       switch (GB18030_MAP[*p]) {
00410       case C1:
00411       case C2:
00412       case C4:
00413        return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
00414       case CM:
00415        state = S_odd_CM_even_C4CM;
00416        break;
00417       }
00418       break;
00419     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
00420       switch (GB18030_MAP[*p]) {
00421       case C1:
00422       case C2:
00423       case C4:
00424        return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
00425       case CM:
00426        state = S_even_CM_even_C4CM;
00427        break;
00428       }
00429       break;
00430     }
00431   }
00432 
00433   DEBUG_GB18030(("state %d\n", state));
00434   switch (state) {
00435   case S_START:             return (UChar *)(s - 0);
00436   case S_one_C2:            return (UChar *)(s - 0);
00437   case S_one_C4:            return (UChar *)(s - 0);
00438   case S_one_CM:            return (UChar *)(s - 0);
00439 
00440   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
00441   case S_even_CM_one_CX:    return (UChar *)(s - 0);
00442 
00443   case S_one_CMC4:          return (UChar *)(s - 1);
00444   case S_odd_CMC4:          return (UChar *)(s - 1);
00445   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
00446   case S_even_CMC4:         return (UChar *)(s - 3);
00447   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
00448 
00449   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
00450   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
00451 
00452   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
00453   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
00454 
00455   case S_odd_C4CM:          return (UChar *)(s - 0);
00456   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
00457   case S_even_C4CM:         return (UChar *)(s - 2);
00458   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
00459 
00460   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
00461   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
00462   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
00463   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
00464   }
00465 
00466   return (UChar* )s;  /* never come here. (escape warning) */
00467 }
00468 
00469 static int
00470 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end)
00471 {
00472   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
00473 }
00474 
00475 OnigEncodingType OnigEncodingGB18030 = {
00476   gb18030_mbc_enc_len,
00477   "GB18030",   /* name */
00478   4,          /* max enc length */
00479   1,          /* min enc length */
00480   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
00481   {
00482       (OnigCodePoint )'\\'                       /* esc */
00483     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
00484     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
00485     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
00486     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
00487     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
00488   },
00489   onigenc_is_mbc_newline_0x0a,
00490   gb18030_mbc_to_code,
00491   onigenc_mb4_code_to_mbclen,
00492   gb18030_code_to_mbc,
00493   gb18030_mbc_to_normalize,
00494   gb18030_is_mbc_ambiguous,
00495   onigenc_ascii_get_all_pair_ambig_codes,
00496   onigenc_nothing_get_all_comp_ambig_codes,
00497   gb18030_is_code_ctype,
00498   onigenc_not_support_get_ctype_code_range,
00499   gb18030_left_adjust_char_head,
00500   gb18030_is_allowed_reverse_match
00501 };