Back to index

php5  5.3.10
Defines | Functions | Variables
utf8.c File Reference
#include "regenc.h"

Go to the source code of this file.

Defines

#define USE_INVALID_CODE_SCHEME
#define INVALID_CODE_FE   0xfffffffe
#define INVALID_CODE_FF   0xffffffff
#define VALID_CODE_LIMIT   0x7fffffff
#define utf8_islead(c)   ((UChar )((c) & 0xc0) != 0x80)
#define UTF8_TRAILS(code, shift)   (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
#define UTF8_TRAIL0(code)   (UChar )(((code) & 0x3f) | 0x80)
#define CR_SET(sbl, mbl)
#define CR_SB_SET(sbl)

Functions

static int utf8_mbc_enc_len (const UChar *p)
static int utf8_is_mbc_newline (const UChar *p, const UChar *end)
static OnigCodePoint utf8_mbc_to_code (const UChar *p, const UChar *end)
static int utf8_code_to_mbclen (OnigCodePoint code)
static int utf8_code_to_mbc (OnigCodePoint code, UChar *buf)
static int utf8_mbc_to_normalize (OnigAmbigType flag, const UChar **pp, const UChar *end, UChar *lower)
static int utf8_is_mbc_ambiguous (OnigAmbigType flag, const UChar **pp, const UChar *end)
static int utf8_get_ctype_code_range (int ctype, const OnigCodePoint *sbr[], const OnigCodePoint *mbr[])
static int utf8_is_code_ctype (OnigCodePoint code, unsigned int ctype)
static UChar * utf8_left_adjust_char_head (const UChar *start, const UChar *s)

Variables

static const int EncLen_UTF8 []
static const OnigCodePoint EmptyRange [] = { 0 }
static const OnigCodePoint SBAlnum []
static const OnigCodePoint MBAlnum []
static const OnigCodePoint SBAlpha []
static const OnigCodePoint MBAlpha []
static const OnigCodePoint SBBlank []
static const OnigCodePoint MBBlank []
static const OnigCodePoint SBCntrl []
static const OnigCodePoint MBCntrl []
static const OnigCodePoint SBDigit []
static const OnigCodePoint MBDigit []
static const OnigCodePoint SBGraph []
static const OnigCodePoint MBGraph []
static const OnigCodePoint SBLower []
static const OnigCodePoint MBLower []
static const OnigCodePoint SBPrint []
static const OnigCodePoint MBPrint []
static const OnigCodePoint SBPunct []
static const OnigCodePoint MBPunct []
static const OnigCodePoint SBSpace []
static const OnigCodePoint MBSpace []
static const OnigCodePoint SBUpper []
static const OnigCodePoint MBUpper []
static const OnigCodePoint SBXDigit []
static const OnigCodePoint SBASCII []
static const OnigCodePoint SBWord []
static const OnigCodePoint MBWord []
OnigEncodingType OnigEncodingUTF8

Define Documentation

#define CR_SB_SET (   sbl)
Value:
do { \
  *sbr = sbl; \
  *mbr = EmptyRange; \
} while (0)
#define CR_SET (   sbl,
  mbl 
)
Value:
do { \
  *sbr = sbl; \
  *mbr = mbl; \
} while (0)
#define INVALID_CODE_FE   0xfffffffe

Definition at line 36 of file utf8.c.

#define INVALID_CODE_FF   0xffffffff

Definition at line 37 of file utf8.c.

#define USE_INVALID_CODE_SCHEME

Definition at line 32 of file utf8.c.

#define utf8_islead (   c)    ((UChar )((c) & 0xc0) != 0x80)

Definition at line 41 of file utf8.c.

#define UTF8_TRAIL0 (   code)    (UChar )(((code) & 0x3f) | 0x80)
#define UTF8_TRAILS (   code,
  shift 
)    (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
#define VALID_CODE_LIMIT   0x7fffffff

Definition at line 38 of file utf8.c.


Function Documentation

static int utf8_code_to_mbc ( OnigCodePoint  code,
UChar *  buf 
) [static]

Definition at line 164 of file utf8.c.

{
#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
#define UTF8_TRAIL0(code)        (UChar )(((code) & 0x3f) | 0x80)

  /*
   * Encode `code` into `buf` and return the number of bytes written, or
   * ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE when the value has no encoding.
   */
  UChar *out = buf;
  int shift;  /* bit offset of the first continuation byte; 0 means none
                 besides the final low-six-bits byte */

  /* Values that fit in seven bits are stored verbatim in one byte. */
  if ((code & 0xffffff80) == 0) {
    *out = (UChar )code;
    return 1;
  }

  /* Write the lead byte and note how many middle bytes must follow it. */
  if ((code & 0xfffff800) == 0) {
    *out++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
    shift = 0;
  }
  else if ((code & 0xffff0000) == 0) {
    *out++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
    shift = 6;
  }
  else if ((code & 0xffe00000) == 0) {
    *out++ = (UChar )(((code>>18) & 0x07) | 0xf0);
    shift = 12;
  }
  else if ((code & 0xfc000000) == 0) {
    *out++ = (UChar )(((code>>24) & 0x03) | 0xf8);
    shift = 18;
  }
  else if ((code & 0x80000000) == 0) {
    *out++ = (UChar )(((code>>30) & 0x01) | 0xfc);
    shift = 24;
  }
#ifdef USE_INVALID_CODE_SCHEME
  /* The two sentinel codes are written back as the single raw byte
     0xfe or 0xff. */
  else if (code == INVALID_CODE_FE) {
    *out = 0xfe;
    return 1;
  }
  else if (code == INVALID_CODE_FF) {
    *out = 0xff;
    return 1;
  }
#endif
  else {
    return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE;
  }

  /* Middle continuation bytes, highest six-bit group first. */
  for (; shift > 0; shift -= 6) {
    *out++ = UTF8_TRAILS(code, shift);
  }

  /* The last byte always carries the low six bits. */
  *out++ = UTF8_TRAIL0(code);
  return out - buf;
}
static int utf8_code_to_mbclen ( OnigCodePoint  code) [static]

Definition at line 119 of file utf8.c.

{
  /*
   * Return the number of bytes needed to encode `code`, or
   * ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE when the value cannot be encoded.
   *
   * The thresholds mirror the ones utf8_code_to_mbc() uses when it writes
   * the bytes, so the length reported here matches what that function
   * emits.  In particular the code points 0xfe and 0xff are ordinary
   * two-byte values; only the dedicated sentinels INVALID_CODE_FE and
   * INVALID_CODE_FF (below) stand for a single raw 0xfe / 0xff byte.
   */
  if      ((code & 0xffffff80) == 0) return 1;
  else if ((code & 0xfffff800) == 0) return 2;
  else if ((code & 0xffff0000) == 0) return 3;
  else if ((code & 0xffe00000) == 0) return 4;
  else if ((code & 0xfc000000) == 0) return 5;
  else if ((code & 0x80000000) == 0) return 6;
#ifdef USE_INVALID_CODE_SCHEME
  else if (code == INVALID_CODE_FE) return 1;
  else if (code == INVALID_CODE_FF) return 1;
#endif
  else
    return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE;
}
static int utf8_get_ctype_code_range ( int  ctype,
const OnigCodePoint *  sbr[],
const OnigCodePoint *  mbr[] 
) [static]

Definition at line 3546 of file utf8.c.

{
#define CR_SET(sbl,mbl) do { \
  *sbr = sbl; \
  *mbr = mbl; \
} while (0)

#define CR_SB_SET(sbl) do { \
  *sbr = sbl; \
  *mbr = EmptyRange; \
} while (0)

  /*
   * Store the single-byte and multi-byte range tables for `ctype` through
   * `sbr` and `mbr`.  Returns 0 on success; for an unrecognised `ctype`
   * returns ONIGENCERR_TYPE_BUG and leaves both outputs untouched.
   * XDIGIT and ASCII have no multi-byte table and get EmptyRange instead.
   */
  switch (ctype) {
  case ONIGENC_CTYPE_ALPHA:   CR_SET(SBAlpha, MBAlpha);   break;
  case ONIGENC_CTYPE_BLANK:   CR_SET(SBBlank, MBBlank);   break;
  case ONIGENC_CTYPE_CNTRL:   CR_SET(SBCntrl, MBCntrl);   break;
  case ONIGENC_CTYPE_DIGIT:   CR_SET(SBDigit, MBDigit);   break;
  case ONIGENC_CTYPE_GRAPH:   CR_SET(SBGraph, MBGraph);   break;
  case ONIGENC_CTYPE_LOWER:   CR_SET(SBLower, MBLower);   break;
  case ONIGENC_CTYPE_PRINT:   CR_SET(SBPrint, MBPrint);   break;
  case ONIGENC_CTYPE_PUNCT:   CR_SET(SBPunct, MBPunct);   break;
  case ONIGENC_CTYPE_SPACE:   CR_SET(SBSpace, MBSpace);   break;
  case ONIGENC_CTYPE_UPPER:   CR_SET(SBUpper, MBUpper);   break;
  case ONIGENC_CTYPE_XDIGIT:  CR_SB_SET(SBXDigit);        break;
  case ONIGENC_CTYPE_WORD:    CR_SET(SBWord, MBWord);     break;
  case ONIGENC_CTYPE_ASCII:   CR_SB_SET(SBASCII);         break;
  case ONIGENC_CTYPE_ALNUM:   CR_SET(SBAlnum, MBAlnum);   break;

  default:
    return ONIGENCERR_TYPE_BUG;
  }

  return 0;
}
static int utf8_is_code_ctype ( OnigCodePoint  code,
unsigned int  ctype 
) [static]

Definition at line 3612 of file utf8.c.

{
  /*
   * Test whether `code` belongs to character class `ctype`.
   * Code points below 256 are delegated to the ISO-8859-1 ctype macro;
   * higher ones are handled according to USE_UNICODE_FULL_RANGE_CTYPE.
   */
#ifdef USE_UNICODE_FULL_RANGE_CTYPE
  const OnigCodePoint *table;
#endif

  if (code < 256) {
    return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype);
  }

#ifdef USE_UNICODE_FULL_RANGE_CTYPE

  /* Pick the multi-byte range table for the class, then search it. */
  switch (ctype) {
  case ONIGENC_CTYPE_ALPHA:  table = MBAlpha;  break;
  case ONIGENC_CTYPE_BLANK:  table = MBBlank;  break;
  case ONIGENC_CTYPE_CNTRL:  table = MBCntrl;  break;
  case ONIGENC_CTYPE_DIGIT:  table = MBDigit;  break;
  case ONIGENC_CTYPE_GRAPH:  table = MBGraph;  break;
  case ONIGENC_CTYPE_LOWER:  table = MBLower;  break;
  case ONIGENC_CTYPE_PRINT:  table = MBPrint;  break;
  case ONIGENC_CTYPE_PUNCT:  table = MBPunct;  break;
  case ONIGENC_CTYPE_SPACE:  table = MBSpace;  break;
  case ONIGENC_CTYPE_UPPER:  table = MBUpper;  break;
  case ONIGENC_CTYPE_WORD:   table = MBWord;   break;
  case ONIGENC_CTYPE_ALNUM:  table = MBAlnum;  break;

  /* These classes never match a code point of 256 or above. */
  case ONIGENC_CTYPE_XDIGIT:
  case ONIGENC_CTYPE_ASCII:
  case ONIGENC_CTYPE_NEWLINE:
    return FALSE;

  default:
    return ONIGENCERR_TYPE_BUG;
  }

  return onig_is_in_code_range((UChar* )table, code);

#else

  /* Without the full tables, a code point of 256 or above matches only
     when the WORD bit is set in ctype (and, under the invalid-code
     scheme, only up to VALID_CODE_LIMIT). */
#ifdef USE_INVALID_CODE_SCHEME
  if ((ctype & ONIGENC_CTYPE_WORD) != 0 && code <= VALID_CODE_LIMIT)
    return TRUE;
#else
  if ((ctype & ONIGENC_CTYPE_WORD) != 0)
    return TRUE;
#endif

  return FALSE;

#endif /* USE_UNICODE_FULL_RANGE_CTYPE */
}

Here is the call graph for this function:

static int utf8_is_mbc_ambiguous ( OnigAmbigType  flag,
const UChar **  pp,
const UChar *  end 
) [static]

Definition at line 266 of file utf8.c.

{
  /*
   * Report whether the character at *pp is case-ambiguous under `flag`,
   * and advance *pp past that character.
   *
   * ASCII bytes are judged by ONIGENC_IS_ASCII_CODE_CASE_AMBIG when the
   * ASCII-case flag is set.  Among multi-byte characters only two-byte
   * sequences led by 0xc3 (code points 0xc0..0xff) are examined, and only
   * when the non-ASCII-case flag is set.
   */
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
      return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
    }
  }
  else {
    (*pp) += enc_len(ONIG_ENCODING_UTF8, p);

    /* The second byte is only inspected when it lies inside [p, end);
       a 0xc3 lead byte at the very end of the buffer is treated as
       not ambiguous instead of reading past `end`. */
    if (*p == 195 && p + 1 < end) { /* 195 == '\303' */
      int c = *(p + 1);
      if (c >= 128) {
        if ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) {
          if (c <= (UChar )'\236') { /* upper */
            /* 0xc3 0x97 is code point 0xd7, excluded from the range */
            if (c == (UChar )'\227') return FALSE;
            return TRUE;
          }
          else if (c >= (UChar )'\240' && c <= (UChar )'\276') { /* lower */
            /* 0xc3 0xb7 is code point 0xf7, excluded from the range */
            if (c == (UChar )'\267') return FALSE;
            return TRUE;
          }
        }
      }
    }
  }

  return FALSE;
}
static int utf8_is_mbc_newline ( const UChar *  p,
const UChar *  end 
) [static]

Definition at line 69 of file utf8.c.

{
  /*
   * Return 1 when a line terminator starts at `p` inside [p, end), else 0.
   * Only 0x0a counts unless USE_UNICODE_ALL_LINE_TERMINATORS is defined,
   * in which case 0x0d and the sequences C2 85, E2 80 A8 and E2 80 A9
   * are accepted as well.
   */
  if (p >= end) return 0;

  if (p[0] == 0x0a) return 1;

#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
  if (p[0] == 0x0d) return 1;

  /* Two-byte form: needs at least two bytes before end. */
  if (end - p >= 2 && p[0] == 0xc2 && p[1] == 0x85) /* U+0085 */
    return 1;

  /* Three-byte forms: need at least three bytes before end. */
  if (end - p >= 3 && p[0] == 0xe2 && p[1] == 0x80
      && (p[2] == 0xa8 || p[2] == 0xa9))  /* U+2028, U+2029 */
    return 1;
#endif

  return 0;
}
static UChar* utf8_left_adjust_char_head ( const UChar *  start,
const UChar *  s 
) [static]

Definition at line 3692 of file utf8.c.

{
  /*
   * Starting at `s`, step backwards over bytes of the form 10xxxxxx
   * (those for which utf8_islead() is false) and return the first byte
   * that is not one, never going below `start`.  When `s` is at or
   * before `start` it is returned unchanged.
   */
  const UChar *cursor;

  if (s <= start) return (UChar* )s;

  for (cursor = s; cursor > start; cursor--) {
    if (utf8_islead(*cursor)) break;
  }
  return (UChar* )cursor;
}
static int utf8_mbc_enc_len ( const UChar *  p) [static]

Definition at line 63 of file utf8.c.

{
  /* The byte length of a character is looked up in EncLen_UTF8,
     indexed by the character's first byte. */
  const UChar first = *p;

  return EncLen_UTF8[first];
}
static OnigCodePoint utf8_mbc_to_code ( const UChar *  p,
const UChar *  end 
) [static]

Definition at line 92 of file utf8.c.

{
  /*
   * Decode the character starting at `p` into a code point.  The number
   * of bytes consumed comes from enc_len(); `end` is not consulted.
   */
  int lead, trailing;
  OnigCodePoint value;

  trailing = enc_len(ONIG_ENCODING_UTF8, p);
  trailing--;                 /* bytes that follow the first one */
  lead = *p++;

  if (trailing <= 0) {
    /* One-byte character: the byte is the code point, except that under
       the invalid-code scheme 0xfe / 0xff map to their sentinel codes. */
#ifdef USE_INVALID_CODE_SCHEME
    if (lead > 0xfd) {
      return ((lead == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
    }
#endif
    return (OnigCodePoint )lead;
  }

  /* Keep the low (6 - trailing) bits of the first byte, then append six
     bits from every following byte. */
  value = lead & ((1 << (6 - trailing)) - 1);
  for (; trailing > 0; trailing--) {
    lead = *p++;
    value = (value << 6) | (lead & ((1 << 6) - 1));
  }
  return value;
}
static int utf8_mbc_to_normalize ( OnigAmbigType  flag,
const UChar **  pp,
const UChar *  end,
UChar *  lower 
) [static]

Definition at line 221 of file utf8.c.

{
  /*
   * Write the normalized (case-folded where `flag` asks for it) form of
   * the character at *pp into `lower`, advance *pp past the character and
   * return the number of bytes written.  `end` is not consulted.
   */
  const UChar* src = *pp;
  int nbytes, k;

  /* ASCII: fold through the lower-case macro only when requested. */
  if (ONIGENC_IS_MBC_ASCII(src)) {
    if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0)
      *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*src);
    else
      *lower = *src;
    (*pp)++;
    return 1; /* return byte length of converted char to lower */
  }

  /* Two-byte sequences led by 0xc3 whose second byte is 0x80..0x9e,
     other than 0x97, are folded by adding 32 to the second byte. */
  if (*src == 195) { /* 195 == '\303' */
    int second = *(src + 1);
    if (second >= 128 && second <= (UChar )'\236' /* upper */
        && (flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0
        && second != (UChar )'\227') {
      lower[0] = *src;
      lower[1] = (UChar )(second + 32);
      (*pp) += 2;
      return 2;
    }
  }

  /* Everything else is copied through unchanged; the copy is skipped
     when `lower` already points at the source bytes. */
  nbytes = enc_len(ONIG_ENCODING_UTF8, src);
  if (lower != src) {
    for (k = 0; k < nbytes; k++) {
      lower[k] = src[k];
    }
  }
  (*pp) += nbytes;
  return nbytes; /* return byte length of converted char to lower */
}

Variable Documentation

const OnigCodePoint EmptyRange[] = { 0 } [static]

Definition at line 300 of file utf8.c.

const int EncLen_UTF8[] [static]
Initial value:
 {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
}

Definition at line 43 of file utf8.c.

const OnigCodePoint MBAlnum[] [static]

Definition at line 309 of file utf8.c.

const OnigCodePoint MBAlpha[] [static]

Definition at line 737 of file utf8.c.

const OnigCodePoint MBBlank[] [static]
Initial value:
 {



  1,

  0x00a0, 0x00a0








 
}

Definition at line 1148 of file utf8.c.

const OnigCodePoint MBCntrl[] [static]
Initial value:
 {



  2,

  0x0080, 0x009f,
  0x00ad, 0x00ad


















 
}

Definition at line 1172 of file utf8.c.

const OnigCodePoint MBDigit[] [static]

Definition at line 1206 of file utf8.c.

const OnigCodePoint MBGraph[] [static]

Definition at line 1243 of file utf8.c.

const OnigCodePoint MBLower[] [static]

Definition at line 1663 of file utf8.c.

const OnigCodePoint MBPrint[] [static]

Definition at line 2103 of file utf8.c.

const OnigCodePoint MBPunct[] [static]

Definition at line 2530 of file utf8.c.

const OnigCodePoint MBSpace[] [static]
Initial value:
 {



  2,

  0x0085, 0x0085,
  0x00a0, 0x00a0









 
}

Definition at line 2624 of file utf8.c.

const OnigCodePoint MBUpper[] [static]

Definition at line 2649 of file utf8.c.

const OnigCodePoint MBWord[] [static]

Definition at line 3100 of file utf8.c.

const OnigCodePoint SBAlnum[] [static]
Initial value:
 {
  3,
  0x0030, 0x0039,
  0x0041, 0x005a,
  0x0061, 0x007a
}

Definition at line 302 of file utf8.c.

const OnigCodePoint SBAlpha[] [static]
Initial value:
 {
  2,
  0x0041, 0x005a,
  0x0061, 0x007a
}

Definition at line 731 of file utf8.c.

const OnigCodePoint SBASCII[] [static]
Initial value:
 {
  1,
  0x0000, 0x007f
}

Definition at line 3087 of file utf8.c.

const OnigCodePoint SBBlank[] [static]
Initial value:
 {
  2,
  0x0009, 0x0009,
  0x0020, 0x0020
}

Definition at line 1142 of file utf8.c.

const OnigCodePoint SBCntrl[] [static]
Initial value:
 {
  2,
  0x0000, 0x001f,
  0x007f, 0x007f
}

Definition at line 1166 of file utf8.c.

const OnigCodePoint SBDigit[] [static]
Initial value:
 {
  1,
  0x0030, 0x0039
}

Definition at line 1201 of file utf8.c.

const OnigCodePoint SBGraph[] [static]
Initial value:
 {
  1,
  0x0021, 0x007e
}

Definition at line 1238 of file utf8.c.

const OnigCodePoint SBLower[] [static]
Initial value:
 {
  1,
  0x0061, 0x007a
}

Definition at line 1658 of file utf8.c.

const OnigCodePoint SBPrint[] [static]
Initial value:
 {
  2,
  0x0009, 0x000d,
  0x0020, 0x007e
}

Definition at line 2097 of file utf8.c.

const OnigCodePoint SBPunct[] [static]
Initial value:
 {
  9,
  0x0021, 0x0023,
  0x0025, 0x002a,
  0x002c, 0x002f,
  0x003a, 0x003b,
  0x003f, 0x0040,
  0x005b, 0x005d,
  0x005f, 0x005f,
  0x007b, 0x007b,
  0x007d, 0x007d
}

Definition at line 2517 of file utf8.c.

const OnigCodePoint SBSpace[] [static]
Initial value:
 {
  2,
  0x0009, 0x000d,
  0x0020, 0x0020
}

Definition at line 2618 of file utf8.c.

const OnigCodePoint SBUpper[] [static]
Initial value:
 {
  1,
  0x0041, 0x005a
}

Definition at line 2644 of file utf8.c.

const OnigCodePoint SBWord[] [static]
Initial value:
 {
  4,
  0x0030, 0x0039,
  0x0041, 0x005a,
  0x005f, 0x005f,
  0x0061, 0x007a
}

Definition at line 3092 of file utf8.c.

const OnigCodePoint SBXDigit[] [static]
Initial value:
 {
  3,
  0x0030, 0x0039,
  0x0041, 0x0046,
  0x0061, 0x0066
}

Definition at line 3080 of file utf8.c.