Back to index

php5  5.3.10
php_unicode.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                        |
00004    +----------------------------------------------------------------------+
00005    | Copyright (c) 1997-2012 The PHP Group                                |
00006    +----------------------------------------------------------------------+
00007    | This source file is subject to version 3.01 of the PHP license,      |
00008    | that is bundled with this package in the file LICENSE, and is        |
00009    | available through the world-wide-web at the following url:           |
00010    | http://www.php.net/license/3_01.txt                                  |
00011    | If you did not receive a copy of the PHP license and are unable to   |
00012    | obtain it through the world-wide-web, please send a note to          |
00013    | license@php.net so we can mail you a copy immediately.               |
00014    +----------------------------------------------------------------------+
00015    | Author: Wez Furlong (wez@thebrainroom.com)                           |
00016    +----------------------------------------------------------------------+
00017 
00018        Based on code from ucdata-2.5, which has the following Copyright:
00019    
00020        Copyright 2001 Computing Research Labs, New Mexico State University
00021  
00022        Permission is hereby granted, free of charge, to any person obtaining a
00023        copy of this software and associated documentation files (the "Software"),
00024        to deal in the Software without restriction, including without limitation
00025        the rights to use, copy, modify, merge, publish, distribute, sublicense,
00026        and/or sell copies of the Software, and to permit persons to whom the
00027        Software is furnished to do so, subject to the following conditions:
00028  
00029        The above copyright notice and this permission notice shall be included in
00030        all copies or substantial portions of the Software.
00031 */
00032 
00033 #ifdef HAVE_CONFIG_H
00034 #include "config.h"
00035 #endif
00036 
00037 #include "php.h"
00038 #include "php_ini.h"
00039 
00040 #if HAVE_MBSTRING
00041 
00042 /* include case folding data generated from the official UnicodeData.txt file */
00043 #include "mbstring.h"
00044 #include "php_unicode.h"
00045 #include "unicode_data.h"
00046 
00047 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
00048 
/*
 * Single-bit masks for lookup: masks32[i] has only bit i set (i = 0..31).
 * The table is read-only, so it is declared const to keep it out of
 * writable storage and to let the compiler reject accidental writes.
 */
static const unsigned long masks32[32] = {
	0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
	0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
	0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
	0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
	0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
	0x40000000, 0x80000000
};
00060 
00061 
00062 static int prop_lookup(unsigned long code, unsigned long n)
00063 {
00064        long l, r, m;
00065 
00066        /*
00067         * There is an extra node on the end of the offsets to allow this routine
00068         * to work right.  If the index is 0xffff, then there are no nodes for the
00069         * property.
00070         */
00071        if ((l = _ucprop_offsets[n]) == 0xffff)
00072               return 0;
00073 
00074        /*
00075         * Locate the next offset that is not 0xffff.  The sentinel at the end of
00076         * the array is the max index value.
00077         */
00078        for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
00079               ;
00080 
00081        r = _ucprop_offsets[n + m] - 1;
00082 
00083        while (l <= r) {
00084               /*
00085                * Determine a "mid" point and adjust to make sure the mid point is at
00086                * the beginning of a range pair.
00087                */
00088               m = (l + r) >> 1;
00089               m -= (m & 1);
00090               if (code > _ucprop_ranges[m + 1])
00091                      l = m + 2;
00092               else if (code < _ucprop_ranges[m])
00093                      r = m - 2;
00094               else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
00095                      return 1;
00096        }
00097        return 0;
00098 
00099 }
00100 
00101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
00102               unsigned long mask2)
00103 {
00104        unsigned long i;
00105 
00106        if (mask1 == 0 && mask2 == 0)
00107               return 0;
00108 
00109        for (i = 0; mask1 && i < 32; i++) {
00110               if ((mask1 & masks32[i]) && prop_lookup(code, i))
00111                      return 1;
00112        }
00113 
00114        for (i = 32; mask2 && i < _ucprop_size; i++) {
00115               if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
00116                      return 1;
00117        }
00118 
00119        return 0;
00120 }
00121 
00122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
00123 {
00124        long m;
00125 
00126        /*
00127         * Do the binary search.
00128         */
00129        while (l <= r) {
00130               /*
00131                * Determine a "mid" point and adjust to make sure the mid point is at
00132                * the beginning of a case mapping triple.
00133                */
00134               m = (l + r) >> 1;
00135               m -= (m % 3);
00136               if (code > _uccase_map[m])
00137                      l = m + 3;
00138               else if (code < _uccase_map[m])
00139                      r = m - 3;
00140               else if (code == _uccase_map[m])
00141                      return _uccase_map[m + field];
00142        }
00143 
00144        return code;
00145 }
00146 
00147 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
00148 {
00149        if (code == 0x0069L) {
00150               return 0x0130L;
00151        }
00152        return case_lookup(code, l, r, field);
00153 }
00154 
00155 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
00156 {
00157        if (code == 0x0049L) {
00158               return 0x0131L;
00159        }      
00160        return case_lookup(code, l, r, field);
00161 }
00162 
00163 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
00164 {
00165        int field;
00166        long l, r;
00167 
00168        if (php_unicode_is_upper(code))
00169               return code;
00170 
00171        if (php_unicode_is_lower(code)) {
00172               /*
00173                * The character is lower case.
00174                */
00175               field = 2;
00176               l = _uccase_len[0];
00177               r = (l + _uccase_len[1]) - 3;
00178 
00179               if (enc == mbfl_no_encoding_8859_9) {
00180                      return php_turkish_toupper(code, l, r, field);
00181               }
00182 
00183        } else {
00184               /*
00185                * The character is title case.
00186                */
00187               field = 1;
00188               l = _uccase_len[0] + _uccase_len[1];
00189               r = _uccase_size - 3;
00190        }
00191        return case_lookup(code, l, r, field);
00192 }
00193 
00194 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
00195 {
00196        int field;
00197        long l, r;
00198 
00199        if (php_unicode_is_lower(code))
00200               return code;
00201 
00202        if (php_unicode_is_upper(code)) {
00203               /*
00204                * The character is upper case.
00205                */
00206               field = 1;
00207               l = 0;
00208               r = _uccase_len[0] - 3;
00209 
00210               if (enc == mbfl_no_encoding_8859_9) {
00211                      return php_turkish_tolower(code, l, r, field);
00212               }
00213 
00214        } else {
00215               /*
00216                * The character is title case.
00217                */
00218               field = 2;
00219               l = _uccase_len[0] + _uccase_len[1];
00220               r = _uccase_size - 3;
00221        }
00222        return case_lookup(code, l, r, field);
00223 }
00224 
00225 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
00226 {
00227        int field;
00228        long l, r;
00229 
00230        if (php_unicode_is_title(code))
00231               return code;
00232 
00233        /*
00234         * The offset will always be the same for converting to title case.
00235         */
00236        field = 2;
00237 
00238        if (php_unicode_is_upper(code)) {
00239               /*
00240                * The character is upper case.
00241                */
00242               l = 0;
00243               r = _uccase_len[0] - 3;
00244        } else {
00245               /*
00246                * The character is lower case.
00247                */
00248               l = _uccase_len[0];
00249               r = (l + _uccase_len[1]) - 3;
00250        }
00251        return case_lookup(code, l, r, field);
00252 
00253 }
00254 
00255 
/*
 * Assemble the four bytes at ptr, most significant byte first, into one
 * unsigned 32-bit value.
 *
 * Each byte is converted to unsigned int before it is shifted.  A bare
 * unsigned char promotes to signed int, so shifting a byte >= 0x80 left by
 * 24 would overflow into the sign bit (undefined behaviour), and the
 * negative result would sign-extend when widened to unsigned long.
 *
 * Note: ptr is evaluated four times; do not pass an expression with side
 * effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((unsigned char*)(ptr))[0] << 24) |\
	((unsigned int)((unsigned char*)(ptr))[1] << 16) |\
	((unsigned int)((unsigned char*)(ptr))[2] <<  8) |\
	((unsigned int)((unsigned char*)(ptr))[3]) )
00261 
/*
 * Store the low 32 bits of val into the four bytes at ptr, most significant
 * byte first.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * (safe in an unbraced if/else).  val and ptr are each evaluated exactly
 * once, and the temporaries use names unlikely to clash with identifiers
 * appearing in the arguments.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned int be_ary_val_ = (val); \
	unsigned char *be_ary_dst_ = (unsigned char*)(ptr); \
	be_ary_dst_[0] = (be_ary_val_>>24) & 0xff; \
	be_ary_dst_[1] = (be_ary_val_>>16) & 0xff; \
	be_ary_dst_[2] = (be_ary_val_>> 8) & 0xff; \
	be_ary_dst_[3] = (be_ary_val_    ) & 0xff; \
} while (0)
00269 
00270 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
00271               const char *src_encoding TSRMLS_DC)
00272 {
00273        char *unicode, *newstr;
00274        size_t unicode_len;
00275        unsigned char *unicode_ptr;
00276        size_t i;
00277        enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
00278 
00279        if (_src_encoding == mbfl_no_encoding_invalid) {
00280               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
00281               return NULL;
00282        }      
00283 
00284        unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
00285        if (unicode == NULL)
00286               return NULL;
00287        
00288        unicode_ptr = (unsigned char *)unicode;
00289 
00290        switch(case_mode) {
00291               case PHP_UNICODE_CASE_UPPER:
00292                      for (i = 0; i < unicode_len; i+=4) {
00293                             UINT32_TO_BE_ARY(&unicode_ptr[i],
00294                                    php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
00295                      }
00296                      break;
00297 
00298               case PHP_UNICODE_CASE_LOWER:
00299                      for (i = 0; i < unicode_len; i+=4) {
00300                             UINT32_TO_BE_ARY(&unicode_ptr[i],
00301                                    php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
00302                      }
00303                      break;
00304 
00305               case PHP_UNICODE_CASE_TITLE: {
00306                      int mode = 0; 
00307 
00308                      for (i = 0; i < unicode_len; i+=4) {
00309                             int res = php_unicode_is_prop(
00310                                    BE_ARY_TO_UINT32(&unicode_ptr[i]),
00311                                    UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
00312                             if (mode) {
00313                                    if (res) {
00314                                           UINT32_TO_BE_ARY(&unicode_ptr[i],
00315                                                  php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
00316                                    } else {
00317                                           mode = 0;
00318                                    }      
00319                             } else {
00320                                    if (res) {
00321                                           mode = 1;
00322                                           UINT32_TO_BE_ARY(&unicode_ptr[i],
00323                                                  php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
00324                                    }
00325                             }
00326                      }
00327               } break;
00328 
00329        }
00330        
00331        newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
00332        efree(unicode);
00333 
00334        return newstr;
00335 }
00336 
00337 
00338 #endif /* HAVE_MBSTRING */
00339 
00340 /*
00341  * Local variables:
00342  * tab-width: 4
00343  * c-basic-offset: 4
00344  * End:
00345  * vim600: sw=4 ts=4 fdm=marker
00346  * vim<600: sw=4 ts=4
00347  */