Back to index

php5  5.3.10
cyr_convert.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                        |
00004    +----------------------------------------------------------------------+
00005    | Copyright (c) 1997-2012 The PHP Group                                |
00006    +----------------------------------------------------------------------+
00007    | This source file is subject to version 3.01 of the PHP license,      |
00008    | that is bundled with this package in the file LICENSE, and is        |
00009    | available through the world-wide-web at the following url:           |
00010    | http://www.php.net/license/3_01.txt                                  |
00011    | If you did not receive a copy of the PHP license and are unable to   |
00012    | obtain it through the world-wide-web, please send a note to          |
00013    | license@php.net so we can mail you a copy immediately.               |
00014    +----------------------------------------------------------------------+
00015    | Author: Kirill Maximov <kir@rus.net>                                 |
00016    +----------------------------------------------------------------------+
00017  */
00018 
00019 /* $Id: cyr_convert.c 321634 2012-01-01 13:15:04Z felipe $ */
00020 
00021 #include <stdlib.h>
00022 
00023 #ifdef HAVE_UNISTD_H
00024 #include <unistd.h>
00025 #endif
00026 #include <string.h>
00027 #include <errno.h>
00028 
00029 #include "php.h"
00030 #include "cyr_convert.h"
00031 
00032 #include <stdio.h>
00033 
00034 /*****************************************************************************
00035 * These are the code tables for the different Cyrillic charsets (relative to koi8-r).
00036 * Each table covers all 256 byte values; bytes 0-127 (ASCII) map to themselves.
00037 * The first 256 entries convert from the corresponding charset to koi8-r,
00038 * the second 256 entries convert back, from koi8-r to that charset.
00039 *
00040 * Here we have the following tables:
00041 * _cyr_win1251   - for windows-1251 charset
00042 * _cyr_iso88595  - for iso8859-5 charset
00043 * _cyr_cp866     - for x-cp866 charset
00044 * _cyr_mac       - for x-mac-cyrillic charset
00045 *
00046 *****************************************************************************/
00047 
/* Storage type for one charset's conversion data: a 512-byte array that the
 * tables below fill with two 256-entry lookup halves. */
00048 typedef unsigned char _cyr_charset_table[512];
00049 
00050 /* {{{ static const _cyr_charset_table _cyr_win1251
00051  */
/* A single declaration defines all four tables (_cyr_win1251, _cyr_cp866,
 * _cyr_iso88595, _cyr_mac). As used by php_convert_cyr_string():
 *   - entries 0..255 are indexed by a byte of the named charset and give the
 *     koi8-r byte (the table is read this way when it is the source charset);
 *   - entries 256..511 are indexed by (koi8-r byte + 256) and give the byte of
 *     the named charset (read this way when it is the destination charset).
 * In every half the first 128 entries are the identity mapping. */
00052 static const _cyr_charset_table _cyr_win1251 = {
/* windows-1251 -> koi8-r (entries 0..255) */
00053 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00054 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00055 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00056 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00057 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00058 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00059 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00060 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00061 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
00062 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
00063 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
00064 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
00065 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
00066 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
00067 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
00068 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
/* koi8-r -> windows-1251 (entries 256..511) */
00069 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00070 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00071 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00072 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00073 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00074 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00075 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00076 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00077 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00078 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00079 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
00080 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
00081 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
00082 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
00083 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
00084 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
00085 },
00086 _cyr_cp866 = { 
/* x-cp866 -> koi8-r (entries 0..255) */
00087 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00088 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00089 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00090 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00091 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00092 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00093 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00094 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00095 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
00096 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
00097 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
00098 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
00099 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
00100 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
00101 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
00102 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
/* koi8-r -> x-cp866 (entries 256..511) */
00103 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00104 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00105 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00106 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00107 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00108 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00109 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00110 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00111 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00112 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00113 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
00114 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
00115 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
00116 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
00117 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
00118 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
00119 },
00120 _cyr_iso88595 = {
/* iso8859-5 -> koi8-r (entries 0..255) */
00121 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00122 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00123 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00124 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00125 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00126 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00127 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00128 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00129 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00130 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00131 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00132 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
00133 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
00134 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
00135 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
00136 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
/* koi8-r -> iso8859-5 (entries 256..511) */
00137 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00138 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00139 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00140 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00141 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00142 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00143 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00144 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00145 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00146 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
00147 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
00148 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
00149 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
00150 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
00151 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
00152 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
00153 },
00154 _cyr_mac = {
/* x-mac-cyrillic -> koi8-r (entries 0..255) */
00155 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00156 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00157 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00158 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00159 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00160 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00161 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00162 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00163 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
00164 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
00165 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
00166 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
00167 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
00168 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
00169 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
00170 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
/* koi8-r -> x-mac-cyrillic (entries 256..511) */
00171 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
00172 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
00173 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
00174 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
00175 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
00176 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
00177 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
00178 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
00179 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
00180 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
00181 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
00182 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
00183 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
00184 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
00185 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
00186 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
00187 };
00188 /* }}} */
00189 
00190 /* {{{ static char * php_convert_cyr_string(unsigned char *str, int length, char from, char to TSRMLS_DC)
00191 * This is the function that performs real in-place conversion of the string 
00192 * between charsets. 
00193 * Parameters:
00194 *    str - string to be converted
00195 *    from,to - one-symbol label of source and destination charset
00196 * The following symbols are used as labels:
00197 *    k - koi8-r
00198 *    w - windows-1251
00199 *    i - iso8859-5
00200 *    a - x-cp866
00201 *    d - x-cp866
00202 *    m - x-mac-cyrillic
00203 *****************************************************************************/
00204 static char * php_convert_cyr_string(unsigned char *str, int length, char from, char to TSRMLS_DC)
00205 {
00206        const unsigned char *from_table, *to_table;
00207        unsigned char tmp;
00208        int i;
00209 
00210        from_table = NULL;
00211        to_table   = NULL;
00212        
00213        switch (toupper((int)(unsigned char)from))
00214        {
00215               case 'W':
00216                      from_table = _cyr_win1251;
00217                      break;
00218               case 'A':
00219               case 'D':
00220                      from_table = _cyr_cp866;
00221                      break;
00222               case 'I':
00223                      from_table = _cyr_iso88595;
00224                      break;
00225               case 'M':
00226                      from_table = _cyr_mac;
00227                      break;
00228               case 'K':
00229                      break;
00230               default:
00231                      php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown source charset: %c", from);
00232                      break;
00233        }
00234 
00235        switch (toupper((int)(unsigned char)to))
00236        {
00237               case 'W':
00238                      to_table = _cyr_win1251;
00239                      break;
00240               case 'A':
00241               case 'D':
00242                      to_table = _cyr_cp866;
00243                      break;
00244               case 'I':
00245                      to_table = _cyr_iso88595;
00246                      break;
00247               case 'M':
00248                      to_table = _cyr_mac;
00249                      break;
00250               case 'K':
00251                      break;
00252               default:
00253                      php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown destination charset: %c", to);
00254                      break;
00255        }
00256 
00257 
00258        if (!str)
00259               return (char *)str;
00260        
00261        for( i = 0; i<length; i++)
00262        {
00263               tmp = (from_table == NULL)? str[i] : from_table[ str[i] ];
00264               str[i] = (to_table == NULL) ? tmp : to_table[tmp + 256];
00265        }
00266        return (char *)str;
00267 }
00268 /* }}} */
00269 
00270 /* {{{ proto string convert_cyr_string(string str, string from, string to)
00271    Convert from one Cyrillic character set to another */
00272 PHP_FUNCTION(convert_cyr_string)
00273 {
00274        char *input, *fr_cs, *to_cs;
00275        int input_len, fr_cs_len, to_cs_len;
00276        unsigned char *str;
00277 
00278        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sss", &input, &input_len, &fr_cs, &fr_cs_len, &to_cs, &to_cs_len) == FAILURE) {
00279               return;
00280        }
00281 
00282        str = (unsigned char*) estrndup(input, input_len);
00283 
00284        php_convert_cyr_string(str, input_len, fr_cs[0], to_cs[0] TSRMLS_CC);
00285        RETVAL_STRING((char *)str, 0)
00286 }
00287 /* }}} */
00288 
00289 /*
00290  * Local variables:
00291  * tab-width: 4
00292  * c-basic-offset: 4
00293  * End:
00294  * vim600: sw=4 ts=4 fdm=marker
00295  * vim<600: sw=4 ts=4
00296  */