Back to index

php5  5.3.10
normalizer_normalize.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                                                                |
00004    +----------------------------------------------------------------------+
00005    | This source file is subject to version 3.01 of the PHP license,    |
00006    | that is bundled with this package in the file LICENSE, and is             |
00007    | available through the world-wide-web at the following url:                       |
00008    | http://www.php.net/license/3_01.txt                                                     |
00009    | If you did not receive a copy of the PHP license and are unable to   |
00010    | obtain it through the world-wide-web, please send a note to               |
00011    | license@php.net so we can mail you a copy immediately.                           |
00012    +----------------------------------------------------------------------+
00013    | Authors: Ed Batutis <ed@batutis.com>                                                    |
00014    +----------------------------------------------------------------------+
00015  */
00016 
00017 #ifdef HAVE_CONFIG_H
00018 #include "config.h"
00019 #endif
00020 
00021 #include "php_intl.h"
00022 #include "unicode/unorm.h"
00023 #include "normalizer.h"
00024 #include "normalizer_class.h"
00025 #include "normalizer_normalize.h"
00026 #include "intl_convert.h"
00027 
00028 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
00029  * Normalize a string. }}} */
00030 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
00031  * Normalize a string.
00032  */
00033 PHP_FUNCTION( normalizer_normalize )
00034 {
00035        char*                input = NULL;
00036        /* form is optional, defaults to FORM_C */
00037        long                 form = NORMALIZER_DEFAULT;
00038        int                  input_len = 0;
00039               
00040        UChar*               uinput = NULL;
00041        int                  uinput_len = 0;
00042        int                  expansion_factor = 1;
00043        UErrorCode           status = U_ZERO_ERROR;
00044               
00045        UChar*               uret_buf = NULL;
00046        int                  uret_len = 0;
00047               
00048        char*                ret_buf = NULL;
00049        int32_t                     ret_len = 0;
00050 
00051        int32_t                     size_needed;
00052               
00053        intl_error_reset( NULL TSRMLS_CC );
00054 
00055        /* Parse parameters. */
00056        if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
00057                             &input, &input_len, &form ) == FAILURE )
00058        {
00059               intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
00060                                            "normalizer_normalize: unable to parse input params", 0 TSRMLS_CC );
00061 
00062               RETURN_FALSE;
00063        }
00064 
00065        expansion_factor = 1;
00066 
00067        switch(form) {
00068               case NORMALIZER_NONE:
00069                      break;
00070               case NORMALIZER_FORM_D:
00071                      expansion_factor = 3;
00072                      break;
00073               case NORMALIZER_FORM_KD:
00074                      expansion_factor = 3;
00075                      break;
00076               case NORMALIZER_FORM_C:
00077               case NORMALIZER_FORM_KC:
00078                      break;
00079               default:
00080                      intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
00081                                           "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
00082                      RETURN_FALSE;
00083        }
00084 
00085        /*
00086         * Normalize string (converting it to UTF-16 first).
00087         */
00088 
00089        /* First convert the string to UTF-16. */
00090        intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
00091 
00092        if( U_FAILURE( status ) )
00093        {
00094               /* Set global error code. */
00095               intl_error_set_code( NULL, status TSRMLS_CC );
00096 
00097               /* Set error messages. */
00098               intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
00099               if (uinput) {
00100                      efree( uinput );
00101               }
00102               RETURN_FALSE;
00103        }
00104 
00105 
00106        /* Allocate memory for the destination buffer for normalization */
00107        uret_len = uinput_len * expansion_factor;
00108        uret_buf = eumalloc( uret_len + 1 );
00109 
00110        /* normalize */
00111        size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
00112        
00113        /* Bail out if an unexpected error occured.
00114         * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
00115         * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
00116         */    
00117        if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
00118               efree( uret_buf );
00119               efree( uinput );
00120               RETURN_NULL();
00121        }
00122 
00123        if ( size_needed > uret_len ) {
00124               /* realloc does not seem to work properly - memory is corrupted
00125                * uret_buf =  eurealloc(uret_buf, size_needed + 1);
00126                */
00127               efree( uret_buf );
00128               uret_buf = eumalloc( size_needed + 1 );
00129               uret_len = size_needed;
00130 
00131               status = U_ZERO_ERROR;
00132 
00133               /* try normalize again */
00134               size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
00135 
00136               /* Bail out if an unexpected error occured. */
00137               if( U_FAILURE(status)  ) {
00138                      /* Set error messages. */
00139                      intl_error_set_custom_msg( NULL,"Error normalizing string", 0 TSRMLS_CC );
00140                      efree( uret_buf );
00141                      efree( uinput );
00142                      RETURN_FALSE;
00143               }
00144        }
00145 
00146        efree( uinput );
00147 
00148        /* the buffer we actually used */
00149        uret_len = size_needed;
00150 
00151        /* Convert normalized string from UTF-16 to UTF-8. */
00152        intl_convert_utf16_to_utf8( &ret_buf, &ret_len, uret_buf, uret_len, &status );
00153        efree( uret_buf );
00154        if( U_FAILURE( status ) )
00155        {
00156               intl_error_set( NULL, status,
00157                             "normalizer_normalize: error converting normalized text UTF-8", 0 TSRMLS_CC );
00158               RETURN_FALSE;
00159        }
00160 
00161        /* Return it. */
00162        RETVAL_STRINGL( ret_buf, ret_len, FALSE );
00163 }
00164 /* }}} */
00165 
00166 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
00167  * Test if a string is in a given normalization form. }}} */
00168 /* {{{ proto bool normalizer_is_normalize( string $input [, string $form = FORM_C] )
00169  * Test if a string is in a given normalization form.
00170  */
00171 PHP_FUNCTION( normalizer_is_normalized )
00172 {
00173        char*         input = NULL;
00174        /* form is optional, defaults to FORM_C */
00175        long          form = NORMALIZER_DEFAULT;
00176        int           input_len = 0;
00177 
00178        UChar*        uinput = NULL;
00179        int           uinput_len = 0;
00180        UErrorCode    status = U_ZERO_ERROR;
00181               
00182        UBool         uret = FALSE;
00183               
00184        intl_error_reset( NULL TSRMLS_CC );
00185 
00186        /* Parse parameters. */
00187        if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
00188                             &input, &input_len, &form) == FAILURE )
00189        {
00190               intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
00191                             "normalizer_is_normalized: unable to parse input params", 0 TSRMLS_CC );
00192 
00193               RETURN_FALSE;
00194        }
00195 
00196        switch(form) {
00197               /* case NORMALIZER_NONE: not allowed - doesn't make sense */
00198 
00199               case NORMALIZER_FORM_D:
00200               case NORMALIZER_FORM_KD:
00201               case NORMALIZER_FORM_C:
00202               case NORMALIZER_FORM_KC:
00203                      break;
00204               default:
00205                      intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
00206                                           "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
00207                      RETURN_FALSE;
00208        }
00209 
00210 
00211        /*
00212         * Test normalization of string (converting it to UTF-16 first).
00213         */
00214 
00215        /* First convert the string to UTF-16. */
00216        intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
00217 
00218        if( U_FAILURE( status ) )
00219        {
00220               /* Set global error code. */
00221               intl_error_set_code( NULL, status TSRMLS_CC );
00222 
00223               /* Set error messages. */
00224               intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 TSRMLS_CC );
00225               if (uinput) {
00226                      efree( uinput );
00227               }
00228               RETURN_FALSE;
00229        }
00230 
00231 
00232        /* test string */
00233        uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
00234        
00235        efree( uinput );
00236 
00237        /* Bail out if an unexpected error occured. */
00238        if( U_FAILURE(status)  ) {
00239               /* Set error messages. */
00240               intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 TSRMLS_CC );
00241               RETURN_FALSE;
00242        }
00243 
00244        if ( uret )
00245               RETURN_TRUE;
00246                             
00247        RETURN_FALSE;
00248 }
00249 /* }}} */
00250 
00251 /*
00252  * Local variables:
00253  * tab-width: 4
00254  * c-basic-offset: 4
00255  * End:
00256  * vim600: noet sw=4 ts=4 fdm=marker
00257  * vim<600: noet sw=4 ts=4
00258  */