Back to index

php5  5.3.10
grapheme_util.c
Go to the documentation of this file.
00001 /*
00002    +----------------------------------------------------------------------+
00003    | PHP Version 5                                                        |
00004    +----------------------------------------------------------------------+
00005    | This source file is subject to version 3.01 of the PHP license,      |
00006    | that is bundled with this package in the file LICENSE, and is        |
00007    | available through the world-wide-web at the following url:           |
00008    | http://www.php.net/license/3_01.txt                                  |
00009    | If you did not receive a copy of the PHP license and are unable to   |
00010    | obtain it through the world-wide-web, please send a note to          |
00011    | license@php.net so we can mail you a copy immediately.               |
00012    +----------------------------------------------------------------------+
00013    | Author: Ed Batutis <ed@batutis.com>                                  |
00014    +----------------------------------------------------------------------+
00015  */
00016 
00017 /* {{{ includes */
00018 #ifdef HAVE_CONFIG_H
00019 #include "config.h"
00020 #endif
00021 
00022 #include <php.h>
00023 #include "grapheme.h"
00024 #include "grapheme_util.h"
00025 #include "intl_common.h"
00026 
00027 #include <unicode/utypes.h>
00028 #include <unicode/ucol.h>
00029 #include <unicode/ustring.h>
00030 #include <unicode/ubrk.h>
00031 
00032 #include "ext/standard/php_string.h"
00033 
00034 ZEND_EXTERN_MODULE_GLOBALS( intl )
00035 
00036 /* }}} */
00037 
00038 /* {{{ grapheme_close_global_iterator - clean up */
00039 void
00040 grapheme_close_global_iterator( TSRMLS_D )
00041 {
00042        UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
00043 
00044        if ( NULL != global_break_iterator ) {
00045               ubrk_close(global_break_iterator);
00046        }
00047 }
00048 /* }}} */
00049 
00050 /* {{{ grapheme_intl_case_fold: convert string to lowercase */
00051 void
00052 grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus )
00053 {
00054     UChar *dest;
00055     int32_t dest_len, size_required;
00056 
00057     /* allocate a destination string that is a bit larger than the src, hoping that is enough */
00058     dest_len = (*str_len) + ( *str_len / 10 );
00059     dest = (UChar*) eumalloc(dest_len);
00060 
00061     *pstatus = U_ZERO_ERROR;
00062     size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
00063 
00064     dest_len = size_required;
00065 
00066     if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) {
00067 
00068         dest = (UChar*) eurealloc(dest, dest_len);
00069 
00070         *pstatus = U_ZERO_ERROR;
00071         size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
00072     }
00073 
00074     if ( U_FAILURE(*pstatus) ) {
00075         return;
00076     }
00077 
00078     if ( NULL != ptr_to_free) {
00079         efree(*ptr_to_free);
00080         *ptr_to_free = dest;
00081     }
00082 
00083     *str = dest;
00084     *str_len = dest_len;
00085 
00086     return;
00087 }
00088 /* }}} */
00089 
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
/*
 * Computes a substring window over a byte string without copying it.
 *
 * str, str_len  source buffer and its length in bytes.
 * f             start position; a negative value counts from the end.
 * l             length; a negative value stops that many bytes before the
 *               end. Only honoured when argc > 2, otherwise the window runs
 *               to the end of the string.
 * argc          number of arguments the caller received.
 * sub_str       out: pointer into str at the start of the window, or NULL
 *               when the requested range lies outside the string.
 * sub_str_len   out: window length in bytes; 0 whenever *sub_str is NULL.
 */
void
grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len)
{
    *sub_str = NULL;
    *sub_str_len = 0;   /* never leave the out-parameter indeterminate */

    if (argc > 2) {
        /* compare against -str_len rather than negating l: -INT_MIN overflows */
        if (l < 0 && l < -str_len) {
            return;
        } else if (l > str_len) {
            l = str_len;
        }
    } else {
        l = str_len;
    }

    if (f > str_len || (f < 0 && f < -str_len)) {
        return;
    }

    if (l < 0 && (l + str_len - f) < 0) {
        return;
    }

    /* if "from" position is negative, count start position from the end
     * of the string
     */
    if (f < 0) {
        f = str_len + f;
        if (f < 0) {
            f = 0;
        }
    }

    /* if "length" position is negative, set it to the length
     * needed to stop that many chars from the end of the string
     */
    if (l < 0) {
        l = (str_len - f) + l;
        if (l < 0) {
            l = 0;
        }
    }

    if (f >= str_len) {
        return;
    }

    /* clip the window to the end of the string */
    if ((f + l) > str_len) {
        l = str_len - f;
    }

    *sub_str = str + f;
    *sub_str_len = l;

    return;
}
/* }}} */
00149 
00150 /* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */
00151 int
00152 grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
00153 {
00154     UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
00155     int32_t uhaystack_len, uneedle_len;
00156     UErrorCode status;
00157     unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
00158     UBreakIterator* bi = NULL;
00159     int ret_pos, pos;
00160 
00161     /* convert the strings to UTF-16. */
00162     uhaystack = NULL;
00163     uhaystack_len = 0;
00164     status = U_ZERO_ERROR;
00165     intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
00166 
00167     if ( U_FAILURE( status ) ) {
00168         /* Set global error code. */
00169         intl_error_set_code( NULL, status TSRMLS_CC );
00170 
00171         /* Set error messages. */
00172         intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
00173         if (uhaystack) {
00174                      efree( uhaystack );
00175               }
00176         return -1;
00177     }
00178 
00179     if ( f_ignore_case ) {
00180         grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
00181     }
00182 
00183     /* get a pointer to the haystack taking into account the offset */
00184     bi = NULL;
00185     status = U_ZERO_ERROR;
00186     bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
00187 
00188     puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
00189 
00190     if ( NULL == puhaystack ) {
00191         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
00192         if (uhaystack) {
00193                      efree( uhaystack );
00194               }
00195         ubrk_close (bi);
00196         return -1;
00197     }
00198 
00199     uneedle = NULL;
00200     uneedle_len = 0;
00201     status = U_ZERO_ERROR;
00202     intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
00203 
00204     if ( U_FAILURE( status ) ) {
00205         /* Set global error code. */
00206         intl_error_set_code( NULL, status TSRMLS_CC );
00207 
00208         /* Set error messages. */
00209         intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
00210         if (uhaystack) {
00211                      efree( uhaystack );
00212               }
00213               if (uneedle) {
00214                      efree( uneedle );
00215               }
00216         ubrk_close (bi);
00217         return -1;
00218     }
00219 
00220     if ( f_ignore_case ) {
00221         grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
00222     }
00223 
00224     ret_pos = -1;   /* -1 represents 'not found' */
00225 
00226     /* back up until there's needle_len characters to compare */
00227 
00228     uhaystack_end = uhaystack + uhaystack_len;
00229     pos = ubrk_last(bi);
00230     puhaystack = uhaystack + pos;
00231 
00232     while ( uhaystack_end - puhaystack < uneedle_len ) {
00233 
00234         pos = ubrk_previous(bi);
00235 
00236         if ( UBRK_DONE == pos ) {
00237             break;
00238         }
00239 
00240         puhaystack = uhaystack + pos;
00241     }
00242 
00243     /* is there enough haystack left to hold the needle? */
00244     if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
00245         /* not enough, not found */
00246         goto exit;
00247     }
00248 
00249     while ( UBRK_DONE != pos ) {
00250 
00251         if (!u_memcmp(uneedle, puhaystack, uneedle_len)) {  /* needle_len - 1 in zend memnstr? */
00252 
00253             /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */
00254 
00255             if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {
00256 
00257                 /* found it, get grapheme count offset */
00258                 ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
00259                 break;
00260             }
00261 
00262             /* set position back */
00263             ubrk_isBoundary(bi, pos);
00264         }
00265 
00266         pos = ubrk_previous(bi);
00267         puhaystack = uhaystack + pos;
00268     }
00269 
00270 exit:
00271        if (uhaystack) {
00272               efree( uhaystack );
00273        }
00274        if (uneedle) {
00275               efree( uneedle );
00276        }
00277     ubrk_close (bi);
00278 
00279     return ret_pos;
00280 }
00281 
00282 /* }}} */
00283 
00284 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
00285 int
00286 grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC)
00287 {
00288        UChar *uhaystack, *puhaystack, *uneedle;
00289        int32_t uhaystack_len, uneedle_len;
00290        int ret_pos;
00291        unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
00292        UBreakIterator* bi;
00293        UErrorCode status;
00294 
00295        *puchar_pos = -1;
00296 
00297        /* convert the strings to UTF-16. */
00298 
00299        uhaystack = NULL;
00300        uhaystack_len = 0;
00301        status = U_ZERO_ERROR;
00302        intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
00303 
00304        if ( U_FAILURE( status ) ) {
00305               /* Set global error code. */
00306               intl_error_set_code( NULL, status TSRMLS_CC );
00307 
00308               /* Set error messages. */
00309               intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
00310               if (uhaystack) {
00311                      efree( uhaystack );
00312               }
00313               return -1;
00314        }
00315 
00316        /* get a pointer to the haystack taking into account the offset */
00317        bi = NULL;
00318        status = U_ZERO_ERROR;
00319        bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
00320        
00321        puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
00322        uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack));
00323 
00324        if ( NULL == puhaystack ) {
00325        
00326               intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
00327               if (uhaystack) {
00328                      efree( uhaystack );
00329               }
00330               ubrk_close (bi);
00331                                    
00332               return -1;
00333        }
00334 
00335        if ( f_ignore_case ) {
00336               grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status );
00337        }
00338 
00339        uneedle = NULL;
00340        uneedle_len = 0;
00341        status = U_ZERO_ERROR;
00342        intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
00343 
00344        if ( U_FAILURE( status ) ) {
00345               /* Set global error code. */
00346               intl_error_set_code( NULL, status TSRMLS_CC );
00347 
00348               /* Set error messages. */
00349               intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
00350               if (uhaystack) {
00351                      efree( uhaystack );
00352               }
00353               if (uneedle) {
00354                      efree( uneedle );
00355               }
00356               ubrk_close (bi);
00357               
00358               return -1;
00359        }
00360 
00361        if ( f_ignore_case ) {
00362               grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
00363        }
00364 
00365        ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len );
00366        
00367        *puchar_pos = ubrk_current(bi);
00368 
00369        if (uhaystack) {
00370               efree( uhaystack );
00371        }
00372        if (uneedle) {
00373               efree( uneedle );
00374        }
00375        ubrk_close (bi);
00376 
00377        return ret_pos;
00378 }
00379 
00380 /* }}} */
00381 
/* {{{ grapheme_ascii_check: ASCII check */
/*
 * Checks whether the first len bytes at day are all 7-bit ASCII.
 *
 * Returns len when every byte is <= 0x7f (so 0 for an empty buffer), and -1
 * when any byte is above 0x7f or when len is negative.
 */
int grapheme_ascii_check(const unsigned char *day, int32_t len)
{
    int32_t i;

    /* a negative length must not be allowed to drive the scan off the buffer */
    if ( len < 0 ) {
        return -1;
    }

    for ( i = 0; i < len; i++ ) {
        if ( day[i] > 0x7f ) {
            return -1;
        }
    }

    return len;
}

/* }}} */
00395 
00396 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
00397 int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
00398 {
00399        unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
00400        UErrorCode           status = U_ZERO_ERROR;
00401        int ret_len, pos;
00402        UBreakIterator* bi;
00403 
00404        bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
00405 
00406        if( U_FAILURE(status) ) {
00407               return -1;
00408        }
00409        
00410        ubrk_setText(bi, text, text_length,       &status);
00411 
00412        pos = 0;
00413        
00414        for ( ret_len = 0; pos != UBRK_DONE; ) {
00415        
00416               pos = ubrk_next(bi);
00417               
00418               if ( pos != UBRK_DONE ) {
00419               
00420                      if ( NULL != boundary_array && ret_len < boundary_array_len ) {
00421                             boundary_array[ret_len] = pos;
00422                      }
00423 
00424                      ret_len++;
00425               }
00426        }
00427                      
00428        ubrk_close(bi);
00429        
00430        return ret_len;
00431 }
00432 /* }}} */
00433 
00434 /* {{{ grapheme_count_graphemes */
00435 int32_t
00436 grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
00437 {
00438        int ret_len = 0;
00439        int pos = 0;
00440        UErrorCode           status = U_ZERO_ERROR;
00441        
00442        ubrk_setText(bi, string, string_len, &status);
00443 
00444        do {
00445        
00446               pos = ubrk_next(bi);
00447               
00448               if ( UBRK_DONE != pos ) {
00449                      ret_len++;
00450               }
00451               
00452        } while ( UBRK_DONE != pos );
00453        
00454        return ret_len;
00455 }
00456 /* }}} */
00457 
00458 /* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */
00459 int32_t
00460 grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end)
00461 {
00462        UChar *p = haystack;
00463        UChar ne = needle[needle_len-1];
00464        UErrorCode status;
00465        int32_t grapheme_offset;
00466        
00467        end -= needle_len;
00468 
00469        while (p <= end) {
00470 
00471               if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
00472 
00473                      if (!u_memcmp(needle, p, needle_len - 1)) {  /* needle_len - 1 works because if needle_len is 1, we've already tested the char */
00474 
00475                             /* does the grapheme end here? */
00476 
00477                             status = U_ZERO_ERROR;
00478                             ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status);
00479 
00480                             if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) {
00481 
00482                                    /* found it, get grapheme count offset */
00483                                    grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack));
00484 
00485                                    return grapheme_offset;
00486                             }
00487                      }
00488               }
00489 
00490               if (p == NULL) {
00491                      return -1;
00492               }
00493 
00494               p++;
00495        }
00496 
00497        return -1;
00498 }
00499 
00500 /* }}} */
00501 
/* {{{ grapheme_memrchr_grapheme: find the last occurrence of a byte in a buffer */
/*
 * Scans the n bytes at s backwards for the byte value (unsigned char)c.
 *
 * Returns a pointer to the last matching byte, or NULL when the byte does not
 * occur or n <= 0.
 *
 * Defined without `inline` so that an external definition is always emitted,
 * whichever inline semantics the compiler's C dialect applies.
 */
void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n)
{
    const unsigned char *base = (const unsigned char *)s;
    const unsigned char *e;

    if (n <= 0) {
        return NULL;
    }

    /* pre-decrement keeps e inside [base, base + n): no pointer below the buffer is formed */
    for (e = base + n; e != base; ) {
        --e;
        if (*e == (unsigned char)c) {
            return (void *)e;
        }
    }

    return NULL;
}
/* }}} */
00520 
00521 /* {{{        grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
00522 UChar *
00523 grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset)
00524 {
00525        UErrorCode           status;
00526        int32_t pos;
00527        int32_t (*iter_op)(UBreakIterator* bi);
00528        int iter_incr;
00529 
00530        if ( NULL != bi ) {
00531               status = U_ZERO_ERROR;
00532               ubrk_setText (bi, uhaystack, uhaystack_len, &status);
00533        }
00534 
00535        if ( 0 == offset ) {
00536               return uhaystack;
00537        }
00538        
00539        if ( offset < 0 ) {
00540               iter_op = ubrk_previous;
00541               ubrk_last(bi); /* one past the end */
00542               iter_incr = 1;
00543        }
00544        else {
00545               iter_op = ubrk_next;
00546               iter_incr = -1;
00547        }
00548        
00549        pos = 0;
00550        
00551        while ( pos != UBRK_DONE && offset != 0 ) {
00552        
00553               pos = iter_op(bi);
00554               
00555               if ( UBRK_DONE != pos ) {
00556                      offset += iter_incr;
00557               }
00558        }
00559 
00560        if ( offset != 0 ) {
00561               return NULL;
00562        }
00563        
00564        return uhaystack + pos;
00565 }
00566 /* }}} */
00567 
/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
/*
 * Finds the last occurrence of needle in a byte string.
 *
 * offset >= 0  matches starting before byte offset are ignored.
 * offset <  0  the search window ends -offset bytes before the end of the
 *              haystack, unless the needle is longer than -offset, in which
 *              case the last position where the needle still fits is used.
 *
 * Returns the byte index of the match from the start of haystack, or -1 when
 * there is none. Positions are tracked as integer indices so that no pointer
 * outside the haystack buffer is ever formed.
 */
int32_t
grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset)
{
    int32_t first;  /* lowest candidate start index */
    int32_t last;   /* highest candidate start index */
    int32_t i;

    if (offset >= 0) {
        first = offset;
        last = haystack_len - needle_len;
    } else {
        first = 0;
        /* same test as needle_len > -offset, written without negating offset */
        if (offset > -needle_len) {
            last = haystack_len - needle_len;
        } else {
            last = haystack_len + offset;
        }
    }

    if (needle_len == 1) {
        /* Single character search can shortcut memcmps */
        for (i = last; i >= first; i--) {
            if (haystack[i] == *needle) {
                return i;
            }
        }
        return -1;
    }

    for (i = last; i >= first; i--) {
        if (memcmp(haystack + i, needle, needle_len) == 0) {
            return i;
        }
    }

    return -1;
}

/* }}} */
00608 
00609 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
00610 UBreakIterator* 
00611 grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC )
00612 {
00613        int32_t buffer_size;
00614 
00615        UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
00616 
00617        if ( NULL == global_break_iterator ) {
00618 
00619               global_break_iterator = ubrk_open(UBRK_CHARACTER, 
00620                                                                              NULL,  /* icu default locale - locale has no effect on this iterator */
00621                                                                              NULL,  /* text not set in global iterator */
00622                                                                              0,            /* text length = 0 */
00623                                                                              status);
00624 
00625               INTL_G(grapheme_iterator) = global_break_iterator;
00626        }
00627 
00628        buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
00629 
00630        return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
00631 }
00632 /* }}} */
00633 
00634 /*
00635  * Local variables:
00636  * tab-width: 4
00637  * c-basic-offset: 4
00638  * End:
00639  * vim600: fdm=marker
00640  * vim: noet sw=4 ts=4
00641  */
00642