Back to index

openldap  2.4.31
utf-8.c
Go to the documentation of this file.
00001 /* utf-8.c -- Basic UTF-8 routines */
00002 /* $OpenLDAP$ */
00003 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00004  *
00005  * Copyright 1998-2012 The OpenLDAP Foundation.
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted only as authorized by the OpenLDAP
00010  * Public License.
00011  *
00012  * A copy of this license is available in the file LICENSE in the
00013  * top-level directory of the distribution or, alternatively, at
00014  * <http://www.OpenLDAP.org/license.html>.
00015  */
00016 /* Basic UTF-8 routines
00017  *
00018  * These routines are "dumb".  Though they understand UTF-8,
00019  * they don't grok Unicode.  That is, they can push bits,
00020  * but don't have a clue what the bits represent.  That's
00021  * good enough for use with the LDAP Client SDK.
00022  *
00023  * These routines are not optimized.
00024  */
00025 
00026 #include "portable.h"
00027 
00028 #include <stdio.h>
00029 
00030 #include <ac/stdlib.h>
00031 
00032 #include <ac/socket.h>
00033 #include <ac/string.h>
00034 #include <ac/time.h>
00035 
00036 #include "ldap_utf8.h"
00037 
00038 #include "ldap-int.h"
00039 #include "ldap_defaults.h"
00040 
00041 /*
00042  * return the number of bytes required to hold the
00043  * NULL-terminated UTF-8 string NOT INCLUDING the
00044  * termination.
00045  */
00046 ber_len_t ldap_utf8_bytes( const char * p )
00047 {
00048        ber_len_t bytes;
00049 
00050        for( bytes=0; p[bytes]; bytes++ ) {
00051               /* EMPTY */ ;
00052        }
00053 
00054        return bytes;
00055 }
00056 
00057 ber_len_t ldap_utf8_chars( const char * p )
00058 {
00059        /* could be optimized and could check for invalid sequences */
00060        ber_len_t chars=0;
00061 
00062        for( ; *p ; LDAP_UTF8_INCR(p) ) {
00063               chars++;
00064        }
00065 
00066        return chars;
00067 }
00068 
/*
 * Return the offset, in bytes, from p to the start of the
 * next character as located by LDAP_UTF8_NEXT.
 */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return next - p;
}
00074 
/*
 * Sequence length indicated by a first octet that has its high
 * bit set.  The table is indexed by (octet ^ 0x80), i.e. entry 0
 * corresponds to octet 0x80 and entry 127 to octet 0xFF.
 * A value of 0 marks an octet that cannot start a character.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xCF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 - 0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 - 0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0 - 0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Return the length in octets of the character starting at p,
 * judged from its first octet only: 1 for an octet without the
 * high bit, otherwise the ldap_utf8_lentab[] entry (0 if the
 * octet cannot start a character).
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *) p;

	if ( lead < 0x80 ) {
		return 1;
	}

	return ldap_utf8_lentab[lead ^ 0x80];
}
00095 
00096 /*
00097  * Make sure the UTF-8 char used the shortest possible encoding
00098  * returns charlen if valid, 0 if not. 
00099  *
00100  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
00101  * The table is slightly modified from that of the RFC.
00102  *
00103  * UCS-4 range (hex)      UTF-8 sequence (binary)
00104  * 0000 0000-0000 007F   0.......
00105  * 0000 0080-0000 07FF   110++++. 10......
00106  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
00107  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
00108  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
00109  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
00110  *
00111  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
00112  * at least one of the '+' bits must be set, otherwise the character
00113  * should have been encoded in fewer octets. Note that in the two-octet
00114  * case, only the first octet needs to be validated, and this is done
00115  * in the ldap_utf8_lentab[] above.
00116  */
00117 
/*
 * Mask of required bits in the second octet, indexed by the low
 * five bits of the first octet.  A sequence longer than two octets
 * uses the shortest encoding only if (mask & second octet) is
 * non-zero.
 *
 *   0x20 / 0x30 / 0x38 / 0x3c  the '+' bits of the 3-, 4-, 5- and
 *                              6-octet forms
 *   0x80                       no extra requirement beyond the high bit
 *   0x00                       never satisfied
 */
const char ldap_utf8_mintab[] = {
	(char)0x20, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x30, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x38, (char)0x80, (char)0x80, (char)0x80,
	(char)0x3c, (char)0x80, (char)0x00, (char)0x00
};
00127 
00128 int ldap_utf8_charlen2( const char * p )
00129 {
00130        int i = LDAP_UTF8_CHARLEN( p );
00131 
00132        if ( i > 2 ) {
00133               if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
00134                      i = 0;
00135        }
00136        return i;
00137 }
00138 
00139 /* conv UTF-8 to UCS-4, useful for comparisons */
00140 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
00141 {
00142     const unsigned char *c = (const unsigned char *) p;
00143     ldap_ucs4_t ch;
00144        int len, i;
00145        static unsigned char mask[] = {
00146               0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
00147 
00148        len = LDAP_UTF8_CHARLEN2(p, len);
00149 
00150        if( len == 0 ) return LDAP_UCS4_INVALID;
00151 
00152        ch = c[0] & mask[len];
00153 
00154        for(i=1; i < len; i++) {
00155               if ((c[i] & 0xc0) != 0x80) {
00156                      return LDAP_UCS4_INVALID;
00157               }
00158 
00159               ch <<= 6;
00160               ch |= c[i] & 0x3f;
00161        }
00162 
00163        return ch;
00164 }
00165 
00166 /* conv UCS-4 to UTF-8, not used */
00167 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
00168 {
00169        int len=0;
00170        unsigned char* p = (unsigned char *) buf;
00171 
00172        /* not a valid Unicode character */
00173        if ( c < 0 ) return 0;
00174 
00175        /* Just return length, don't convert */
00176        if(buf == NULL) {
00177               if( c < 0x80 ) return 1;
00178               else if( c < 0x800 ) return 2;
00179               else if( c < 0x10000 ) return 3;
00180               else if( c < 0x200000 ) return 4;
00181               else if( c < 0x4000000 ) return 5;
00182               else return 6;
00183        }
00184 
00185        if( c < 0x80 ) {
00186               p[len++] = c;
00187 
00188        } else if( c < 0x800 ) {
00189               p[len++] = 0xc0 | ( c >> 6 );
00190               p[len++] = 0x80 | ( c & 0x3f );
00191 
00192        } else if( c < 0x10000 ) {
00193               p[len++] = 0xe0 | ( c >> 12 );
00194               p[len++] = 0x80 | ( (c >> 6) & 0x3f );
00195               p[len++] = 0x80 | ( c & 0x3f );
00196 
00197        } else if( c < 0x200000 ) {
00198               p[len++] = 0xf0 | ( c >> 18 );
00199               p[len++] = 0x80 | ( (c >> 12) & 0x3f );
00200               p[len++] = 0x80 | ( (c >> 6) & 0x3f );
00201               p[len++] = 0x80 | ( c & 0x3f );
00202 
00203        } else if( c < 0x4000000 ) {
00204               p[len++] = 0xf8 | ( c >> 24 );
00205               p[len++] = 0x80 | ( (c >> 18) & 0x3f );
00206               p[len++] = 0x80 | ( (c >> 12) & 0x3f );
00207               p[len++] = 0x80 | ( (c >> 6) & 0x3f );
00208               p[len++] = 0x80 | ( c & 0x3f );
00209 
00210        } else /* if( c < 0x80000000 ) */ {
00211               p[len++] = 0xfc | ( c >> 30 );
00212               p[len++] = 0x80 | ( (c >> 24) & 0x3f );
00213               p[len++] = 0x80 | ( (c >> 18) & 0x3f );
00214               p[len++] = 0x80 | ( (c >> 12) & 0x3f );
00215               p[len++] = 0x80 | ( (c >> 6) & 0x3f );
00216               p[len++] = 0x80 | ( c & 0x3f );
00217        }
00218 
00219        return len;
00220 }
00221 
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8:
 * 0 if c is negative, otherwise 1 through 6.
 *
 * The argument and the whole expansion are parenthesized so the
 * macro can be used inside larger expressions.  Note that c is
 * evaluated more than once; do not pass an expression with side
 * effects.
 */
#define LDAP_UCS_UTF8LEN(c) \
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
00225 
00226 /* Convert a string to UTF-8 format. The input string is expected to
00227  * have characters of 1, 2, or 4 octets (in network byte order)
00228  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
00229  * types respectively. (Here T61STRING just means that there is one
00230  * octet per character and characters may use the high bit of the octet.
00231  * The characters are assumed to use ISO mappings, no provision is made
00232  * for converting from T.61 coding rules to Unicode.)
00233  */
00234 
00235 int
00236 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
00237 {
00238        unsigned char *in, *end;
00239        char *ptr;
00240        ldap_ucs4_t u;
00241        int i, l = 0;
00242 
00243        utf8s->bv_val = NULL;
00244        utf8s->bv_len = 0;
00245 
00246        in = (unsigned char *)ucs->bv_val;
00247 
00248        /* Make sure we stop at an even multiple of csize */
00249        end = in + ( ucs->bv_len & ~(csize-1) );
00250        
00251        for (; in < end; ) {
00252               u = *in++;
00253               if (csize > 1) {
00254                      u <<= 8;
00255                      u |= *in++;
00256               }
00257               if (csize > 2) {
00258                      u <<= 8;
00259                      u |= *in++;
00260                      u <<= 8;
00261                      u |= *in++;
00262               }
00263               i = LDAP_UCS_UTF8LEN(u);
00264               if (i == 0)
00265                      return LDAP_INVALID_SYNTAX;
00266               l += i;
00267        }
00268 
00269        utf8s->bv_val = LDAP_MALLOC( l+1 );
00270        if (utf8s->bv_val == NULL)
00271               return LDAP_NO_MEMORY;
00272        utf8s->bv_len = l;
00273 
00274        ptr = utf8s->bv_val;
00275        for (in = (unsigned char *)ucs->bv_val; in < end; ) {
00276               u = *in++;
00277               if (csize > 1) {
00278                      u <<= 8;
00279                      u |= *in++;
00280               }
00281               if (csize > 2) {
00282                      u <<= 8;
00283                      u |= *in++;
00284                      u <<= 8;
00285                      u |= *in++;
00286               }
00287               ptr += ldap_x_ucs4_to_utf8(u, ptr);
00288        }
00289        *ptr = '\0';
00290        return LDAP_SUCCESS;
00291 }
00292 
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length indicated by the first octet and instead
 * relies on continuation markers to find the start of the next
 * character.  This allows for "resyncing" when invalid characters
 * are provided, as long as the start of the next character
 * appears within the 6 bytes examined.
 *
 * Returns a pointer at most 6 bytes past p.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int off = 1;

	if( LDAP_UTF8_ISASCII(bytes) ) {
		return (char *) &p[1];
	}

	/* skip continuation octets (10xxxxxx); a NUL also ends the scan */
	while ( off < 6 && ( bytes[off] & 0xc0 ) == 0x80 ) {
		off++;
	}

	return (char *) &p[off];
}
00319 
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers to find the start of the previous
 * character.  This allows for "resyncing" when invalid characters
 * are provided, as long as a character start appears within the
 * bytes examined.
 *
 * Up to 5 bytes before p are read and the result is at most 6 bytes
 * before p.  No check is made against the beginning of the buffer;
 * the caller must make sure those bytes are accessible.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int back = -1;

	/* walk backwards over continuation octets (10xxxxxx) */
	while ( back > -6 && ( bytes[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
00342 
/*
 * Copy one UTF-8 character from src to dst, returning the
 * number of bytes copied (1 to 6).  dst is not NUL-terminated.
 *
 * Ignores the length indicated by the first octet and instead
 * relies on continuation markers to find the end of the
 * character.  This allows for "resyncing" when invalid characters
 * are provided, as long as the start of the next character
 * appears within the 6 bytes examined.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *bytes = (const unsigned char *) src;
	int n = 1;

	/* the first octet is always copied */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(bytes) ) {
		return 1;
	}

	/* then every continuation octet (10xxxxxx) that follows */
	while ( n < 6 && ( bytes[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
00373 
00374 #ifndef UTF8_ALPHA_CTYPE
00375 /*
00376  * UTF-8 ctype routines
00377  * Only deals with characters < 0x80 (ie: US-ASCII)
00378  */
00379 
/* Apply LDAP_ASCII to the first octet at p and return the result. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
00385 
/*
 * Test the first octet at p with LDAP_DIGIT.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_DIGIT( octet ) );
}
00394 
/*
 * Test the first octet at p with LDAP_HEX.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_HEX(octet) );
}
00403 
/*
 * Return 1 if the first octet at p is one of the white space
 * characters ' ', '\t', '\n', '\r', '\v' or '\f', else 0.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if(!LDAP_ASCII(octet)) return 0;

	return octet == ' ' || octet == '\t' || octet == '\n' ||
		octet == '\r' || octet == '\v' || octet == '\f';
}
00422 
00423 /*
00424  * These are not needed by the C SDK and are
00425  * not "good enough" for general use.
00426  */
/*
 * Test the first octet at p with LDAP_ALPHA.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_ALPHA(octet) );
}
00435 
/*
 * Test the first octet at p with LDAP_ALNUM.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_ALNUM(octet) );
}
00444 
/*
 * Test the first octet at p with LDAP_LOWER.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_LOWER(octet) );
}
00453 
/*
 * Test the first octet at p with LDAP_UPPER.
 * Octets rejected by LDAP_ASCII always yield 0.
 */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return !LDAP_ASCII(octet) ? 0 : ( LDAP_UPPER(octet) );
}
00462 #endif
00463 
00464 
00465 /*
00466  * UTF-8 string routines
00467  */
00468 
00469 /* like strchr() */
00470 char * (ldap_utf8_strchr)( const char *str, const char *chr )
00471 {
00472        for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
00473               if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
00474                      return (char *) str;
00475               } 
00476        }
00477 
00478        return NULL;
00479 }
00480 
00481 /* like strcspn() but returns number of bytes, not characters */
00482 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
00483 {
00484        const char *cstr;
00485        const char *cset;
00486 
00487        for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
00488               for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
00489                      if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
00490                             return cstr - str;
00491                      } 
00492               }
00493        }
00494 
00495        return cstr - str;
00496 }
00497 
00498 /* like strspn() but returns number of bytes, not characters */
00499 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
00500 {
00501        const char *cstr;
00502        const char *cset;
00503 
00504        for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
00505               for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
00506                      if( *cset == '\0' ) {
00507                             return cstr - str;
00508                      }
00509 
00510                      if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
00511                             break;
00512                      } 
00513               }
00514        }
00515 
00516        return cstr - str;
00517 }
00518 
00519 /* like strpbrk(), replaces strchr() as well */
00520 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
00521 {
00522        for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
00523               const char *cset;
00524 
00525               for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
00526                      if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
00527                             return (char *) str;
00528                      } 
00529               }
00530        }
00531 
00532        return NULL;
00533 }
00534 
/*
 * Like strtok_r(), not strtok().
 *
 * str   string to tokenize on the first call, NULL to continue
 *       with the position saved in *last
 * sep   set of separator characters
 * last  caller-provided storage for the scan position
 *
 * Returns the next token, NUL-terminated in place (the string is
 * modified), or NULL when there are no more tokens or last is NULL.
 * Once the tokens are exhausted *last is set to NULL; further
 * continuation calls keep returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* continuation call after the tokens ran out: nothing left to scan */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the resume point before the separator is overwritten */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}