Back to index

openldap  2.4.31
utf-8-conv.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 1998-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in the file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
00016  * 
00017  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
00018  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
00019  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
00020  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
00021  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
00022  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
00023  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
00024  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 
00025  *---
00026  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License 
00027  * can be found in the file "build/LICENSE-2.0.1" in this distribution
00028  * of OpenLDAP Software.
00029  */
00030 
00031 /*
00032  * UTF-8 Conversion Routines
00033  *
00034  * These routines convert between Wide Character and UTF-8,
00035  * or between MultiByte and UTF-8 encodings.
00036  *
00037  * Both single character and string versions of the functions are provided.
00038  * All functions return -1 if the character or string cannot be converted.
00039  */
00040 
00041 #include "portable.h"
00042 
00043 #if SIZEOF_WCHAR_T >= 4
00044 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
00045 
#include <stdio.h>
#include <limits.h>			/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
00054 
/* Data-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6); entry 0 is a placeholder.  The table
 * is only ever read, so it is const. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
00056 
00057 
00058 /*-----------------------------------------------------------------------------
00059                                    UTF-8 Format Summary
00060 
00061 ASCII chars                                      7 bits
00062     0xxxxxxx
00063     
00064 2-character UTF-8 sequence:        11 bits
00065     110xxxxx  10xxxxxx
00066 
00067 3-character UTF-8                  16 bits
00068     1110xxxx  10xxxxxx  10xxxxxx   
00069     
00070 4-char UTF-8                       21 bits 
00071     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
00072     
00073 5-char UTF-8                       26 bits
00074     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
00075     
00076 6-char UTF-8                       31 bits
00077     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
00078     
00079 Unicode address space   (0 - 0x10FFFF)    21 bits
00080 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
00081 
00082 Note: This code does not prevent UTF-8 sequences which are longer than
00083       necessary from being decoded.
00084 */
00085 
00086 /*----------------------------------------------------------------------------- 
00087    Convert a UTF-8 character to a wide char. 
00088    Return the length of the UTF-8 input character in bytes.
00089 */
00090 int
00091 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
00092 {
00093        int utflen, i;
00094        wchar_t ch;
00095 
00096        if (utf8char == NULL) return -1;
00097 
00098        /* Get UTF-8 sequence length from 1st byte */
00099        utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
00100        
00101        if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
00102 
00103        /* First byte minus length tag */
00104        ch = (wchar_t)(utf8char[0] & mask[utflen]);
00105        
00106        for(i=1; i < utflen; i++) {
00107               /* Subsequent bytes must start with 10 */
00108               if ((utf8char[i] & 0xc0) != 0x80) return -1;
00109        
00110               ch <<= 6;                   /* 6 bits of data in each subsequent byte */
00111               ch |= (wchar_t)(utf8char[i] & 0x3f);
00112        }
00113        
00114        if (wchar) *wchar = ch;
00115 
00116        return utflen;
00117 }
00118 
/*-----------------------------------------------------------------------------
   Decode a NUL-terminated UTF-8 string into wide characters.

   wcstr    Output buffer; may be NULL, in which case nothing is stored,
            'count' is ignored and only the required length is computed.
   utf8str  Input string; NULL is handled like an empty string.
   count    Capacity of 'wcstr' in wide chars.

   Returns the number of wide chars stored (or required), not counting the
   terminator, or -1 if any character fails to decode.  The output is
   terminated only when fewer than 'count' wide chars were stored.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	const char *src = utf8str;
	size_t nwide = 0;
	wchar_t decoded;
	int nbytes;

	/* NULL or empty input yields an empty output string */
	if ( src == NULL || *src == '\0' ) {
		if ( wcstr != NULL ) {
			wcstr[0] = 0;
		}
		return 0;
	}

	for ( ;; ) {
		if ( *src == '\0' ) {
			break;				/* end of input */
		}
		if ( wcstr != NULL && nwide >= count ) {
			break;				/* output buffer is full */
		}

		/* Decode one character; a truncated sequence fails here because
		   the terminating NUL is not a valid trailing byte. */
		nbytes = ldap_x_utf8_to_wc( &decoded, src );
		if ( nbytes == -1 ) {
			return -1;
		}

		if ( wcstr != NULL ) {
			wcstr[nwide] = decoded;
		}

		src += nbytes;
		nwide++;
	}

	/* Terminate only if a slot is left over */
	if ( wcstr != NULL && nwide < count ) {
		wcstr[nwide] = 0;
	}

	return nwide;
}
00168 
00169 
/*-----------------------------------------------------------------------------
   Encode one wide character as UTF-8.

   utf8char  Output buffer, or NULL to ask only for the encoded length
             ('count' is then ignored).
   wchar     Value to encode.
   count     Capacity of 'utf8char' in bytes.

   Returns the encoded length in bytes (1..6); 0 when 'utf8char' is given
   but 'count' is too small (nothing is written in that case); -1 when
   'wchar' is negative or, where wchar_t is wider than 4 bytes, is
   0x80000000 or above.  No terminator is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag OR-ed into the first byte, indexed by sequence length */
	static const unsigned char lead_tag[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int need, pos, shift;

	/* Classify the value by how many bytes its encoding takes */
	if ( wchar < 0 ) {
		need = -1;
	} else if ( wchar < 0x80 ) {
		need = 1;
	} else if ( wchar < 0x800 ) {
		need = 2;
	} else if ( wchar < 0x10000 ) {
		need = 3;
	} else if ( wchar < 0x200000 ) {
		need = 4;
	} else if ( wchar < 0x4000000 ) {
		need = 5;
	} else
#if SIZEOF_WCHAR_T > 4
	/* UL is not strictly needed by ANSI C */
	if ( wchar < (wchar_t)0x80000000UL )
#endif /* SIZEOF_WCHAR_T > 4 */
	{
		need = 6;
	}
#if SIZEOF_WCHAR_T > 4
	else {
		need = -1;
	}
#endif /* SIZEOF_WCHAR_T > 4 */

	if ( need == -1 ) {
		return -1;				/* Invalid wide character */
	}
	if ( utf8char == NULL ) {
		return need;			/* Caller only wants the length */
	}
	if ( count < (size_t)need ) {
		return 0;				/* Won't fit; leave the buffer untouched */
	}

	/* First byte: length tag plus the most significant data bits */
	shift = 6 * ( need - 1 );
	utf8char[0] = lead_tag[need] | ( wchar >> shift );

	/* Trailing bytes: 10xxxxxx, six bits each, most significant first */
	for ( pos = 1; pos < need; pos++ ) {
		shift -= 6;
		utf8char[pos] = 0x80 | ( ( wchar >> shift ) & 0x3f );
	}

	return need;
}
00265 
00266 
00267 /*-----------------------------------------------------------------------------
00268    Convert a wide char string to a UTF-8 string.
00269    No more than 'count' bytes will be written to the output buffer.
00270    Return the # of bytes written to the output buffer, excl null terminator.
00271 */
00272 int
00273 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
00274 {
00275        int len = 0;
00276        int n;
00277        char *p = utf8str;
00278        wchar_t empty = 0;          /* To avoid use of L"" construct */
00279 
00280        if (wcstr == NULL)          /* Treat input ptr NULL as an empty string */
00281               wcstr = &empty;
00282 
00283        if (utf8str == NULL) /* Just compute size of output, excl null */
00284        {
00285               while (*wcstr)
00286               {
00287                      /* Get UTF-8 size of next wide char */
00288                      n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
00289                      if (n == -1)
00290                             return -1;
00291                      len += n;
00292               }
00293 
00294               return len;
00295        }
00296 
00297        
00298        /* Do the actual conversion. */
00299 
00300        n = 1;                             /* In case of empty wcstr */
00301        while (*wcstr)
00302        {
00303               n = ldap_x_wc_to_utf8( p, *wcstr++, count);
00304               
00305               if (n <= 0)          /* If encoding error (-1) or won't fit (0), quit */
00306                      break;
00307               
00308               p += n;
00309               count -= n;                 /* Space left in output buffer */
00310        }
00311 
00312        /* If not enough room for last character, pad remainder with null
00313           so that return value = original count, indicating buffer full. */
00314        if (n == 0)
00315        {
00316               while (count--)
00317                      *p++ = 0;
00318        }
00319 
00320        /* Add a null terminator if there's room. */
00321        else if (count)
00322               *p = 0;
00323 
00324        if (n == -1)                /* Conversion encountered invalid wide char. */
00325               return -1;
00326 
00327        /* Return the number of bytes written to output buffer, excl null. */ 
00328        return (p - utf8str);
00329 }
00330 
#ifdef ANDROID
/* ANDROID builds define wctomb()/mbtowc() here in terms of the restartable
 * wcrtomb()/mbrtowc() functions. */

/* Convert 'wc' to its multibyte form in 's'.
 * Returns the number of bytes stored, or -1 if 'wc' cannot be converted. */
int wctomb(char *s, wchar_t wc)
{
	size_t rc = wcrtomb(s, wc, NULL);

	return (rc == (size_t)-1) ? -1 : (int)rc;
}

/* Convert the multibyte character at 's' (at most 'n' bytes) to '*pwc'.
 * Returns the number of bytes consumed, 0 for a null character, or -1 on
 * failure.
 *
 * mbrtowc() distinguishes an invalid sequence ((size_t)-1) from an
 * incomplete one ((size_t)-2); mbtowc() has a single failure value, and
 * callers in this file test only for -1, so both are reported as -1.
 * A fresh conversion state is used for every call so that an incomplete
 * sequence cannot leave a partial character behind for the next call. */
int mbtowc(wchar_t *pwc, const char *s, size_t n)
{
	mbstate_t state;
	size_t rc;

	memset(&state, 0, sizeof state);
	rc = mbrtowc(pwc, s, n, &state);

	if (rc == (size_t)-1 || rc == (size_t)-2)
		return -1;

	return (int)rc;
}
#endif
00335 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer, or NULL to obtain only the converted size.
   utf8char  First byte of the UTF-8 character to convert.
   f_wctomb  Wide-char to multibyte converter; NULL selects the C library's
             wctomb().

   Returns whatever the converter returns (the size of the multibyte
   character in bytes for wctomb()), or -1 if 'utf8char' is not a valid
   UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output used when the caller passes no buffer.  wctomb() may
	   store up to MB_LEN_MAX bytes, which can exceed 6 on some platforms;
	   never go below the historical 6 in case a caller-supplied converter
	   relied on it. */
	char tmp[ MB_LEN_MAX > 6 ? MB_LEN_MAX : 6 ];

	if ( f_wctomb == NULL ) {		/* If no conversion function was given... */
		f_wctomb = wctomb;			/*    use the local ANSI C function */
	}

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char );
	if ( n == -1 ) {
		return -1;					/* Invalid UTF-8 character */
	}

	/* Then let the converter produce the multibyte form */
	return f_wctomb( mbchar != NULL ? mbchar : tmp, wchar );
}
00364 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string by way of a temporary wide
   char string.

   mbstr       Output buffer; passed straight through to the converter.
   utf8str     Input string; NULL or empty produces an empty result.
   count       Passed to the converter as the output limit in bytes.
   f_wcstombs  Wide-string to multibyte converter; NULL selects the C
               library's wcstombs().

   Returns the converter's result converted to int, 0 for NULL/empty input,
   or -1 if the temporary buffer cannot be allocated or the input is not
   valid UTF-8.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int result;

	/* Nothing to convert: hand back an empty string */
	if ( utf8str == NULL || *utf8str == '\0' ) {
		if ( mbstr != NULL ) {
			*mbstr = 0;
		}
		return 0;
	}

	if ( f_wcstombs == NULL ) {
		f_wcstombs = wcstombs;		/* Default to the ANSI C routine */
	}

	/* One wide char per input byte, plus the terminator, is the most the
	   decoded string can need. */
	capacity = strlen( utf8str ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;					/* Memory allocation failure */
	}

	/* UTF-8 -> wide chars, then wide chars -> multibyte */
	result = ldap_x_utf8s_to_wcs( wide, utf8str, capacity );
	if ( result != -1 ) {
		result = f_wcstombs( mbstr, wide, count );
	}

	LDAP_FREE( wide );

	return result;
}
00407 
00408 /*-----------------------------------------------------------------------------
00409    Convert a MultiByte character to a UTF-8 character.
00410    'mbsize' indicates the number of bytes of 'mbchar' to check.
00411    Returns the number of bytes written to the output character.
00412 */
00413 int
00414 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
00415               int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
00416 {
00417     wchar_t wchar;
00418     int n;
00419 
00420        if (f_mbtowc == NULL)              /* If no conversion function was given... */
00421               f_mbtowc = mbtowc;          /*    use the local ANSI C function */
00422  
00423     if (mbsize == 0)                      /* 0 is not valid. */
00424         return -1;
00425 
00426     if (mbchar == NULL || *mbchar == 0)
00427     {
00428         if (utf8char)
00429             *utf8char = 0;
00430         return 1;
00431     }
00432 
00433        /* First convert the MB char to a Wide Char */
00434        n = f_mbtowc( &wchar, mbchar, mbsize);
00435 
00436        if (n == -1)
00437               return -1;
00438 
00439        /* Convert the Wide Char to a UTF-8 character. */
00440        n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
00441 
00442        return n;
00443 }
00444 
00445 
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string by way of a temporary wide
   char string.

   utf8str     Output buffer; passed straight through to
               ldap_x_wcs_to_utf8s().
   mbstr       Input string; NULL is handled like an empty string.
   count       Passed to ldap_x_wcs_to_utf8s() as the output limit in bytes.
   f_mbstowcs  Multibyte to wide-string converter; NULL selects the C
               library's mbstowcs().

   Returns the result of ldap_x_wcs_to_utf8s(), or -1 if the temporary
   buffer cannot be allocated or the converter's result converts to -1.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	const char *src = ( mbstr != NULL ) ? mbstr : "";
	wchar_t *wide;
	size_t capacity;
	int result;

	if ( f_mbstowcs == NULL ) {
		f_mbstowcs = mbstowcs;		/* Default to the ANSI C routine */
	}

	/* One wide char per input byte, plus the terminator, is the most the
	   converted string can need. */
	capacity = strlen( src ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;
	}

	/* Multibyte -> wide chars, then wide chars -> UTF-8 */
	result = f_mbstowcs( wide, src, capacity );
	if ( result != -1 ) {
		result = ldap_x_wcs_to_utf8s( utf8str, wide, count );
	}

	LDAP_FREE( wide );

	return result;
}
00484 
00485 #endif /* SIZEOF_WCHAR_T >= 4 */