Back to index

lightning-sunbird  0.9+nobinonly
cairo-unicode.c
Go to the documentation of this file.
00001 /* cairo_unicode.c: Unicode conversion routines
00002  * 
00003  * The code in this file is derived from GLib's gutf8.c and
00004  *   ultimately from libunicode. It is relicensed under the
00005  *   dual LGPL/MPL with permission of the original authors.
00006  *
00007  * Copyright © 1999 Tom Tromey
00008  * Copyright © 2005 Red Hat, Inc
00009  *
00010  * This library is free software; you can redistribute it and/or
00011  * modify it either under the terms of the GNU Lesser General Public
00012  * License version 2.1 as published by the Free Software Foundation
00013  * (the "LGPL") or, at your option, under the terms of the Mozilla
00014  * Public License Version 1.1 (the "MPL"). If you do not alter this
00015  * notice, a recipient may use your version of this file under either
00016  * the MPL or the LGPL.
00017  *
00018  * You should have received a copy of the LGPL along with this library
00019  * in the file COPYING-LGPL-2.1; if not, write to the Free Software
00020  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021  * You should have received a copy of the MPL along with this library
00022  * in the file COPYING-MPL-1.1
00023  *
00024  * The contents of this file are subject to the Mozilla Public License
00025  * Version 1.1 (the "License"); you may not use this file except in
00026  * compliance with the License. You may obtain a copy of the License at
00027  * http://www.mozilla.org/MPL/
00028  *
00029  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
00030  * OF ANY KIND, either express or implied. See the LGPL or the MPL for
00031  * the specific language governing rights and limitations.
00032  *
00033  * The Original Code is cairo_unicode.c as distributed with the
00034  *   cairo graphics library.
00035  *
00036  * The Initial Developer of the Original Code is Tom Tromey.
00037  *  and Red Hat, Inc.
00038  *
00039  * Contributor(s):
00040  *     Owen Taylor <otaylor@redhat.com>
00041  */
00042 
00043 #include <limits.h>
00044 
00045 #include <cairoint.h>
00046 
/* Classify @Char, the first byte of a UTF-8 sequence.
 *
 * On completion @Len holds the total length of the sequence in bytes
 * (1 to 6) and @Mask holds the bits of the lead byte that carry payload.
 * If @Char cannot start a sequence (a continuation byte, 0xfe or 0xff),
 * @Len is set to -1 and @Mask is left untouched.
 *
 * @Char is evaluated several times and must be free of side effects;
 * @Mask and @Len must be assignable lvalues.  The body is wrapped in
 * do { } while (0) so that an invocation followed by ';' forms exactly
 * one statement, which keeps it safe inside an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                 \
    do {                                                              \
        if ((Char) < 128) {                                           \
            (Len) = 1;                                                \
            (Mask) = 0x7f;                                            \
        } else if (((Char) & 0xe0) == 0xc0) {                         \
            (Len) = 2;                                                \
            (Mask) = 0x1f;                                            \
        } else if (((Char) & 0xf0) == 0xe0) {                         \
            (Len) = 3;                                                \
            (Mask) = 0x0f;                                            \
        } else if (((Char) & 0xf8) == 0xf0) {                         \
            (Len) = 4;                                                \
            (Mask) = 0x07;                                            \
        } else if (((Char) & 0xfc) == 0xf8) {                         \
            (Len) = 5;                                                \
            (Mask) = 0x03;                                            \
        } else if (((Char) & 0xfe) == 0xfc) {                         \
            (Len) = 6;                                                \
            (Mask) = 0x01;                                            \
        } else {                                                      \
            (Len) = -1;                                               \
        }                                                             \
    } while (0)
00080 
/* Number of bytes the code point @Char occupies when encoded as UTF-8,
 * counting the 5- and 6-byte forms for values of 0x200000 and above.
 * @Char is evaluated more than once and must have no side effects.
 */
#define UTF8_LENGTH(Char)               \
    ((Char) >= 0x4000000 ? 6 :          \
     (Char) >= 0x200000  ? 5 :          \
     (Char) >= 0x10000   ? 4 :          \
     (Char) >= 0x800     ? 3 :          \
     (Char) >= 0x80      ? 2 : 1)
00087    
00088 
/* Decode the UTF-8 sequence of @Len bytes starting at @Chars.
 *
 * @Mask selects the payload bits of the lead byte; every following byte
 * contributes its low six bits.  The decoded value is stored in @Result.
 * If a byte after the first is not of the form 10xxxxxx, @Result is set
 * to -1 and decoding stops with @Count holding that byte's index.
 *
 * @Result and @Count must be assignable lvalues.  The body is wrapped in
 * do { } while (0) so that the assignment and the loop always travel
 * together as one statement, including inside an unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                     \
    do {                                                              \
        (Result) = (Chars)[0] & (Mask);                               \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {               \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {                  \
                (Result) = -1;                                        \
                break;                                                \
            }                                                         \
            (Result) <<= 6;                                           \
            (Result) |= ((Chars)[(Count)] & 0x3f);                    \
        }                                                             \
    } while (0)
00101 
/* Evaluates to 1 when @Char is accepted by the converters in this file
 * and to 0 otherwise.  A value is rejected when it
 *   - is 0x110000 or larger,
 *   - lies in the surrogate range 0xD800..0xDFFF,
 *   - lies in 0xFDD0..0xFDEF, or
 *   - has 0xFFFE or 0xFFFF in its low 16 bits.
 * @Char is evaluated more than once and must have no side effects.
 */
#define UNICODE_VALID(Char)                             \
    (!((Char) >= 0x110000 ||                            \
       ((Char) & 0xFFFFF800) == 0xD800 ||               \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||        \
       ((Char) & 0xFFFE) == 0xFFFE))
00107    
00108      
/* Sequence length in bytes, indexed by the first byte of a UTF-8
 * sequence.  Bytes 0x00..0xbf (ASCII and continuation bytes) and the
 * bytes 0xfe/0xff map to 1, so stepping through a buffer with this table
 * always makes progress.
 */
static const char utf8_skip_data[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the sequence following the one that starts at @p.  The
 * first byte is read through a const-qualified pointer so the macro does
 * not strip the qualifier from a const input.  @p is evaluated twice.
 */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *)(p)])
00121 
/* Decode the UTF-8 sequence starting at @p into a code point.
 *
 * No length limit is applied and no check for overlong encodings is
 * made, so this is only meant for input that has already been validated.
 * (uint32_t)-1 is returned when the first byte cannot start a sequence
 * or when one of the following bytes is not a continuation byte.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    int lead_mask = 0;
    int seq_len;
    int pos;
    uint32_t ucs4;

    UTF8_COMPUTE (lead, lead_mask, seq_len);
    if (seq_len == -1)
        return (uint32_t) -1;

    /* Payload bits of the lead byte, then six bits per trailing byte. */
    ucs4 = p[0] & lead_mask;
    for (pos = 1; pos < seq_len; ++pos) {
        if ((p[pos] & 0xc0) != 0x80)
            return (uint32_t) -1;

        ucs4 <<= 6;
        ucs4 |= (p[pos] & 0x3f);
    }

    return ucs4;
}
00140 
/* Length-limited, validating counterpart of _utf8_get_char.
 *
 * At most @max_len bytes starting at @p are examined; a negative
 * @max_len disables the limit.
 *
 * Returns the decoded code point, or one of two sentinels:
 *   (uint32_t)-1  the bytes do not form a well-formed, shortest-form
 *                 sequence;
 *   (uint32_t)-2  the sequence is incomplete: either it needs more than
 *                 @max_len bytes and the bytes available so far are
 *                 continuation bytes, or a nul byte appears where a
 *                 continuation byte was expected.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    const uint32_t malformed = (uint32_t) -1;
    const uint32_t incomplete = (uint32_t) -2;
    uint32_t lead = p[0];
    uint32_t value;
    int seq_len;
    int k;

    /* Single-byte characters and bytes that cannot lead a sequence. */
    if (lead < 0x80)
        return lead;
    if (lead < 0xc0 || lead >= 0xfe)
        return malformed;

    if (lead < 0xe0) {
        seq_len = 2;
        value = lead & 0x1f;
    } else if (lead < 0xf0) {
        seq_len = 3;
        value = lead & 0x0f;
    } else if (lead < 0xf8) {
        seq_len = 4;
        value = lead & 0x07;
    } else if (lead < 0xfc) {
        seq_len = 5;
        value = lead & 0x03;
    } else {
        seq_len = 6;
        value = lead & 0x01;
    }

    /* The sequence runs past the permitted window: report it as
     * incomplete only if what is visible is well-formed so far. */
    if (max_len >= 0 && seq_len > max_len) {
        for (k = 1; k < max_len; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return malformed;
        }
        return incomplete;
    }

    for (k = 1; k < seq_len; k++) {
        const uint32_t byte = p[k];

        if ((byte & 0xc0) != 0x80)
            return byte == 0 ? incomplete : malformed;

        value = (value << 6) | (byte & 0x3f);
    }

    /* Reject encodings that use more bytes than the value requires. */
    return UTF8_LENGTH (value) == seq_len ? value : malformed;
}
00201 
00222 cairo_status_t
00223 _cairo_utf8_to_ucs4 (const unsigned char *str,
00224                    int               len,
00225                    uint32_t        **result,
00226                    int              *items_written)
00227 {
00228     uint32_t *str32 = NULL;
00229     int n_chars, i;
00230     const unsigned char *in;
00231   
00232     in = str;
00233     n_chars = 0;
00234     while ((len < 0 || str + len - in > 0) && *in)
00235     {
00236        uint32_t wc = _utf8_get_char_extended (in, str + len - in);
00237        if (wc & 0x80000000 || !UNICODE_VALID (wc))
00238            return CAIRO_STATUS_INVALID_STRING;
00239       
00240        n_chars++;
00241        if (n_chars == INT_MAX)
00242            return CAIRO_STATUS_INVALID_STRING;
00243 
00244        in = UTF8_NEXT_CHAR (in);
00245     }
00246 
00247     str32 = malloc (sizeof (uint32_t) * (n_chars + 1));
00248     if (!str32)
00249        return CAIRO_STATUS_NO_MEMORY;
00250   
00251     in = str;
00252     for (i=0; i < n_chars; i++) {
00253        str32[i] = _utf8_get_char (in);
00254        in = UTF8_NEXT_CHAR (in);
00255     }
00256     str32[i] = 0;
00257 
00258     *result = str32;
00259     if (items_written)
00260        *items_written = n_chars;
00261 
00262     return CAIRO_STATUS_SUCCESS;
00263 }
00264 
00286 cairo_status_t
00287 _cairo_utf8_to_utf16 (const unsigned char *str,
00288                     int               len,
00289                     uint16_t               **result,
00290                     int              *items_written)
00291 {
00292     uint16_t *str16 = NULL;
00293     int n16, i;
00294     const unsigned char *in;
00295 
00296     in = str;
00297     n16 = 0;
00298     while ((len < 0 || str + len - in > 0) && *in) {
00299        uint32_t wc = _utf8_get_char_extended (in, str + len - in);
00300        if (wc & 0x80000000 || !UNICODE_VALID (wc))
00301            return CAIRO_STATUS_INVALID_STRING;
00302        
00303        if (wc < 0x10000)
00304            n16 += 1;
00305        else
00306            n16 += 2;
00307       
00308        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
00309            return CAIRO_STATUS_INVALID_STRING;
00310        
00311        in = UTF8_NEXT_CHAR (in);
00312     }
00313 
00314   
00315     str16 = malloc (sizeof (uint16_t) * (n16 + 1));
00316     if (!str16)
00317        return CAIRO_STATUS_NO_MEMORY;
00318   
00319     in = str;
00320     for (i = 0; i < n16;) {
00321        uint32_t wc = _utf8_get_char (in);
00322 
00323        if (wc < 0x10000) {
00324            str16[i++] = wc;
00325        } else {
00326            str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
00327            str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
00328        }
00329       
00330        in = UTF8_NEXT_CHAR (in);
00331     }
00332 
00333     str16[i] = 0;
00334 
00335     *result = str16;
00336     if (items_written)
00337        *items_written = n16;
00338 
00339     return CAIRO_STATUS_SUCCESS;
00340 }