Back to index

lightning-sunbird  0.9+nobinonly
utf8.c
Go to the documentation of this file.
00001 /* ***** BEGIN LICENSE BLOCK *****
00002  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00003  *
00004  * The contents of this file are subject to the Mozilla Public License Version
00005  * 1.1 (the "License"); you may not use this file except in compliance with
00006  * the License. You may obtain a copy of the License at
00007  * http://www.mozilla.org/MPL/
00008  *
00009  * Software distributed under the License is distributed on an "AS IS" basis,
00010  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00011  * for the specific language governing rights and limitations under the
00012  * License.
00013  *
00014  * The Original Code is Mozilla Communicator client code, released
00015  * March 31, 1998.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998-1999
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either the GNU General Public License Version 2 or later (the "GPL"), or
00026  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 /* utf8.c - misc. utf8 "string" functions. */
00039 #include "ldap-int.h"
00040 
/*
 * Sequence-length lookup, indexed by the top six bits of a byte,
 * i.e. UTF8len[(byte >> 2) & 0x3F]:
 *   entries  0..31 (bytes 0x00-0x7F): 1  single-byte character
 *   entries 32..47 (bytes 0x80-0xBF): 0  continuation byte, not a lead byte
 *   entries 48..55 (bytes 0xC0-0xDF): 2
 *   entries 56..59 (bytes 0xE0-0xEF): 3
 *   entries 60..61 (bytes 0xF0-0xF7): 4
 *   entry   62     (bytes 0xF8-0xFB): 5
 *   entry   63     (bytes 0xFC-0xFF): 6
 * The table is only ever read, so it is declared const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};
00046 
00047 int
00048 LDAP_CALL
00049 ldap_utf8len (const char* s)
00050      /* Return the number of char's in the character at *s. */
00051 {
00052     return ldap_utf8next((char*)s) - s;
00053 }
00054 
00055 char*
00056 LDAP_CALL
00057 ldap_utf8next (char* s)
00058      /* Return a pointer to the character immediately following *s.
00059        Handle any valid UTF-8 character, including '\0' and ASCII.
00060        Try to handle a misaligned pointer or a malformed character.
00061      */
00062 {
00063     register unsigned char* next = (unsigned char*)s;
00064     switch (UTF8len [(*next >> 2) & 0x3F]) {
00065       case 0: /* erroneous: s points to the middle of a character. */
00066       case 6: if ((*++next & 0xC0) != 0x80) break;
00067       case 5: if ((*++next & 0xC0) != 0x80) break;
00068       case 4: if ((*++next & 0xC0) != 0x80) break;
00069       case 3: if ((*++next & 0xC0) != 0x80) break;
00070       case 2: if ((*++next & 0xC0) != 0x80) break;
00071       case 1: ++next;
00072     }
00073     return (char*) next;
00074 }
00075 
char*
LDAP_CALL
ldap_utf8prev (char* s)
     /* Return a pointer to the character immediately preceding *s.
        Handle any valid UTF-8 character, including '\0' and ASCII.
        Try to handle a misaligned pointer or a malformed character:
        the scan moves backwards over continuation bytes (10xxxxxx) and
        stops at the first byte that is not one, or after 6 bytes,
        whichever comes first.

        The caller must ensure at least one readable byte precedes s.

        The 6-byte cap is tracked with a counter rather than a precomputed
        "s - 6" pointer: forming a pointer before the start of the caller's
        buffer is undefined behaviour even if it is never dereferenced.
      */
{
    unsigned char* prev = (unsigned char*)s;
    int remaining = 6;  /* longest sequence this file recognizes */

    do {
        --prev;
        --remaining;
    } while (((*prev & 0xC0) == 0x80) && (remaining > 0));

    return (char*) prev;
}
00091 
00092 int
00093 LDAP_CALL
00094 ldap_utf8copy (char* dst, const char* src)
00095      /* Copy a character from src to dst; return the number of char's copied.
00096        Handle any valid UTF-8 character, including '\0' and ASCII.
00097        Try to handle a misaligned pointer or a malformed character.
00098      */
00099 {
00100     register const unsigned char* s = (const unsigned char*)src;
00101     switch (UTF8len [(*s >> 2) & 0x3F]) {
00102       case 0: /* erroneous: s points to the middle of a character. */
00103       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
00104       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
00105       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
00106       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
00107       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
00108       case 1: *dst   = *s++;
00109     }
00110     return s - (const unsigned char*)src;
00111 }
00112 
size_t
LDAP_CALL
ldap_utf8characters (const char* src)
     /* Count the UTF-8 characters in the 0-terminated array src; the
        terminator itself is not counted.
        LDAP_UTF8INC comes from the project headers; it is presumed to
        advance the pointer by one UTF-8 character -- confirm there.
      */
{
    char* cursor = (char*)src;
    size_t count = 0;

    while (*cursor) {
        ++count;
        LDAP_UTF8INC(cursor);
    }
    return count;
}
00123 
00124 unsigned long LDAP_CALL
00125 ldap_utf8getcc( const char** src )
00126 {
00127     register unsigned long c;
00128     register const unsigned char* s = (const unsigned char*)*src;
00129     switch (UTF8len [(*s >> 2) & 0x3F]) {
00130       case 0: /* erroneous: s points to the middle of a character. */
00131              c = (*s++) & 0x3F; goto more5;
00132       case 1: c = (*s++); break;
00133       case 2: c = (*s++) & 0x1F; goto more1;
00134       case 3: c = (*s++) & 0x0F; goto more2;
00135       case 4: c = (*s++) & 0x07; goto more3;
00136       case 5: c = (*s++) & 0x03; goto more4;
00137       case 6: c = (*s++) & 0x01; goto more5;
00138       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
00139       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
00140       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
00141       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
00142       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
00143        break;
00144     }
00145     *src = (const char*)s;
00146     return c;
00147 }
00148 
00149 char*
00150 LDAP_CALL
00151 ldap_utf8strtok_r( char* sp, const char* brk, char** next)
00152 {
00153     const char *bp;
00154     unsigned long sc, bc;
00155     char *tok;
00156 
00157     if (sp == NULL && (sp = *next) == NULL)
00158       return NULL;
00159 
00160     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
00161   cont:
00162     sc = LDAP_UTF8GETC(sp);
00163     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
00164        if (sc == bc)
00165          goto cont;
00166     }
00167 
00168     if (sc == 0) { /* no non-delimiter characters */
00169        *next = NULL;
00170        return NULL;
00171     }
00172     tok = LDAP_UTF8PREV(sp);
00173 
00174     /* Scan token; roughly, sp += strcspn(sp, brk)
00175      * Note that brk must be 0-terminated; we stop if we see that, too.
00176      */
00177     while (1) {
00178        sc = LDAP_UTF8GETC(sp);
00179        bp = brk;
00180        do {
00181            if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
00182               if (sc == 0) {
00183                   *next = NULL;
00184               } else {
00185                   *next = sp;
00186                   *(LDAP_UTF8PREV(sp)) = 0;
00187               }
00188               return tok;
00189            }
00190        } while (bc != 0);
00191     }
00192     /* NOTREACHED */
00193 }
00194 
int
LDAP_CALL
ldap_utf8isalnum( char* s )
     /* Return 1 if the byte at *s is an ASCII letter or decimal digit,
        otherwise 0.  Any byte with the high bit set (part of a multi-byte
        character) yields 0.
      */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= 'A' && ch <= 'Z')
        || (ch >= 'a' && ch <= 'z')
        || (ch >= '0' && ch <= '9');
}
00206 
int
LDAP_CALL
ldap_utf8isalpha( char* s )
     /* Return 1 if the byte at *s is an ASCII letter (A-Z or a-z),
        otherwise 0.  Any byte with the high bit set (part of a multi-byte
        character) yields 0.
      */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
}
00217 
int
LDAP_CALL
ldap_utf8isdigit( char* s )
     /* Return 1 if the byte at *s is an ASCII decimal digit (0-9),
        otherwise 0.  Any byte with the high bit set (part of a multi-byte
        character) yields 0.
      */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= '0' && ch <= '9');
}
00227 
int
LDAP_CALL
ldap_utf8isxdigit( char* s )
     /* Return 1 if the byte at *s is an ASCII hexadecimal digit
        (0-9, A-F or a-f), otherwise 0.  Any byte with the high bit set
        (part of a multi-byte character) yields 0.
      */
{
    const unsigned char ch = *(unsigned char*)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= '0' && ch <= '9')
        || (ch >= 'A' && ch <= 'F')
        || (ch >= 'a' && ch <= 'f');
}
00239 
int
LDAP_CALL
ldap_utf8isspace( char* s )
     /* Return 1 if the UTF-8 character starting at *s is treated as
        white space, otherwise 0.  The character's byte length is taken
        from ldap_utf8len(), so trailing bytes are only examined when that
        function has already stepped over them.

        Recognized characters:
          1 byte : 0x09-0x0D and 0x20
          2 bytes: C2 A0 (U+00A0 NO-BREAK SPACE)
                   C2 80 (U+0080) -- accepted for backward compatibility
          3 bytes: E2 80 80 .. E2 80 8A (U+2000 .. U+200A)
                   E3 80 80 (U+3000)
                   EF BB BF (U+FEFF)

        NOTE(review): the original two-byte test accepted only C2 80, which
        encodes the control character U+0080.  Every other entry is a space
        character, so this looks like a mis-encoding of U+00A0 (C2 A0).
        C2 A0 is now accepted; C2 80 is retained so existing callers see no
        change.  Confirm whether C2 80 can be dropped.
      */
{
    const unsigned char *c = (unsigned char*)s;
    int len = ldap_utf8len(s);

    switch (len) {
      case 1:
        switch (c[0]) {
          case 0x09:
          case 0x0A:
          case 0x0B:
          case 0x0C:
          case 0x0D:
          case 0x20:
            return 1;
          default:
            return 0;
        }

      case 2:
        if (c[0] == 0xC2) {
            return (c[1] == 0xA0) || (c[1] == 0x80);
        }
        return 0;

      case 3:
        if (c[0] == 0xE2) {
            return (c[1] == 0x80) && (c[2] >= 0x80) && (c[2] <= 0x8A);
        }
        if (c[0] == 0xE3) {
            return (c[1] == 0x80) && (c[2] == 0x80);
        }
        if (c[0] == 0xEF) {
            return (c[1] == 0xBB) && (c[2] == 0xBF);
        }
        return 0;

      default:
        /* len 0, or a sequence longer than 3 bytes: never white space */
        return 0;
    }
}