Back to index

openldap  2.4.31
ucstr.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 1998-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 
00016 #include "portable.h"
00017 
00018 #include <ac/bytes.h>
00019 #include <ac/ctype.h>
00020 #include <ac/string.h>
00021 #include <ac/stdlib.h>
00022 
00023 #include <lber_pvt.h>
00024 
00025 #include <ldap_utf8.h>
00026 #include <ldap_pvt_uc.h>
00027 
/* Redirect the standard allocator names, as used later in this file, to the
 * ber_mem*_x functions.  Each expansion mentions an identifier named `ctx',
 * so these macros can only be used where a variable of that name is in scope
 * (in this file: functions that take a `void *ctx' parameter). */
00028 #define       malloc(x)     ber_memalloc_x(x,ctx)
00029 #define       realloc(x,y)  ber_memrealloc_x(x,y,ctx)
00030 #define       free(x)              ber_memfree_x(x,ctx)
00031 
00032 int ucstrncmp(
00033        const ldap_unicode_t *u1,
00034        const ldap_unicode_t *u2,
00035        ber_len_t n )
00036 {
00037        for(; 0 < n; ++u1, ++u2, --n ) {
00038               if( *u1 != *u2 ) {
00039                      return *u1 < *u2 ? -1 : +1;
00040               }
00041               if ( *u1 == 0 ) {
00042                      return 0;
00043               }
00044        }
00045        return 0;
00046 }
00047 
00048 int ucstrncasecmp(
00049        const ldap_unicode_t *u1,
00050        const ldap_unicode_t *u2,
00051        ber_len_t n )
00052 {
00053        for(; 0 < n; ++u1, ++u2, --n ) {
00054               ldap_unicode_t uu1 = uctolower( *u1 );
00055               ldap_unicode_t uu2 = uctolower( *u2 );
00056 
00057               if( uu1 != uu2 ) {
00058                      return uu1 < uu2 ? -1 : +1;
00059               }
00060               if ( uu1 == 0 ) {
00061                      return 0;
00062               }
00063        }
00064        return 0;
00065 }
00066 
00067 ldap_unicode_t * ucstrnchr(
00068        const ldap_unicode_t *u,
00069        ber_len_t n,
00070        ldap_unicode_t c )
00071 {
00072        for(; 0 < n; ++u, --n ) {
00073               if( *u == c ) {
00074                      return (ldap_unicode_t *) u;
00075               }
00076        }
00077 
00078        return NULL;
00079 }
00080 
00081 ldap_unicode_t * ucstrncasechr(
00082        const ldap_unicode_t *u,
00083        ber_len_t n,
00084        ldap_unicode_t c )
00085 {
00086        c = uctolower( c );
00087        for(; 0 < n; ++u, --n ) {
00088               if( uctolower( *u ) == c ) {
00089                      return (ldap_unicode_t *) u;
00090               }
00091        }
00092 
00093        return NULL;
00094 }
00095 
00096 void ucstr2upper(
00097        ldap_unicode_t *u,
00098        ber_len_t n )
00099 {
00100        for(; 0 < n; ++u, --n ) {
00101               *u = uctoupper( *u );
00102        }
00103 }
00104 
00105 struct berval * UTF8bvnormalize(
00106        struct berval *bv,
00107        struct berval *newbv,
00108        unsigned flags,
00109        void *ctx )
00110 {
00111        int i, j, len, clen, outpos, ucsoutlen, outsize, last;
00112        char *out, *outtmp, *s;
00113        ac_uint4 *ucs, *p, *ucsout;
00114 
00115        static unsigned char mask[] = {
00116               0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
00117 
00118        unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
00119        unsigned approx = flags & LDAP_UTF8_APPROX;
00120 
00121        if ( bv == NULL ) {
00122               return NULL;
00123        }
00124 
00125        s = bv->bv_val;
00126        len = bv->bv_len;
00127 
00128        if ( len == 0 ) {
00129               return ber_dupbv_x( newbv, bv, ctx );
00130        }
00131 
00132        if ( !newbv ) {
00133               newbv = ber_memalloc_x( sizeof(struct berval), ctx );
00134               if ( !newbv ) return NULL;
00135        }
00136 
00137        /* Should first check to see if string is already in proper
00138         * normalized form. This is almost as time consuming as
00139         * the normalization though.
00140         */
00141 
00142        /* finish off everything up to character before first non-ascii */
00143        if ( LDAP_UTF8_ISASCII( s ) ) {
00144               if ( casefold ) {
00145                      outsize = len + 7;
00146                      out = (char *) ber_memalloc_x( outsize, ctx );
00147                      if ( out == NULL ) {
00148                             return NULL;
00149                      }
00150                      outpos = 0;
00151 
00152                      for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
00153                             out[outpos++] = TOLOWER( s[i-1] );
00154                      }
00155                      if ( i == len ) {
00156                             out[outpos++] = TOLOWER( s[len-1] );
00157                             out[outpos] = '\0';
00158                             newbv->bv_val = out;
00159                             newbv->bv_len = outpos;
00160                             return newbv;
00161                      }
00162               } else {
00163                      for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
00164                             /* empty */
00165                      }
00166 
00167                      if ( i == len ) {
00168                             return ber_str2bv_x( s, len, 1, newbv, ctx );
00169                      }
00170                             
00171                      outsize = len + 7;
00172                      out = (char *) ber_memalloc_x( outsize, ctx );
00173                      if ( out == NULL ) {
00174                             return NULL;
00175                      }
00176                      outpos = i - 1;
00177                      memcpy(out, s, outpos);
00178               }
00179        } else {
00180               outsize = len + 7;
00181               out = (char *) ber_memalloc_x( outsize, ctx );
00182               if ( out == NULL ) {
00183                      return NULL;
00184               }
00185               outpos = 0;
00186               i = 0;
00187        }
00188 
00189        p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
00190        if ( ucs == NULL ) {
00191               ber_memfree_x(out, ctx);
00192               return NULL;
00193        }
00194 
00195        /* convert character before first non-ascii to ucs-4 */
00196        if ( i > 0 ) {
00197               *p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
00198               p++;
00199        }
00200 
00201        /* s[i] is now first non-ascii character */
00202        for (;;) {
00203               /* s[i] is non-ascii */
00204               /* convert everything up to next ascii to ucs-4 */
00205               while ( i < len ) {
00206                      clen = LDAP_UTF8_CHARLEN2( s + i, clen );
00207                      if ( clen == 0 ) {
00208                             ber_memfree_x( ucs, ctx );
00209                             ber_memfree_x( out, ctx );
00210                             return NULL;
00211                      }
00212                      if ( clen == 1 ) {
00213                             /* ascii */
00214                             break;
00215                      }
00216                      *p = s[i] & mask[clen];
00217                      i++;
00218                      for( j = 1; j < clen; j++ ) {
00219                             if ( (s[i] & 0xc0) != 0x80 ) {
00220                                    ber_memfree_x( ucs, ctx );
00221                                    ber_memfree_x( out, ctx );
00222                                    return NULL;
00223                             }
00224                             *p <<= 6;
00225                             *p |= s[i] & 0x3f;
00226                             i++;
00227                      }
00228                      if ( casefold ) {
00229                             *p = uctolower( *p );
00230                      }
00231                      p++;
00232               }
00233               /* normalize ucs of length p - ucs */
00234               uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
00235               if ( approx ) {
00236                      for ( j = 0; j < ucsoutlen; j++ ) {
00237                             if ( ucsout[j] < 0x80 ) {
00238                                    out[outpos++] = ucsout[j];
00239                             }
00240                      }
00241               } else {
00242                      ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
00243                      /* convert ucs to utf-8 and store in out */
00244                      for ( j = 0; j < ucsoutlen; j++ ) {
00245                             /* allocate more space if not enough room for
00246                                6 bytes and terminator */
00247                             if ( outsize - outpos < 7 ) {
00248                                    outsize = ucsoutlen - j + outpos + 6;
00249                                    outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
00250                                    if ( outtmp == NULL ) {
00251                                           ber_memfree_x( ucsout, ctx );
00252                                           ber_memfree_x( ucs, ctx );
00253                                           ber_memfree_x( out, ctx );
00254                                           return NULL;
00255                                    }
00256                                    out = outtmp;
00257                             }
00258                             outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
00259                      }
00260               }
00261 
00262               ber_memfree_x( ucsout, ctx );
00263               ucsout = NULL;
00264               
00265               if ( i == len ) {
00266                      break;
00267               }
00268 
00269               last = i;
00270 
00271               /* Allocate more space in out if necessary */
00272               if (len - i >= outsize - outpos) {
00273                      outsize += 1 + ((len - i) - (outsize - outpos));
00274                      outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
00275                      if (outtmp == NULL) {
00276                             ber_memfree_x( ucs, ctx );
00277                             ber_memfree_x( out, ctx );
00278                             return NULL;
00279                      }
00280                      out = outtmp;
00281               }
00282 
00283               /* s[i] is ascii */
00284               /* finish off everything up to char before next non-ascii */
00285               for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
00286                      out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
00287               }
00288               if ( i == len ) {
00289                      out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
00290                      break;
00291               }
00292 
00293               /* convert character before next non-ascii to ucs-4 */
00294               *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
00295               p = ucs + 1;
00296        }
00297 
00298        ber_memfree_x( ucs, ctx );
00299        out[outpos] = '\0';
00300        newbv->bv_val = out;
00301        newbv->bv_len = outpos;
00302        return newbv;
00303 }
00304 
00305 /* compare UTF8-strings, optionally ignore casing */
00306 /* slow, should be optimized */
00307 int UTF8bvnormcmp(
00308        struct berval *bv1,
00309        struct berval *bv2,
00310        unsigned flags,
00311        void *ctx )
00312 {
00313        int i, l1, l2, len, ulen, res = 0;
00314        char *s1, *s2, *done;
00315        ac_uint4 *ucs, *ucsout1, *ucsout2;
00316 
00317        unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
00318        unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
00319        unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
00320 
00321        if (bv1 == NULL) {
00322               return bv2 == NULL ? 0 : -1;
00323 
00324        } else if (bv2 == NULL) {
00325               return 1;
00326        }
00327 
00328        l1 = bv1->bv_len;
00329        l2 = bv2->bv_len;
00330 
00331        len = (l1 < l2) ? l1 : l2;
00332        if (len == 0) {
00333               return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
00334        }
00335 
00336        s1 = bv1->bv_val;
00337        s2 = bv2->bv_val;
00338        done = s1 + len;
00339 
00340        while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
00341               if (casefold) {
00342                      char c1 = TOLOWER(*s1);
00343                      char c2 = TOLOWER(*s2);
00344                      res = c1 - c2;
00345               } else {
00346                      res = *s1 - *s2;
00347               }                    
00348               s1++;
00349               s2++;
00350               if (res) {
00351                      /* done unless next character in s1 or s2 is non-ascii */
00352                      if (s1 < done) {
00353                             if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
00354                                    break;
00355                             }
00356                      } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
00357                             ((len < l2) && !LDAP_UTF8_ISASCII(s2)))
00358                      {
00359                             break;
00360                      }
00361                      return res;
00362               }
00363        }
00364 
00365        /* We have encountered non-ascii or strings equal up to len */
00366 
00367        /* set i to number of iterations */
00368        i = s1 - done + len;
00369        /* passed through loop at least once? */
00370        if (i > 0) {
00371               if (!res && (s1 == done) &&
00372                   ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
00373                   ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
00374                      /* all ascii and equal up to len */
00375                      return l1 - l2;
00376               }
00377 
00378               /* rewind one char, and do normalized compare from there */
00379               s1--;
00380               s2--;
00381               l1 -= i - 1;
00382               l2 -= i - 1;
00383        }
00384                      
00385        /* Should first check to see if strings are already in
00386         * proper normalized form.
00387         */
00388        ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
00389        if ( ucs == NULL ) {
00390               return l1 > l2 ? 1 : -1; /* what to do??? */
00391        }
00392        
00393        /*
00394         * XXYYZ: we convert to ucs4 even though -llunicode
00395         * expects ucs2 in an ac_uint4
00396         */
00397        
00398        /* convert and normalize 1st string */
00399        for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
00400               ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
00401               if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
00402                      free( ucs );
00403                      return -1; /* what to do??? */
00404               }
00405               len = LDAP_UTF8_CHARLEN( s1 + i );
00406        }
00407 
00408        if ( norm1 ) {
00409               ucsout1 = ucs;
00410               l1 = ulen;
00411               ucs = malloc( l2 * sizeof(*ucs) );
00412               if ( ucs == NULL ) {
00413                      free( ucsout1 );
00414                      return l1 > l2 ? 1 : -1; /* what to do??? */
00415               }
00416        } else {
00417               uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
00418               l1 = uccanoncomp( ucsout1, l1 );
00419        }
00420 
00421        /* convert and normalize 2nd string */
00422        for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
00423               ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
00424               if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
00425                      free( ucsout1 );
00426                      free( ucs );
00427                      return 1; /* what to do??? */
00428               }
00429               len = LDAP_UTF8_CHARLEN( s2 + i );
00430        }
00431 
00432        if ( norm2 ) {
00433               ucsout2 = ucs;
00434               l2 = ulen;
00435        } else {
00436               uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
00437               l2 = uccanoncomp( ucsout2, l2 );
00438               free( ucs );
00439        }
00440        
00441        res = casefold
00442               ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
00443               : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
00444        free( ucsout1 );
00445        free( ucsout2 );
00446 
00447        if ( res != 0 ) {
00448               return res;
00449        }
00450        if ( l1 == l2 ) {
00451               return 0;
00452        }
00453        return l1 > l2 ? 1 : -1;
00454 }