Back to index

openldap  2.4.31
utbm.c
Go to the documentation of this file.
00001 /* $OpenLDAP$ */
00002 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
00003  *
00004  * Copyright 1998-2012 The OpenLDAP Foundation.
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted only as authorized by the OpenLDAP
00009  * Public License.
00010  *
00011  * A copy of this license is available in file LICENSE in the
00012  * top-level directory of the distribution or, alternatively, at
00013  * <http://www.OpenLDAP.org/license.html>.
00014  */
00015 /* Copyright 1997, 1998, 1999 Computing Research Labs,
00016  * New Mexico State University
00017  *
00018  * Permission is hereby granted, free of charge, to any person obtaining a
00019  * copy of this software and associated documentation files (the "Software"),
00020  * to deal in the Software without restriction, including without limitation
00021  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00022  * and/or sell copies of the Software, and to permit persons to whom the
00023  * Software is furnished to do so, subject to the following conditions:
00024  *
00025  * The above copyright notice and this permission notice shall be included in
00026  * all copies or substantial portions of the Software.
00027  *
00028  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00029  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00030  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
00031  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
00032  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
00033  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
00034  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00035  */
00036 /* $Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $ */
00037 
00038 /*
00039  * Assumptions:
00040  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
00041  * 2. Case conversions are all one-to-one.
00042  * 3. Text and pattern have already been normalized in some fashion.
00043  */
00044 
00045 #include <stdlib.h>
00046 #include <unistd.h>
00047 #include <string.h>
00048 #include "utbm.h"
00049 
00050 /*
00051  * Single pattern character.
00052  */
00053 typedef struct {
00054     ucs4_t lc;
00055     ucs4_t uc;
00056     ucs4_t tc;
00057 } _utbm_char_t;
00058 
00059 typedef struct {
00060     _utbm_char_t *ch;
00061     unsigned long skip;
00062 } _utbm_skip_t;
00063 
00064 typedef struct _utbm_pattern_t {
00065     unsigned long flags;
00066 
00067     _utbm_char_t *pat;
00068     unsigned long pat_used;
00069     unsigned long pat_size;
00070     unsigned long patlen;
00071 
00072     _utbm_skip_t *skip;
00073     unsigned long skip_used;
00074     unsigned long skip_size;
00075 
00076     unsigned long md4;
00077 } _utbm_pattern_t;
00078 
00079 /*************************************************************************
00080  *
00081  * Support functions.
00082  *
00083  *************************************************************************/
00084 
00085 /*
00086  * Routine to look up the skip value for a character.
00087  */
00088 static unsigned long
00089 _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
00090 {
00091     unsigned long i;
00092     ucs4_t c1, c2;
00093     _utbm_skip_t *sp;
00094 
00095     if (start >= end)
00096       return 0;
00097 
00098     c1 = *start;
00099     c2 = (start + 1 < end) ? *(start + 1) : ~0;
00100     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
00101       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00102 
00103     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
00104         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
00105             return ((unsigned long) (end - start) < sp->skip) ?
00106                 end - start : sp->skip;
00107         }
00108     }
00109     return p->patlen;
00110 }
00111 
00112 static int
00113 _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
00114             unsigned long *match_start, unsigned long *match_end)
00115 {
00116     int check_space;
00117     ucs4_t c1, c2;
00118     unsigned long count;
00119     _utbm_char_t *cp;
00120 
00121     /*
00122      * Set the potential match endpoint first.
00123      */
00124     *match_end = (start - text) + 1;
00125 
00126     c1 = *start;
00127     c2 = (start + 1 < end) ? *(start + 1) : ~0;
00128     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
00129         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00130         /*
00131          * Adjust the match end point to occur after the UTF-16 character.
00132          */
00133         *match_end = *match_end + 1;
00134     }
00135 
00136     if (pat->pat_used == 1) {
00137         *match_start = start - text;
00138         return 1;
00139     }
00140 
00141     /*
00142      * Compare backward.
00143      */
00144     cp = pat->pat + (pat->pat_used - 1);
00145 
00146     for (count = pat->patlen; start > text && count > 0;) {
00147         /*
00148          * Ignore non-spacing characters if indicated.
00149          */
00150         if (pat->flags & UTBM_IGNORE_NONSPACING) {
00151             while (start > text && _utbm_nonspacing(c1)) {
00152                 c2 = *--start;
00153                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
00154                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
00155                     0xd800 <= c1 && c1 <= 0xdbff) {
00156                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00157                     start--;
00158                 } else
00159                   c1 = c2;
00160             }
00161         }
00162 
00163         /*
00164          * Handle space compression if indicated.
00165          */
00166         if (pat->flags & UTBM_SPACE_COMPRESS) {
00167             check_space = 0;
00168             while (start > text &&
00169                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
00170                 check_space = _utbm_isspace(c1, 1);
00171                 c2 = *--start;
00172                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
00173                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
00174                     0xd800 <= c1 && c1 <= 0xdbff) {
00175                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00176                     start--;
00177                 } else
00178                   c1 = c2;
00179             }
00180             /*
00181              * Handle things if space compression was indicated and one or
00182              * more member characters were found.
00183              */
00184             if (check_space) {
00185                 if (cp->uc != ' ')
00186                   return 0;
00187                 cp--;
00188                 count--;
00189             }
00190         }
00191 
00192         /*
00193          * Handle the normal comparison cases.
00194          */
00195         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
00196           return 0;
00197 
00198         count -= (c1 >= 0x10000) ? 2 : 1;
00199         if (count > 0) {
00200             cp--;
00201 
00202             /*
00203              * Get the next preceding character.
00204              */
00205             if (start > text) {
00206                 c2 = *--start;
00207                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
00208                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
00209                     0xd800 <= c1 && c1 <= 0xdbff) {
00210                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00211                     start--;
00212                 } else
00213                   c1 = c2;
00214             }
00215         }
00216     }
00217 
00218     /*
00219      * Set the match start position.
00220      */
00221     *match_start = start - text;
00222     return 1;
00223 }
00224 
00225 /*************************************************************************
00226  *
00227  * API.
00228  *
00229  *************************************************************************/
00230 
00231 utbm_pattern_t
00232 utbm_create_pattern(void)
00233 {
00234     utbm_pattern_t p;
00235 
00236     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
00237     (void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
00238     return p;
00239 }
00240 
00241 void
00242 utbm_free_pattern(utbm_pattern_t pattern)
00243 {
00244     if (pattern == 0)
00245       return;
00246 
00247     if (pattern->pat_size > 0)
00248       free((char *) pattern->pat);
00249 
00250     if (pattern->skip_size > 0)
00251       free((char *) pattern->skip);
00252 
00253     free((char *) pattern);
00254 }
00255 
00256 void
00257 utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
00258              utbm_pattern_t p)
00259 {
00260     int have_space;
00261     unsigned long i, j, k, slen;
00262     _utbm_char_t *cp;
00263     _utbm_skip_t *sp;
00264     ucs4_t c1, c2, sentinel;
00265 
00266     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
00267       return;
00268 
00269     /*
00270      * Reset the pattern buffer.
00271      */
00272     p->patlen = p->pat_used = p->skip_used = 0;
00273 
00274     /*
00275      * Set the flags.
00276      */
00277     p->flags = flags;
00278 
00279     /*
00280      * Initialize the extra skip flag.
00281      */
00282     p->md4 = 1;
00283 
00284     /*
00285      * Allocate more storage if necessary.
00286      */
00287     if (patlen > p->pat_size) {
00288         if (p->pat_size == 0) {
00289             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
00290             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
00291         } else {
00292             p->pat = (_utbm_char_t *)
00293                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
00294             p->skip = (_utbm_skip_t *)
00295                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
00296         }
00297         p->pat_size = p->skip_size = patlen;
00298     }
00299 
00300     /*
00301      * Preprocess the pattern to remove controls (if specified) and determine
00302      * case.
00303      */
00304     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
00305         c1 = pat[i];
00306         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
00307         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
00308           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
00309 
00310         /*
00311          * Make sure the `have_space' flag is turned off if the character
00312          * is not an appropriate one.
00313          */
00314         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
00315           have_space = 0;
00316 
00317         /*
00318          * If non-spacing characters should be ignored, do it here.
00319          */
00320         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
00321           continue;
00322 
00323         /*
00324          * Check if spaces and controls need to be compressed.
00325          */
00326         if (flags & UTBM_SPACE_COMPRESS) {
00327             if (_utbm_isspace(c1, 1)) {
00328                 if (!have_space) {
00329                     /*
00330                      * Add a space and set the flag.
00331                      */
00332                     cp->uc = cp->lc = cp->tc = ' ';
00333                     cp++;
00334 
00335                     /*
00336                      * Increase the real pattern length.
00337                      */
00338                     p->patlen++;
00339                     sentinel = ' ';
00340                     have_space = 1;
00341                 }
00342                 continue;
00343             }
00344 
00345             /*
00346              * Ignore all control characters.
00347              */
00348             if (_utbm_iscntrl(c1))
00349               continue;
00350         }
00351 
00352         /*
00353          * Add the character.
00354          */
00355         if (flags & UTBM_CASEFOLD) {
00356             cp->uc = _utbm_toupper(c1);
00357             cp->lc = _utbm_tolower(c1);
00358             cp->tc = _utbm_totitle(c1);
00359         } else
00360           cp->uc = cp->lc = cp->tc = c1;
00361 
00362         /*
00363          * Set the sentinel character.
00364          */
00365         sentinel = cp->uc;
00366 
00367         /*
00368          * Move to the next character.
00369          */
00370         cp++;
00371 
00372         /*
00373          * Increase the real pattern length appropriately.
00374          */
00375         p->patlen += (c1 >= 0x10000) ? 2 : 1;
00376 
00377         /*
00378          * Increment the loop index for UTF-16 characters.
00379          */
00380         i += (c1 >= 0x10000) ? 1 : 0;
00381 
00382     }
00383 
00384     /*
00385      * Set the number of characters actually used.
00386      */
00387     p->pat_used = cp - p->pat;
00388 
00389     /*
00390      * Go through and construct the skip array and determine the actual length
00391      * of the pattern in UCS2 terms.
00392      */
00393     slen = p->patlen - 1;
00394     cp = p->pat;
00395     for (i = k = 0; i < p->pat_used; i++, cp++) {
00396         /*
00397          * Locate the character in the skip array.
00398          */
00399         for (sp = p->skip, j = 0;
00400              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
00401 
00402         /*
00403          * If the character is not found, set the new skip element and
00404          * increase the number of skip elements.
00405          */
00406         if (j == p->skip_used) {
00407             sp->ch = cp;
00408             p->skip_used++;
00409         }
00410 
00411         /*
00412          * Set the updated skip value.  If the character is UTF-16 and is
00413          * not the last one in the pattern, add one to its skip value.
00414          */
00415         sp->skip = slen - k;
00416         if (cp->uc >= 0x10000 && k + 2 < slen)
00417           sp->skip++;
00418 
00419         /*
00420          * Set the new extra skip for the sentinel character.
00421          */
00422         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
00423             cp->uc == sentinel)
00424           p->md4 = slen - k;
00425 
00426         /*
00427          * Increase the actual index.
00428          */
00429         k += (cp->uc >= 0x10000) ? 2 : 1;
00430     }
00431 }
00432 
00433 int
00434 utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
00435           unsigned long *match_start, unsigned long *match_end)
00436 {
00437     unsigned long k;
00438     ucs2_t *start, *end;
00439 
00440     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
00441         textlen < pat->patlen)
00442       return 0;
00443 
00444     start = text + pat->patlen;
00445     end = text + textlen;
00446 
00447     /*
00448      * Adjust the start point if it points to a low surrogate.
00449      */
00450     if (0xdc00 <= *start && *start <= 0xdfff &&
00451         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
00452       start--;
00453 
00454     while (start < end) {
00455         while ((k = _utbm_skip(pat, start, end))) {
00456             start += k;
00457             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
00458                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
00459               start--;
00460         }
00461 
00462         if (start < end &&
00463             _utbm_match(pat, text, start, end, match_start, match_end))
00464           return 1;
00465 
00466         start += pat->md4;
00467         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
00468             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
00469           start--;
00470     }
00471     return 0;
00472 }