Back to index

glibc  2.9
str-two-way.h
Go to the documentation of this file.
00001 /* Byte-wise substring search, using the Two-Way algorithm.
00002    Copyright (C) 2008 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Written by Eric Blake <ebb9@byu.net>, 2008.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 /* Before including this file, you need to include <string.h> (and
00022    <config.h> before that, if not part of libc), and define:
00023      RESULT_TYPE             A macro that expands to the return type.
00024      AVAILABLE(h, h_l, j, n_l)
00025                           A macro that returns nonzero if there are
00026                           at least N_L bytes left starting at H[J].
00027                           H is 'unsigned char *', H_L, J, and N_L
00028                           are 'size_t'; H_L is an lvalue.  For
00029                           NUL-terminated searches, H_L can be
00030                           modified each iteration to avoid having
00031                           to compute the end of H up front.
00032 
00033   For case-insensitivity, you may optionally define:
00034      CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
00035                           characters of P1 and P2 are equal.
00036      CANON_ELEMENT(c)        A macro that canonicalizes an element right after
00037                           it has been fetched from one of the two strings.
00038                           The argument is an 'unsigned char'; the result
00039                           must be an 'unsigned char' as well.
00040 
00041   This file undefines the macros documented above, and defines
00042   LONG_NEEDLE_THRESHOLD.
00043 */
00044 
00045 #include <limits.h>
00046 #include <stdint.h>
00047 
00048 /* We use the Two-Way string matching algorithm, which guarantees
00049    linear complexity with constant space.  Additionally, for long
00050    needles, we also use a bad character shift table similar to the
00051    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
00052    performance.
00053 
00054    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
00055    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
00056 */
00057 
00058 /* Point at which computing a bad-byte shift table is likely to be
00059    worthwhile.  Small needles should not compute a table, since it
00060    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
00061    speedup no greater than a factor of NEEDLE_LEN.  The larger the
00062    needle, the better the potential performance gain.  On the other
00063    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
00064    memory required for the table is prohibitive.  */
00065 #if CHAR_BIT < 10
00066 # define LONG_NEEDLE_THRESHOLD 32U
00067 #else
00068 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
00069 #endif
00070 
00071 #ifndef MAX
00072 # define MAX(a, b) ((a < b) ? (b) : (a))
00073 #endif
00074 
00075 #ifndef CANON_ELEMENT
00076 # define CANON_ELEMENT(c) c
00077 #endif
00078 #ifndef CMP_FUNC
00079 # define CMP_FUNC memcmp
00080 #endif
00081 
00082 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
00083    Return the index of the first byte in the right half, and set
00084    *PERIOD to the global period of the right half.
00085 
00086    The global period of a string is the smallest index (possibly its
00087    length) at which all remaining bytes in the string are repetitions
00088    of the prefix (the last repetition may be a subset of the prefix).
00089 
00090    When NEEDLE is factored into two halves, a local period is the
00091    length of the smallest word that shares a suffix with the left half
00092    and shares a prefix with the right half.  All factorizations of a
00093    non-empty NEEDLE have a local period of at least 1 and no greater
00094    than NEEDLE_LEN.
00095 
00096    A critical factorization has the property that the local period
00097    equals the global period.  All strings have at least one critical
00098    factorization with the left half smaller than the global period.
00099 
00100    Given an ordered alphabet, a critical factorization can be computed
00101    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
00102    larger of two ordered maximal suffixes.  The ordered maximal
00103    suffixes are determined by lexicographic comparison of
00104    periodicity.  */
00105 static size_t
00106 critical_factorization (const unsigned char *needle, size_t needle_len,
00107                      size_t *period)
00108 {
00109   /* Index of last byte of left half, or SIZE_MAX.  */
00110   size_t max_suffix, max_suffix_rev;
00111   size_t j; /* Index into NEEDLE for current candidate suffix.  */
00112   size_t k; /* Offset into current period.  */
00113   size_t p; /* Intermediate period.  */
00114   unsigned char a, b; /* Current comparison bytes.  */
00115 
00116   /* Invariants:
00117      0 <= j < NEEDLE_LEN - 1
00118      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
00119      min(max_suffix, max_suffix_rev) < global period of NEEDLE
00120      1 <= p <= global period of NEEDLE
00121      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
00122      1 <= k <= p
00123   */
00124 
00125   /* Perform lexicographic search.  */
00126   max_suffix = SIZE_MAX;
00127   j = 0;
00128   k = p = 1;
00129   while (j + k < needle_len)
00130     {
00131       a = CANON_ELEMENT (needle[j + k]);
00132       b = CANON_ELEMENT (needle[max_suffix + k]);
00133       if (a < b)
00134        {
00135          /* Suffix is smaller, period is entire prefix so far.  */
00136          j += k;
00137          k = 1;
00138          p = j - max_suffix;
00139        }
00140       else if (a == b)
00141        {
00142          /* Advance through repetition of the current period.  */
00143          if (k != p)
00144            ++k;
00145          else
00146            {
00147              j += p;
00148              k = 1;
00149            }
00150        }
00151       else /* b < a */
00152        {
00153          /* Suffix is larger, start over from current location.  */
00154          max_suffix = j++;
00155          k = p = 1;
00156        }
00157     }
00158   *period = p;
00159 
00160   /* Perform reverse lexicographic search.  */
00161   max_suffix_rev = SIZE_MAX;
00162   j = 0;
00163   k = p = 1;
00164   while (j + k < needle_len)
00165     {
00166       a = CANON_ELEMENT (needle[j + k]);
00167       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
00168       if (b < a)
00169        {
00170          /* Suffix is smaller, period is entire prefix so far.  */
00171          j += k;
00172          k = 1;
00173          p = j - max_suffix_rev;
00174        }
00175       else if (a == b)
00176        {
00177          /* Advance through repetition of the current period.  */
00178          if (k != p)
00179            ++k;
00180          else
00181            {
00182              j += p;
00183              k = 1;
00184            }
00185        }
00186       else /* a < b */
00187        {
00188          /* Suffix is larger, start over from current location.  */
00189          max_suffix_rev = j++;
00190          k = p = 1;
00191        }
00192     }
00193 
00194   /* Choose the longer suffix.  Return the first byte of the right
00195      half, rather than the last byte of the left half.  */
00196   if (max_suffix_rev + 1 < max_suffix + 1)
00197     return max_suffix + 1;
00198   *period = p;
00199   return max_suffix_rev + 1;
00200 }
00201 
00202 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
00203    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
00204    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
00205    Performance is guaranteed to be linear, with an initialization cost
00206    of 2 * NEEDLE_LEN comparisons.
00207 
00208    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
00209    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
00210    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
00211    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
00212 static RETURN_TYPE
00213 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
00214                     const unsigned char *needle, size_t needle_len)
00215 {
00216   size_t i; /* Index into current byte of NEEDLE.  */
00217   size_t j; /* Index into current window of HAYSTACK.  */
00218   size_t period; /* The period of the right half of needle.  */
00219   size_t suffix; /* The index of the right half of needle.  */
00220 
00221   /* Factor the needle into two halves, such that the left half is
00222      smaller than the global period, and the right half is
00223      periodic (with a period as large as NEEDLE_LEN - suffix).  */
00224   suffix = critical_factorization (needle, needle_len, &period);
00225 
00226   /* Perform the search.  Each iteration compares the right half
00227      first.  */
00228   if (CMP_FUNC (needle, needle + period, suffix) == 0)
00229     {
00230       /* Entire needle is periodic; a mismatch can only advance by the
00231         period, so use memory to avoid rescanning known occurrences
00232         of the period.  */
00233       size_t memory = 0;
00234       j = 0;
00235       while (AVAILABLE (haystack, haystack_len, j, needle_len))
00236        {
00237          /* Scan for matches in right half.  */
00238          i = MAX (suffix, memory);
00239          while (i < needle_len && (CANON_ELEMENT (needle[i])
00240                                 == CANON_ELEMENT (haystack[i + j])))
00241            ++i;
00242          if (needle_len <= i)
00243            {
00244              /* Scan for matches in left half.  */
00245              i = suffix - 1;
00246              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
00247                                    == CANON_ELEMENT (haystack[i + j])))
00248               --i;
00249              if (i + 1 < memory + 1)
00250               return (RETURN_TYPE) (haystack + j);
00251              /* No match, so remember how many repetitions of period
00252                on the right half were scanned.  */
00253              j += period;
00254              memory = needle_len - period;
00255            }
00256          else
00257            {
00258              j += i - suffix + 1;
00259              memory = 0;
00260            }
00261        }
00262     }
00263   else
00264     {
00265       /* The two halves of needle are distinct; no extra memory is
00266         required, and any mismatch results in a maximal shift.  */
00267       period = MAX (suffix, needle_len - suffix) + 1;
00268       j = 0;
00269       while (AVAILABLE (haystack, haystack_len, j, needle_len))
00270        {
00271          /* Scan for matches in right half.  */
00272          i = suffix;
00273          while (i < needle_len && (CANON_ELEMENT (needle[i])
00274                                 == CANON_ELEMENT (haystack[i + j])))
00275            ++i;
00276          if (needle_len <= i)
00277            {
00278              /* Scan for matches in left half.  */
00279              i = suffix - 1;
00280              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
00281                                    == CANON_ELEMENT (haystack[i + j])))
00282               --i;
00283              if (i == SIZE_MAX)
00284               return (RETURN_TYPE) (haystack + j);
00285              j += period;
00286            }
00287          else
00288            j += i - suffix + 1;
00289        }
00290     }
00291   return NULL;
00292 }
00293 
00294 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
00295    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
00296    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
00297    Performance is guaranteed to be linear, with an initialization cost
00298    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
00299 
00300    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
00301    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
00302    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
00303    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
00304    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
00305    sublinear performance is not possible.  */
00306 static RETURN_TYPE
00307 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
00308                    const unsigned char *needle, size_t needle_len)
00309 {
00310   size_t i; /* Index into current byte of NEEDLE.  */
00311   size_t j; /* Index into current window of HAYSTACK.  */
00312   size_t period; /* The period of the right half of needle.  */
00313   size_t suffix; /* The index of the right half of needle.  */
00314   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
00315 
00316   /* Factor the needle into two halves, such that the left half is
00317      smaller than the global period, and the right half is
00318      periodic (with a period as large as NEEDLE_LEN - suffix).  */
00319   suffix = critical_factorization (needle, needle_len, &period);
00320 
00321   /* Populate shift_table.  For each possible byte value c,
00322      shift_table[c] is the distance from the last occurrence of c to
00323      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
00324      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
00325   for (i = 0; i < 1U << CHAR_BIT; i++)
00326     shift_table[i] = needle_len;
00327   for (i = 0; i < needle_len; i++)
00328     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
00329 
00330   /* Perform the search.  Each iteration compares the right half
00331      first.  */
00332   if (CMP_FUNC (needle, needle + period, suffix) == 0)
00333     {
00334       /* Entire needle is periodic; a mismatch can only advance by the
00335         period, so use memory to avoid rescanning known occurrences
00336         of the period.  */
00337       size_t memory = 0;
00338       size_t shift;
00339       j = 0;
00340       while (AVAILABLE (haystack, haystack_len, j, needle_len))
00341        {
00342          /* Check the last byte first; if it does not match, then
00343             shift to the next possible match location.  */
00344          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
00345          if (0 < shift)
00346            {
00347              if (memory && shift < period)
00348               {
00349                 /* Since needle is periodic, but the last period has
00350                    a byte out of place, there can be no match until
00351                    after the mismatch.  */
00352                 shift = needle_len - period;
00353                 memory = 0;
00354               }
00355              j += shift;
00356              continue;
00357            }
00358          /* Scan for matches in right half.  The last byte has
00359             already been matched, by virtue of the shift table.  */
00360          i = MAX (suffix, memory);
00361          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
00362                                    == CANON_ELEMENT (haystack[i + j])))
00363            ++i;
00364          if (needle_len - 1 <= i)
00365            {
00366              /* Scan for matches in left half.  */
00367              i = suffix - 1;
00368              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
00369                                    == CANON_ELEMENT (haystack[i + j])))
00370               --i;
00371              if (i + 1 < memory + 1)
00372               return (RETURN_TYPE) (haystack + j);
00373              /* No match, so remember how many repetitions of period
00374                on the right half were scanned.  */
00375              j += period;
00376              memory = needle_len - period;
00377            }
00378          else
00379            {
00380              j += i - suffix + 1;
00381              memory = 0;
00382            }
00383        }
00384     }
00385   else
00386     {
00387       /* The two halves of needle are distinct; no extra memory is
00388         required, and any mismatch results in a maximal shift.  */
00389       size_t shift;
00390       period = MAX (suffix, needle_len - suffix) + 1;
00391       j = 0;
00392       while (AVAILABLE (haystack, haystack_len, j, needle_len))
00393        {
00394          /* Check the last byte first; if it does not match, then
00395             shift to the next possible match location.  */
00396          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
00397          if (0 < shift)
00398            {
00399              j += shift;
00400              continue;
00401            }
00402          /* Scan for matches in right half.  The last byte has
00403             already been matched, by virtue of the shift table.  */
00404          i = suffix;
00405          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
00406                                    == CANON_ELEMENT (haystack[i + j])))
00407            ++i;
00408          if (needle_len - 1 <= i)
00409            {
00410              /* Scan for matches in left half.  */
00411              i = suffix - 1;
00412              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
00413                                    == CANON_ELEMENT (haystack[i + j])))
00414               --i;
00415              if (i == SIZE_MAX)
00416               return (RETURN_TYPE) (haystack + j);
00417              j += period;
00418            }
00419          else
00420            j += i - suffix + 1;
00421        }
00422     }
00423   return NULL;
00424 }
00425 
00426 #undef AVAILABLE
00427 #undef CANON_ELEMENT
00428 #undef CMP_FUNC
00429 #undef MAX
00430 #undef RETURN_TYPE