Back to index

glibc  2.9
strcoll_l.c
Go to the documentation of this file.
00001 /* Copyright (C) 1995,96,97,2002, 2004, 2007 Free Software Foundation, Inc.
00002    This file is part of the GNU C Library.
00003    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
00004 
00005    The GNU C Library is free software; you can redistribute it and/or
00006    modify it under the terms of the GNU Lesser General Public
00007    License as published by the Free Software Foundation; either
00008    version 2.1 of the License, or (at your option) any later version.
00009 
00010    The GNU C Library is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Lesser General Public License for more details.
00014 
00015    You should have received a copy of the GNU Lesser General Public
00016    License along with the GNU C Library; if not, write to the Free
00017    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00018    02111-1307 USA.  */
00019 
00020 
00021 #include <assert.h>
00022 #include <langinfo.h>
00023 #include <locale.h>
00024 #include <stddef.h>
00025 #include <stdint.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
00029 #ifndef STRING_TYPE
00030 # define STRING_TYPE char
00031 # define USTRING_TYPE unsigned char
00032 # define STRCOLL __strcoll_l
00033 # define STRCMP strcmp
00034 # define STRLEN strlen
00035 # define WEIGHT_H "../locale/weight.h"
00036 # define SUFFIX      MB
00037 # define L(arg) arg
00038 #endif
00039 
00040 #define CONCAT(a,b) CONCAT1(a,b)
00041 #define CONCAT1(a,b) a##b
00042 
00043 #include "../locale/localeinfo.h"
00044 
00045 int
00046 STRCOLL (s1, s2, l)
00047      const STRING_TYPE *s1;
00048      const STRING_TYPE *s2;
00049      __locale_t l;
00050 {
00051   struct locale_data *current = l->__locales[LC_COLLATE];
00052   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
00053   /* We don't assign the following values right away since it might be
00054      unnecessary in case there are no rules.  */
00055   const unsigned char *rulesets;
00056   const int32_t *table;
00057   const USTRING_TYPE *weights;
00058   const USTRING_TYPE *extra;
00059   const int32_t *indirect;
00060   uint_fast32_t pass;
00061   int result = 0;
00062   const USTRING_TYPE *us1;
00063   const USTRING_TYPE *us2;
00064   size_t s1len;
00065   size_t s2len;
00066   int32_t *idx1arr;
00067   int32_t *idx2arr;
00068   unsigned char *rule1arr;
00069   unsigned char *rule2arr;
00070   size_t idx1max;
00071   size_t idx2max;
00072   size_t idx1cnt;
00073   size_t idx2cnt;
00074   size_t idx1now;
00075   size_t idx2now;
00076   size_t backw1_stop;
00077   size_t backw2_stop;
00078   size_t backw1;
00079   size_t backw2;
00080   int val1;
00081   int val2;
00082   int position;
00083   int seq1len;
00084   int seq2len;
00085   int use_malloc;
00086 
00087 #include WEIGHT_H
00088 
00089   if (nrules == 0)
00090     return STRCMP (s1, s2);
00091 
00092   rulesets = (const unsigned char *)
00093     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
00094   table = (const int32_t *)
00095     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
00096   weights = (const USTRING_TYPE *)
00097     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
00098   extra = (const USTRING_TYPE *)
00099     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
00100   indirect = (const int32_t *)
00101     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
00102   use_malloc = 0;
00103 
00104   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
00105   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
00106   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
00107   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
00108 
00109   /* We need this a few times.  */
00110   s1len = STRLEN (s1);
00111   s2len = STRLEN (s2);
00112 
00113   /* Catch empty strings.  */
00114   if (__builtin_expect (s1len == 0, 0) || __builtin_expect (s2len == 0, 0))
00115     return (s1len != 0) - (s2len != 0);
00116 
00117   /* We need the elements of the strings as unsigned values since they
00118      are used as indeces.  */
00119   us1 = (const USTRING_TYPE *) s1;
00120   us2 = (const USTRING_TYPE *) s2;
00121 
00122   /* Perform the first pass over the string and while doing this find
00123      and store the weights for each character.  Since we want this to
00124      be as fast as possible we are using `alloca' to store the temporary
00125      values.  But since there is no limit on the length of the string
00126      we have to use `malloc' if the string is too long.  We should be
00127      very conservative here.
00128 
00129      Please note that the localedef programs makes sure that `position'
00130      is not used at the first level.  */
00131   if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
00132     {
00133       idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
00134       idx2arr = &idx1arr[s1len];
00135       rule1arr = (unsigned char *) &idx2arr[s2len];
00136       rule2arr = &rule1arr[s1len];
00137 
00138       if (idx1arr == NULL)
00139        /* No memory.  Well, go with the stack then.
00140 
00141           XXX Once this implementation is stable we will handle this
00142           differently.  Instead of precomputing the indeces we will
00143           do this in time.  This means, though, that this happens for
00144           every pass again.  */
00145        goto try_stack;
00146       use_malloc = 1;
00147     }
00148   else
00149     {
00150     try_stack:
00151       idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
00152       idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
00153       rule1arr = (unsigned char *) alloca (s1len);
00154       rule2arr = (unsigned char *) alloca (s2len);
00155     }
00156 
00157   idx1cnt = 0;
00158   idx2cnt = 0;
00159   idx1max = 0;
00160   idx2max = 0;
00161   idx1now = 0;
00162   idx2now = 0;
00163   backw1_stop = ~0ul;
00164   backw2_stop = ~0ul;
00165   backw1 = ~0ul;
00166   backw2 = ~0ul;
00167   seq1len = 0;
00168   seq2len = 0;
00169   position = rulesets[0] & sort_position;
00170   while (1)
00171     {
00172       val1 = 0;
00173       val2 = 0;
00174 
00175       /* Get the next non-IGNOREd element for string `s1'.  */
00176       if (seq1len == 0)
00177        do
00178          {
00179            ++val1;
00180 
00181            if (backw1_stop != ~0ul)
00182              {
00183               /* The is something pushed.  */
00184               if (backw1 == backw1_stop)
00185                 {
00186                   /* The last pushed character was handled.  Continue
00187                      with forward characters.  */
00188                   if (idx1cnt < idx1max)
00189                     {
00190                      idx1now = idx1cnt;
00191                      backw1_stop = ~0ul;
00192                     }
00193                   else
00194                     /* Nothing anymore.  The backward sequence ended with
00195                       the last sequence in the string.  Note that seq1len
00196                       is still zero.  */
00197                     break;
00198                 }
00199               else
00200                 idx1now = --backw1;
00201              }
00202            else
00203              {
00204               backw1_stop = idx1max;
00205 
00206               while (*us1 != L('\0'))
00207                 {
00208                   int32_t tmp = findidx (&us1);
00209                   rule1arr[idx1max] = tmp >> 24;
00210                   idx1arr[idx1max] = tmp & 0xffffff;
00211                   idx1cnt = idx1max++;
00212 
00213                   if ((rulesets[rule1arr[idx1cnt] * nrules]
00214                       & sort_backward) == 0)
00215                     /* No more backward characters to push.  */
00216                     break;
00217                   ++idx1cnt;
00218                 }
00219 
00220               if (backw1_stop >= idx1cnt)
00221                 {
00222                   /* No sequence at all or just one.  */
00223                   if (idx1cnt == idx1max || backw1_stop > idx1cnt)
00224                     /* Note that seq1len is still zero.  */
00225                     break;
00226 
00227                   backw1_stop = ~0ul;
00228                   idx1now = idx1cnt;
00229                 }
00230               else
00231                 /* We pushed backward sequences.  */
00232                 idx1now = backw1 = idx1cnt - 1;
00233              }
00234          }
00235        while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
00236 
00237       /* And the same for string `s2'.  */
00238       if (seq2len == 0)
00239        do
00240          {
00241            ++val2;
00242 
00243            if (backw2_stop != ~0ul)
00244              {
00245               /* The is something pushed.  */
00246               if (backw2 == backw2_stop)
00247                 {
00248                   /* The last pushed character was handled.  Continue
00249                      with forward characters.  */
00250                   if (idx2cnt < idx2max)
00251                     {
00252                      idx2now = idx2cnt;
00253                      backw2_stop = ~0ul;
00254                     }
00255                   else
00256                     /* Nothing anymore.  The backward sequence ended with
00257                       the last sequence in the string.  Note that seq2len
00258                       is still zero.  */
00259                     break;
00260                 }
00261               else
00262                 idx2now = --backw2;
00263              }
00264            else
00265              {
00266               backw2_stop = idx2max;
00267 
00268               while (*us2 != L('\0'))
00269                 {
00270                   int32_t tmp = findidx (&us2);
00271                   rule2arr[idx2max] = tmp >> 24;
00272                   idx2arr[idx2max] = tmp & 0xffffff;
00273                   idx2cnt = idx2max++;
00274 
00275                   if ((rulesets[rule2arr[idx2cnt] * nrules]
00276                       & sort_backward) == 0)
00277                     /* No more backward characters to push.  */
00278                     break;
00279                   ++idx2cnt;
00280                 }
00281 
00282               if (backw2_stop >= idx2cnt)
00283                 {
00284                   /* No sequence at all or just one.  */
00285                   if (idx2cnt == idx2max || backw2_stop > idx2cnt)
00286                     /* Note that seq1len is still zero.  */
00287                     break;
00288 
00289                   backw2_stop = ~0ul;
00290                   idx2now = idx2cnt;
00291                 }
00292               else
00293                 /* We pushed backward sequences.  */
00294                 idx2now = backw2 = idx2cnt - 1;
00295              }
00296          }
00297        while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
00298 
00299       /* See whether any or both strings are empty.  */
00300       if (seq1len == 0 || seq2len == 0)
00301        {
00302          if (seq1len == seq2len)
00303            /* Both ended.  So far so good, both strings are equal at the
00304               first level.  */
00305            break;
00306 
00307          /* This means one string is shorter than the other.  Find out
00308             which one and return an appropriate value.  */
00309          result = seq1len == 0 ? -1 : 1;
00310          goto free_and_return;
00311        }
00312 
00313       /* Test for position if necessary.  */
00314       if (position && val1 != val2)
00315        {
00316          result = val1 - val2;
00317          goto free_and_return;
00318        }
00319 
00320       /* Compare the two sequences.  */
00321       do
00322        {
00323          if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
00324            {
00325              /* The sequences differ.  */
00326              result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
00327              goto free_and_return;
00328            }
00329 
00330          /* Increment the offsets.  */
00331          ++idx1arr[idx1now];
00332          ++idx2arr[idx2now];
00333 
00334          --seq1len;
00335          --seq2len;
00336        }
00337       while (seq1len > 0 && seq2len > 0);
00338 
00339       if (position && seq1len != seq2len)
00340        {
00341          result = seq1len - seq2len;
00342          goto free_and_return;
00343        }
00344     }
00345 
00346   /* Now the remaining passes over the weights.  We now use the
00347      indeces we found before.  */
00348   for (pass = 1; pass < nrules; ++pass)
00349     {
00350       /* We assume that if a rule has defined `position' in one section
00351         this is true for all of them.  */
00352       idx1cnt = 0;
00353       idx2cnt = 0;
00354       backw1_stop = ~0ul;
00355       backw2_stop = ~0ul;
00356       backw1 = ~0ul;
00357       backw2 = ~0ul;
00358       position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
00359 
00360       while (1)
00361        {
00362          val1 = 0;
00363          val2 = 0;
00364 
00365          /* Get the next non-IGNOREd element for string `s1'.  */
00366          if (seq1len == 0)
00367            do
00368              {
00369               ++val1;
00370 
00371               if (backw1_stop != ~0ul)
00372                 {
00373                   /* The is something pushed.  */
00374                   if (backw1 == backw1_stop)
00375                     {
00376                      /* The last pushed character was handled.  Continue
00377                         with forward characters.  */
00378                      if (idx1cnt < idx1max)
00379                        {
00380                          idx1now = idx1cnt;
00381                          backw1_stop = ~0ul;
00382                        }
00383                      else
00384                        {
00385                          /* Nothing anymore.  The backward sequence
00386                             ended with the last sequence in the string.  */
00387                          idx1now = ~0ul;
00388                          break;
00389                        }
00390                     }
00391                   else
00392                     idx1now = --backw1;
00393                 }
00394               else
00395                 {
00396                   backw1_stop = idx1cnt;
00397 
00398                   while (idx1cnt < idx1max)
00399                     {
00400                      if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
00401                           & sort_backward) == 0)
00402                        /* No more backward characters to push.  */
00403                        break;
00404                      ++idx1cnt;
00405                     }
00406 
00407                   if (backw1_stop == idx1cnt)
00408                     {
00409                      /* No sequence at all or just one.  */
00410                      if (idx1cnt == idx1max)
00411                        /* Note that seq1len is still zero.  */
00412                        break;
00413 
00414                      backw1_stop = ~0ul;
00415                      idx1now = idx1cnt++;
00416                     }
00417                   else
00418                     /* We pushed backward sequences.  */
00419                     idx1now = backw1 = idx1cnt - 1;
00420                 }
00421              }
00422            while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
00423 
00424          /* And the same for string `s2'.  */
00425          if (seq2len == 0)
00426            do
00427              {
00428               ++val2;
00429 
00430               if (backw2_stop != ~0ul)
00431                 {
00432                   /* The is something pushed.  */
00433                   if (backw2 == backw2_stop)
00434                     {
00435                      /* The last pushed character was handled.  Continue
00436                         with forward characters.  */
00437                      if (idx2cnt < idx2max)
00438                        {
00439                          idx2now = idx2cnt;
00440                          backw2_stop = ~0ul;
00441                        }
00442                      else
00443                        {
00444                          /* Nothing anymore.  The backward sequence
00445                             ended with the last sequence in the string.  */
00446                          idx2now = ~0ul;
00447                          break;
00448                        }
00449                     }
00450                   else
00451                     idx2now = --backw2;
00452                 }
00453               else
00454                 {
00455                   backw2_stop = idx2cnt;
00456 
00457                   while (idx2cnt < idx2max)
00458                     {
00459                      if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
00460                           & sort_backward) == 0)
00461                        /* No more backward characters to push.  */
00462                        break;
00463                      ++idx2cnt;
00464                     }
00465 
00466                   if (backw2_stop == idx2cnt)
00467                     {
00468                      /* No sequence at all or just one.  */
00469                      if (idx2cnt == idx2max)
00470                        /* Note that seq2len is still zero.  */
00471                        break;
00472 
00473                      backw2_stop = ~0ul;
00474                      idx2now = idx2cnt++;
00475                     }
00476                   else
00477                     /* We pushed backward sequences.  */
00478                     idx2now = backw2 = idx2cnt - 1;
00479                 }
00480              }
00481            while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
00482 
00483          /* See whether any or both strings are empty.  */
00484          if (seq1len == 0 || seq2len == 0)
00485            {
00486              if (seq1len == seq2len)
00487               /* Both ended.  So far so good, both strings are equal
00488                  at this level.  */
00489               break;
00490 
00491              /* This means one string is shorter than the other.  Find out
00492                which one and return an appropriate value.  */
00493              result = seq1len == 0 ? -1 : 1;
00494              goto free_and_return;
00495            }
00496 
00497          /* Test for position if necessary.  */
00498          if (position && val1 != val2)
00499            {
00500              result = val1 - val2;
00501              goto free_and_return;
00502            }
00503 
00504          /* Compare the two sequences.  */
00505          do
00506            {
00507              if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
00508               {
00509                 /* The sequences differ.  */
00510                 result = (weights[idx1arr[idx1now]]
00511                          - weights[idx2arr[idx2now]]);
00512                 goto free_and_return;
00513               }
00514 
00515              /* Increment the offsets.  */
00516              ++idx1arr[idx1now];
00517              ++idx2arr[idx2now];
00518 
00519              --seq1len;
00520              --seq2len;
00521            }
00522          while (seq1len > 0 && seq2len > 0);
00523 
00524          if (position && seq1len != seq2len)
00525            {
00526              result = seq1len - seq2len;
00527              goto free_and_return;
00528            }
00529        }
00530     }
00531 
00532   /* Free the memory if needed.  */
00533  free_and_return:
00534   if (use_malloc)
00535     free (idx1arr);
00536 
00537   return result;
00538 }
00539 libc_hidden_def (STRCOLL)
00540 
00541 #ifndef WIDE_CHAR_VERSION
00542 weak_alias (__strcoll_l, strcoll_l)
00543 #endif