Back to index

glibc  2.9
strxfrm_l.c
Go to the documentation of this file.
00001 /* Copyright (C) 1995, 1996, 1997, 2002, 2004, 2005, 2006
00002    Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <assert.h>
00022 #include <langinfo.h>
00023 #include <locale.h>
00024 #include <stddef.h>
00025 #include <stdint.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <sys/param.h>
00029 
00030 #ifndef STRING_TYPE
00031 # define STRING_TYPE char
00032 # define USTRING_TYPE unsigned char
00033 # define STRXFRM __strxfrm_l
00034 # define STRCMP strcmp
00035 # define STRLEN strlen
00036 # define STPNCPY __stpncpy
00037 # define WEIGHT_H "../locale/weight.h"
00038 # define SUFFIX      MB
00039 # define L(arg) arg
00040 #endif
00041 
00042 #define CONCAT(a,b) CONCAT1(a,b)
00043 #define CONCAT1(a,b) a##b
00044 
00045 #include "../locale/localeinfo.h"
00046 
00047 
00048 #ifndef WIDE_CHAR_VERSION
00049 
00050 /* We need UTF-8 encoding of numbers.  */
00051 static int
00052 utf8_encode (char *buf, int val)
00053 {
00054   int retval;
00055 
00056   if (val < 0x80)
00057     {
00058       *buf++ = (char) val;
00059       retval = 1;
00060     }
00061   else
00062     {
00063       int step;
00064 
00065       for (step = 2; step < 6; ++step)
00066        if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
00067          break;
00068       retval = step;
00069 
00070       *buf = (unsigned char) (~0xff >> step);
00071       --step;
00072       do
00073        {
00074          buf[step] = 0x80 | (val & 0x3f);
00075          val >>= 6;
00076        }
00077       while (--step > 0);
00078       *buf |= val;
00079     }
00080 
00081   return retval;
00082 }
00083 #endif
00084 
00085 
00086 size_t
00087 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
00088 {
00089   struct locale_data *current = l->__locales[LC_COLLATE];
00090   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
00091   /* We don't assign the following values right away since it might be
00092      unnecessary in case there are no rules.  */
00093   const unsigned char *rulesets;
00094   const int32_t *table;
00095   const USTRING_TYPE *weights;
00096   const USTRING_TYPE *extra;
00097   const int32_t *indirect;
00098   uint_fast32_t pass;
00099   size_t needed;
00100   size_t last_needed;
00101   const USTRING_TYPE *usrc;
00102   size_t srclen = STRLEN (src);
00103   int32_t *idxarr;
00104   unsigned char *rulearr;
00105   size_t idxmax;
00106   size_t idxcnt;
00107   int use_malloc;
00108 
00109 #include WEIGHT_H
00110 
00111   if (nrules == 0)
00112     {
00113       if (n != 0)
00114        STPNCPY (dest, src, MIN (srclen + 1, n));
00115 
00116       return srclen;
00117     }
00118 
00119   rulesets = (const unsigned char *)
00120     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
00121   table = (const int32_t *)
00122     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
00123   weights = (const USTRING_TYPE *)
00124     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
00125   extra = (const USTRING_TYPE *)
00126     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
00127   indirect = (const int32_t *)
00128     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
00129   use_malloc = 0;
00130 
00131   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
00132   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
00133   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
00134   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
00135 
00136   /* Handle an empty string as a special case.  */
00137   if (srclen == 0)
00138     {
00139       if (n != 0)
00140         *dest = L('\0');
00141       return 0;
00142     }
00143 
00144   /* We need the elements of the string as unsigned values since they
00145      are used as indeces.  */
00146   usrc = (const USTRING_TYPE *) src;
00147 
00148   /* Perform the first pass over the string and while doing this find
00149      and store the weights for each character.  Since we want this to
00150      be as fast as possible we are using `alloca' to store the temporary
00151      values.  But since there is no limit on the length of the string
00152      we have to use `malloc' if the string is too long.  We should be
00153      very conservative here.  */
00154   if (! __libc_use_alloca (srclen))
00155     {
00156       idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
00157       rulearr = (unsigned char *) &idxarr[srclen];
00158 
00159       if (idxarr == NULL)
00160        /* No memory.  Well, go with the stack then.
00161 
00162           XXX Once this implementation is stable we will handle this
00163           differently.  Instead of precomputing the indeces we will
00164           do this in time.  This means, though, that this happens for
00165           every pass again.  */
00166        goto try_stack;
00167       use_malloc = 1;
00168     }
00169   else
00170     {
00171     try_stack:
00172       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
00173       rulearr = (unsigned char *) alloca (srclen + 1);
00174     }
00175 
00176   idxmax = 0;
00177   do
00178     {
00179       int32_t tmp = findidx (&usrc);
00180       rulearr[idxmax] = tmp >> 24;
00181       idxarr[idxmax] = tmp & 0xffffff;
00182 
00183       ++idxmax;
00184     }
00185   while (*usrc != L('\0'));
00186 
00187   /* This element is only read, the value never used but to determine
00188      another value which then is ignored.  */
00189   rulearr[idxmax] = '\0';
00190 
00191   /* Now the passes over the weights.  We now use the indeces we found
00192      before.  */
00193   needed = 0;
00194   for (pass = 0; pass < nrules; ++pass)
00195     {
00196       size_t backw_stop = ~0ul;
00197       int rule = rulesets[rulearr[0] * nrules + pass];
00198       /* We assume that if a rule has defined `position' in one section
00199         this is true for all of them.  */
00200       int position = rule & sort_position;
00201 
00202       last_needed = needed;
00203       if (position == 0)
00204        {
00205          for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
00206            {
00207              if ((rule & sort_forward) != 0)
00208               {
00209                 size_t len;
00210 
00211                 if (backw_stop != ~0ul)
00212                   {
00213                     /* Handle the pushed elements now.  */
00214                     size_t backw;
00215 
00216                     for (backw = idxcnt; backw > backw_stop; )
00217                      {
00218                        --backw;
00219                        len = weights[idxarr[backw]++];
00220 
00221                        if (needed + len < n)
00222                          while (len-- > 0)
00223                            dest[needed++] = weights[idxarr[backw]++];
00224                        else
00225                          {
00226                             /* No more characters fit into the buffer.  */
00227                            needed += len;
00228                            idxarr[backw] += len;
00229                          }
00230                      }
00231 
00232                     backw_stop = ~0ul;
00233                   }
00234 
00235                 /* Now handle the forward element.  */
00236                 len = weights[idxarr[idxcnt]++];
00237                 if (needed + len < n)
00238                   while (len-- > 0)
00239                     dest[needed++] = weights[idxarr[idxcnt]++];
00240                 else
00241                   {
00242                     /* No more characters fit into the buffer.  */
00243                     needed += len;
00244                     idxarr[idxcnt] += len;
00245                   }
00246               }
00247              else
00248               {
00249                 /* Remember where the backwards series started.  */
00250                 if (backw_stop == ~0ul)
00251                   backw_stop = idxcnt;
00252               }
00253 
00254              rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
00255            }
00256 
00257 
00258          if (backw_stop != ~0ul)
00259            {
00260              /* Handle the pushed elements now.  */
00261              size_t backw;
00262 
00263              backw = idxcnt;
00264              while (backw > backw_stop)
00265               {
00266                 size_t len = weights[idxarr[--backw]++];
00267 
00268                 if (needed + len < n)
00269                   while (len-- > 0)
00270                     dest[needed++] = weights[idxarr[backw]++];
00271                 else
00272                   {
00273                     /* No more characters fit into the buffer.  */
00274                     needed += len;
00275                     idxarr[backw] += len;
00276                   }
00277               }
00278            }
00279        }
00280       else
00281        {
00282          int val = 1;
00283 #ifndef WIDE_CHAR_VERSION
00284          char buf[7];
00285          size_t buflen;
00286 #endif
00287          size_t i;
00288 
00289          for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
00290            {
00291              if ((rule & sort_forward) != 0)
00292               {
00293                 size_t len;
00294 
00295                 if (backw_stop != ~0ul)
00296                   {
00297                    /* Handle the pushed elements now.  */
00298                     size_t backw;
00299 
00300                     for (backw = idxcnt; backw > backw_stop; )
00301                      {
00302                        --backw;
00303                        len = weights[idxarr[backw]++];
00304                        if (len != 0)
00305                          {
00306 #ifdef WIDE_CHAR_VERSION
00307                            if (needed + 1 + len < n)
00308                             {
00309                               dest[needed] = val;
00310                               for (i = 0; i < len; ++i)
00311                                 dest[needed + 1 + i] =
00312                                   weights[idxarr[backw] + i];
00313                             }
00314                            needed += 1 + len;
00315 #else
00316                            buflen = utf8_encode (buf, val);
00317                            if (needed + buflen + len < n)
00318                             {
00319                               for (i = 0; i < buflen; ++i)
00320                                 dest[needed + i] = buf[i];
00321                               for (i = 0; i < len; ++i)
00322                                 dest[needed + buflen + i] =
00323                                   weights[idxarr[backw] + i];
00324                             }
00325                            needed += buflen + len;
00326 #endif
00327                            idxarr[backw] += len;
00328                            val = 1;
00329                          }
00330                        else
00331                          ++val;
00332                      }
00333 
00334                     backw_stop = ~0ul;
00335                   }
00336 
00337                 /* Now handle the forward element.  */
00338                 len = weights[idxarr[idxcnt]++];
00339                 if (len != 0)
00340                   {
00341 #ifdef WIDE_CHAR_VERSION
00342                     if (needed + 1+ len < n)
00343                      {
00344                        dest[needed] = val;
00345                        for (i = 0; i < len; ++i)
00346                          dest[needed + 1 + i] =
00347                            weights[idxarr[idxcnt] + i];
00348                      }
00349                     needed += 1 + len;
00350 #else
00351                     buflen = utf8_encode (buf, val);
00352                     if (needed + buflen + len < n)
00353                      {
00354                        for (i = 0; i < buflen; ++i)
00355                          dest[needed + i] = buf[i];
00356                        for (i = 0; i < len; ++i)
00357                          dest[needed + buflen + i] =
00358                            weights[idxarr[idxcnt] + i];
00359                      }
00360                     needed += buflen + len;
00361 #endif
00362                     idxarr[idxcnt] += len;
00363                     val = 1;
00364                   }
00365                 else
00366                   /* Note that we don't have to increment `idxarr[idxcnt]'
00367                      since the length is zero.  */
00368                   ++val;
00369               }
00370              else
00371               {
00372                 /* Remember where the backwards series started.  */
00373                 if (backw_stop == ~0ul)
00374                   backw_stop = idxcnt;
00375               }
00376 
00377              rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
00378            }
00379 
00380          if (backw_stop != ~0ul)
00381            {
00382              /* Handle the pushed elements now.  */
00383              size_t backw;
00384 
00385              backw = idxmax - 1;
00386              while (backw > backw_stop)
00387               {
00388                 size_t len = weights[idxarr[--backw]++];
00389                 if (len != 0)
00390                   {
00391 #ifdef WIDE_CHAR_VERSION
00392                     if (needed + 1 + len < n)
00393                      {
00394                        dest[needed] = val;
00395                        for (i = 0; i < len; ++i)
00396                          dest[needed + 1 + i] =
00397                            weights[idxarr[backw] + i];
00398                      }
00399                     needed += 1 + len;
00400 #else
00401                     buflen = utf8_encode (buf, val);
00402                     if (needed + buflen + len < n)
00403                      {
00404                        for (i = 0; i < buflen; ++i)
00405                          dest[needed + i] = buf[i];
00406                        for (i = 0; i < len; ++i)
00407                          dest[needed + buflen + i] =
00408                            weights[idxarr[backw] + i];
00409                      }
00410                     needed += buflen + len;
00411 #endif
00412                     idxarr[backw] += len;
00413                     val = 1;
00414                   }
00415                 else
00416                   ++val;
00417               }
00418            }
00419        }
00420 
00421       /* Finally store the byte to separate the passes or terminate
00422         the string.  */
00423       if (needed < n)
00424        dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
00425       ++needed;
00426     }
00427 
00428   /* This is a little optimization: many collation specifications have
00429      a `position' rule at the end and if no non-ignored character
00430      is found the last \1 byte is immediately followed by a \0 byte
00431      signalling this.  We can avoid the \1 byte(s).  */
00432   if (needed > 2 && needed == last_needed + 1)
00433     {
00434       /* Remove the \1 byte.  */
00435       if (--needed <= n)
00436        dest[needed - 1] = L('\0');
00437     }
00438 
00439   /* Free the memory if needed.  */
00440   if (use_malloc)
00441     free (idxarr);
00442 
00443   /* Return the number of bytes/words we need, but don't count the NUL
00444      byte/word at the end.  */
00445   return needed - 1;
00446 }
00447 libc_hidden_def (STRXFRM)
00448 
00449 #ifndef WIDE_CHAR_VERSION
00450 weak_alias (__strxfrm_l, strxfrm_l)
00451 #endif