Back to index

glibc  2.9
repertoire.c
Go to the documentation of this file.
00001 /* Copyright (C) 1998-2002,2004,2005,2007 Free Software Foundation, Inc.
00002    This file is part of the GNU C Library.
00003    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
00004 
00005    This program is free software; you can redistribute it and/or modify
00006    it under the terms of the GNU General Public License as published
00007    by the Free Software Foundation; version 2 of the License, or
00008    (at your option) any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013    GNU General Public License for more details.
00014 
00015    You should have received a copy of the GNU General Public License
00016    along with this program; if not, write to the Free Software Foundation,
00017    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
00018 
00019 #ifdef HAVE_CONFIG_H
00020 # include <config.h>
00021 #endif
00022 
00023 #include <errno.h>
00024 #include <error.h>
00025 #include <limits.h>
00026 #include <obstack.h>
00027 #include <search.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 #include <unistd.h>
00031 
00032 #include "localedef.h"
00033 #include "linereader.h"
00034 #include "charmap.h"
00035 #include "repertoire.h"
00036 #include "simple-hash.h"
00037 
00038 
00039 /* Simple keyword hashing for the repertoiremap.  */
00040 static const struct keyword_t *repertoiremap_hash (const char *str,
00041                                              unsigned int len);
00042 static void repertoire_new_char (struct linereader *lr, hash_table *ht,
00043                              hash_table *rt, struct obstack *ob,
00044                              uint32_t value, const char *from,
00045                              const char *to, int decimal_ellipsis);
00046 static int repertoire_compare (const void *p1, const void *p2);
00047 
/* Root of the tsearch tree holding every repertoire map that has been
   read so far.  */
static void *known = NULL;

/* Root of the tsearch tree holding the names of repertoire maps which
   could not be found and for which a complaint was already printed.  */
static void *unavailable = NULL;
00054 
00055 
00056 struct repertoire_t *
00057 repertoire_read (const char *filename)
00058 {
00059   struct linereader *repfile;
00060   struct repertoire_t *result;
00061   struct repertoire_t **resultp;
00062   struct repertoire_t search;
00063   int state;
00064   char *from_name = NULL;
00065   char *to_name = NULL;
00066   enum token_t ellipsis = tok_none;
00067 
00068   search.name = filename;
00069   resultp = tfind (&search, &known, &repertoire_compare);
00070   if (resultp != NULL)
00071     return *resultp;
00072 
00073   /* Determine path.  */
00074   repfile = lr_open (filename, repertoiremap_hash);
00075   if (repfile == NULL)
00076     {
00077       if (strchr (filename, '/') == NULL)
00078        {
00079          char *i18npath = getenv ("I18NPATH");
00080          if (i18npath != NULL && *i18npath != '\0')
00081            {
00082              const size_t pathlen = strlen (i18npath);
00083              char i18npathbuf[pathlen + 1];
00084              char path[strlen (filename) + 1 + pathlen
00085                       + sizeof ("/repertoiremaps/") - 1];
00086              char *next;
00087              i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
00088 
00089              while (repfile == NULL
00090                    && (next = strsep (&i18npath, ":")) != NULL)
00091               {
00092                 stpcpy (stpcpy (stpcpy (path, next), "/repertoiremaps/"),
00093                        filename);
00094 
00095                 repfile = lr_open (path, repertoiremap_hash);
00096 
00097                 if (repfile == NULL)
00098                   {
00099                     stpcpy (stpcpy (path, next), filename);
00100 
00101                     repfile = lr_open (path, repertoiremap_hash);
00102                   }
00103               }
00104            }
00105 
00106          if (repfile == NULL)
00107            {
00108              /* Look in the systems charmap directory.  */
00109              char *buf = xmalloc (strlen (filename) + 1
00110                                + sizeof (REPERTOIREMAP_PATH));
00111 
00112              stpcpy (stpcpy (stpcpy (buf, REPERTOIREMAP_PATH), "/"),
00113                     filename);
00114              repfile = lr_open (buf, repertoiremap_hash);
00115 
00116              free (buf);
00117            }
00118        }
00119 
00120       if (repfile == NULL)
00121        return NULL;
00122     }
00123 
00124   /* We don't want symbolic names in string to be translated.  */
00125   repfile->translate_strings = 0;
00126 
00127   /* Allocate room for result.  */
00128   result = (struct repertoire_t *) xmalloc (sizeof (struct repertoire_t));
00129   memset (result, '\0', sizeof (struct repertoire_t));
00130 
00131   result->name = xstrdup (filename);
00132 
00133 #define obstack_chunk_alloc malloc
00134 #define obstack_chunk_free free
00135   obstack_init (&result->mem_pool);
00136 
00137   if (init_hash (&result->char_table, 256)
00138       || init_hash (&result->reverse_table, 256)
00139       || init_hash (&result->seq_table, 256))
00140     {
00141       free (result);
00142       return NULL;
00143     }
00144 
00145   /* We use a state machine to describe the charmap description file
00146      format.  */
00147   state = 1;
00148   while (1)
00149     {
00150       /* What's on?  */
00151       struct token *now = lr_token (repfile, NULL, NULL, NULL, verbose);
00152       enum token_t nowtok = now->tok;
00153       struct token *arg;
00154 
00155       if (nowtok == tok_eof)
00156        break;
00157 
00158       switch (state)
00159        {
00160        case 1:
00161          /* We haven't yet read any character definition.  This is where
00162             we accept escape_char and comment_char definitions.  */
00163          if (nowtok == tok_eol)
00164            /* Ignore empty lines.  */
00165            continue;
00166 
00167          if (nowtok == tok_escape_char || nowtok == tok_comment_char)
00168            {
00169              /* We know that we need an argument.  */
00170              arg = lr_token (repfile, NULL, NULL, NULL, verbose);
00171 
00172              if (arg->tok != tok_ident)
00173               {
00174                 lr_error (repfile, _("syntax error in prolog: %s"),
00175                          _("bad argument"));
00176 
00177                 lr_ignore_rest (repfile, 0);
00178                 continue;
00179               }
00180 
00181              if (arg->val.str.lenmb != 1)
00182               {
00183                 lr_error (repfile, _("\
00184 argument to <%s> must be a single character"),
00185                          nowtok == tok_escape_char ? "escape_char"
00186                                                 : "comment_char");
00187 
00188                 lr_ignore_rest (repfile, 0);
00189                 continue;
00190               }
00191 
00192              if (nowtok == tok_escape_char)
00193               repfile->escape_char = *arg->val.str.startmb;
00194              else
00195               repfile->comment_char = *arg->val.str.startmb;
00196 
00197              lr_ignore_rest (repfile, 1);
00198              continue;
00199            }
00200 
00201          if (nowtok == tok_charids)
00202            {
00203              lr_ignore_rest (repfile, 1);
00204 
00205              state = 2;
00206              continue;
00207            }
00208 
00209          /* Otherwise we start reading the character definitions.  */
00210          state = 2;
00211          /* FALLTHROUGH */
00212 
00213        case 2:
00214          /* We are now are in the body.  Each line
00215             must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
00216          if (nowtok == tok_eol)
00217            /* Ignore empty lines.  */
00218            continue;
00219 
00220          if (nowtok == tok_end)
00221            {
00222              state = 90;
00223              continue;
00224            }
00225 
00226          if (nowtok != tok_bsymbol)
00227            {
00228              lr_error (repfile,
00229                      _("syntax error in repertoire map definition: %s"),
00230                      _("no symbolic name given"));
00231 
00232              lr_ignore_rest (repfile, 0);
00233              continue;
00234            }
00235 
00236          /* If the previous line was not completely correct free the
00237             used memory.  */
00238          if (from_name != NULL)
00239            obstack_free (&result->mem_pool, from_name);
00240 
00241          from_name = (char *) obstack_copy0 (&result->mem_pool,
00242                                          now->val.str.startmb,
00243                                          now->val.str.lenmb);
00244          to_name = NULL;
00245 
00246          state = 3;
00247          continue;
00248 
00249        case 3:
00250          /* We have two possibilities: We can see an ellipsis or an
00251             encoding value.  */
00252          if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
00253              || nowtok == tok_ellipsis2)
00254            {
00255              ellipsis = nowtok;
00256              state = 4;
00257              continue;
00258            }
00259          /* FALLTHROUGH */
00260 
00261        case 5:
00262          /* We expect a value of the form <Uxxxx> or <Uxxxxxxxx> where
00263             the xxx mean a hexadecimal value.  */
00264          state = 2;
00265 
00266          errno = 0;
00267          if (nowtok != tok_ucs4)
00268            {
00269              lr_error (repfile,
00270                      _("syntax error in repertoire map definition: %s"),
00271                      _("no <Uxxxx> or <Uxxxxxxxx> value given"));
00272 
00273              lr_ignore_rest (repfile, 0);
00274              continue;
00275            }
00276 
00277          /* We've found a new valid definition.  */
00278          repertoire_new_char (repfile, &result->char_table,
00279                             &result->reverse_table, &result->mem_pool,
00280                             now->val.ucs4, from_name, to_name,
00281                             ellipsis != tok_ellipsis2);
00282 
00283          /* Ignore the rest of the line.  */
00284          lr_ignore_rest (repfile, 0);
00285 
00286          from_name = NULL;
00287          to_name = NULL;
00288 
00289          continue;
00290 
00291        case 4:
00292          if (nowtok != tok_bsymbol)
00293            {
00294              lr_error (repfile,
00295                      _("syntax error in repertoire map definition: %s"),
00296                      _("no symbolic name given for end of range"));
00297 
00298              lr_ignore_rest (repfile, 0);
00299              state = 2;
00300              continue;
00301            }
00302 
00303          /* Copy the to-name in a safe place.  */
00304          to_name = (char *) obstack_copy0 (&result->mem_pool,
00305                                        repfile->token.val.str.startmb,
00306                                        repfile->token.val.str.lenmb);
00307 
00308          state = 5;
00309          continue;
00310 
00311        case 90:
00312          if (nowtok != tok_charids)
00313            lr_error (repfile, _("\
00314 %1$s: definition does not end with `END %1$s'"), "CHARIDS");
00315 
00316          lr_ignore_rest (repfile, nowtok == tok_charids);
00317          break;
00318        }
00319 
00320       break;
00321     }
00322 
00323   if (state != 2 && state != 90 && !be_quiet)
00324     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
00325                          repfile->fname));
00326 
00327   lr_close (repfile);
00328 
00329   if (tsearch (result, &known, &repertoire_compare) == NULL)
00330     /* Something went wrong.  */
00331     WITH_CUR_LOCALE (error (0, errno, _("cannot save new repertoire map")));
00332 
00333   return result;
00334 }
00335 
00336 
00337 void
00338 repertoire_complain (const char *name)
00339 {
00340   if (tfind (name, &unavailable, (__compar_fn_t) strcmp) == NULL)
00341     {
00342       WITH_CUR_LOCALE (error (0, errno, _("\
00343 repertoire map file `%s' not found"), name));
00344 
00345       /* Remember that we reported this map.  */
00346       tsearch (name, &unavailable, (__compar_fn_t) strcmp);
00347     }
00348 }
00349 
00350 
00351 static int
00352 repertoire_compare (const void *p1, const void *p2)
00353 {
00354   struct repertoire_t *r1 = (struct repertoire_t *) p1;
00355   struct repertoire_t *r2 = (struct repertoire_t *) p2;
00356 
00357   return strcmp (r1->name, r2->name);
00358 }
00359 
00360 
00361 static const struct keyword_t *
00362 repertoiremap_hash (const char *str, unsigned int len)
00363 {
00364   static const struct keyword_t wordlist[] =
00365   {
00366     {"escape_char",      tok_escape_char,     0},
00367     {"comment_char",     tok_comment_char,    0},
00368     {"CHARIDS",          tok_charids,         0},
00369     {"END",              tok_end,             0},
00370   };
00371 
00372   if (len == 11 && memcmp (wordlist[0].name, str, 11) == 0)
00373     return &wordlist[0];
00374   if (len == 12 && memcmp (wordlist[1].name, str, 12) == 0)
00375     return &wordlist[1];
00376   if (len == 7 && memcmp (wordlist[2].name, str, 7) == 0)
00377     return &wordlist[2];
00378   if (len == 3 && memcmp (wordlist[3].name, str, 3) == 0)
00379     return &wordlist[3];
00380 
00381   return NULL;
00382 }
00383 
00384 
00385 static void
00386 repertoire_new_char (struct linereader *lr, hash_table *ht, hash_table *rt,
00387                    struct obstack *ob, uint32_t value, const char *from,
00388                    const char *to, int decimal_ellipsis)
00389 {
00390   char *from_end;
00391   char *to_end;
00392   const char *cp;
00393   char *buf = NULL;
00394   int prefix_len, len1, len2;
00395   unsigned long int from_nr, to_nr, cnt;
00396 
00397   if (to == NULL)
00398     {
00399       insert_entry (ht, from, strlen (from),
00400                   (void *) (unsigned long int) value);
00401       /* Please note that it isn't a bug if a symbol is defined more
00402         than once.  All later definitions are simply discarded.  */
00403 
00404       insert_entry (rt, obstack_copy (ob, &value, sizeof (value)),
00405                   sizeof (value), (void *) from);
00406 
00407       return;
00408     }
00409 
00410   /* We have a range: the names must have names with equal prefixes
00411      and an equal number of digits, where the second number is greater
00412      or equal than the first.  */
00413   len1 = strlen (from);
00414   len2 = strlen (to);
00415 
00416   if (len1 != len2)
00417     {
00418     invalid_range:
00419       lr_error (lr, _("invalid names for character range"));
00420       return;
00421     }
00422 
00423   cp = &from[len1 - 1];
00424   if (decimal_ellipsis)
00425     while (isdigit (*cp) && cp >= from)
00426       --cp;
00427   else
00428     while (isxdigit (*cp) && cp >= from)
00429       {
00430        if (!isdigit (*cp) && !isupper (*cp))
00431          lr_error (lr, _("\
00432 hexadecimal range format should use only capital characters"));
00433        --cp;
00434       }
00435 
00436   prefix_len = (cp - from) + 1;
00437 
00438   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
00439     goto invalid_range;
00440 
00441   errno = 0;
00442   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
00443   if (*from_end != '\0' || (from_nr == ULONG_MAX && errno == ERANGE)
00444       || ((to_nr = strtoul (&to[prefix_len], &to_end,
00445                          decimal_ellipsis ? 10 : 16)) == ULONG_MAX
00446           && errno == ERANGE)
00447       || *to_end != '\0')
00448     {
00449       lr_error (lr, _("<%s> and <%s> are invalid names for range"),
00450               from, to);
00451       return;
00452     }
00453 
00454   if (from_nr > to_nr)
00455     {
00456       lr_error (lr, _("upper limit in range is smaller than lower limit"));
00457       return;
00458     }
00459 
00460   for (cnt = from_nr; cnt <= to_nr; ++cnt)
00461     {
00462       uint32_t this_value = value + (cnt - from_nr);
00463 
00464       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*ld" : "%.*s%0*lX",
00465                     prefix_len, from, len1 - prefix_len, cnt);
00466       obstack_1grow (ob, '\0');
00467 
00468       insert_entry (ht, buf, len1,
00469                   (void *) (unsigned long int) this_value);
00470       /* Please note we don't examine the return value since it is no error
00471         if we have two definitions for a symbol.  */
00472 
00473       insert_entry (rt, obstack_copy (ob, &this_value, sizeof (this_value)),
00474                   sizeof (this_value), (void *) from);
00475     }
00476 }
00477 
00478 
00479 uint32_t
00480 repertoire_find_value (const struct repertoire_t *rep, const char *name,
00481                      size_t len)
00482 {
00483   void *result;
00484 
00485   if (rep == NULL)
00486     return ILLEGAL_CHAR_VALUE;
00487 
00488   if (find_entry ((hash_table *) &rep->char_table, name, len, &result) < 0)
00489     return ILLEGAL_CHAR_VALUE;
00490 
00491   return (uint32_t) ((unsigned long int) result);
00492 }
00493 
00494 
00495 const char *
00496 repertoire_find_symbol (const struct repertoire_t *rep, uint32_t ucs)
00497 {
00498   void *result;
00499 
00500   if (rep == NULL)
00501     return NULL;
00502 
00503   if (find_entry ((hash_table *) &rep->reverse_table, &ucs, sizeof (ucs),
00504                 &result) < 0)
00505     return NULL;
00506 
00507   return (const char *) result;
00508 }
00509 
00510 
00511 struct charseq *
00512 repertoire_find_seq (const struct repertoire_t *rep, uint32_t ucs)
00513 {
00514   void *result;
00515 
00516   if (rep == NULL)
00517     return NULL;
00518 
00519   if (find_entry ((hash_table *) &rep->seq_table, &ucs, sizeof (ucs),
00520                 &result) < 0)
00521     return NULL;
00522 
00523   return (struct charseq *) result;
00524 }