Back to index

glibc  2.9
charmap.c
Go to the documentation of this file.
00001 /* Copyright (C) 1996, 1998-2004,2005, 2006 Free Software Foundation, Inc.
00002    This file is part of the GNU C Library.
00003    Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
00004 
00005    This program is free software; you can redistribute it and/or modify
00006    it under the terms of the GNU General Public License as published
00007    by the Free Software Foundation; version 2 of the License, or
00008    (at your option) any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013    GNU General Public License for more details.
00014 
00015    You should have received a copy of the GNU General Public License
00016    along with this program; if not, write to the Free Software Foundation,
00017    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
00018 
00019 #ifdef HAVE_CONFIG_H
00020 # include <config.h>
00021 #endif
00022 
00023 #include <ctype.h>
00024 #include <errno.h>
00025 #include <libintl.h>
00026 #include <limits.h>
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 #include <error.h>
00031 
00032 #include "localedef.h"
00033 #include "linereader.h"
00034 #include "charmap.h"
00035 #include "charmap-dir.h"
00036 
00037 #include <assert.h>
00038 
00039 
00040 /* Define the lookup function.  */
00041 #include "charmap-kw.h"
00042 
00043 
/* Prototypes for local functions.  */

/* Parse the character map description read through CMFILE and return
   the resulting charmap structure.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                   int verbose, int be_quiet);
/* Record a WIDTH rule for the character FROM, or for the range FROM..TO
   when TO is not NULL.  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                     const char *from, const char *to,
                     unsigned long int width);
/* Called by parse_charmap for every accepted CHARMAP body line; the
   definition is outside this part of the file.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                           size_t nbytes, unsigned char *bytes,
                           const char *from, const char *to,
                           int decimal_ellipsis, int step);


/* Set to true by charmap_read when the loaded character map fails the
   ASCII compatibility test for the ISO C basic character set.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
/* Only present when NEED_NULL_POINTER is defined.
   NOTE(review): no use is visible in this part of the file.  */
static const char *null_pointer;
#endif
00062 
00063 static struct linereader *
00064 cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
00065 {
00066   FILE *fp;
00067 
00068   fp = charmap_open (directory, name);
00069   if (fp == NULL)
00070     return NULL;
00071   else
00072     {
00073       size_t dlen = strlen (directory);
00074       int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
00075       size_t nlen = strlen (name);
00076       char *pathname;
00077       char *p;
00078 
00079       pathname = alloca (dlen + add_slash + nlen + 1);
00080       p = stpcpy (pathname, directory);
00081       if (add_slash)
00082        *p++ = '/';
00083       stpcpy (p, name);
00084 
00085       return lr_create (fp, pathname, hf);
00086     }
00087 }
00088 
00089 struct charmap_t *
00090 charmap_read (const char *filename, int verbose, int error_not_found,
00091              int be_quiet, int use_default)
00092 {
00093   struct charmap_t *result = NULL;
00094 
00095   if (filename != NULL)
00096     {
00097       struct linereader *cmfile;
00098 
00099       /* First try the name as found in the parameter.  */
00100       cmfile = lr_open (filename, charmap_hash);
00101       if (cmfile == NULL)
00102        {
00103          /* No successful.  So start looking through the directories
00104             in the I18NPATH if this is a simple name.  */
00105          if (strchr (filename, '/') == NULL)
00106            {
00107              char *i18npath = getenv ("I18NPATH");
00108              if (i18npath != NULL && *i18npath != '\0')
00109               {
00110                 const size_t pathlen = strlen (i18npath);
00111                 char i18npathbuf[pathlen + 1];
00112                 char path[pathlen + sizeof ("/charmaps")];
00113                 char *next;
00114                 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
00115 
00116                 while (cmfile == NULL
00117                       && (next = strsep (&i18npath, ":")) != NULL)
00118                   {
00119                     stpcpy (stpcpy (path, next), "/charmaps");
00120                     cmfile = cmlr_open (path, filename, charmap_hash);
00121 
00122                     if (cmfile == NULL)
00123                      /* Try without the "/charmaps" part.  */
00124                      cmfile = cmlr_open (next, filename, charmap_hash);
00125                   }
00126               }
00127 
00128              if (cmfile == NULL)
00129               /* Try the default directory.  */
00130               cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
00131            }
00132        }
00133 
00134       if (cmfile != NULL)
00135        result = parse_charmap (cmfile, verbose, be_quiet);
00136 
00137       if (result == NULL && error_not_found)
00138        WITH_CUR_LOCALE (error (0, errno, _("\
00139 character map file `%s' not found"), filename));
00140     }
00141 
00142   if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
00143     {
00144       /* OK, one more try.  We also accept the names given to the
00145         character sets in the files.  Sometimes they differ from the
00146         file name.  */
00147       CHARMAP_DIR *dir;
00148 
00149       dir = charmap_opendir (CHARMAP_PATH);
00150       if (dir != NULL)
00151        {
00152          const char *dirent;
00153 
00154          while ((dirent = charmap_readdir (dir)) != NULL)
00155            {
00156              char **aliases;
00157              char **p;
00158              int found;
00159 
00160              aliases = charmap_aliases (CHARMAP_PATH, dirent);
00161              found = 0;
00162              for (p = aliases; *p; p++)
00163               if (strcasecmp (*p, filename) == 0)
00164                 {
00165                   found = 1;
00166                   break;
00167                 }
00168              charmap_free_aliases (aliases);
00169 
00170              if (found)
00171               {
00172                 struct linereader *cmfile;
00173 
00174                 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
00175                 if (cmfile != NULL)
00176                   result = parse_charmap (cmfile, verbose, be_quiet);
00177 
00178                 break;
00179               }
00180            }
00181 
00182          charmap_closedir (dir);
00183        }
00184     }
00185 
00186   if (result == NULL && DEFAULT_CHARMAP != NULL)
00187     {
00188       struct linereader *cmfile;
00189 
00190       cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
00191       if (cmfile != NULL)
00192        result = parse_charmap (cmfile, verbose, be_quiet);
00193 
00194       if (result == NULL)
00195        WITH_CUR_LOCALE (error (4, errno, _("\
00196 default character map file `%s' not found"), DEFAULT_CHARMAP));
00197     }
00198 
00199   if (result != NULL && result->code_set_name == NULL)
00200     /* The input file does not specify a code set name.  This
00201        shouldn't happen but we should cope with it.  */
00202     result->code_set_name = basename (filename);
00203 
00204   /* Test of ASCII compatibility of locale encoding.
00205 
00206      Verify that the encoding to be used in a locale is ASCII compatible,
00207      at least for the graphic characters, excluding the control characters,
00208      '$' and '@'.  This constraint comes from an ISO C 99 restriction.
00209 
00210      ISO C 99 section 7.17.(2) (about wchar_t):
00211        the null character shall have the code value zero and each member of
00212        the basic character set shall have a code value equal to its value
00213        when used as the lone character in an integer character constant.
00214      ISO C 99 section 5.2.1.(3):
00215        Both the basic source and basic execution character sets shall have
00216        the following members: the 26 uppercase letters of the Latin alphabet
00217             A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
00218        the 26 lowercase letters of the Latin alphabet
00219             a b c d e f g h i j k l m n o p q r s t u v w x y z
00220        the 10 decimal digits
00221             0 1 2 3 4 5 6 7 8 9
00222        the following 29 graphic characters
00223             ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
00224        the space character, and control characters representing horizontal
00225        tab, vertical tab, and form feed.
00226 
00227      Therefore, for all members of the "basic character set", the 'char' code
00228      must have the same value as the 'wchar_t' code, which in glibc is the
00229      same as the Unicode code, which for all of the enumerated characters
00230      is identical to the ASCII code. */
00231   if (result != NULL && use_default)
00232     {
00233       static const char basic_charset[] =
00234        {
00235          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
00236          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
00237          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
00238          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
00239          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
00240          '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
00241          '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
00242          '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
00243        };
00244       int failed = 0;
00245       const char *p = basic_charset;
00246 
00247       do
00248        {
00249          struct charseq *seq = charmap_find_symbol (result, p, 1);
00250 
00251          if (seq == NULL || seq->ucs4 != (uint32_t) *p)
00252            failed = 1;
00253        }
00254       while (*p++ != '\0');
00255 
00256       if (failed)
00257        {
00258          WITH_CUR_LOCALE (fprintf (stderr, _("\
00259 character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
00260                                 result->code_set_name));
00261          enc_not_ascii_compatible = true;
00262        }
00263     }
00264 
00265   return result;
00266 }
00267 
00268 
00269 static struct charmap_t *
00270 parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
00271 {
00272   struct charmap_t *result;
00273   int state;
00274   enum token_t expected_tok = tok_error;
00275   const char *expected_str = NULL;
00276   char *from_name = NULL;
00277   char *to_name = NULL;
00278   enum token_t ellipsis = 0;
00279   int step = 1;
00280 
00281   /* We don't want symbolic names in string to be translated.  */
00282   cmfile->translate_strings = 0;
00283 
00284   /* Allocate room for result.  */
00285   result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
00286   memset (result, '\0', sizeof (struct charmap_t));
00287   /* The default DEFAULT_WIDTH is 1.  */
00288   result->width_default = 1;
00289 
00290 #define obstack_chunk_alloc malloc
00291 #define obstack_chunk_free free
00292   obstack_init (&result->mem_pool);
00293 
00294   if (init_hash (&result->char_table, 256)
00295       || init_hash (&result->byte_table, 256))
00296     {
00297       free (result);
00298       return NULL;
00299     }
00300 
00301   /* We use a state machine to describe the charmap description file
00302      format.  */
00303   state = 1;
00304   while (1)
00305     {
00306       /* What's on?  */
00307       struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
00308       enum token_t nowtok = now->tok;
00309       struct token *arg;
00310 
00311       if (nowtok == tok_eof)
00312        break;
00313 
00314       switch (state)
00315        {
00316        case 1:
00317          /* The beginning.  We expect the special declarations, EOL or
00318             `CHARMAP'.  */
00319          if (nowtok == tok_eol)
00320            /* Ignore empty lines.  */
00321            continue;
00322 
00323          if (nowtok == tok_charmap)
00324            {
00325              from_name = NULL;
00326              to_name = NULL;
00327 
00328              /* We have to set up the real work.  Fill in some
00329                default values.  */
00330              if (result->mb_cur_max == 0)
00331               result->mb_cur_max = 1;
00332              if (result->mb_cur_min == 0)
00333               result->mb_cur_min = result->mb_cur_max;
00334              if (result->mb_cur_min > result->mb_cur_max)
00335               {
00336                 if (!be_quiet)
00337                   WITH_CUR_LOCALE (error (0, 0, _("\
00338 %s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
00339                                        cmfile->fname));
00340 
00341                 result->mb_cur_min = result->mb_cur_max;
00342               }
00343 
00344              lr_ignore_rest (cmfile, 1);
00345 
00346              state = 2;
00347              continue;
00348            }
00349 
00350          if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
00351              && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
00352              && nowtok != tok_comment_char && nowtok != tok_g0esc
00353              && nowtok != tok_g1esc && nowtok != tok_g2esc
00354              && nowtok != tok_g3esc && nowtok != tok_repertoiremap
00355              && nowtok != tok_include)
00356            {
00357              lr_error (cmfile, _("syntax error in prolog: %s"),
00358                      _("invalid definition"));
00359 
00360              lr_ignore_rest (cmfile, 0);
00361              continue;
00362            }
00363 
00364          /* We know that we need an argument.  */
00365          arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
00366 
00367          switch (nowtok)
00368            {
00369            case tok_code_set_name:
00370            case tok_repertoiremap:
00371              if (arg->tok != tok_ident && arg->tok != tok_string)
00372               {
00373               badarg:
00374                 lr_error (cmfile, _("syntax error in prolog: %s"),
00375                          _("bad argument"));
00376 
00377                 lr_ignore_rest (cmfile, 0);
00378                 continue;
00379               }
00380 
00381              if (nowtok == tok_code_set_name)
00382               result->code_set_name = obstack_copy0 (&result->mem_pool,
00383                                                  arg->val.str.startmb,
00384                                                  arg->val.str.lenmb);
00385              else
00386               result->repertoiremap = obstack_copy0 (&result->mem_pool,
00387                                                  arg->val.str.startmb,
00388                                                  arg->val.str.lenmb);
00389 
00390              lr_ignore_rest (cmfile, 1);
00391              continue;
00392 
00393            case tok_mb_cur_max:
00394            case tok_mb_cur_min:
00395              if (arg->tok != tok_number)
00396               goto badarg;
00397 
00398              if (verbose
00399                 && ((nowtok == tok_mb_cur_max
00400                      && result->mb_cur_max != 0)
00401                     || (nowtok == tok_mb_cur_max
00402                        && result->mb_cur_max != 0)))
00403               lr_error (cmfile, _("duplicate definition of <%s>"),
00404                        nowtok == tok_mb_cur_min
00405                        ? "mb_cur_min" : "mb_cur_max");
00406 
00407              if (arg->val.num < 1)
00408               {
00409                 lr_error (cmfile,
00410                          _("value for <%s> must be 1 or greater"),
00411                          nowtok == tok_mb_cur_min
00412                          ? "mb_cur_min" : "mb_cur_max");
00413 
00414                 lr_ignore_rest (cmfile, 0);
00415                 continue;
00416               }
00417              if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
00418                  && (int) arg->val.num < result->mb_cur_min)
00419                 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
00420                     && (int) arg->val.num > result->mb_cur_max))
00421               {
00422                 lr_error (cmfile, _("\
00423 value of <%s> must be greater or equal than the value of <%s>"),
00424                          "mb_cur_max", "mb_cur_min");
00425 
00426                 lr_ignore_rest (cmfile, 0);
00427                 continue;
00428               }
00429 
00430              if (nowtok == tok_mb_cur_max)
00431               result->mb_cur_max = arg->val.num;
00432              else
00433               result->mb_cur_min = arg->val.num;
00434 
00435              lr_ignore_rest (cmfile, 1);
00436              continue;
00437 
00438            case tok_escape_char:
00439            case tok_comment_char:
00440              if (arg->tok != tok_ident)
00441               goto badarg;
00442 
00443              if (arg->val.str.lenmb != 1)
00444               {
00445                 lr_error (cmfile, _("\
00446 argument to <%s> must be a single character"),
00447                          nowtok == tok_escape_char ? "escape_char"
00448                                                 : "comment_char");
00449 
00450                 lr_ignore_rest (cmfile, 0);
00451                 continue;
00452               }
00453 
00454              if (nowtok == tok_escape_char)
00455               cmfile->escape_char = *arg->val.str.startmb;
00456              else
00457               cmfile->comment_char = *arg->val.str.startmb;
00458 
00459              lr_ignore_rest (cmfile, 1);
00460              continue;
00461 
00462            case tok_g0esc:
00463            case tok_g1esc:
00464            case tok_g2esc:
00465            case tok_g3esc:
00466            case tok_escseq:
00467              lr_ignore_rest (cmfile, 0); /* XXX */
00468              continue;
00469 
00470            case tok_include:
00471              lr_error (cmfile, _("\
00472 character sets with locking states are not supported"));
00473              exit (4);
00474 
00475            default:
00476              /* Cannot happen.  */
00477              assert (! "Should not happen");
00478            }
00479          break;
00480 
00481        case 2:
00482          /* We have seen `CHARMAP' and now are in the body.  Each line
00483             must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
00484          if (nowtok == tok_eol)
00485            /* Ignore empty lines.  */
00486            continue;
00487 
00488          if (nowtok == tok_end)
00489            {
00490              expected_tok = tok_charmap;
00491              expected_str = "CHARMAP";
00492              state = 90;
00493              continue;
00494            }
00495 
00496          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00497            {
00498              lr_error (cmfile, _("syntax error in %s definition: %s"),
00499                      "CHARMAP", _("no symbolic name given"));
00500 
00501              lr_ignore_rest (cmfile, 0);
00502              continue;
00503            }
00504 
00505          /* If the previous line was not completely correct free the
00506             used memory.  */
00507          if (from_name != NULL)
00508            obstack_free (&result->mem_pool, from_name);
00509 
00510          if (nowtok == tok_bsymbol)
00511            from_name = (char *) obstack_copy0 (&result->mem_pool,
00512                                           now->val.str.startmb,
00513                                           now->val.str.lenmb);
00514          else
00515            {
00516              obstack_printf (&result->mem_pool, "U%08X",
00517                            cmfile->token.val.ucs4);
00518              obstack_1grow (&result->mem_pool, '\0');
00519              from_name = (char *) obstack_finish (&result->mem_pool);
00520            }
00521          to_name = NULL;
00522 
00523          state = 3;
00524          continue;
00525 
00526        case 3:
00527          /* We have two possibilities: We can see an ellipsis or an
00528             encoding value.  */
00529          if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
00530              || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
00531              || nowtok == tok_ellipsis2_2)
00532            {
00533              ellipsis = nowtok;
00534              if (nowtok == tok_ellipsis4_2)
00535               {
00536                 step = 2;
00537                 nowtok = tok_ellipsis4;
00538               }
00539              else if (nowtok == tok_ellipsis2_2)
00540               {
00541                 step = 2;
00542                 nowtok = tok_ellipsis2;
00543               }
00544              state = 4;
00545              continue;
00546            }
00547          /* FALLTHROUGH */
00548 
00549        case 5:
00550          if (nowtok != tok_charcode)
00551            {
00552              lr_error (cmfile, _("syntax error in %s definition: %s"),
00553                      "CHARMAP", _("invalid encoding given"));
00554 
00555              lr_ignore_rest (cmfile, 0);
00556 
00557              state = 2;
00558              continue;
00559            }
00560 
00561          if (now->val.charcode.nbytes < result->mb_cur_min)
00562            lr_error (cmfile, _("too few bytes in character encoding"));
00563          else if (now->val.charcode.nbytes > result->mb_cur_max)
00564            lr_error (cmfile, _("too many bytes in character encoding"));
00565          else
00566            charmap_new_char (cmfile, result, now->val.charcode.nbytes,
00567                            now->val.charcode.bytes, from_name, to_name,
00568                            ellipsis != tok_ellipsis2, step);
00569 
00570          /* Ignore trailing comment silently.  */
00571          lr_ignore_rest (cmfile, 0);
00572 
00573          from_name = NULL;
00574          to_name = NULL;
00575          ellipsis = tok_none;
00576          step = 1;
00577 
00578          state = 2;
00579          continue;
00580 
00581        case 4:
00582          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00583            {
00584              lr_error (cmfile, _("syntax error in %s definition: %s"),
00585                      "CHARMAP",
00586                      _("no symbolic name given for end of range"));
00587 
00588              lr_ignore_rest (cmfile, 0);
00589              continue;
00590            }
00591 
00592          /* Copy the to-name in a safe place.  */
00593          if (nowtok == tok_bsymbol)
00594            to_name = (char *) obstack_copy0 (&result->mem_pool,
00595                                          cmfile->token.val.str.startmb,
00596                                          cmfile->token.val.str.lenmb);
00597          else
00598            {
00599              obstack_printf (&result->mem_pool, "U%08X",
00600                            cmfile->token.val.ucs4);
00601              obstack_1grow (&result->mem_pool, '\0');
00602              to_name = (char *) obstack_finish (&result->mem_pool);
00603            }
00604 
00605          state = 5;
00606          continue;
00607 
00608        case 90:
00609          if (nowtok != expected_tok)
00610            lr_error (cmfile, _("\
00611 %1$s: definition does not end with `END %1$s'"), expected_str);
00612 
00613          lr_ignore_rest (cmfile, nowtok == expected_tok);
00614          state = 91;
00615          continue;
00616 
00617        case 91:
00618          /* Waiting for WIDTH... */
00619          if (nowtok == tok_eol)
00620            /* Ignore empty lines.  */
00621            continue;
00622 
00623          if (nowtok == tok_width_default)
00624            {
00625              state = 92;
00626              continue;
00627            }
00628 
00629          if (nowtok == tok_width)
00630            {
00631              lr_ignore_rest (cmfile, 1);
00632              state = 93;
00633              continue;
00634            }
00635 
00636          if (nowtok == tok_width_variable)
00637            {
00638              lr_ignore_rest (cmfile, 1);
00639              state = 98;
00640              continue;
00641            }
00642 
00643          lr_error (cmfile, _("\
00644 only WIDTH definitions are allowed to follow the CHARMAP definition"));
00645 
00646          lr_ignore_rest (cmfile, 0);
00647          continue;
00648 
00649        case 92:
00650          if (nowtok != tok_number)
00651            lr_error (cmfile, _("value for %s must be an integer"),
00652                     "WIDTH_DEFAULT");
00653          else
00654            result->width_default = now->val.num;
00655 
00656          lr_ignore_rest (cmfile, nowtok == tok_number);
00657 
00658          state = 91;
00659          continue;
00660 
00661        case 93:
00662          /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
00663             "%s...%s %d\n".  */
00664          if (nowtok == tok_eol)
00665            /* ignore empty lines.  */
00666            continue;
00667 
00668          if (nowtok == tok_end)
00669            {
00670              expected_tok = tok_width;
00671              expected_str = "WIDTH";
00672              state = 90;
00673              continue;
00674            }
00675 
00676          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00677            {
00678              lr_error (cmfile, _("syntax error in %s definition: %s"),
00679                      "WIDTH", _("no symbolic name given"));
00680 
00681              lr_ignore_rest (cmfile, 0);
00682              continue;
00683            }
00684 
00685          if (from_name != NULL)
00686            obstack_free (&result->mem_pool, from_name);
00687 
00688          if (nowtok == tok_bsymbol)
00689            from_name = (char *) obstack_copy0 (&result->mem_pool,
00690                                           now->val.str.startmb,
00691                                           now->val.str.lenmb);
00692          else
00693            {
00694              obstack_printf (&result->mem_pool, "U%08X",
00695                            cmfile->token.val.ucs4);
00696              obstack_1grow (&result->mem_pool, '\0');
00697              from_name = (char *) obstack_finish (&result->mem_pool);
00698            }
00699 
00700          to_name = NULL;
00701 
00702          state = 94;
00703          continue;
00704 
00705        case 94:
00706          if (nowtok == tok_ellipsis3)
00707            {
00708              state = 95;
00709              continue;
00710            }
00711 
00712        case 96:
00713          if (nowtok != tok_number)
00714            lr_error (cmfile, _("value for %s must be an integer"),
00715                     "WIDTH");
00716          else
00717            {
00718              /* Store width for chars.  */
00719              new_width (cmfile, result, from_name, to_name, now->val.num);
00720 
00721              from_name = NULL;
00722              to_name = NULL;
00723            }
00724 
00725          lr_ignore_rest (cmfile, nowtok == tok_number);
00726 
00727          state = 93;
00728          continue;
00729 
00730        case 95:
00731          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00732            {
00733              lr_error (cmfile, _("syntax error in %s definition: %s"),
00734                      "WIDTH", _("no symbolic name given for end of range"));
00735 
00736              lr_ignore_rest (cmfile, 0);
00737 
00738              state = 93;
00739              continue;
00740            }
00741 
00742          if (nowtok == tok_bsymbol)
00743            to_name = (char *) obstack_copy0 (&result->mem_pool,
00744                                          now->val.str.startmb,
00745                                          now->val.str.lenmb);
00746          else
00747            {
00748              obstack_printf (&result->mem_pool, "U%08X",
00749                            cmfile->token.val.ucs4);
00750              obstack_1grow (&result->mem_pool, '\0');
00751              to_name = (char *) obstack_finish (&result->mem_pool);
00752            }
00753 
00754          state = 96;
00755          continue;
00756 
00757        case 98:
00758          /* We now expect `END WIDTH_VARIABLE' or lines of the format
00759             "%s\n" or "%s...%s\n".  */
00760          if (nowtok == tok_eol)
00761            /* ignore empty lines.  */
00762            continue;
00763 
00764          if (nowtok == tok_end)
00765            {
00766              expected_tok = tok_width_variable;
00767              expected_str = "WIDTH_VARIABLE";
00768              state = 90;
00769              continue;
00770            }
00771 
00772          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00773            {
00774              lr_error (cmfile, _("syntax error in %s definition: %s"),
00775                      "WIDTH_VARIABLE", _("no symbolic name given"));
00776 
00777              lr_ignore_rest (cmfile, 0);
00778 
00779              continue;
00780            }
00781 
00782          if (from_name != NULL)
00783            obstack_free (&result->mem_pool, from_name);
00784 
00785          if (nowtok == tok_bsymbol)
00786            from_name = (char *) obstack_copy0 (&result->mem_pool,
00787                                           now->val.str.startmb,
00788                                           now->val.str.lenmb);
00789          else
00790            {
00791              obstack_printf (&result->mem_pool, "U%08X",
00792                            cmfile->token.val.ucs4);
00793              obstack_1grow (&result->mem_pool, '\0');
00794              from_name = (char *) obstack_finish (&result->mem_pool);
00795            }
00796          to_name = NULL;
00797 
00798          state = 99;
00799          continue;
00800 
00801        case 99:
00802          if (nowtok == tok_ellipsis3)
00803            state = 100;
00804 
00805          /* Store info.  */
00806          from_name = NULL;
00807 
00808          /* Warn */
00809          state = 98;
00810          continue;
00811 
00812        case 100:
00813          if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
00814            {
00815              lr_error (cmfile, _("syntax error in %s definition: %s"),
00816                      "WIDTH_VARIABLE",
00817                      _("no symbolic name given for end of range"));
00818              lr_ignore_rest (cmfile, 0);
00819              continue;
00820            }
00821 
00822          if (nowtok == tok_bsymbol)
00823            to_name = (char *) obstack_copy0 (&result->mem_pool,
00824                                          now->val.str.startmb,
00825                                          now->val.str.lenmb);
00826          else
00827            {
00828              obstack_printf (&result->mem_pool, "U%08X",
00829                            cmfile->token.val.ucs4);
00830              obstack_1grow (&result->mem_pool, '\0');
00831              to_name = (char *) obstack_finish (&result->mem_pool);
00832            }
00833 
00834          /* XXX Enter value into table.  */
00835 
00836          lr_ignore_rest (cmfile, 1);
00837 
00838          state = 98;
00839          continue;
00840 
00841        default:
00842          WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
00843                               __FILE__));
00844          /* NOTREACHED */
00845        }
00846       break;
00847     }
00848 
00849   if (state != 91 && !be_quiet)
00850     WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
00851                          cmfile->fname));
00852 
00853   lr_close (cmfile);
00854 
00855   return result;
00856 }
00857 
00858 
00859 static void
00860 new_width (struct linereader *cmfile, struct charmap_t *result,
00861           const char *from, const char *to, unsigned long int width)
00862 {
00863   struct charseq *from_val;
00864   struct charseq *to_val;
00865 
00866   from_val = charmap_find_value (result, from, strlen (from));
00867   if (from_val == NULL)
00868     {
00869       lr_error (cmfile, _("unknown character `%s'"), from);
00870       return;
00871     }
00872 
00873   if (to == NULL)
00874     to_val = from_val;
00875   else
00876     {
00877       to_val = charmap_find_value (result, to, strlen (to));
00878       if (to_val == NULL)
00879        {
00880          lr_error (cmfile, _("unknown character `%s'"), to);
00881          return;
00882        }
00883 
00884       /* Make sure the number of bytes for the end points of the range
00885         is correct.  */
00886       if (from_val->nbytes != to_val->nbytes)
00887        {
00888          lr_error (cmfile, _("\
00889 number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
00890                   from_val->nbytes, to_val->nbytes);
00891          return;
00892        }
00893     }
00894 
00895   if (result->nwidth_rules >= result->nwidth_rules_max)
00896     {
00897       size_t new_size = result->nwidth_rules + 32;
00898       struct width_rule *new_rules =
00899        (struct width_rule *) obstack_alloc (&result->mem_pool,
00900                                         (new_size
00901                                          * sizeof (struct width_rule)));
00902 
00903       memcpy (new_rules, result->width_rules,
00904              result->nwidth_rules_max * sizeof (struct width_rule));
00905 
00906       result->width_rules = new_rules;
00907       result->nwidth_rules_max = new_size;
00908     }
00909 
00910   result->width_rules[result->nwidth_rules].from = from_val;
00911   result->width_rules[result->nwidth_rules].to = to_val;
00912   result->width_rules[result->nwidth_rules].width = (unsigned int) width;
00913   ++result->nwidth_rules;
00914 }
00915 
00916 
00917 struct charseq *
00918 charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
00919 {
00920   void *result;
00921 
00922   return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
00923          < 0 ? NULL : (struct charseq *) result);
00924 }
00925 
00926 
00927 static void
00928 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
00929                 size_t nbytes, unsigned char *bytes,
00930                 const char *from, const char *to,
00931                 int decimal_ellipsis, int step)
00932 {
00933   hash_table *ht = &cm->char_table;
00934   hash_table *bt = &cm->byte_table;
00935   struct obstack *ob = &cm->mem_pool;
00936   char *from_end;
00937   char *to_end;
00938   const char *cp;
00939   int prefix_len, len1, len2;
00940   unsigned int from_nr, to_nr, cnt;
00941   struct charseq *newp;
00942 
00943   len1 = strlen (from);
00944 
00945   if (to == NULL)
00946     {
00947       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
00948       newp->nbytes = nbytes;
00949       memcpy (newp->bytes, bytes, nbytes);
00950       newp->name = from;
00951 
00952       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
00953       if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
00954        {
00955          /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
00956             xxxx and xxxxxxxx are hexadecimal numbers.  In this case
00957             we use the value of xxxx or xxxxxxxx as the UCS4 value of
00958             this character and we don't have to consult the repertoire
00959             map.
00960 
00961             If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
00962             and xxxxxxxx also give the code point in UCS4 but this must
00963             be in the private, i.e., unassigned, area.  This should be
00964             used for characters which do not (yet) have an equivalent
00965             in ISO 10646 and Unicode.  */
00966          char *endp;
00967 
00968          errno = 0;
00969          newp->ucs4 = strtoul (from + 1, &endp, 16);
00970          if (endp - from != len1
00971              || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
00972              || newp->ucs4 >= 0x80000000)
00973            /* This wasn't successful.  Signal this name cannot be a
00974               correct UCS value.  */
00975            newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
00976        }
00977 
00978       insert_entry (ht, from, len1, newp);
00979       insert_entry (bt, newp->bytes, nbytes, newp);
00980       /* Please note that it isn't a bug if a symbol is defined more
00981         than once.  All later definitions are simply discarded.  */
00982       return;
00983     }
00984 
00985   /* We have a range: the names must have names with equal prefixes
00986      and an equal number of digits, where the second number is greater
00987      or equal than the first.  */
00988   len2 = strlen (to);
00989 
00990   if (len1 != len2)
00991     {
00992     illegal_range:
00993       lr_error (lr, _("invalid names for character range"));
00994       return;
00995     }
00996 
00997   cp = &from[len1 - 1];
00998   if (decimal_ellipsis)
00999     while (isdigit (*cp) && cp >= from)
01000       --cp;
01001   else
01002     while (isxdigit (*cp) && cp >= from)
01003       {
01004        if (!isdigit (*cp) && !isupper (*cp))
01005          lr_error (lr, _("\
01006 hexadecimal range format should use only capital characters"));
01007        --cp;
01008       }
01009 
01010   prefix_len = (cp - from) + 1;
01011 
01012   if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
01013     goto illegal_range;
01014 
01015   errno = 0;
01016   from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
01017   if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
01018       || ((to_nr = strtoul (&to[prefix_len], &to_end,
01019                          decimal_ellipsis ? 10 : 16)) == UINT_MAX
01020          && errno == ERANGE)
01021       || *to_end != '\0')
01022     {
01023       lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
01024       return;
01025     }
01026 
01027   if (from_nr > to_nr)
01028     {
01029       lr_error (lr, _("upper limit in range is smaller than lower limit"));
01030       return;
01031     }
01032 
01033   for (cnt = from_nr; cnt <= to_nr; cnt += step)
01034     {
01035       char *name_end;
01036       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
01037                     prefix_len, from, len1 - prefix_len, cnt);
01038       obstack_1grow (ob, '\0');
01039       name_end = obstack_finish (ob);
01040 
01041       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
01042       newp->nbytes = nbytes;
01043       memcpy (newp->bytes, bytes, nbytes);
01044       newp->name = name_end;
01045 
01046       newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
01047       if ((name_end[0] == 'U' || name_end[0] == 'P')
01048          && (len1 == 5 || len1 == 9))
01049        {
01050          /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
01051             xxxx and xxxxxxxx are hexadecimal numbers.  In this case
01052             we use the value of xxxx or xxxxxxxx as the UCS4 value of
01053             this character and we don't have to consult the repertoire
01054             map.
01055 
01056             If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
01057             and xxxxxxxx also give the code point in UCS4 but this must
01058             be in the private, i.e., unassigned, area.  This should be
01059             used for characters which do not (yet) have an equivalent
01060             in ISO 10646 and Unicode.  */
01061          char *endp;
01062 
01063          errno = 0;
01064          newp->ucs4 = strtoul (name_end + 1, &endp, 16);
01065          if (endp - name_end != len1
01066              || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
01067              || newp->ucs4 >= 0x80000000)
01068            /* This wasn't successful.  Signal this name cannot be a
01069               correct UCS value.  */
01070            newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
01071        }
01072 
01073       insert_entry (ht, name_end, len1, newp);
01074       insert_entry (bt, newp->bytes, nbytes, newp);
01075       /* Please note we don't examine the return value since it is no error
01076         if we have two definitions for a symbol.  */
01077 
01078       /* Increment the value in the byte sequence.  */
01079       if (++bytes[nbytes - 1] == '\0')
01080        {
01081          int b = nbytes - 2;
01082 
01083          do
01084            if (b < 0)
01085              {
01086               lr_error (lr,
01087                        _("resulting bytes for range not representable."));
01088               return;
01089              }
01090          while (++bytes[b--] == 0);
01091        }
01092     }
01093 }
01094 
01095 
01096 struct charseq *
01097 charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
01098                    size_t nbytes)
01099 {
01100   void *result;
01101 
01102   return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
01103          < 0 ? NULL : (struct charseq *) result);
01104 }