Back to index

glibc  2.9
gen-unicode-ctype.c
Go to the documentation of this file.
00001 /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
00002    Copyright (C) 2000-2001 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 /* Usage example:
00022      $ gen-unicode-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1
00023  */
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <stdbool.h>
00028 #include <string.h>
00029 #include <time.h>
00030 
/* The properties of a single code point, as given by one line of the
   UnicodeData.txt file.  String members hold the text of the
   corresponding field; the case mappings hold code point values.  */
struct unicode_attribute
{
  /* Character name.  */
  const char *name;
  /* General category.  */
  const char *category;
  /* Canonical combining classes.  */
  const char *combining;
  /* Bidirectional category.  */
  const char *bidi;
  /* Character decomposition mapping.  */
  const char *decomposition;
  /* Decimal digit value.  */
  const char *decdigit;
  /* Digit value.  */
  const char *digit;
  /* Numeric value.  */
  const char *numeric;
  /* Mirrored flag.  */
  int mirrored;
  /* Old Unicode 1.0 name.  */
  const char *oldname;
  /* Comment.  */
  const char *comment;
  /* Uppercase mapping.  */
  unsigned int upper;
  /* Lowercase mapping.  */
  unsigned int lower;
  /* Titlecase mapping.  */
  unsigned int title;
};
00049 
/* Sentinel for an absent character-valued field (all bits set).  Absent
   string-valued fields are represented by "" instead.  */
#define NONE (~0u)
00053 
00054 /* The entire contents of the UnicodeData.txt file.  */
00055 struct unicode_attribute unicode_attributes [0x110000];
00056 
00057 /* Stores in unicode_attributes[i] the values from the given fields.  */
00058 static void
00059 fill_attribute (unsigned int i,
00060               const char *field1, const char *field2,
00061               const char *field3, const char *field4,
00062               const char *field5, const char *field6,
00063               const char *field7, const char *field8,
00064               const char *field9, const char *field10,
00065               const char *field11, const char *field12,
00066               const char *field13, const char *field14)
00067 {
00068   struct unicode_attribute * uni;
00069 
00070   if (i >= 0x110000)
00071     {
00072       fprintf (stderr, "index too large\n");
00073       exit (1);
00074     }
00075   if (strcmp (field2, "Cs") == 0)
00076     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
00077     return;
00078   uni = &unicode_attributes[i];
00079   /* Copy the strings.  */
00080   uni->name          = strdup (field1);
00081   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
00082   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
00083   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
00084   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
00085   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
00086   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
00087   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
00088   uni->mirrored      = (field9[0] == 'Y');
00089   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
00090   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
00091   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
00092   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
00093   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
00094 }
00095 
/* Size of the buffers that receive one UnicodeData.txt field.  */
#define FIELDLEN 120

/* Reads one field from STREAM into BUFFER, which must provide FIELDLEN
   bytes.  Characters are consumed up to and including DELIM; DELIM
   itself is not stored.  Carriage returns are dropped.
   Returns 1 when DELIM was found (BUFFER is then NUL-terminated), and 0
   when the end of the stream was reached first (BUFFER is then not
   terminated).  Exits with status 1 if the field is too long.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int stored = 0;
  int ch;

  while ((ch = getc (stream)) != EOF && ch != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (ch == '\r')
        continue;

      /* Make sure there is room before appending the character.  */
      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      buffer[stored - 1] = (char) ch;
    }

  if (ch == EOF)
    return 0;

  buffer[stored] = '\0';
  return 1;
}
00130 
00131 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
00132    file.  */
00133 static void
00134 fill_attributes (const char *unicodedata_filename)
00135 {
00136   unsigned int i, j;
00137   FILE *stream;
00138   char field0[FIELDLEN];
00139   char field1[FIELDLEN];
00140   char field2[FIELDLEN];
00141   char field3[FIELDLEN];
00142   char field4[FIELDLEN];
00143   char field5[FIELDLEN];
00144   char field6[FIELDLEN];
00145   char field7[FIELDLEN];
00146   char field8[FIELDLEN];
00147   char field9[FIELDLEN];
00148   char field10[FIELDLEN];
00149   char field11[FIELDLEN];
00150   char field12[FIELDLEN];
00151   char field13[FIELDLEN];
00152   char field14[FIELDLEN];
00153   int lineno = 0;
00154 
00155   for (i = 0; i < 0x110000; i++)
00156     unicode_attributes[i].name = NULL;
00157 
00158   stream = fopen (unicodedata_filename, "r");
00159   if (stream == NULL)
00160     {
00161       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
00162       exit (1);
00163     }
00164 
00165   for (;;)
00166     {
00167       int n;
00168 
00169       lineno++;
00170       n = getfield (stream, field0, ';');
00171       n += getfield (stream, field1, ';');
00172       n += getfield (stream, field2, ';');
00173       n += getfield (stream, field3, ';');
00174       n += getfield (stream, field4, ';');
00175       n += getfield (stream, field5, ';');
00176       n += getfield (stream, field6, ';');
00177       n += getfield (stream, field7, ';');
00178       n += getfield (stream, field8, ';');
00179       n += getfield (stream, field9, ';');
00180       n += getfield (stream, field10, ';');
00181       n += getfield (stream, field11, ';');
00182       n += getfield (stream, field12, ';');
00183       n += getfield (stream, field13, ';');
00184       n += getfield (stream, field14, '\n');
00185       if (n == 0)
00186        break;
00187       if (n != 15)
00188        {
00189          fprintf (stderr, "short line in'%s':%d\n",
00190                  unicodedata_filename, lineno);
00191          exit (1);
00192        }
00193       i = strtoul (field0, NULL, 16);
00194       if (field1[0] == '<'
00195          && strlen (field1) >= 9
00196          && !strcmp (field1 + strlen(field1) - 8, ", First>"))
00197        {
00198          /* Deal with a range. */
00199          lineno++;
00200          n = getfield (stream, field0, ';');
00201          n += getfield (stream, field1, ';');
00202          n += getfield (stream, field2, ';');
00203          n += getfield (stream, field3, ';');
00204          n += getfield (stream, field4, ';');
00205          n += getfield (stream, field5, ';');
00206          n += getfield (stream, field6, ';');
00207          n += getfield (stream, field7, ';');
00208          n += getfield (stream, field8, ';');
00209          n += getfield (stream, field9, ';');
00210          n += getfield (stream, field10, ';');
00211          n += getfield (stream, field11, ';');
00212          n += getfield (stream, field12, ';');
00213          n += getfield (stream, field13, ';');
00214          n += getfield (stream, field14, '\n');
00215          if (n != 15)
00216            {
00217              fprintf (stderr, "missing end range in '%s':%d\n",
00218                      unicodedata_filename, lineno);
00219              exit (1);
00220            }
00221          if (!(field1[0] == '<'
00222               && strlen (field1) >= 8
00223               && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
00224            {
00225              fprintf (stderr, "missing end range in '%s':%d\n",
00226                      unicodedata_filename, lineno);
00227              exit (1);
00228            }
00229          field1[strlen (field1) - 7] = '\0';
00230          j = strtoul (field0, NULL, 16);
00231          for (; i <= j; i++)
00232            fill_attribute (i, field1+1, field2, field3, field4, field5,
00233                             field6, field7, field8, field9, field10,
00234                             field11, field12, field13, field14);
00235        }
00236       else
00237        {
00238          /* Single character line */
00239          fill_attribute (i, field1, field2, field3, field4, field5,
00240                           field6, field7, field8, field9, field10,
00241                           field11, field12, field13, field14);
00242        }
00243     }
00244   if (ferror (stream) || fclose (stream))
00245     {
00246       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
00247       exit (1);
00248     }
00249 }
00250 
00251 /* Character mappings.  */
00252 
00253 static unsigned int
00254 to_upper (unsigned int ch)
00255 {
00256   if (unicode_attributes[ch].name != NULL
00257       && unicode_attributes[ch].upper != NONE)
00258     return unicode_attributes[ch].upper;
00259   else
00260     return ch;
00261 }
00262 
00263 static unsigned int
00264 to_lower (unsigned int ch)
00265 {
00266   if (unicode_attributes[ch].name != NULL
00267       && unicode_attributes[ch].lower != NONE)
00268     return unicode_attributes[ch].lower;
00269   else
00270     return ch;
00271 }
00272 
00273 static unsigned int
00274 to_title (unsigned int ch)
00275 {
00276   if (unicode_attributes[ch].name != NULL
00277       && unicode_attributes[ch].title != NONE)
00278     return unicode_attributes[ch].title;
00279   else
00280     return ch;
00281 }
00282 
00283 /* Character class properties.  */
00284 
/* A character counts as uppercase when lowercasing changes it.  */
static bool
is_upper (unsigned int ch)
{
  const unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
00290 
/* A character counts as lowercase when uppercasing changes it.  */
static bool
is_lower (unsigned int ch)
{
  if (to_upper (ch) != ch)
    return true;

  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  return ch == 0x00DF;
}
00298 
00299 static bool
00300 is_alpha (unsigned int ch)
00301 {
00302   return (unicode_attributes[ch].name != NULL
00303          && ((unicode_attributes[ch].category[0] == 'L'
00304               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
00305                 <U0E2F>, <U0E46> should belong to is_punct.  */
00306               && (ch != 0x0E2F) && (ch != 0x0E46))
00307              /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
00308                <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
00309              || (ch == 0x0E31)
00310              || (ch >= 0x0E34 && ch <= 0x0E3A)
00311              || (ch >= 0x0E47 && ch <= 0x0E4E)
00312              /* Avoid warning for <U0345>.  */
00313              || (ch == 0x0345)
00314              /* Avoid warnings for <U2160>..<U217F>.  */
00315              || (unicode_attributes[ch].category[0] == 'N'
00316                 && unicode_attributes[ch].category[1] == 'l')
00317              /* Avoid warnings for <U24B6>..<U24E9>.  */
00318              || (unicode_attributes[ch].category[0] == 'S'
00319                 && unicode_attributes[ch].category[1] == 'o'
00320                 && strstr (unicode_attributes[ch].name, " LETTER ")
00321                    != NULL)
00322              /* Consider all the non-ASCII digits as alphabetic.
00323                ISO C 99 forbids us to have them in category "digit",
00324                but we want iswalnum to return true on them.  */
00325              || (unicode_attributes[ch].category[0] == 'N'
00326                 && unicode_attributes[ch].category[1] == 'd'
00327                 && !(ch >= 0x0030 && ch <= 0x0039))));
00328 }
00329 
/* Tells whether CH belongs to the "digit" class: exactly <U0030>..<U0039>.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The alternative, taking every character of general category Nd, is
   therefore not used.  Note that it would also need a <0> added by hand
   in front of U+0BE7..U+0BEF and U+1369..U+1371, which are digit systems
   without a zero.  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
00351 
/* Tells whether CH belongs to the "outdigit" class: <U0030>..<U0039>.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
00357 
00358 static bool
00359 is_blank (unsigned int ch)
00360 {
00361   return (ch == 0x0009 /* '\t' */
00362          /* Category Zs without mention of "<noBreak>" */
00363          || (unicode_attributes[ch].name != NULL
00364              && unicode_attributes[ch].category[0] == 'Z'
00365              && unicode_attributes[ch].category[1] == 's'
00366              && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
00367 }
00368 
00369 static bool
00370 is_space (unsigned int ch)
00371 {
00372   /* Don't make U+00A0 a space. Non-breaking space means that all programs
00373      should treat it like a punctuation character, not like a space. */
00374   return (ch == 0x0020 /* ' ' */
00375          || ch == 0x000C /* '\f' */
00376          || ch == 0x000A /* '\n' */
00377          || ch == 0x000D /* '\r' */
00378          || ch == 0x0009 /* '\t' */
00379          || ch == 0x000B /* '\v' */
00380          /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
00381          || (unicode_attributes[ch].name != NULL
00382              && unicode_attributes[ch].category[0] == 'Z'
00383              && (unicode_attributes[ch].category[1] == 'l'
00384                 || unicode_attributes[ch].category[1] == 'p'
00385                 || (unicode_attributes[ch].category[1] == 's'
00386                     && !strstr (unicode_attributes[ch].decomposition,
00387                               "<noBreak>")))));
00388 }
00389 
00390 static bool
00391 is_cntrl (unsigned int ch)
00392 {
00393   return (unicode_attributes[ch].name != NULL
00394          && (!strcmp (unicode_attributes[ch].name, "<control>")
00395              /* Categories Zl and Zp */
00396              || (unicode_attributes[ch].category[0] == 'Z'
00397                 && (unicode_attributes[ch].category[1] == 'l'
00398                     || unicode_attributes[ch].category[1] == 'p'))));
00399 }
00400 
/* Tells whether CH belongs to the "xdigit" class: 0-9, A-F and a-f only.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   The ranges are spelled out here instead of building on is_digit.  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039)
    return true;
  if (ch >= 0x0041 && ch <= 0x0046)
    return true;
  return ch >= 0x0061 && ch <= 0x0066;
}
00422 
00423 static bool
00424 is_graph (unsigned int ch)
00425 {
00426   return (unicode_attributes[ch].name != NULL
00427          && strcmp (unicode_attributes[ch].name, "<control>")
00428          && !is_space (ch));
00429 }
00430 
00431 static bool
00432 is_print (unsigned int ch)
00433 {
00434   return (unicode_attributes[ch].name != NULL
00435          && strcmp (unicode_attributes[ch].name, "<control>")
00436          /* Categories Zl and Zp */
00437          && !(unicode_attributes[ch].name != NULL
00438               && unicode_attributes[ch].category[0] == 'Z'
00439               && (unicode_attributes[ch].category[1] == 'l'
00440                  || unicode_attributes[ch].category[1] == 'p')));
00441 }
00442 
/* Tells whether CH belongs to the "punct" class.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  That definition is used here, rather
   than the Unicode general categories starting with 'P'.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
}
00455 
00456 static bool
00457 is_combining (unsigned int ch)
00458 {
00459   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
00460      file. In 3.0.1 it was identical to the union of the general categories
00461      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
00462      PropList.txt file, so we take the latter definition.  */
00463   return (unicode_attributes[ch].name != NULL
00464          && unicode_attributes[ch].category[0] == 'M'
00465          && (unicode_attributes[ch].category[1] == 'n'
00466              || unicode_attributes[ch].category[1] == 'c'
00467              || unicode_attributes[ch].category[1] == 'e'));
00468 }
00469 
00470 static bool
00471 is_combining_level3 (unsigned int ch)
00472 {
00473   return is_combining (ch)
00474         && !(unicode_attributes[ch].combining[0] != '\0'
00475              && unicode_attributes[ch].combining[0] != '0'
00476              && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
00477 }
00478 
/* Formats code point I as a UCS symbol: "<Uxxxx>" with four hexadecimal
   digits below 0x10000, "<Uxxxxxxxx>" with eight digits otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
00488 
/* Formats the interval LOW..HIGH as two UCS symbols joined by "..".
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  const char *sym;
  size_t len;

  /* ucs_symbol reuses one buffer, so the first symbol has to be copied
     out before the second one is produced.  */
  sym = ucs_symbol (low);
  len = strlen (sym);
  memcpy (buf, sym, len);
  memcpy (buf + len, "..", 2);
  strcpy (buf + len + 2, ucs_symbol (high));
  return buf;
}
00500 
/* Output a character class (= property) table.

   Writes to STREAM the text CLASSNAME, a space, and then the list of all
   code points below 0x110000 for which FUNC returns true.  Runs of
   consecutive members are written as "<Ulow>..<Uhigh>", single members
   as "<Ux>"; entries are separated by ';'.  The list is wrapped with a
   trailing '/' and a three-space continuation indent so that lines stay
   within 75 columns, and ends with a newline.  */

static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  /* Static storage: at 0x110000 bytes this table is larger than 1 MiB,
     which is too much for an automatic variable where the stack limit
     is that small.  The function is not reentrant as a consequence.  */
  static char table[0x110000];
  const size_t max_column = 75;
  size_t column;
  bool need_semicolon;
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    table[i] = (char) func (i);

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start beyond the limit so that the first entry goes onto a
     continuation line.  */
  column = 1000;

  i = 0;
  while (i < 0x110000)
    {
      unsigned int low, high;
      char buf[25];
      size_t len;

      if (!table[i])
        {
          i++;
          continue;
        }

      /* Find the end of the run of members starting at I.  */
      low = i;
      do
        i++;
      while (i < 0x110000 && table[i]);
      high = i - 1;

      if (low == high)
        strcpy (buf, ucs_symbol (low));
      else
        strcpy (buf, ucs_symbol_range (low, high));
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n   ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
00558 
/* Output a character mapping table.

   Writes to STREAM the text MAPNAME, a space, and then one entry
   "(<Ufrom>,<Uto>)" for every code point below 0x110000 that FUNC does
   not map to itself.  Entries are separated by ';'.  The list is wrapped
   with a trailing '/' and a three-space continuation indent so that
   lines stay within 75 columns, and ends with a newline.  */

static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  /* Static storage: at 0x110000 bytes this table is larger than 1 MiB,
     which is too much for an automatic variable where the stack limit
     is that small.  The function is not reentrant as a consequence.  */
  static char table[0x110000];
  const size_t max_column = 75;
  size_t column;
  bool need_semicolon;
  unsigned int i;

  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond the limit so that the first entry goes onto a
     continuation line.  */
  column = 1000;

  for (i = 0; i < 0x110000; i++)
    {
      char buf[25+1];
      size_t len;

      if (!table[i])
        continue;

      /* ucs_symbol reuses one buffer, so each symbol is appended before
         the next one is requested.  */
      strcpy (buf, "(");
      strcat (buf, ucs_symbol (i));
      strcat (buf, ",");
      strcat (buf, ucs_symbol (func (i)));
      strcat (buf, ")");
      len = strlen (buf);

      if (need_semicolon)
        {
          fprintf (stream, ";");
          column++;
        }

      if (column + len > max_column)
        {
          fprintf (stream, "/\n   ");
          column = 3;
        }

      fprintf (stream, "%s", buf);
      column += len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
00606 
/* Output the width table.  Currently a placeholder that writes nothing
   to STREAM.  */

static void
output_widthmap (FILE *stream)
{
  /* Nothing to emit; mark the parameter as deliberately unused.  */
  (void) stream;
}
00613 
00614 /* Output the tables to the given file.  */
00615 
00616 static void
00617 output_tables (const char *filename, const char *version)
00618 {
00619   FILE *stream;
00620   unsigned int ch;
00621 
00622   stream = fopen (filename, "w");
00623   if (stream == NULL)
00624     {
00625       fprintf (stderr, "cannot open '%s' for writing\n", filename);
00626       exit (1);
00627     }
00628 
00629   fprintf (stream, "escape_char /\n");
00630   fprintf (stream, "comment_char %%\n");
00631   fprintf (stream, "\n");
00632   fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
00633           version);
00634   fprintf (stream, "\n");
00635 
00636   fprintf (stream, "LC_IDENTIFICATION\n");
00637   fprintf (stream, "title     \"Unicode %s FDCC-set\"\n", version);
00638   fprintf (stream, "source    \"UnicodeData.txt, PropList.txt\"\n");
00639   fprintf (stream, "address   \"\"\n");
00640   fprintf (stream, "contact   \"\"\n");
00641   fprintf (stream, "email     \"bug-glibc-locales@gnu.org\"\n");
00642   fprintf (stream, "tel       \"\"\n");
00643   fprintf (stream, "fax       \"\"\n");
00644   fprintf (stream, "language  \"\"\n");
00645   fprintf (stream, "territory \"Earth\"\n");
00646   fprintf (stream, "revision  \"%s\"\n", version);
00647   {
00648     time_t now;
00649     char date[11];
00650     now = time (NULL);
00651     strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
00652     fprintf (stream, "date      \"%s\"\n", date);
00653   }
00654   fprintf (stream, "category  \"unicode:2001\";LC_CTYPE\n");
00655   fprintf (stream, "END LC_IDENTIFICATION\n");
00656   fprintf (stream, "\n");
00657 
00658   /* Verifications. */
00659   for (ch = 0; ch < 0x110000; ch++)
00660     {
00661       /* toupper restriction: "Only characters specified for the keywords
00662         lower and upper shall be specified.  */
00663       if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
00664        fprintf (stderr,
00665                "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
00666                ucs_symbol (ch), ch, to_upper (ch));
00667 
00668       /* tolower restriction: "Only characters specified for the keywords
00669         lower and upper shall be specified.  */
00670       if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
00671        fprintf (stderr,
00672                "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
00673                ucs_symbol (ch), ch, to_lower (ch));
00674 
00675       /* alpha restriction: "Characters classified as either upper or lower
00676         shall automatically belong to this class.  */
00677       if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
00678        fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
00679 
00680       /* alpha restriction: "No character specified for the keywords cntrl,
00681         digit, punct or space shall be specified."  */
00682       if (is_alpha (ch) && is_cntrl (ch))
00683        fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
00684       if (is_alpha (ch) && is_digit (ch))
00685        fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
00686       if (is_alpha (ch) && is_punct (ch))
00687        fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
00688       if (is_alpha (ch) && is_space (ch))
00689        fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
00690 
00691       /* space restriction: "No character specified for the keywords upper,
00692         lower, alpha, digit, graph or xdigit shall be specified."
00693         upper, lower, alpha already checked above.  */
00694       if (is_space (ch) && is_digit (ch))
00695        fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
00696       if (is_space (ch) && is_graph (ch))
00697        fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
00698       if (is_space (ch) && is_xdigit (ch))
00699        fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
00700 
00701       /* cntrl restriction: "No character specified for the keywords upper,
00702         lower, alpha, digit, punct, graph, print or xdigit shall be
00703         specified."  upper, lower, alpha already checked above.  */
00704       if (is_cntrl (ch) && is_digit (ch))
00705        fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
00706       if (is_cntrl (ch) && is_punct (ch))
00707        fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
00708       if (is_cntrl (ch) && is_graph (ch))
00709        fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
00710       if (is_cntrl (ch) && is_print (ch))
00711        fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
00712       if (is_cntrl (ch) && is_xdigit (ch))
00713        fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
00714 
00715       /* punct restriction: "No character specified for the keywords upper,
00716         lower, alpha, digit, cntrl, xdigit or as the <space> character shall
00717         be specified."  upper, lower, alpha, cntrl already checked above.  */
00718       if (is_punct (ch) && is_digit (ch))
00719        fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
00720       if (is_punct (ch) && is_xdigit (ch))
00721        fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
00722       if (is_punct (ch) && (ch == 0x0020))
00723        fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
00724 
00725       /* graph restriction: "No character specified for the keyword cntrl
00726         shall be specified."  Already checked above.  */
00727 
00728       /* print restriction: "No character specified for the keyword cntrl
00729         shall be specified."  Already checked above.  */
00730 
00731       /* graph - print relation: differ only in the <space> character.
00732         How is this possible if there are more than one space character?!
00733         I think susv2/xbd/locale.html should speak of "space characters",
00734         not "space character".  */
00735       if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
00736        fprintf (stderr,
00737                "%s is print but not graph|<space>\n", ucs_symbol (ch));
00738       if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
00739        fprintf (stderr,
00740                "%s is graph|<space> but not print\n", ucs_symbol (ch));
00741     }
00742 
00743   fprintf (stream, "LC_CTYPE\n");
00744   output_charclass (stream, "upper", is_upper);
00745   output_charclass (stream, "lower", is_lower);
00746   output_charclass (stream, "alpha", is_alpha);
00747   output_charclass (stream, "digit", is_digit);
00748   output_charclass (stream, "outdigit", is_outdigit);
00749   output_charclass (stream, "blank", is_blank);
00750   output_charclass (stream, "space", is_space);
00751   output_charclass (stream, "cntrl", is_cntrl);
00752   output_charclass (stream, "punct", is_punct);
00753   output_charclass (stream, "xdigit", is_xdigit);
00754   output_charclass (stream, "graph", is_graph);
00755   output_charclass (stream, "print", is_print);
00756   output_charclass (stream, "class \"combining\";", is_combining);
00757   output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
00758   output_charmap (stream, "toupper", to_upper);
00759   output_charmap (stream, "tolower", to_lower);
00760   output_charmap (stream, "map \"totitle\";", to_title);
00761   output_widthmap (stream);
00762   fprintf (stream, "END LC_CTYPE\n");
00763 
00764   if (ferror (stream) || fclose (stream))
00765     {
00766       fprintf (stderr, "error writing to '%s'\n", filename);
00767       exit (1);
00768     }
00769 }
00770 
/* Entry point.  Expects two arguments: the path of the UnicodeData.txt
   file and the Unicode version string.  Writes the generated tables to
   a file named "unicode" in the current directory.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *version;

  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      exit (1);
    }
  unicodedata_filename = argv[1];
  version = argv[2];

  fill_attributes (unicodedata_filename);
  output_tables ("unicode", version);

  return 0;
}