Back to index

glibc  2.9
tst-table-from.c
Go to the documentation of this file.
00001 /* Copyright (C) 2000-2002 Free Software Foundation, Inc.
00002    This file is part of the GNU C Library.
00003    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
00004 
00005    The GNU C Library is free software; you can redistribute it and/or
00006    modify it under the terms of the GNU Lesser General Public
00007    License as published by the Free Software Foundation; either
00008    version 2.1 of the License, or (at your option) any later version.
00009 
00010    The GNU C Library is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Lesser General Public License for more details.
00014 
00015    You should have received a copy of the GNU Lesser General Public
00016    License along with the GNU C Library; if not, write to the Free
00017    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00018    02111-1307 USA.  */
00019 
00020 /* Create a table from CHARSET to Unicode.
00021    This is a good test for CHARSET's iconv() module, in particular the
00022    FROM_LOOP BODY macro.  */
00023 
00024 #include <stddef.h>
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <iconv.h>
00029 #include <errno.h>
00030 
/* Nonzero when conversions that yield a code point outside Unicode
   plane 0 are to be left out of the generated table.  */
static int bmp_only = 0;
00033 
/* Renders the first BUFLEN bytes of BUF as "0x" followed by two uppercase
   hexadecimal digits per byte; for instance the bytes 0x8E 0xA1 give
   "0x8EA1".  BUFLEN must lie between 1 and 4; any other length aborts the
   program.  The result is kept in a static buffer which the next call
   overwrites.  */
static const char *
hexbuf (unsigned char buf[], unsigned int buflen)
{
  static char msg[50];
  char *cursor = msg;
  unsigned int i;

  /* Only sequences of one to four bytes are supported.  */
  if (buflen < 1 || buflen > 4)
    abort ();

  cursor += sprintf (cursor, "0x");
  for (i = 0; i < buflen; i++)
    cursor += sprintf (cursor, "%02X", buf[i]);

  return msg;
}
00059 
/* Runs the BUFLEN bytes of BUF through the conversion descriptor CD and
   stores the converted bytes in OUT, which must have room for 12 bytes.

   Returns the number of bytes written to OUT on success, 0 if iconv
   reports EINVAL (the input is ambiguous), or -1 if iconv reports EILSEQ
   (the input is invalid).  Any other iconv failure, or a successful
   conversion that leaves input bytes unconsumed, prints a diagnostic to
   stderr and exits with status 1.  */
static int
try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned char *out)
{
  const size_t capacity = 12;
  const char *src = (const char *) buf;
  size_t srcleft = buflen;
  char *dst = (char *) out;
  size_t dstleft = capacity;
  size_t rc;
  int err;

  /* Put CD back into its initial state, convert the input, and on success
     make a final call without input so that pending output is written.  */
  iconv (cd, NULL, NULL, NULL, NULL);
  rc = iconv (cd, (char **) &src, &srcleft, &dst, &dstleft);
  if (rc != (size_t) -1)
    rc = iconv (cd, NULL, NULL, &dst, &dstleft);

  if (rc != (size_t) -1)
    {
      /* A successful conversion is expected to consume every input byte.  */
      if (srcleft != 0)
        {
          fprintf (stderr, "%s: inbytes = %ld, outbytes = %ld\n",
                   hexbuf (buf, buflen),
                   (long) (buflen - srcleft),
                   (long) (capacity - dstleft));
          exit (1);
        }
      return capacity - dstleft;
    }

  /* Keep errno safe from the stdio calls below.  */
  err = errno;
  switch (err)
    {
    case EILSEQ:
      return -1;
    case EINVAL:
      return 0;
    default:
      fprintf (stderr, "%s: iconv error: ", hexbuf (buf, buflen));
      errno = err;
      perror ("");
      exit (1);
    }
}
00109 
00110 /* Returns the out[] buffer as a Unicode value, formatted as 0x%04X.  */
00111 static const char *
00112 utf8_decode (const unsigned char *out, unsigned int outlen)
00113 {
00114   static char hexbuf[84];
00115   char *p = hexbuf;
00116 
00117   while (outlen > 0)
00118     {
00119       if (p > hexbuf)
00120        *p++ = ' ';
00121 
00122       if (out[0] < 0x80)
00123        {
00124          sprintf (p, "0x%04X", out[0]);
00125          out += 1; outlen -= 1;
00126        }
00127       else if (out[0] >= 0xc0 && out[0] < 0xe0 && outlen >= 2)
00128        {
00129          sprintf (p, "0x%04X", ((out[0] & 0x1f) << 6) + (out[1] & 0x3f));
00130          out += 2; outlen -= 2;
00131        }
00132       else if (out[0] >= 0xe0 && out[0] < 0xf0 && outlen >= 3)
00133        {
00134          sprintf (p, "0x%04X", ((out[0] & 0x0f) << 12)
00135                             + ((out[1] & 0x3f) << 6) + (out[2] & 0x3f));
00136          out += 3; outlen -= 3;
00137        }
00138       else if (out[0] >= 0xf0 && out[0] < 0xf8 && outlen >= 4)
00139        {
00140          sprintf (p, "0x%04X", ((out[0] & 0x07) << 18)
00141                             + ((out[1] & 0x3f) << 12)
00142                             + ((out[2] & 0x3f) << 6) + (out[3] & 0x3f));
00143          out += 4; outlen -= 4;
00144        }
00145       else if (out[0] >= 0xf8 && out[0] < 0xfc && outlen >= 5)
00146        {
00147          sprintf (p, "0x%04X", ((out[0] & 0x03) << 24)
00148                             + ((out[1] & 0x3f) << 18)
00149                             + ((out[2] & 0x3f) << 12)
00150                             + ((out[3] & 0x3f) << 6) + (out[4] & 0x3f));
00151          out += 5; outlen -= 5;
00152        }
00153       else if (out[0] >= 0xfc && out[0] < 0xfe && outlen >= 6)
00154        {
00155          sprintf (p, "0x%04X", ((out[0] & 0x01) << 30)
00156                             + ((out[1] & 0x3f) << 24)
00157                             + ((out[2] & 0x3f) << 18)
00158                             + ((out[3] & 0x3f) << 12)
00159                             + ((out[4] & 0x3f) << 6) + (out[5] & 0x3f));
00160          out += 6; outlen -= 6;
00161        }
00162       else
00163        {
00164          sprintf (p, "0x????");
00165          out += 1; outlen -= 1;
00166        }
00167 
00168       if (bmp_only && strlen (p) > 6)
00169        /* Ignore conversions outside Unicode plane 0.  */
00170        return NULL;
00171 
00172       p += strlen (p);
00173     }
00174 
00175   return hexbuf;
00176 }
00177 
00178 int
00179 main (int argc, char *argv[])
00180 {
00181   const char *charset;
00182   iconv_t cd;
00183   int search_depth;
00184 
00185   if (argc != 2)
00186     {
00187       fprintf (stderr, "Usage: tst-table-from charset\n");
00188       exit (1);
00189     }
00190   charset = argv[1];
00191 
00192   cd = iconv_open ("UTF-8", charset);
00193   if (cd == (iconv_t)(-1))
00194     {
00195       perror ("iconv_open");
00196       exit (1);
00197     }
00198 
00199   /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
00200      file gets too big.  */
00201   bmp_only = (strcmp (charset, "UTF-8") == 0
00202              || strcmp (charset, "GB18030") == 0);
00203   search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4);
00204 
00205   {
00206     unsigned char out[12];
00207     unsigned char buf[4];
00208     unsigned int i0, i1, i2, i3;
00209     int result;
00210 
00211     for (i0 = 0; i0 < 0x100; i0++)
00212       {
00213        buf[0] = i0;
00214        result = try (cd, buf, 1, out);
00215        if (result < 0)
00216          {
00217          }
00218        else if (result > 0)
00219          {
00220            const char *unicode = utf8_decode (out, result);
00221            if (unicode != NULL)
00222              printf ("0x%02X\t%s\n", i0, unicode);
00223          }
00224        else
00225          {
00226            for (i1 = 0; i1 < 0x100; i1++)
00227              {
00228               buf[1] = i1;
00229               result = try (cd, buf, 2, out);
00230               if (result < 0)
00231                 {
00232                 }
00233               else if (result > 0)
00234                 {
00235                   const char *unicode = utf8_decode (out, result);
00236                   if (unicode != NULL)
00237                     printf ("0x%02X%02X\t%s\n", i0, i1, unicode);
00238                 }
00239               else
00240                 {
00241                   for (i2 = 0; i2 < 0x100; i2++)
00242                     {
00243                      buf[2] = i2;
00244                      result = try (cd, buf, 3, out);
00245                      if (result < 0)
00246                        {
00247                        }
00248                      else if (result > 0)
00249                        {
00250                          const char *unicode = utf8_decode (out, result);
00251                          if (unicode != NULL)
00252                            printf ("0x%02X%02X%02X\t%s\n",
00253                                   i0, i1, i2, unicode);
00254                        }
00255                      else if (search_depth > 3)
00256                        {
00257                          for (i3 = 0; i3 < 0x100; i3++)
00258                            {
00259                             buf[3] = i3;
00260                             result = try (cd, buf, 4, out);
00261                             if (result < 0)
00262                               {
00263                               }
00264                             else if (result > 0)
00265                               {
00266                                 const char *unicode =
00267                                   utf8_decode (out, result);
00268                                 if (unicode != NULL)
00269                                   printf ("0x%02X%02X%02X%02X\t%s\n",
00270                                          i0, i1, i2, i3, unicode);
00271                               }
00272                             else
00273                               {
00274                                 fprintf (stderr,
00275                                         "%s: incomplete byte sequence\n",
00276                                         hexbuf (buf, 4));
00277                                 exit (1);
00278                               }
00279                            }
00280                        }
00281                     }
00282                 }
00283              }
00284          }
00285       }
00286   }
00287 
00288   if (iconv_close (cd) < 0)
00289     {
00290       perror ("iconv_close");
00291       exit (1);
00292     }
00293 
00294   if (ferror (stdin) || fflush (stdout) || ferror (stdout))
00295     {
00296       fprintf (stderr, "I/O error\n");
00297       exit (1);
00298     }
00299 
00300   return 0;
00301 }