Back to index

glibc  2.9
iconv_charmap.c
Go to the documentation of this file.
00001 /* Convert using charmaps and possibly iconv().
00002    Copyright (C) 2001, 2005, 2006, 2008 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
00005 
00006    This program is free software; you can redistribute it and/or modify
00007    it under the terms of the GNU General Public License as published
00008    by the Free Software Foundation; version 2 of the License, or
00009    (at your option) any later version.
00010 
00011    This program is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014    GNU General Public License for more details.
00015 
00016    You should have received a copy of the GNU General Public License
00017    along with this program; if not, write to the Free Software Foundation,
00018    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
00019 
00020 #include <assert.h>
00021 #include <errno.h>
00022 #include <error.h>
00023 #include <fcntl.h>
00024 #include <iconv.h>
00025 #include <libintl.h>
00026 #include <stdio.h>
00027 #include <stdlib.h>
00028 #include <unistd.h>
00029 #include <sys/mman.h>
00030 #include <sys/stat.h>
00031 
00032 #include "iconv_prog.h"
00033 
00034 
00035 /* Prototypes for a few program-wide used functions.  */
00036 extern void *xmalloc (size_t __n);
00037 extern void *xcalloc (size_t __n, size_t __s);
00038 
00039 
/* One level of the byte-indexed conversion table.

   Each of the 256 possible input byte values owns one slot in VAL and
   one flag bit in TERM (see is_term, set_term and clear_term).  If the
   flag is set, the slot's OUT member names the output sequence for the
   input sequence ending with that byte.  If the flag is clear, the
   slot's SUB member is either NULL (nothing known for this byte) or
   points to the table used for the following input byte.  */
struct convtable
{
  /* Terminal flags.  The accessor functions store eight flags in each
     array element (index IDX / 8, bit IDX % 8).  */
  int term[256 / 8];
  union
  {
    /* Valid while the terminal flag is clear: next level, or NULL.  */
    struct convtable *sub;
    /* Valid while the terminal flag is set: the output sequence.  */
    struct charseq *out;
  } val[256];
};
00049 
00050 
00051 static inline struct convtable *
00052 allocate_table (void)
00053 {
00054   return (struct convtable *) xcalloc (1, sizeof (struct convtable));
00055 }
00056 
00057 
00058 static inline int
00059 is_term (struct convtable *tbl, unsigned int idx)
00060 {
00061   return tbl->term[idx / 8] & (1 << (idx % 8));
00062 }
00063 
00064 
00065 static inline void
00066 clear_term (struct convtable *tbl, unsigned int idx)
00067 {
00068   tbl->term[idx / 8] &= ~(1 << (idx % 8));
00069 }
00070 
00071 
00072 static inline void
00073 set_term (struct convtable *tbl, unsigned int idx)
00074 {
00075   tbl->term[idx / 8] |= 1 << (idx % 8);
00076 }
00077 
00078 
/* Generate the conversion table.  Each function returns the root of a
   newly allocated table.  The two variants which rely on iconv() return
   NULL if the conversion descriptor cannot be opened.  */
static struct convtable *use_from_charmap (struct charmap_t *from_charmap,
                                           const char *to_code);
static struct convtable *use_to_charmap (const char *from_code,
                                         struct charmap_t *to_charmap);
static struct convtable *use_both_charmaps (struct charmap_t *from_charmap,
                                            struct charmap_t *to_charmap);

/* Prototypes for the functions doing the actual work.  All of them
   convert input through TBL, write the result to OUTPUT, and return
   zero on success and a negative value on failure.  */
static int process_block (struct convtable *tbl, char *addr, size_t len,
                          FILE *output);
static int process_fd (struct convtable *tbl, int fd, FILE *output);
static int process_file (struct convtable *tbl, FILE *input, FILE *output);
00092 
00093 
00094 int
00095 charmap_conversion (const char *from_code, struct charmap_t *from_charmap,
00096                   const char *to_code, struct charmap_t *to_charmap,
00097                   int argc, int remaining, char *argv[],
00098                   const char *output_file)
00099 {
00100   struct convtable *cvtbl;
00101   int status = EXIT_SUCCESS;
00102 
00103   /* We have three different cases to handle:
00104 
00105      - both, from_charmap and to_charmap, are available.  This means we
00106        can assume that the symbolic names match and use them to create
00107        the mapping.
00108 
00109      - only from_charmap is available.  In this case we can only hope that
00110        the symbolic names used are of the <Uxxxx> form in which case we
00111        can use a UCS4->"to_code" iconv() conversion for the second step.
00112 
00113      - only to_charmap is available.  This is similar, only that we would
00114        use iconv() for the "to_code"->UCS4 conversion.
00115 
00116        We first create a table which maps input bytes into output bytes.
00117        Once this is done we can handle all three of the cases above
00118        equally.  */
00119   if (from_charmap != NULL)
00120     {
00121       if (to_charmap == NULL)
00122        cvtbl = use_from_charmap (from_charmap, to_code);
00123       else
00124        cvtbl = use_both_charmaps (from_charmap, to_charmap);
00125     }
00126   else
00127     {
00128       assert (to_charmap != NULL);
00129       cvtbl = use_to_charmap (from_code, to_charmap);
00130     }
00131 
00132   /* If we couldn't generate a table stop now.  */
00133   if (cvtbl == NULL)
00134     return EXIT_FAILURE;
00135 
00136   /* Determine output file.  */
00137   FILE *output;
00138   if (output_file != NULL && strcmp (output_file, "-") != 0)
00139     {
00140       output = fopen (output_file, "w");
00141       if (output == NULL)
00142        error (EXIT_FAILURE, errno, _("cannot open output file"));
00143     }
00144   else
00145     output = stdout;
00146 
00147   /* We can now start the conversion.  */
00148   if (remaining == argc)
00149     {
00150       if (process_file (cvtbl, stdin, output) != 0)
00151        status = EXIT_FAILURE;
00152     }
00153   else
00154     do
00155       {
00156        struct stat st;
00157        char *addr;
00158        int fd;
00159 
00160        if (verbose)
00161          printf ("%s:\n", argv[remaining]);
00162        if (strcmp (argv[remaining], "-") == 0)
00163          fd = 0;
00164        else
00165          {
00166            fd = open (argv[remaining], O_RDONLY);
00167 
00168            if (fd == -1)
00169              {
00170               error (0, errno, _("cannot open input file `%s'"),
00171                      argv[remaining]);
00172               status = EXIT_FAILURE;
00173               continue;
00174              }
00175          }
00176 
00177 #ifdef _POSIX_MAPPED_FILES
00178        /* We have possibilities for reading the input file.  First try
00179           to mmap() it since this will provide the fastest solution.  */
00180        if (fstat (fd, &st) == 0
00181            && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
00182                            fd, 0)) != MAP_FAILED))
00183          {
00184            /* Yes, we can use mmap().  The descriptor is not needed
00185               anymore.  */
00186            if (close (fd) != 0)
00187              error (EXIT_FAILURE, errno,
00188                    _("error while closing input `%s'"), argv[remaining]);
00189 
00190            if (process_block (cvtbl, addr, st.st_size, output) < 0)
00191              {
00192               /* Something went wrong.  */
00193               status = EXIT_FAILURE;
00194 
00195               /* We don't need the input data anymore.  */
00196               munmap ((void *) addr, st.st_size);
00197 
00198               /* We cannot go on with producing output since it might
00199                  lead to problem because the last output might leave
00200                  the output stream in an undefined state.  */
00201               break;
00202              }
00203 
00204            /* We don't need the input data anymore.  */
00205            munmap ((void *) addr, st.st_size);
00206          }
00207        else
00208 #endif /* _POSIX_MAPPED_FILES */
00209          {
00210            /* Read the file in pieces.  */
00211            if (process_fd (cvtbl, fd, output) != 0)
00212              {
00213               /* Something went wrong.  */
00214               status = EXIT_FAILURE;
00215 
00216               /* We don't need the input file anymore.  */
00217               close (fd);
00218 
00219               /* We cannot go on with producing output since it might
00220                  lead to problem because the last output might leave
00221                  the output stream in an undefined state.  */
00222               break;
00223              }
00224 
00225            /* Now close the file.  */
00226            close (fd);
00227          }
00228       }
00229     while (++remaining < argc);
00230 
00231   /* All done.  */
00232   return status;
00233 }
00234 
00235 
00236 static void
00237 add_bytes (struct convtable *tbl, struct charseq *in, struct charseq *out)
00238 {
00239   int n = 0;
00240   unsigned int byte;
00241 
00242   assert (in->nbytes > 0);
00243 
00244   byte = ((unsigned char *) in->bytes)[n];
00245   while (n + 1 < in->nbytes)
00246     {
00247       if (is_term (tbl, byte) || tbl->val[byte].sub == NULL)
00248        {
00249          /* Note that we simply ignore a definition for a byte sequence
00250             which is also the prefix for a longer one.  */
00251          clear_term (tbl, byte);
00252          tbl->val[byte].sub =
00253            (struct convtable *) xcalloc (1, sizeof (struct convtable));
00254        }
00255 
00256       tbl = tbl->val[byte].sub;
00257 
00258       byte = ((unsigned char *) in->bytes)[++n];
00259     }
00260 
00261   /* Only add the new sequence if there is none yet and the byte sequence
00262      is not part of an even longer one.  */
00263   if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL)
00264     {
00265       set_term (tbl, byte);
00266       tbl->val[byte].out = out;
00267     }
00268 }
00269 
00270 
00271 static struct convtable *
00272 use_from_charmap (struct charmap_t *from_charmap, const char *to_code)
00273 {
00274   /* We iterate over all entries in the from_charmap and for those which
00275      have a known UCS4 representation we use an iconv() call to determine
00276      the mapping to the to_code charset.  */
00277   struct convtable *rettbl;
00278   iconv_t cd;
00279   void *ptr = NULL;
00280   const void *key;
00281   size_t keylen;
00282   void *data;
00283 
00284   cd = iconv_open (to_code, "WCHAR_T");
00285   if (cd == (iconv_t) -1)
00286     /* We cannot do anything.  */
00287     return NULL;
00288 
00289   rettbl = allocate_table ();
00290 
00291   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
00292         >= 0)
00293     {
00294       struct charseq *in = (struct charseq *) data;
00295 
00296       if (in->ucs4 != UNINITIALIZED_CHAR_VALUE)
00297        {
00298          /* There is a chance.  Try the iconv module.  */
00299          wchar_t inbuf[1] = { in->ucs4 };
00300          unsigned char outbuf[64];
00301          char *inptr = (char *) inbuf;
00302          size_t inlen = sizeof (inbuf);
00303          char *outptr = (char *) outbuf;
00304          size_t outlen = sizeof (outbuf);
00305 
00306          (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
00307 
00308          if (outptr != (char *) outbuf)
00309            {
00310              /* We got some output.  Good, use it.  */
00311              struct charseq *newp;
00312 
00313              outlen = sizeof (outbuf) - outlen;
00314              assert ((char *) outbuf + outlen == outptr);
00315 
00316              newp = (struct charseq *) xmalloc (sizeof (struct charseq)
00317                                            + outlen);
00318              newp->name = in->name;
00319              newp->ucs4 = in->ucs4;
00320              newp->nbytes = outlen;
00321              memcpy (newp->bytes, outbuf, outlen);
00322 
00323              add_bytes (rettbl, in, newp);
00324            }
00325 
00326          /* Clear any possible state left behind.  */
00327          (void) iconv (cd, NULL, NULL, NULL, NULL);
00328        }
00329     }
00330 
00331   iconv_close (cd);
00332 
00333   return rettbl;
00334 }
00335 
00336 
00337 static struct convtable *
00338 use_to_charmap (const char *from_code, struct charmap_t *to_charmap)
00339 {
00340   /* We iterate over all entries in the to_charmap and for those which
00341      have a known UCS4 representation we use an iconv() call to determine
00342      the mapping to the from_code charset.  */
00343   struct convtable *rettbl;
00344   iconv_t cd;
00345   void *ptr = NULL;
00346   const void *key;
00347   size_t keylen;
00348   void *data;
00349 
00350   /* Note that the conversion we use here is the reverse direction.  Without
00351      exhaustive search we cannot figure out which input yields the UCS4
00352      character we are looking for.  Therefore we determine it the other
00353      way round.  */
00354   cd = iconv_open (from_code, "WCHAR_T");
00355   if (cd == (iconv_t) -1)
00356     /* We cannot do anything.  */
00357     return NULL;
00358 
00359   rettbl = allocate_table ();
00360 
00361   while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data)
00362         >= 0)
00363     {
00364       struct charseq *out = (struct charseq *) data;
00365 
00366       if (out->ucs4 != UNINITIALIZED_CHAR_VALUE)
00367        {
00368          /* There is a chance.  Try the iconv module.  */
00369          wchar_t inbuf[1] = { out->ucs4 };
00370          unsigned char outbuf[64];
00371          char *inptr = (char *) inbuf;
00372          size_t inlen = sizeof (inbuf);
00373          char *outptr = (char *) outbuf;
00374          size_t outlen = sizeof (outbuf);
00375 
00376          (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
00377 
00378          if (outptr != (char *) outbuf)
00379            {
00380              /* We got some output.  Good, use it.  */
00381              union
00382              {
00383               struct charseq seq;
00384               struct
00385               {
00386                 const char *name;
00387                 uint32_t ucs4;
00388                 int nbytes;
00389                 unsigned char bytes[outlen];
00390               } mem;
00391              } new;
00392 
00393              outlen = sizeof (outbuf) - outlen;
00394              assert ((char *) outbuf + outlen == outptr);
00395 
00396              new.mem.name = out->name;
00397              new.mem.ucs4 = out->ucs4;
00398              new.mem.nbytes = outlen;
00399              memcpy (new.mem.bytes, outbuf, outlen);
00400 
00401              add_bytes (rettbl, &new.seq, out);
00402            }
00403 
00404          /* Clear any possible state left behind.  */
00405          (void) iconv (cd, NULL, NULL, NULL, NULL);
00406        }
00407     }
00408 
00409   iconv_close (cd);
00410 
00411   return rettbl;
00412 }
00413 
00414 
00415 static struct convtable *
00416 use_both_charmaps (struct charmap_t *from_charmap,
00417                  struct charmap_t *to_charmap)
00418 {
00419   /* In this case we iterate over all the entries in the from_charmap,
00420      determine the internal name, and find an appropriate entry in the
00421      to_charmap (if it exists).  */
00422   struct convtable *rettbl = allocate_table ();
00423   void *ptr = NULL;
00424   const void *key;
00425   size_t keylen;
00426   void *data;
00427 
00428   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
00429         >= 0)
00430     {
00431       struct charseq *in = (struct charseq *) data;
00432       struct charseq *out = charmap_find_value (to_charmap, key, keylen);
00433 
00434       if (out != NULL)
00435        add_bytes (rettbl, in, out);
00436     }
00437 
00438   return rettbl;
00439 }
00440 
00441 
00442 static int
00443 process_block (struct convtable *tbl, char *addr, size_t len, FILE *output)
00444 {
00445   size_t n = 0;
00446 
00447   while (n < len)
00448     {
00449       struct convtable *cur = tbl;
00450       unsigned char *curp = (unsigned char *) addr;
00451       unsigned int byte = *curp;
00452       int cnt;
00453       struct charseq *out;
00454 
00455       while (! is_term (cur, byte))
00456        if (cur->val[byte].sub == NULL)
00457          {
00458            /* This is a invalid sequence.  Skip the first byte if we are
00459               ignoring errors.  Otherwise punt.  */
00460            if (! omit_invalid)
00461              {
00462               error (0, 0, _("illegal input sequence at position %Zd"), n);
00463               return -1;
00464              }
00465 
00466            n -= curp - (unsigned char *) addr;
00467 
00468            byte = *(curp = (unsigned char *) ++addr);
00469            if (++n >= len)
00470              /* All converted.  */
00471              return 0;
00472 
00473            cur = tbl;
00474          }
00475        else
00476          {
00477            cur = cur->val[byte].sub;
00478 
00479            if (++n >= len)
00480              {
00481               error (0, 0, _("\
00482 incomplete character or shift sequence at end of buffer"));
00483               return -1;
00484              }
00485 
00486            byte = *++curp;
00487          }
00488 
00489       /* We found a final byte.  Write the output bytes.  */
00490       out = cur->val[byte].out;
00491       for (cnt = 0; cnt < out->nbytes; ++cnt)
00492        fputc_unlocked (out->bytes[cnt], output);
00493 
00494       addr = (char *) curp + 1;
00495       ++n;
00496     }
00497 
00498   return 0;
00499 }
00500 
00501 
/* Read everything available from FD and convert it with TBL, writing
   the result to OUTPUT.

   The input may use an arbitrary encoding, so a block must not end in
   the middle of a character or shift sequence.  Therefore the complete
   input is collected in one buffer and converted in a single step.  The
   buffer is static: it is kept, and only ever grown, across calls.

   Returns the result of process_block, or -1 after printing a
   diagnostic if reading fails or the buffer cannot be enlarged.  */
static int
process_fd (struct convtable *tbl, int fd, FILE *output)
{
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      ssize_t got;

      if (actlen == maxlen)
        {
          /* The buffer is full (or does not exist yet).  Enlarge it
             before trying to read more.  */
          char *bigger = (char *) realloc (inbuf, maxlen + 32768);

          if (bigger == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }

          inbuf = bigger;
          maxlen += 32768;
        }

      got = read (fd, inbuf + actlen, maxlen - actlen);

      if (got == 0)
        /* No more text to read.  */
        break;

      if (got == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += got;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (tbl, inbuf, actlen, output);
}
00579 
00580 
/* Convert the contents of the stream INPUT with TBL and write the
   result to OUTPUT.  The stream's descriptor is read directly, which
   bypasses the stdio buffer; according to the original note this is
   safe because the function is only used for `stdin' before anything
   has been read from it.  Returns the result of process_fd.  */
static int
process_file (struct convtable *tbl, FILE *input, FILE *output)
{
  int fd = fileno (input);

  return process_fd (tbl, fd, output);
}