Back to index

glibc  2.9
utf-32.c
Go to the documentation of this file.
00001 /* Conversion module for UTF-32.
00002    Copyright (C) 1999, 2000-2002 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004 
00005    The GNU C Library is free software; you can redistribute it and/or
00006    modify it under the terms of the GNU Lesser General Public
00007    License as published by the Free Software Foundation; either
00008    version 2.1 of the License, or (at your option) any later version.
00009 
00010    The GNU C Library is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Lesser General Public License for more details.
00014 
00015    You should have received a copy of the GNU Lesser General Public
00016    License along with the GNU C Library; if not, write to the Free
00017    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00018    02111-1307 USA.  */
00019 
00020 #include <byteswap.h>
00021 #include <dlfcn.h>
00022 #include <gconv.h>
00023 #include <stddef.h>
00024 #include <stdint.h>
00025 #include <stdlib.h>
00026 #include <string.h>
00027 
00028 /* This is the Byte Order Mark character (BOM).  */
00029 #define BOM   0x0000feffu
00030 /* And in the other byte order.  */
00031 #define BOM_OE       0xfffe0000u
00032 
00033 
00034 /* Definitions used in the body of the `gconv' function.  */
/* FROM_LOOP and TO_LOOP are the names given (through LOOPFCT) to the two
   conversion loops defined further down.  MIN_NEEDED_FROM and
   MIN_NEEDED_TO are both 4; gconv_init copies them into the minimum
   and maximum byte-count fields of the step.  FROM_DIRECTION tests the
   `dir' variable that PREPARE_LOOP declares.  DEFINE_INIT and
   DEFINE_FINI are 0; this file supplies its own gconv_init and
   gconv_end -- presumably the zeros tell <iconv/skeleton.c> not to
   generate default versions (confirm there).  */
00035 #define FROM_LOOP           from_utf32_loop
00036 #define TO_LOOP                    to_utf32_loop
00037 #define DEFINE_INIT         0
00038 #define DEFINE_FINI         0
00039 #define MIN_NEEDED_FROM            4
00040 #define MIN_NEEDED_TO              4
00041 #define FROM_DIRECTION             (dir == from_utf32)
/* PREPARE_LOOP: per-call setup code.  It refers to `step', `data',
   `inptr', `inend', `inptrp', `outbuf' and `outend', none of which is
   declared in this file -- presumably they are locals of the function
   generated by <iconv/skeleton.c> (confirm there).

   It loads the direction and variant from the step data and then, only
   for the plain UTF_32 variant and only while
   data->__invocation_counter is 0, deals with the byte order mark:

   - Decoding: with fewer than four input bytes it returns
     __GCONV_EMPTY_INPUT (no bytes at all) or __GCONV_INCOMPLETE_INPUT
     (some bytes).  A leading BOM is skipped; a leading BOM_OE is
     skipped and additionally sets `swap' in the step data.  Any other
     leading value is left in place for the loop to convert.
   - Encoding (and only if data->__internal_use is zero): returns
     __GCONV_FULL_OUTPUT if four bytes do not fit, otherwise writes BOM
     with put32u and advances `outbuf' by four.

   Finally the local `swap' is loaded from the step data.

   NOTE(review): the BOM_OE case writes to step->__data at conversion
   time.  If one step object can serve several conversions -- which
   cannot be determined from this file -- the detected byte order would
   carry over between them; verify against the gconv core.  */
00042 #define PREPARE_LOOP \
00043   enum direction dir = ((struct utf32_data *) step->__data)->dir;           \
00044   enum variant var = ((struct utf32_data *) step->__data)->var;                    \
00045   int swap;                                                          \
00046   if (FROM_DIRECTION && var == UTF_32)                                      \
00047     {                                                                \
00048       if (data->__invocation_counter == 0)                                  \
00049        {                                                             \
00050          /* We have to find out which byte order the file is encoded in.  */ \
00051          if (inptr + 4 > inend)                                      \
00052            return (inptr == inend                                    \
00053                   ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);        \
00054                                                                      \
00055          if (get32u (inptr) == BOM)                                         \
00056            /* Simply ignore the BOM character.  */                          \
00057            *inptrp = inptr += 4;                                     \
00058          else if (get32u (inptr) == BOM_OE)                                 \
00059            {                                                         \
00060              ((struct utf32_data *) step->__data)->swap = 1;                \
00061              *inptrp = inptr += 4;                                   \
00062            }                                                         \
00063        }                                                             \
00064     }                                                                \
00065   else if (!FROM_DIRECTION && var == UTF_32 && !data->__internal_use        \
00066           && data->__invocation_counter == 0)                               \
00067     {                                                                \
00068       /* Emit the Byte Order Mark.  */                                      \
00069       if (__builtin_expect (outbuf + 4 > outend, 0))                        \
00070        return __GCONV_FULL_OUTPUT;                                   \
00071                                                                      \
00072       put32u (outbuf, BOM);                                          \
00073       outbuf += 4;                                                   \
00074     }                                                                \
00075   swap = ((struct utf32_data *) step->__data)->swap;
/* Extra arguments for the loop functions; they line up with the
   `enum variant var, int swap' parameters that EXTRA_LOOP_DECLS adds
   further down.  */
00076 #define EXTRA_LOOP_ARGS            , var, swap
00077 
00078 
/* Which way a conversion step runs.  The zero value doubles as the
   "no UTF-32 name matched" marker in gconv_init.  */
enum direction
{
  illegal_dir = 0,      /* Neither charset name was recognised.  */
  to_utf32 = 1,         /* The step's destination charset is UTF-32.  */
  from_utf32 = 2        /* The step's source charset is UTF-32.  */
};

/* Which flavour of UTF-32 the step was set up for.  */
enum variant
{
  illegal_var = 0,      /* Nothing selected yet.  */
  UTF_32 = 1,           /* "UTF-32//": byte order mark handled per call.  */
  UTF_32LE = 2,         /* "UTF-32LE//".  */
  UTF_32BE = 3          /* "UTF-32BE//".  */
};

/* Private data hung off a conversion step by gconv_init.  */
struct utf32_data
{
  /* Direction selected when the step was initialised.  */
  enum direction dir;
  /* UTF-32 flavour selected when the step was initialised.  */
  enum variant var;
  /* Nonzero when every 32-bit unit has to be byte-swapped.  */
  int swap;
};
00101 
00102 
00103 extern int gconv_init (struct __gconv_step *step);
00104 int
00105 gconv_init (struct __gconv_step *step)
00106 {
00107   /* Determine which direction.  */
00108   struct utf32_data *new_data;
00109   enum direction dir = illegal_dir;
00110   enum variant var = illegal_var;
00111   int result;
00112 
00113   if (__strcasecmp (step->__from_name, "UTF-32//") == 0)
00114     {
00115       dir = from_utf32;
00116       var = UTF_32;
00117     }
00118   else if (__strcasecmp (step->__to_name, "UTF-32//") == 0)
00119     {
00120       dir = to_utf32;
00121       var = UTF_32;
00122     }
00123   else if (__strcasecmp (step->__from_name, "UTF-32BE//") == 0)
00124     {
00125       dir = from_utf32;
00126       var = UTF_32BE;
00127     }
00128   else if (__strcasecmp (step->__to_name, "UTF-32BE//") == 0)
00129     {
00130       dir = to_utf32;
00131       var = UTF_32BE;
00132     }
00133   else if (__strcasecmp (step->__from_name, "UTF-32LE//") == 0)
00134     {
00135       dir = from_utf32;
00136       var = UTF_32LE;
00137     }
00138   else if (__strcasecmp (step->__to_name, "UTF-32LE//") == 0)
00139     {
00140       dir = to_utf32;
00141       var = UTF_32LE;
00142     }
00143 
00144   result = __GCONV_NOCONV;
00145   if (__builtin_expect (dir, to_utf32) != illegal_dir)
00146     {
00147       new_data = (struct utf32_data *) malloc (sizeof (struct utf32_data));
00148 
00149       result = __GCONV_NOMEM;
00150       if (new_data != NULL)
00151        {
00152          new_data->dir = dir;
00153          new_data->var = var;
00154          new_data->swap = ((var == UTF_32LE && BYTE_ORDER == BIG_ENDIAN)
00155                          || (var == UTF_32BE
00156                             && BYTE_ORDER == LITTLE_ENDIAN));
00157          step->__data = new_data;
00158 
00159          if (dir == from_utf32)
00160            {
00161              step->__min_needed_from = MIN_NEEDED_FROM;
00162              step->__max_needed_from = MIN_NEEDED_FROM;
00163              step->__min_needed_to = MIN_NEEDED_TO;
00164              step->__max_needed_to = MIN_NEEDED_TO;
00165            }
00166          else
00167            {
00168              step->__min_needed_from = MIN_NEEDED_TO;
00169              step->__max_needed_from = MIN_NEEDED_TO;
00170              step->__min_needed_to = MIN_NEEDED_FROM;
00171              step->__max_needed_to = MIN_NEEDED_FROM;
00172            }
00173 
00174          step->__stateful = 0;
00175 
00176          result = __GCONV_OK;
00177        }
00178     }
00179 
00180   return result;
00181 }
00182 
00183 
00184 extern void gconv_end (struct __gconv_step *data);
00185 void
00186 gconv_end (struct __gconv_step *data)
00187 {
00188   free (data->__data);
00189 }
00190 
00191 
00192 /* Convert from the internal (UCS4-like) format to UTF-32.  */
00193 #define MIN_NEEDED_INPUT    MIN_NEEDED_TO
00194 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_FROM
00195 #define LOOPFCT                    TO_LOOP
/* BODY converts one character.  It reads a 32-bit value from `inptr'
   with get32 and then:
   - for values of 0x110000 and above invokes
     STANDARD_TO_LOOP_ERR_HANDLER (4), which is defined outside this
     file;
   - for surrogates (0xd800..0xdfff) sets `result' to
     __GCONV_ILLEGAL_INPUT and leaves via `break', unless
     ignore_errors_p () is true, in which case the four input bytes are
     skipped, *irreversible is incremented and `continue' is executed;
   - otherwise stores the value at `outptr' with put32, byte-swapped
     when `swap' is nonzero, and advances both pointers by four.
   The `break' and `continue' presumably act on a loop that
   <iconv/loop.c> wraps around BODY (confirm there).  The `var'
   parameter is not referenced by this BODY.  */
00196 #define BODY \
00197   {                                                                  \
00198     uint32_t c = get32 (inptr);                                             \
00199                                                                      \
00200     if (__builtin_expect (c >= 0x110000, 0))                                \
00201       {                                                                     \
00202        STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
00203       }                                                                     \
00204     else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))               \
00205       {                                                                     \
00206        /* Surrogate characters in UCS-4 input are not valid.                \
00207           We must catch this.  If we let surrogates pass through,           \
00208           attackers could make a security hole exploit by                   \
00209           generating "irregular UTF-32" sequences.  */                      \
00210        result = __GCONV_ILLEGAL_INPUT;                                      \
00211        if (! ignore_errors_p ())                                     \
00212          break;                                                      \
00213        inptr += 4;                                                   \
00214        ++*irreversible;                                              \
00215        continue;                                                     \
00216       }                                                                     \
00217                                                                      \
00218     if (swap)                                                        \
00219       put32 (outptr, bswap_32 (c));                                         \
00220     else                                                             \
00221       put32 (outptr, c);                                             \
00222                                                                      \
00223     outptr += 4;                                                     \
00224     inptr += 4;                                                             \
00225   }
/* LOOP_NEED_FLAGS is defined empty; presumably <iconv/loop.c> tests it
   to make the conversion flags available to ignore_errors_p (confirm
   there).  */
00226 #define LOOP_NEED_FLAGS
/* Additional parameters of the loop function; they receive the values
   listed in EXTRA_LOOP_ARGS.  */
00227 #define EXTRA_LOOP_DECLS \
00228        , enum variant var, int swap
00229 #include <iconv/loop.c>
00230 
00231 
00232 /* Convert from UTF-32 to the internal (UCS4-like) format.  */
00233 #define MIN_NEEDED_INPUT    MIN_NEEDED_FROM
00234 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_TO
00235 #define LOOPFCT                    FROM_LOOP
00236 #define BODY \
00237   {                                                                  \
00238     uint32_t u1 = get32 (inptr);                                     \
00239                                                                      \
00240     if (swap)                                                        \
00241       u1 = bswap_32 (u1);                                            \
00242                                                                      \
00243     if (__builtin_expect (u1 >= 0x110000, 0))                               \
00244       {                                                                     \
00245        /* This is illegal.  */                                              \
00246        STANDARD_FROM_LOOP_ERR_HANDLER (4);                                  \
00247       }                                                                     \
00248                                                                      \
00249     put32 (outptr, u1);                                                     \
00250     inptr += 4;                                                             \
00251     outptr += 4;                                                     \
00252   }
00253 #define LOOP_NEED_FLAGS
00254 #define EXTRA_LOOP_DECLS \
00255        , enum variant var, int swap
00256 #include <iconv/loop.c>
00257 
00258 
00259 /* Now define the toplevel functions.  */
00260 #include <iconv/skeleton.c>