Back to index

glibc  2.9
unicode.c
Go to the documentation of this file.
00001 /* Conversion module for Unicode
00002    Copyright (C) 1999, 2000-2002 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <byteswap.h>
00022 #include <dlfcn.h>
00023 #include <gconv.h>
00024 #include <stddef.h>
00025 #include <stdint.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
00029 /* This is the Byte Order Mark character (BOM).  */
00030 #define BOM   0xfeff
/* The value get16u yields when the BOM was stored with the two bytes in
   the opposite order; PREPARE_LOOP uses it to detect swapped input.  */
00031 /* And in the other endian format.  */
00032 #define BOM_OE       0xfffe
00033 
00034 
00035 /* Definitions used in the body of the `gconv' function.  */
/* These macros parameterise <iconv/skeleton.c>, which is included at the
   end of this file.  NOTE(review): DEFINE_INIT/DEFINE_FINI being 0
   presumably tells the skeleton not to generate gconv_init/gconv_end,
   since this file defines both by hand; confirm against skeleton.c.
   MIN_NEEDED_FROM (2) is the byte size of one unit on the UNICODE side,
   MIN_NEEDED_TO (4) the byte size of one character on the internal side;
   gconv_init copies them into the step.  */
00036 #define FROM_LOOP           from_unicode_loop
00037 #define TO_LOOP                    to_unicode_loop
00038 #define DEFINE_INIT         0
00039 #define DEFINE_FINI         0
00040 #define MIN_NEEDED_FROM            2
00041 #define MIN_NEEDED_TO              4
00042 #define FROM_DIRECTION             (dir == from_unicode)
/* PREPARE_LOOP is setup code placed ahead of the conversion loop.  It
   relies on step, data, inptr, inend, inptrp, outbuf and outend being
   in scope at the point of expansion (they are not declared in this
   file).

   Reading UNICODE, first invocation (__invocation_counter == 0):
     - with fewer than 2 input bytes it returns __GCONV_EMPTY_INPUT when
       there is no input at all and __GCONV_INCOMPLETE_INPUT otherwise;
     - a leading BOM is consumed (inptr and *inptrp advance by 2);
     - a leading byte-swapped BOM (BOM_OE) is consumed as well and sets
       the swap flag stored in step->__data;
     - with no BOM nothing is consumed and the flag is left as it was.

   Writing UNICODE, first invocation, unless data->__internal_use is
   set: a BOM is stored with put16u and outbuf advances by 2, or
   __GCONV_FULL_OUTPUT is returned when fewer than 2 output bytes remain.

   In every case the local `swap' is finally loaded from step->__data;
   EXTRA_LOOP_ARGS names it as the extra argument matching the
   `, int swap' parameter declared via EXTRA_LOOP_DECLS further down.

   NOTE(review): within this file the stored swap flag is only cleared in
   gconv_init and only ever set to 1 here.  If the same step data were
   reused for a second input stream, a reversed BOM seen earlier would
   still be in effect; confirm whether that can happen.  */
00043 #define PREPARE_LOOP \
00044   enum direction dir = ((struct unicode_data *) step->__data)->dir;         \
00045   int swap;                                                          \
00046   if (FROM_DIRECTION)                                                       \
00047     {                                                                \
00048       if (data->__invocation_counter == 0)                                  \
00049        {                                                             \
00050          /* We have to find out which byte order the file is encoded in.  */ \
00051          if (inptr + 2 > inend)                                      \
00052            return (inptr == inend                                    \
00053                   ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);        \
00054                                                                      \
00055          if (get16u (inptr) == BOM)                                         \
00056            /* Simply ignore the BOM character.  */                          \
00057            *inptrp = inptr += 2;                                     \
00058          else if (get16u (inptr) == BOM_OE)                                 \
00059            {                                                         \
00060              ((struct unicode_data *) step->__data)->swap = 1;              \
00061              *inptrp = inptr += 2;                                   \
00062            }                                                         \
00063        }                                                             \
00064     }                                                                \
00065   else if (!data->__internal_use && data->__invocation_counter == 0)        \
00066     {                                                                \
00067       /* Emit the Byte Order Mark.  */                                      \
00068       if (__builtin_expect (outbuf + 2 > outend, 0))                        \
00069        return __GCONV_FULL_OUTPUT;                                   \
00070                                                                      \
00071       put16u (outbuf, BOM);                                          \
00072       outbuf += 2;                                                   \
00073     }                                                                \
00074   swap = ((struct unicode_data *) step->__data)->swap;
00075 #define EXTRA_LOOP_ARGS            , swap
00076 
00077 
/* Which way a conversion step runs.  The enumerators are numbered
   explicitly: illegal_dir (zero) stands for an invalid or unset
   direction, to_unicode produces UNICODE output, and from_unicode
   consumes UNICODE input.  */
enum direction
{
  illegal_dir = 0,
  to_unicode = 1,
  from_unicode = 2
};
00085 
/* Private per-step state.  gconv_init allocates one instance and stores
   it in step->__data; gconv_end frees it.
     dir   direction chosen by gconv_init from the step's source name.
     swap  nonzero when 16-bit input units must be byte-swapped; cleared
           by gconv_init and set by PREPARE_LOOP when the input starts
           with a byte-swapped BOM.  */
00086 struct unicode_data
00087 {
00088   enum direction dir;
00089   int swap;
00090 };
00091 
00092 
00093 extern int gconv_init (struct __gconv_step *step);
00094 int
00095 gconv_init (struct __gconv_step *step)
00096 {
00097   /* Determine which direction.  */
00098   struct unicode_data *new_data;
00099   enum direction dir = illegal_dir;
00100   int result;
00101 
00102   if (strcmp (step->__from_name, "UNICODE//") == 0)
00103     dir = from_unicode;
00104   else
00105     dir = to_unicode;
00106 
00107   new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data));
00108 
00109   result = __GCONV_NOMEM;
00110   if (new_data != NULL)
00111     {
00112       new_data->dir = dir;
00113       new_data->swap = 0;
00114       step->__data = new_data;
00115 
00116       if (dir == from_unicode)
00117        {
00118          step->__min_needed_from = MIN_NEEDED_FROM;
00119          step->__max_needed_from = MIN_NEEDED_FROM;
00120          step->__min_needed_to = MIN_NEEDED_TO;
00121          step->__max_needed_to = MIN_NEEDED_TO;
00122        }
00123       else
00124        {
00125          step->__min_needed_from = MIN_NEEDED_TO;
00126          step->__max_needed_from = MIN_NEEDED_TO;
00127          step->__min_needed_to = MIN_NEEDED_FROM;
00128          step->__max_needed_to = MIN_NEEDED_FROM;
00129        }
00130 
00131       step->__stateful = 0;
00132 
00133       result = __GCONV_OK;
00134     }
00135 
00136   return result;
00137 }
00138 
00139 
/* Module teardown: release the private state that gconv_init stored in
   the step's __data member.  The member itself is not reset.  */
extern void gconv_end (struct __gconv_step *data);
void
gconv_end (struct __gconv_step *data)
{
  void *private_state = data->__data;

  free (private_state);
}
00146 
00147 
00148 /* Convert from the internal (UCS4-like) format to UCS2.  */
00149 #define MIN_NEEDED_INPUT    MIN_NEEDED_TO
00150 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_FROM
00151 #define LOOPFCT                    TO_LOOP
/* BODY handles one 4-byte input character per expansion; the loop that
   surrounds it comes from <iconv/loop.c>.
     - Values >= 0x10000 do not fit into one 16-bit unit.  They are handed
       to UNICODE_TAG_HANDLER and then STANDARD_TO_LOOP_ERR_HANDLER, both
       defined outside this file.
     - Surrogate code points 0xd800..0xdfff set result to
       __GCONV_ILLEGAL_INPUT.  When ignore_errors_p () is false the
       `break' is taken; otherwise the 4 input bytes are skipped,
       *irreversible is incremented and processing continues with the
       next character.
     - Every other value is stored as one 16-bit unit with put16 and
       outptr advances by 2.
   NOTE(review): `swap' is received as an extra parameter but not used in
   this direction, so the byte order of the output is whatever put16
   stores, presumably the same order as the BOM written by PREPARE_LOOP
   with put16u; confirm.  */
00152 #define BODY \
00153   {                                                                  \
00154     uint32_t c = get32 (inptr);                                             \
00155                                                                      \
00156     if (__builtin_expect (c >= 0x10000, 0))                                 \
00157       {                                                                     \
00158        UNICODE_TAG_HANDLER (c, 4);                                   \
00159        STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
00160       }                                                                     \
00161     else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))               \
00162       {                                                                     \
00163        /* Surrogate characters in UCS-4 input are not valid.                \
00164           We must catch this, because the UCS-2 output might be             \
00165           interpreted as UTF-16 by other programs.  If we let               \
00166           surrogates pass through, attackers could make a security          \
00167           hole exploit by synthesizing any desired plane 1-16               \
00168           character.  */                                             \
00169        result = __GCONV_ILLEGAL_INPUT;                                      \
00170        if (! ignore_errors_p ())                                     \
00171          break;                                                      \
00172        inptr += 4;                                                   \
00173        ++*irreversible;                                              \
00174        continue;                                                     \
00175       }                                                                     \
00176     else                                                             \
00177       {                                                                     \
00178        put16 (outptr, c);                                            \
00179        outptr += 2;                                                  \
00180       }                                                                     \
00181                                                                      \
00182     inptr += 4;                                                             \
00183   }
00184 #define LOOP_NEED_FLAGS
00185 #define EXTRA_LOOP_DECLS \
00186        , int swap
00187 #include <iconv/loop.c>
00188 
00189 
00190 /* Convert from UCS2 to the internal (UCS4-like) format.  */
00191 #define MIN_NEEDED_INPUT    MIN_NEEDED_FROM
00192 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_TO
00193 #define LOOPFCT                    FROM_LOOP
/* BODY handles one 2-byte input unit per expansion; the loop that
   surrounds it comes from <iconv/loop.c>.
     - The unit is read with get16 and byte-swapped with bswap_16 when
       `swap' is nonzero (PREPARE_LOOP sets the underlying flag after a
       byte-swapped BOM).
     - Surrogates 0xd800..0xdfff are passed to
       STANDARD_FROM_LOOP_ERR_HANDLER, which is defined outside this
       file.  NOTE(review): the put32 below is not guarded by an `else',
       so this only rejects surrogates if that macro always leaves the
       current iteration; confirm in loop.c.
     - Any other value is widened to 32 bits and stored with put32;
       inptr advances by 2 and outptr by 4.  */
00194 #define BODY \
00195   {                                                                  \
00196     uint16_t u1 = get16 (inptr);                                     \
00197                                                                      \
00198     if (swap)                                                        \
00199       u1 = bswap_16 (u1);                                            \
00200                                                                      \
00201     if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0))                  \
00202       {                                                                     \
00203        /* Surrogate characters in UCS-2 input are not valid.  Reject        \
00204           them.  (Catching this here is not security relevant.)  */         \
00205        STANDARD_FROM_LOOP_ERR_HANDLER (2);                                  \
00206       }                                                                     \
00207                                                                      \
00208     put32 (outptr, u1);                                                     \
00209                                                                      \
00210     inptr += 2;                                                             \
00211     outptr += 4;                                                     \
00212   }
00213 #define LOOP_NEED_FLAGS
00214 #define EXTRA_LOOP_DECLS \
00215        , int swap
00216 #include <iconv/loop.c>
00217 
00218 
00219 /* Now define the toplevel functions.  */
00220 #include <iconv/skeleton.c>