Back to index

glibc  2.9
utf-16.c
Go to the documentation of this file.
00001 /* Conversion module for UTF-16.
00002    Copyright (C) 1999, 2000-2002, 2003, 2005 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <byteswap.h>
00022 #include <dlfcn.h>
00023 #include <gconv.h>
00024 #include <stddef.h>
00025 #include <stdint.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 
/* The Unicode Byte Order Mark (BOM), U+FEFF, as the 16-bit value seen
   when the data is read in its own byte order.  */
#define BOM   0xfeff
/* The same mark as it appears when its two bytes are read in the
   opposite byte order.  PREPARE_LOOP compares the first input unit
   against this value to detect input that must be byte-swapped.  */
#define BOM_OE       0xfffe
00033 
00034 
/* Parameters consumed by the generic `gconv' skeleton and loop
   templates included further down.  */

/* Both set to 0; this file defines gconv_init and gconv_end itself.  */
#define DEFINE_INIT         0
#define DEFINE_FINI         0

/* Names given to the two conversion loop functions.  */
#define FROM_LOOP           from_utf16_loop
#define TO_LOOP             to_utf16_loop

/* Byte counts per character: 2 to 4 on the UTF-16 side, always 4 on
   the internal side.  */
#define MIN_NEEDED_FROM     2
#define MAX_NEEDED_FROM     4
#define MIN_NEEDED_TO       4

/* True when the current step decodes UTF-16.  Relies on the local
   `dir' declared by PREPARE_LOOP.  */
#define FROM_DIRECTION      (dir == from_utf16)

/* Expanded by the skeleton ahead of the conversion loops.  Loads the
   per-step settings from step->__data and, on the first invocation of
   a plain "UTF-16" conversion, handles the Byte Order Mark:
     - when decoding, a leading BOM is consumed; a byte-swapped BOM
       additionally turns on byte swapping for this step;
     - when encoding (and not for internal use), a BOM is written at
       the start of the output.
   It may return early with __GCONV_EMPTY_INPUT,
   __GCONV_INCOMPLETE_INPUT or __GCONV_FULL_OUTPUT.  The local `swap'
   is declared last so that it reflects any change made here.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;            \
  enum variant var = ((struct utf16_data *) step->__data)->var;              \
  if (__builtin_expect (data->__invocation_counter == 0, 0) && var == UTF_16) \
    {                                                                        \
      if (!FROM_DIRECTION)                                                   \
        {                                                                    \
          if (!data->__internal_use)                                         \
            {                                                                \
              /* Encoding: the output starts with a BOM.  */                 \
              if (__builtin_expect (outbuf + 2 > outend, 0))                 \
                return __GCONV_FULL_OUTPUT;                                  \
                                                                             \
              put16u (outbuf, BOM);                                          \
              outbuf += 2;                                                   \
            }                                                                \
        }                                                                    \
      else                                                                   \
        {                                                                    \
          /* Decoding: one full 16-bit unit is needed to look for a BOM.  */ \
          if (inptr + 2 > inend)                                             \
            return (inptr == inend                                           \
                    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);       \
                                                                             \
          switch (get16u (inptr))                                            \
            {                                                                \
            case BOM:                                                        \
              /* Mark in the expected byte order: just skip it.  */          \
              *inptrp = inptr += 2;                                          \
              break;                                                         \
                                                                             \
            case BOM_OE:                                                     \
              /* Mark in the other byte order: swap from now on and          \
                 skip it.  */                                                \
              ((struct utf16_data *) step->__data)->swap = 1;                \
              *inptrp = inptr += 2;                                          \
              break;                                                         \
                                                                             \
            default:                                                         \
              /* No BOM: keep the setting made by gconv_init.  */            \
              break;                                                         \
            }                                                                \
        }                                                                    \
    }                                                                        \
  int swap = ((struct utf16_data *) step->__data)->swap;
#define EXTRA_LOOP_ARGS            , swap
00077 
00078 
/* Which way a conversion step runs relative to UTF-16.  The zero value
   means "not determined"; gconv_init starts from it and refuses the
   step if it is still set after the charset names were examined.  */
enum direction
{
  illegal_dir = 0,
  to_utf16 = 1,
  from_utf16 = 2
};
00086 
/* The UTF-16 flavour named in the conversion request.  UTF_16 is the
   plain charset for which PREPARE_LOOP handles a Byte Order Mark;
   UTF_16LE and UTF_16BE fix the byte order by name.  The zero value
   means "not determined".  */
enum variant
{
  illegal_var = 0,
  UTF_16 = 1,
  UTF_16LE = 2,
  UTF_16BE = 3
};
00094 
/* Per-step conversion settings.  An object of this type is allocated
   by gconv_init, stored in step->__data and freed by gconv_end.  */
struct utf16_data
{
  /* Whether this step converts to or from UTF-16.  */
  enum direction dir;
  /* Which UTF-16 flavour was requested.  */
  enum variant var;
  /* Nonzero when 16-bit units must be byte-swapped.  Initialised by
     gconv_init from the variant and the host byte order; PREPARE_LOOP
     sets it to 1 when the input starts with a byte-swapped BOM.  */
  int swap;
};
00101 
00102 
00103 extern int gconv_init (struct __gconv_step *step);
00104 int
00105 gconv_init (struct __gconv_step *step)
00106 {
00107   /* Determine which direction.  */
00108   struct utf16_data *new_data;
00109   enum direction dir = illegal_dir;
00110   enum variant var = illegal_var;
00111   int result;
00112 
00113   if (__strcasecmp (step->__from_name, "UTF-16//") == 0)
00114     {
00115       dir = from_utf16;
00116       var = UTF_16;
00117     }
00118   else if (__strcasecmp (step->__to_name, "UTF-16//") == 0)
00119     {
00120       dir = to_utf16;
00121       var = UTF_16;
00122     }
00123   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0)
00124     {
00125       dir = from_utf16;
00126       var = UTF_16BE;
00127     }
00128   else if (__strcasecmp (step->__to_name, "UTF-16BE//") == 0)
00129     {
00130       dir = to_utf16;
00131       var = UTF_16BE;
00132     }
00133   else if (__strcasecmp (step->__from_name, "UTF-16LE//") == 0)
00134     {
00135       dir = from_utf16;
00136       var = UTF_16LE;
00137     }
00138   else if (__strcasecmp (step->__to_name, "UTF-16LE//") == 0)
00139     {
00140       dir = to_utf16;
00141       var = UTF_16LE;
00142     }
00143 
00144   result = __GCONV_NOCONV;
00145   if (__builtin_expect (dir, to_utf16) != illegal_dir)
00146     {
00147       new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
00148 
00149       result = __GCONV_NOMEM;
00150       if (new_data != NULL)
00151        {
00152          new_data->dir = dir;
00153          new_data->var = var;
00154          new_data->swap = ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)
00155                          || (var == UTF_16BE
00156                             && BYTE_ORDER == LITTLE_ENDIAN));
00157          step->__data = new_data;
00158 
00159          if (dir == from_utf16)
00160            {
00161              step->__min_needed_from = MIN_NEEDED_FROM;
00162              step->__max_needed_from = MAX_NEEDED_FROM;
00163              step->__min_needed_to = MIN_NEEDED_TO;
00164              step->__max_needed_to = MIN_NEEDED_TO;
00165            }
00166          else
00167            {
00168              step->__min_needed_from = MIN_NEEDED_TO;
00169              step->__max_needed_from = MIN_NEEDED_TO;
00170              step->__min_needed_to = MIN_NEEDED_FROM;
00171              step->__max_needed_to = MAX_NEEDED_FROM;
00172            }
00173 
00174          step->__stateful = 0;
00175 
00176          result = __GCONV_OK;
00177        }
00178     }
00179 
00180   return result;
00181 }
00182 
00183 
/* Counterpart of gconv_init: free the per-step data attached to
   DATA->__data.  */
extern void gconv_end (struct __gconv_step *data);
void
gconv_end (struct __gconv_step *data)
{
  void *step_private = data->__data;

  free (step_private);
}
00190 
00191 
00192 /* Convert from the internal (UCS4-like) format to UTF-16.  */
00193 #define MIN_NEEDED_INPUT    MIN_NEEDED_TO
00194 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_FROM
00195 #define MAX_NEEDED_OUTPUT   MAX_NEEDED_FROM
00196 #define LOOPFCT                    TO_LOOP
00197 #define BODY \
00198   {                                                                  \
00199     uint32_t c = get32 (inptr);                                             \
00200                                                                      \
00201     if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))                    \
00202       {                                                                     \
00203        /* Surrogate characters in UCS-4 input are not valid.                \
00204           We must catch this.  If we let surrogates pass through,           \
00205           attackers could make a security hole exploit by                   \
00206           synthesizing any desired plane 1-16 character.  */                \
00207        result = __GCONV_ILLEGAL_INPUT;                                      \
00208        if (! ignore_errors_p ())                                     \
00209          break;                                                      \
00210        inptr += 4;                                                   \
00211        ++*irreversible;                                              \
00212        continue;                                                     \
00213       }                                                                     \
00214                                                                      \
00215     if (swap)                                                        \
00216       {                                                                     \
00217        if (__builtin_expect (c >= 0x10000, 0))                              \
00218          {                                                           \
00219            if (__builtin_expect (c >= 0x110000, 0))                         \
00220              {                                                              \
00221               STANDARD_TO_LOOP_ERR_HANDLER (4);                      \
00222              }                                                              \
00223                                                                      \
00224            /* Generate a surrogate character.  */                           \
00225            if (__builtin_expect (outptr + 4 > outend, 0))                   \
00226              {                                                              \
00227               /* Overflow in the output buffer.  */                         \
00228               result = __GCONV_FULL_OUTPUT;                                 \
00229               break;                                                 \
00230              }                                                              \
00231                                                                      \
00232            put16 (outptr, bswap_16 (0xd7c0 + (c >> 10)));                   \
00233            outptr += 2;                                              \
00234            put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff)));                 \
00235          }                                                           \
00236        else                                                          \
00237          put16 (outptr, bswap_16 (c));                                      \
00238       }                                                                     \
00239     else                                                             \
00240       {                                                                     \
00241        if (__builtin_expect (c >= 0x10000, 0))                              \
00242          {                                                           \
00243            if (__builtin_expect (c >= 0x110000, 0))                         \
00244              {                                                              \
00245               STANDARD_TO_LOOP_ERR_HANDLER (4);                      \
00246              }                                                              \
00247                                                                      \
00248            /* Generate a surrogate character.  */                           \
00249            if (__builtin_expect (outptr + 4 > outend, 0))                   \
00250              {                                                              \
00251               /* Overflow in the output buffer.  */                         \
00252               result = __GCONV_FULL_OUTPUT;                                 \
00253               break;                                                 \
00254              }                                                              \
00255                                                                      \
00256            put16 (outptr, 0xd7c0 + (c >> 10));                              \
00257            outptr += 2;                                              \
00258            put16 (outptr, 0xdc00 + (c & 0x3ff));                     \
00259          }                                                           \
00260        else                                                          \
00261          put16 (outptr, c);                                          \
00262       }                                                                     \
00263     outptr += 2;                                                     \
00264     inptr += 4;                                                             \
00265   }
00266 #define LOOP_NEED_FLAGS
00267 #define EXTRA_LOOP_DECLS \
00268        , int swap
00269 #include <iconv/loop.c>
00270 
00271 
00272 /* Convert from UTF-16 to the internal (UCS4-like) format.  */
00273 #define MIN_NEEDED_INPUT    MIN_NEEDED_FROM
00274 #define MAX_NEEDED_INPUT    MAX_NEEDED_FROM
00275 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_TO
00276 #define LOOPFCT                    FROM_LOOP
00277 #define BODY \
00278   {                                                                  \
00279     uint16_t u1 = get16 (inptr);                                     \
00280                                                                      \
00281     if (swap)                                                        \
00282       {                                                                     \
00283        u1 = bswap_16 (u1);                                           \
00284                                                                      \
00285        if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                \
00286          {                                                           \
00287            /* No surrogate.  */                                      \
00288            put32 (outptr, u1);                                              \
00289            inptr += 2;                                                      \
00290          }                                                           \
00291        else                                                          \
00292          {                                                           \
00293            uint16_t u2;                                              \
00294                                                                      \
00295            /* It's a surrogate character.  At least the first word says      \
00296               it is.  */                                             \
00297            if (__builtin_expect (inptr + 4 > inend, 0))              \
00298              {                                                              \
00299               /* We don't have enough input for another complete input      \
00300                  character.  */                                      \
00301               result = __GCONV_INCOMPLETE_INPUT;                     \
00302               break;                                                 \
00303              }                                                              \
00304                                                                      \
00305            inptr += 2;                                                      \
00306            u2 = bswap_16 (get16 (inptr));                            \
00307            if (__builtin_expect (u2 < 0xdc00, 0)                     \
00308               || __builtin_expect (u2 > 0xdfff, 0))                         \
00309              {                                                              \
00310               /* This is no valid second word for a surrogate.  */          \
00311               inptr -= 2;                                            \
00312               STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
00313              }                                                              \
00314                                                                      \
00315            put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));           \
00316            inptr += 2;                                                      \
00317          }                                                           \
00318       }                                                                     \
00319     else                                                             \
00320       {                                                                     \
00321        if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                \
00322          {                                                           \
00323            /* No surrogate.  */                                      \
00324            put32 (outptr, u1);                                              \
00325            inptr += 2;                                                      \
00326          }                                                           \
00327        else                                                          \
00328          {                                                           \
00329            /* It's a surrogate character.  At least the first word says      \
00330               it is.  */                                             \
00331            if (__builtin_expect (inptr + 4 > inend, 0))              \
00332              {                                                              \
00333               /* We don't have enough input for another complete input      \
00334                  character.  */                                      \
00335               result = __GCONV_INCOMPLETE_INPUT;                     \
00336               break;                                                 \
00337              }                                                              \
00338                                                                      \
00339            inptr += 2;                                                      \
00340            uint16_t u2 = get16 (inptr);                              \
00341            if (__builtin_expect (u2 < 0xdc00, 0)                     \
00342               || __builtin_expect (u2 > 0xdfff, 0))                         \
00343              {                                                              \
00344               /* This is no valid second word for a surrogate.  */          \
00345               inptr -= 2;                                            \
00346               STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
00347              }                                                              \
00348                                                                      \
00349            put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));           \
00350            inptr += 2;                                                      \
00351          }                                                           \
00352       }                                                                     \
00353     outptr += 4;                                                     \
00354   }
00355 #define LOOP_NEED_FLAGS
00356 #define EXTRA_LOOP_DECLS \
00357        , int swap
00358 #include <iconv/loop.c>
00359 
00360 
00361 /* Now define the toplevel functions.  */
00362 #include <iconv/skeleton.c>