Back to index

glibc  2.9
iso-2022-cn.c
Go to the documentation of this file.
00001 /* Conversion module for ISO-2022-CN.
00002    Copyright (C) 1999, 2000-2002, 2007, 2008 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <dlfcn.h>
00022 #include <gconv.h>
00023 #include <stdint.h>
00024 #include <string.h>
00025 #include "gb2312.h"
00026 #include "cns11643l1.h"
00027 #include "cns11643l2.h"
00028 
00029 #include <assert.h>
00030 
00031 /* This makes obvious what everybody knows: 0x1b is the Esc character.  */
00032 #define ESC   0x1b
00033 
00034 /* SI and SO are the single-byte shift-in and shift-out controls.  The
00035    single shift SS2 is the two-byte sequence ESC 0x4e (SS2_0, SS2_1); it
00036    applies only to the two bytes that immediately follow it.  */
00037 #define SI    0x0f
00038 #define SO    0x0e
00039 #define SS2_0 ESC
00040 #define SS2_1 0x4e
00041 
00042 /* Definitions used in the body of the `gconv' function.  FROM_LOOP is
         the loop that reads ISO-2022-CN and writes UCS4, TO_LOOP the one that
         reads UCS4 and writes ISO-2022-CN.  The *_NEEDED_* values are byte
         counts per character on the respective side.  */
00043 #define CHARSET_NAME        "ISO-2022-CN//"
00044 #define DEFINE_INIT         1
00045 #define DEFINE_FINI         1
00046 #define FROM_LOOP           from_iso2022cn_loop
00047 #define TO_LOOP                    to_iso2022cn_loop
00048 #define FROM_LOOP_MIN_NEEDED_FROM  1
00049 #define FROM_LOOP_MAX_NEEDED_FROM  4
00050 #define FROM_LOOP_MIN_NEEDED_TO           4
00051 #define FROM_LOOP_MAX_NEEDED_TO           4
00052 #define TO_LOOP_MIN_NEEDED_FROM           4
00053 #define TO_LOOP_MAX_NEEDED_FROM           4
00054 #define TO_LOOP_MIN_NEEDED_TO             1
00055 #define TO_LOOP_MAX_NEEDED_TO             6
      /* PREPARE_LOOP declares SAVE_SET, the scratch copy used by
         SAVE_RESET_STATE, and SETP, a pointer to the `__count' member of the
         conversion state.  EXTRA_LOOP_ARGS passes SETP on; it matches the
         `int *setp' parameter that EXTRA_LOOP_DECLS adds to both loops.  */
00056 #define PREPARE_LOOP \
00057   int save_set;                                                             \
00058   int *setp = &data->__statep->__count;
00059 #define EXTRA_LOOP_ARGS            , setp
00060 
00061 
00062 /* The COUNT element of the state keeps track of the currently selected
00063    character set.  The possible values are:
         - the *_set values (bits 3 and 4, extracted with CURRENT_SEL_MASK)
           name the selected set;
         - the *_ann flags (bits 5 to 7, extracted with CURRENT_ANN_MASK)
           record which sets have been announced.
         The flag that belongs to a set is `16 << (set >> 3)'.  None of the
         constants below occupies the low three bits.
         NOTE(review): presumably the low three bits are reserved for the
         conversion skeleton -- confirm against iconv/skeleton.c.  */
00064 enum
00065 {
00066   ASCII_set = 0,
00067   GB2312_set = 8,
00068   CNS11643_1_set = 16,
00069   CNS11643_2_set = 24,
00070   CURRENT_SEL_MASK = 24,
00071   GB2312_ann = 32,
00072   CNS11643_1_ann = 64,
00073   CNS11643_2_ann = 128,
00074   CURRENT_ANN_MASK = 224
00075 };
00076 
00077 
00078 /* Since this is a stateful encoding we have to provide code which resets
00079    the output state to the initial state.  This has to be done during the
00080    flushing.
         If FROM_DIRECTION is true the state word is simply cleared.  Otherwise
         one SI byte is stored through OUTBUF and the state is cleared; if
         OUTBUF has already reached OUTEND nothing is written, the state is
         left alone and STATUS is set to __GCONV_FULL_OUTPUT.
         The test compares the whole state word with ASCII_set, so it is also
         true when the selected set is ASCII but announcement flags are still
         recorded.  */
00081 #define EMIT_SHIFT_TO_INIT \
00082   if (data->__statep->__count != ASCII_set)                                 \
00083     {                                                                \
00084       if (FROM_DIRECTION)                                            \
00085        /* It's easy, we don't have to emit anything, we just reset the             \
00086           state for the input.  */                                   \
00087        data->__statep->__count = ASCII_set;                                 \
00088       else                                                           \
00089        {                                                             \
00090          /* We are not in the initial state.  To switch back we have        \
00091             to emit `SI'.  */                                               \
00092          if (__builtin_expect (outbuf == outend, 0))                        \
00093            /* We don't have enough room in the output buffer.  */           \
00094            status = __GCONV_FULL_OUTPUT;                             \
00095          else                                                        \
00096            {                                                         \
00097              /* Write out the shift sequence.  */                           \
00098              *outbuf++ = SI;                                                \
00099              data->__statep->__count = ASCII_set;                           \
00100            }                                                         \
00101        }                                                             \
00102     }
00103 
00104 
00105 /* Since we might have to reset the input pointer we must be able to save
00106    and restore the state.  With a nonzero SAVE argument the state word
         behind SETP is copied into SAVE_SET; with a zero argument it is
         restored from SAVE_SET.  Both variables come from PREPARE_LOOP.  */
00107 #define SAVE_RESET_STATE(Save) \
00108   if (Save)                                                          \
00109     save_set = *setp;                                                       \
00110   else                                                               \
00111     *setp = save_set
00112 
00113 
00114 /* First define the conversion function from ISO-2022-CN to UCS4.
         Each pass of BODY consumes one item of input: an announcement
         (ESC $ ) A, ESC $ ) G or ESC $ * H), an SO or SI shift, an
         SS2-prefixed CNS 11643 plane 2 character, or one character of the
         currently selected set.  Announcements and shifts only update
         `set' and `ann' and produce no output.  ESC $ * H is accepted but
         not recorded in `ann'.  */
00115 #define MIN_NEEDED_INPUT    FROM_LOOP_MIN_NEEDED_FROM
00116 #define MAX_NEEDED_INPUT    FROM_LOOP_MAX_NEEDED_FROM
00117 #define MIN_NEEDED_OUTPUT   FROM_LOOP_MIN_NEEDED_TO
00118 #define MAX_NEEDED_OUTPUT   FROM_LOOP_MAX_NEEDED_TO
00119 #define LOOPFCT                    FROM_LOOP
00120 #define BODY \
00121   {                                                                  \
00122     uint32_t ch = *inptr;                                            \
00123                                                                      \
00124     /* This is a 7bit character set, disallow all 8bit characters.  */             \
          /* NOTE(review): this test also rejects the byte 0x7f itself,      \
             whereas the UCS4 to ISO-2022-CN loop writes every ch < 0x80;    \
             confirm that excluding 0x7f here is intended.  */                \
00125     if (__builtin_expect (ch >= 0x7f, 0))                            \
00126       STANDARD_FROM_LOOP_ERR_HANDLER (1);                            \
00127                                                                      \
00128     /* Recognize escape sequences.  */                                      \
00129     if (__builtin_expect (ch, 0) == ESC)                             \
00130       {                                                                     \
00131        /* There are two kinds of escape sequences we have to handle:        \
00132           - those announcing the use of GB and CNS characters on the        \
00133             line; we only remember the last SO designation                  \
00134           - the initial byte of the SS2 sequence.                           \
00135        */                                                            \
             /* First make sure every byte inspected below is available;     \
                otherwise report incomplete input.  */                        \
00136        if (__builtin_expect (inptr + 2 > inend, 0)                          \
00137            || (inptr[1] == '$'                                              \
00138               && (__builtin_expect (inptr + 3 > inend, 0)                   \
00139                   || (inptr[2] == ')'                                       \
00140                      && __builtin_expect (inptr + 4 > inend, 0))            \
00141                   || (inptr[2] == '*'                                       \
00142                      && __builtin_expect (inptr + 4 > inend, 0))))          \
00143            || (inptr[1] == SS2_1                                     \
00144               && __builtin_expect (inptr + 4 > inend, 0)))                  \
00145          {                                                           \
00146            result = __GCONV_INCOMPLETE_INPUT;                               \
00147            break;                                                    \
00148          }                                                           \
00149        if (inptr[1] == '$'                                           \
00150            && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G'))     \
00151               || (inptr[2] == '*' && inptr[3] == 'H')))              \
00152          {                                                           \
00153            /* OK, we accept those character sets.  */                       \
00154            if (inptr[3] == 'A')                                      \
00155              ann = GB2312_ann;                                              \
00156            else if (inptr[3] == 'G')                                        \
00157              ann = CNS11643_1_ann;                                   \
00158            inptr += 4;                                                      \
00159            continue;                                                 \
00160          }                                                           \
00161       }                                                                     \
00162     else if (__builtin_expect (ch, 0) == SO)                                \
00163       {                                                                     \
00164        /* Switch to use GB2312 or CNS 11643 plane 1, depending on which      \
00165           SO designation came last.  The only problem is what to do with     \
00166           faulty input files where no designator came.                      \
00167           XXX For now I'll default to use GB2312.  If this is not the       \
00168           best behaviour (e.g., we should flag an error) let me know.  */    \
00169        ++inptr;                                                      \
00170        set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set;           \
00171        continue;                                                     \
00172       }                                                                     \
00173     else if (__builtin_expect (ch, 0) == SI)                                \
00174       {                                                                     \
00175        /* Switch to use ASCII.  */                                   \
00176        ++inptr;                                                      \
00177        set = ASCII_set;                                              \
00178        continue;                                                     \
00179       }                                                                     \
00180                                                                      \
00181     if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1)               \
00182       {                                                                     \
00183        /* This is a character from CNS 11643 plane 2.                       \
00184           XXX We could test here whether the use of this character          \
00185           set was announced.  */                                     \
00186        inptr += 2;                                                   \
00187        ch = cns11643l2_to_ucs4 (&inptr, 2, 0);                              \
00188        if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR)                \
00189          {                                                           \
                 /* Step back onto the ESC so the error covers ESC N.  */    \
00190            inptr -= 2;                                                      \
00191            STANDARD_FROM_LOOP_ERR_HANDLER (2);                              \
00192          }                                                           \
00193       }                                                                     \
00194     else if (set == ASCII_set)                                              \
00195       {                                                                     \
00196        /* Almost done, just advance the input pointer.  */                  \
00197        ++inptr;                                                      \
00198       }                                                                     \
00199     else                                                             \
00200       {                                                                     \
00201        /* That's pretty easy, we have dedicated functions for this.  */      \
00202        if (set == GB2312_set)                                               \
00203          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0);                    \
00204        else                                                          \
00205          {                                                           \
00206            assert (set == CNS11643_1_set);                                  \
00207            ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0);              \
00208          }                                                           \
00209                                                                      \
             /* A result of zero is reported as incomplete input.  */          \
00210        if (__builtin_expect (ch, 1) == 0)                            \
00211          {                                                           \
00212            result = __GCONV_INCOMPLETE_INPUT;                               \
00213            break;                                                    \
00214          }                                                           \
00215        else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR)           \
00216          {                                                           \
00217            STANDARD_FROM_LOOP_ERR_HANDLER (1);                              \
00218          }                                                           \
00219       }                                                                     \
00220                                                                      \
00221     put32 (outptr, ch);                                                     \
00222     outptr += 4;                                                     \
00223   }
00224 #define LOOP_NEED_FLAGS
      /* The loop gets the state word through SETP.  INIT_PARAMS splits it
         into the selected set and the announcement flags; UPDATE_PARAMS
         stores both back.  */
00225 #define EXTRA_LOOP_DECLS    , int *setp
00226 #define INIT_PARAMS         int set = *setp & CURRENT_SEL_MASK; \
00227                             int ann = *setp & CURRENT_ANN_MASK
00228 #define UPDATE_PARAMS              *setp = set | ann
00229 #include <iconv/loop.c>
00230 
00231 
00232 /* Next, define the other direction: the conversion function from UCS4
         to ISO-2022-CN.  */
00233 #define MIN_NEEDED_INPUT    TO_LOOP_MIN_NEEDED_FROM
00234 #define MAX_NEEDED_INPUT    TO_LOOP_MAX_NEEDED_FROM
00235 #define MIN_NEEDED_OUTPUT   TO_LOOP_MIN_NEEDED_TO
00236 #define MAX_NEEDED_OUTPUT   TO_LOOP_MAX_NEEDED_TO
00237 #define LOOPFCT                    TO_LOOP
/* Body of the UCS4 to ISO-2022-CN loop.  It uses INPTR, OUTPTR, OUTEND and
   RESULT of the enclosing loop and the locals SET (current SO/SI
   selection) and ANN (announcements already written on this line).

   An ASCII character is written as one byte, after an SI if an SO set is
   selected.  Any other character is looked up in GB2312, CNS 11643 plane 1
   and CNS 11643 plane 2 and written as two bytes, preceded by an ESC $
   announcement and by SO or SS2 (ESC N) where required.  Whenever the
   output buffer cannot take the next unit RESULT is set to
   __GCONV_FULL_OUTPUT and the loop is left with INPTR still pointing at
   the current character.  */
#define BODY \
  {                                                                         \
    /* Convert one UCS4 character.  INPTR is advanced only at the           \
       very end, so every `break' leaves it to be retried.  */              \
    uint32_t ch = get32 (inptr);                                            \
                                                                            \
    if (ch < 0x80)                                                          \
      {                                                                     \
        /* Plain ASCII.  Shift in first if an SO set is selected.           \
           SI and the state change are committed together, so a             \
           full buffer right after SI is harmless.  */                      \
        if (set != ASCII_set)                                               \
          {                                                                 \
            *outptr++ = SI;                                                 \
            set = ASCII_set;                                                \
            if (__builtin_expect (outptr == outend, 0))                     \
              {                                                             \
                result = __GCONV_FULL_OUTPUT;                               \
                break;                                                      \
              }                                                             \
          }                                                                 \
                                                                            \
        *outptr++ = ch;                                                     \
                                                                            \
        /* At the end of the line we have to clear the `ann' flags          \
           since every line must contain this information again.  */        \
        if (ch == L'\n')                                                    \
          ann = 0;                                                          \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        unsigned char buf[2];                                               \
        /* Fake initialization to keep gcc quiet.  */                       \
        asm ("" : "=m" (buf));                                              \
                                                                            \
        int used;                                                           \
        size_t written = 0;                                                 \
                                                                            \
        /* Start with GB2312 if it is selected or if CNS 11643 plane 1      \
           has not been announced; otherwise start with plane 1.  */        \
        if (set == GB2312_set || (ann & CNS11643_1_ann) == 0)               \
          {                                                                 \
            written = ucs4_to_gb2312 (ch, buf, 2);                          \
            used = GB2312_set;                                              \
          }                                                                 \
        else                                                                \
          {                                                                 \
            written = ucs4_to_cns11643l1 (ch, buf, 2);                      \
            used = CNS11643_1_set;                                          \
          }                                                                 \
                                                                            \
        if (written == __UNKNOWN_10646_CHAR)                                \
          {                                                                 \
            /* Cannot convert it using the currently selected SO set.       \
               Next try the SS2 set.  */                                    \
            written = ucs4_to_cns11643l2 (ch, buf, 2);                      \
            if (written != __UNKNOWN_10646_CHAR)                            \
              /* Yep, that worked.  */                                      \
              used = CNS11643_2_set;                                        \
            else                                                            \
              {                                                             \
                /* Well, see whether we have to change the SO set.  */      \
                if (used == GB2312_set)                                     \
                  written = ucs4_to_cns11643l1 (ch, buf, 2);                \
                else                                                        \
                  written = ucs4_to_gb2312 (ch, buf, 2);                    \
                                                                            \
                if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR)  \
                  /* Oh well, then switch SO.  */                           \
                  used = GB2312_set + CNS11643_1_set - used;                \
                else                                                        \
                  {                                                         \
                    UNICODE_TAG_HANDLER (ch, 4);                            \
                                                                            \
                    /* Even this does not work.  Error.  */                 \
                    STANDARD_TO_LOOP_ERR_HANDLER (4);                       \
                  }                                                         \
              }                                                             \
          }                                                                 \
        assert (written == 2);                                              \
                                                                            \
        /* SS2 is a single shift: it covers only the two bytes that         \
           follow and leaves the SO/SI state alone.  A plane 2              \
           character therefore always gets its own ESC N and never          \
           becomes the current set.  Otherwise a second plane 2             \
           character would be written without ESC N, and an SO set          \
           character after it would be written without SO.  */              \
        const int single_shift = used == CNS11643_2_set;                    \
                                                                            \
        /* First see whether we announced that we use this character        \
           set.  The announcement may go out on its own: `ann'              \
           records it, so a retry does not repeat it.  */                   \
        if ((single_shift || set != used)                                   \
            && (ann & (16 << (used >> 3))) == 0)                            \
          {                                                                 \
            const char *escseq;                                             \
                                                                            \
            if (__builtin_expect (outend - outptr < 4, 0))                  \
              {                                                             \
                result = __GCONV_FULL_OUTPUT;                               \
                break;                                                      \
              }                                                             \
                                                                            \
            assert ((used >> 3) >= 1 && (used >> 3) <= 3);                  \
            escseq = ")A)G*H" + ((used >> 3) - 1) * 2;                      \
            *outptr++ = ESC;                                                \
            *outptr++ = '$';                                                \
            *outptr++ = *escseq++;                                          \
            *outptr++ = *escseq++;                                          \
                                                                            \
            /* The two SO designations replace each other; the SS2          \
               designation is kept alongside them.  */                      \
            if (used == GB2312_set)                                         \
              ann = (ann & CNS11643_2_ann) | GB2312_ann;                    \
            else if (used == CNS11643_1_set)                                \
              ann = (ann & CNS11643_2_ann) | CNS11643_1_ann;                \
            else                                                            \
              ann |= CNS11643_2_ann;                                        \
          }                                                                 \
                                                                            \
        /* The shift and the character bytes are written as a unit:         \
           stopping in between would make the retry emit the shift          \
           a second time.  SO is only needed when ASCII is currently        \
           selected; otherwise we are switching within the SO set.  */      \
        const int nshift = single_shift ? 2 : (set == ASCII_set ? 1 : 0);   \
        if (__builtin_expect (outend - outptr < 2 + nshift, 0))             \
          {                                                                 \
            result = __GCONV_FULL_OUTPUT;                                   \
            break;                                                          \
          }                                                                 \
                                                                            \
        if (single_shift)                                                   \
          {                                                                 \
            *outptr++ = SS2_0;                                              \
            *outptr++ = SS2_1;                                              \
          }                                                                 \
        else if (set == ASCII_set)                                          \
          *outptr++ = SO;                                                   \
                                                                            \
        *outptr++ = buf[0];                                                 \
        *outptr++ = buf[1];                                                 \
        if (!single_shift)                                                  \
          set = used;                                                       \
      }                                                                     \
                                                                            \
    /* Now that we wrote the output increment the input pointer.  */        \
    inptr += 4;                                                             \
  }
00392 #define LOOP_NEED_FLAGS
      /* The loop gets the state word through SETP.  INIT_PARAMS splits it
         into the selected set and the announcement flags, REINIT_PARAMS
         reloads both from *SETP, and UPDATE_PARAMS stores them back.  */
00393 #define EXTRA_LOOP_DECLS    , int *setp
00394 #define INIT_PARAMS         int set = *setp & CURRENT_SEL_MASK; \
00395                             int ann = *setp & CURRENT_ANN_MASK
00396 #define REINIT_PARAMS              do                                       \
00397                               {                                      \
00398                                 set = *setp & CURRENT_SEL_MASK;             \
00399                                 ann = *setp & CURRENT_ANN_MASK;             \
00400                               }                                      \
00401                             while (0)
00402 #define UPDATE_PARAMS              *setp = set | ann
00403 #include <iconv/loop.c>
00404 
00405 
00406 /* Now define the toplevel functions.  */
00407 #include <iconv/skeleton.c>