Back to index

glibc  2.9
euc-jisx0213.c
Go to the documentation of this file.
00001 /* Conversion from and to EUC-JISX0213.
00002    Copyright (C) 2002, 2004 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Bruno Haible <bruno@clisp.org>, 2002.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <dlfcn.h>
00022 #include <stdint.h>
00023 #include <gconv.h>
00024 
00025 /* The structure of EUC-JISX0213 is as follows:
00026 
00027    0x00..0x7F: ASCII
00028 
00029    0x8E{A1..DF}: JISX0201 Katakana, with prefix 0x8E, offset by +0x80.
00030 
00031    0x8F{A1..FE}{A1..FE}: JISX0213 plane 2, with prefix 0x8F, offset by +0x8080.
00032 
00033    0x{A1..FE}{A1..FE}: JISX0213 plane 1, offset by +0x8080.
00034 
00035    Note that some JISX0213 characters are not contained in Unicode 3.2
00036    and are therefore best represented as sequences of Unicode characters.
00037 */
00038 
00039 #include "jisx0213.h"
00040 
00041 /* Definitions used in the body of the `gconv' function.
         The *_NEEDED_* values are byte counts for one character.  The
         EUC-JISX0213 side uses between 1 byte (ASCII) and 3 bytes (0x8F
         prefix plus two bytes); the UCS-4 side uses 4 bytes.  The FROM loop
         can need 8 output bytes because one EUC-JISX0213 character may
         expand to two UCS-4 characters.  */
00042 #define CHARSET_NAME        "EUC-JISX0213//"
00043 #define FROM_LOOP           from_euc_jisx0213
00044 #define TO_LOOP                    to_euc_jisx0213
00045 #define DEFINE_INIT         1
00046 #define DEFINE_FINI         1
00047 #define FROM_LOOP_MIN_NEEDED_FROM  1  /* One ASCII byte.  */
00048 #define FROM_LOOP_MAX_NEEDED_FROM  3  /* 0x8F plus two bytes.  */
00049 #define FROM_LOOP_MIN_NEEDED_TO           4  /* One UCS-4 character.  */
00050 #define FROM_LOOP_MAX_NEEDED_TO           8  /* Two UCS-4 characters.  */
00051 #define TO_LOOP_MIN_NEEDED_FROM           4  /* One UCS-4 character.  */
00052 #define TO_LOOP_MAX_NEEDED_FROM           4
00053 #define TO_LOOP_MIN_NEEDED_TO             1  /* One ASCII byte.  */
00054 #define TO_LOOP_MAX_NEEDED_TO             3  /* 0x8F plus two bytes.  */
/* Locals for the conversion function: saved_state is the copy written and
   read by SAVE_RESET_STATE, statep points at the __count member of the
   conversion state, which both loops use to buffer one character.  */
00055 #define PREPARE_LOOP \
00056   int saved_state;                                                   \
00057   int *statep = &data->__statep->__count;
/* Presumably appended to the argument list of each loop function call; it
   pairs with the `, int *statep' in EXTRA_LOOP_DECLS.  */
00058 #define EXTRA_LOOP_ARGS            , statep
00059 
00060 
00061 /* Since we might have to reset input pointer we must be able to save
00062    and restore the state.
         With a nonzero SAVE argument the current *statep is copied into
         saved_state; with a zero argument *statep is restored from
         saved_state.  Both variables are declared by PREPARE_LOOP.  The
         expansion is an if/else without a trailing semicolon, so the
         caller supplies the `;'.  */
00063 #define SAVE_RESET_STATE(Save) \
00064   if (Save)                                                          \
00065     saved_state = *statep;                                           \
00066   else                                                               \
00067     *statep = saved_state
00068 
00069 
00070 /* During EUC-JISX0213 to UCS-4 conversion, the COUNT element of the state
00071    contains the last UCS-4 character, shifted by 3 bits.  This is the
         second character of a two-character expansion that did not fit
         into the output buffer.
00072    During UCS-4 to EUC-JISX0213 conversion, the COUNT element of the state
00073    contains the last two bytes to be output, shifted by 3 bits.  These
         are held back because a following combining character might turn
         them into a different, precomposed character.  */
00074 
00075 /* Since this is a stateful encoding we have to provide code which resets
00076    the output state to the initial state.  This has to be done during the
00077    flushing.

         If __count is nonzero the buffered character is written to outbuf
         and __count is set to 0: in FROM_DIRECTION as one 4-byte UCS-4
         value, otherwise as two bytes, high byte first.  If outbuf does
         not have enough room, status is set to __GCONV_FULL_OUTPUT and the
         state is left untouched.

         NOTE(review): the test below looks at the whole __count, while
         the conversion loops only read `__count >> 3'.  Confirm that the
         low three bits are always zero when this macro runs.  */
00078 #define EMIT_SHIFT_TO_INIT \
00079   if (data->__statep->__count != 0)                                         \
00080     {                                                                \
00081       if (FROM_DIRECTION)                                            \
00082        {                                                             \
00083          if (__builtin_expect (outbuf + 4 <= outend, 1))                    \
00084            {                                                         \
00085              /* Write out the last character.  */                           \
00086              *((uint32_t *) outbuf) = data->__statep->__count >> 3;         \
00087              outbuf += sizeof (uint32_t);                            \
00088              data->__statep->__count = 0;                            \
00089            }                                                         \
00090          else                                                        \
00091            /* We don't have enough room in the output buffer.  */           \
00092            status = __GCONV_FULL_OUTPUT;                             \
00093        }                                                             \
00094       else                                                           \
00095        {                                                             \
00096          if (__builtin_expect (outbuf + 2 <= outend, 1))                    \
00097            {                                                         \
00098              /* Write out the last character.  */                           \
00099              uint32_t lasttwo = data->__statep->__count >> 3;               \
00100              *outbuf++ = (lasttwo >> 8) & 0xff;                      \
00101              *outbuf++ = lasttwo & 0xff;                             \
00102              data->__statep->__count = 0;                            \
00103            }                                                         \
00104          else                                                        \
00105            /* We don't have enough room in the output buffer.  */           \
00106            status = __GCONV_FULL_OUTPUT;                             \
00107        }                                                             \
00108     }
00109 
00110 
/* First define the conversion function from EUC-JISX0213 to UCS-4.

   State: `*statep >> 3' is a UCS-4 character that still has to be written.
   It is set when an input character expands to two UCS-4 characters and
   only the first one fitted into the output buffer.

   Each iteration of BODY writes exactly one UCS-4 character through the
   final put32, except for the two-character expansion, which writes both
   characters itself and leaves via `continue' or `break'.

   Input handling when nothing is buffered:
   - bytes below 0x80 are ASCII and map to themselves;
   - 0x8E followed by 0xA1..0xDF is half-width katakana (byte + 0xFEC0);
   - 0x8F followed by two bytes is looked up as JISX 0213 plane 2;
   - 0xA1..0xFE followed by one byte is looked up as JISX 0213 plane 1;
   - anything else, or a second byte outside 0xA1..0xFE, is handed to
     STANDARD_FROM_LOOP_ERR_HANDLER.
   If a needed byte lies beyond inend, the iteration stops with
   __GCONV_INCOMPLETE_INPUT and consumes nothing.  */
#define MIN_NEEDED_INPUT    FROM_LOOP_MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT    FROM_LOOP_MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT   FROM_LOOP_MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT   FROM_LOOP_MAX_NEEDED_TO
#define LOOPFCT             FROM_LOOP
#define BODY \
  {                                                                           \
    uint32_t ch;                                                              \
                                                                              \
    /* Determine whether there is a buffered character pending.  */           \
    ch = *statep >> 3;                                                        \
    if (__builtin_expect (ch == 0, 1))                                        \
      {                                                                       \
        /* No - so look at the next input byte.  */                           \
        ch = *inptr;                                                          \
                                                                              \
        if (ch < 0x80)                                                        \
          /* Plain ASCII character.  */                                       \
          ++inptr;                                                            \
        else if ((ch >= 0xa1 && ch <= 0xfe) || ch == 0x8e || ch == 0x8f)      \
          {                                                                   \
            /* Two or three byte character.  */                               \
            uint32_t ch2;                                                     \
                                                                              \
            if (__builtin_expect (inptr + 1 >= inend, 0))                     \
              {                                                               \
                /* The second byte is not available.  */                      \
                result = __GCONV_INCOMPLETE_INPUT;                            \
                break;                                                        \
              }                                                               \
                                                                              \
            ch2 = inptr[1];                                                   \
                                                                              \
            /* The second byte must be >= 0xa1 and <= 0xfe.  */               \
            if (__builtin_expect (ch2 < 0xa1 || ch2 > 0xfe, 0))               \
              {                                                               \
                /* This is an illegal character.  */                          \
                STANDARD_FROM_LOOP_ERR_HANDLER (1);                           \
              }                                                               \
                                                                              \
            if (ch == 0x8e)                                                   \
              {                                                               \
                /* Half-width katakana.  Only 0xa1..0xdf are assigned.  */    \
                if (__builtin_expect (ch2 > 0xdf, 0))                         \
                  STANDARD_FROM_LOOP_ERR_HANDLER (1);                         \
                                                                              \
                ch = ch2 + 0xfec0;                                            \
                inptr += 2;                                                   \
              }                                                               \
            else                                                              \
              {                                                               \
                /* End of the character; the input pointer is only moved      \
                   there once the lookup has succeeded.  */                   \
                const unsigned char *endp;                                    \
                                                                              \
                if (ch == 0x8f)                                               \
                  {                                                           \
                    /* JISX 0213 plane 2.  */                                 \
                    uint32_t ch3;                                             \
                                                                              \
                    if (__builtin_expect (inptr + 2 >= inend, 0))             \
                      {                                                       \
                        /* The third byte is not available.  */               \
                        result = __GCONV_INCOMPLETE_INPUT;                    \
                        break;                                                \
                      }                                                       \
                                                                              \
                    ch3 = inptr[2];                                           \
                    endp = inptr + 3;                                         \
                                                                              \
                    ch = jisx0213_to_ucs4 (0x200 - 0x80 + ch2, ch3 ^ 0x80);   \
                  }                                                           \
                else                                                          \
                  {                                                           \
                    /* JISX 0213 plane 1.  */                                 \
                    endp = inptr + 2;                                         \
                                                                              \
                    ch = jisx0213_to_ucs4 (0x100 - 0x80 + ch, ch2 ^ 0x80);    \
                  }                                                           \
                                                                              \
                if (ch == 0)                                                  \
                  /* This is an illegal character.  */                        \
                  STANDARD_FROM_LOOP_ERR_HANDLER (1);                         \
                                                                              \
                inptr = endp;                                                 \
                                                                              \
                if (ch < 0x80)                                                \
                  {                                                           \
                    /* It's a combining character.  The lookup result is      \
                       a 1-based index into the table of two-character        \
                       expansions.  */                                        \
                    uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];     \
                    uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];     \
                                                                              \
                    put32 (outptr, u1);                                       \
                    outptr += 4;                                              \
                                                                              \
                    /* See whether we have room for two characters.  */       \
                    if (outptr + 4 <= outend)                                 \
                      {                                                       \
                        put32 (outptr, u2);                                   \
                        outptr += 4;                                          \
                        continue;                                             \
                      }                                                       \
                                                                              \
                    /* Otherwise store only the first character now, and      \
                       put the second one into the queue.  */                 \
                    *statep = u2 << 3;                                        \
                    /* Tell the caller why we terminate the loop.  */         \
                    result = __GCONV_FULL_OUTPUT;                             \
                    break;                                                    \
                  }                                                           \
              }                                                               \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            /* This is illegal.  */                                           \
            STANDARD_FROM_LOOP_ERR_HANDLER (1);                               \
          }                                                                   \
      }                                                                       \
    else                                                                      \
      /* Yes - it is written just below, so remove it from the state.         \
         Without this the same character would be written again on every      \
         following iteration, because no input is consumed here.  Only the    \
         bits read by `>> 3' are cleared; the low three bits are kept.  */    \
      *statep &= 7;                                                           \
                                                                              \
    put32 (outptr, ch);                                                       \
    outptr += 4;                                                              \
  }
#define LOOP_NEED_FLAGS
/* The loop function receives the pointer to the buffered-character state.  */
#define EXTRA_LOOP_DECLS    , int *statep
/* Single-byte shortcut: only ASCII bytes map to a character on their own;
   every other byte yields WEOF.  */
#define ONEBYTE_BODY \
  {                                                                           \
    if (c < 0x80)                                                             \
      return c;                                                               \
    else                                                                      \
      return WEOF;                                                            \
  }
00241 #include <iconv/loop.c>
00242 
00243 
00244 /* Next, define the other direction, from UCS-4 to EUC-JISX0213.  */
00245 
00246 /* Composition tables for each of the relevant combining characters.
         Each entry pairs the two-byte EUC code of a base character (BASE)
         with the two-byte EUC code of the precomposed character (COMPOSED)
         that replaces it when the combining character follows.  Entries
         are grouped by combining character: COMP_TABLE_IDX_xxxx is the
         index of the first entry for U+xxxx and COMP_TABLE_LEN_xxxx is the
         number of entries in that group.  Each IDX is derived from the
         previous group's IDX and LEN, so the groups must stay contiguous
         and in this order.  */
00247 static const struct
00248 {
00249   uint16_t base;
00250   uint16_t composed;
00251 } comp_table_data[] =
00252 {
00253 #define COMP_TABLE_IDX_02E5 0
00254 #define COMP_TABLE_LEN_02E5 1
00255   { 0xabe4, 0xabe5 }, /* 0x12B65 = 0x12B64 U+02E5 */
00256 #define COMP_TABLE_IDX_02E9 (COMP_TABLE_IDX_02E5 + COMP_TABLE_LEN_02E5)
00257 #define COMP_TABLE_LEN_02E9 1
00258   { 0xabe0, 0xabe6 }, /* 0x12B66 = 0x12B60 U+02E9 */
00259 #define COMP_TABLE_IDX_0300 (COMP_TABLE_IDX_02E9 + COMP_TABLE_LEN_02E9)
00260 #define COMP_TABLE_LEN_0300 5
00261   { 0xa9dc, 0xabc4 }, /* 0x12B44 = 0x1295C U+0300 */
00262   { 0xabb8, 0xabc8 }, /* 0x12B48 = 0x12B38 U+0300 */
00263   { 0xabb7, 0xabca }, /* 0x12B4A = 0x12B37 U+0300 */
00264   { 0xabb0, 0xabcc }, /* 0x12B4C = 0x12B30 U+0300 */
00265   { 0xabc3, 0xabce }, /* 0x12B4E = 0x12B43 U+0300 */
00266 #define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300)
00267 #define COMP_TABLE_LEN_0301 4
00268   { 0xabb8, 0xabc9 }, /* 0x12B49 = 0x12B38 U+0301 */
00269   { 0xabb7, 0xabcb }, /* 0x12B4B = 0x12B37 U+0301 */
00270   { 0xabb0, 0xabcd }, /* 0x12B4D = 0x12B30 U+0301 */
00271   { 0xabc3, 0xabcf }, /* 0x12B4F = 0x12B43 U+0301 */
00272 #define COMP_TABLE_IDX_309A (COMP_TABLE_IDX_0301 + COMP_TABLE_LEN_0301)
00273 #define COMP_TABLE_LEN_309A 14
00274   { 0xa4ab, 0xa4f7 }, /* 0x12477 = 0x1242B U+309A */
00275   { 0xa4ad, 0xa4f8 }, /* 0x12478 = 0x1242D U+309A */
00276   { 0xa4af, 0xa4f9 }, /* 0x12479 = 0x1242F U+309A */
00277   { 0xa4b1, 0xa4fa }, /* 0x1247A = 0x12431 U+309A */
00278   { 0xa4b3, 0xa4fb }, /* 0x1247B = 0x12433 U+309A */
00279   { 0xa5ab, 0xa5f7 }, /* 0x12577 = 0x1252B U+309A */
00280   { 0xa5ad, 0xa5f8 }, /* 0x12578 = 0x1252D U+309A */
00281   { 0xa5af, 0xa5f9 }, /* 0x12579 = 0x1252F U+309A */
00282   { 0xa5b1, 0xa5fa }, /* 0x1257A = 0x12531 U+309A */
00283   { 0xa5b3, 0xa5fb }, /* 0x1257B = 0x12533 U+309A */
00284   { 0xa5bb, 0xa5fc }, /* 0x1257C = 0x1253B U+309A */
00285   { 0xa5c4, 0xa5fd }, /* 0x1257D = 0x12544 U+309A */
00286   { 0xa5c8, 0xa5fe }, /* 0x1257E = 0x12548 U+309A */
00287   { 0xa6f5, 0xa6f8 }, /* 0x12678 = 0x12675 U+309A */
00288 };
00289 
/* Loop body for the UCS-4 to EUC-JISX0213 direction.

   State: `*statep >> 3' is the two-byte EUC code of a plane 1 character
   that was held back because it is a possible base in comp_table_data
   (ucs4_to_jisx0213 marks such results with bit 0x0080).

   Per iteration:
   - If a character is buffered and the current input is one of the
     combining characters U+02E5, U+02E9, U+0300, U+0301 or U+309A, the
     group of comp_table_data belonging to that combining character is
     searched for the buffered base.  On a hit the composed two-byte code
     is written, the state is cleared and the input is consumed.  In every
     other case the buffered code is written unchanged, the state is
     cleared and the current input is left for the next iteration.
   - With nothing buffered: ASCII is written as one byte; U+FF61..U+FF9F
     is written as 0x8E followed by `ch - 0xfec0'; everything else goes
     through ucs4_to_jisx0213 and is written as two bytes for plane 1, or
     as 0x8F plus two bytes for plane 2 (result has bit 0x8000 set).
     A zero result is handed to UNICODE_TAG_HANDLER and then to
     STANDARD_TO_LOOP_ERR_HANDLER.

   Each `outptr + N >= outend' test ends the iteration with
   __GCONV_FULL_OUTPUT when fewer than N + 1 output bytes remain; nothing
   has been consumed or written at that point.  */
00290 #define MIN_NEEDED_INPUT    TO_LOOP_MIN_NEEDED_FROM
00291 #define MAX_NEEDED_INPUT    TO_LOOP_MAX_NEEDED_FROM
00292 #define MIN_NEEDED_OUTPUT   TO_LOOP_MIN_NEEDED_TO
00293 #define MAX_NEEDED_OUTPUT   TO_LOOP_MAX_NEEDED_TO
00294 #define LOOPFCT                    TO_LOOP
00295 #define BODY \
00296   {                                                                  \
00297     uint32_t ch = get32 (inptr);                                     \
00298                                                                      \
00299     if ((*statep >> 3) != 0)                                                \
00300       {                                                                     \
00301        /* Attempt to combine the last character with this one.  */          \
00302        uint16_t lasttwo = *statep >> 3;                              \
00303        unsigned int idx;                                             \
00304        unsigned int len;                                             \
00305                                                                      \
00306        if (ch == 0x02e5)                                             \
00307          idx = COMP_TABLE_IDX_02E5, len = COMP_TABLE_LEN_02E5;              \
00308        else if (ch == 0x02e9)                                               \
00309          idx = COMP_TABLE_IDX_02E9, len = COMP_TABLE_LEN_02E9;              \
00310        else if (ch == 0x0300)                                               \
00311          idx = COMP_TABLE_IDX_0300, len = COMP_TABLE_LEN_0300;              \
00312        else if (ch == 0x0301)                                               \
00313          idx = COMP_TABLE_IDX_0301, len = COMP_TABLE_LEN_0301;              \
00314        else if (ch == 0x309a)                                               \
00315          idx = COMP_TABLE_IDX_309A, len = COMP_TABLE_LEN_309A;              \
00316        else                                                          \
00317          goto not_combining;                                                \
00318                                                                      \
00319        do                                                            \
00320          if (comp_table_data[idx].base == lasttwo)                          \
00321            break;                                                    \
00322        while (++idx, --len > 0);                                     \
00323                                                                      \
00324        if (len > 0)                                                  \
00325          {                                                           \
00326            /* Output the combined character.  */                     \
00327            if (__builtin_expect (outptr + 1 >= outend, 0))                  \
00328              {                                                              \
00329               result = __GCONV_FULL_OUTPUT;                                 \
00330               break;                                                 \
00331              }                                                              \
00332            lasttwo = comp_table_data[idx].composed;                         \
00333            *outptr++ = (lasttwo >> 8) & 0xff;                               \
00334            *outptr++ = lasttwo & 0xff;                                      \
00335            *statep = 0;                                              \
00336            inptr += 4;                                                      \
00337            continue;                                                 \
00338          }                                                           \
00339                                                                      \
00340       not_combining:                                                 \
00341        /* Output the buffered character.  */                                \
00342        if (__builtin_expect (outptr + 1 >= outend, 0))                      \
00343          {                                                           \
00344            result = __GCONV_FULL_OUTPUT;                             \
00345            break;                                                    \
00346          }                                                           \
00347        *outptr++ = (lasttwo >> 8) & 0xff;                            \
00348        *outptr++ = lasttwo & 0xff;                                   \
00349        *statep = 0;                                                  \
00350        continue;                                                     \
00351       }                                                                     \
00352                                                                      \
00353     if (ch < 0x80)                                                   \
00354       /* Plain ASCII character.  */                                         \
00355       *outptr++ = ch;                                                       \
00356     else if (ch >= 0xff61 && ch <= 0xff9f)                                  \
00357       {                                                                     \
00358        /* Half-width katakana.  */                                   \
00359        if (__builtin_expect (outptr + 1 >= outend, 0))                      \
00360          {                                                           \
00361            result = __GCONV_FULL_OUTPUT;                             \
00362            break;                                                    \
00363          }                                                           \
00364        *outptr++ = 0x8e;                                             \
00365        *outptr++ = ch - 0xfec0;                                      \
00366       }                                                                     \
00367     else                                                             \
00368       {                                                                     \
00369        uint32_t jch = ucs4_to_jisx0213 (ch);                                \
00370        if (jch == 0)                                                 \
00371          {                                                           \
00372            UNICODE_TAG_HANDLER (ch, 4);                              \
00373                                                                      \
00374            /* Illegal character.  */                                        \
00375            STANDARD_TO_LOOP_ERR_HANDLER (4);                                \
00376          }                                                           \
00377                                                                      \
00378        if (jch & 0x0080)                                             \
00379          {                                                           \
00380            /* A possible match in comp_table_data.  We have to buffer it.  */\
00381                                                                      \
00382            /* We know it's a JISX 0213 plane 1 character.  */               \
00383            assert ((jch & 0x8000) == 0);                             \
00384                                                                      \
00385            *statep = (jch | 0x8080) << 3;                            \
00386            inptr += 4;                                                      \
00387            continue;                                                 \
00388          }                                                           \
00389                                                                      \
00390        if (jch & 0x8000)                                             \
00391          {                                                           \
00392            /* JISX 0213 plane 2.  */                                        \
00393            if (__builtin_expect (outptr + 2 >= outend, 0))                  \
00394              {                                                              \
00395               result = __GCONV_FULL_OUTPUT;                                 \
00396               break;                                                 \
00397              }                                                              \
00398            *outptr++ = 0x8f;                                                \
00399          }                                                           \
00400        else                                                          \
00401          {                                                           \
00402            /* JISX 0213 plane 1.  */                                        \
00403            if (__builtin_expect (outptr + 1 >= outend, 0))                  \
00404              {                                                              \
00405               result = __GCONV_FULL_OUTPUT;                                 \
00406               break;                                                 \
00407              }                                                              \
00408          }                                                           \
00409        *outptr++ = (jch >> 8) | 0x80;                                       \
00410        *outptr++ = (jch & 0xff) | 0x80;                              \
00411       }                                                                     \
00412                                                                      \
00413     inptr += 4;                                                             \
00414   }
00415 #define LOOP_NEED_FLAGS
00416 #define EXTRA_LOOP_DECLS    , int *statep
00417 #include <iconv/loop.c>
00418 
00419 
00420 /* Now define the toplevel functions.  */
00421 #include <iconv/skeleton.c>