Back to index

glibc  2.9
shift_jisx0213.c
Go to the documentation of this file.
00001 /* Conversion from and to Shift_JISX0213.
00002    Copyright (C) 2002, 2004 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Bruno Haible <bruno@clisp.org>, 2002.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #include <dlfcn.h>
00022 #include <stdint.h>
00023 #include <gconv.h>
00024 
00025 /* The structure of Shift_JISX0213 is as follows:
00026 
00027    0x00..0x7F: ISO646-JP, an ASCII variant
00028 
00029    0x{A1..DF}: JISX0201 Katakana.
00030 
00031    0x{81..9F,E0..EF}{40..7E,80..FC}: JISX0213 plane 1.
00032 
00033    0x{F0..FC}{40..7E,80..FC}: JISX0213 plane 2, with irregular row mapping.
00034 
00035    Note that some JISX0213 characters are not contained in Unicode 3.2
00036    and are therefore best represented as sequences of Unicode characters.
00037 */
00038 
00039 #include "jisx0213.h"
00040 
00041 /* Definitions used in the body of the `gconv' function.  */
00042 #define CHARSET_NAME        "SHIFT_JISX0213//"
00043 #define FROM_LOOP           from_shift_jisx0213
00044 #define TO_LOOP                    to_shift_jisx0213
00045 #define DEFINE_INIT         1
00046 #define DEFINE_FINI         1
00047 #define FROM_LOOP_MIN_NEEDED_FROM  1 /* Single-byte character.  */
00048 #define FROM_LOOP_MAX_NEEDED_FROM  2 /* Two-byte character.  */
00049 #define FROM_LOOP_MIN_NEEDED_TO           4 /* One UCS-4 value.  */
00050 #define FROM_LOOP_MAX_NEEDED_TO           8 /* Two UCS-4 values (combining pair).  */
00051 #define TO_LOOP_MIN_NEEDED_FROM           4 /* One UCS-4 value.  */
00052 #define TO_LOOP_MAX_NEEDED_FROM           4
00053 #define TO_LOOP_MIN_NEEDED_TO             1 /* Single-byte character.  */
00054 #define TO_LOOP_MAX_NEEDED_TO             2 /* Two-byte character.  */
00055 #define PREPARE_LOOP \
00056   int saved_state; /* Holds the copy made by SAVE_RESET_STATE.  */   \
00057   int *statep = &data->__statep->__count;
00058 #define EXTRA_LOOP_ARGS            , statep
00059 
00060 
00061 /* Since we might have to reset the input pointer we must be able to save
00062    (SAVE nonzero) and restore (SAVE zero) the conversion state.  */
00063 #define SAVE_RESET_STATE(Save) \
00064   if (Save)                                                          \
00065     saved_state = *statep;                                           \
00066   else                                                               \
00067     *statep = saved_state
00068 
00069 
00070 /* During Shift_JISX0213 to UCS-4 conversion, the COUNT element of the state
00071    contains the queued UCS-4 character, shifted left by 3 bits.
00072    During UCS-4 to Shift_JISX0213 conversion, the COUNT element of the state
00073    contains the buffered two output bytes, shifted left by 3 bits.  */
00074 
00075 /* Since this is a stateful encoding we have to provide code which resets
00076    the output state to the initial state: the pending item is written out
00077    and COUNT is zeroed.  This has to be done during the flushing.  */
00078 #define EMIT_SHIFT_TO_INIT \
00079   if (data->__statep->__count != 0)                                         \
00080     {                                                                \
00081       if (FROM_DIRECTION)                                            \
00082        {                                                             \
00083          if (__builtin_expect (outbuf + 4 <= outend, 1))                    \
00084            {                                                         \
00085              /* Write out the queued UCS-4 character.  */                   \
00086              *((uint32_t *) outbuf) = data->__statep->__count >> 3;         \
00087              outbuf += sizeof (uint32_t);                            \
00088              data->__statep->__count = 0;                            \
00089            }                                                         \
00090          else                                                        \
00091            /* We don't have enough room in the output buffer.  */           \
00092            status = __GCONV_FULL_OUTPUT;                             \
00093        }                                                             \
00094       else                                                           \
00095        {                                                             \
00096          if (__builtin_expect (outbuf + 2 <= outend, 1))                    \
00097            {                                                         \
00098              /* Write out the buffered bytes, high byte first.  */          \
00099              uint32_t lasttwo = data->__statep->__count >> 3;               \
00100              *outbuf++ = (lasttwo >> 8) & 0xff;                      \
00101              *outbuf++ = lasttwo & 0xff;                             \
00102              data->__statep->__count = 0;                            \
00103            }                                                         \
00104          else                                                        \
00105            /* We don't have enough room in the output buffer.  */           \
00106            status = __GCONV_FULL_OUTPUT;                             \
00107        }                                                             \
00108     }
00109 
00110 /* First define the conversion function from Shift_JISX0213 to UCS-4.  One
00111    input character may yield two UCS-4 values; the second can be queued.  */
00112 #define MIN_NEEDED_INPUT    FROM_LOOP_MIN_NEEDED_FROM
00113 #define MAX_NEEDED_INPUT    FROM_LOOP_MAX_NEEDED_FROM
00114 #define MIN_NEEDED_OUTPUT   FROM_LOOP_MIN_NEEDED_TO
00115 #define MAX_NEEDED_OUTPUT   FROM_LOOP_MAX_NEEDED_TO
00116 #define LOOPFCT             FROM_LOOP
00117 #define BODY \
00118   {                                                                         \
00119     uint32_t ch;                                                            \
00120                                                                             \
00121     /* Determine whether there is a buffered character pending.  */         \
00122     ch = *statep >> 3;                                                      \
00123     if (__builtin_expect (ch == 0, 1))                                      \
00124       {                                                                     \
00125         /* No - so look at the next input byte.  */                         \
00126         ch = *inptr;                                                        \
00127                                                                             \
00128         if (ch < 0x80)                                                      \
00129           {                                                                 \
00130             /* Plain ISO646-JP character.  */                               \
00131             if (__builtin_expect (ch == 0x5c, 0))                           \
00132               ch = 0xa5;                                                    \
00133             else if (__builtin_expect (ch == 0x7e, 0))                      \
00134               ch = 0x203e;                                                  \
00135             ++inptr;                                                        \
00136           }                                                                 \
00137         else if (ch >= 0xa1 && ch <= 0xdf)                                  \
00138           {                                                                 \
00139             /* Half-width katakana.  */                                     \
00140             ch += 0xfec0;                                                   \
00141             ++inptr;                                                        \
00142           }                                                                 \
00143         else if ((ch >= 0x81 && ch <= 0x9f) || (ch >= 0xe0 && ch <= 0xfc)) \
00144           {                                                                 \
00145             /* Two byte character.  */                                      \
00146             uint32_t ch2;                                                   \
00147                                                                             \
00148             if (__builtin_expect (inptr + 1 >= inend, 0))                   \
00149               {                                                             \
00150                 /* The second byte is not available.  */                    \
00151                 result = __GCONV_INCOMPLETE_INPUT;                          \
00152                 break;                                                      \
00153               }                                                             \
00154                                                                             \
00155             ch2 = inptr[1];                                                 \
00156                                                                             \
00157             /* The second byte must be in the range 0x{40..7E,80..FC}.  */  \
00158             if (__builtin_expect (ch2 < 0x40 || ch2 == 0x7f || ch2 > 0xfc, 0)) \
00159               {                                                             \
00160                 /* This is an illegal character.  */                        \
00161                 STANDARD_FROM_LOOP_ERR_HANDLER (1);                         \
00162               }                                                             \
00163                                                                             \
00164             /* Convert to row and column.  */                               \
00165             if (ch < 0xe0)                                                  \
00166               ch -= 0x81;                                                   \
00167             else                                                            \
00168               ch -= 0xc1;                                                   \
00169             if (ch2 < 0x80)                                                 \
00170               ch2 -= 0x40;                                                  \
00171             else                                                            \
00172               ch2 -= 0x41;                                                  \
00173             /* Now 0 <= ch <= 0x3b, 0 <= ch2 <= 0xbb.  */                   \
00174             ch = 2 * ch;                                                    \
00175             if (ch2 >= 0x5e)                                                \
00176               ch2 -= 0x5e, ch++;                                            \
00177             ch2 += 0x21;                                                    \
00178             if (ch >= 0x5e)                                                 \
00179               {                                                             \
00180                 /* Handling of JISX 0213 plane 2 rows.  */                  \
00181                 if (ch >= 0x67)                                             \
00182                   ch += 230;                                                \
00183                 else if (ch >= 0x63 || ch == 0x5f)                          \
00184                   ch += 168;                                                \
00185                 else                                                        \
00186                   ch += 162;                                                \
00187               }                                                             \
00188                                                                             \
00189             ch = jisx0213_to_ucs4 (0x121 + ch, ch2);                        \
00190                                                                             \
00191             if (ch == 0)                                                    \
00192               {                                                             \
00193                 /* This is an illegal character.  */                        \
00194                 STANDARD_FROM_LOOP_ERR_HANDLER (1);                         \
00195               }                                                             \
00196                                                                             \
00197             inptr += 2;                                                     \
00198                                                                             \
00199             if (ch < 0x80)                                                  \
00200               {                                                             \
00201                 /* It's a combining character.  */                          \
00202                 uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0];       \
00203                 uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1];       \
00204                 put32 (outptr, u1);                                         \
00205                 outptr += 4;                                                \
00206                 /* See whether we have room for two characters.  */         \
00207                 if (outptr + 4 <= outend)                                   \
00208                   {                                                         \
00209                     put32 (outptr, u2);                                     \
00210                     outptr += 4;                                            \
00211                     continue;                                               \
00212                   }                                                         \
00213                 /* Otherwise store only the first character now, and        \
00214                    put the second one into the queue.  */                   \
00215                 *statep = u2 << 3;                                          \
00216                 /* Tell the caller why we terminate the loop.  */           \
00217                 result = __GCONV_FULL_OUTPUT;                               \
00218                 break;                                                      \
00219               }                                                             \
00220           }                                                                 \
00221         else                                                                \
00222           {                                                                 \
00223             /* This is illegal.  */                                         \
00224             STANDARD_FROM_LOOP_ERR_HANDLER (1);                             \
00225           }                                                                 \
00226       }                                                                     \
00227     else                                                                    \
00228       /* Clear the queue and proceed to output the saved character.  */     \
00229       *statep = 0;                                                          \
00230                                                                             \
00231     put32 (outptr, ch);                                                     \
00232     outptr += 4;                                                            \
00233   }
00234 #define LOOP_NEED_FLAGS
00235 #define EXTRA_LOOP_DECLS    , int *statep
00236 #define ONEBYTE_BODY \
00237   {                                                                         \
00238     /* Bytes outside 0x00..0x7F and 0xA1..0xDF have no one-byte form.  */   \
00239     if (c >= 0x80 && (c < 0xa1 || c > 0xdf))                                \
00240       return WEOF;                                                          \
00241     if (c >= 0x80)                                                          \
00242       return 0xfec0 + c;                                                    \
00243     /* ISO646-JP: yen sign and overline replace backslash and tilde.  */    \
00244     if (c == 0x5c)                                                          \
00245       return 0xa5;                                                          \
00246     if (c == 0x7e)                                                          \
00247       return 0x203e;                                                        \
00248     return c;                                                               \
00249   }
00250 #include <iconv/loop.c>
00251 
00252 
00253 /* Next, define the other direction, from UCS-4 to Shift_JISX0213.  */
00254 
00255 /* Composition tables, one (IDX, LEN) slice per combining character.  */
00256 static const struct
00257 {
00258   uint16_t base;     /* Two-byte code of the buffered base character.  */
00259   uint16_t composed; /* Two-byte code output when the combiner follows.  */
00260 } comp_table_data[] =
00261 {
00262 #define COMP_TABLE_IDX_02E5 0
00263 #define COMP_TABLE_LEN_02E5 1
00264   { 0x8684, 0x8685 }, /* 0x12B65 = 0x12B64 U+02E5 */
00265 #define COMP_TABLE_IDX_02E9 (COMP_TABLE_IDX_02E5 + COMP_TABLE_LEN_02E5)
00266 #define COMP_TABLE_LEN_02E9 1
00267   { 0x8680, 0x8686 }, /* 0x12B66 = 0x12B60 U+02E9 */
00268 #define COMP_TABLE_IDX_0300 (COMP_TABLE_IDX_02E9 + COMP_TABLE_LEN_02E9)
00269 #define COMP_TABLE_LEN_0300 5
00270   { 0x857b, 0x8663 }, /* 0x12B44 = 0x1295C U+0300 */
00271   { 0x8657, 0x8667 }, /* 0x12B48 = 0x12B38 U+0300 */
00272   { 0x8656, 0x8669 }, /* 0x12B4A = 0x12B37 U+0300 */
00273   { 0x864f, 0x866b }, /* 0x12B4C = 0x12B30 U+0300 */
00274   { 0x8662, 0x866d }, /* 0x12B4E = 0x12B43 U+0300 */
00275 #define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300)
00276 #define COMP_TABLE_LEN_0301 4
00277   { 0x8657, 0x8668 }, /* 0x12B49 = 0x12B38 U+0301 */
00278   { 0x8656, 0x866a }, /* 0x12B4B = 0x12B37 U+0301 */
00279   { 0x864f, 0x866c }, /* 0x12B4D = 0x12B30 U+0301 */
00280   { 0x8662, 0x866e }, /* 0x12B4F = 0x12B43 U+0301 */
00281 #define COMP_TABLE_IDX_309A (COMP_TABLE_IDX_0301 + COMP_TABLE_LEN_0301)
00282 #define COMP_TABLE_LEN_309A 14
00283   { 0x82a9, 0x82f5 }, /* 0x12477 = 0x1242B U+309A */
00284   { 0x82ab, 0x82f6 }, /* 0x12478 = 0x1242D U+309A */
00285   { 0x82ad, 0x82f7 }, /* 0x12479 = 0x1242F U+309A */
00286   { 0x82af, 0x82f8 }, /* 0x1247A = 0x12431 U+309A */
00287   { 0x82b1, 0x82f9 }, /* 0x1247B = 0x12433 U+309A */
00288   { 0x834a, 0x8397 }, /* 0x12577 = 0x1252B U+309A */
00289   { 0x834c, 0x8398 }, /* 0x12578 = 0x1252D U+309A */
00290   { 0x834e, 0x8399 }, /* 0x12579 = 0x1252F U+309A */
00291   { 0x8350, 0x839a }, /* 0x1257A = 0x12531 U+309A */
00292   { 0x8352, 0x839b }, /* 0x1257B = 0x12533 U+309A */
00293   { 0x835a, 0x839c }, /* 0x1257C = 0x1253B U+309A */
00294   { 0x8363, 0x839d }, /* 0x1257D = 0x12544 U+309A */
00295   { 0x8367, 0x839e }, /* 0x1257E = 0x12548 U+309A */
00296   { 0x83f3, 0x83f6 }, /* 0x12678 = 0x12675 U+309A */
00297 };
00298 
00299 #define MIN_NEEDED_INPUT    TO_LOOP_MIN_NEEDED_FROM
00300 #define MAX_NEEDED_INPUT    TO_LOOP_MAX_NEEDED_FROM
00301 #define MIN_NEEDED_OUTPUT   TO_LOOP_MIN_NEEDED_TO
00302 #define MAX_NEEDED_OUTPUT   TO_LOOP_MAX_NEEDED_TO
00303 #define LOOPFCT                    TO_LOOP
00304 #define BODY \
00305   {                                                                  \
00306     uint32_t ch = get32 (inptr);                                     \
00307     /* A buffered character must be dealt with before CH itself.  */       \
00308     if ((*statep >> 3) != 0)                                                \
00309       {                                                                     \
00310        /* Attempt to combine the last character with this one.  */          \
00311        uint16_t lasttwo = *statep >> 3;                              \
00312        unsigned int idx;                                             \
00313        unsigned int len;                                             \
00314                                                                      \
00315        if (ch == 0x02e5)                                             \
00316          idx = COMP_TABLE_IDX_02E5, len = COMP_TABLE_LEN_02E5;              \
00317        else if (ch == 0x02e9)                                               \
00318          idx = COMP_TABLE_IDX_02E9, len = COMP_TABLE_LEN_02E9;              \
00319        else if (ch == 0x0300)                                               \
00320          idx = COMP_TABLE_IDX_0300, len = COMP_TABLE_LEN_0300;              \
00321        else if (ch == 0x0301)                                               \
00322          idx = COMP_TABLE_IDX_0301, len = COMP_TABLE_LEN_0301;              \
00323        else if (ch == 0x309a)                                               \
00324          idx = COMP_TABLE_IDX_309A, len = COMP_TABLE_LEN_309A;              \
00325        else                                                          \
00326          goto not_combining;                                                \
00327        /* Search this combiner's table slice for the buffered base.  */     \
00328        do                                                            \
00329          if (comp_table_data[idx].base == lasttwo)                          \
00330            break;                                                    \
00331        while (++idx, --len > 0);                                     \
00332        /* LEN is still positive only if the search stopped on a match.  */  \
00333        if (len > 0)                                                  \
00334          {                                                           \
00335            /* Output the combined character.  */                     \
00336            if (__builtin_expect (outptr + 1 >= outend, 0))                  \
00337              {                                                              \
00338               result = __GCONV_FULL_OUTPUT;                                 \
00339               break;                                                 \
00340              }                                                              \
00341            lasttwo = comp_table_data[idx].composed;                         \
00342            *outptr++ = (lasttwo >> 8) & 0xff;                               \
00343            *outptr++ = lasttwo & 0xff;                                      \
00344            *statep = 0;                                              \
00345            inptr += 4;                                                      \
00346            continue;                                                 \
00347          }                                                           \
00348                                                                      \
00349       not_combining:                                                 \
00350        /* Output the buffered character; CH is retried on the next pass.  */ \
00351        if (__builtin_expect (outptr + 1 >= outend, 0))                      \
00352          {                                                           \
00353            result = __GCONV_FULL_OUTPUT;                             \
00354            break;                                                    \
00355          }                                                           \
00356        *outptr++ = (lasttwo >> 8) & 0xff;                            \
00357        *outptr++ = lasttwo & 0xff;                                   \
00358        *statep = 0;                                                  \
00359        continue;                                                     \
00360       }                                                                     \
00361     /* Nothing is buffered: encode CH itself.  */                           \
00362     if (ch < 0x80)                                                   \
00363       /* Plain ISO646-JP character.  */                                     \
00364       *outptr++ = ch;                                                       \
00365     else if (ch == 0xa5)                                             \
00366       *outptr++ = 0x5c;                                                     \
00367     else if (ch == 0x203e)                                           \
00368       *outptr++ = 0x7e;                                                     \
00369     else if (ch >= 0xff61 && ch <= 0xff9f)                                  \
00370       /* Half-width katakana.  */                                    \
00371       *outptr++ = ch - 0xfec0;                                              \
00372     else                                                             \
00373       {                                                                     \
00374        unsigned int s1, s2;                                          \
00375        uint32_t jch = ucs4_to_jisx0213 (ch);                                \
00376        if (jch == 0)                                                 \
00377          {                                                           \
00378            UNICODE_TAG_HANDLER (ch, 4);                              \
00379                                                                      \
00380            /* Illegal character.  */                                        \
00381            STANDARD_TO_LOOP_ERR_HANDLER (4);                                \
00382          }                                                           \
00383                                                                      \
00384        /* Convert it to shifted representation.  */                         \
00385        s1 = jch >> 8;                                                       \
00386        s2 = jch & 0x7f;                                                     \
00387        s1 -= 0x21;                                                   \
00388        s2 -= 0x21;                                                   \
00389        if (s1 >= 0x5e)                                                      \
00390          {                                                           \
00391            /* Handling of JISX 0213 plane 2 rows.  */                       \
00392            if (s1 >= 0xcd) /* rows 0x26E..0x27E */                          \
00393              s1 -= 102;                                              \
00394            else if (s1 >= 0x8b || s1 == 0x87) /* rows 0x228, 0x22C..0x22F */ \
00395              s1 -= 40;                                                      \
00396            else /* rows 0x221, 0x223..0x225 */                              \
00397              s1 -= 34;                                                      \
00398            /* Now 0x5e <= s1 <= 0x77.  */                            \
00399          }                                                           \
00400        if (s1 & 1)                                                   \
00401          s2 += 0x5e;                                                 \
00402        s1 = s1 >> 1;                                                 \
00403        if (s1 < 0x1f)                                                       \
00404          s1 += 0x81;                                                 \
00405        else                                                          \
00406          s1 += 0xc1;                                                 \
00407        if (s2 < 0x3f)                                                       \
00408          s2 += 0x40;                                                 \
00409        else                                                          \
00410          s2 += 0x41;                                                 \
00411                                                                      \
00412        if (jch & 0x0080)                                             \
00413          {                                                           \
00414            /* A possible match in comp_table_data.  We have to buffer it.  */\
00415                                                                      \
00416            /* We know it's a JISX 0213 plane 1 character.  */               \
00417            assert ((jch & 0x8000) == 0);                             \
00418                                                                      \
00419            *statep = ((s1 << 8) | s2) << 3;                                 \
00420            inptr += 4;                                                      \
00421            continue;                                                 \
00422          }                                                           \
00423                                                                      \
00424        /* Output the shifted representation.  */                     \
00425        if (__builtin_expect (outptr + 1 >= outend, 0))                      \
00426          {                                                           \
00427            result = __GCONV_FULL_OUTPUT;                             \
00428            break;                                                    \
00429          }                                                           \
00430        *outptr++ = s1;                                                      \
00431        *outptr++ = s2;                                                      \
00432       }                                                                     \
00433                                                                      \
00434     inptr += 4;                                                             \
00435   }
00436 #define LOOP_NEED_FLAGS
00437 #define EXTRA_LOOP_DECLS    , int *statep
00438 #include <iconv/loop.c>
00439 
00440 
00441 /* Now define the toplevel functions.  */
00442 #include <iconv/skeleton.c>