Back to index

glibc  2.9
johab.c
Go to the documentation of this file.
00001 /* Mapping tables for JOHAB handling.
00002    Copyright (C) 1998, 1999, 2000-2002, 2007 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Jungshik Shin <jshin@pantheon.yale.edu>
00005    and Ulrich Drepper <drepper@cygnus.com>, 1998.
00006 
00007    The GNU C Library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public
00009    License as published by the Free Software Foundation; either
00010    version 2.1 of the License, or (at your option) any later version.
00011 
00012    The GNU C Library is distributed in the hope that it will be useful,
00013    but WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License for more details.
00016 
00017    You should have received a copy of the GNU Lesser General Public
00018    License along with the GNU C Library; if not, write to the Free
00019    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00020    02111-1307 USA.  */
00021 
00022 #include <dlfcn.h>
00023 #include <stdint.h>
00024 #include <ksc5601.h>
00025 
00026 /* The table for Bit pattern to Hangul Jamo
00027    5 bits each are used to encode
00028    leading consonants(19 + 1 filler), medial vowels(21 + 1 filler)
00029    and trailing consonants(27 + 1 filler).
00030 
00031    KS C 5601-1992 Annex 3 Table 2
00032    0 : Filler, -1: invalid, >= 1 : valid
00033 
00034  */
/* Leading-consonant decoder.  Indexed by the 5-bit field stored in
   bits 10-14 of the two-byte Johab code (`(idx & 0x7c00) >> 10').
   Entries: 0 = filler, 1..19 = consonant number, -1 = bit pattern that
   is not assigned.

   All 32 slots are spelled out.  With only 31 initialisers the last
   slot would be zero-initialised by the compiler and would read as
   "filler" (valid) instead of "invalid".  */
static const int init[32] =
{
  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
  19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial-vowel decoder.  Indexed by the 5-bit field stored in bits 5-9
   of the two-byte Johab code (`(idx & 0x03e0) >> 5').
   Entries: 0 = filler, 1..21 = vowel number, -1 = bit pattern that is
   not assigned.  Each row below covers eight consecutive bit patterns;
   the first two patterns of every row are unassigned.  */
static const int mid[32] =
{
  /* 00-07 */ -1, -1,  0,  1,  2,  3,  4,  5,
  /* 08-15 */ -1, -1,  6,  7,  8,  9, 10, 11,
  /* 16-23 */ -1, -1, 12, 13, 14, 15, 16, 17,
  /* 24-31 */ -1, -1, 18, 19, 20, 21, -1, -1
};
/* Trailing-consonant decoder.  Indexed by the low five bits of the
   two-byte Johab code (`idx & 0x001f').
   Entries: 0 = filler, 1..27 = consonant number, -1 = bit pattern that
   is not assigned (patterns 0, 18, 30 and 31).  */
static const int final[32] =
{
  /* 00-07 */ -1,  0,  1,  2,  3,  4,  5,  6,
  /* 08-15 */  7,  8,  9, 10, 11, 12, 13, 14,
  /* 16-23 */ 15, 16, -1, 17, 18, 19, 20, 21,
  /* 24-31 */ 22, 23, 24, 25, 26, 27, -1, -1
};
00052 
00053 /*
00054    Hangul Jamo in Johab to Unicode 2.0 : Unicode 2.0
00055    defines 51 Hangul Compatibility Jamos in the block [0x3131,0x3163]
00056 
00057    It's to be considered later which Jamo block to use, Compatibility
00058    block [0x3131,0x3163] or Hangul Conjoining Jamo block, [0x1100,0x11ff]
00059 
00060  */
/* UCS4 code point for a Johab code that carries a leading consonant
   only.  The decoder uses it as `init_to_ucs[i - 1]', where I is the
   consonant number (1..19) obtained from the `init' table.  The number
   in front of each row is the index of its first element.  */
static const uint32_t init_to_ucs[19] =
{
  /*  0 */ 0x3131, 0x3132, 0x3134, 0x3137,
  /*  4 */ 0x3138, 0x3139, 0x3141, 0x3142,
  /*  8 */ 0x3143, 0x3145, 0x3146, 0x3147,
  /* 12 */ 0x3148, 0x3149, 0x314a, 0x314b,
  /* 16 */ 0x314c, 0x314d, 0x314e
};
00067 
/* UCS4 code point for a Johab code that carries a trailing consonant
   only.  The decoder uses it as `final_to_ucs[f - 1]', where F is the
   consonant number obtained from the `final' table.  Only the slots
   listed here are mapped; every other slot is zero, and the decoder
   reports a resulting code point of 0 as an illegal character.  */
static const uint32_t final_to_ucs[31] =
{
  [2]  = 0x3133,
  [4]  = 0x3135,
  [5]  = 0x3136,
  [8]  = 0x313a,
  [9]  = 0x313b,
  [10] = 0x313c,
  [11] = 0x313d,
  [12] = 0x313e,
  [13] = 0x313f,
  [14] = 0x3140,
  [17] = 0x3144
};
00075 
00076 /* The following three arrays are used to convert
00077    precomposed Hangul syllables in [0xac00,0xd7a3]
00078    to Jamo bit patterns for Johab encoding
00079 
00080    cf. : KS C 5601-1992, Annex3 Table 2
00081 
00082    Arrays are used to speed up things although it's possible
00083    to get the same result arithmetically.
00084 
00085  */
/* Leading-consonant contribution to a Johab Hangul code.  The encoder
   indexes it with `ch / 588', where CH is the syllable's offset from
   0xac00.  Consecutive entries are 0x0400 apart and every entry has
   the top bit (0x8000) set.  */
static const int init_to_bit[19] =
{
  /*  0 */ 0x8800, 0x8c00, 0x9000, 0x9400,
  /*  4 */ 0x9800, 0x9c00, 0xa000, 0xa400,
  /*  8 */ 0xa800, 0xac00, 0xb000, 0xb400,
  /* 12 */ 0xb800, 0xbc00, 0xc000, 0xc400,
  /* 16 */ 0xc800, 0xcc00, 0xd000
};
00093 
/* Medial-vowel contribution to a Johab Hangul code.  The encoder
   indexes it with `(ch / 28) % 21', where CH is the syllable's offset
   from 0xac00.  The rows mirror the gaps of the `mid' decoder table:
   the two lowest patterns of each group of eight are skipped.  */
static const int mid_to_bit[21] =
{
  /*  0- 4 */ 0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  /*  5-10 */ 0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  /* 11-16 */ 0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  /* 17-20 */ 0x0340, 0x0360, 0x0380, 0x03a0
};
00101 
/* Trailing-consonant contribution to a Johab Hangul code.  The encoder
   indexes it with `ch % 28', where CH is the syllable's offset from
   0xac00.  Pattern 0x12 is skipped, matching the unassigned slot 18 of
   the `final' decoder table.  */
static const int final_to_bit[28] =
{
  /*  0- 6 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /*  7-13 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  /* 14-20 */ 0x0f, 0x10, 0x11, 0x13, 0x14, 0x15, 0x16,
  /* 21-27 */ 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
00107 
00108 /* The conversion table from
00109    UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
00110    to Johab
00111 
00112    cf. 1. KS C 5601-1992 Annex 3 Table 2
00113    2. Unicode 2.0 manual
00114 
00115  */
/* Johab code for each UCS4 value in [0x3131,0x3163].  The encoder
   indexes it with `ch - 0x3131'.  The label in front of each row is
   the UCS4 value of the row's first element.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* 0x3131 */ 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, 0x9841,
  /* 0x3139 */ 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  /* 0x3141 */ 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, 0xb841,
  /* 0x3149 */ 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, 0x8481,
  /* 0x3151 */ 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1,
  /* 0x3159 */ 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, 0x8741,
  /* 0x3161 */ 0x8761, 0x8781, 0x87a1
};
00133 
00134 
00135 static inline uint32_t
00136 johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2)
00137 {
00138   if (idx <= 0xdefe)
00139     return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
00140                                       - (c2 > 0x90 ? 0x43 : 0x31)];
00141   else
00142     return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
00143                                         - (c2 > 0x90 ? 0x43 : 0x31)];
00144 }
00145 /* Definitions used in the body of the `gconv' function.
      CHARSET_NAME, FROM_LOOP/TO_LOOP and DEFINE_INIT/DEFINE_FINI are
      consumed by <iconv/skeleton.c>, which is included at the end of this
      file; their exact meaning is defined there, not here
      (NOTE(review): confirm against skeleton.c).  FROM_LOOP and TO_LOOP
      are also the names given to the two loop functions generated below
      through LOOPFCT.
      MIN_NEEDED_FROM / MAX_NEEDED_FROM: a JOHAB character occupies one or
      two bytes (see the from_johab BODY).
      MIN_NEEDED_TO: the other side is processed in 4-byte units (put32 /
      get32 and the `+= 4' pointer steps in the loop bodies).  */
00146 #define CHARSET_NAME        "JOHAB//"
00147 #define FROM_LOOP           from_johab
00148 #define TO_LOOP                    to_johab
00149 #define DEFINE_INIT         1
00150 #define DEFINE_FINI         1
00151 #define MIN_NEEDED_FROM            1
00152 #define MAX_NEEDED_FROM            2
00153 #define MIN_NEEDED_TO              4
00154 
00155 
00156 /* First define the conversion function from JOHAB to UCS4.
      BODY is the per-character step that <iconv/loop.c> expands into the
      function named by LOOPFCT.  inptr, inend, outptr, result, put32 and
      STANDARD_FROM_LOOP_ERR_HANDLER are supplied by that file and are not
      defined here (NOTE(review): the argument of the error handler is
      presumably the number of input bytes to skip -- confirm in loop.c).

      One step does the following:
      - A byte <= 0x7f is a single-byte character.  It is passed through
        unchanged, except that 0x5c becomes U+20A9.
      - Lead bytes above 0xf9, equal to 0xdf, in 0x7f-0x83 or in 0xd4-0xd8
        are rejected.
      - Any other lead byte starts a two-byte character.  If the second
        byte is not in the buffer yet, the step stops with
        __GCONV_INCOMPLETE_INPUT.
      - Lead bytes up to 0xd3 are Hangul.  The 16-bit code is split into
        three 5-bit fields, which are decoded through the init, mid and
        final tables.  A code with both a leading consonant and a vowel
        yields a precomposed syllable counted from 0xac00; a code with
        exactly one component yields a single jamo.
      - The remaining lead bytes are symbols and Hanja.  The trail byte is
        range-checked and the value is fetched by johab_sym_hanja_to_ucs.
      - A looked-up value of 0 means the tables have no mapping and is
        reported as illegal.
      - Otherwise the code point is stored with put32, outptr advances by
        4 and inptr by 1 or 2.

      The commented-out fragment inside BODY is dead code.  Its row width
      of 192 differs from the 188 used by johab_sym_hanja_to_ucs.  */
00157 #define MIN_NEEDED_INPUT    MIN_NEEDED_FROM
00158 #define MAX_NEEDED_INPUT    MAX_NEEDED_FROM
00159 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_TO
00160 #define LOOPFCT                    FROM_LOOP
00161 #define BODY \
00162   {                                                                  \
00163     uint32_t ch = *inptr;                                            \
00164                                                                      \
00165     if (ch <= 0x7f)                                                  \
00166       {                                                                     \
00167        /* Plain ISO646-KR.  */                                              \
00168        if (ch == 0x5c)                                                      \
00169          ch = 0x20a9; /* half-width Korean Currency WON sign */             \
00170        ++inptr;                                                      \
00171       }                                                                     \
00172     /* Johab : 1. Hangul                                             \
00173        1st byte : 0x84-0xd3                                          \
00174        2nd byte : 0x41-0x7e, 0x81-0xfe                                      \
00175        2. Hanja & Symbol  :                                          \
00176        1st byte : 0xd8-0xde, 0xe0-0xf9                                      \
00177        2nd byte : 0x31-0x7e, 0x91-0xfe                                      \
00178        0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */                    \
00179     else                                                             \
00180       {                                                                     \
       /* Rejected lead bytes: above 0xf9, 0xdf, 0x7f-0x83 and 0xd4-0xd8.  \
          Lead byte 0xd8 (the user-defined area named above) is therefore  \
          treated as illegal by this code.  */                              \
00181        if (__builtin_expect (ch > 0xf9, 0)                                  \
00182            || __builtin_expect (ch == 0xdf, 0)                              \
00183            || (__builtin_expect (ch > 0x7e, 0) && ch < 0x84)                \
00184            || (__builtin_expect (ch > 0xd3, 0) && ch < 0xd9))               \
00185          {                                                           \
00186            /* These are illegal.  */                                        \
00187            STANDARD_FROM_LOOP_ERR_HANDLER (1);                              \
00188          }                                                           \
00189        else                                                          \
00190          {                                                           \
00191            /* Two-byte character.  First test whether the next              \
00192               character is also available.  */                              \
00193            uint32_t ch2;                                             \
00194            uint_fast32_t idx;                                               \
00195                                                                      \
00196            if (__builtin_expect (inptr + 1 >= inend, 0))                    \
00197              {                                                              \
00198               /* The second character is not available.  Store the          \
00199                  intermediate result.  */                            \
00200               result = __GCONV_INCOMPLETE_INPUT;                     \
00201               break;                                                 \
00202              }                                                              \
00203                                                                      \
00204            ch2 = inptr[1];                                           \
00205            idx = ch * 256 + ch2;                                     \
00206            if (__builtin_expect (ch <= 0xd3, 1))                     \
00207              {                                                              \
00208               /* Hangul */                                           \
00209               int_fast32_t i, m, f;                                         \
00210                                                                      \
              /* Split the code into its three 5-bit fields: bits 14-10,    \
                 9-5 and 4-0.  Each table yields -1 for an unassigned       \
                 pattern and 0 for the filler.  */                          \
00211               i = init[(idx & 0x7c00) >> 10];                               \
00212               m = mid[(idx & 0x03e0) >> 5];                                 \
00213               f = final[idx & 0x001f];                               \
00214                                                                      \
              /* Cases below, in order: any unassigned field -> illegal;    \
                 consonant + vowel -> precomposed syllable; consonant       \
                 alone, vowel alone or final alone -> single jamo;          \
                 every other combination -> illegal.  */                    \
00215               if (__builtin_expect (i == -1, 0)                      \
00216                   || __builtin_expect (m == -1, 0)                          \
00217                   || __builtin_expect (f == -1, 0))                         \
00218                 {                                                    \
00219                   /* This is illegal.  */                            \
00220                   STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
00221                 }                                                    \
00222               else if (i > 0 && m > 0)                               \
00223                 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00;            \
00224               else if (i > 0 && m == 0 && f == 0)                           \
00225                 ch = init_to_ucs[i - 1];                             \
00226               else if (i == 0 && m > 0 && f == 0)                           \
00227                 ch = 0x314e + m;   /* 0x314f + m - 1 */              \
00228               else if (__builtin_expect ((i | m) == 0, 1)                   \
00229                       && __builtin_expect (f > 0, 1))                \
00230                 ch = final_to_ucs[f - 1]; /* round trip?? */         \
00231               else                                                   \
00232                 {                                                    \
00233                   /* This is illegal.  */                            \
00234                   STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
00235                 }                                                    \
00236              }                                                              \
00237            else                                                      \
00238              {                                                              \
              /* Symbol or Hanja.  Reject trail bytes below 0x31, in        \
                 0x7f-0x90 or equal to 0xff, and the ranges 0xd9 with       \
                 trail > 0xe8, 0xda with trail 0xa1-0xd3 and 0xde with      \
                 trail > 0xf1.  */                                          \
00239               if (__builtin_expect (ch2 < 0x31, 0)                          \
00240                   || (__builtin_expect (ch2 > 0x7e, 0) && ch2 < 0x91)       \
00241                   || __builtin_expect (ch2, 0) == 0xff               \
00242                   || (__builtin_expect (ch, 0) == 0xd9 && ch2 > 0xe8)       \
00243                   || (__builtin_expect (ch, 0) == 0xda               \
00244                      && ch2 > 0xa0 && ch2 < 0xd4)                           \
00245                   || (__builtin_expect (ch, 0) == 0xde && ch2 > 0xf1))      \
00246                 {                                                    \
00247                   /* This is illegal.  */                            \
00248                   STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
00249                 }                                                    \
00250               else                                                   \
00251                 {                                                    \
00252                   ch = johab_sym_hanja_to_ucs (idx, ch, ch2);               \
00253                   /* if (idx <= 0xdefe)                              \
00254                       ch = __ksc5601_sym_to_ucs[(ch - 0xd9) * 192           \
00255                                              + ch2 - (ch2 > 0x90            \
00256                                                      ? 0x43 : 0x31)];  \
00257                      else                                            \
00258                        ch = __ksc5601_hanja_to_ucs[(ch - 0xe0) *192         \
00259                                                + ch2 -  (ch2 > 0x90     \
00260                                                         ?0x43 : 0x31)];\
00261                   */                                                 \
00262                 }                                                    \
00263              }                                                              \
00264          }                                                           \
00265                                                                      \
       /* A zero here comes from a table slot without a mapping.  */        \
00266        if (__builtin_expect (ch == 0, 0))                            \
00267          {                                                           \
00268            /* This is an illegal character.  */                      \
00269            STANDARD_FROM_LOOP_ERR_HANDLER (2);                              \
00270          }                                                           \
00271                                                                      \
00272        inptr += 2;                                                   \
00273       }                                                                     \
00274                                                                      \
00275     put32 (outptr, ch);                                                     \
00276     outptr += 4;                                                     \
00277   }
00278 #define LOOP_NEED_FLAGS
      /* ONEBYTE_BODY maps a single byte C without using the main loop:
         bytes <= 0x7f map to themselves (0x5c to 0x20a9), every other byte
         yields WEOF.  NOTE(review): how loop.c uses this body is not
         visible here -- confirm there.  */
00279 #define ONEBYTE_BODY \
00280   {                                                                  \
00281     if (c <= 0x7f)                                                   \
00282       return (c == 0x5c ? 0x20a9 : c);                                      \
00283     else                                                             \
00284       return WEOF;                                                   \
00285   }
00286 #include <iconv/loop.c>
00287 
00288 
00289 /* Next, define the other direction.
      BODY is the per-character step that <iconv/loop.c> expands into the
      function named by LOOPFCT.  inptr, outptr, outend, result, get32,
      STANDARD_TO_LOOP_ERR_HANDLER and UNICODE_TAG_HANDLER are supplied by
      that file and are not defined here.  ucs4_to_ksc5601_hanja,
      ucs4_to_ksc5601_sym and __UNKNOWN_10646_CHAR are not defined in this
      file either (NOTE(review): presumably from <ksc5601.h> -- confirm).

      One step reads a UCS4 value with get32 and encodes it as follows:
      - Values <= 0x7f other than 0x5c are written as one byte.
      - 0xac00-0xd7a3 (Hangul syllables) are decomposed arithmetically and
        reassembled from init_to_bit, mid_to_bit and final_to_bit.
      - 0x3131-0x3163 are looked up in jamo_from_ucs_table.
      - 0x4e00-0x9fa5 and 0xf900-0xfa0b go through ucs4_to_ksc5601_hanja;
        the two bytes it stores are then rebased into the Johab Hanja
        area, which starts at lead byte 0xe0.
      - 0x20a9 is written as the single byte 0x5c.
      - Everything else goes through ucs4_to_ksc5601_sym, and the two
        bytes it stores are rebased into the Johab symbol area.
      Two-byte results are stored high byte first.  A helper return value
      of 0 is treated as "output buffer full" and __UNKNOWN_10646_CHAR as
      "no mapping".  inptr advances by 4 after every successful step.

      The commented-out fragment at the top of BODY is dead code.  */
00290 #define MIN_NEEDED_INPUT    MIN_NEEDED_TO
00291 #define MIN_NEEDED_OUTPUT   MIN_NEEDED_FROM
00292 #define MAX_NEEDED_OUTPUT   MAX_NEEDED_FROM
00293 #define LOOPFCT                    TO_LOOP
00294 #define BODY \
00295   {                                                                  \
00296     uint32_t ch = get32 (inptr);                                     \
00297     /*                                                               \
00298        if (ch >= (sizeof (from_ucs4_lat1) / sizeof (from_ucs4_lat1[0])))      \
00299         {                                                            \
00300           if (ch >= 0x0391 && ch <= 0x0451)                                 \
00301             cp = from_ucs4_greek[ch - 0x391];                               \
00302           else if (ch >= 0x2010 && ch <= 0x9fa0)                     \
00303             cp = from_ucs4_cjk[ch - 0x02010];                               \
00304           else                                                              \
00305             break;                                                   \
00306         }                                                            \
00307        else                                                          \
00308         cp = from_ucs4_lat1[ch];                                     \
00309     */                                                               \
00310                                                                      \
00311     if (ch <= 0x7f && ch != 0x5c)                                    \
00312       *outptr++ = ch;                                                       \
00313     else                                                             \
00314       {                                                                     \
00315        if (ch >= 0xac00 && ch <= 0xd7a3)                             \
00316          {                                                           \
00317            if (__builtin_expect (outptr + 2 > outend, 0))                   \
00318              {                                                              \
00319               result = __GCONV_FULL_OUTPUT;                                 \
00320               break;                                                 \
00321              }                                                              \
00322                                                                      \
00323            ch -= 0xac00;                                             \
00324                                                                      \
00325            ch = (init_to_bit[ch / 588]      /* 21 * 28 = 588 */                    \
00326                 + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */     \
00327                 + final_to_bit[ch %  28]);  /* (ch % (21 * 28)) % 28 */     \
00328                                                                      \
00329            *outptr++ = ch / 256;                                     \
00330            *outptr++ = ch % 256;                                     \
00331          }                                                           \
00332        /* KS C 5601-1992 Annex 3 regards  0xA4DA(Hangul Filler : U3164)      \
00333           as symbol */                                                      \
00334        else if (ch >= 0x3131 && ch <= 0x3163)                               \
00335          {                                                           \
00336            ch = jamo_from_ucs_table[ch - 0x3131];                           \
00337                                                                      \
00338            if (__builtin_expect (outptr + 2 > outend, 0))                   \
00339              {                                                              \
00340               result = __GCONV_FULL_OUTPUT;                                 \
00341               break;                                                 \
00342              }                                                              \
00343                                                                      \
00344            *outptr++ = ch / 256;                                     \
00345            *outptr++ = ch % 256;                                     \
00346          }                                                           \
00347        else if ((ch >= 0x4e00 && ch <= 0x9fa5)                              \
00348                || (ch >= 0xf900 && ch <= 0xfa0b))                           \
00349          {                                                           \
00350            size_t written;                                           \
00351            uint32_t temp;                                            \
00352                                                                      \
00353            written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr);    \
00354            if (__builtin_expect (written, 1) == 0)                          \
00355              {                                                              \
00356               result = __GCONV_FULL_OUTPUT;                                 \
00357               break;                                                 \
00358              }                                                              \
00359            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0))       \
00360              {                                                              \
00361               STANDARD_TO_LOOP_ERR_HANDLER (4);                      \
00362              }                                                              \
00363                                                                      \
           /* Rebase the two stored bytes to zero (0x4a and 0x21), form a   \
              linear index with 94 slots per row, then re-split it into     \
              rows of 188 starting at lead byte 0xe0.  Trail offsets        \
              below 78 start at 0x31, the rest at 0x91 (78 + 0x43).  */     \
00364            outptr[0] -= 0x4a;                                               \
00365            outptr[1] -= 0x21;                                               \
00366                                                                      \
00367            temp = outptr[0] * 94 + outptr[1];                               \
00368                                                                      \
00369            outptr[0] = 0xe0 + temp / 188;                            \
00370            outptr[1] = temp % 188;                                   \
00371            outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31;                      \
00372                                                                      \
00373            outptr += 2;                                              \
00374          }                                                           \
00375        else if (ch == 0x20a9)                                               \
00376          *outptr++ = 0x5c;                                           \
00377        else                                                          \
00378          {                                                           \
00379            size_t written;                                           \
00380            uint32_t temp;                                            \
00381                                                                      \
00382            written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr);      \
00383            if (__builtin_expect (written, 1) == 0)                          \
00384              {                                                              \
00385               result = __GCONV_FULL_OUTPUT;                                 \
00386               break;                                                 \
00387              }                                                              \
           /* outptr[0] and outptr[1] are inspected only when the helper    \
              did not report __UNKNOWN_10646_CHAR (short-circuit ||).       \
              A first byte of 0x22 with a second byte above 0x68 is         \
              rejected as well.  */                                         \
00388            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0)        \
00389               || (outptr[0] == 0x22 && outptr[1] > 0x68))                   \
00390              {                                                              \
00391               UNICODE_TAG_HANDLER (ch, 4);                                  \
00392               STANDARD_TO_LOOP_ERR_HANDLER (4);                      \
00393              }                                                              \
00394                                                                      \
           /* Two consecutive first-byte values share one Johab lead byte   \
              (temp / 2).  An odd temp selects the upper half of the        \
              trail range by adding 0x5e.  The final adjustment moves the   \
              trail to start at 0x31, or at 0x91 for values >= 0x6f.  */    \
00395            temp = (outptr[0] < 0x4a ? outptr[0] + 0x191 : outptr[0] + 0x176);\
00396            outptr[1] += (temp % 2 ? 0x5e : 0);                              \
00397            outptr[1] += (outptr[1] < 0x6f ? 0x10 : 0x22);                   \
00398            outptr[0] = temp / 2;                                     \
00399                                                                      \
00400            outptr += 2;                                              \
00401          }                                                           \
00402       }                                                                     \
00403                                                                      \
00404     inptr += 4;                                                             \
00405   }
00406 #define LOOP_NEED_FLAGS
00407 #include <iconv/loop.c>
00408 
00409 
00410 /* Now define the toplevel functions.  */
00411 #include <iconv/skeleton.c>