Back to index

plt-scheme  4.2.1
gdkanji.c
Go to the documentation of this file.
00001 
00002 /* gdkanji.c (Kanji code converter)                            */
00003 /*                 written by Masahito Yamaga (ma@yama-ga.com) */
00004 
00005 #ifdef HAVE_CONFIG_H
00006 #include "config.h"
00007 #endif
00008 
00009 #include <stdio.h>
00010 #include <stdlib.h>
00011 #include <string.h>
00012 #include "gd.h"
00013 #include "gdhelpers.h"
00014 
00015 #ifdef HAVE_ERRNO_H
00016 #include <errno.h>
00017 #endif
00018 
00019 #include <stdarg.h>
00020 #if defined(HAVE_ICONV_H)
00021 #include <iconv.h>
00022 #endif
00023 
/* Supply an opaque handle type when the build has not declared iconv_t. */
#ifndef HAVE_ICONV_T_DEF
typedef void *iconv_t;
#endif

#ifndef HAVE_ICONV
/*
 * Stand-ins used when the build has no iconv: opening a converter always
 * fails, and the other two entry points do nothing and return 0.
 */
#define ICONV_CONST

iconv_t iconv_open (const char *tocode, const char *fromcode);
size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
              char **outbuf, size_t *outbytesleft);
int iconv_close (iconv_t cd);

/* Always fails: returns (iconv_t) -1 whatever encodings are requested. */
iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
  (void) tocode;
  (void) fromcode;

  return (iconv_t) -1;
}

/* Converts nothing, leaves every argument untouched and returns 0. */
size_t
iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
       char **outbuf, size_t *outbytesleft)
{
  (void) cd;
  (void) inbuf;
  (void) inbytesleft;
  (void) outbuf;
  (void) outbytesleft;

  return 0;
}

/* Nothing to release; always returns 0. */
int
iconv_close (iconv_t cd)
{
  (void) cd;

  return 0;
}

#endif /* !HAVE_ICONV */
00054 
/* Prefix printed in front of every diagnostic emitted by this file. */
#define LIBNAME "any2eucjp()"

/* On these DOS/Windows toolchains, define SJISPRE unless the build already did. */
#if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
#ifndef SJISPRE
#define SJISPRE 1
#endif
#endif

/* Use this file's own truth values regardless of earlier definitions. */
#undef TRUE
#undef FALSE
#define TRUE  1
#define FALSE 0

/* Encoding tags returned by DetectKanjiCode(). */
#define NEW       1             /* New JIS: ESC $ B seen */
#define OLD       2             /* Old JIS: ESC $ @ seen */
#define ESCI      3             /* ESC ( I seen (Hankaku-Kana escape) */
#define NEC       4             /* NEC Kanji: ESC K seen */
#define EUC       5
#define SJIS      6
#define EUCORSJIS 7             /* could be either EUC or SJIS */
#define ASCII     8             /* no Kanji code found */

/* Source-encoding names handed to do_convert(); EUCSTR is the target. */
#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR    "eucJP"
#define SJISSTR   "SJIS"

/* Control bytes: escape (27) and single-shift-2 (142). */
#define ESC 0x1B
#define SS2 0x8E
00089 
/*
 * Trace helper: when DEBUG is defined, prints "<LIBNAME>: <message>\n" to
 * stdout using printf-style formatting.  Without DEBUG it does nothing.
 */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  fputs (LIBNAME, stdout);
  fputs (": ", stdout);

  va_start (ap, format);
  vfprintf (stdout, format, ap);
  va_end (ap);

  fputc ('\n', stdout);
#endif
}
00103 
00104 static void
00105 error (const char *format, ...)
00106 {
00107   va_list args;
00108 
00109   va_start (args, format);
00110   fprintf (stderr, "%s: ", LIBNAME);
00111   vfprintf (stderr, format, args);
00112   fprintf (stderr, "\n");
00113   va_end (args);
00114 }
00115 
00116 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
00117 
00118 static int
00119 DetectKanjiCode (unsigned char *str)
00120 {
00121   static int whatcode = ASCII;
00122   int oldcode = ASCII;
00123   int c, i;
00124   char *lang = NULL;
00125 
00126   c = '\1';
00127   i = 0;
00128 
00129   if (whatcode != EUCORSJIS && whatcode != ASCII)
00130     {
00131       oldcode = whatcode;
00132       whatcode = ASCII;
00133     }
00134 
00135   while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
00136     {
00137       if ((c = str[i++]) != '\0')
00138        {
00139          if (c == ESC)
00140            {
00141              c = str[i++];
00142              if (c == '$')
00143               {
00144                 c = str[i++];
00145                 if (c == 'B')
00146                   whatcode = NEW;
00147                 else if (c == '@')
00148                   whatcode = OLD;
00149               }
00150              else if (c == '(')
00151               {
00152                 c = str[i++];
00153                 if (c == 'I')
00154                   whatcode = ESCI;
00155               }
00156              else if (c == 'K')
00157               whatcode = NEC;
00158            }
00159          else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
00160            whatcode = SJIS;
00161          else if (c == SS2)
00162            {
00163              c = str[i++];
00164              if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160)
00165                 || (c >= 224 && c <= 252))
00166               whatcode = SJIS;
00167              else if (c >= 161 && c <= 223)
00168               whatcode = EUCORSJIS;
00169            }
00170          else if (c >= 161 && c <= 223)
00171            {
00172              c = str[i++];
00173              if (c >= 240 && c <= 254)
00174               whatcode = EUC;
00175              else if (c >= 161 && c <= 223)
00176               whatcode = EUCORSJIS;
00177              else if (c >= 224 && c <= 239)
00178               {
00179                 whatcode = EUCORSJIS;
00180                 while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
00181                   {
00182                     if (c >= 129)
00183                      {
00184                        if (c <= 141 || (c >= 143 && c <= 159))
00185                          whatcode = SJIS;
00186                        else if (c >= 253 && c <= 254)
00187                          whatcode = EUC;
00188                      }
00189                     c = str[i++];
00190                   }
00191               }
00192              else if (c <= 159)
00193               whatcode = SJIS;
00194            }
00195          else if (c >= 240 && c <= 254)
00196            whatcode = EUC;
00197          else if (c >= 224 && c <= 239)
00198            {
00199              c = str[i++];
00200              if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
00201               whatcode = SJIS;
00202              else if (c >= 253 && c <= 254)
00203               whatcode = EUC;
00204              else if (c >= 161 && c <= 252)
00205               whatcode = EUCORSJIS;
00206            }
00207        }
00208     }
00209 
00210 #ifdef DEBUG
00211   if (whatcode == ASCII)
00212     debug ("Kanji code not included.");
00213   else if (whatcode == EUCORSJIS)
00214     debug ("Kanji code not detected.");
00215   else
00216     debug ("Kanji code detected at %d byte.", i);
00217 #endif
00218 
00219   if (whatcode == EUCORSJIS && oldcode != ASCII)
00220     whatcode = oldcode;
00221 
00222   if (whatcode == EUCORSJIS)
00223     {
00224       if (getenv ("LC_ALL"))
00225        lang = getenv ("LC_ALL");
00226       else if (getenv ("LC_CTYPE"))
00227        lang = getenv ("LC_CTYPE");
00228       else if (getenv ("LANG"))
00229        lang = getenv ("LANG");
00230 
00231       if (lang)
00232        {
00233          if (strcmp (lang, "ja_JP.SJIS") == 0 ||
00234 #ifdef hpux
00235              strcmp (lang, "japanese") == 0 ||
00236 #endif
00237              strcmp (lang, "ja_JP.mscode") == 0 ||
00238              strcmp (lang, "ja_JP.PCK") == 0)
00239            whatcode = SJIS;
00240          else if (strncmp (lang, "ja", 2) == 0)
00241 #ifdef SJISPRE
00242            whatcode = SJIS;
00243 #else
00244            whatcode = EUC;
00245 #endif
00246        }
00247     }
00248 
00249   if (whatcode == EUCORSJIS)
00250 #ifdef SJISPRE
00251     whatcode = SJIS;
00252 #else
00253     whatcode = EUC;
00254 #endif
00255 
00256   return whatcode;
00257 }
00258 
00259 /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
00260 
/*
 * Convert one two-byte Shift_JIS code to its JIS code, in place.
 *
 * On entry *p1 is the first byte and *p2 the second byte; on return they
 * hold the two JIS bytes.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  int first_half;               /* 1 when the trail byte is below 159 */
  int row_base;
  int cell_shift;

  first_half = (trail < 159);

  if (lead < 160)
    row_base = 112;
  else
    row_base = 176;

  if (!first_half)
    cell_shift = 126;
  else if (trail > 127)
    cell_shift = 32;
  else
    cell_shift = 31;

  *p1 = ((lead - row_base) << 1) - first_half;
  *p2 = *p2 - cell_shift;
}
00273 
00274 /* han2zen() was derived from han2zen() written by Ken Lunde. */
00275 
/* Nonzero when byte value c may combine with a following Daku-ten (222). */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
/* Nonzero when byte value c may combine with a following Han-daku-ten (223). */
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Replace a Hankaku (half-width) kana byte with a two-byte Zenkaku
 * (full-width) code, in place.
 *
 * On entry:
 *   *p1  the Hankaku byte, expected in 161..223;
 *   *p2  222 (Daku-ten) or 223 (Han-daku-ten) if that mark follows the
 *        character and should be folded into it, anything else otherwise.
 * On return *p1 / *p2 hold the first / second byte of the replacement; the
 * caller passes the pair on to SJIStoJIS().
 *
 * A *p1 outside 161..223 has no table entry; in that case both values are
 * left untouched instead of indexing outside the table.
 */
static void
han2zen (int *p1, int *p2)
{
  /* One entry per input byte 161..223, in order. */
  static const int mtable[][2] = {
    {129, 66},  {129, 117}, {129, 118}, {129, 65},  {129, 69},  /* 161-165 */
    {131, 146}, {131, 64},  {131, 66},  {131, 68},  {131, 70},  /* 166-170 */
    {131, 72},  {131, 131}, {131, 133}, {131, 135}, {131, 98},  /* 171-175 */
    {129, 91},  {131, 65},  {131, 67},  {131, 69},  {131, 71},  /* 176-180 */
    {131, 73},  {131, 74},  {131, 76},  {131, 78},  {131, 80},  /* 181-185 */
    {131, 82},  {131, 84},  {131, 86},  {131, 88},  {131, 90},  /* 186-190 */
    {131, 92},  {131, 94},  {131, 96},  {131, 99},  {131, 101}, /* 191-195 */
    {131, 103}, {131, 105}, {131, 106}, {131, 107}, {131, 108}, /* 196-200 */
    {131, 109}, {131, 110}, {131, 113}, {131, 116}, {131, 119}, /* 201-205 */
    {131, 122}, {131, 125}, {131, 126}, {131, 128}, {131, 129}, /* 206-210 */
    {131, 130}, {131, 132}, {131, 134}, {131, 136}, {131, 137}, /* 211-215 */
    {131, 138}, {131, 139}, {131, 140}, {131, 141}, {131, 143}, /* 216-220 */
    {131, 147}, {129, 74},  {129, 75}                           /* 221-223 */
  };
  const int entries = (int) (sizeof mtable / sizeof mtable[0]);
  const int c = *p1;
  const int idx = c - 161;
  int daku;
  int handaku;

  if (idx < 0 || idx >= entries)
    return;                     /* not a Hankaku kana byte: nothing to map */

  daku = (*p2 == 222 && IS_DAKU (c));                   /* Daku-ten */
  handaku = (!daku && *p2 == 223 && IS_HANDAKU (c));    /* Han-daku-ten */

  *p1 = mtable[idx][0];
  *p2 = mtable[idx][1];

  if (daku)
    {
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
        (*p2)++;                /* voiced form is the next code point */
      else if (*p1 == 131 && *p2 == 69)
        *p2 = 148;              /* (131, 69) has its voiced form at (131, 148) */
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;                 /* semi-voiced form is two code points on */
}
00369 
00370 /* Recast strcpy to handle unsigned chars used below. */
00371 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
00372 
00373 static void
00374 do_convert (unsigned char *to, unsigned char *from, const char *code)
00375 {
00376 #ifdef HAVE_ICONV
00377   iconv_t cd;
00378   size_t from_len, to_len;
00379 
00380   if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
00381     {
00382       error ("iconv_open() error");
00383 #ifdef HAVE_ERRNO_H
00384       if (errno == EINVAL)
00385        error ("invalid code specification: \"%s\" or \"%s\"", EUCSTR, code);
00386 #endif
00387       strcpy ((char *) to, (const char *) from);
00388       return;
00389     }
00390 
00391   from_len = strlen ((const char *) from) + 1;
00392   to_len = BUFSIZ;
00393 
00394   if ((int) (iconv (cd, (char **) &from, &from_len, (char **) &to, &to_len))
00395       == -1)
00396     {
00397 #ifdef HAVE_ERRNO_H
00398       if (errno == EINVAL)
00399        error ("invalid end of input string");
00400       else if (errno == EILSEQ)
00401        error ("invalid code in input string");
00402       else if (errno == E2BIG)
00403        error ("output buffer overflow at do_convert()");
00404       else
00405 #endif
00406        error ("something happen");
00407       strcpy ((char *) to, (const char *) from);
00408       return;
00409     }
00410 
00411   if (iconv_close (cd) != 0)
00412     {
00413       error ("iconv_close() error");
00414     }
00415 #else
00416   int p1, p2, i, j;
00417   int jisx0208 = FALSE;
00418   int hankaku = FALSE;
00419 
00420   j = 0;
00421   if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
00422     {
00423       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00424        {
00425          if (from[i] == ESC)
00426            {
00427              i++;
00428              if (from[i] == '$')
00429               {
00430                 jisx0208 = TRUE;
00431                 hankaku = FALSE;
00432                 i++;
00433               }
00434              else if (from[i] == '(')
00435               {
00436                 jisx0208 = FALSE;
00437                 i++;
00438                 if (from[i] == 'I')       /* Hankaku Kana */
00439                   hankaku = TRUE;
00440                 else
00441                   hankaku = FALSE;
00442               }
00443            }
00444          else
00445            {
00446              if (jisx0208)
00447               to[j++] = from[i] + 128;
00448              else if (hankaku)
00449               {
00450                 to[j++] = SS2;
00451                 to[j++] = from[i] + 128;
00452               }
00453              else
00454               to[j++] = from[i];
00455            }
00456        }
00457     }
00458   else if (strcmp (code, SJISSTR) == 0)
00459     {
00460       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00461        {
00462          p1 = from[i];
00463          if (p1 < 127)
00464            to[j++] = p1;
00465          else if ((p1 >= 161) && (p1 <= 223))
00466            {                /* Hankaku Kana */
00467              to[j++] = SS2;
00468              to[j++] = p1;
00469            }
00470          else
00471            {
00472              p2 = from[++i];
00473              SJIStoJIS (&p1, &p2);
00474              to[j++] = p1 + 128;
00475              to[j++] = p2 + 128;
00476            }
00477        }
00478     }
00479   else
00480     {
00481       error ("invalid code specification: \"%s\"", code);
00482       return;
00483     }
00484 
00485   if (j >= BUFSIZ)
00486     {
00487       error ("output buffer overflow at do_convert()");
00488       ustrcpy (to, from);
00489     }
00490   else
00491     to[j] = '\0';
00492 #endif /* HAVE_ICONV */
00493 }
00494 
00495 static int
00496 do_check_and_conv (unsigned char *to, unsigned char *from)
00497 {
00498   static unsigned char tmp[BUFSIZ];
00499   int p1, p2, i, j;
00500   int kanji = TRUE;
00501 
00502   switch (DetectKanjiCode (from))
00503     {
00504     case NEW:
00505       debug ("Kanji code is New JIS.");
00506       do_convert (tmp, from, NEWJISSTR);
00507       break;
00508     case OLD:
00509       debug ("Kanji code is Old JIS.");
00510       do_convert (tmp, from, OLDJISSTR);
00511       break;
00512     case ESCI:
00513       debug
00514        ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
00515       do_convert (tmp, from, NEWJISSTR);
00516       break;
00517     case NEC:
00518       debug ("Kanji code is NEC Kanji.");
00519       error ("cannot convert NEC Kanji.");
00520       ustrcpy (tmp, from);
00521       kanji = FALSE;
00522       break;
00523     case EUC:
00524       debug ("Kanji code is EUC.");
00525       ustrcpy (tmp, from);
00526       break;
00527     case SJIS:
00528       debug ("Kanji code is SJIS.");
00529       do_convert (tmp, from, SJISSTR);
00530       break;
00531     case EUCORSJIS:
00532       debug ("Kanji code is EUC or SJIS.");
00533       ustrcpy (tmp, from);
00534       kanji = FALSE;
00535       break;
00536     case ASCII:
00537       debug ("This is ASCII string.");
00538       ustrcpy (tmp, from);
00539       kanji = FALSE;
00540       break;
00541     default:
00542       debug ("This string includes unknown code.");
00543       ustrcpy (tmp, from);
00544       kanji = FALSE;
00545       break;
00546     }
00547 
00548   /* Hankaku Kana ---> Zenkaku Kana */
00549   if (kanji)
00550     {
00551       j = 0;
00552       for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
00553        {
00554          if (tmp[i] == SS2)
00555            {
00556              p1 = tmp[++i];
00557              if (tmp[i + 1] == SS2)
00558               {
00559                 p2 = tmp[i + 2];
00560                 if (p2 == 222 || p2 == 223)
00561                   i += 2;
00562                 else
00563                   p2 = 0;
00564               }
00565              else
00566               p2 = 0;
00567              han2zen (&p1, &p2);
00568              SJIStoJIS (&p1, &p2);
00569              to[j++] = p1 + 128;
00570              to[j++] = p2 + 128;
00571            }
00572          else
00573            to[j++] = tmp[i];
00574        }
00575 
00576       if (j >= BUFSIZ)
00577        {
00578          error ("output buffer overflow at Hankaku --> Zenkaku");
00579          ustrcpy (to, tmp);
00580        }
00581       else
00582        to[j] = '\0';
00583     }
00584   else
00585     ustrcpy (to, tmp);
00586 
00587   return kanji;
00588 }
00589 
/*
 * Convert `src` (JIS, Shift_JIS, EUC-JP or ASCII, auto-detected) to EUC-JP
 * and store the NUL-terminated result in `dest`.
 *
 *   dest      output buffer of at least dest_max bytes
 *   src       NUL-terminated input; must be shorter than BUFSIZ
 *   dest_max  size of dest in bytes; must not exceed BUFSIZ
 *
 * Returns the value of do_check_and_conv() (TRUE if the text was treated as
 * Kanji, FALSE if it was passed through), or -1 on error.
 *
 * When the converted text does not fit in dest, the raw input is stored
 * instead, truncated if necessary so that no more than dest_max bytes
 * (terminator included) are ever written, and -1 is returned.  When an
 * argument is rejected up front, dest is not written at all.
 *
 * Uses a static work buffer, so the function is not reentrant.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error
        ("invalid maximum size of destination\nit should be less than %d.",
         BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the unconverted input, but bound the copy: src may
         itself be longer than the caller's buffer.  */
      if (dest_max > 0)
        {
          size_t n = (src_len < dest_max) ? src_len : (size_t) dest_max - 1;

          memcpy (dest, src, n);
          dest[n] = '\0';
        }
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
00618 
00619 #if 0
/*
 * Return the length in bytes of `s` after conversion to EUC-JP by
 * any2eucjp().  Returns 0 if the work buffer cannot be allocated.
 * NOTE(review): assumes gdMalloc() returns NULL on failure - confirm
 * against gdhelpers.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  /* any2eucjp() can return early without writing to its destination, so
     start from an empty string to keep the strlen() below defined.  */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}
00632 
00633 #ifdef DEBUG
/*
 * Debug driver: reads one line from stdin, prints its length before and
 * after conversion, then prints the converted text.
 * Returns 0 on success, 1 if the output buffer cannot be allocated.
 */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at end of line, end of input, or when only the slot for the
     terminator is left in the buffer.  */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != '\n' && c != EOF)
    input[i++] = c;
  input[i] = '\0';

  printf ("input : %lu bytes\n",
          (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    {
      error ("out of memory");
      return 1;
    }
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
00659 #endif
00660 #endif