Back to index

tetex-bin  3.0
gdkanji.c
Go to the documentation of this file.
00001 /* gdkanji.c (Kanji code converter)                            */
00002 /*                 written by Masahito Yamaga (ma@yama-ga.com) */
00003 
00004 #ifdef HAVE_CONFIG_H
00005 #include "config.h"
00006 #endif
00007 
00008 #include <stdio.h>
00009 #include <stdlib.h>
00010 #include <string.h>
00011 #include "gd.h"
00012 #include "gdhelpers.h"
00013 
00014 #ifdef HAVE_ERRNO_H
00015 #include <errno.h>
00016 #endif
00017 
00018 #include <stdarg.h>
00019 #if defined(HAVE_ICONV_H)
00020 #include <iconv.h>
00021 #endif
00022 
00023 #ifndef HAVE_ICONV_T_DEF
00024 typedef void *iconv_t;
00025 #endif
00026 
00027 #ifndef HAVE_ICONV
00028 #define ICONV_CONST 
00029   iconv_t iconv_open (const char *, const char *);
00030 size_t iconv (iconv_t, ICONV_CONST char **, size_t *, char **, size_t *);
00031 int iconv_close (iconv_t);
00032 
/* Stand-in for iconv_open() used when no real iconv is available.
   Both encoding names are ignored and the failure value (iconv_t) -1
   is always returned. */
iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
  iconv_t no_descriptor = (iconv_t) (-1);

  (void) tocode;
  (void) fromcode;
  return no_descriptor;
}
00038 
00039 size_t
00040 iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft,
00041        char **outbuf, size_t * outbytesleft)
00042 {
00043   return 0;
00044 }
00045 
/* Stand-in for iconv_close() used when no real iconv is available.
   The descriptor is ignored and 0 is always returned. */
int
iconv_close (iconv_t cd)
{
  const int ok = 0;

  (void) cd;
  return ok;
}
00051 
00052 #endif /* !HAVE_ICONV */
00053 
00054 #define LIBNAME "any2eucjp()"
00055 
00056 #if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
00057 #ifndef SJISPRE
00058 #define SJISPRE 1
00059 #endif
00060 #endif
00061 
00062 #ifdef TRUE
00063 #undef TRUE
00064 #endif
00065 #ifdef FALSE
00066 #undef FALSE
00067 #endif
00068 
00069 #define TRUE  1
00070 #define FALSE 0
00071 
00072 #define NEW 1
00073 #define OLD 2
00074 #define ESCI 3
00075 #define NEC 4
00076 #define EUC 5
00077 #define SJIS 6
00078 #define EUCORSJIS 7
00079 #define ASCII 8
00080 
00081 #define NEWJISSTR "JIS7"
00082 #define OLDJISSTR "jis"
00083 #define EUCSTR    "eucJP"
00084 #define SJISSTR   "SJIS"
00085 
00086 #define ESC 27
00087 #define SS2 142
00088 
/* Write a printf-style diagnostic to stdout, prefixed with LIBNAME and
   followed by a newline.  Compiles to an empty function unless DEBUG
   is defined. */
static void
debug (const char *format, ...)
{
#ifdef DEBUG
  va_list ap;

  fputs (LIBNAME ": ", stdout);

  va_start (ap, format);
  vfprintf (stdout, format, ap);
  va_end (ap);

  fputc ('\n', stdout);
#endif
}
00102 
00103 static void
00104 error (const char *format, ...)
00105 {
00106   va_list args;
00107 
00108   va_start (args, format);
00109   fprintf (stderr, "%s: ", LIBNAME);
00110   vfprintf (stderr, format, args);
00111   fprintf (stderr, "\n");
00112   va_end (args);
00113 }
00114 
00115 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
00116 
/* Guess which Japanese encoding the NUL-terminated byte string `str`
   uses.

   The string is scanned byte by byte until a byte (or byte pair) is
   seen that settles the question, or until the terminating NUL.

   Returns one of NEW, OLD, ESCI, NEC, EUC, SJIS or ASCII.  The
   intermediate state EUCORSJIS ("could be either") is never returned:
   it is resolved below from the previous call's result, then from the
   locale environment variables, and finally from the SJISPRE build
   setting.

   The result is kept in a function-level static, so the outcome for an
   ambiguous string depends on what earlier calls detected. */
static int
DetectKanjiCode (unsigned char *str)
{
  /* Result of the previous call; persists between calls. */
  static int whatcode = ASCII;
  /* Previous definite result, used as a fallback for ambiguous input. */
  int oldcode = ASCII;
  int c, i;
  char *lang = NULL;

  /* Any non-NUL start value, so the scan loop is entered. */
  c = '\1';
  i = 0;

  /* Remember a definite result from the previous call and start this
     scan from the neutral ASCII state. */
  if (whatcode != EUCORSJIS && whatcode != ASCII)
    {
      oldcode = whatcode;
      whatcode = ASCII;
    }

  /* Scan until the code is decided or the end of the string is hit. */
  while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
    {
      if ((c = str[i++]) != '\0')
       {
         /* Escape sequences: ESC $ B -> NEW, ESC $ @ -> OLD,
            ESC ( I -> ESCI, ESC K -> NEC. */
         if (c == ESC)
           {
             c = str[i++];
             if (c == '$')
              {
                c = str[i++];
                if (c == 'B')
                  whatcode = NEW;
                else if (c == '@')
                  whatcode = OLD;
              }
             else if (c == '(')
              {
                c = str[i++];
                if (c == 'I')
                  whatcode = ESCI;
              }
             else if (c == 'K')
              whatcode = NEC;
           }
         /* Bytes 129-141 and 143-159 are treated as decisive for SJIS. */
         else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
           whatcode = SJIS;
         /* Byte 142 (SS2): the decision depends on the byte after it. */
         else if (c == SS2)
           {
             c = str[i++];
             if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160)
                || (c >= 224 && c <= 252))
              whatcode = SJIS;
             else if (c >= 161 && c <= 223)
              whatcode = EUCORSJIS;
           }
         /* Bytes 161-223: look at the following byte. */
         else if (c >= 161 && c <= 223)
           {
             c = str[i++];
             if (c >= 240 && c <= 254)
              whatcode = EUC;
             else if (c >= 161 && c <= 223)
              whatcode = EUCORSJIS;
             else if (c >= 224 && c <= 239)
              {
                /* Still ambiguous: keep reading while bytes stay >= 64,
                   until one of them decides between SJIS and EUC. */
                whatcode = EUCORSJIS;
                while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
                  {
                    if (c >= 129)
                     {
                       if (c <= 141 || (c >= 143 && c <= 159))
                         whatcode = SJIS;
                       else if (c >= 253 && c <= 254)
                         whatcode = EUC;
                     }
                    c = str[i++];
                  }
              }
             /* Any following byte <= 159 (the terminating NUL included)
                selects SJIS. */
             else if (c <= 159)
              whatcode = SJIS;
           }
         /* Bytes 240-254 are treated as decisive for EUC. */
         else if (c >= 240 && c <= 254)
           whatcode = EUC;
         /* Bytes 224-239: look at the following byte. */
         else if (c >= 224 && c <= 239)
           {
             c = str[i++];
             if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
              whatcode = SJIS;
             else if (c >= 253 && c <= 254)
              whatcode = EUC;
             else if (c >= 161 && c <= 252)
              whatcode = EUCORSJIS;
           }
       }
    }

#ifdef DEBUG
  if (whatcode == ASCII)
    debug ("Kanji code not included.");
  else if (whatcode == EUCORSJIS)
    debug ("Kanji code not detected.");
  else
    debug ("Kanji code detected at %d byte.", i);
#endif

  /* Ambiguous input: fall back to what the previous call decided. */
  if (whatcode == EUCORSJIS && oldcode != ASCII)
    whatcode = oldcode;

  /* Still ambiguous: consult LC_ALL, then LC_CTYPE, then LANG. */
  if (whatcode == EUCORSJIS)
    {
      if (getenv ("LC_ALL"))
       lang = getenv ("LC_ALL");
      else if (getenv ("LC_CTYPE"))
       lang = getenv ("LC_CTYPE");
      else if (getenv ("LANG"))
       lang = getenv ("LANG");

      if (lang)
       {
         /* Locale names that explicitly select SJIS. */
         if (strcmp (lang, "ja_JP.SJIS") == 0 ||
#ifdef hpux
             strcmp (lang, "japanese") == 0 ||
#endif
             strcmp (lang, "ja_JP.mscode") == 0 ||
             strcmp (lang, "ja_JP.PCK") == 0)
           whatcode = SJIS;
         /* Any other locale starting with "ja": use the build default. */
         else if (strncmp (lang, "ja", 2) == 0)
#ifdef SJISPRE
           whatcode = SJIS;
#else
           whatcode = EUC;
#endif
       }
    }

  /* Last resort: the build-time preference (SJIS if SJISPRE is defined,
     EUC otherwise). */
  if (whatcode == EUCORSJIS)
#ifdef SJISPRE
    whatcode = SJIS;
#else
    whatcode = EUC;
#endif

  return whatcode;
}
00257 
00258 /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
00259 
/* Convert a two-byte Shift-JIS code, passed as lead byte *p1 and trail
   byte *p2, into the corresponding two-byte JIS code.  Both values are
   overwritten in place. */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = *p1;
  const unsigned char trail = *p2;
  int first_of_pair;
  int row_base;
  int cell_shift;

  /* Trail bytes below 159 belong to the first JIS row of the pair. */
  first_of_pair = (trail < 159) ? 1 : 0;

  if (lead < 160)
    row_base = 112;
  else
    row_base = 176;

  if (first_of_pair)
    cell_shift = (trail > 127) ? 32 : 31;
  else
    cell_shift = 126;

  *p1 = ((lead - row_base) << 1) - first_of_pair;
  *p2 = *p2 - cell_shift;
}
00272 
00273 /* han2zen() was derived from han2zen() written by Ken Lunde. */
00274 
/* Half-width kana bytes that can take a following dakuten (voiced mark,
   byte 222) or handakuten (semi-voiced mark, byte 223). */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/* Convert a half-width kana byte to a full-width Shift-JIS pair.

   On entry *p1 is the half-width kana byte (161..223) and *p2 is the
   byte that follows it: 222 (dakuten), 223 (handakuten) or anything
   else if there is no mark.  On return *p1/*p2 hold the lead and trail
   byte of the full-width character, with the mark folded in when the
   kana can carry it.

   If *p1 is outside 161..223 there is no table entry for it, so both
   values are left untouched. */
static void
han2zen (int *p1, int *p2)
{
  /* Full-width Shift-JIS pair for each half-width byte 161..223. */
  static const int mtable[][2] = {
    {129, 66},
    {129, 117},
    {129, 118},
    {129, 65},
    {129, 69},
    {131, 146},
    {131, 64},
    {131, 66},
    {131, 68},
    {131, 70},
    {131, 72},
    {131, 131},
    {131, 133},
    {131, 135},
    {131, 98},
    {129, 91},
    {131, 65},
    {131, 67},
    {131, 69},
    {131, 71},
    {131, 73},
    {131, 74},
    {131, 76},
    {131, 78},
    {131, 80},
    {131, 82},
    {131, 84},
    {131, 86},
    {131, 88},
    {131, 90},
    {131, 92},
    {131, 94},
    {131, 96},
    {131, 99},
    {131, 101},
    {131, 103},
    {131, 105},
    {131, 106},
    {131, 107},
    {131, 108},
    {131, 109},
    {131, 110},
    {131, 113},
    {131, 116},
    {131, 119},
    {131, 122},
    {131, 125},
    {131, 126},
    {131, 128},
    {131, 129},
    {131, 130},
    {131, 132},
    {131, 134},
    {131, 136},
    {131, 137},
    {131, 138},
    {131, 139},
    {131, 140},
    {131, 141},
    {131, 143},
    {131, 147},
    {129, 74},
    {129, 75}
  };
  const int entries = (int) (sizeof mtable / sizeof mtable[0]);
  int c = *p1;
  int daku = 0;
  int handaku = 0;

  /* Refuse bytes that have no table entry instead of reading outside
     the table. */
  if (c < 161 || c - 161 >= entries)
    return;

  if (*p2 == 222 && IS_DAKU (c))
    daku = 1;                   /* Daku-ten */
  else if (*p2 == 223 && IS_HANDAKU (c))
    handaku = 1;                /* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      /* The voiced form sits right after the plain form. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
        (*p2)++;
      /* U (131,69) + dakuten is VU (131,148), which is not adjacent. */
      else if (*p1 == 131 && *p2 == 69)
        *p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;                 /* semi-voiced form is two cells on */
}
00368 
00369 /* Recast strcpy to handle unsigned chars used below. */
00370 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
00371 
00372 static void
00373 do_convert (unsigned char **to_p, unsigned char **from_p, const char *code)
00374 {
00375 unsigned char *to = *to_p;
00376 unsigned char *from = *from_p;
00377 #ifdef HAVE_ICONV
00378   iconv_t cd;
00379   size_t from_len, to_len;
00380 
00381   if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
00382     {
00383       error ("iconv_open() error");
00384 #ifdef HAVE_ERRNO_H
00385       if (errno == EINVAL)
00386        error ("invalid code specification: \"%s\" or \"%s\"", EUCSTR, code);
00387 #endif
00388       ustrcpy (to, from);
00389       return;
00390     }
00391 
00392   from_len = strlen ((const char *)from) + 1;
00393   to_len = BUFSIZ;
00394 
00395   if ((int) (iconv (cd, (char **)from_p, &from_len, (char **)to_p, &to_len))
00396       == -1)
00397     {
00398 #ifdef HAVE_ERRNO_H
00399       if (errno == EINVAL)
00400        error ("invalid end of input string");
00401       else if (errno == EILSEQ)
00402        error ("invalid code in input string");
00403       else if (errno == E2BIG)
00404        error ("output buffer overflow at do_convert()");
00405       else
00406 #endif
00407        error ("something happen");
00408       ustrcpy (to, from);
00409       return;
00410     }
00411 
00412   if (iconv_close (cd) != 0)
00413     {
00414       error ("iconv_close() error");
00415     }
00416 #else
00417   int p1, p2, i, j;
00418   int jisx0208 = FALSE;
00419   int hankaku = FALSE;
00420 
00421   j = 0;
00422   if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
00423     {
00424       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00425        {
00426          if (from[i] == ESC)
00427            {
00428              i++;
00429              if (from[i] == '$')
00430               {
00431                 jisx0208 = TRUE;
00432                 hankaku = FALSE;
00433                 i++;
00434               }
00435              else if (from[i] == '(')
00436               {
00437                 jisx0208 = FALSE;
00438                 i++;
00439                 if (from[i] == 'I')       /* Hankaku Kana */
00440                   hankaku = TRUE;
00441                 else
00442                   hankaku = FALSE;
00443               }
00444            }
00445          else
00446            {
00447              if (jisx0208)
00448               to[j++] = from[i] + 128;
00449              else if (hankaku)
00450               {
00451                 to[j++] = SS2;
00452                 to[j++] = from[i] + 128;
00453               }
00454              else
00455               to[j++] = from[i];
00456            }
00457        }
00458     }
00459   else if (strcmp (code, SJISSTR) == 0)
00460     {
00461       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00462        {
00463          p1 = from[i];
00464          if (p1 < 127)
00465            to[j++] = p1;
00466          else if ((p1 >= 161) && (p1 <= 223))
00467            {                /* Hankaku Kana */
00468              to[j++] = SS2;
00469              to[j++] = p1;
00470            }
00471          else
00472            {
00473              p2 = from[++i];
00474              SJIStoJIS (&p1, &p2);
00475              to[j++] = p1 + 128;
00476              to[j++] = p2 + 128;
00477            }
00478        }
00479     }
00480   else
00481     {
00482       error ("invalid code specification: \"%s\"", code);
00483       return;
00484     }
00485 
00486   if (j >= BUFSIZ)
00487     {
00488       error ("output buffer overflow at do_convert()");
00489       ustrcpy (to, from);
00490     }
00491   else
00492     to[j] = '\0';
00493 #endif /* HAVE_ICONV */
00494 }
00495 
00496 static int
00497 do_check_and_conv (unsigned char *to, unsigned char *from)
00498 {
00499   static unsigned char tmp[BUFSIZ];
00500   unsigned char *tmp_p = &tmp[0];
00501   int p1, p2, i, j;
00502   int kanji = TRUE;
00503 
00504   switch (DetectKanjiCode (from))
00505     {
00506     case NEW:
00507       debug ("Kanji code is New JIS.");
00508       do_convert (&tmp_p, &from, NEWJISSTR);
00509       break;
00510     case OLD:
00511       debug ("Kanji code is Old JIS.");
00512       do_convert (&tmp_p, &from, OLDJISSTR);
00513       break;
00514     case ESCI:
00515       debug
00516        ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
00517       do_convert (&tmp_p, &from, NEWJISSTR);
00518       break;
00519     case NEC:
00520       debug ("Kanji code is NEC Kanji.");
00521       error ("cannot convert NEC Kanji.");
00522       ustrcpy (tmp, from);
00523       kanji = FALSE;
00524       break;
00525     case EUC:
00526       debug ("Kanji code is EUC.");
00527       ustrcpy (tmp, from);
00528       break;
00529     case SJIS:
00530       debug ("Kanji code is SJIS.");
00531       do_convert (&tmp_p, &from, SJISSTR);
00532       break;
00533     case EUCORSJIS:
00534       debug ("Kanji code is EUC or SJIS.");
00535       ustrcpy (tmp, from);
00536       kanji = FALSE;
00537       break;
00538     case ASCII:
00539       debug ("This is ASCII string.");
00540       ustrcpy (tmp, from);
00541       kanji = FALSE;
00542       break;
00543     default:
00544       debug ("This string includes unknown code.");
00545       ustrcpy (tmp, from);
00546       kanji = FALSE;
00547       break;
00548     }
00549 
00550   /* Hankaku Kana ---> Zenkaku Kana */
00551   if (kanji)
00552     {
00553       j = 0;
00554       for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
00555        {
00556          if (tmp[i] == SS2)
00557            {
00558              p1 = tmp[++i];
00559              if (tmp[i + 1] == SS2)
00560               {
00561                 p2 = tmp[i + 2];
00562                 if (p2 == 222 || p2 == 223)
00563                   i += 2;
00564                 else
00565                   p2 = 0;
00566               }
00567              else
00568               p2 = 0;
00569              han2zen (&p1, &p2);
00570              SJIStoJIS (&p1, &p2);
00571              to[j++] = p1 + 128;
00572              to[j++] = p2 + 128;
00573            }
00574          else
00575            to[j++] = tmp[i];
00576        }
00577 
00578       if (j >= BUFSIZ)
00579        {
00580          error ("output buffer overflow at Hankaku --> Zenkaku");
00581          ustrcpy (to, tmp);
00582        }
00583       else
00584        to[j] = '\0';
00585     }
00586   else
00587     ustrcpy (to, tmp);
00588 
00589   return kanji;
00590 }
00591 
/* Convert the NUL-terminated string `src` (New/Old JIS, Shift-JIS, EUC
   or plain ASCII, detected automatically) to EUC-JP with half-width
   kana widened, and store the result in `dest`.

   dest      destination buffer of at least `dest_max` bytes
   src       input string, shorter than BUFSIZ bytes
   dest_max  capacity of `dest` in bytes, at most BUFSIZ

   Returns the value of do_check_and_conv() (TRUE if the input was
   handled as Kanji text, FALSE if it was copied unchanged), or -1 on
   error.  When the converted text does not fit, `dest` receives as much
   of the unconverted `src` as fits in `dest_max` bytes, NUL-terminated.
   On the other error returns `dest` is not written.

   A static scratch buffer is used, so the conversion state is shared
   by all callers. */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  int ret;

  if (dest == NULL || src == NULL)
    {
      error ("invalid (NULL) argument");
      return -1;
    }
  src_len = strlen ((const char *) src);
  if (src_len >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error
        ("invalid maximum size of destination\nit should be less than %d.",
         BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input, but never write more than the
         caller's buffer can hold. */
      if (dest_max > 0)
        {
          if (src_len >= dest_max)
            src_len = dest_max - 1;
          memcpy (dest, src, src_len);
          dest[src_len] = '\0';
        }
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
00620 
00621 #if 0
/* Return the length in bytes of `s` after conversion by any2eucjp().
   Returns 0 if the scratch buffer cannot be allocated or if the
   conversion fails before producing any output. */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  /* any2eucjp() can return early without writing to its destination;
     start from an empty string so strlen() is always well defined. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = strlen ((const char *) t);
  gdFree (t);
  return i;
}
00634 
00635 #ifdef DEBUG
/* Debug driver: read one line from stdin, report its length before and
   after conversion, and print the converted text. */
int
main ()
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at newline or end of input, and keep one byte free for the
     terminating NUL. */
  while ((c = fgetc (stdin)) != '\n' && c != EOF && i < BUFSIZ - 1)
    input[i++] = c;
  input[i] = '\0';

  printf ("input : %lu bytes\n",
          (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  /* any2eucjp() may fail without writing to its destination. */
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
00661 #endif
00662 #endif