Back to index

php5  5.3.10
gdkanji.c
Go to the documentation of this file.
00001 
00002 /* gdkanji.c (Kanji code converter)                            */
00003 /*                 written by Masahito Yamaga (ma@yama-ga.com) */
00004 
00005 #include <stdio.h>
00006 #include <stdlib.h>
00007 #include <string.h>
00008 #include "gd.h"
00009 #include "gdhelpers.h"
00010 
00011 #include <stdarg.h>
00012 #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
00013 #include <iconv.h>
00014 #ifdef HAVE_ERRNO_H
00015 #include <errno.h>
00016 #endif
00017 #endif
00018 
00019 #if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
00020 #define HAVE_ICONV 1
00021 #endif
00022 
00023 #define LIBNAME "any2eucjp()"
00024 
00025 #if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
00026 #ifndef SJISPRE
00027 #define SJISPRE 1
00028 #endif
00029 #endif
00030 
00031 #ifdef TRUE
00032 #undef TRUE
00033 #endif
00034 #ifdef FALSE
00035 #undef FALSE
00036 #endif
00037 
00038 #define TRUE  1
00039 #define FALSE 0
00040 
00041 #define NEW 1
00042 #define OLD 2
00043 #define ESCI 3
00044 #define NEC 4
00045 #define EUC 5
00046 #define SJIS 6
00047 #define EUCORSJIS 7
00048 #define ASCII 8
00049 
00050 #define NEWJISSTR "JIS7"
00051 #define OLDJISSTR "jis"
00052 #define EUCSTR    "eucJP"
00053 #define SJISSTR   "SJIS"
00054 
00055 #define ESC 27
00056 #define SS2 142
00057 
/* Print a printf-style diagnostic line on stdout, prefixed with the
   library name and followed by a newline.  The body is compiled only
   when DEBUG is defined; otherwise the call does nothing. */
static void
debug (const char *format,...)
{
#ifdef DEBUG
  va_list ap;

  /* LIBNAME is a string literal, so the prefix can be pasted together. */
  fputs (LIBNAME ": ", stdout);

  va_start (ap, format);
  vfprintf (stdout, format, ap);
  va_end (ap);

  fputc ('\n', stdout);
#endif
}
00071 
00072 static void
00073 error (const char *format,...)
00074 {
00075        va_list args;
00076        char *tmp;
00077        TSRMLS_FETCH();
00078 
00079        va_start(args, format);
00080        vspprintf(&tmp, 0, format, args);
00081        va_end(args);
00082        php_error_docref(NULL TSRMLS_CC, E_WARNING, "%s: %s", LIBNAME, tmp);
00083        efree(tmp);
00084 }
00085 
00086 /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
00087 
00088 static int
00089 DetectKanjiCode (unsigned char *str)
00090 {
00091   static int whatcode = ASCII;
00092   int oldcode = ASCII;
00093   int c, i;
00094   char *lang = NULL;
00095 
00096   c = '\1';
00097   i = 0;
00098 
00099   if (whatcode != EUCORSJIS && whatcode != ASCII)
00100     {
00101       oldcode = whatcode;
00102       whatcode = ASCII;
00103     }
00104 
00105   while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
00106     {
00107       if ((c = str[i++]) != '\0')
00108        {
00109          if (c == ESC)
00110            {
00111              c = str[i++];
00112              if (c == '$')
00113               {
00114                 c = str[i++];
00115                 if (c == 'B')
00116                   whatcode = NEW;
00117                 else if (c == '@')
00118                   whatcode = OLD;
00119               }
00120              else if (c == '(')
00121               {
00122                 c = str[i++];
00123                 if (c == 'I')
00124                   whatcode = ESCI;
00125               }
00126              else if (c == 'K')
00127               whatcode = NEC;
00128            }
00129          else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
00130            whatcode = SJIS;
00131          else if (c == SS2)
00132            {
00133              c = str[i++];
00134              if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
00135               whatcode = SJIS;
00136              else if (c >= 161 && c <= 223)
00137               whatcode = EUCORSJIS;
00138            }
00139          else if (c >= 161 && c <= 223)
00140            {
00141              c = str[i++];
00142              if (c >= 240 && c <= 254)
00143               whatcode = EUC;
00144              else if (c >= 161 && c <= 223)
00145               whatcode = EUCORSJIS;
00146              else if (c >= 224 && c <= 239)
00147               {
00148                 whatcode = EUCORSJIS;
00149                 while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
00150                   {
00151                     if (c >= 129)
00152                      {
00153                        if (c <= 141 || (c >= 143 && c <= 159))
00154                          whatcode = SJIS;
00155                        else if (c >= 253 && c <= 254)
00156                          whatcode = EUC;
00157                      }
00158                     c = str[i++];
00159                   }
00160               }
00161              else if (c <= 159)
00162               whatcode = SJIS;
00163            }
00164          else if (c >= 240 && c <= 254)
00165            whatcode = EUC;
00166          else if (c >= 224 && c <= 239)
00167            {
00168              c = str[i++];
00169              if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
00170               whatcode = SJIS;
00171              else if (c >= 253 && c <= 254)
00172               whatcode = EUC;
00173              else if (c >= 161 && c <= 252)
00174               whatcode = EUCORSJIS;
00175            }
00176        }
00177     }
00178 
00179 #ifdef DEBUG
00180   if (whatcode == ASCII)
00181     debug ("Kanji code not included.");
00182   else if (whatcode == EUCORSJIS)
00183     debug ("Kanji code not detected.");
00184   else
00185     debug ("Kanji code detected at %d byte.", i);
00186 #endif
00187 
00188   if (whatcode == EUCORSJIS && oldcode != ASCII)
00189     whatcode = oldcode;
00190 
00191   if (whatcode == EUCORSJIS)
00192     {
00193       if (getenv ("LC_ALL"))
00194        lang = getenv ("LC_ALL");
00195       else if (getenv ("LC_CTYPE"))
00196        lang = getenv ("LC_CTYPE");
00197       else if (getenv ("LANG"))
00198        lang = getenv ("LANG");
00199 
00200       if (lang)
00201        {
00202          if (strcmp (lang, "ja_JP.SJIS") == 0 ||
00203 #ifdef hpux
00204              strcmp (lang, "japanese") == 0 ||
00205 #endif
00206              strcmp (lang, "ja_JP.mscode") == 0 ||
00207              strcmp (lang, "ja_JP.PCK") == 0)
00208            whatcode = SJIS;
00209          else if (strncmp (lang, "ja", 2) == 0)
00210 #ifdef SJISPRE
00211            whatcode = SJIS;
00212 #else
00213            whatcode = EUC;
00214 #endif
00215        }
00216     }
00217 
00218   if (whatcode == EUCORSJIS)
00219 #ifdef SJISPRE
00220     whatcode = SJIS;
00221 #else
00222     whatcode = EUC;
00223 #endif
00224 
00225   return whatcode;
00226 }
00227 
00228 /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
00229 
/* Convert a Shift_JIS double-byte pair to the corresponding 7-bit JIS
   pair, in place.  On entry *p1 is the lead byte and *p2 the trail
   byte; on return they hold the first and second JIS byte. */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  int row_base;
  int cell_shift;
  int low_half;

  /* Lead bytes below 160 and those from 160 up use different bases. */
  row_base = (lead < 160) ? 112 : 176;

  /* Each lead byte covers two JIS rows; trail bytes below 159 belong to
     the first of them. */
  low_half = (trail < 159) ? 1 : 0;

  if (low_half)
    cell_shift = (trail > 127) ? 32 : 31;       /* 127 is skipped in SJIS */
  else
    cell_shift = 126;

  *p1 = ((lead - row_base) << 1) - low_half;
  *p2 -= cell_shift;
}
00242 
00243 /* han2zen() was derived from han2zen() written by Ken Lunde. */
00244 
/* Half-width katakana bytes that can take a voiced mark (dakuten, 222)
   or a semi-voiced mark (handakuten, 223).  Arguments are fully
   parenthesized so that any expression may be passed. */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/* Convert a half-width katakana byte to a two-byte full-width code.

   On entry *p1 is the half-width byte (161..223) and *p2 is the byte of
   a following mark (222 = dakuten, 223 = handakuten) or anything else
   if there is none.  On return *p1/*p2 hold the two bytes looked up in
   the table below, adjusted for the mark when the base character
   accepts it.  The caller passes the result on to SJIStoJIS().

   If *p1 is outside 161..223 there is no table entry for it; both
   values are then left untouched instead of indexing out of bounds. */
static void
han2zen (int *p1, int *p2)
{
  /* Indexed by (byte - 161); one entry per byte in 161..223. */
  static const int mtable[][2] =
  {
    {129, 66},
    {129, 117},
    {129, 118},
    {129, 65},
    {129, 69},
    {131, 146},
    {131, 64},
    {131, 66},
    {131, 68},
    {131, 70},
    {131, 72},
    {131, 131},
    {131, 133},
    {131, 135},
    {131, 98},
    {129, 91},
    {131, 65},
    {131, 67},
    {131, 69},
    {131, 71},
    {131, 73},
    {131, 74},
    {131, 76},
    {131, 78},
    {131, 80},
    {131, 82},
    {131, 84},
    {131, 86},
    {131, 88},
    {131, 90},
    {131, 92},
    {131, 94},
    {131, 96},
    {131, 99},
    {131, 101},
    {131, 103},
    {131, 105},
    {131, 106},
    {131, 107},
    {131, 108},
    {131, 109},
    {131, 110},
    {131, 113},
    {131, 116},
    {131, 119},
    {131, 122},
    {131, 125},
    {131, 126},
    {131, 128},
    {131, 129},
    {131, 130},
    {131, 132},
    {131, 134},
    {131, 136},
    {131, 137},
    {131, 138},
    {131, 139},
    {131, 140},
    {131, 141},
    {131, 143},
    {131, 147},
    {129, 74},
    {129, 75}
  };
  const int c = *p1;
  int daku = 0;
  int handaku = 0;

  /* Guard the table lookup: only 161..223 have an entry. */
  if (c < 161 || c > 223)
    return;

  if (*p2 == 222 && IS_DAKU (c))
    daku = 1;               /* Daku-ten */
  else if (*p2 == 223 && IS_HANDAKU (c))
    handaku = 1;            /* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
        (*p2)++;
      /* U + dakuten -> VU.  The lead byte must be tested here; the
         original compared *p2 with two different values, which can
         never be true. */
      else if (*p1 == 131 && *p2 == 69)
        *p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;
}
00339 
00340 /* Recast strcpy to handle unsigned chars used below. */
00341 #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
00342 
00343 static void
00344 do_convert (unsigned char *to, unsigned char *from, const char *code)
00345 {
00346 #ifdef HAVE_ICONV
00347   iconv_t cd;
00348   size_t from_len, to_len;
00349 
00350   if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
00351     {
00352       error ("iconv_open() error");
00353 #ifdef HAVE_ERRNO_H
00354       if (errno == EINVAL)
00355        error ("invalid code specification: \"%s\" or \"%s\"",
00356               EUCSTR, code);
00357 #endif
00358       strcpy ((char *) to, (const char *) from);
00359       return;
00360     }
00361 
00362   from_len = strlen ((const char *) from) + 1;
00363   to_len = BUFSIZ;
00364 
00365   if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
00366     {
00367 #ifdef HAVE_ERRNO_H
00368       if (errno == EINVAL)
00369        error ("invalid end of input string");
00370       else if (errno == EILSEQ)
00371        error ("invalid code in input string");
00372       else if (errno == E2BIG)
00373        error ("output buffer overflow at do_convert()");
00374       else
00375 #endif
00376        error ("something happen");
00377       strcpy ((char *) to, (const char *) from);
00378       return;
00379     }
00380 
00381   if (iconv_close (cd) != 0)
00382     {
00383       error ("iconv_close() error");
00384     }
00385 #else
00386   int p1, p2, i, j;
00387   int jisx0208 = FALSE;
00388   int hankaku = FALSE;
00389 
00390   j = 0;
00391   if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
00392     {
00393       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00394        {
00395          if (from[i] == ESC)
00396            {
00397              i++;
00398              if (from[i] == '$')
00399               {
00400                 jisx0208 = TRUE;
00401                 hankaku = FALSE;
00402                 i++;
00403               }
00404              else if (from[i] == '(')
00405               {
00406                 jisx0208 = FALSE;
00407                 i++;
00408                 if (from[i] == 'I')       /* Hankaku Kana */
00409                   hankaku = TRUE;
00410                 else
00411                   hankaku = FALSE;
00412               }
00413            }
00414          else
00415            {
00416              if (jisx0208)
00417               to[j++] = from[i] + 128;
00418              else if (hankaku)
00419               {
00420                 to[j++] = SS2;
00421                 to[j++] = from[i] + 128;
00422               }
00423              else
00424               to[j++] = from[i];
00425            }
00426        }
00427     }
00428   else if (strcmp (code, SJISSTR) == 0)
00429     {
00430       for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
00431        {
00432          p1 = from[i];
00433          if (p1 < 127)
00434            to[j++] = p1;
00435          else if ((p1 >= 161) && (p1 <= 223))
00436            {                /* Hankaku Kana */
00437              to[j++] = SS2;
00438              to[j++] = p1;
00439            }
00440          else
00441            {
00442              p2 = from[++i];
00443              SJIStoJIS (&p1, &p2);
00444              to[j++] = p1 + 128;
00445              to[j++] = p2 + 128;
00446            }
00447        }
00448     }
00449   else
00450     {
00451       error ("invalid code specification: \"%s\"", code);
00452       return;
00453     }
00454 
00455   if (j >= BUFSIZ)
00456     {
00457       error ("output buffer overflow at do_convert()");
00458       ustrcpy (to, from);
00459     }
00460   else
00461     to[j] = '\0';
00462 #endif /* HAVE_ICONV */
00463 }
00464 
00465 static int
00466 do_check_and_conv (unsigned char *to, unsigned char *from)
00467 {
00468   static unsigned char tmp[BUFSIZ];
00469   int p1, p2, i, j;
00470   int kanji = TRUE;
00471 
00472   switch (DetectKanjiCode (from))
00473     {
00474     case NEW:
00475       debug ("Kanji code is New JIS.");
00476       do_convert (tmp, from, NEWJISSTR);
00477       break;
00478     case OLD:
00479       debug ("Kanji code is Old JIS.");
00480       do_convert (tmp, from, OLDJISSTR);
00481       break;
00482     case ESCI:
00483       debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
00484       do_convert (tmp, from, NEWJISSTR);
00485       break;
00486     case NEC:
00487       debug ("Kanji code is NEC Kanji.");
00488       error ("cannot convert NEC Kanji.");
00489       ustrcpy (tmp, from);
00490       kanji = FALSE;
00491       break;
00492     case EUC:
00493       debug ("Kanji code is EUC.");
00494       ustrcpy (tmp, from);
00495       break;
00496     case SJIS:
00497       debug ("Kanji code is SJIS.");
00498       do_convert (tmp, from, SJISSTR);
00499       break;
00500     case EUCORSJIS:
00501       debug ("Kanji code is EUC or SJIS.");
00502       ustrcpy (tmp, from);
00503       kanji = FALSE;
00504       break;
00505     case ASCII:
00506       debug ("This is ASCII string.");
00507       ustrcpy (tmp, from);
00508       kanji = FALSE;
00509       break;
00510     default:
00511       debug ("This string includes unknown code.");
00512       ustrcpy (tmp, from);
00513       kanji = FALSE;
00514       break;
00515     }
00516 
00517   /* Hankaku Kana ---> Zenkaku Kana */
00518   if (kanji)
00519     {
00520       j = 0;
00521       for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
00522        {
00523          if (tmp[i] == SS2)
00524            {
00525              p1 = tmp[++i];
00526              if (tmp[i + 1] == SS2)
00527               {
00528                 p2 = tmp[i + 2];
00529                 if (p2 == 222 || p2 == 223)
00530                   i += 2;
00531                 else
00532                   p2 = 0;
00533               }
00534              else
00535               p2 = 0;
00536              han2zen (&p1, &p2);
00537              SJIStoJIS (&p1, &p2);
00538              to[j++] = p1 + 128;
00539              to[j++] = p2 + 128;
00540            }
00541          else
00542            to[j++] = tmp[i];
00543        }
00544 
00545       if (j >= BUFSIZ)
00546        {
00547          error ("output buffer overflow at Hankaku --> Zenkaku");
00548          ustrcpy (to, tmp);
00549        }
00550       else
00551        to[j] = '\0';
00552     }
00553   else
00554     ustrcpy (to, tmp);
00555 
00556   return kanji;
00557 }
00558 
/* Convert the NUL-terminated string SRC (JIS, Shift_JIS, EUC-JP or
   ASCII, auto-detected) to EUC-JP and store it in DEST.

   dest     receives the converted, NUL-terminated string.
   src      input string; must be shorter than BUFSIZ bytes.
   dest_max size of DEST in bytes, at most BUFSIZ.

   Returns the value of do_check_and_conv() (TRUE if the input was
   recognised as Japanese text, FALSE if it was copied as is), or -1
   on error.  When the converted text does not fit, DEST receives as
   much of the unconverted SRC as fits (terminated, possibly cut in the
   middle of a multi-byte character) and -1 is returned.

   A static work buffer is used for the conversion. */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  int ret;

  if (strlen ((const char *) src) >= BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* SRC itself may be longer than DEST, so the fallback copy has to
         be bounded by dest_max as well. */
      if (dest_max > 0)
        {
          size_t n = strlen ((const char *) src);

          if (n >= dest_max)
            n = dest_max - 1;
          memcpy (dest, src, n);
          dest[n] = '\0';
        }
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}
00585 
#if 0
/* Return the length in bytes of S after conversion to EUC-JP, or 0 if
   the work buffer cannot be allocated.  (Disabled code.) */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  /* any2eucjp() can fail before writing anything to its destination. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}

#ifdef DEBUG
/* Test driver: read one line from stdin, print its size before and
   after conversion, then print the converted text. */
int
main ()
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Stop at end of line or end of input, and keep one byte free for
     the terminator. */
  while ((c = fgetc (stdin)) != '\n' && c != EOF && i < BUFSIZ - 1)
    input[i++] = c;
  input[i] = '\0';

  printf ("input : %lu bytes\n", (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
#endif
#endif