Back to index

courier  0.68.2
localcharset.c
Go to the documentation of this file.
00001 /* Determine a canonical name for the current locale's character encoding.
00002 
00003    Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
00004 
00005    This program is free software; you can redistribute it and/or modify it
00006    under the terms of the GNU Library General Public License as published
00007    by the Free Software Foundation; either version 2, or (at your option)
00008    any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Library General Public License for more details.
00014 
00015    You should have received a copy of the GNU Library General Public
00016    License along with this program; if not, write to the Free Software
00017    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
00018    USA.  */
00019 
00020 /* Written by Bruno Haible <bruno@clisp.org>.  */
00021 
00022 #include <config.h>
00023 
00024 /* Specification.  */
00025 #include "localcharset.h"
00026 
00027 #include <fcntl.h>
00028 #include <stddef.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <stdlib.h>
00032 
00033 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
00034 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
00035 #endif
00036 
00037 #if defined _WIN32 || defined __WIN32__
00038 # define WIN32_NATIVE
00039 #endif
00040 
00041 #if defined __EMX__
00042 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
00043 # ifndef OS2
00044 #  define OS2
00045 # endif
00046 #endif
00047 
00048 #if !defined WIN32_NATIVE
00049 # include <unistd.h>
00050 # if HAVE_LANGINFO_CODESET
00051 #  include <langinfo.h>
00052 # else
00053 #  if 0 /* see comment below */
00054 #   include <locale.h>
00055 #  endif
00056 # endif
00057 # ifdef __CYGWIN__
00058 #  define WIN32_LEAN_AND_MEAN
00059 #  include <windows.h>
00060 # endif
00061 #elif defined WIN32_NATIVE
00062 # define WIN32_LEAN_AND_MEAN
00063 # include <windows.h>
00064 #endif
00065 #if defined OS2
00066 # define INCL_DOS
00067 # include <os2.h>
00068 #endif
00069 
00070 #if ENABLE_RELOCATABLE
00071 # include "relocatable.h"
00072 #else
00073 # define relocate(pathname) (pathname)
00074 #endif
00075 
00076 /* Get LIBDIR.  */
00077 #ifndef LIBDIR
00078 # include "configmake.h"
00079 #endif
00080 
/* The open() call in this file ORs O_NOFOLLOW into its flags; make the
   flag a no-op on platforms whose headers do not define it.  */
#if !defined O_NOFOLLOW
# define O_NOFOLLOW 0
#endif

/* Character inserted between a directory name and a file name when the
   directory name does not already end in a separator.  */
#if !defined DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

/* ISSLASH(C) is true if the character C separates file name components.  */
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  /* Win32, Cygwin, OS/2, DOS: slash and backslash both qualify.  */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#elif !defined ISSLASH
  /* Elsewhere, unless ISSLASH was already provided: only the separator.  */
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* When HAVE_DECL_GETC_UNLOCKED is set, route every getc in this file to
   getc_unlocked.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
00103 
00104 /* The following static variable is declared 'volatile' to avoid a
00105    possible multithread problem in the function get_charset_aliases. If we
00106    are running in a threaded environment, and if two threads initialize
00107    'charset_aliases' simultaneously, both will produce the same value,
00108    and everything will be ok if the two assignments to 'charset_aliases'
00109    are atomic. But I don't know what will happen if the two assignments mix.  */
00110 #if __STDC__ != 1
00111 # define volatile /* empty */
00112 #endif
00113 /* Pointer to the contents of the charset.alias file, if it has already been
00114    read, else NULL.  Its format is:
00115    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
00116 static const char * volatile charset_aliases;
00117 
00118 /* Return a pointer to the contents of the charset.alias file.  */
00119 static const char *
00120 get_charset_aliases (void)
00121 {
00122   const char *cp;
00123 
00124   cp = charset_aliases;
00125   if (cp == NULL)
00126     {
00127 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
00128       const char *dir;
00129       const char *base = "charset.alias";
00130       char *file_name;
00131 
00132       /* Make it possible to override the charset.alias location.  This is
00133          necessary for running the testsuite before "make install".  */
00134       dir = getenv ("CHARSETALIASDIR");
00135       if (dir == NULL || dir[0] == '\0')
00136         dir = relocate (LIBDIR);
00137 
00138       /* Concatenate dir and base into freshly allocated file_name.  */
00139       {
00140         size_t dir_len = strlen (dir);
00141         size_t base_len = strlen (base);
00142         int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
00143         file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
00144         if (file_name != NULL)
00145           {
00146             memcpy (file_name, dir, dir_len);
00147             if (add_slash)
00148               file_name[dir_len] = DIRECTORY_SEPARATOR;
00149             memcpy (file_name + dir_len + add_slash, base, base_len + 1);
00150           }
00151       }
00152 
00153       if (file_name == NULL)
00154         /* Out of memory.  Treat the file as empty.  */
00155         cp = "";
00156       else
00157         {
00158           int fd;
00159 
00160           /* Open the file.  Reject symbolic links on platforms that support
00161              O_NOFOLLOW.  This is a security feature.  Without it, an attacker
00162              could retrieve parts of the contents (namely, the tail of the
00163              first line that starts with "* ") of an arbitrary file by placing
00164              a symbolic link to that file under the name "charset.alias" in
00165              some writable directory and defining the environment variable
00166              CHARSETALIASDIR to point to that directory.  */
00167           fd = open (file_name,
00168                      O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
00169           if (fd < 0)
00170             /* File not found.  Treat it as empty.  */
00171             cp = "";
00172           else
00173             {
00174               FILE *fp;
00175 
00176               fp = fdopen (fd, "r");
00177               if (fp == NULL)
00178                 {
00179                   /* Out of memory.  Treat the file as empty.  */
00180                   close (fd);
00181                   cp = "";
00182                 }
00183               else
00184                 {
00185                   /* Parse the file's contents.  */
00186                   char *res_ptr = NULL;
00187                   size_t res_size = 0;
00188 
00189                   for (;;)
00190                     {
00191                       int c;
00192                       char buf1[50+1];
00193                       char buf2[50+1];
00194                       size_t l1, l2;
00195                       char *old_res_ptr;
00196 
00197                       c = getc (fp);
00198                       if (c == EOF)
00199                         break;
00200                       if (c == '\n' || c == ' ' || c == '\t')
00201                         continue;
00202                       if (c == '#')
00203                         {
00204                           /* Skip comment, to end of line.  */
00205                           do
00206                             c = getc (fp);
00207                           while (!(c == EOF || c == '\n'));
00208                           if (c == EOF)
00209                             break;
00210                           continue;
00211                         }
00212                       ungetc (c, fp);
00213                       if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
00214                         break;
00215                       l1 = strlen (buf1);
00216                       l2 = strlen (buf2);
00217                       old_res_ptr = res_ptr;
00218                       if (res_size == 0)
00219                         {
00220                           res_size = l1 + 1 + l2 + 1;
00221                           res_ptr = (char *) malloc (res_size + 1);
00222                         }
00223                       else
00224                         {
00225                           res_size += l1 + 1 + l2 + 1;
00226                           res_ptr = (char *) realloc (res_ptr, res_size + 1);
00227                         }
00228                       if (res_ptr == NULL)
00229                         {
00230                           /* Out of memory. */
00231                           res_size = 0;
00232                           if (old_res_ptr != NULL)
00233                             free (old_res_ptr);
00234                           break;
00235                         }
00236                       strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
00237                       strcpy (res_ptr + res_size - (l2 + 1), buf2);
00238                     }
00239                   fclose (fp);
00240                   if (res_size == 0)
00241                     cp = "";
00242                   else
00243                     {
00244                       *(res_ptr + res_size) = '\0';
00245                       cp = res_ptr;
00246                     }
00247                 }
00248             }
00249 
00250           free (file_name);
00251         }
00252 
00253 #else
00254 
00255 # if defined DARWIN7
00256       /* To avoid the trouble of installing a file that is shared by many
00257          GNU packages -- many packaging systems have problems with this --,
00258          simply inline the aliases here.  */
00259       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
00260            "ISO8859-2" "\0" "ISO-8859-2" "\0"
00261            "ISO8859-4" "\0" "ISO-8859-4" "\0"
00262            "ISO8859-5" "\0" "ISO-8859-5" "\0"
00263            "ISO8859-7" "\0" "ISO-8859-7" "\0"
00264            "ISO8859-9" "\0" "ISO-8859-9" "\0"
00265            "ISO8859-13" "\0" "ISO-8859-13" "\0"
00266            "ISO8859-15" "\0" "ISO-8859-15" "\0"
00267            "KOI8-R" "\0" "KOI8-R" "\0"
00268            "KOI8-U" "\0" "KOI8-U" "\0"
00269            "CP866" "\0" "CP866" "\0"
00270            "CP949" "\0" "CP949" "\0"
00271            "CP1131" "\0" "CP1131" "\0"
00272            "CP1251" "\0" "CP1251" "\0"
00273            "eucCN" "\0" "GB2312" "\0"
00274            "GB2312" "\0" "GB2312" "\0"
00275            "eucJP" "\0" "EUC-JP" "\0"
00276            "eucKR" "\0" "EUC-KR" "\0"
00277            "Big5" "\0" "BIG5" "\0"
00278            "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
00279            "GBK" "\0" "GBK" "\0"
00280            "GB18030" "\0" "GB18030" "\0"
00281            "SJIS" "\0" "SHIFT_JIS" "\0"
00282            "ARMSCII-8" "\0" "ARMSCII-8" "\0"
00283            "PT154" "\0" "PT154" "\0"
00284          /*"ISCII-DEV" "\0" "?" "\0"*/
00285            "*" "\0" "UTF-8" "\0";
00286 # endif
00287 
00288 # if defined VMS
00289       /* To avoid the troubles of an extra file charset.alias_vms in the
00290          sources of many GNU packages, simply inline the aliases here.  */
00291       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
00292          "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
00293          section 10.7 "Handling Different Character Sets".  */
00294       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
00295            "ISO8859-2" "\0" "ISO-8859-2" "\0"
00296            "ISO8859-5" "\0" "ISO-8859-5" "\0"
00297            "ISO8859-7" "\0" "ISO-8859-7" "\0"
00298            "ISO8859-8" "\0" "ISO-8859-8" "\0"
00299            "ISO8859-9" "\0" "ISO-8859-9" "\0"
00300            /* Japanese */
00301            "eucJP" "\0" "EUC-JP" "\0"
00302            "SJIS" "\0" "SHIFT_JIS" "\0"
00303            "DECKANJI" "\0" "DEC-KANJI" "\0"
00304            "SDECKANJI" "\0" "EUC-JP" "\0"
00305            /* Chinese */
00306            "eucTW" "\0" "EUC-TW" "\0"
00307            "DECHANYU" "\0" "DEC-HANYU" "\0"
00308            "DECHANZI" "\0" "GB2312" "\0"
00309            /* Korean */
00310            "DECKOREAN" "\0" "EUC-KR" "\0";
00311 # endif
00312 
00313 # if defined WIN32_NATIVE || defined __CYGWIN__
00314       /* To avoid the troubles of installing a separate file in the same
00315          directory as the DLL and of retrieving the DLL's directory at
00316          runtime, simply inline the aliases here.  */
00317 
00318       cp = "CP936" "\0" "GBK" "\0"
00319            "CP1361" "\0" "JOHAB" "\0"
00320            "CP20127" "\0" "ASCII" "\0"
00321            "CP20866" "\0" "KOI8-R" "\0"
00322            "CP20936" "\0" "GB2312" "\0"
00323            "CP21866" "\0" "KOI8-RU" "\0"
00324            "CP28591" "\0" "ISO-8859-1" "\0"
00325            "CP28592" "\0" "ISO-8859-2" "\0"
00326            "CP28593" "\0" "ISO-8859-3" "\0"
00327            "CP28594" "\0" "ISO-8859-4" "\0"
00328            "CP28595" "\0" "ISO-8859-5" "\0"
00329            "CP28596" "\0" "ISO-8859-6" "\0"
00330            "CP28597" "\0" "ISO-8859-7" "\0"
00331            "CP28598" "\0" "ISO-8859-8" "\0"
00332            "CP28599" "\0" "ISO-8859-9" "\0"
00333            "CP28605" "\0" "ISO-8859-15" "\0"
00334            "CP38598" "\0" "ISO-8859-8" "\0"
00335            "CP51932" "\0" "EUC-JP" "\0"
00336            "CP51936" "\0" "GB2312" "\0"
00337            "CP51949" "\0" "EUC-KR" "\0"
00338            "CP51950" "\0" "EUC-TW" "\0"
00339            "CP54936" "\0" "GB18030" "\0"
00340            "CP65001" "\0" "UTF-8" "\0";
00341 # endif
00342 #endif
00343 
00344       charset_aliases = cp;
00345     }
00346 
00347   return cp;
00348 }
00349 
#if !defined WIN32_NATIVE \
    && (defined OS2 || defined __CYGWIN__ || !HAVE_LANGINFO_CODESET)
/* Return the locale name selected through the environment: the value of
   LC_ALL if it is set and non-empty, else the value of LC_CTYPE if it is
   set and non-empty, else the value of LANG as is.  The result may
   therefore be NULL or an empty string.  */
static const char *
locale_name_from_environment (void)
{
  const char *locale;

  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  return locale;
}
#endif

#if !defined WIN32_NATIVE \
    && (defined OS2 || (defined __CYGWIN__ && HAVE_LANGINFO_CODESET))
/* Extract the encoding part of LOCALE, i.e. the text after the first '.'
   up to a possible "@modifier" trailer.
   Without a modifier, the result points into LOCALE itself.  With a
   modifier, the encoding is copied into BUF (of BUFSIZE bytes) and BUF is
   returned.
   Return NULL if LOCALE has no '.', if the encoding part is empty, or if
   it does not fit into BUF.  */
static const char *
encoding_from_locale_name (const char *locale, char *buf, size_t bufsize)
{
  const char *dot;
  const char *modifier;
  size_t len;

  dot = strchr (locale, '.');
  if (dot == NULL)
    return NULL;
  dot++;

  /* Look for the possible @... trailer and remove it, if any.  */
  modifier = strchr (dot, '@');
  if (modifier == NULL)
    /* An empty encoding ("en_US.") counts as no encoding at all, so that
       the caller never hands out an empty string.  */
    return (dot[0] != '\0' ? dot : NULL);

  len = (size_t) (modifier - dot);
  if (len == 0 || len >= bufsize)
    return NULL;
  memcpy (buf, dot, len);
  buf[len] = '\0';
  return buf;
}
#endif

/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed.  It normally points to static storage; on
   Cygwin and OS/2 it may instead point into the value of one of the
   environment variables LC_ALL, LC_CTYPE, LANG.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never an empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      static char buf[2 + 10 + 1];
      const char *locale = locale_name_from_environment ();

      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  This bypasses the alias lookup below.  */
          const char *encoding =
            encoding_from_locale_name (locale, buf, sizeof (buf));

          if (encoding != NULL)
            return encoding;
        }

      /* Woe32 has a function returning the locale's codepage as a number:
         GetACP().  This encoding is used by Cygwin, unless the user has set
         the environment variable CYGWIN=codepage:oem (which very few people
         do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack nl_langinfo (CODESET), use getenv.
     Most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.
     On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale_name_from_environment ();

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number:
     GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = locale_name_from_environment ();
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.
         This bypasses the alias lookup below.  */
      const char *encoding =
        encoding_from_locale_name (locale, buf, sizeof (buf));

      if (encoding != NULL)
        return encoding;

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             matches the %u conversion and the size of buf.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of NUL-terminated
     (alias, canonical) string pairs ended by an empty string; an alias of
     "*" matches every codeset.  */
  for (aliases = get_charset_aliases (); *aliases != '\0'; )
    {
      const char *canonical = aliases + strlen (aliases) + 1;

      if (strcmp (codeset, aliases) == 0
          || (aliases[0] == '*' && aliases[1] == '\0'))
        {
          codeset = canonical;
          break;
        }
      /* Step over the canonical name to the next alias.  */
      aliases = canonical + strlen (canonical) + 1;
    }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}