Back to index

cell-binutils  2.17cvs20070401
localcharset.c
Go to the documentation of this file.
00001 /* Determine a canonical name for the current locale's character encoding.
00002 
00003    Copyright (C) 2000-2003 Free Software Foundation, Inc.
00004 
00005    This program is free software; you can redistribute it and/or modify it
00006    under the terms of the GNU Library General Public License as published
00007    by the Free Software Foundation; either version 2, or (at your option)
00008    any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013    Library General Public License for more details.
00014 
00015    You should have received a copy of the GNU Library General Public
00016    License along with this program; if not, write to the Free Software
00017    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301,
00018    USA.  */
00019 
00020 /* Written by Bruno Haible <bruno@clisp.org>.  */
00021 
00022 #ifdef HAVE_CONFIG_H
00023 # include <config.h>
00024 #endif
00025 
00026 /* Specification.  */
00027 #include "localcharset.h"
00028 
00029 #if HAVE_STDDEF_H
00030 # include <stddef.h>
00031 #endif
00032 
00033 #include <stdio.h>
00034 #if HAVE_STRING_H
00035 # include <string.h>
00036 #else
00037 # include <strings.h>
00038 #endif
00039 #if HAVE_STDLIB_H
00040 # include <stdlib.h>
00041 #endif
00042 
00043 #if defined _WIN32 || defined __WIN32__
00044 # undef WIN32   /* avoid warning on mingw32 */
00045 # define WIN32
00046 #endif
00047 
00048 #if defined __EMX__
00049 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
00050 # define OS2
00051 #endif
00052 
00053 #if !defined WIN32
00054 # if HAVE_LANGINFO_CODESET
00055 #  include <langinfo.h>
00056 # else
00057 #  if HAVE_SETLOCALE
00058 #   include <locale.h>
00059 #  endif
00060 # endif
00061 #elif defined WIN32
00062 # define WIN32_LEAN_AND_MEAN
00063 # include <windows.h>
00064 #endif
00065 #if defined OS2
00066 # define INCL_DOS
00067 # include <os2.h>
00068 #endif
00069 
00070 #if ENABLE_RELOCATABLE
00071 # include "relocatable.h"
00072 #else
00073 # define relocate(pathname) (pathname)
00074 #endif
00075 
00076 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
00077   /* Win32, OS/2, DOS */
00078 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
00079 #endif
00080 
00081 #ifndef DIRECTORY_SEPARATOR
00082 # define DIRECTORY_SEPARATOR '/'
00083 #endif
00084 
00085 #ifndef ISSLASH
00086 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
00087 #endif
00088 
00089 #ifdef HAVE_GETC_UNLOCKED
00090 # undef getc
00091 # define getc getc_unlocked
00092 #endif
00093 
00094 /* The following static variable is declared 'volatile' to avoid a
00095    possible multithread problem in the function get_charset_aliases. If we
00096    are running in a threaded environment, and if two threads initialize
00097    'charset_aliases' simultaneously, both will produce the same value,
00098    and everything will be ok if the two assignments to 'charset_aliases'
00099    are atomic. But I don't know what will happen if the two assignments mix.  */
00100 #if __STDC__ != 1
00101 # define volatile /* empty */
00102 #endif
00103 /* Pointer to the contents of the charset.alias file, if it has already been
00104    read, else NULL.  Its format is:
00105    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
00106 static const char * volatile charset_aliases;
00107 
00108 /* Return a pointer to the contents of the charset.alias file.  */
00109 static const char *
00110 get_charset_aliases ()
00111 {
00112   const char *cp;
00113 
00114   cp = charset_aliases;
00115   if (cp == NULL)
00116     {
00117 #if !(defined VMS || defined WIN32)
00118       FILE *fp;
00119       const char *dir = relocate (LIBDIR);
00120       const char *base = "charset.alias";
00121       char *file_name;
00122 
00123       /* Concatenate dir and base into freshly allocated file_name.  */
00124       {
00125        size_t dir_len = strlen (dir);
00126        size_t base_len = strlen (base);
00127        int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
00128        file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
00129        if (file_name != NULL)
00130          {
00131            memcpy (file_name, dir, dir_len);
00132            if (add_slash)
00133              file_name[dir_len] = DIRECTORY_SEPARATOR;
00134            memcpy (file_name + dir_len + add_slash, base, base_len + 1);
00135          }
00136       }
00137 
00138       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
00139        /* Out of memory or file not found, treat it as empty.  */
00140        cp = "";
00141       else
00142        {
00143          /* Parse the file's contents.  */
00144          int c;
00145          char buf1[50+1];
00146          char buf2[50+1];
00147          char *res_ptr = NULL;
00148          size_t res_size = 0;
00149          size_t l1, l2;
00150 
00151          for (;;)
00152            {
00153              c = getc (fp);
00154              if (c == EOF)
00155               break;
00156              if (c == '\n' || c == ' ' || c == '\t')
00157               continue;
00158              if (c == '#')
00159               {
00160                 /* Skip comment, to end of line.  */
00161                 do
00162                   c = getc (fp);
00163                 while (!(c == EOF || c == '\n'));
00164                 if (c == EOF)
00165                   break;
00166                 continue;
00167               }
00168              ungetc (c, fp);
00169              if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
00170               break;
00171              l1 = strlen (buf1);
00172              l2 = strlen (buf2);
00173              if (res_size == 0)
00174               {
00175                 res_size = l1 + 1 + l2 + 1;
00176                 res_ptr = (char *) malloc (res_size + 1);
00177               }
00178              else
00179               {
00180                 res_size += l1 + 1 + l2 + 1;
00181                 res_ptr = (char *) realloc (res_ptr, res_size + 1);
00182               }
00183              if (res_ptr == NULL)
00184               {
00185                 /* Out of memory. */
00186                 res_size = 0;
00187                 break;
00188               }
00189              strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
00190              strcpy (res_ptr + res_size - (l2 + 1), buf2);
00191            }
00192          fclose (fp);
00193          if (res_size == 0)
00194            cp = "";
00195          else
00196            {
00197              *(res_ptr + res_size) = '\0';
00198              cp = res_ptr;
00199            }
00200        }
00201 
00202       if (file_name != NULL)
00203        free (file_name);
00204 
00205 #else
00206 
00207 # if defined VMS
00208       /* To avoid the troubles of an extra file charset.alias_vms in the
00209         sources of many GNU packages, simply inline the aliases here.  */
00210       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
00211         "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
00212         section 10.7 "Handling Different Character Sets".  */
00213       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
00214           "ISO8859-2" "\0" "ISO-8859-2" "\0"
00215           "ISO8859-5" "\0" "ISO-8859-5" "\0"
00216           "ISO8859-7" "\0" "ISO-8859-7" "\0"
00217           "ISO8859-8" "\0" "ISO-8859-8" "\0"
00218           "ISO8859-9" "\0" "ISO-8859-9" "\0"
00219           /* Japanese */
00220           "eucJP" "\0" "EUC-JP" "\0"
00221           "SJIS" "\0" "SHIFT_JIS" "\0"
00222           "DECKANJI" "\0" "DEC-KANJI" "\0"
00223           "SDECKANJI" "\0" "EUC-JP" "\0"
00224           /* Chinese */
00225           "eucTW" "\0" "EUC-TW" "\0"
00226           "DECHANYU" "\0" "DEC-HANYU" "\0"
00227           "DECHANZI" "\0" "GB2312" "\0"
00228           /* Korean */
00229           "DECKOREAN" "\0" "EUC-KR" "\0";
00230 # endif
00231 
00232 # if defined WIN32
00233       /* To avoid the troubles of installing a separate file in the same
00234         directory as the DLL and of retrieving the DLL's directory at
00235         runtime, simply inline the aliases here.  */
00236 
00237       cp = "CP936" "\0" "GBK" "\0"
00238           "CP1361" "\0" "JOHAB" "\0"
00239           "CP20127" "\0" "ASCII" "\0"
00240           "CP20866" "\0" "KOI8-R" "\0"
00241           "CP21866" "\0" "KOI8-RU" "\0"
00242           "CP28591" "\0" "ISO-8859-1" "\0"
00243           "CP28592" "\0" "ISO-8859-2" "\0"
00244           "CP28593" "\0" "ISO-8859-3" "\0"
00245           "CP28594" "\0" "ISO-8859-4" "\0"
00246           "CP28595" "\0" "ISO-8859-5" "\0"
00247           "CP28596" "\0" "ISO-8859-6" "\0"
00248           "CP28597" "\0" "ISO-8859-7" "\0"
00249           "CP28598" "\0" "ISO-8859-8" "\0"
00250           "CP28599" "\0" "ISO-8859-9" "\0"
00251           "CP28605" "\0" "ISO-8859-15" "\0";
00252 # endif
00253 #endif
00254 
00255       charset_aliases = cp;
00256     }
00257 
00258   return cp;
00259 }
00260 
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The raw name comes from nl_langinfo (CODESET) where available, from the
   LC_ALL / LC_CTYPE / LANG environment variables otherwise, or from the
   system code page on Win32 and OS/2.  It is then looked up in the alias
   table returned by get_charset_aliases; an alias entry named "*" matches
   any name.

   The result must not be freed.  It points into a static buffer, into the
   alias table, into a string literal, or into storage returned by
   nl_langinfo / getenv.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  /* Usual precedence of the locale environment variables:
     LC_ALL, then LC_CTYPE, then LANG.  */
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  /* "CP" + up to 10 decimal digits + terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* "CP" + up to 10 decimal digits + terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* Only a non-empty encoding may be returned directly; an
                 empty one (e.g. "xx_YY.") must not escape, see the note
                 about empty results at the end of this function.  */
              if (dot[0] != '\0')
                return dot;
            }
          else if (modifier > dot
                   && (size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* No usable encoding part (absent, empty, or too long for buf):
         resolve the whole locale name through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* ULONG is not 'unsigned int'; convert so that the argument
             matches the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of NUL-terminated
     (alias, canonical) pairs ended by an empty string, so each step skips
     two strings.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name immediately follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}