Back to index

nagios-plugins  1.4.16
localcharset.c
Go to the documentation of this file.
00001 /* Determine a canonical name for the current locale's character encoding.
00002 
00003    Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
00004 
00005    This program is free software; you can redistribute it and/or modify
00006    it under the terms of the GNU General Public License as published by
00007    the Free Software Foundation; either version 3, or (at your option)
00008    any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013    GNU General Public License for more details.
00014 
00015    You should have received a copy of the GNU General Public License along
00016    with this program; if not, write to the Free Software Foundation,
00017    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
00018 
00019 /* Written by Bruno Haible <bruno@clisp.org>.  */
00020 
00021 #include <config.h>
00022 
00023 /* Specification.  */
00024 #include "localcharset.h"
00025 
00026 #include <fcntl.h>
00027 #include <stddef.h>
00028 #include <stdio.h>
00029 #include <string.h>
00030 #include <stdlib.h>
00031 
00032 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
00033 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
00034 #endif
00035 
00036 #if defined _WIN32 || defined __WIN32__
00037 # define WIN32_NATIVE
00038 #endif
00039 
00040 #if defined __EMX__
00041 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
00042 # ifndef OS2
00043 #  define OS2
00044 # endif
00045 #endif
00046 
00047 #if !defined WIN32_NATIVE
00048 # include <unistd.h>
00049 # if HAVE_LANGINFO_CODESET
00050 #  include <langinfo.h>
00051 # else
00052 #  if 0 /* see comment below */
00053 #   include <locale.h>
00054 #  endif
00055 # endif
00056 # ifdef __CYGWIN__
00057 #  define WIN32_LEAN_AND_MEAN
00058 #  include <windows.h>
00059 # endif
00060 #elif defined WIN32_NATIVE
00061 # define WIN32_LEAN_AND_MEAN
00062 # include <windows.h>
00063 #endif
00064 #if defined OS2
00065 # define INCL_DOS
00066 # include <os2.h>
00067 #endif
00068 
00069 #if ENABLE_RELOCATABLE
00070 # include "relocatable.h"
00071 #else
00072 # define relocate(pathname) (pathname)
00073 #endif
00074 
00075 /* Get LIBDIR.  */
00076 #ifndef LIBDIR
00077 # include "configmake.h"
00078 #endif
00079 
00080 /* Define O_NOFOLLOW to 0 on platforms where it does not exist.  */
00081 #ifndef O_NOFOLLOW
00082 # define O_NOFOLLOW 0
00083 #endif
00084 
00085 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
00086   /* Win32, Cygwin, OS/2, DOS */
00087 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
00088 #endif
00089 
00090 #ifndef DIRECTORY_SEPARATOR
00091 # define DIRECTORY_SEPARATOR '/'
00092 #endif
00093 
00094 #ifndef ISSLASH
00095 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
00096 #endif
00097 
00098 #if HAVE_DECL_GETC_UNLOCKED
00099 # undef getc
00100 # define getc getc_unlocked
00101 #endif
00102 
00103 /* The following static variable is declared 'volatile' to avoid a
00104    possible multithread problem in the function get_charset_aliases. If we
00105    are running in a threaded environment, and if two threads initialize
00106    'charset_aliases' simultaneously, both will produce the same value,
00107    and everything will be ok if the two assignments to 'charset_aliases'
00108    are atomic. But I don't know what will happen if the two assignments mix.  */
00109 #if __STDC__ != 1
00110 # define volatile /* empty */
00111 #endif
00112 /* Pointer to the contents of the charset.alias file, if it has already been
00113    read, else NULL.  Its format is:
00114    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
00115 static const char * volatile charset_aliases;
00116 
00117 /* Return a pointer to the contents of the charset.alias file.  */
00118 static const char *
00119 get_charset_aliases (void)
00120 {
00121   const char *cp;
00122 
00123   cp = charset_aliases;
00124   if (cp == NULL)
00125     {
00126 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
00127       const char *dir;
00128       const char *base = "charset.alias";
00129       char *file_name;
00130 
00131       /* Make it possible to override the charset.alias location.  This is
00132          necessary for running the testsuite before "make install".  */
00133       dir = getenv ("CHARSETALIASDIR");
00134       if (dir == NULL || dir[0] == '\0')
00135         dir = relocate (LIBDIR);
00136 
00137       /* Concatenate dir and base into freshly allocated file_name.  */
00138       {
00139         size_t dir_len = strlen (dir);
00140         size_t base_len = strlen (base);
00141         int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
00142         file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
00143         if (file_name != NULL)
00144           {
00145             memcpy (file_name, dir, dir_len);
00146             if (add_slash)
00147               file_name[dir_len] = DIRECTORY_SEPARATOR;
00148             memcpy (file_name + dir_len + add_slash, base, base_len + 1);
00149           }
00150       }
00151 
00152       if (file_name == NULL)
00153         /* Out of memory.  Treat the file as empty.  */
00154         cp = "";
00155       else
00156         {
00157           int fd;
00158 
00159           /* Open the file.  Reject symbolic links on platforms that support
00160              O_NOFOLLOW.  This is a security feature.  Without it, an attacker
00161              could retrieve parts of the contents (namely, the tail of the
00162              first line that starts with "* ") of an arbitrary file by placing
00163              a symbolic link to that file under the name "charset.alias" in
00164              some writable directory and defining the environment variable
00165              CHARSETALIASDIR to point to that directory.  */
00166           fd = open (file_name,
00167                      O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
00168           if (fd < 0)
00169             /* File not found.  Treat it as empty.  */
00170             cp = "";
00171           else
00172             {
00173               FILE *fp;
00174 
00175               fp = fdopen (fd, "r");
00176               if (fp == NULL)
00177                 {
00178                   /* Out of memory.  Treat the file as empty.  */
00179                   close (fd);
00180                   cp = "";
00181                 }
00182               else
00183                 {
00184                   /* Parse the file's contents.  */
00185                   char *res_ptr = NULL;
00186                   size_t res_size = 0;
00187 
00188                   for (;;)
00189                     {
00190                       int c;
00191                       char buf1[50+1];
00192                       char buf2[50+1];
00193                       size_t l1, l2;
00194                       char *old_res_ptr;
00195 
00196                       c = getc (fp);
00197                       if (c == EOF)
00198                         break;
00199                       if (c == '\n' || c == ' ' || c == '\t')
00200                         continue;
00201                       if (c == '#')
00202                         {
00203                           /* Skip comment, to end of line.  */
00204                           do
00205                             c = getc (fp);
00206                           while (!(c == EOF || c == '\n'));
00207                           if (c == EOF)
00208                             break;
00209                           continue;
00210                         }
00211                       ungetc (c, fp);
00212                       if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
00213                         break;
00214                       l1 = strlen (buf1);
00215                       l2 = strlen (buf2);
00216                       old_res_ptr = res_ptr;
00217                       if (res_size == 0)
00218                         {
00219                           res_size = l1 + 1 + l2 + 1;
00220                           res_ptr = (char *) malloc (res_size + 1);
00221                         }
00222                       else
00223                         {
00224                           res_size += l1 + 1 + l2 + 1;
00225                           res_ptr = (char *) realloc (res_ptr, res_size + 1);
00226                         }
00227                       if (res_ptr == NULL)
00228                         {
00229                           /* Out of memory. */
00230                           res_size = 0;
00231                           if (old_res_ptr != NULL)
00232                             free (old_res_ptr);
00233                           break;
00234                         }
00235                       strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
00236                       strcpy (res_ptr + res_size - (l2 + 1), buf2);
00237                     }
00238                   fclose (fp);
00239                   if (res_size == 0)
00240                     cp = "";
00241                   else
00242                     {
00243                       *(res_ptr + res_size) = '\0';
00244                       cp = res_ptr;
00245                     }
00246                 }
00247             }
00248 
00249           free (file_name);
00250         }
00251 
00252 #else
00253 
00254 # if defined DARWIN7
00255       /* To avoid the trouble of installing a file that is shared by many
00256          GNU packages -- many packaging systems have problems with this --,
00257          simply inline the aliases here.  */
00258       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
00259            "ISO8859-2" "\0" "ISO-8859-2" "\0"
00260            "ISO8859-4" "\0" "ISO-8859-4" "\0"
00261            "ISO8859-5" "\0" "ISO-8859-5" "\0"
00262            "ISO8859-7" "\0" "ISO-8859-7" "\0"
00263            "ISO8859-9" "\0" "ISO-8859-9" "\0"
00264            "ISO8859-13" "\0" "ISO-8859-13" "\0"
00265            "ISO8859-15" "\0" "ISO-8859-15" "\0"
00266            "KOI8-R" "\0" "KOI8-R" "\0"
00267            "KOI8-U" "\0" "KOI8-U" "\0"
00268            "CP866" "\0" "CP866" "\0"
00269            "CP949" "\0" "CP949" "\0"
00270            "CP1131" "\0" "CP1131" "\0"
00271            "CP1251" "\0" "CP1251" "\0"
00272            "eucCN" "\0" "GB2312" "\0"
00273            "GB2312" "\0" "GB2312" "\0"
00274            "eucJP" "\0" "EUC-JP" "\0"
00275            "eucKR" "\0" "EUC-KR" "\0"
00276            "Big5" "\0" "BIG5" "\0"
00277            "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
00278            "GBK" "\0" "GBK" "\0"
00279            "GB18030" "\0" "GB18030" "\0"
00280            "SJIS" "\0" "SHIFT_JIS" "\0"
00281            "ARMSCII-8" "\0" "ARMSCII-8" "\0"
00282            "PT154" "\0" "PT154" "\0"
00283          /*"ISCII-DEV" "\0" "?" "\0"*/
00284            "*" "\0" "UTF-8" "\0";
00285 # endif
00286 
00287 # if defined VMS
00288       /* To avoid the troubles of an extra file charset.alias_vms in the
00289          sources of many GNU packages, simply inline the aliases here.  */
00290       /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
00291          "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
00292          section 10.7 "Handling Different Character Sets".  */
00293       cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
00294            "ISO8859-2" "\0" "ISO-8859-2" "\0"
00295            "ISO8859-5" "\0" "ISO-8859-5" "\0"
00296            "ISO8859-7" "\0" "ISO-8859-7" "\0"
00297            "ISO8859-8" "\0" "ISO-8859-8" "\0"
00298            "ISO8859-9" "\0" "ISO-8859-9" "\0"
00299            /* Japanese */
00300            "eucJP" "\0" "EUC-JP" "\0"
00301            "SJIS" "\0" "SHIFT_JIS" "\0"
00302            "DECKANJI" "\0" "DEC-KANJI" "\0"
00303            "SDECKANJI" "\0" "EUC-JP" "\0"
00304            /* Chinese */
00305            "eucTW" "\0" "EUC-TW" "\0"
00306            "DECHANYU" "\0" "DEC-HANYU" "\0"
00307            "DECHANZI" "\0" "GB2312" "\0"
00308            /* Korean */
00309            "DECKOREAN" "\0" "EUC-KR" "\0";
00310 # endif
00311 
00312 # if defined WIN32_NATIVE || defined __CYGWIN__
00313       /* To avoid the troubles of installing a separate file in the same
00314          directory as the DLL and of retrieving the DLL's directory at
00315          runtime, simply inline the aliases here.  */
00316 
00317       cp = "CP936" "\0" "GBK" "\0"
00318            "CP1361" "\0" "JOHAB" "\0"
00319            "CP20127" "\0" "ASCII" "\0"
00320            "CP20866" "\0" "KOI8-R" "\0"
00321            "CP20936" "\0" "GB2312" "\0"
00322            "CP21866" "\0" "KOI8-RU" "\0"
00323            "CP28591" "\0" "ISO-8859-1" "\0"
00324            "CP28592" "\0" "ISO-8859-2" "\0"
00325            "CP28593" "\0" "ISO-8859-3" "\0"
00326            "CP28594" "\0" "ISO-8859-4" "\0"
00327            "CP28595" "\0" "ISO-8859-5" "\0"
00328            "CP28596" "\0" "ISO-8859-6" "\0"
00329            "CP28597" "\0" "ISO-8859-7" "\0"
00330            "CP28598" "\0" "ISO-8859-8" "\0"
00331            "CP28599" "\0" "ISO-8859-9" "\0"
00332            "CP28605" "\0" "ISO-8859-15" "\0"
00333            "CP38598" "\0" "ISO-8859-8" "\0"
00334            "CP51932" "\0" "EUC-JP" "\0"
00335            "CP51936" "\0" "GB2312" "\0"
00336            "CP51949" "\0" "EUC-KR" "\0"
00337            "CP51950" "\0" "EUC-TW" "\0"
00338            "CP54936" "\0" "GB18030" "\0"
00339            "CP65001" "\0" "UTF-8" "\0";
00340 # endif
00341 #endif
00342 
00343       charset_aliases = cp;
00344     }
00345 
00346   return cp;
00347 }
00348 
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   How the raw encoding name is obtained depends on the platform:
     - with nl_langinfo (CODESET) where available (on Cygwin, a result of
       "US-ASCII" is replaced by the encoding named in LC_ALL, LC_CTYPE or
       LANG, or else by "CP<n>" built from GetACP ());
     - from the value of LC_ALL, LC_CTYPE or LANG on systems without
       nl_langinfo (CODESET);
     - as "CP<n>" built from GetACP () on native Windows;
     - from LC_ALL, LC_CTYPE or LANG, or else as "CP<n>" built from
       DosQueryCp (), on OS/2.
   The name is then mapped through the alias table returned by
   get_charset_aliases ().  On Cygwin and OS/2, an encoding given explicitly
   after the '.' of the locale name is returned as is, without alias
   resolution.

   The result must not be freed.  It points into a static buffer, into the
   alias table, into the environment, or to the value returned by
   nl_langinfo, or it is a string literal.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative;
                 copy only if the encoding plus its NUL fits into buf.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number:
         GetACP().  This encoding is used by Cygwin, unless the user has set
         the environment variable CYGWIN=codepage:oem (which very few people
         do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number:
     GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative;
             copy only if the encoding plus its NUL fits into buf.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion and the output stays within
             the digits that buf was sized for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of NUL-terminated
     (alias, canonical) pairs ended by an empty string, so each step skips
     two strings.  An alias of "*" matches any codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}