Back to index

nagios-plugins  1.4.16
mbrtowc.c
Go to the documentation of this file.
00001 /* Convert multibyte character to wide character.
00002    Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
00003    Written by Bruno Haible <bruno@clisp.org>, 2008.
00004 
00005    This program is free software: you can redistribute it and/or modify
00006    it under the terms of the GNU General Public License as published by
00007    the Free Software Foundation; either version 3 of the License, or
00008    (at your option) any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013    GNU General Public License for more details.
00014 
00015    You should have received a copy of the GNU General Public License
00016    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
00017 
00018 #include <config.h>
00019 
00020 /* Specification.  */
00021 #include <wchar.h>
00022 
00023 #if GNULIB_defined_mbstate_t
00024 /* Implement mbrtowc() on top of mbtowc().  */
00025 
00026 # include <errno.h>
00027 # include <stdlib.h>
00028 
00029 # include "localcharset.h"
00030 # include "streq.h"
00031 # include "verify.h"
00032 
00033 
/* mbrtowc() below accesses the conversion state as an array of char and
   uses indices 0..3 of it, so mbstate_t must be at least 4 bytes.  */
00034 verify (sizeof (mbstate_t) >= 4);
00035 
/* Conversion state used by mbrtowc() when the caller passes ps == NULL.  */
00036 static char internal_state[4];
00037 
/* Replacement mbrtowc() implemented on top of mbtowc().

   The conversion state is accessed as an array of char:
     - byte 0 holds the number of bytes (0..3) saved from earlier calls,
     - bytes 1..3 hold those saved bytes.
   If PS is NULL, the file-static internal_state is used instead.

   If S is NULL, the call is handled as mbrtowc (NULL, "", 1, PS).

   Return value:
     - (size_t)(-2) if N is 0, or if the bytes seen so far are judged to be
       an incomplete multibyte character; in the latter case the bytes
       taken from S are appended to the state.
     - (size_t)(-1) with errno = EINVAL if the state's count byte is not
       in 0..3.
     - (size_t)(-1) with errno = EILSEQ if the bytes are judged invalid.
     - otherwise the result of mbtowc() minus the number of previously
       saved bytes; the state's count byte is reset to 0.

   Calls abort () if mbtowc()'s result is inconsistent with *PWC or with
   the number of previously saved bytes.  */
00038 size_t
00039 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
00040 {
00041   char *pstate = (char *)ps;
00042 
00043   if (pstate == NULL)
00044     pstate = internal_state;
00045 
00046   if (s == NULL)
00047     {
00048       pwc = NULL;
00049       s = "";
00050       n = 1;
00051     }
00052 
00053   if (n == 0)
00054     return (size_t)(-2);
00055 
00056   /* Here n > 0.  */
00057   {
        /* nstate: number of bytes saved in the state by earlier calls.
           buf:    scratch area where saved bytes and new bytes are joined.
           p, m:   the byte sequence (and its length) handed to mbtowc.  */
00058     size_t nstate = pstate[0];
00059     char buf[4];
00060     const char *p;
00061     size_t m;
00062 
00063     switch (nstate)
00064       {
00065       case 0:
            /* Nothing saved: convert directly from the caller's buffer.  */
00066         p = s;
00067         m = n;
00068         break;
            /* Cases 3, 2, 1 fall through so that exactly nstate saved bytes
               are copied from the state into buf.  */
00069       case 3:
00070         buf[2] = pstate[3];
00071         /*FALLTHROUGH*/
00072       case 2:
00073         buf[1] = pstate[2];
00074         /*FALLTHROUGH*/
00075       case 1:
00076         buf[0] = pstate[1];
00077         p = buf;
00078         m = nstate;
            /* Append bytes from s after the saved ones: always s[0], then
               s[1] and s[2] only while n allows it and buf (4 bytes) is not
               yet full.  */
00079         buf[m++] = s[0];
00080         if (n >= 2 && m < 4)
00081           {
00082             buf[m++] = s[1];
00083             if (n >= 3 && m < 4)
00084               buf[m++] = s[2];
00085           }
00086         break;
00087       default:
            /* The count byte is outside 0..3, which this function never
               stores itself.  */
00088         errno = EINVAL;
00089         return (size_t)(-1);
00090       }
00091 
00092     /* Here m > 0.  */
00093 
00094 # if __GLIBC__
00095     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
00096     mbtowc (NULL, NULL, 0);
00097 # endif
00098     {
00099       int res = mbtowc (pwc, p, m);
00100 
00101       if (res >= 0)
00102         {
              /* Sanity check: *pwc is the null wide character exactly when
                 mbtowc returned 0.  */
00103           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
00104             abort ();
              /* Sanity check: the converted character must extend beyond
                 the previously saved bytes.  */
00105           if (nstate >= (res > 0 ? res : 1))
00106             abort ();
              /* Do not count the previously saved bytes in the result, and
                 mark the state as holding no saved bytes.  */
00107           res -= nstate;
00108           pstate[0] = 0;
00109           return res;
00110         }
00111 
00112       /* mbtowc does not distinguish between invalid and incomplete multibyte
00113          sequences.  But mbrtowc needs to make this distinction.
00114          There are two possible approaches:
00115            - Use iconv() and its return value.
00116            - Use built-in knowledge about the possible encodings.
00117          Given the low quality of implementation of iconv() on the systems that
00118          lack mbrtowc(), we use the second approach.
00119          The possible encodings are:
00120            - 8-bit encodings,
00121            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
00122            - UTF-8.
00123          Use specialized code for each.  */
          /* A rejected sequence that already has 4 bytes, or at least
             MB_CUR_MAX bytes, is treated as invalid rather than incomplete.  */
00124       if (m >= 4 || m >= MB_CUR_MAX)
00125         goto invalid;
00126       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
00127       {
00128         const char *encoding = locale_charset ();
00129 
            /* Each branch below jumps to 'incomplete' when the m bytes match
               the byte ranges it accepts as a prefix, and to 'invalid'
               otherwise.  */
00130         if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
00131           {
00132             /* Cf. unistr/u8-mblen.c.  */
00133             unsigned char c = (unsigned char) p[0];
00134 
00135             if (c >= 0xc2)
00136               {
00137                 if (c < 0xe0)
00138                   {
00139                     if (m == 1)
00140                       goto incomplete;
00141                   }
00142                 else if (c < 0xf0)
00143                   {
00144                     if (m == 1)
00145                       goto incomplete;
00146                     if (m == 2)
00147                       {
00148                         unsigned char c2 = (unsigned char) p[1];
00149 
00150                         if ((c2 ^ 0x80) < 0x40
00151                             && (c >= 0xe1 || c2 >= 0xa0)
00152                             && (c != 0xed || c2 < 0xa0))
00153                           goto incomplete;
00154                       }
00155                   }
00156                 else if (c <= 0xf4)
00157                   {
00158                     if (m == 1)
00159                       goto incomplete;
00160                     else /* m == 2 || m == 3 */
00161                       {
00162                         unsigned char c2 = (unsigned char) p[1];
00163 
00164                         if ((c2 ^ 0x80) < 0x40
00165                             && (c >= 0xf1 || c2 >= 0x90)
00166                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
00167                           {
00168                             if (m == 2)
00169                               goto incomplete;
00170                             else /* m == 3 */
00171                               {
00172                                 unsigned char c3 = (unsigned char) p[2];
00173 
00174                                 if ((c3 ^ 0x80) < 0x40)
00175                                   goto incomplete;
00176                               }
00177                           }
00178                       }
00179                   }
00180               }
00181             goto invalid;
00182           }
00183 
00184         /* As a reference for this code, you can use the GNU libiconv
00185            implementation.  Look for uses of the RET_TOOFEW macro.  */
00186 
00187         if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
00188           {
00189             if (m == 1)
00190               {
00191                 unsigned char c = (unsigned char) p[0];
00192 
00193                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
00194                   goto incomplete;
00195               }
00196             if (m == 2)
00197               {
00198                 unsigned char c = (unsigned char) p[0];
00199 
00200                 if (c == 0x8f)
00201                   {
00202                     unsigned char c2 = (unsigned char) p[1];
00203 
00204                     if (c2 >= 0xa1 && c2 < 0xff)
00205                       goto incomplete;
00206                   }
00207               }
00208             goto invalid;
00209           }
00210         if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
00211             || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
00212             || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
00213           {
00214             if (m == 1)
00215               {
00216                 unsigned char c = (unsigned char) p[0];
00217 
00218                 if (c >= 0xa1 && c < 0xff)
00219                   goto incomplete;
00220               }
00221             goto invalid;
00222           }
00223         if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
00224           {
00225             if (m == 1)
00226               {
00227                 unsigned char c = (unsigned char) p[0];
00228 
00229                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
00230                   goto incomplete;
00231               }
00232             else /* m == 2 || m == 3 */
00233               {
00234                 unsigned char c = (unsigned char) p[0];
00235 
00236                 if (c == 0x8e)
00237                   goto incomplete;
00238               }
00239             goto invalid;
00240           }
00241         if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
00242           {
00243             if (m == 1)
00244               {
00245                 unsigned char c = (unsigned char) p[0];
00246 
00247                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
00248                   goto incomplete;
00249               }
00250             else /* m == 2 || m == 3 */
00251               {
00252                 unsigned char c = (unsigned char) p[0];
00253 
00254                 if (c >= 0x90 && c <= 0xe3)
00255                   {
00256                     unsigned char c2 = (unsigned char) p[1];
00257 
00258                     if (c2 >= 0x30 && c2 <= 0x39)
00259                       {
00260                         if (m == 2)
00261                           goto incomplete;
00262                         else /* m == 3 */
00263                           {
00264                             unsigned char c3 = (unsigned char) p[2];
00265 
00266                             if (c3 >= 0x81 && c3 <= 0xfe)
00267                               goto incomplete;
00268                           }
00269                       }
00270                   }
00271               }
00272             goto invalid;
00273           }
00274         if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
00275           {
00276             if (m == 1)
00277               {
00278                 unsigned char c = (unsigned char) p[0];
00279 
00280                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
00281                     || (c >= 0xf0 && c <= 0xf9))
00282                   goto incomplete;
00283               }
00284             goto invalid;
00285           }
00286 
00287         /* An unknown multibyte encoding.  */
00288         goto incomplete;
00289       }
00290 
00291      incomplete:
          /* Append the bytes taken from s to the saved bytes in the state
             (positions nstate+1 .. m), then record the new count m.  */
00292       {
00293         size_t k = nstate;
00294         /* Here 0 <= k < m < 4.  */
00295         pstate[++k] = s[0];
00296         if (k < m)
00297           {
00298             pstate[++k] = s[1];
00299             if (k < m)
00300               pstate[++k] = s[2];
00301           }
            /* Sanity check: exactly m - nstate bytes must have been added.  */
00302         if (k != m)
00303           abort ();
00304       }
00305       pstate[0] = m;
00306       return (size_t)(-2);
00307 
00308      invalid:
00309       errno = EILSEQ;
00310       /* The conversion state is undefined, says POSIX.  */
00311       return (size_t)(-1);
00312     }
00313   }
00314 }
00315 
00316 #else
00317 /* Override the system's mbrtowc() function.  */
00318 
00319 # undef mbrtowc
00320 
/* Wrapper around mbrtowc() that compensates for the defects selected by
   the MBRTOWC_*_BUG macros; with none of them set it simply forwards its
   arguments to mbrtowc().

     - MBRTOWC_NULL_ARG_BUG or MBRTOWC_RETVAL_BUG: a NULL S is rewritten
       to the call (NULL, "", 1, PS) before anything else happens.
     - MBRTOWC_RETVAL_BUG: a NULL PS is replaced by a function-local static
       state.  If the state is not initial, the input is passed to
       mbrtowc() one byte at a time and the number of bytes passed is
       returned once a character is completed (0 if that character is the
       null wide character).
     - MBRTOWC_NUL_RETVAL_BUG: the return value is forced to 0 when the
       converted character is the null wide character.

   Returns (size_t)(-1) or (size_t)(-2) as reported by mbrtowc(), and
   otherwise the byte count described above.  */
00321 size_t
00322 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
00323 {
00324 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
00325   if (s == NULL)
00326     {
00327       pwc = NULL;
00328       s = "";
00329       n = 1;
00330     }
00331 # endif
00332 
00333 # if MBRTOWC_RETVAL_BUG
00334   {
00335     static mbstate_t internal_state;
00336 
00337     /* Override mbrtowc's internal state.  We can not call mbsinit() on the
00338        hidden internal state, but we can call it on our variable.  */
00339     if (ps == NULL)
00340       ps = &internal_state;
00341 
00342     if (!mbsinit (ps))
00343       {
00344         /* Parse the rest of the multibyte character byte for byte.  */
            /* count: number of bytes of s passed to mbrtowc so far.  */
00345         size_t count = 0;
00346         for (; n > 0; s++, n--)
00347           {
00348             wchar_t wc;
00349             size_t ret = mbrtowc (&wc, s, 1, ps);
00350 
00351             if (ret == (size_t)(-1))
00352               return (size_t)(-1);
00353             count++;
00354             if (ret != (size_t)(-2))
00355               {
00356                 /* The multibyte character has been completed.  */
00357                 if (pwc != NULL)
00358                   *pwc = wc;
00359                 return (wc == 0 ? 0 : count);
00360               }
00361           }
            /* All n bytes were passed on without completing a character.  */
00362         return (size_t)(-2);
00363       }
00364   }
00365 # endif
00366 
00367 # if MBRTOWC_NUL_RETVAL_BUG
00368   {
        /* Convert into a local so the result can be inspected even when
           pwc is NULL.  */
00369     wchar_t wc;
00370     size_t ret = mbrtowc (&wc, s, n, ps);
00371 
00372     if (ret != (size_t)(-1) && ret != (size_t)(-2))
00373       {
00374         if (pwc != NULL)
00375           *pwc = wc;
00376         if (wc == 0)
00377           ret = 0;
00378       }
00379     return ret;
00380   }
00381 # else
00382   return mbrtowc (pwc, s, n, ps);
00383 # endif
00384 }
00385 
00386 #endif