Back to index

glibc  2.9
loop.c
Go to the documentation of this file.
00001 /* Conversion loop frame work.
00002    Copyright (C) 1998-2002, 2003, 2005, 2008 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 /* This file provides a frame for the reader loop in all conversion modules.
00022    The actual code must (of course) be provided in the actual module source
00023    code but certain actions can be written down generically, with some
00024    customization options which are these:
00025 
00026      MIN_NEEDED_INPUT       minimal number of input bytes needed for the next
00027                      conversion.
00028      MIN_NEEDED_OUTPUT      minimal number of bytes produced by the next round
00029                      of conversion.
00030 
00031      MAX_NEEDED_INPUT       you guessed it, this is the maximal number of input
00032                      bytes needed.  It defaults to MIN_NEEDED_INPUT.
00033      MAX_NEEDED_OUTPUT      likewise for output bytes.
00034 
00035      LOOPFCT         name of the function created.  If not specified
00036                      the name is `loop' but this prevents the use
00037                      of multiple functions in the same file.
00038 
00039      BODY            this is supposed to expand to the body of the loop.
00040                      The user must provide this.
00041 
00042      EXTRA_LOOP_DECLS       extra arguments passed from conversion loop call.
00043 
00044      INIT_PARAMS     code to define and initialize variables from params.
00045      UPDATE_PARAMS   code to store result in params.
00046 
00047      ONEBYTE_BODY    body of the specialized conversion function for a
00048                      single byte from the current character set to INTERNAL.
00049 */
00050 
00051 #include <assert.h>
00052 #include <endian.h>
00053 #include <gconv.h>
00054 #include <stdint.h>
00055 #include <string.h>
00056 #include <wchar.h>
00057 #include <sys/param.h>             /* For MIN.  */
00058 #define __need_size_t
00059 #include <stddef.h>
00060 
00061 
00062 /* We have to provide support for machines which are not able to handle
00063    unaligned memory accesses.  Some of the character encodings have
00064    representations with a fixed width of 2 or 4 bytes.  But if we cannot
00065    access unaligned memory we still have to read byte-wise.

         This section defines the accessors get16/get32/put16/put32 and the
         name-mangling helper FCTNAME2 in one of two flavours:

         - _STRING_ARCH_unaligned is defined, or DEFINE_UNALIGNED is not
           defined: the accessors dereference ADDR directly through a
           uint16_t/uint32_t pointer and FCTNAME2 leaves the name untouched.
         - otherwise: the accessors read and write the value one byte at a
           time, in the order selected by __BYTE_ORDER, and FCTNAME2 appends
           `_unaligned' to the name.  */
00066 #undef FCTNAME2
00067 #if defined _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
00068 /* We can handle unaligned memory access.  The expansions are plain
         dereferences without outer parentheses.  */
00069 # define get16(addr) *((__const uint16_t *) (addr))
00070 # define get32(addr) *((__const uint32_t *) (addr))
00071 
00072 /* We need no special support for writing values either.  These expand
         to an unparenthesized assignment expression.  */
00073 # define put16(addr, val) *((uint16_t *) (addr)) = (val)
00074 # define put32(addr, val) *((uint32_t *) (addr)) = (val)
00075 
00076 # define FCTNAME2(name) name
00077 #else
00078 /* Distinguish between big endian and little endian.  In both cases the
         byte-wise get macros evaluate ADDR once per byte read and the put
         macros evaluate ADDR once per byte written (VAL is evaluated once,
         into __val), so ADDR must not have side effects.  The put macros
         are GCC statement expressions whose value is (void) 0.
         NOTE(review): in get32 each byte is promoted to int before being
         shifted, so a most significant byte >= 0x80 ends up in the sign
         bit when int is 32 bits wide -- confirm users only depend on the
         resulting bit pattern.  */
00079 # if __BYTE_ORDER == __LITTLE_ENDIAN
      /* Little endian: byte 0 is the least significant one.  */
00080 #  define get16(addr) \
00081      (((__const unsigned char *) (addr))[1] << 8                     \
00082       | ((__const unsigned char *) (addr))[0])
00083 #  define get32(addr) \
00084      (((((__const unsigned char *) (addr))[3] << 8                          \
00085        | ((__const unsigned char *) (addr))[2]) << 8                        \
00086        | ((__const unsigned char *) (addr))[1]) << 8                        \
00087       | ((__const unsigned char *) (addr))[0])
00088 
00089 #  define put16(addr, val) \
00090      ({ uint16_t __val = (val);                                             \
00091        ((unsigned char *) (addr))[0] = __val;                               \
00092        ((unsigned char *) (addr))[1] = __val >> 8;                          \
00093        (void) 0; })
00094 #  define put32(addr, val) \
00095      ({ uint32_t __val = (val);                                             \
00096        ((unsigned char *) (addr))[0] = __val;                               \
00097        __val >>= 8;                                                  \
00098        ((unsigned char *) (addr))[1] = __val;                               \
00099        __val >>= 8;                                                  \
00100        ((unsigned char *) (addr))[2] = __val;                               \
00101        __val >>= 8;                                                  \
00102        ((unsigned char *) (addr))[3] = __val;                               \
00103        (void) 0; })
00104 # else
      /* Big endian: byte 0 is the most significant one.  */
00105 #  define get16(addr) \
00106      (((__const unsigned char *) (addr))[0] << 8                     \
00107       | ((__const unsigned char *) (addr))[1])
00108 #  define get32(addr) \
00109      (((((__const unsigned char *) (addr))[0] << 8                          \
00110        | ((__const unsigned char *) (addr))[1]) << 8                        \
00111        | ((__const unsigned char *) (addr))[2]) << 8                        \
00112       | ((__const unsigned char *) (addr))[3])
00113 
00114 #  define put16(addr, val) \
00115      ({ uint16_t __val = (val);                                             \
00116        ((unsigned char *) (addr))[1] = __val;                               \
00117        ((unsigned char *) (addr))[0] = __val >> 8;                          \
00118        (void) 0; })
00119 #  define put32(addr, val) \
00120      ({ uint32_t __val = (val);                                             \
00121        ((unsigned char *) (addr))[3] = __val;                               \
00122        __val >>= 8;                                                  \
00123        ((unsigned char *) (addr))[2] = __val;                               \
00124        __val >>= 8;                                                  \
00125        ((unsigned char *) (addr))[1] = __val;                               \
00126        __val >>= 8;                                                  \
00127        ((unsigned char *) (addr))[0] = __val;                               \
00128        (void) 0; })
00129 # endif
00130 
00131 # define FCTNAME2(name) name##_unaligned
00132 #endif
      /* The extra level of indirection makes sure NAME is macro-expanded
         (e.g. LOOPFCT) before FCTNAME2 possibly pastes a suffix onto it.  */
00133 #define FCTNAME(name) FCTNAME2(name)
00134 
00135 
00136 /* We need at least one byte for the next round.  MIN_NEEDED_INPUT has no
         default: the including file must define it, and it must be >= 1.  */
00137 #ifndef MIN_NEEDED_INPUT
00138 # error "MIN_NEEDED_INPUT definition missing"
00139 #elif MIN_NEEDED_INPUT < 1
00140 # error "MIN_NEEDED_INPUT must be >= 1"
00141 #endif
00142 
00143 /* Let's see how many bytes we consume at most.  Defaults to the
         minimum, i.e. a fixed-width input encoding.  */
00144 #ifndef MAX_NEEDED_INPUT
00145 # define MAX_NEEDED_INPUT   MIN_NEEDED_INPUT
00146 #endif
00147 
00148 /* We produce at least one byte in the next round.  MIN_NEEDED_OUTPUT has
         no default either and must be >= 1.  */
00149 #ifndef MIN_NEEDED_OUTPUT
00150 # error "MIN_NEEDED_OUTPUT definition missing"
00151 #elif MIN_NEEDED_OUTPUT < 1
00152 # error "MIN_NEEDED_OUTPUT must be >= 1"
00153 #endif
00154 
00155 /* Let's see how many bytes we produce at most.  Defaults to the
         minimum, i.e. a fixed-width output encoding.  */
00156 #ifndef MAX_NEEDED_OUTPUT
00157 # define MAX_NEEDED_OUTPUT  MIN_NEEDED_OUTPUT
00158 #endif
00159 
00160 /* Default name for the function.  */
00161 #ifndef LOOPFCT
00162 # define LOOPFCT            loop
00163 #endif
00164 
00165 /* Make sure we have a loop body.  */
00166 #ifndef BODY
00167 # error "Definition of BODY missing for function" LOOPFCT
00168 #endif
00169 
00170 
00171 /* If no arguments have to be passed to the loop function define the macro
00172    as empty.  It is appended directly after the last fixed parameter in
         the function definitions below, so a non-empty definition has to
         start with a comma.  */
00173 #ifndef EXTRA_LOOP_DECLS
00174 # define EXTRA_LOOP_DECLS
00175 #endif
00176 
00177 /* Allow using UPDATE_PARAMS in macros where #ifdef UPDATE_PARAMS test
00178    isn't possible.  The same is done for REINIT_PARAMS; both default to
         an empty statement.  */
00179 #ifndef UPDATE_PARAMS
00180 # define UPDATE_PARAMS do { } while (0)
00181 #endif
00182 #ifndef REINIT_PARAMS
00183 # define REINIT_PARAMS do { } while (0)
00184 #endif
00185 
00186 
00187 /* To make it easier for the writers of the modules, we define a macro
00188    to test whether we have to ignore errors.  It is true when IRREVERSIBLE
         is not NULL and the __GCONV_IGNORE_ERRORS bit is set in FLAGS.  Both
         names are taken from the scope where the macro is used; in the loop
         function below FLAGS is only declared if LOOP_NEED_FLAGS is
         defined.  */
00189 #define ignore_errors_p() \
00190   (irreversible != NULL && (flags & __GCONV_IGNORE_ERRORS))
00191 
00192 
00193 /* Error handling for the FROM_LOOP direction, with ignoring of errors.
00194    Note that we cannot use the do while (0) trick since `break' and
00195    `continue' must reach certain points.

         The macro sets RESULT to __GCONV_ILLEGAL_INPUT.  Unless errors are
         to be ignored (see ignore_errors_p) it leaves the enclosing loop
         with `break'.  Otherwise it skips INCR input bytes, increments
         *IRREVERSIBLE and restarts the enclosing loop with `continue',
         still leaving RESULT at __GCONV_ILLEGAL_INPUT.  */
00196 #define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
00197   {                                                                  \
00198     result = __GCONV_ILLEGAL_INPUT;                                         \
00199                                                                      \
00200     if (! ignore_errors_p ())                                               \
00201       break;                                                         \
00202                                                                      \
00203     /* We ignore the invalid input byte sequence.  */                       \
00204     inptr += (Incr);                                                 \
00205     ++*irreversible;                                                 \
00206     /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
00207        that "iconv -c" must give the same exitcode as "iconv".  */          \
00208     continue;                                                        \
00209   }
00210 
00211 /* Error handling for the TO_LOOP direction, with use of transliteration/
00212    transcription functions and ignoring of errors.  Note that we cannot use
00213    the do while (0) trick since `break' and `continue' must reach certain
00214    points.

         Sequence of events:
         1. RESULT is set to __GCONV_ILLEGAL_INPUT; if IRREVERSIBLE is NULL
            the enclosing loop is left immediately.
         2. UPDATE_PARAMS is run, then every entry of the list starting at
            step_data->__trans is called through DL_CALL_FCT until one
            returns something other than __GCONV_ILLEGAL_INPUT.  The calls
            get the addresses of INPTR and OUTPTR, so they can advance
            both.  REINIT_PARAMS is run afterwards.
         3. If a function handled the input the loop is restarted with
            `continue', or left with `break' if it reported
            __GCONV_FULL_OUTPUT.
         4. Otherwise, unless errors are ignored (see ignore_errors_p), the
            loop is left with `break'; if they are ignored, *IRREVERSIBLE
            is incremented, INCR input bytes are skipped and the loop is
            restarted.

         Unlike in STANDARD_FROM_LOOP_ERR_HANDLER the INCR argument is used
         without parentheses here.  */
00215 #define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
00216   {                                                                  \
00217     struct __gconv_trans_data *trans;                                       \
00218                                                                      \
00219     result = __GCONV_ILLEGAL_INPUT;                                         \
00220                                                                      \
00221     if (irreversible == NULL)                                               \
00222       /* This means we are in call from __gconv_transliterate.  In this            \
00223         case we are not doing any error recovery ourself.  */               \
00224       break;                                                         \
00225                                                                      \
00226     /* If needed, flush any conversion state, so that __gconv_transliterate   \
00227        starts with current shift state.  */                                 \
00228     UPDATE_PARAMS;                                                   \
00229                                                                      \
00230     /* First try the transliteration methods.  */                           \
00231     for (trans = step_data->__trans; trans != NULL; trans = trans->__next)    \
00232       {                                                                     \
00233        result = DL_CALL_FCT (trans->__trans_fct,                     \
00234                            (step, step_data, trans->__data, *inptrp,        \
00235                             &inptr, inend, &outptr, irreversible));         \
00236        if (result != __GCONV_ILLEGAL_INPUT)                                 \
00237          break;                                                      \
00238       }                                                                     \
00239                                                                      \
00240     REINIT_PARAMS;                                                   \
00241                                                                      \
00242     /* If any of them recognized the input continue with the loop.  */             \
00243     if (result != __GCONV_ILLEGAL_INPUT)                             \
00244       {                                                                     \
00245        if (__builtin_expect (result == __GCONV_FULL_OUTPUT, 0))             \
00246          break;                                                      \
00247                                                                      \
00248        continue;                                                     \
00249       }                                                                     \
00250                                                                      \
00251     /* Next see whether we have to ignore the error.  If not, stop.  */            \
00252     if (! ignore_errors_p ())                                               \
00253       break;                                                         \
00254                                                                      \
00255     /* When we come here it means we ignore the character.  */              \
00256     ++*irreversible;                                                 \
00257     inptr += Incr;                                                   \
00258     /* But we keep result == __GCONV_ILLEGAL_INPUT, because of the constraint \
00259        that "iconv -c" must give the same exitcode as "iconv".  */          \
00260     continue;                                                        \
00261   }
00262 
00263 
00264 /* Handling of Unicode 3.1 TAG characters.  Unicode recommends
00265    "If language codes are not relevant to the particular processing
00266     operation, then they should be ignored."  This macro is usually
00267    called right before  STANDARD_TO_LOOP_ERR_HANDLER (Incr).

         If CHARACTER lies in U+E0000..U+E007F (all bits above the low seven
         equal those of 0xe0000) the macro skips INCR input bytes and
         restarts the enclosing loop with `continue' without producing any
         output and without touching RESULT or *IRREVERSIBLE.  For any
         other value it does nothing.  */
00268 #define UNICODE_TAG_HANDLER(Character, Incr) \
00269   {                                                                  \
00270     /* TAG characters are those in the range U+E0000..U+E007F.  */          \
00271     if (((Character) >> 7) == (0xe0000 >> 7))                               \
00272       {                                                                     \
00273        inptr += Incr;                                                       \
00274        continue;                                                     \
00275       }                                                                     \
00276   }
00277 
00278 
00279 /* The function returns the status, as defined in gconv.h.

         This is the generic conversion loop.  The name is LOOPFCT, with
         `_unaligned' appended when the byte-wise accessors are in effect
         (see FCTNAME).

         STEP, STEP_DATA  conversion step and its data; STEP_DATA supplies
                          the state and flags, STEP the module data, for
                          the optional locals below.
         INPTRP           in: start of the input; out: first byte not
                          consumed.
         INEND            end of the input.
         OUTPTRP          in: start of the free output space; out: first
                          byte not written.
         OUTEND           end of the output buffer.
         IRREVERSIBLE     not used by this function itself; the error
                          handler macros defined in this file test it
                          against NULL and increment the counter it points
                          to.
         EXTRA_LOOP_DECLS additional parameters supplied by the module.

         The result is __GCONV_EMPTY_INPUT if the loop ran until INPTR
         reached INEND, __GCONV_INCOMPLETE_INPUT if fewer than
         MIN_NEEDED_INPUT bytes were left, __GCONV_FULL_OUTPUT if fewer than
         MIN_NEEDED_OUTPUT bytes of output space were left, or whatever
         BODY stored in RESULT before leaving the loop.

         The local names INPTR, OUTPTR, RESULT (and STATE, FLAGS, DATA when
         requested) are part of the interface: the handler macros in this
         file refer to them, and presumably so does BODY.  */
00280 static inline int
00281 __attribute ((always_inline))
00282 FCTNAME (LOOPFCT) (struct __gconv_step *step,
00283                  struct __gconv_step_data *step_data,
00284                  const unsigned char **inptrp, const unsigned char *inend,
00285                  unsigned char **outptrp, const unsigned char *outend,
00286                  size_t *irreversible EXTRA_LOOP_DECLS)
00287 {
        /* The following locals are only declared when the module asks for
           them through the corresponding LOOP_NEED_* macro.  */
00288 #ifdef LOOP_NEED_STATE
00289   mbstate_t *state = step_data->__statep;
00290 #endif
00291 #ifdef LOOP_NEED_FLAGS
00292   int flags = step_data->__flags;
00293 #endif
00294 #ifdef LOOP_NEED_DATA
00295   void *data = step->__data;
00296 #endif
        /* This value is returned if the loop terminates because all input
           has been consumed.  */
00297   int result = __GCONV_EMPTY_INPUT;
00298   const unsigned char *inptr = *inptrp;
00299   unsigned char *outptr = *outptrp;
00300 
00301 #ifdef INIT_PARAMS
00302   INIT_PARAMS;
00303 #endif
00304 
00305   while (inptr != inend)
00306     {
00307       /* The separate `if' cases for MIN_NEEDED_INPUT and
00308         MIN_NEEDED_OUTPUT == 1 and != 1 help the compiler generate better
00309         code.  They will be optimized away since both are constants.  */
00310       if (MIN_NEEDED_INPUT > 1
00311          && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0))
00312        {
00313          /* We don't have enough input for another complete input
00314             character.  */
00315          result = __GCONV_INCOMPLETE_INPUT;
00316          break;
00317        }
00318       if ((MIN_NEEDED_OUTPUT != 1
00319           && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0))
00320          || (MIN_NEEDED_OUTPUT == 1
00321              && __builtin_expect (outptr >= outend, 0)))
00322        {
00323          /* Overflow in the output buffer.  */
00324          result = __GCONV_FULL_OUTPUT;
00325          break;
00326        }
00327 
00328       /* Here comes the body the user provides.  It can stop with
00329         RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the
00330         input characters vary in size), GCONV_ILLEGAL_INPUT, or
00331         GCONV_FULL_OUTPUT (if the output characters vary in size).  */
00332       BODY
00333     }
00334 
00335   /* Update the pointers pointed to by the parameters.  This happens on
           every exit path, including the error ones, so the caller can see
           how far the conversion got.  */
00336   *inptrp = inptr;
00337   *outptrp = outptr;
00338   UPDATE_PARAMS;
00339 
00340   return result;
00341 }
00342 
00343 
00344 /* Include the file a second time to define the function to handle
00345    unaligned access.

         This is only done on the first pass (DEFINE_UNALIGNED not defined),
         when the architecture is not known to allow unaligned accesses
         (_STRING_ARCH_unaligned not defined), and when neither the input
         nor the output unit is a single byte while the maximum sizes are
         multiples of the minimum sizes.  With DEFINE_UNALIGNED defined the
         nested pass selects the byte-wise accessors and FCTNAME appends
         `_unaligned' to the loop function's name.

         Note that the nested pass runs through the whole file, including
         the #undef list at its end, so once it returns the configuration
         macros (MIN_NEEDED_INPUT, BODY, LOOPFCT, ...) and the accessors are
         no longer defined for the remainder of the outer pass.  */
00346 #if !defined DEFINE_UNALIGNED && !defined _STRING_ARCH_unaligned \
00347     && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \
00348     && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0
00349 # undef get16
00350 # undef get32
00351 # undef put16
00352 # undef put32
00353 # undef unaligned
00354 
00355 # define DEFINE_UNALIGNED
00356 # include "loop.c"
00357 # undef DEFINE_UNALIGNED
00358 #endif
00359 
00360 
00361 #if MAX_NEEDED_INPUT > 1
00362 # define SINGLE(fct) SINGLE2 (fct)
00363 # define SINGLE2(fct) fct##_single
      /* Convert the one character whose first bytes were saved in the
         conversion state by an earlier call and whose remaining bytes are
         at the beginning of the new input.  The function is named
         LOOPFCT_single and only exists if MAX_NEEDED_INPUT > 1.

         The parameters have the same meaning as for the loop function
         above.  Unlike there, STATE is always declared.

         1. The saved bytes are copied to BYTEBUF, either by UNPACK_BYTES
            (which must also set INLEN) or from state->__value.__wchb using
            the low three bits of state->__count as the length.
         2. If saved and new bytes together are fewer than MIN_NEEDED_INPUT
            all new input is consumed and remembered (through STORE_REST if
            defined, otherwise by appending to state->__value.__wchb) and
            __GCONV_INCOMPLETE_INPUT is returned.
         3. If there is no room for MIN_NEEDED_OUTPUT bytes of output,
            __GCONV_FULL_OUTPUT is returned without consuming anything.
         4. BYTEBUF is filled with up to MAX_NEEDED_INPUT bytes and BODY is
            run exactly once on it.
         5. If BODY consumed something, *INPTRP is advanced by the number of
            consumed bytes that came from the new input, *OUTPTRP is
            updated, the saved bytes are dropped from the state and
            __GCONV_OK is returned regardless of what BODY stored in
            RESULT.  If BODY reported __GCONV_INCOMPLETE_INPUT instead, all
            bytes in BYTEBUF are saved again.  Any other RESULT is returned
            unchanged.  */
00364 static inline int
00365 __attribute ((always_inline))
00366 SINGLE(LOOPFCT) (struct __gconv_step *step,
00367                struct __gconv_step_data *step_data,
00368                const unsigned char **inptrp, const unsigned char *inend,
00369                unsigned char **outptrp, unsigned char *outend,
00370                size_t *irreversible EXTRA_LOOP_DECLS)
00371 {
00372   mbstate_t *state = step_data->__statep;
00373 #ifdef LOOP_NEED_FLAGS
00374   int flags = step_data->__flags;
00375 #endif
00376 #ifdef LOOP_NEED_DATA
00377   void *data = step->__data;
00378 #endif
00379   int result = __GCONV_OK;
00380   unsigned char bytebuf[MAX_NEEDED_INPUT];
00381   const unsigned char *inptr = *inptrp;
00382   unsigned char *outptr = *outptrp;
00383   size_t inlen;
00384 
00385 #ifdef INIT_PARAMS
00386   INIT_PARAMS;
00387 #endif
00388 
00389 #ifdef UNPACK_BYTES
00390   UNPACK_BYTES
00391 #else
00392   /* Add the bytes from the state to the input buffer.  */
00393   for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
00394     bytebuf[inlen] = state->__value.__wchb[inlen];
00395 #endif
00396 
00397   /* Are there enough bytes in the input buffer?  */
00398   if (__builtin_expect (inptr + (MIN_NEEDED_INPUT - inlen) > inend, 0))
00399     {
            /* Everything the caller passed in is reported as consumed, so
               every one of those bytes has to be remembered below.  */
00400       *inptrp = inend;
00401 #ifdef STORE_REST
            /* Append the new input to the bytes unpacked from the state so
               that STORE_REST sees the complete partial character.  Without
               this the new bytes would be reported as consumed but never
               stored.  The test above guarantees that fewer than
               MIN_NEEDED_INPUT bytes are collected in total (as long as
               INLEN did not already exceed MIN_NEEDED_INPUT), so BYTEBUF
               cannot overflow.  */
            while (inptr < inend)
              bytebuf[inlen++] = *inptr++;

00402       inptr = bytebuf;
00403       inptrp = &inptr;
00404       inend = &bytebuf[inlen];
00405 
00406       STORE_REST
00407 #else
00408       /* We don't have enough input for another complete input
00409         character.
              NOTE(review): the bytes are appended to the state buffer but
              the length kept in the low bits of state->__count is not
              updated here, and there is no bounds check against the size
              of __wchb -- confirm whether the caller takes care of both.  */
00410       while (inptr < inend)
00411        state->__value.__wchb[inlen++] = *inptr++;
00412 #endif
00413 
00414       return __GCONV_INCOMPLETE_INPUT;
00415     }
00416 
00417   /* Enough space in output buffer.  */
00418   if ((MIN_NEEDED_OUTPUT != 1 && outptr + MIN_NEEDED_OUTPUT > outend)
00419       || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
00420     /* Overflow in the output buffer.  */
00421     return __GCONV_FULL_OUTPUT;
00422 
00423   /*  Now add characters from the normal input buffer.  At least one byte
            is always taken; the loop stops once BYTEBUF is full or the
            input is exhausted.  */
00424   do
00425     bytebuf[inlen++] = *inptr++;
00426   while (inlen < MAX_NEEDED_INPUT && inptr < inend);
00427 
        /* From here on BODY works on the temporary buffer only.  */
00428   inptr = bytebuf;
00429   inend = &bytebuf[inlen];
00430 
        /* The do ... while (0) wrapper gives the `break' and `continue'
           statements used by BODY and the error handler macros a loop to
           act on; both leave it after this single round.  */
00431   do
00432     {
00433       BODY
00434     }
00435   while (0);
00436 
00437   /* Now we either have produced an output character and consumed all the
00438      bytes from the state and at least one more, or the character is still
00439      incomplete, or we have some other error (like illegal input character,
00440      no space in output buffer).  */
00441   if (__builtin_expect (inptr != bytebuf, 1))
00442     {
00443       /* We found a new character.  */
00444       assert (inptr - bytebuf > (state->__count & 7));
00445 
            /* Only the bytes beyond those taken from the state come from
               the caller's input buffer.  */
00446       *inptrp += inptr - bytebuf - (state->__count & 7);
00447       *outptrp = outptr;
00448 
00449       result = __GCONV_OK;
00450 
00451       /* Clear the state buffer.  */
00452 #ifdef CLEAR_STATE
00453       CLEAR_STATE;
00454 #else
00455       state->__count &= ~7;
00456 #endif
00457     }
00458   else if (result == __GCONV_INCOMPLETE_INPUT)
00459     {
00460       /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
00461         available.  */
00462       assert (inend != &bytebuf[MAX_NEEDED_INPUT]);
00463 
            /* All bytes copied from the caller's buffer are consumed.  */
00464       *inptrp += inend - bytebuf - (state->__count & 7);
00465 #ifdef STORE_REST
00466       inptrp = &inptr;
00467 
00468       STORE_REST
00469 #else
00470       /* We don't have enough input for another complete input
00471         character.
              NOTE(review): the first assertion compares against the bits of
              __count above the low three (& ~7), not against the old byte
              count (& 7) -- confirm this is intended.  The limit of 7 in
              the second one matches the three bits available for the
              length; whether __wchb can hold that many bytes is not
              checked here.  */
00472       assert (inend - inptr > (state->__count & ~7));
00473       assert (inend - inptr <= 7);
00474       state->__count = (state->__count & ~7) | (inend - inptr);
00475       inlen = 0;
00476       while (inptr < inend)
00477        state->__value.__wchb[inlen++] = *inptr++;
00478 #endif
00479     }
00480 
00481   return result;
00482 }
00483 # undef SINGLE
00484 # undef SINGLE2
00485 #endif
00486 
00487 
00488 #ifdef ONEBYTE_BODY
00489 /* Define the shortcut function for btowc.  The function is only
         generated if the module defines ONEBYTE_BODY, which has to expand
         to the complete function body including the braces.  The body gets
         the conversion step in STEP and the byte to convert in C and
         returns a wint_t.  FROM_ONEBYTE is defined to the function's name
         afterwards; it is not undefined again in this file.  */
00490 static wint_t
00491 gconv_btowc (struct __gconv_step *step, unsigned char c)
00492   ONEBYTE_BODY
00493 # define FROM_ONEBYTE gconv_btowc
00494 #endif
00495 
00496 
00497 /* We remove the macro definitions so that we can include this file again
00498    for the definition of another function.  Undefining a macro which is
         not defined is harmless, which is why optional macros and the second
         occurrence of LOOPFCT need no guard.
         NOTE(review): STORE_REST is tested above but not undefined here --
         presumably the including file is responsible for it; `unaligned' is
         undefined although nothing in this file defines it.  */
00499 #undef MIN_NEEDED_INPUT
00500 #undef MAX_NEEDED_INPUT
00501 #undef MIN_NEEDED_OUTPUT
00502 #undef MAX_NEEDED_OUTPUT
00503 #undef LOOPFCT
00504 #undef BODY
00505 #undef LOOPFCT
00506 #undef EXTRA_LOOP_DECLS
00507 #undef INIT_PARAMS
00508 #undef UPDATE_PARAMS
00509 #undef REINIT_PARAMS
00510 #undef ONEBYTE_BODY
00511 #undef UNPACK_BYTES
00512 #undef CLEAR_STATE
00513 #undef LOOP_NEED_STATE
00514 #undef LOOP_NEED_FLAGS
00515 #undef LOOP_NEED_DATA
00516 #undef get16
00517 #undef get32
00518 #undef put16
00519 #undef put32
00520 #undef unaligned