Back to index

nux  3.0.0
NUni.cpp
Go to the documentation of this file.
00001 /*
00002  * Copyright 2010 Inalogic® Inc.
00003  *
00004  * This program is free software: you can redistribute it and/or modify it
00005  * under the terms of the GNU Lesser General Public License, as
00006  * published by the  Free Software Foundation; either version 2.1 or 3.0
00007  * of the License.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranties of
00011  * MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
00012  * PURPOSE.  See the applicable version of the GNU Lesser General Public
00013  * License for more details.
00014  *
00015  * You should have received a copy of both the GNU Lesser General Public
00016  * License along with this program. If not, see <http://www.gnu.org/licenses/>
00017  *
00018  * Authored by: Jay Taoko <jaytaoko@inalogic.com>
00019  *
00020  */
00021 
00022 
00023 /*
00024  * Copyright 2001-2004 Unicode, Inc.
00025  *
00026  * Disclaimer
00027  *
00028  * This source code is provided as is by Unicode, Inc. No claims are
00029  * made as to fitness for any particular purpose. No warranties of any
00030  * kind are expressed or implied. The recipient agrees to determine
00031  * applicability of information provided. If this file has been
00032  * purchased on magnetic or optical media from Unicode, Inc., the
00033  * sole remedy for any claim will be exchange of defective media
00034  * within 90 days of receipt.
00035  *
00036  * Limitations on Rights to Redistribute This Code
00037  *
00038  * Unicode, Inc. hereby grants the right to freely use the information
00039  * supplied in this file in the creation of products supporting the
00040  * Unicode Standard, and to make copies of this file in any form
00041  * for internal or external distribution as long as this notice
00042  * remains attached.
00043  */
00044 
00045 /* ---------------------------------------------------------------------
00046 
00047     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
00048     Author: Mark E. Davis, 1994.
00049     Rev History: Rick McGowan, fixes & updates May 2001.
00050     Sept 2001: fixed const & error conditions per
00051         mods suggested by S. Parent & A. Lillich.
00052     June 2002: Tim Dodd added detection and handling of incomplete
00053         source sequences, enhanced error detection, added casts
00054         to eliminate compiler warnings.
00055     July 2003: slight mods to back out aggressive FFFE detection.
00056     Jan 2004: updated switches in from-UTF8 conversions.
00057     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
00058     May 2006: updated isLegalUTF8Sequence.
00059 
00060     See the header file "ConvertUTF.h" for complete documentation.
00061 
00062 ------------------------------------------------------------------------ */
00063 
00064 #include "NuxCore.h"
00065 //#include "NUni.h"
00066 
00067 namespace nux
00068 {
00069 
/*
 * Boundaries of the UTF-16 surrogate ranges: a high (lead) surrogate lies in
 * [UNI_SUR_HIGH_START, UNI_SUR_HIGH_END] and a low (trail) surrogate lies in
 * [UNI_SUR_LOW_START, UNI_SUR_LOW_END].
 */
#define UNI_SUR_HIGH_START  (unsigned int)0xD800
#define UNI_SUR_HIGH_END    (unsigned int)0xDBFF
#define UNI_SUR_LOW_START   (unsigned int)0xDC00
#define UNI_SUR_LOW_END     (unsigned int)0xDFFF

/* Each surrogate of a pair carries this many payload bits. */
static const int halfShift = 10;

/* Value subtracted from (or added back to) a code point when it is split into (or rebuilt from) a surrogate pair. */
static const unsigned int halfBase = 0x10000U;

/* Selects the low halfShift bits of a value, i.e. the low surrogate's payload. */
static const unsigned int halfMask = (1U << halfShift) - 1U;
00079 
00080 
00081   ConversionResult ConvertUTF32toUTF16 (const unsigned int **sourceStart, const unsigned int *sourceEnd, wchar_t **targetStart, wchar_t *targetEnd, ConversionFlags flags)
00082   {
00083     ConversionResult result = conversionOK;
00084     const unsigned int *source = *sourceStart;
00085     wchar_t *target = *targetStart;
00086 
00087     while (source < sourceEnd)
00088     {
00089       unsigned int ch;
00090 
00091       if (target >= targetEnd)
00092       {
00093         result = targetExhausted;
00094         break;
00095       }
00096 
00097       ch = *source++;
00098 
00099       if (ch <= UNI_MAX_BMP)   /* Target is a character <= 0xFFFF */
00100       {
00101         /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
00102         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00103         {
00104           if (flags == strictConversion)
00105           {
00106             --source; /* return to the illegal value itself */
00107             result = sourceIllegal;
00108             break;
00109           }
00110           else
00111           {
00112             *target++ = UNI_REPLACEMENT_CHAR;
00113           }
00114         }
00115         else
00116         {
00117           *target++ = (wchar_t) ch; /* normal case */
00118         }
00119       }
00120       else if (ch > UNI_MAX_LEGAL_UTF32)
00121       {
00122         if (flags == strictConversion)
00123         {
00124           result = sourceIllegal;
00125         }
00126         else
00127         {
00128           *target++ = UNI_REPLACEMENT_CHAR;
00129         }
00130       }
00131       else
00132       {
00133         /* target is a character in range 0xFFFF - 0x10FFFF. */
00134         if (target + 1 >= targetEnd)
00135         {
00136           --source; /* Back up source pointer! */
00137           result = targetExhausted;
00138           break;
00139         }
00140 
00141         ch -= halfBase;
00142         *target++ = (wchar_t) ( (ch >> halfShift) + UNI_SUR_HIGH_START);
00143         *target++ = (wchar_t) ( (ch & halfMask) + UNI_SUR_LOW_START);
00144       }
00145     }
00146 
00147     *sourceStart = source;
00148     *targetStart = target;
00149     return result;
00150   }
00151 
00152   /* --------------------------------------------------------------------- */
00153 
00154   ConversionResult ConvertUTF16toUTF32 (const wchar_t **sourceStart, const wchar_t *sourceEnd, unsigned int **targetStart, unsigned int *targetEnd, ConversionFlags flags)
00155   {
00156     ConversionResult result = conversionOK;
00157     const wchar_t *source = *sourceStart;
00158     unsigned int *target = *targetStart;
00159     unsigned int ch, ch2;
00160 
00161     while (source < sourceEnd)
00162     {
00163       const wchar_t *oldSource = source; /*  In case we have to back up because of target overflow. */
00164       ch = *source++;
00165 
00166       /* If we have a surrogate pair, convert to UTF32 first. */
00167       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
00168       {
00169         /* If the 16 bits following the high surrogate are in the source buffer... */
00170         if (source < sourceEnd)
00171         {
00172           ch2 = *source;
00173 
00174           /* If it's a low surrogate, convert to UTF32. */
00175           if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
00176           {
00177             ch = ( (ch - UNI_SUR_HIGH_START) << halfShift)
00178                  + (ch2 - UNI_SUR_LOW_START) + halfBase;
00179             ++source;
00180           }
00181           else if (flags == strictConversion)     /* it's an unpaired high surrogate */
00182           {
00183             --source; /* return to the illegal value itself */
00184             result = sourceIllegal;
00185             break;
00186           }
00187         }
00188         else     /* We don't have the 16 bits following the high surrogate. */
00189         {
00190           --source; /* return to the high surrogate */
00191           result = sourceExhausted;
00192           break;
00193         }
00194       }
00195       else if (flags == strictConversion)
00196       {
00197         /* UTF-16 surrogate values are illegal in UTF-32 */
00198         if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
00199         {
00200           --source; /* return to the illegal value itself */
00201           result = sourceIllegal;
00202           break;
00203         }
00204       }
00205 
00206       if (target >= targetEnd)
00207       {
00208         source = oldSource; /* Back up source pointer! */
00209         result = targetExhausted;
00210         break;
00211       }
00212 
00213       *target++ = ch;
00214     }
00215 
00216     *sourceStart = source;
00217     *targetStart = target;
00218 #ifdef CVTUTF_DEBUG
00219 
00220     if (result == sourceIllegal)
00221     {
00222       fprintf (stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
00223       fflush (stderr);
00224     }
00225 
00226 #endif
00227     return result;
00228   }
00229 
00230   /* --------------------------------------------------------------------- */
00231 
00232   /*
00233   * Index into the table below with the first byte of a UTF-8 sequence to
00234   * get the number of trailing bytes that are supposed to follow it.
00235   * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
00236   * left as-is for anyone who may want to do such conversion, which was
00237   * allowed in earlier algorithms.
00238   */
/* One entry per possible byte value, laid out sixteen to a row; the row's first byte value is noted on the right. */
static const char trailingBytesForUTF8[256] =
{
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0 */
  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5  /* 0xF0 */
};
00250 
00251   /*
00252   * Magic values subtracted from a buffer value during UTF8 conversion.
00253   * This table contains as many values as there might be trailing bytes
00254   * in a UTF-8 sequence.
00255   */
/* Indexed by the number of trailing bytes (0..5) of the sequence that was just accumulated. */
static const unsigned int offsetsFromUTF8[6] =
{
  0x00000000U, /* 0 trailing bytes */
  0x00003080U, /* 1 trailing byte  */
  0x000E2080U, /* 2 trailing bytes */
  0x03C82080U, /* 3 trailing bytes */
  0xFA082080U, /* 4 trailing bytes */
  0x82082080U  /* 5 trailing bytes */
};
00259 
00260   /*
00261   * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
00262   * into the first byte, depending on how many bytes follow.  There are
00263   * as many entries in this table as there are UTF-8 sequence types.
00264   * (I.e., one byte sequence, two byte... etc.). Remember that sequences
00265   * for *legal* UTF-8 will be 4 or fewer bytes total.
00266   */
/* Indexed by the total number of bytes in the sequence being written. */
static const unsigned char firstByteMark[7] =
{
  0x00, /* 0 bytes */
  0x00, /* 1 byte  */
  0xC0, /* 2 bytes */
  0xE0, /* 3 bytes */
  0xF0, /* 4 bytes */
  0xF8, /* 5 bytes */
  0xFC  /* 6 bytes */
};
00268 
00269   /* --------------------------------------------------------------------- */
00270 
00271   /* The interface converts a whole buffer to avoid function-call overhead.
00272   * Constants have been gathered. Loops & conditionals have been removed as
00273   * much as possible for efficiency, in favor of drop-through switches.
00274   * (See "Note A" at the bottom of the file for equivalent code.)
00275   * If your compiler supports it, the "isLegalUTF8" call can be turned
00276   * into an inline function.
00277   */
00278 
00279   /* --------------------------------------------------------------------- */
00280 
00281   ConversionResult ConvertUTF16toUTF8 (const wchar_t **sourceStart, const wchar_t *sourceEnd, unsigned char **targetStart, unsigned char *targetEnd, ConversionFlags flags)
00282   {
00283     ConversionResult result = conversionOK;
00284     const wchar_t *source = *sourceStart;
00285     unsigned char *target = *targetStart;
00286 
00287     while (source < sourceEnd)
00288     {
00289       unsigned int ch;
00290       unsigned short bytesToWrite = 0;
00291       const unsigned int byteMask = 0xBF;
00292       const unsigned int byteMark = 0x80;
00293       const wchar_t *oldSource = source; /* In case we have to back up because of target overflow. */
00294       ch = *source++;
00295 
00296       /* If we have a surrogate pair, convert to UTF32 first. */
00297       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
00298       {
00299         /* If the 16 bits following the high surrogate are in the source buffer... */
00300         if (source < sourceEnd)
00301         {
00302           unsigned int ch2 = *source;
00303 
00304           /* If it's a low surrogate, convert to UTF32. */
00305           if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
00306           {
00307             ch = ( (ch - UNI_SUR_HIGH_START) << halfShift)
00308                  + (ch2 - UNI_SUR_LOW_START) + halfBase;
00309             ++source;
00310           }
00311           else if (flags == strictConversion)     /* it's an unpaired high surrogate */
00312           {
00313             --source; /* return to the illegal value itself */
00314             result = sourceIllegal;
00315             break;
00316           }
00317         }
00318         else     /* We don't have the 16 bits following the high surrogate. */
00319         {
00320           --source; /* return to the high surrogate */
00321           result = sourceExhausted;
00322           break;
00323         }
00324       }
00325       else if (flags == strictConversion)
00326       {
00327         /* UTF-16 surrogate values are illegal in UTF-32 */
00328         if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
00329         {
00330           --source; /* return to the illegal value itself */
00331           result = sourceIllegal;
00332           break;
00333         }
00334       }
00335 
00336       /* Figure out how many bytes the result will require */
00337       if (ch < (unsigned int) 0x80)
00338       {
00339         bytesToWrite = 1;
00340       }
00341       else if (ch < (unsigned int) 0x800)
00342       {
00343         bytesToWrite = 2;
00344       }
00345       else if (ch < (unsigned int) 0x10000)
00346       {
00347         bytesToWrite = 3;
00348       }
00349       else if (ch < (unsigned int) 0x110000)
00350       {
00351         bytesToWrite = 4;
00352       }
00353       else
00354       {
00355         bytesToWrite = 3;
00356         ch = UNI_REPLACEMENT_CHAR;
00357       }
00358 
00359       target += bytesToWrite;
00360 
00361       if (target > targetEnd)
00362       {
00363         source = oldSource; /* Back up source pointer! */
00364         target -= bytesToWrite;
00365         result = targetExhausted;
00366         break;
00367       }
00368 
00369       switch (bytesToWrite)   /* note: everything falls through. */
00370       {
00371         case 4:
00372           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00373           ch >>= 6;
00374         case 3:
00375           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00376           ch >>= 6;
00377         case 2:
00378           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00379           ch >>= 6;
00380         case 1:
00381           *--target =  (unsigned char) (ch | firstByteMark[bytesToWrite]);
00382       }
00383 
00384       target += bytesToWrite;
00385     }
00386 
00387     *sourceStart = source;
00388     *targetStart = target;
00389     return result;
00390   }
00391 
00392   /* --------------------------------------------------------------------- */
00393 
00394   /*
00395   * Utility routine to tell whether a sequence of bytes is legal UTF-8.
00396   * This must be called with the length pre-determined by the first byte.
00397   * If not calling this from ConvertUTF8to*, then the length can be set by:
00398   *  length = trailingBytesForUTF8[*source]+1;
00399   * and the sequence is illegal right away if there aren't that many bytes
00400   * available.
00401   * If presented with a length > 4, this returns false.  The Unicode
00402   * definition of UTF-8 goes up to 4-byte sequences.
00403   */
00404 
/*
 * Returns true when the `length` bytes starting at `source` form one legal
 * UTF-8 sequence, false otherwise (including any length outside 1..4).
 *
 * The check is driven by `length` and by the lead byte independently:
 *   - bytes 3 and 4, when present, must lie in 0x80..0xBF;
 *   - byte 2, when present, must lie in 0x80..0xBF, narrowed for the lead
 *     bytes 0xE0, 0xED, 0xF0 and 0xF4;
 *   - the lead byte must not lie in 0x80..0xC1 and must not exceed 0xF4.
 * Only source[0] .. source[length-1] are read.
 */
static bool isLegalUTF8 (const unsigned char *source, int length)
{
  if (length < 1 || length > 4)
    return false;

  const unsigned char lead = source[0];

  /* Third and fourth bytes: plain continuation bytes. */
  for (int i = length - 1; i >= 2; --i)
  {
    if (source[i] < 0x80 || source[i] > 0xBF)
      return false;
  }

  if (length >= 2)
  {
    /* Second byte: a continuation byte whose range depends on the lead byte. */
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;

    switch (lead)
    {
      case 0xE0:
        lowest = 0xA0;
        break;
      case 0xED:
        highest = 0x9F;
        break;
      case 0xF0:
        lowest = 0x90;
        break;
      case 0xF4:
        /* The lower bound applies here as well: 0xF4 followed by a byte below 0x80 is not a legal sequence. */
        highest = 0x8F;
        break;
      default:
        break;
    }

    if (source[1] < lowest || source[1] > highest)
      return false;
  }

  /* 0x80..0xBF are continuation bytes and 0xC0/0xC1 are never legal lead bytes. */
  if (lead >= 0x80 && lead < 0xC2)
    return false;

  if (lead > 0xF4)
    return false;

  return true;
}
00464 
00465   /* --------------------------------------------------------------------- */
00466 
00467   /*
00468   * Exported function to return whether a UTF-8 sequence is legal or not.
00469   * This is not used here; it's just exported.
00470   */
00471 
00472   bool isLegalUTF8Sequence (const unsigned char *source, const unsigned char *sourceEnd)
00473   {
00474     int length;
00475 
00476     if (source == sourceEnd)
00477     {
00478       return true;
00479     }
00480 
00481     while (true)
00482     {
00483       length = trailingBytesForUTF8[*source] + 1;
00484 
00485       if (source + length > sourceEnd)
00486       {
00487         return false;
00488       }
00489 
00490       if (!isLegalUTF8 (source, length) )
00491       {
00492         return false;
00493       }
00494 
00495       source += length;
00496 
00497       if (source >= sourceEnd)
00498       {
00499         return true;
00500       }
00501     }
00502   }
00503 
00508   bool
00509   tr_utf8_validate ( const char *str, int max_len, const char **end )
00510   {
00511     const unsigned char *source = (const unsigned char *) str;
00512     const unsigned char *sourceEnd;
00513 
00514     if ( max_len == 0 )
00515       return true;
00516 
00517     if ( str == NULL )
00518       return false;
00519 
00520     sourceEnd = source + ( (max_len < 0) ? strlen (str) : (size_t) max_len);
00521 
00522     if ( source == sourceEnd )
00523     {
00524       if ( end != NULL )
00525         *end = (const char *) source;
00526 
00527       return true;
00528     }
00529 
00530     for ( ;; )
00531     {
00532       const int length = trailingBytesForUTF8[*source] + 1;
00533 
00534       if (source + length > sourceEnd)
00535       {
00536         if ( end != NULL )
00537           *end = (const char *) source;
00538 
00539         return false;
00540       }
00541 
00542       if (!isLegalUTF8 (source, length) )
00543       {
00544         if ( end != NULL )
00545           *end = (const char *) source;
00546 
00547         return false;
00548       }
00549 
00550       source += length;
00551 
00552       if (source >= sourceEnd)
00553       {
00554         if ( end != NULL )
00555           *end = (const char *) source;
00556 
00557         return true;
00558       }
00559     }
00560 
00561 
00562   }
00563 
00564 
00565   /* --------------------------------------------------------------------- */
00566 
00567   ConversionResult ConvertUTF8toUTF16 (const unsigned char **sourceStart, const unsigned char *sourceEnd, wchar_t **targetStart, wchar_t *targetEnd, ConversionFlags flags)
00568   {
00569     ConversionResult result = conversionOK;
00570     const unsigned char *source = *sourceStart;
00571     wchar_t *target = *targetStart;
00572 
00573     while (source < sourceEnd)
00574     {
00575       unsigned int ch = 0;
00576       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00577 
00578       if (source + extraBytesToRead >= sourceEnd)
00579       {
00580         result = sourceExhausted;
00581         break;
00582       }
00583 
00584       /* Do this check whether lenient or strict */
00585       if (! isLegalUTF8 (source, extraBytesToRead + 1) )
00586       {
00587         result = sourceIllegal;
00588         break;
00589       }
00590 
00591       /*
00592        * The cases all fall through. See "Note A" below.
00593        */
00594       switch (extraBytesToRead)
00595       {
00596         case 5:
00597           ch += *source++;
00598           ch <<= 6; /* remember, illegal UTF-8 */
00599         case 4:
00600           ch += *source++;
00601           ch <<= 6; /* remember, illegal UTF-8 */
00602         case 3:
00603           ch += *source++;
00604           ch <<= 6;
00605         case 2:
00606           ch += *source++;
00607           ch <<= 6;
00608         case 1:
00609           ch += *source++;
00610           ch <<= 6;
00611         case 0:
00612           ch += *source++;
00613       }
00614 
00615       ch -= offsetsFromUTF8[extraBytesToRead];
00616 
00617       if (target >= targetEnd)
00618       {
00619         source -= (extraBytesToRead + 1); /* Back up source pointer! */
00620         result = targetExhausted;
00621         break;
00622       }
00623 
00624       if (ch <= UNI_MAX_BMP)   /* Target is a character <= 0xFFFF */
00625       {
00626         /* UTF-16 surrogate values are illegal in UTF-32 */
00627         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00628         {
00629           if (flags == strictConversion)
00630           {
00631             source -= (extraBytesToRead + 1); /* return to the illegal value itself */
00632             result = sourceIllegal;
00633             break;
00634           }
00635           else
00636           {
00637             *target++ = UNI_REPLACEMENT_CHAR;
00638           }
00639         }
00640         else
00641         {
00642           *target++ = (wchar_t) ch; /* normal case */
00643         }
00644       }
00645       else if (ch > UNI_MAX_UTF16)
00646       {
00647         if (flags == strictConversion)
00648         {
00649           result = sourceIllegal;
00650           source -= (extraBytesToRead + 1); /* return to the start */
00651           break; /* Bail out; shouldn't continue */
00652         }
00653         else
00654         {
00655           *target++ = UNI_REPLACEMENT_CHAR;
00656         }
00657       }
00658       else
00659       {
00660         /* target is a character in range 0xFFFF - 0x10FFFF. */
00661         if (target + 1 >= targetEnd)
00662         {
00663           source -= (extraBytesToRead + 1); /* Back up source pointer! */
00664           result = targetExhausted;
00665           break;
00666         }
00667 
00668         ch -= halfBase;
00669         *target++ = (wchar_t) ( (ch >> halfShift) + UNI_SUR_HIGH_START);
00670         *target++ = (wchar_t) ( (ch & halfMask) + UNI_SUR_LOW_START);
00671       }
00672     }
00673 
00674     *sourceStart = source;
00675     *targetStart = target;
00676     return result;
00677   }
00678 
00679   /* --------------------------------------------------------------------- */
00680 
00681   ConversionResult ConvertUTF32toUTF8 (
00682     const unsigned int **sourceStart, const unsigned int *sourceEnd,
00683     unsigned char **targetStart, unsigned char *targetEnd, ConversionFlags flags)
00684   {
00685     ConversionResult result = conversionOK;
00686     const unsigned int *source = *sourceStart;
00687     unsigned char *target = *targetStart;
00688 
00689     while (source < sourceEnd)
00690     {
00691       unsigned int ch;
00692       unsigned short bytesToWrite = 0;
00693       const unsigned int byteMask = 0xBF;
00694       const unsigned int byteMark = 0x80;
00695       ch = *source++;
00696 
00697       if (flags == strictConversion )
00698       {
00699         /* UTF-16 surrogate values are illegal in UTF-32 */
00700         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00701         {
00702           --source; /* return to the illegal value itself */
00703           result = sourceIllegal;
00704           break;
00705         }
00706       }
00707 
00708       /*
00709        * Figure out how many bytes the result will require. Turn any
00710        * illegally large UTF32 things (> Plane 17) into replacement chars.
00711        */
00712       if (ch < (unsigned int) 0x80)
00713       {
00714         bytesToWrite = 1;
00715       }
00716       else if (ch < (unsigned int) 0x800)
00717       {
00718         bytesToWrite = 2;
00719       }
00720       else if (ch < (unsigned int) 0x10000)
00721       {
00722         bytesToWrite = 3;
00723       }
00724       else if (ch <= UNI_MAX_LEGAL_UTF32)
00725       {
00726         bytesToWrite = 4;
00727       }
00728       else
00729       {
00730         bytesToWrite = 3;
00731         ch = UNI_REPLACEMENT_CHAR;
00732         result = sourceIllegal;
00733       }
00734 
00735       target += bytesToWrite;
00736 
00737       if (target > targetEnd)
00738       {
00739         --source; /* Back up source pointer! */
00740         target -= bytesToWrite;
00741         result = targetExhausted;
00742         break;
00743       }
00744 
00745       switch (bytesToWrite)   /* note: everything falls through. */
00746       {
00747         case 4:
00748           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00749           ch >>= 6;
00750         case 3:
00751           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00752           ch >>= 6;
00753         case 2:
00754           *--target = (unsigned char) ( (ch | byteMark) & byteMask);
00755           ch >>= 6;
00756         case 1:
00757           *--target = (unsigned char) (ch | firstByteMark[bytesToWrite]);
00758       }
00759 
00760       target += bytesToWrite;
00761     }
00762 
00763     *sourceStart = source;
00764     *targetStart = target;
00765     return result;
00766   }
00767 
00768   /* --------------------------------------------------------------------- */
00769 
00770   ConversionResult ConvertUTF8toUTF32 (
00771     const unsigned char **sourceStart, const unsigned char *sourceEnd,
00772     unsigned int **targetStart, unsigned int *targetEnd, ConversionFlags flags)
00773   {
00774     ConversionResult result = conversionOK;
00775     const unsigned char *source = *sourceStart;
00776     unsigned int *target = *targetStart;
00777 
00778     while (source < sourceEnd)
00779     {
00780       unsigned int ch = 0;
00781       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00782 
00783       if (source + extraBytesToRead >= sourceEnd)
00784       {
00785         result = sourceExhausted;
00786         break;
00787       }
00788 
00789       /* Do this check whether lenient or strict */
00790       if (! isLegalUTF8 (source, extraBytesToRead + 1) )
00791       {
00792         result = sourceIllegal;
00793         break;
00794       }
00795 
00796       /*
00797        * The cases all fall through. See "Note A" below.
00798        */
00799       switch (extraBytesToRead)
00800       {
00801         case 5:
00802           ch += *source++;
00803           ch <<= 6;
00804         case 4:
00805           ch += *source++;
00806           ch <<= 6;
00807         case 3:
00808           ch += *source++;
00809           ch <<= 6;
00810         case 2:
00811           ch += *source++;
00812           ch <<= 6;
00813         case 1:
00814           ch += *source++;
00815           ch <<= 6;
00816         case 0:
00817           ch += *source++;
00818       }
00819 
00820       ch -= offsetsFromUTF8[extraBytesToRead];
00821 
00822       if (target >= targetEnd)
00823       {
00824         source -= (extraBytesToRead + 1); /* Back up the source pointer! */
00825         result = targetExhausted;
00826         break;
00827       }
00828 
00829       if (ch <= UNI_MAX_LEGAL_UTF32)
00830       {
00831         /*
00832          * UTF-16 surrogate values are illegal in UTF-32, and anything
00833          * over Plane 17 (> 0x10FFFF) is illegal.
00834          */
00835         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00836         {
00837           if (flags == strictConversion)
00838           {
00839             source -= (extraBytesToRead + 1); /* return to the illegal value itself */
00840             result = sourceIllegal;
00841             break;
00842           }
00843           else
00844           {
00845             *target++ = UNI_REPLACEMENT_CHAR;
00846           }
00847         }
00848         else
00849         {
00850           *target++ = ch;
00851         }
00852       }
00853       else     /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
00854       {
00855         result = sourceIllegal;
00856         *target++ = UNI_REPLACEMENT_CHAR;
00857       }
00858     }
00859 
00860     *sourceStart = source;
00861     *targetStart = target;
00862     return result;
00863   }
00864 
00865   /* ---------------------------------------------------------------------
00866 
00867   Note A.
00868   The fall-through switches in UTF-8 reading code save a
00869   temp variable, some decrements & conditionals.  The switches
00870   are equivalent to the following loop:
00871       {
00872           int tmpBytesToRead = extraBytesToRead+1;
00873           do {
00874               ch += *source++;
00875               --tmpBytesToRead;
00876               if (tmpBytesToRead) ch <<= 6;
00877           } while (tmpBytesToRead > 0);
00878       }
00879   In UTF-8 writing code, the switches on "bytesToWrite" are
00880   similarly unrolled loops.
00881 
00882   --------------------------------------------------------------------- */
00883 
00884 }