Back to index

lightning-sunbird  0.9+nobinonly
nsUTF8Utils.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2001
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Peter Annema <jaggernaut@netscape.com> (original author)
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 #ifndef nsUTF8Utils_h_
00040 #define nsUTF8Utils_h_
00041 
00042 #include "nsCharTraits.h"
00043 
00044 class UTF8traits
00045   {
00046     public:
00047       static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
00048       static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
00049       static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
00050       static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
00051       static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
00052       static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
00053       static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
00054   };
00055 
// NS_ALWAYS_INLINE expands to GCC's always_inline attribute when building
// with a GNU-compatible compiler, and to nothing everywhere else.
#if !defined(__GNUC__)
#define NS_ALWAYS_INLINE
#else
#define NS_ALWAYS_INLINE __attribute__((always_inline))
#endif
00061 
00066 class ConvertUTF8toUTF16
00067   {
00068     public:
00069       typedef nsACString::char_type value_type;
00070       typedef nsAString::char_type  buffer_type;
00071 
00072     ConvertUTF8toUTF16( buffer_type* aBuffer )
00073         : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
00074 
00075     size_t Length() const { return mBuffer - mStart; }
00076 
00077     PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
00078       {
00079         if ( mErrorEncountered )
00080           return N;
00081 
00082         // algorithm assumes utf8 units won't
00083         // be spread across fragments
00084         const value_type* p = start;
00085         const value_type* end = start + N;
00086         buffer_type* out = mBuffer;
00087         for ( ; p != end /* && *p */; )
00088           {
00089             char c = *p++;
00090 
00091             if ( UTF8traits::isASCII(c) )
00092               {
00093                 *out++ = buffer_type(c);
00094                 continue;
00095               }
00096 
00097             PRUint32 ucs4;
00098             PRUint32 minUcs4;
00099             PRInt32 state = 0;
00100 
00101             if ( UTF8traits::is2byte(c) )
00102               {
00103                 ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
00104                 state = 1;
00105                 minUcs4 = 0x00000080;
00106               }
00107             else if ( UTF8traits::is3byte(c) )
00108               {
00109                 ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
00110                 state = 2;
00111                 minUcs4 = 0x00000800;
00112               }
00113             else if ( UTF8traits::is4byte(c) )
00114               {
00115                 ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
00116                 state = 3;
00117                 minUcs4 = 0x00010000;
00118               }
00119             else if ( UTF8traits::is5byte(c) )
00120               {
00121                 ucs4 = (PRUint32(c) << 24) & 0x03000000L;
00122                 state = 4;
00123                 minUcs4 = 0x00200000;
00124               }
00125             else if ( UTF8traits::is6byte(c) )
00126               {
00127                 ucs4 = (PRUint32(c) << 30) & 0x40000000L;
00128                 state = 5;
00129                 minUcs4 = 0x04000000;
00130               }
00131             else
00132               {
00133                 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
00134                 mErrorEncountered = PR_TRUE;
00135                 mBuffer = out;
00136                 return N;
00137               }
00138 
00139             while ( state-- )
00140               {
00141                 if (p == end)
00142                   {
00143                     NS_ERROR("Buffer ended in the middle of a multibyte sequence");
00144                     mErrorEncountered = PR_TRUE;
00145                     mBuffer = out;
00146                     return N;
00147                   }
00148 
00149                 c = *p++;
00150 
00151                 if ( UTF8traits::isInSeq(c) )
00152                   {
00153                     PRInt32 shift = state * 6;
00154                     ucs4 |= (PRUint32(c) & 0x3F) << shift;
00155                   }
00156                 else
00157                   {
00158                     NS_ERROR("not a UTF8 string");
00159                     mErrorEncountered = PR_TRUE;
00160                     mBuffer = out;
00161                     return N;
00162                   }
00163               }
00164 
00165             if ( ucs4 < minUcs4 )
00166               {
00167                 // Overlong sequence
00168                 *out++ = UCS2_REPLACEMENT_CHAR;
00169               }
00170             else if ( ucs4 <= 0xD7FF )
00171               {
00172                 *out++ = ucs4;
00173               }
00174             else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
00175               {
00176                 // Surrogates
00177                 *out++ = UCS2_REPLACEMENT_CHAR;
00178               }
00179             else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
00180               {
00181                 // Prohibited characters
00182                 *out++ = UCS2_REPLACEMENT_CHAR;
00183               }
00184             else if ( ucs4 >= PLANE1_BASE )
00185               {
00186                 if ( ucs4 >= UCS_END )
00187                   *out++ = UCS2_REPLACEMENT_CHAR;
00188                 else {
00189                   *out++ = (buffer_type)H_SURROGATE(ucs4);
00190                   *out++ = (buffer_type)L_SURROGATE(ucs4);
00191                 }
00192               }
00193             else
00194               {
00195                 *out++ = ucs4;
00196               }
00197           }
00198         mBuffer = out;
00199         return p - start;
00200       }
00201 
00202     void write_terminator()
00203       {
00204         *mBuffer = buffer_type(0);
00205       }
00206 
00207     private:
00208       buffer_type* const mStart;
00209       buffer_type* mBuffer;
00210       PRBool mErrorEncountered;
00211   };
00212 
00217 class CalculateUTF8Length
00218   {
00219     public:
00220       typedef nsACString::char_type value_type;
00221 
00222     CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
00223 
00224     size_t Length() const { return mLength; }
00225 
00226     PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
00227       {
00228           // ignore any further requests
00229         if ( mErrorEncountered )
00230             return N;
00231 
00232         // algorithm assumes utf8 units won't
00233         // be spread across fragments
00234         const value_type* p = start;
00235         const value_type* end = start + N;
00236         for ( ; p < end /* && *p */; ++mLength )
00237           {
00238             if ( UTF8traits::isASCII(*p) )
00239                 p += 1;
00240             else if ( UTF8traits::is2byte(*p) )
00241                 p += 2;
00242             else if ( UTF8traits::is3byte(*p) )
00243                 p += 3;
00244             else if ( UTF8traits::is4byte(*p) ) {
00245                 p += 4;
00246                 // Because a UTF-8 sequence of 4 bytes represents a codepoint
00247                 // greater than 0xFFFF, it will become a surrogate pair in the
00248                 // UTF-16 string, so add 1 more to mLength.
00249                 // This doesn't happen with is5byte and is6byte because they
00250                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
00251                 // converted to a single replacement character.
00252                 //
00253                 // XXX: if the 4-byte sequence is an illegal non-shortest form,
00254                 //      it also gets converted to a replacement character, so
00255                 //      mLength will be off by one in this case.
00256                 ++mLength;
00257             }
00258             else if ( UTF8traits::is5byte(*p) )
00259                 p += 5;
00260             else if ( UTF8traits::is6byte(*p) )
00261                 p += 6;
00262             else
00263               {
00264                 break;
00265               }
00266           }
00267         if ( p != end )
00268           {
00269             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
00270             mErrorEncountered = PR_TRUE;
00271             return N;
00272           }
00273         return p - start;
00274       }
00275 
00276     private:
00277       size_t mLength;
00278       PRBool mErrorEncountered;
00279   };
00280 
00285 class ConvertUTF16toUTF8
00286   {
00287     public:
00288       typedef nsAString::char_type  value_type;
00289       typedef nsACString::char_type buffer_type;
00290 
00291     // The error handling here is more lenient than that in
00292     // |ConvertUTF8toUTF16|, but it's that way for backwards
00293     // compatibility.
00294 
00295     ConvertUTF16toUTF8( buffer_type* aBuffer )
00296         : mStart(aBuffer), mBuffer(aBuffer) {}
00297 
00298     size_t Size() const { return mBuffer - mStart; }
00299 
00300     PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
00301       {
00302         buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
00303 
00304         for (const value_type *p = start, *end = start + N; p < end; ++p )
00305           {
00306             value_type c = *p;
00307             if (! (c & 0xFF80)) // U+0000 - U+007F
00308               {
00309                 *out++ = (char)c;
00310               }
00311             else if (! (c & 0xF800)) // U+0100 - U+07FF
00312               {
00313                 *out++ = 0xC0 | (char)(c >> 6);
00314                 *out++ = 0x80 | (char)(0x003F & c);
00315               }
00316             else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
00317               {
00318                 *out++ = 0xE0 | (char)(c >> 12);
00319                 *out++ = 0x80 | (char)(0x003F & (c >> 6));
00320                 *out++ = 0x80 | (char)(0x003F & c );
00321               }
00322             else if (IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
00323               {
00324                 // D800- DBFF - High Surrogate
00325                 value_type h = c;
00326 
00327                 ++p;
00328                 if (p == end)
00329                   {
00330                     NS_ERROR("Surrogate pair split between fragments");
00331                     mBuffer = out;
00332                     return N;
00333                   }
00334                 c = *p;
00335 
00336                 if (IS_LOW_SURROGATE(c))
00337                   {
00338                     // DC00- DFFF - Low Surrogate
00339                     // N = (H - D800) *400 + 10000 + ( L - DC00 )
00340                     PRUint32 ucs4 = SURROGATE_TO_UCS4(h, c);
00341 
00342                     // 0001 0000-001F FFFF
00343                     *out++ = 0xF0 | (char)(ucs4 >> 18);
00344                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
00345                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
00346                     *out++ = 0x80 | (char)(0x003F & ucs4);
00347                   }
00348                 else
00349                   {
00350                     NS_ERROR("got a High Surrogate but no low surrogate");
00351                     // output nothing.
00352                   }
00353               }
00354             else // U+DC00 - U+DFFF
00355               {
00356                 // DC00- DFFF - Low Surrogate
00357                 NS_ERROR("got a low Surrogate but no high surrogate");
00358                 // output nothing.
00359               }
00360           }
00361 
00362         mBuffer = out;
00363         return N;
00364       }
00365 
00366     void write_terminator()
00367       {
00368         *mBuffer = buffer_type(0);
00369       }
00370 
00371     private:
00372       buffer_type* const mStart;
00373       buffer_type* mBuffer;
00374   };
00375 
00380 class CalculateUTF8Size
00381   {
00382     public:
00383       typedef nsAString::char_type value_type;
00384 
00385     CalculateUTF8Size()
00386       : mSize(0) { }
00387 
00388     size_t Size() const { return mSize; }
00389 
00390     PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
00391       {
00392         // Assume UCS2 surrogate pairs won't be spread across fragments.
00393         for (const value_type *p = start, *end = start + N; p < end; ++p )
00394           {
00395             value_type c = *p;
00396             if (! (c & 0xFF80)) // U+0000 - U+007F
00397               mSize += 1;
00398             else if (! (c & 0xF800)) // U+0100 - U+07FF
00399               mSize += 2;
00400             else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
00401               mSize += 3;
00402             else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
00403               {
00404                 ++p;
00405                 if (p == end)
00406                   {
00407                     NS_ERROR("Surrogate pair split between fragments");
00408                     return N;
00409                   }
00410                 c = *p;
00411 
00412                 if (0xDC00 == (0xFC00 & c))
00413                   mSize += 4;
00414                 else
00415                   NS_ERROR("got a high Surrogate but no low surrogate");
00416               }
00417             else // U+DC00 - U+DFFF
00418               NS_ERROR("got a low Surrogate but no high surrogate");
00419           }
00420 
00421         return N;
00422       }
00423 
00424     private:
00425       size_t mSize;
00426   };
00427 
00432 template <class FromCharT, class ToCharT>
00433 class LossyConvertEncoding
00434   {
00435     public:
00436       typedef FromCharT value_type;
00437  
00438       typedef FromCharT input_type;
00439       typedef ToCharT   output_type;
00440 
00441       typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
00442 
00443     public:
00444       LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
00445 
00446       PRUint32
00447       write( const input_type* aSource, PRUint32 aSourceLength )
00448         {
00449           const input_type* done_writing = aSource + aSourceLength;
00450           while ( aSource < done_writing )
00451             *mDestination++ = (output_type)(unsigned_input_type)(*aSource++);  // use old-style cast to mimic old |ns[C]String| behavior
00452           return aSourceLength;
00453         }
00454 
00455       void
00456       write_terminator()
00457         {
00458           *mDestination = output_type(0);
00459         }
00460 
00461     private:
00462       output_type* mDestination;
00463   };
00464 
00465 #endif /* !defined(nsUTF8Utils_h_) */