Back to index

lightning-sunbird  0.9+nobinonly
nsUCS2BEToUnicode.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Communicator client code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 #include "nsUCConstructors.h"
00039 #include "nsUCS2BEToUnicode.h"
00040 #include "nsUCvLatinDll.h"
00041 #include <string.h>
00042 #include "prtypes.h"
00043 
00044 // XXX : illegal surrogate code points are just passed through !!
00045 static nsresult
00046 UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aData, const char * aSrc,
00047                       PRInt32 * aSrcLength, PRUnichar * aDest,
00048                       PRInt32 * aDestLength)
00049 {
00050   const char* src = aSrc;
00051   const char* srcEnd = aSrc + *aSrcLength;
00052   PRUnichar* dest = aDest;
00053   PRUnichar* destEnd = aDest + *aDestLength;
00054 
00055   if(2 == aState) // first time called
00056   {
00057     NS_ASSERTION(*aSrcLength >= 2, "Too few bytes in input");
00058 
00059     // Eliminate BOM (0xFEFF). Note that different endian case is taken care of
00060     // in |Convert| of LE and BE converters. Here, we only have to
00061     // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
00062     // illegal.
00063     if(0xFEFF == *((PRUnichar*)src)) {
00064       src+=2;
00065     } else if(0xFFFE == *((PRUnichar*)src)) {
00066       *aSrcLength=0;
00067       *aDestLength=0;
00068       return NS_ERROR_ILLEGAL_INPUT;
00069     }  
00070     aState=0;
00071   }
00072 
00073   PRInt32 copybytes;
00074 
00075   if((1 == aState) && (src < srcEnd))
00076   {
00077     if(dest >= destEnd)
00078       goto error;
00079 
00080     char tmpbuf[2];
00081 
00082     // the 1st byte of a 16-bit code unit was stored in |aData| in the previous
00083     // run while the 2nd byte has to come from |*src|. We just have to copy
00084     // 'byte-by-byte'. Byte-swapping, if necessary, will be done in |Convert| of
00085     // LE and BE converters.
00086     PRUnichar * up = (PRUnichar*) &tmpbuf[0];
00087     tmpbuf[0]= aData;
00088     tmpbuf[1]= *src++;
00089     *dest++ = *up;
00090   }
00091   
00092   copybytes = (destEnd-dest)*2;
00093   // if |srcEnd-src| is odd, we copy one fewer bytes.
00094   if(copybytes > (~1 & (srcEnd - src)))
00095       copybytes = ~1 & (srcEnd - src);
00096   memcpy(dest,src,copybytes);
00097   src +=copybytes;
00098   dest +=(copybytes/2);
00099   if(srcEnd == src)  { // srcLength was even.
00100      aState = 0;
00101   } else if(1 == (srcEnd - src) ) { // srcLength was odd. 
00102      aState = 1;
00103      aData  = *src++;  // store the lead byte of a 16-bit unit for the next run.
00104   } else  {
00105      goto error;
00106   }
00107   
00108   *aDestLength = dest - aDest;
00109   *aSrcLength =  src  - aSrc; 
00110   return NS_OK;
00111 
00112 error:
00113   *aDestLength = dest - aDest;
00114   *aSrcLength =  src  - aSrc; 
00115   return  NS_OK_UDEC_MOREOUTPUT;
00116 }
00117 
00118 static void
00119 SwapBytes(PRUnichar *aDest, PRInt32 aLen)
00120 {
00121   for (PRUnichar *p = aDest; aLen > 0; ++p, --aLen)
00122      *p = ((*p & 0xff) << 8) | ((*p >> 8) & 0xff);
00123 }
00124 
00125 NS_IMETHODIMP
00126 nsUTF16ToUnicodeBase::Reset()
00127 {
00128   mState = 2;
00129   mData = 0;
00130   return NS_OK;
00131 }
00132 
00133 NS_IMETHODIMP
00134 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength, 
00135                                    PRInt32 * aDestLength)
00136 {
00137   // the left-over byte of the previous run has to be taken into account.
00138   *aDestLength = (aSrcLength + ((1 == mState) ? 1 : 0)) / 2;
00139   return NS_OK;
00140 }
00141 
00142 
00143 NS_IMETHODIMP
00144 nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
00145                             PRUnichar * aDest, PRInt32 * aDestLength)
00146 {
00147 #ifdef IS_LITTLE_ENDIAN
00148     // Remove the BOM if we're little-endian. The 'same endian' case with the
00149     // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
00150     if(2 == mState) // Called for the first time.
00151     {
00152       NS_ASSERTION(*aSrcLength >= 2, "Too few bytes in input");
00153       if(0xFFFE == *((PRUnichar*)aSrc)) {
00154         // eliminate BOM (on LE machines, BE BOM is 0xFFFE)
00155         aSrc+=2;
00156         *aSrcLength-=2;
00157       } else if(0xFEFF == *((PRUnichar*)aSrc)) {
00158         *aSrcLength=0;
00159         *aDestLength=0;
00160         return NS_ERROR_ILLEGAL_INPUT;
00161       }  
00162       mState=0;
00163     }
00164 #endif
00165 
00166   nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength,
00167                                       aDest, aDestLength);
00168 
00169 #ifdef IS_LITTLE_ENDIAN
00170   SwapBytes(aDest, *aDestLength);
00171 #endif
00172   return rv;
00173 }
00174 
00175 NS_IMETHODIMP
00176 nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
00177                             PRUnichar * aDest, PRInt32 * aDestLength)
00178 {
00179 #ifdef IS_BIG_ENDIAN
00180     // Remove the BOM if we're big-endian. The 'same endian' case with the
00181     // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
00182     if(2 == mState) // first time called
00183     {
00184       NS_ASSERTION(*aSrcLength >= 2, "Too few bytes in input");
00185       if(0xFFFE == *((PRUnichar*)aSrc)) {
00186         // eliminate BOM (on BE machines, LE BOM is 0xFFFE)
00187         aSrc+=2;
00188         *aSrcLength-=2;
00189       } else if(0xFEFF == *((PRUnichar*)aSrc)) {
00190         *aSrcLength=0;
00191         *aDestLength=0;
00192         return NS_ERROR_ILLEGAL_INPUT;
00193       }  
00194       mState=0;
00195     }
00196 #endif
00197     
00198   nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
00199                                       aDestLength);
00200 
00201 #ifdef IS_BIG_ENDIAN
00202   SwapBytes(aDest, *aDestLength);
00203 #endif
00204   return rv;
00205 }
00206 
00207 NS_IMETHODIMP
00208 nsUTF16ToUnicode::Reset()
00209 {
00210   mEndian = kUnknown;
00211   mFoundBOM = PR_FALSE;
00212   return nsUTF16ToUnicodeBase::Reset();
00213 }
00214 
00215 NS_IMETHODIMP
00216 nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
00217                           PRUnichar * aDest, PRInt32 * aDestLength)
00218 {
00219     if(2 == mState) // first time called
00220     {
00221       NS_ASSERTION(*aSrcLength >= 2, "Too few bytes in input");
00222 
00223       // check if BOM (0xFEFF) is at the beginning, remove it if found, and
00224       // set mEndian accordingly.
00225       if(0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1])) {
00226         aSrc += 2;
00227         *aSrcLength -= 2;
00228         mState = 0;
00229         mEndian = kLittleEndian;
00230         mFoundBOM = PR_TRUE;
00231       }
00232       else if(0xFE == PRUint8(aSrc[0]) && 0xFF == PRUint8(aSrc[1])) {
00233         aSrc += 2;
00234         *aSrcLength -= 2;
00235         mState = 0;
00236         mEndian = kBigEndian;
00237         mFoundBOM = PR_TRUE;
00238       }
00239       // BOM is not found, but we can use a simple heuristic to determine
00240       // the endianness. Assume the first character is [U+0001, U+00FF].
00241       // Not always valid, but it's very likely to hold for html/xml/css. 
00242       else if(!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
00243         mState = 0;                   
00244         mEndian = kBigEndian;
00245       }
00246       else if(aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
00247         mState = 0;
00248         mEndian = kLittleEndian;
00249       }
00250       else { // Neither BOM nor 'plausible' byte patterns at the beginning.
00251              // Just assume it's BE (following Unicode standard)
00252              // and let the garbage show up in the browser. (security concern?)
00253              // (bug 246194)
00254         mState = 0;   
00255         mEndian = kBigEndian;
00256       }
00257     }
00258     
00259     nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
00260                                         aDestLength);
00261 
00262 #ifdef IS_BIG_ENDIAN
00263     if (mEndian == kLittleEndian)
00264 #elif defined(IS_LITTLE_ENDIAN)
00265     if (mEndian == kBigEndian)
00266 #else
00267     #error "Unknown endianness"
00268 #endif
00269       SwapBytes(aDest, *aDestLength);
00270 
00271     // If BOM is not found and we're to return NS_OK, signal that BOM
00272     // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
00273     return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
00274 }