Back to index

lightning-sunbird  0.9+nobinonly
nsUTF32ToUnicode.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* vim:expandtab:shiftwidth=2:tabstop=2: 
00003  */
00004 /* ***** BEGIN LICENSE BLOCK *****
00005  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00006  *
00007  * The contents of this file are subject to the Mozilla Public License Version
00008  * 1.1 (the "License"); you may not use this file except in compliance with
00009  * the License. You may obtain a copy of the License at
00010  * http://www.mozilla.org/MPL/
00011  *
00012  * Software distributed under the License is distributed on an "AS IS" basis,
00013  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00014  * for the specific language governing rights and limitations under the
00015  * License.
00016  *
00017  * The Original Code is Mozilla Communicator client code.
00018  *
00019  * The Initial Developer of the Original Code is
00020  * Netscape Communications Corporation.
00021  * Portions created by the Initial Developer are Copyright (C) 1998
00022  * the Initial Developer. All Rights Reserved.
00023  *
00024  * Contributor(s):
00025  *   Jungshik Shin <jshin@mailaps.org>
00026  *
00027  * Alternatively, the contents of this file may be used under the terms of
00028  * either of the GNU General Public License Version 2 or later (the "GPL"),
00029  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00030  * in which case the provisions of the GPL or the LGPL are applicable instead
00031  * of those above. If you wish to allow use of your version of this file only
00032  * under the terms of either the GPL or the LGPL, and not to allow others to
00033  * use your version of this file under the terms of the MPL, indicate your
00034  * decision by deleting the provisions above and replace them with the notice
00035  * and other provisions required by the GPL or the LGPL. If you do not delete
00036  * the provisions above, a recipient may use your version of this file under
00037  * the terms of any one of the MPL, the GPL or the LGPL.
00038  *
00039  * ***** END LICENSE BLOCK ***** */
00040 
00041 #include "nsUCSupport.h"
00042 #include "nsUTF32ToUnicode.h"
00043 #include <string.h>
00044 
00045 //----------------------------------------------------------------------
00046 // static functions and macro definition common to nsUTF32(BE|LE)ToUnicode
00047 
// Read one UTF-32 code unit from the four bytes starting at |s|, which may
// be a (const) char* or PRUint8*.
//
// The value is always assembled one byte at a time:
//  - |s| points into an arbitrary byte stream and carries no alignment
//    guarantee, so dereferencing it through a PRUint32* is undefined
//    behaviour and faults on strict-alignment CPUs;
//  - every byte is widened to PRUint32 *before* shifting, so a top byte
//    >= 0x80 can never be shifted into the sign bit of a promoted int.
// The result is the same for either host byte order, so no IS_BIG_ENDIAN
// special case is needed.  Note that |s| is evaluated more than once.

// Little-endian input: the least significant byte comes first.
#define LE_STRING_TO_UCS4(s)                                        \
        ( PRUint32(PRUint8((s)[0]))        |                        \
         (PRUint32(PRUint8((s)[1])) <<  8) |                        \
         (PRUint32(PRUint8((s)[2])) << 16) |                        \
         (PRUint32(PRUint8((s)[3])) << 24))

// Big-endian input: the most significant byte comes first.
#define BE_STRING_TO_UCS4(s)                                        \
        ( PRUint32(PRUint8((s)[3]))        |                        \
         (PRUint32(PRUint8((s)[2])) <<  8) |                        \
         (PRUint32(PRUint8((s)[1])) << 16) |                        \
         (PRUint32(PRUint8((s)[0])) << 24))
00063  
00064 static nsresult ConvertCommon(const char * aSrc, 
00065                               PRInt32 * aSrcLength, 
00066                               PRUnichar * aDest, 
00067                               PRInt32 * aDestLength,
00068                               PRUint16 * aState,
00069                               PRUint8  * aBuffer,
00070                               PRBool aIsLE)
00071 {
00072    
00073   NS_ENSURE_TRUE(*aState < 4, NS_ERROR_INVALID_ARG);
00074   NS_ENSURE_TRUE(*aDestLength > 0, NS_ERROR_INVALID_ARG);
00075 
00076   const char *src = aSrc;
00077   const char *srcEnd = aSrc + *aSrcLength;
00078    
00079   PRUnichar *dest = aDest;
00080   PRUnichar *destEnd = aDest + *aDestLength;
00081 
00082   if (*aState > *aSrcLength) 
00083   {
00084     memcpy(aBuffer + 4 - *aState, src, *aSrcLength);
00085     *aDestLength = 0;
00086     *aState -= *aSrcLength;
00087     return NS_OK_UDEC_MOREINPUT;
00088   }
00089 
00090   PRUint32 ucs4;
00091 
00092   // prev. run left a partial UTF-32 seq. 
00093   if (*aState > 0)
00094   {
00095     memcpy(aBuffer + 4 - *aState, src, *aState);
00096     ucs4 =  aIsLE ? LE_STRING_TO_UCS4(aBuffer) : BE_STRING_TO_UCS4(aBuffer); 
00097     if (ucs4 < 0x10000L)  // BMP
00098     {
00099       // XXX Do we have to convert surrogate code points to the replacement
00100       // character (0xfffd)?  
00101       *dest++= PRUnichar(ucs4);
00102     }
00103     else if (ucs4 < 0x110000L)  // plane 1 through plane 16 
00104     {
00105       if (destEnd - dest < 2) 
00106       {
00107         *aSrcLength = 0;
00108         *aDestLength = 0;
00109         return NS_OK_UDEC_MOREOUTPUT;
00110       }
00111       // ((ucs4 - 0x10000) >> 10) + 0xd800;
00112       *dest++= PRUnichar((ucs4 >> 10) + 0xd7c0);  // high surrogate
00113       *dest++= PRUnichar(ucs4 & 0x3ffL | 0xdc00); // low surrogate
00114     }       
00115     // Codepoints in plane 17 and higher (> 0x10ffff)
00116     // are not representable in UTF-16 we use for the internal
00117     // character representation. This is not a problem
00118     // because Unicode/ISO 10646 will never assign characters
00119     // in plane 17 and higher. Therefore, we convert them
00120     // to Unicode replacement character (0xfffd).
00121     else                   
00122       *dest++ = 0xfffd;
00123     src += *aState;
00124     *aState = 0;
00125   }
00126 
00127   nsresult rv = NS_OK;  // conversion result
00128 
00129   for ( ; src < srcEnd && dest < destEnd; src += 4)
00130   {
00131     if (srcEnd - src < 4) 
00132     {
00133       // fill up aBuffer until src buffer gets exhausted.
00134       memcpy(aBuffer, src, srcEnd - src);
00135       *aState = 4 - (srcEnd - src); // set add. char to read in next run
00136       src = srcEnd;
00137       rv = NS_OK_UDEC_MOREINPUT;
00138       break;
00139     }
00140 
00141     ucs4 =  aIsLE ? LE_STRING_TO_UCS4(src) : BE_STRING_TO_UCS4(src); 
00142     if (ucs4 < 0x10000L)  // BMP
00143     {
00144       // XXX Do we have to convert surrogate code points to the replacement
00145       // character (0xfffd)?  
00146       *dest++= PRUnichar(ucs4);
00147     }
00148     else if (ucs4 < 0x110000L)  // plane 1 through plane 16 
00149     {
00150       if (destEnd - dest < 2) 
00151         break;
00152       // ((ucs4 - 0x10000) >> 10) + 0xd800;
00153       *dest++= PRUnichar((ucs4 >> 10) + 0xd7c0); 
00154       *dest++= PRUnichar(ucs4 & 0x3ffL | 0xdc00);
00155     }       
00156     else                       // plane 17 and higher
00157       *dest++ = 0xfffd;
00158   }
00159 
00160   //output not finished, output buffer too short
00161   if((NS_OK == rv) && (src < srcEnd) && (dest >= destEnd)) 
00162     rv = NS_OK_UDEC_MOREOUTPUT;
00163 
00164   *aSrcLength = src - aSrc;
00165   *aDestLength  = dest - aDest;
00166 
00167   return rv;
00168 }
00169 
00170 
00171 //----------------------------------------------------------------------
00172 // Class nsUTF32ToUnicode [implementation]
00173 
00174 nsUTF32ToUnicode::nsUTF32ToUnicode() : nsBasicDecoderSupport()
00175 {
00176   Reset();
00177 }
00178 
00179 //----------------------------------------------------------------------
00180 // Subclassing of nsDecoderSupport class [implementation]
00181 
00182 NS_IMETHODIMP nsUTF32ToUnicode::GetMaxLength(const char * aSrc, 
00183                                             PRInt32 aSrcLength, 
00184                                             PRInt32 * aDestLength)
00185 {
00186   // Non-BMP characters take two PRUnichars(a pair of surrogate codepoints)
00187   // so that we have to divide by 2 instead of 4 for the worst case.
00188   *aDestLength = aSrcLength / 2;
00189   return NS_OK;
00190 }
00191 
00192 
00193 //----------------------------------------------------------------------
00194 // Subclassing of nsBasicDecoderSupport class [implementation]
00195 
00196 NS_IMETHODIMP nsUTF32ToUnicode::Reset()
00197 {
00198   // the number of additional bytes to read to complete UTF-32 4byte seq.
00199   mState = 0;  
00200   memset(mBufferInc, 0, 4);
00201   return NS_OK;
00202 
00203 }
00204 
00205 
00206 //----------------------------------------------------------------------
00207 // Class nsUTF32BEToUnicode [implementation]
00208 
00209 //----------------------------------------------------------------------
00210 // Subclassing of nsUTF32ToUnicode class [implementation]
00211 
00212 NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc, 
00213                                           PRInt32 * aSrcLength, 
00214                                           PRUnichar * aDest, 
00215                                           PRInt32 * aDestLength)
00216 {
00217   return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState, 
00218                        mBufferInc, PR_FALSE);
00219 }
00220 
00221 //----------------------------------------------------------------------
00222 // Class nsUTF32LEToUnicode [implementation]
00223   
00224 //----------------------------------------------------------------------
00225 // Subclassing of nsUTF32ToUnicode class [implementation]
00226 
00227 NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc, 
00228                                           PRInt32 * aSrcLength, 
00229                                           PRUnichar * aDest, 
00230                                           PRInt32 * aDestLength)
00231 {
00232   return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState, 
00233                        mBufferInc, PR_TRUE);
00234 }
00235 
// XXX: Bytes of an incomplete UTF-32 sequence may still be held in
// mBufferInc when the input ends. Should a Finish() method flush or
// report them?
00237