Back to index

lightning-sunbird  0.9+nobinonly
nsUTF8ToUnicode.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Communicator client code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 #include "nsUCSupport.h"
00039 #include "nsUTF8ToUnicode.h"
00040 
00041 #define UNICODE_BYTE_ORDER_MARK    0xFEFF
00042 
00043 NS_IMETHODIMP NS_NewUTF8ToUnicode(nsISupports* aOuter,
00044                                   const nsIID& aIID,
00045                                   void** aResult)
00046 {
00047   if (!aResult) {
00048     return NS_ERROR_NULL_POINTER;
00049   }
00050   if (aOuter) {
00051     *aResult = nsnull;
00052     return NS_ERROR_NO_AGGREGATION;
00053   }
00054   nsUTF8ToUnicode * inst = new nsUTF8ToUnicode();
00055   if (!inst) {
00056     *aResult = nsnull;
00057     return NS_ERROR_OUT_OF_MEMORY;
00058   }
00059   nsresult res = inst->QueryInterface(aIID, aResult);
00060   if (NS_FAILED(res)) {
00061     *aResult = nsnull;
00062     delete inst;
00063   }
00064   return res;
00065 }
00066 
00067 //----------------------------------------------------------------------
00068 // Class nsUTF8ToUnicode [implementation]
00069 
00070 nsUTF8ToUnicode::nsUTF8ToUnicode()
00071 : nsBasicDecoderSupport()
00072 {
00073   Reset();
00074 }
00075 
00076 //----------------------------------------------------------------------
00077 // Subclassing of nsBasicDecoderSupport class [implementation]
00078 
00094 NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
00095                                             PRInt32 aSrcLength,
00096                                             PRInt32 * aDestLength)
00097 {
00098   *aDestLength = aSrcLength + 1;
00099   return NS_OK;
00100 }
00101 
00102 
00103 //----------------------------------------------------------------------
00104 // Subclassing of nsBasicDecoderSupport class [implementation]
00105 
00106 NS_IMETHODIMP nsUTF8ToUnicode::Reset()
00107 {
00108 
00109   mUcs4  = 0;     // cached Unicode character
00110   mState = 0;     // cached expected number of octets after the current octet
00111                   // until the beginning of the next UTF8 character sequence
00112   mBytes = 1;     // cached expected number of octets in the current sequence
00113   mFirst = PR_TRUE;
00114 
00115   return NS_OK;
00116 
00117 }
00118 
00119 //----------------------------------------------------------------------
00120 // Subclassing of nsBasicDecoderSupport class [implementation]
00121 
00122 
00123 NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
00124                                        PRInt32 * aSrcLength,
00125                                        PRUnichar * aDest,
00126                                        PRInt32 * aDestLength)
00127 {
00128   PRUint32 aSrcLen   = (PRUint32) (*aSrcLength);
00129   PRUint32 aDestLen = (PRUint32) (*aDestLength);
00130 
00131   const char *in, *inend;
00132   inend = aSrc + aSrcLen;
00133 
00134   PRUnichar *out, *outend;
00135   outend = aDest + aDestLen;
00136 
00137   nsresult res = NS_OK; // conversion result
00138 
00139   // Set mFirst to PR_FALSE now so we don't have to every time through the ASCII
00140   // branch within the loop.
00141   if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
00142     mFirst = PR_FALSE;
00143 
00144   for (in = aSrc, out = aDest; ((in < inend) && (out < outend)); ++in) {
00145     if (0 == mState) {
00146       // When mState is zero we expect either a US-ASCII character or a
00147       // multi-octet sequence.
00148       if (0 == (0x80 & (*in))) {
00149         // US-ASCII, pass straight through.
00150         *out++ = (PRUnichar)*in;
00151         mBytes = 1;
00152       } else if (0xC0 == (0xE0 & (*in))) {
00153         // First octet of 2 octet sequence
00154         mUcs4 = (PRUint32)(*in);
00155         mUcs4 = (mUcs4 & 0x1F) << 6;
00156         mState = 1;
00157         mBytes = 2;
00158       } else if (0xE0 == (0xF0 & (*in))) {
00159         // First octet of 3 octet sequence
00160         mUcs4 = (PRUint32)(*in);
00161         mUcs4 = (mUcs4 & 0x0F) << 12;
00162         mState = 2;
00163         mBytes = 3;
00164       } else if (0xF0 == (0xF8 & (*in))) {
00165         // First octet of 4 octet sequence
00166         mUcs4 = (PRUint32)(*in);
00167         mUcs4 = (mUcs4 & 0x07) << 18;
00168         mState = 3;
00169         mBytes = 4;
00170       } else if (0xF8 == (0xFC & (*in))) {
00171         /* First octet of 5 octet sequence.
00172          *
00173          * This is illegal because the encoded codepoint must be either
00174          * (a) not the shortest form or
00175          * (b) outside the Unicode range of 0-0x10FFFF.
00176          * Rather than trying to resynchronize, we will carry on until the end
00177          * of the sequence and let the later error handling code catch it.
00178          */
00179         mUcs4 = (PRUint32)(*in);
00180         mUcs4 = (mUcs4 & 0x03) << 24;
00181         mState = 4;
00182         mBytes = 5;
00183       } else if (0xFC == (0xFE & (*in))) {
00184         // First octet of 6 octet sequence, see comments for 5 octet sequence.
00185         mUcs4 = (PRUint32)(*in);
00186         mUcs4 = (mUcs4 & 1) << 30;
00187         mState = 5;
00188         mBytes = 6;
00189       } else {
00190         /* Current octet is neither in the US-ASCII range nor a legal first
00191          * octet of a multi-octet sequence.
00192          *
00193          * Return an error condition. Caller is responsible for flushing and
00194          * refilling the buffer and resetting state.
00195          */
00196         res = NS_ERROR_UNEXPECTED;
00197         break;
00198       }
00199     } else {
00200       // When mState is non-zero, we expect a continuation of the multi-octet
00201       // sequence
00202       if (0x80 == (0xC0 & (*in))) {
00203         // Legal continuation.
00204         PRUint32 shift = (mState - 1) * 6;
00205         PRUint32 tmp = *in;
00206         tmp = (tmp & 0x0000003FL) << shift;
00207         mUcs4 |= tmp;
00208 
00209         if (0 == --mState) {
00210           /* End of the multi-octet sequence. mUcs4 now contains the final
00211            * Unicode codepoint to be output
00212            *
00213            * Check for illegal sequences and codepoints.
00214            */
00215 
00216           // From Unicode 3.1, non-shortest form is illegal
00217           if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
00218               ((3 == mBytes) && (mUcs4 < 0x0800)) ||
00219               ((4 == mBytes) && (mUcs4 < 0x10000)) ||
00220               (4 < mBytes) ||
00221               // From Unicode 3.2, surrogate characters are illegal
00222               ((mUcs4 & 0xFFFFF800) == 0xD800) ||
00223               // Codepoints outside the Unicode range are illegal
00224               (mUcs4 > 0x10FFFF)) {
00225             res = NS_ERROR_UNEXPECTED;
00226             break;
00227           }
00228           if (mUcs4 > 0xFFFF) {
00229             // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
00230             mUcs4 -= 0x00010000;
00231             *out++ = 0xD800 | (0x000003FF & (mUcs4 >> 10));
00232             *out++ = 0xDC00 | (0x000003FF & mUcs4);
00233           } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) {
00234             // Don't output the BOM only if it is the first character
00235             *out++ = mUcs4;
00236           }
00237           //initialize UTF8 cache
00238           mUcs4  = 0;
00239           mState = 0;
00240           mBytes = 1;
00241           mFirst = PR_FALSE;
00242         }
00243       } else {
00244         /* ((0xC0 & (*in) != 0x80) && (mState != 0))
00245          * 
00246          * Incomplete multi-octet sequence. Unconsume this
00247          * octet and return an error condition. Caller is responsible
00248          * for flushing and refilling the buffer and resetting state.
00249          */
00250         in--;
00251         res = NS_ERROR_UNEXPECTED;
00252         break;
00253       }
00254     }
00255   }
00256 
00257   // output not finished, output buffer too short
00258   if ((NS_OK == res) && (in < inend) && (out >= outend))
00259     res = NS_OK_UDEC_MOREOUTPUT;
00260 
00261   // last UCS4 is incomplete, make sure the caller
00262   // returns with properly aligned continuation of the buffer
00263   if ((NS_OK == res) && (mState != 0))
00264     res = NS_OK_UDEC_MOREINPUT;
00265 
00266   *aSrcLength = in - aSrc;
00267   *aDestLength = out - aDest;
00268 
00269   return(res);
00270 }