Back to index

lightning-sunbird  0.9+nobinonly
nsGBKToUnicode.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00045 #include "nsGBKToUnicode.h"
00046 #include "nsUCvCnDll.h"
00047 #include "gbku.h"
00048 
00049 
00050 static const PRInt16 g_2BytesShiftTable[] = {
00051  0, u2BytesCharset,
00052  ShiftCell(0,0,0,0,0,0,0,0)
00053 };
00054 //------------------------------------------------------------
00055 // nsGBKUnique2BytesToUnicode
00056 //------------------------------------------------------------
00057 class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport 
00058 {
00059 public:
00060   nsGBKUnique2BytesToUnicode();
00061   virtual ~nsGBKUnique2BytesToUnicode() 
00062     { };
00063 protected:
00064 };
00065 
00066 static const PRUint16 g_utGBKUnique2Bytes[] = {
00067 #include "gbkuniq2b.ut"
00068 };
00069 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() 
00070 : nsTableDecoderSupport((uShiftTable*) &g_2BytesShiftTable,
00071         (uMappingTable*) &g_utGBKUnique2Bytes, 1) 
00072 {
00073 }
00074 
00075 //------------------------------------------------------------
00076 // nsGB18030Unique2BytesToUnicode
00077 //------------------------------------------------------------
00078 class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport 
00079 {
00080 public:
00081   nsGB18030Unique2BytesToUnicode();
00082   virtual ~nsGB18030Unique2BytesToUnicode() 
00083     { };
00084 protected:
00085 };
00086 
00087 static const PRUint16 g_utGB18030Unique2Bytes[] = {
00088 #include "gb18030uniq2b.ut"
00089 };
00090 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() 
00091 : nsTableDecoderSupport((uShiftTable*) &g_2BytesShiftTable,
00092         (uMappingTable*) &g_utGB18030Unique2Bytes, 1) 
00093 {
00094 }
00095 
00096 //------------------------------------------------------------
00097 // nsGB18030Unique4BytesToUnicode
00098 //------------------------------------------------------------
00099 static const PRInt16 g_GB18030_4BytesShiftTable[] = {
00100  0, u4BytesGB18030Charset,
00101  ShiftCell(0,0,0,0,0,0,0,0)
00102 };
00103 
00104 class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport 
00105 {
00106 public:
00107   nsGB18030Unique4BytesToUnicode();
00108   virtual ~nsGB18030Unique4BytesToUnicode() 
00109     { };
00110 protected:
00111 };
00112 
00113 static const PRUint16 g_utGB18030Unique4Bytes[] = {
00114 #include "gb180304bytes.ut"
00115 };
00116 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() 
00117 : nsTableDecoderSupport((uShiftTable*) &g_GB18030_4BytesShiftTable,
00118         (uMappingTable*) &g_utGB18030Unique4Bytes, 1) 
00119 {
00120 }
00121 
00122 
00123 //----------------------------------------------------------------------
00124 // Class nsGBKToUnicode [implementation]
00125 
00126 //----------------------------------------------------------------------
00127 // Subclassing of nsTablesDecoderSupport class [implementation]
00128 
// Byte-range predicates used while scanning GBK / GB18030 input.

// Lead byte of any multi-byte sequence: [0x81, 0xFE].
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Lead byte of a four-byte sequence that is decoded to a surrogate pair
// rather than through the four-byte mapping table: [0x90, 0xFE].
#define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Trail byte of a two-byte sequence: [0x40, 0x7E] or [0x80, 0xFE].
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))

// Bytes two, three and four of a four-byte sequence:
// [0x30, 0x39], [0x81, 0xFE], [0x30, 0x39].
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
00141 
00142 NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
00143                                             PRInt32 * aSrcLength,
00144                                             PRUnichar *aDest,
00145                                             PRInt32 * aDestLength)
00146 {
00147   PRInt32 i=0;
00148   PRInt32 iSrcLength = (*aSrcLength);
00149   PRInt32 iDestlen = 0;
00150   nsresult rv=NS_OK;
00151   *aSrcLength = 0;
00152   
00153   for (i=0;i<iSrcLength;i++)
00154   {
00155     if ( iDestlen >= (*aDestLength) )
00156     {
00157       rv = NS_OK_UDEC_MOREOUTPUT;
00158       break;
00159     }
00160     // The valid range for the 1st byte is [0x81,0xFE] 
00161     if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
00162     {
00163       if(i+1 >= iSrcLength) 
00164       {
00165         rv = NS_OK_UDEC_MOREINPUT;
00166         break;
00167       }
00168       // To make sure, the second byte has to be checked as well.
00169       // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
00170       if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
00171       {
00172         // Valid GBK code
00173         *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
00174         if(UCS2_NO_MAPPING == *aDest)
00175         { 
00176           // We cannot map in the common mapping, let's call the
00177           // delegate 2 byte decoder to decode the gbk or gb18030 unique 
00178           // 2 byte mapping
00179           if(! TryExtensionDecoder(aSrc, aDest))
00180           {
00181             *aDest = UCS2_NO_MAPPING;
00182           }
00183         }
00184         aSrc += 2;
00185         i++;
00186       }
00187       else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
00188       {
00189         // from the first 2 bytes, it looks like a 4 byte GB18030
00190         if(i+3 >= iSrcLength)  // make sure we got 4 bytes
00191         {
00192           rv = NS_OK_UDEC_MOREINPUT;
00193           break;
00194         }
00195         // 4 bytes patten
00196         // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
00197         // preset the 
00198  
00199         if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
00200             LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
00201         {
00202            if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 
00203            {
00204              // let's call the delegated 4 byte gb18030 converter to convert it
00205              if(! Try4BytesDecoder(aSrc, aDest))
00206                *aDest = UCS2_NO_MAPPING;
00207            } else {
00208               // let's try supplement mapping
00209              NS_ASSERTION(( (iDestlen+1) <= (*aDestLength) ), "no enouth output memory");
00210              if ( (iDestlen+1) <= (*aDestLength) )
00211              {
00212                if(DecodeToSurrogate(aSrc, aDest))
00213                {
00214                  // surrogte two PRUnichar
00215                  iDestlen++;
00216                  aDest++;
00217                }  else {
00218                  *aDest = UCS2_NO_MAPPING;
00219               }
00220              } else {
00221                *aDest = UCS2_NO_MAPPING;
00222              }
00223            }
00224         } else {
00225           *aDest = UCS2_NO_MAPPING; 
00226         }
00227         aSrc += 4;
00228         i+=3;
00229       }
00230       else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
00231       {
00232         // stand-alone (not followed by a valid second byte) 0xA0 !
00233         // treat it as valid a la Netscape 4.x
00234         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
00235         aSrc++;
00236       } else {
00237         // Invalid GBK code point (second byte should be 0x40 or higher)
00238         *aDest = UCS2_NO_MAPPING;
00239         aSrc++;
00240       }
00241     } else {
00242       if(IS_ASCII(*aSrc))
00243       {
00244         // The source is an ASCII
00245         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
00246         aSrc++;
00247       } else {
00248         if(IS_GBK_EURO(*aSrc)) {
00249           *aDest = UCS2_EURO;
00250         } else {
00251           *aDest = UCS2_NO_MAPPING;
00252         }
00253         aSrc++;
00254       }
00255     }
00256     iDestlen++;
00257     aDest++;
00258     *aSrcLength = i+1;
00259   }
00260   *aDestLength = iDestlen;
00261   return rv;
00262 }
00263 
00264 
00265 void nsGBKToUnicode::CreateExtensionDecoder()
00266 {
00267   mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
00268 }
00269 void nsGBKToUnicode::Create4BytesDecoder()
00270 {
00271   m4BytesDecoder =  nsnull;
00272 }
00273 void nsGB18030ToUnicode::CreateExtensionDecoder()
00274 {
00275   mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
00276 }
00277 void nsGB18030ToUnicode::Create4BytesDecoder()
00278 {
00279   m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
00280 }
00281 PRBool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
00282 {
00283   NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
00284   NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
00285   NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
00286   NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
00287   if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
00288     return PR_FALSE;
00289   if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
00290     return PR_FALSE;
00291   if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
00292     return PR_FALSE;
00293   if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
00294     return PR_FALSE;
00295 
00296   PRUint8 a1 = (PRUint8) aSrc[0];
00297   PRUint8 a2 = (PRUint8) aSrc[1];
00298   PRUint8 a3 = (PRUint8) aSrc[2];
00299   PRUint8 a4 = (PRUint8) aSrc[3];
00300   a1 -= (PRUint8)0x90;
00301   a2 -= (PRUint8)0x30;
00302   a3 -= (PRUint8)0x81;
00303   a4 -= (PRUint8)0x30;
00304   PRUint32 idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
00305 
00306   *aOut++ = 0xD800 | (0x000003FF & (idx >> 10));
00307   *aOut = 0xDC00 | (0x000003FF & idx);
00308 
00309   return PR_TRUE;
00310 }
00311 PRBool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, PRUnichar* aOut)
00312 {
00313   if(!mExtensionDecoder)
00314     CreateExtensionDecoder();
00315   NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
00316   if(mExtensionDecoder)
00317   {
00318     nsresult res = mExtensionDecoder->Reset();
00319     NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
00320     PRInt32 len = 2;
00321     PRInt32 dstlen = 1;
00322     res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); 
00323     NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), 
00324        "some strange conversion result");
00325      // if we failed, we then just use the 0xfffd 
00326      // therefore, we ignore the res here. 
00327     if(NS_SUCCEEDED(res)) 
00328       return PR_TRUE;
00329   }
00330   return  PR_FALSE;
00331 }
00332 PRBool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
00333 {
00334   return PR_FALSE;
00335 }
00336 PRBool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, PRUnichar* aOut)
00337 {
00338   if(!m4BytesDecoder)
00339     Create4BytesDecoder();
00340   if(m4BytesDecoder)
00341   {
00342     nsresult res = m4BytesDecoder->Reset();
00343     NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
00344     PRInt32 len = 4;
00345     PRInt32 dstlen = 1;
00346     res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); 
00347     NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), 
00348        "some strange conversion result");
00349      // if we failed, we then just use the 0xfffd 
00350      // therefore, we ignore the res here. 
00351     if(NS_SUCCEEDED(res)) 
00352       return PR_TRUE;
00353   }
00354   return  PR_FALSE;
00355 }