Back to index

lightning-sunbird  0.9+nobinonly
nsSaveAsCharset.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Pierre Phaneuf <pp@ludusdesign.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 
00040 #include "prmem.h"
00041 #include "prprf.h"
00042 #include "nsIServiceManager.h"
00043 #include "nsIComponentManager.h"
00044 #include "nsICharsetConverterManager.h"
00045 #include "nsSaveAsCharset.h"
00046 #include "nsCRT.h"
00047 #include "nsUnicharUtils.h"
00048 #include "nsCompressedCharMap.h"
00049 
00050 //
00051 // nsISupports methods
00052 //
// NOTE(review): presumably this XPCOM macro expands to the nsISupports methods
// named above for the single interface nsISaveAsCharset -- confirm against the
// macro definition.
00053 NS_IMPL_ISUPPORTS1(nsSaveAsCharset, nsISaveAsCharset)
00054 
// Character map queried by DoCharsetConversion(): when
// MASK_IGNORABLE_FALLBACK(mAttribute) is set, an unmapped character that is
// found in gIgnorableCCMapExt is skipped instead of being replaced by a
// fallback string.
00055 #include "ignorables_abjadpoints.x-ccmap"
00056 DEFINE_X_CCMAP(gIgnorableCCMapExt, const);
00057 
00058 //
00059 // nsSaveAsCharset
00060 //
00061 nsSaveAsCharset::nsSaveAsCharset()
00062 {
00063   mAttribute = attr_htmlTextDefault;
00064   mEntityVersion = 0;
00065   mCharsetListIndex = -1;
00066 }
00067 
00068 nsSaveAsCharset::~nsSaveAsCharset()
00069 {
00070 }
00071 
00072 NS_IMETHODIMP
00073 nsSaveAsCharset::Init(const char *charset, PRUint32 attr, PRUint32 entityVersion)
00074 {
00075   nsresult rv = NS_OK;
00076 
00077   mAttribute = attr;
00078   mEntityVersion = entityVersion;
00079 
00080   rv = SetupCharsetList(charset);
00081   NS_ENSURE_SUCCESS(rv, rv);
00082 
00083   // set up unicode encoder
00084   rv = SetupUnicodeEncoder(GetNextCharset());
00085   NS_ENSURE_SUCCESS(rv, rv);
00086 
00087   // set up entity converter
00088   if (attr_EntityNone != MASK_ENTITY(mAttribute) && !mEntityConverter)
00089     mEntityConverter = do_CreateInstance(NS_ENTITYCONVERTER_CONTRACTID, &rv);
00090 
00091   return rv;
00092 }
00093 
00094 NS_IMETHODIMP
00095 nsSaveAsCharset::Convert(const PRUnichar *inString, char **_retval)
00096 {
00097   if (nsnull == _retval)
00098     return NS_ERROR_NULL_POINTER;
00099   if (nsnull == inString)
00100     return NS_ERROR_NULL_POINTER;
00101   if (0 == *inString)
00102     return NS_ERROR_ILLEGAL_VALUE;
00103   nsresult rv = NS_OK;
00104 
00105   NS_ASSERTION(mEncoder, "need to call Init() before Convert()");
00106   NS_ENSURE_TRUE(mEncoder, NS_ERROR_FAILURE);
00107 
00108   *_retval = nsnull;
00109 
00110   // make sure to start from the first charset in the list
00111   if (mCharsetListIndex > 0) {
00112     mCharsetListIndex = -1;
00113     rv = SetupUnicodeEncoder(GetNextCharset());
00114     NS_ENSURE_SUCCESS(rv, rv);
00115   }
00116 
00117   do {
00118     // fallback to the next charset in the list if the last conversion failed by an unmapped character
00119     if (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv) {
00120       const char * charset = GetNextCharset();
00121       if (!charset)
00122         break;
00123       rv = SetupUnicodeEncoder(charset);
00124       NS_ENSURE_SUCCESS(rv, rv);
00125       PR_FREEIF(*_retval);
00126     }
00127 
00128     if (attr_EntityBeforeCharsetConv == MASK_ENTITY(mAttribute)) {
00129       NS_ASSERTION(mEntityConverter, "need to call Init() before Convert()");
00130       NS_ENSURE_TRUE(mEntityConverter, NS_ERROR_FAILURE);
00131       PRUnichar *entity = nsnull;
00132       // do the entity conversion first
00133       rv = mEntityConverter->ConvertToEntities(inString, mEntityVersion, &entity);
00134       if(NS_SUCCEEDED(rv)) {
00135         rv = DoCharsetConversion(entity, _retval);
00136         nsMemory::Free(entity);
00137       }
00138     }
00139     else
00140       rv = DoCharsetConversion(inString, _retval);
00141 
00142   } while (MASK_CHARSET_FALLBACK(mAttribute) && NS_ERROR_UENC_NOMAPPING == rv);
00143 
00144   return rv;
00145 }
00146 
00147 NS_IMETHODIMP 
00148 nsSaveAsCharset::GetCharset(char * *aCharset)
00149 {
00150   NS_ENSURE_ARG(aCharset);
00151   NS_ASSERTION(mCharsetListIndex >= 0, "need to call Init() first");
00152   NS_ENSURE_TRUE(mCharsetListIndex >= 0, NS_ERROR_FAILURE);
00153 
00154   const char *charset = mCharsetList[mCharsetListIndex]->get();
00155   if (!charset) {
00156     *aCharset = nsnull;
00157     NS_ASSERTION(charset, "make sure to call Init() with non empty charset list");
00158     return NS_ERROR_FAILURE;
00159   }
00160 
00161   *aCharset = nsCRT::strdup(charset);
00162   return (*aCharset) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
00163 }
00164 
00166 
00167 // do the fallback, reallocate the buffer if necessary
00168 // need to pass destination buffer info (size, current position and estimation of rest of the conversion)
00169 NS_IMETHODIMP
00170 nsSaveAsCharset::HandleFallBack(PRUint32 character, char **outString, PRInt32 *bufferLength, 
00171                                 PRInt32 *currentPos, PRInt32 estimatedLength)
00172 {
00173   if((nsnull == outString ) || (nsnull == bufferLength) ||(nsnull ==currentPos))
00174     return NS_ERROR_NULL_POINTER;
00175   char fallbackStr[256];
00176   nsresult rv = DoConversionFallBack(character, fallbackStr, 256);
00177   if (NS_SUCCEEDED(rv)) {
00178     PRInt32 tempLen = (PRInt32) PL_strlen(fallbackStr);
00179 
00180     // reallocate if the buffer is not large enough
00181     if ((tempLen + estimatedLength) >= (*bufferLength - *currentPos)) {
00182       char *temp = (char *) PR_Realloc(*outString, *bufferLength + tempLen);
00183       if (NULL != temp) {
00184         // adjust length/pointer after realloc
00185         *bufferLength += tempLen;
00186         *outString = temp;
00187       } else {
00188         *outString = NULL;
00189         *bufferLength =0;
00190         return NS_ERROR_OUT_OF_MEMORY;
00191       }
00192     }
00193     memcpy((*outString + *currentPos), fallbackStr, tempLen);
00194     *currentPos += tempLen;
00195   }
00196   return rv;
00197 }
00198 
00199 NS_IMETHODIMP
00200 nsSaveAsCharset::DoCharsetConversion(const PRUnichar *inString, char **outString)
00201 {
00202   if(nsnull == outString )
00203     return NS_ERROR_NULL_POINTER;
00204   NS_ASSERTION(outString, "invalid input");
00205 
00206   *outString = NULL;
00207 
00208   nsresult rv;
00209   PRInt32 inStringLength = nsCRT::strlen(inString);   // original input string length
00210   PRInt32 bufferLength;                               // allocated buffer length
00211   PRInt32 srcLength = inStringLength;
00212   PRInt32 dstLength;
00213   char *dstPtr = NULL;
00214   PRInt32 pos1, pos2;
00215   nsresult saveResult = NS_OK;                         // to remember NS_ERROR_UENC_NOMAPPING
00216 
00217   // estimate and allocate the target buffer (reserve extra memory for fallback)
00218   rv = mEncoder->GetMaxLength(inString, inStringLength, &dstLength);
00219   if (NS_FAILED(rv)) return rv;
00220 
00221   bufferLength = dstLength + 512; // reserve 512 byte for fallback.
00222   dstPtr = (char *) PR_Malloc(bufferLength);
00223   if (NULL == dstPtr) return NS_ERROR_OUT_OF_MEMORY;
00224 
00225   
00226   for (pos1 = 0, pos2 = 0; pos1 < inStringLength;) {
00227     // convert from unicode
00228     dstLength = bufferLength - pos2;
00229     rv = mEncoder->Convert(&inString[pos1], &srcLength, &dstPtr[pos2], &dstLength);
00230 
00231     pos1 += srcLength ? srcLength : 1;
00232     pos2 += dstLength;
00233     dstPtr[pos2] = '\0';
00234 
00235     // break: this is usually the case (no error) OR unrecoverable error
00236     if (NS_ERROR_UENC_NOMAPPING != rv) break;
00237 
00238     // remember this happened and reset the result
00239     saveResult = rv;
00240     rv = NS_OK;
00241 
00242     // finish encoder, give it a chance to write extra data like escape sequences
00243     dstLength = bufferLength - pos2;
00244     rv = mEncoder->Finish(&dstPtr[pos2], &dstLength);
00245     if (NS_SUCCEEDED(rv)) {
00246       pos2 += dstLength;
00247       dstPtr[pos2] = '\0';
00248     }
00249 
00250     srcLength = inStringLength - pos1;
00251 
00252     // do the fallback
00253     if (!ATTR_NO_FALLBACK(mAttribute)) {
00254       PRUint32 unMappedChar;
00255       if (IS_HIGH_SURROGATE(inString[pos1-1]) && 
00256           inStringLength > pos1 && IS_LOW_SURROGATE(inString[pos1])) {
00257         unMappedChar = SURROGATE_TO_UCS4(inString[pos1-1], inString[pos1]);
00258         pos1++;
00259       } else {
00260         unMappedChar = inString[pos1-1];
00261       }
00262 
00263       // if we're asked to ignore default ignorable code points, skip them.
00264       if (MASK_IGNORABLE_FALLBACK(mAttribute) &&
00265           CCMAP_HAS_CHAR_EXT(gIgnorableCCMapExt, unMappedChar)) 
00266                             continue;
00267 
00268       rv = mEncoder->GetMaxLength(inString+pos1, inStringLength-pos1, &dstLength);
00269       if (NS_FAILED(rv)) 
00270         break;
00271 
00272       rv = HandleFallBack(unMappedChar, &dstPtr, &bufferLength, &pos2, dstLength);
00273       if (NS_FAILED(rv)) 
00274         break;
00275       dstPtr[pos2] = '\0';
00276     }
00277   }
00278 
00279   if (NS_SUCCEEDED(rv)) {
00280     // finish encoder, give it a chance to write extra data like escape sequences
00281     dstLength = bufferLength - pos2;
00282     rv = mEncoder->Finish(&dstPtr[pos2], &dstLength);
00283     if (NS_SUCCEEDED(rv)) {
00284       pos2 += dstLength;
00285       dstPtr[pos2] = '\0';
00286     }
00287   }
00288 
00289   if (NS_FAILED(rv)) {
00290     PR_FREEIF(dstPtr);
00291     return rv;
00292   }
00293 
00294   *outString = dstPtr;      // set the result string
00295 
00296   // set error code so that the caller can do own fall back
00297   if (NS_ERROR_UENC_NOMAPPING == saveResult) {
00298     rv = NS_ERROR_UENC_NOMAPPING;
00299   }
00300 
00301   return rv;
00302 }
00303 
00304 NS_IMETHODIMP
00305 nsSaveAsCharset::DoConversionFallBack(PRUint32 inUCS4, char *outString, PRInt32 bufferLength)
00306 {
00307   NS_ASSERTION(outString, "invalid input");
00308   if(nsnull == outString )
00309     return NS_ERROR_NULL_POINTER;
00310 
00311   *outString = '\0';
00312 
00313   nsresult rv = NS_OK;
00314 
00315   if (ATTR_NO_FALLBACK(mAttribute)) {
00316     return NS_OK;
00317   }
00318   if (attr_EntityAfterCharsetConv == MASK_ENTITY(mAttribute)) {
00319     char *entity = NULL;
00320     rv = mEntityConverter->ConvertUTF32ToEntity(inUCS4, mEntityVersion, &entity);
00321     if (NS_SUCCEEDED(rv)) {
00322       if (NULL == entity || (PRInt32)strlen(entity) > bufferLength) {
00323         return NS_ERROR_OUT_OF_MEMORY;
00324       }
00325       PL_strcpy(outString, entity);
00326       nsMemory::Free(entity);
00327       return rv;
00328     }
00329   }
00330 
00331   switch (MASK_FALLBACK(mAttribute)) {
00332   case attr_FallbackQuestionMark:
00333     if(bufferLength>=2) {
00334       *outString++='?';
00335       *outString='\0';
00336       rv = NS_OK;
00337     } else {
00338       rv = NS_ERROR_FAILURE;
00339     }
00340     break;
00341   case attr_FallbackEscapeU:
00342     if (inUCS4 & 0xff0000)
00343       rv = (PR_snprintf(outString, bufferLength, "\\u%.6x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
00344     else
00345       rv = (PR_snprintf(outString, bufferLength, "\\u%.4x", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
00346     break;
00347   case attr_FallbackDecimalNCR:
00348     rv = ( PR_snprintf(outString, bufferLength, "&#%u;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
00349     break;
00350   case attr_FallbackHexNCR:
00351     rv = (PR_snprintf(outString, bufferLength, "&#x%x;", inUCS4) > 0) ? NS_OK : NS_ERROR_FAILURE;
00352     break;
00353   case attr_FallbackNone:
00354     rv = NS_OK;
00355     break;
00356   default:
00357     rv = NS_ERROR_ILLEGAL_VALUE;
00358     break;
00359   }
00360 
00361        return rv;
00362 }
00363 
00364 nsresult nsSaveAsCharset::SetupUnicodeEncoder(const char* charset)
00365 {
00366   NS_ENSURE_ARG(charset);
00367   nsresult rv;
00368 
00369   // set up unicode encoder
00370   nsCOMPtr <nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
00371   NS_ENSURE_SUCCESS(rv, rv);
00372 
00373   return ccm->GetUnicodeEncoder(charset, getter_AddRefs(mEncoder));
00374 }
00375 
00376 nsresult nsSaveAsCharset::SetupCharsetList(const char *charsetList)
00377 {
00378   NS_ENSURE_ARG(charsetList);
00379 
00380   NS_ASSERTION(charsetList[0], "charsetList should not be empty");
00381   if (!charsetList[0])
00382     return NS_ERROR_INVALID_ARG;
00383 
00384   if (mCharsetListIndex >= 0) {
00385     mCharsetList.Clear();
00386     mCharsetListIndex = -1;
00387   }
00388 
00389   mCharsetList.ParseString(charsetList, ", ");
00390 
00391   return NS_OK;
00392 }
00393 
00394 const char * nsSaveAsCharset::GetNextCharset()
00395 {
00396   if ((mCharsetListIndex + 1) >= mCharsetList.Count())
00397     return nsnull;
00398 
00399   // bump the index and return the next charset
00400   return mCharsetList[++mCharsetListIndex]->get();
00401 }
00402 
00404 
00405 nsresult 
00406 NS_NewSaveAsCharset(nsISupports **inst)
00407 {
00408   if(nsnull == inst )
00409     return NS_ERROR_NULL_POINTER;
00410   *inst = (nsISupports *) new nsSaveAsCharset;
00411    if(*inst)
00412       NS_ADDREF(*inst);
00413    return (*inst) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
00414 }