Back to index

lightning-sunbird  0.9+nobinonly
nsPSMDetectors.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Pierre Phaneuf <pp@ludusdesign.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 
00040 #include <math.h>
00041 #include <stdio.h>
00042 //---- for XPCOM
00043 #include "nsIFactory.h"
00044 #include "nsIGenericFactory.h"
00045 #include "nsISupports.h"
00046 #include "nsCharDetDll.h"
00047 #include "pratom.h"
00048 #include "nsPSMDetectors.h"
00049 
00050 nsEUCStatistics gBig5Statistics = 
00051 #include "Big5Statistics.h"
00052 // end of UECTWStatistics.h include
00053 
00054 nsEUCStatistics gEUCTWStatistics = 
00055 #include "EUCTWStatistics.h"
00056 // end of UECTWStatistics.h include
00057 
00058 nsEUCStatistics gGB2312Statistics = 
00059 #include "GB2312Statistics.h"
00060 // end of GB2312Statistics.h include
00061 
00062 nsEUCStatistics gEUCJPStatistics = 
00063 #include "EUCJPStatistics.h"
00064 // end of EUCJPStatistics.h include
00065 
00066 nsEUCStatistics gEUCKRStatistics = 
00067 #include "EUCKRStatistics.h"
00068 // end of EUCKRStatistics.h include
00069 
00070 //==========================================================
00071 /*
00072    This class won't detect x-euc-tw for now. It can  only 
00073    tell a Big5 document is not x-euc-tw , but cannot tell
00074    a x-euc-tw docuement is not Big5 unless we hit characters
00075    defined in CNS 11643 plane 2.
00076    
00077    May need improvement ....
00078  */
00079 
00080 nsVerifier* const gZhTwVerifierSet[ZHTW_DETECTOR_NUM_VERIFIERS] = {
00081       &nsUTF8Verifier,
00082       &nsBIG5Verifier,
00083       &nsISO2022CNVerifier,
00084       &nsEUCTWVerifier,
00085       &nsCP1252Verifier,
00086       &nsUCS2BEVerifier,
00087       &nsUCS2LEVerifier
00088 };
00089 
00090 nsEUCStatistics* const gZhTwStatisticsSet[ZHTW_DETECTOR_NUM_VERIFIERS] = {
00091       nsnull,
00092       &gBig5Statistics,
00093       nsnull,
00094       &gEUCTWStatistics,
00095       nsnull,
00096       nsnull,
00097       nsnull
00098 };
00099 
00100 //==========================================================
00101 
00102 nsVerifier* const gKoVerifierSet[KO_DETECTOR_NUM_VERIFIERS] = {
00103       &nsUTF8Verifier,
00104       &nsEUCKRVerifier,
00105       &nsISO2022KRVerifier,
00106       &nsCP1252Verifier,
00107       &nsUCS2BEVerifier,
00108       &nsUCS2LEVerifier
00109 };
00110 
00111 //==========================================================
00112 
00113 nsVerifier* const gZhCnVerifierSet[ZHCN_DETECTOR_NUM_VERIFIERS] = {
00114       &nsUTF8Verifier,
00115       &nsGB2312Verifier,
00116       &nsGB18030Verifier,
00117       &nsISO2022CNVerifier,
00118       &nsHZVerifier,
00119       &nsCP1252Verifier,
00120       &nsUCS2BEVerifier,
00121       &nsUCS2LEVerifier
00122 };
00123 
00124 //==========================================================
00125 
00126 nsVerifier* const gJaVerifierSet[JA_DETECTOR_NUM_VERIFIERS] = {
00127       &nsUTF8Verifier,
00128       &nsSJISVerifier,
00129       &nsEUCJPVerifier,
00130       &nsISO2022JPVerifier,
00131       &nsCP1252Verifier,
00132       &nsUCS2BEVerifier,
00133       &nsUCS2LEVerifier
00134 };
00135 
00136 //==========================================================
00137 
00138 nsVerifier* const gZhVerifierSet[ZH_DETECTOR_NUM_VERIFIERS] = {
00139       &nsUTF8Verifier,
00140       &nsGB2312Verifier,
00141       &nsGB18030Verifier,
00142       &nsBIG5Verifier,
00143       &nsISO2022CNVerifier,
00144       &nsHZVerifier,
00145       &nsEUCTWVerifier,
00146       &nsCP1252Verifier,
00147       &nsUCS2BEVerifier,
00148       &nsUCS2LEVerifier
00149 };
00150 
00151 nsEUCStatistics* const gZhStatisticsSet[ZH_DETECTOR_NUM_VERIFIERS] = {
00152       nsnull,
00153       &gGB2312Statistics,
00154       &gBig5Statistics,
00155       nsnull,
00156       nsnull,
00157       &gEUCTWStatistics,
00158       nsnull,
00159       nsnull,
00160       nsnull
00161 };
00162 
00163 //==========================================================
00164 
00165 nsVerifier* const gCJKVerifierSet[CJK_DETECTOR_NUM_VERIFIERS] = {
00166       &nsUTF8Verifier,
00167       &nsSJISVerifier,
00168       &nsEUCJPVerifier,
00169       &nsISO2022JPVerifier,
00170       &nsEUCKRVerifier,
00171       &nsISO2022KRVerifier,
00172       &nsBIG5Verifier,
00173       &nsEUCTWVerifier,
00174       &nsGB2312Verifier,
00175       &nsGB18030Verifier,
00176       &nsISO2022CNVerifier,
00177       &nsHZVerifier,
00178       &nsCP1252Verifier,
00179       &nsUCS2BEVerifier,
00180       &nsUCS2LEVerifier
00181 };
00182 
00183 nsEUCStatistics* const gCJKStatisticsSet[CJK_DETECTOR_NUM_VERIFIERS] = {
00184       nsnull,
00185       nsnull,
00186       &gEUCJPStatistics,
00187       nsnull,
00188       &gEUCKRStatistics,
00189       nsnull,
00190       &gBig5Statistics,
00191       &gEUCTWStatistics,
00192       &gGB2312Statistics,
00193       nsnull,
00194       nsnull,
00195       nsnull,
00196       nsnull,
00197       nsnull
00198 };
00199 
00200 PRBool nsEUCSampler::Sample(const char* aIn, PRUint32 aLen)
00201 {
00202     if(mState == 1)
00203         return PR_FALSE;
00204     const unsigned char* p = (const unsigned char*) aIn;
00205     if(aLen + mTotal > 0x80000000) 
00206        aLen = 0x80000000 - mTotal;
00207 
00208      PRUint32 i;
00209      for(i=0; (i<aLen) && (1 != mState) ;i++,p++)
00210      {
00211         switch(mState) {
00212            case 0:
00213              if( *p & 0x0080)  
00214              {
00215                 if((0x00ff == *p) || ( 0x00a1 > *p)) {
00216                    mState = 1;
00217                 } else {
00218                    mTotal++;
00219                    mFirstByteCnt[*p - 0x00a1]++;
00220                    mState = 2;
00221                 }
00222              }
00223              break;
00224            case 1:
00225              break;
00226            case 2:
00227              if( *p & 0x0080)  
00228              {
00229                 if((0x00ff == *p) || ( 0x00a1 > *p)) {
00230                    mState = 1;
00231                 } else {
00232                    mTotal++;
00233                    mSecondByteCnt[*p - 0x00a1]++;
00234                    mState = 0;
00235                 }
00236              } else {
00237                 mState = 1;
00238              }
00239              break;
00240            default:
00241              mState = 1;
00242         }
00243      }
00244    return (1 != mState  );
00245 }
00246 float nsEUCSampler::GetScore(const float* aFirstByteFreq, float aFirstByteWeight,
00247                      const float* aSecondByteFreq, float aSecondByteWeight)
00248 {
00249    return aFirstByteWeight * GetScore(aFirstByteFreq, mFirstByteFreq) +
00250           aSecondByteWeight * GetScore(aSecondByteFreq, mSecondByteFreq);
00251 }
00252 
00253 float nsEUCSampler::GetScore(const float* array1, const float* array2)
00254 {
00255    float s;
00256    float sum=0.0;
00257    PRUint16 i;
00258    for(i=0;i<94;i++) {
00259      s = array1[i] - array2[i];
00260      sum += s * s;
00261    }
00262    return (float)sqrt((double)sum) / 94.0f;
00263 }
00264 
00265 void nsEUCSampler::CalFreq()
00266 {
00267    PRUint32 i;
00268    for(i = 0 ; i < 94; i++) {
00269       mFirstByteFreq[i] = (float)mFirstByteCnt[i] / (float)mTotal;
00270       mSecondByteFreq[i] = (float)mSecondByteCnt[i] / (float)mTotal;
00271    }
00272 }
00273 
00274 //----------------------------------------------------------
00275 NS_IMPL_ISUPPORTS1(nsXPCOMDetector, nsICharsetDetector)
00276 NS_IMPL_ISUPPORTS1(nsXPCOMStringDetector, nsIStringCharsetDetector)
00277 nsPSMDetector::nsPSMDetector(PRUint8 aItems, nsVerifier* const * aVerifierSet, nsEUCStatistics* const * aStatisticsSet)
00278 {
00279   mClassRunSampler = (nsnull != aStatisticsSet);
00280   mStatisticsData = aStatisticsSet;
00281   mVerifier = aVerifierSet;
00282 
00283   mClassItems = aItems;
00284   Reset();
00285 }
00286 void nsPSMDetector::Reset()
00287 {
00288   mRunSampler = mClassRunSampler;
00289   mDone= PR_FALSE;
00290   mItems = mClassItems;
00291   NS_ASSERTION(MAX_VERIFIERS >= mItems , "MAX_VERIFIERS is too small!");
00292   for(PRUint8 i = 0; i < mItems ; i++)
00293   {
00294      mState[i] = 0;
00295      mItemIdx[i] = i;
00296   }
00297 #ifdef DETECTOR_DEBUG
00298   mDbgLen = mDbgTest = 0;
00299 #endif   
00300 }
00301 //----------------------------------------------------------
00302 void nsPSMDetector::DataEnd()
00303 {
00304   // since gb18030 covers almost all code points in big5, sjis, euc-xx, 
00305   // it effectively make other verifiers unusable. Gb18030 is not 
00306   // very popular, and it could reach Itsme state. We need to eliminate
00307   // gb18030 when there are only 2 candidates left. 
00308   if (mItems == 2) {
00309     if ((&nsGB18030Verifier) == mVerifier[mItemIdx[0]]) {
00310       Report( mVerifier[mItemIdx[1]]->charset);
00311       mDone = PR_TRUE;
00312     } else if ((&nsGB18030Verifier) == mVerifier[mItemIdx[1]]) {
00313       Report( mVerifier[mItemIdx[0]]->charset);
00314       mDone = PR_TRUE;
00315     }
00316   }
00317   if(mRunSampler)
00318      Sample(nsnull, 0, PR_TRUE);
00319 }
00320 //----------------------------------------------------------
00321 
00322 // #define ftang_TRACE_STATE
00323 // #define TRACE_VERIFIER nsCP1252Verifier
00324 
00325 PRBool nsPSMDetector::HandleData(const char* aBuf, PRUint32 aLen)
00326 {
00327   PRUint32 i,j;
00328   PRUint32 st;
00329   for(i=0; i < aLen; i++)
00330   {
00331      char b = aBuf[i];
00332      for(j = 0; j < mItems; )
00333      {
00334 #ifdef ftang_TRACE_STATE
00335        if(  mVerifier[mItemIdx[j]] == & TRACE_VERIFIER )
00336        {
00337            printf("%d = %d\n", i + mDbgLen, mState[j]);
00338        }
00339 #endif
00340 #ifdef DETECTOR_DEBUG
00341         mDbgTest++;
00342 #endif 
00343         st = GETNEXTSTATE( mVerifier[mItemIdx[j]], b, mState[j] );
00344         if(eItsMe == st) 
00345         {
00346 #ifdef DETECTOR_DEBUG
00347             printf("It's %s- byte %d(%x) test %d\n", 
00348                     mVerifier[mItemIdx[j]]->charset,
00349                     i+mDbgLen,
00350                     i+mDbgLen,
00351                     mDbgTest
00352                   );
00353 #endif
00354             Report( mVerifier[mItemIdx[j]]->charset);
00355             mDone = PR_TRUE;
00356             return mDone;
00357         } else if (eError == st) 
00358         {
00359 #ifdef DETECTOR_DEBUG
00360             printf("It's NOT %s- byte %d(%x)\n", 
00361                     mVerifier[mItemIdx[j]]->charset,
00362                     i+mDbgLen,
00363                     i+mDbgLen);
00364 #endif
00365             mItems--;
00366             if(j < mItems )
00367             {
00368                 mItemIdx[j] = mItemIdx[mItems];
00369                 mState[j] = mState[mItems];
00370             } 
00371         } else {
00372             mState[j++] = st;
00373         } 
00374      }
00375      if( mItems <= 1) 
00376      {
00377          if( 1 == mItems) {
00378 #ifdef DETECTOR_DEBUG
00379              printf("It's %s- byte %d (%x) Test %d. The only left\n", 
00380                        mVerifier[mItemIdx[0]]->charset,
00381                        i+mDbgLen,
00382                        i+mDbgLen,
00383                        mDbgTest);
00384 #endif
00385              Report( mVerifier[mItemIdx[0]]->charset);
00386          }
00387          mDone = PR_TRUE;
00388          return mDone;
00389      } else {
00390         // If the only charset left is UCS2LE/UCS2BE and another, report the other
00391         PRInt32 nonUCS2Num=0;
00392         PRInt32 nonUCS2Idx=0;
00393         for(j = 0; j < mItems; j++) {
00394              if(((&nsUCS2BEVerifier) != mVerifier[mItemIdx[j]]) &&
00395                 ((&nsUCS2LEVerifier) != mVerifier[mItemIdx[j]])) {
00396                   nonUCS2Num++;
00397                   nonUCS2Idx = j;
00398              }
00399         }
00400         if(1 == nonUCS2Num) {
00401 #ifdef DETECTOR_DEBUG
00402              printf("It's %s- byte %d (%x) Test %d. The only left except UCS2LE/BE\n", 
00403                        mVerifier[mItemIdx[nonUCS2Idx]]->charset,
00404                        i+mDbgLen,
00405                        i+mDbgLen,
00406                        mDbgTest);
00407 #endif
00408             Report( mVerifier[mItemIdx[nonUCS2Idx]]->charset);
00409             mDone = PR_TRUE;
00410             return mDone;
00411         }
00412      }
00413   }
00414   if(mRunSampler)
00415      Sample(aBuf, aLen);
00416 
00417 #ifdef DETECTOR_DEBUG
00418   mDbgLen += aLen;
00419 #endif
00420   return PR_FALSE;
00421 }
00422 
00423 void nsPSMDetector::Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance)
00424 {
00425      PRInt32 possibleCandidateNum=0;
00426      PRInt32 j;
00427      PRInt32 eucNum=0;
00428      for(j = 0; j < mItems; j++) {
00429         if(nsnull != mStatisticsData[mItemIdx[j]]) 
00430              eucNum++;
00431         if(((&nsUCS2BEVerifier) != mVerifier[mItemIdx[j]]) &&
00432                 ((&nsUCS2LEVerifier) != mVerifier[mItemIdx[j]]) &&
00433                 ((&nsGB18030Verifier) != mVerifier[mItemIdx[j]]) ) {
00434                   possibleCandidateNum++;
00435         }
00436      }
00437      mRunSampler = (eucNum > 1);
00438      if(mRunSampler) {
00439         mRunSampler = mSampler.Sample(aBuf, aLen);
00440         if(((aLastChance && mSampler.GetSomeData()) || 
00441             mSampler.EnoughData())
00442            && (eucNum == possibleCandidateNum)) {
00443           mSampler.CalFreq();
00444 #ifdef DETECTOR_DEBUG
00445           printf("We cannot figure out charset from the encoding, "
00446                  "All EUC based charset share the same encoding structure.\n"
00447                  "Detect based on statistics"); 
00448           if(aLastChance) {
00449              printf(" after we receive all the data.\n"); 
00450           } else {
00451              printf(" after we receive enough data.\n");
00452           }
00453 #endif
00454           PRInt32 bestIdx = -1;
00455           PRInt32 eucCnt=0;
00456           float bestScore = 0.0f;
00457           for(j = 0; j < mItems; j++) {
00458              if((nsnull != mStatisticsData[mItemIdx[j]])  &&
00459                 (&gBig5Statistics != mStatisticsData[mItemIdx[j]]))
00460              {
00461                 float score = mSampler.GetScore(
00462                    mStatisticsData[mItemIdx[j]]->mFirstByteFreq,
00463                    mStatisticsData[mItemIdx[j]]->mFirstByteWeight,
00464                    mStatisticsData[mItemIdx[j]]->mSecoundByteFreq,
00465                    mStatisticsData[mItemIdx[j]]->mSecoundByteWeight );
00466 #ifdef DETECTOR_DEBUG
00467                 printf("Differences between %s and this data is %2.8f\n",
00468                        mVerifier[mItemIdx[j]]->charset,
00469                        score);
00470 #endif
00471                 if(( 0 == eucCnt++) || (bestScore > score )) {
00472                    bestScore = score;
00473                    bestIdx = j;
00474                 } // if(( 0 == eucCnt++) || (bestScore > score )) 
00475             } // if(nsnull != ...)
00476          } // for
00477          if (bestIdx >= 0)
00478          {
00479 #ifdef DETECTOR_DEBUG
00480            printf("Based on the statistic, we decide it is %s",
00481             mVerifier[mItemIdx[bestIdx]]->charset);
00482 #endif
00483            Report( mVerifier[mItemIdx[bestIdx]]->charset);
00484            mDone = PR_TRUE;
00485          }
00486        } // if (eucNum == possibleCandidateNum)
00487      } // if(mRunSampler)
00488 }
00489 //----------------------------------------------------------
00490 nsXPCOMDetector::nsXPCOMDetector(PRUint8 aItems, nsVerifier * const *aVer, nsEUCStatistics* const * aStatisticsSet)
00491    : nsPSMDetector( aItems, aVer, aStatisticsSet)
00492 {
00493   mObserver = nsnull;
00494 }
00495 //----------------------------------------------------------
00496 nsXPCOMDetector::~nsXPCOMDetector()
00497 {
00498 }
00499 //----------------------------------------------------------
00500 NS_IMETHODIMP nsXPCOMDetector::Init(
00501   nsICharsetDetectionObserver* aObserver)
00502 {
00503   NS_ASSERTION(mObserver == nsnull , "Init twice");
00504   if(nsnull == aObserver)
00505      return NS_ERROR_ILLEGAL_VALUE;
00506 
00507   mObserver = aObserver;
00508   return NS_OK;
00509 }
00510 //----------------------------------------------------------
00511 NS_IMETHODIMP nsXPCOMDetector::DoIt(
00512   const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe)
00513 {
00514   NS_ASSERTION(mObserver != nsnull , "have not init yet");
00515 
00516   if((nsnull == aBuf) || (nsnull == oDontFeedMe))
00517      return NS_ERROR_ILLEGAL_VALUE;
00518 
00519   this->HandleData(aBuf, aLen);
00520   *oDontFeedMe = mDone;
00521   return NS_OK;
00522 }
00523 //----------------------------------------------------------
00524 NS_IMETHODIMP nsXPCOMDetector::Done()
00525 {
00526   NS_ASSERTION(mObserver != nsnull , "have not init yet");
00527   this->DataEnd();
00528   return NS_OK;
00529 }
00530 //----------------------------------------------------------
00531 void nsXPCOMDetector::Report(const char* charset)
00532 {
00533   mObserver->Notify(charset, eSureAnswer);
00534 }
00535 //----------------------------------------------------------
00536 nsXPCOMStringDetector::nsXPCOMStringDetector(PRUint8 aItems, nsVerifier* const * aVer, nsEUCStatistics* const * aStatisticsSet)
00537    : nsPSMDetector( aItems, aVer, aStatisticsSet)
00538 {
00539 }
00540 //----------------------------------------------------------
00541 nsXPCOMStringDetector::~nsXPCOMStringDetector()
00542 {
00543 }
00544 //----------------------------------------------------------
00545 void nsXPCOMStringDetector::Report(const char* charset)
00546 {
00547   mResult = charset;
00548 }
00549 //----------------------------------------------------------
00550 NS_IMETHODIMP nsXPCOMStringDetector::DoIt(const char* aBuf, PRUint32 aLen, 
00551                    const char** oCharset, 
00552                    nsDetectionConfident &oConfident)
00553 {
00554   mResult = nsnull;
00555   this->HandleData(aBuf, aLen);
00556 
00557   if( nsnull == mResult) {
00558      // If we have no result and detector is done - answer no match
00559      if(mDone) 
00560      {
00561         *oCharset = nsnull;
00562         oConfident = eNoAnswerMatch;
00563      } else {
00564         // if we have no answer force the Done method and find the answer
00565         // if we find one, return it as eBestAnswer
00566         this->DataEnd();
00567         *oCharset = mResult;
00568         oConfident = (mResult) ? eBestAnswer : eNoAnswerMatch ;
00569      }
00570   } else {
00571      // If we have answer, return as eSureAnswer
00572      *oCharset = mResult;
00573      oConfident = eSureAnswer;
00574   }
00575   this->Reset();
00576   return NS_OK;
00577 }
00578