Back to index

lightning-sunbird  0.9+nobinonly
nsPSMDetectors.h
Go to the documentation of this file.
00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either of the GNU General Public License Version 2 or later (the "GPL"),
00026  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 #ifndef nsPSMDetectors_h__
00038 #define nsPSMDetectors_h__
00039 
00040 #include "nsCOMPtr.h"
00041 #include "nsIFactory.h"
00042 #include "nsVerifier.h"
00043 //---- for verifiers
00044 #include "nsSJISVerifier.h"
00045 #include "nsEUCJPVerifier.h"
00046 #include "nsCP1252Verifier.h"
00047 #include "nsUTF8Verifier.h"
00048 #include "nsISO2022JPVerifier.h"
00049 #include "nsISO2022KRVerifier.h"
00050 #include "nsISO2022CNVerifier.h"
00051 #include "nsHZVerifier.h"
00052 #include "nsUCS2BEVerifier.h"
00053 #include "nsUCS2LEVerifier.h"
00054 #include "nsBIG5Verifier.h"
00055 #include "nsGB2312Verifier.h"
00056 #include "nsGB18030Verifier.h"
00057 #include "nsEUCTWVerifier.h"
00058 #include "nsEUCKRVerifier.h"
00059 //---- end verifiers
00060 
// Uncomment to compile the debug-only members (mDbgTest, mDbgLen) into
// nsPSMDetector.
//#define DETECTOR_DEBUG

// Size of the fixed per-verifier arrays (mState, mItemIdx) in nsPSMDetector.
// NOTE(review): every *_DETECTOR_NUM_VERIFIERS count is presumably required
// to stay at or below this value -- confirm against the constructor.
#define MAX_VERIFIERS 16
00064 
// Class IDs for the detector components declared in this header. Each macro
// expands to a brace initializer: one 32-bit field, two 16-bit fields and
// eight trailing bytes. The trailing bytes are the same for every ID.

// {12BB8F1B-2389-11d3-B3BF-00805F8A6670}
#define NS_JA_PSMDETECTOR_CID                                   \
  { 0x12bb8f1b, 0x2389, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {12BB8F1C-2389-11d3-B3BF-00805F8A6670}
#define NS_JA_STRING_PSMDETECTOR_CID                            \
  { 0x12bb8f1c, 0x2389, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {EA06D4E1-2B3D-11d3-B3BF-00805F8A6670}
#define NS_KO_PSMDETECTOR_CID                                   \
  { 0xea06d4e1, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {EA06D4E2-2B3D-11d3-B3BF-00805F8A6670}
#define NS_ZHCN_PSMDETECTOR_CID                                 \
  { 0xea06d4e2, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {EA06D4E3-2B3D-11d3-B3BF-00805F8A6670}
#define NS_ZHTW_PSMDETECTOR_CID                                 \
  { 0xea06d4e3, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }


// {EA06D4E4-2B3D-11d3-B3BF-00805F8A6670}
#define NS_KO_STRING_PSMDETECTOR_CID                            \
  { 0xea06d4e4, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {EA06D4E5-2B3D-11d3-B3BF-00805F8A6670}
#define NS_ZHCN_STRING_PSMDETECTOR_CID                          \
  { 0xea06d4e5, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {EA06D4E6-2B3D-11d3-B3BF-00805F8A6670}
#define NS_ZHTW_STRING_PSMDETECTOR_CID                          \
  { 0xea06d4e6, 0x2b3d, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }


// {FCACEF21-2B40-11d3-B3BF-00805F8A6670}
#define NS_ZH_STRING_PSMDETECTOR_CID                            \
  { 0xfcacef21, 0x2b40, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {FCACEF22-2B40-11d3-B3BF-00805F8A6670}
#define NS_CJK_STRING_PSMDETECTOR_CID                           \
  { 0xfcacef22, 0x2b40, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }


// {FCACEF23-2B40-11d3-B3BF-00805F8A6670}
#define NS_ZH_PSMDETECTOR_CID                                   \
  { 0xfcacef23, 0x2b40, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }

// {FCACEF24-2B40-11d3-B3BF-00805F8A6670}
#define NS_CJK_PSMDETECTOR_CID                                  \
  { 0xfcacef24, 0x2b40, 0x11d3,                                 \
    { 0xb3, 0xbf, 0x00, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
00115 
// Byte-frequency statistics record, kept separately for the first and the
// second byte. The field order is significant for any positional aggregate
// initializer, and the "Secound" spelling is part of the existing field names
// and must be preserved.
// NOTE(review): the 94 slots presumably correspond to the 94 byte values
// 0xA1..0xFE -- confirm against the sampler implementation.
typedef struct {
  /* first byte */
  float mFirstByteFreq[94];
  float mFirstByteStdDev;
  float mFirstByteMean;
  float mFirstByteWeight;

  /* second byte */
  float mSecoundByteFreq[94];
  float mSecoundByteStdDev;
  float mSecoundByteMean;
  float mSecoundByteWeight;
} nsEUCStatisticsMutable;

// Read-only view of the record; the statistics tables declared in this header
// are arrays of pointers to this const type.
typedef const nsEUCStatisticsMutable nsEUCStatistics;
00128 
/*
extern nsEUCStatistics gBig5Statistics = 
#include "Big5Statistics.h"
// end of Big5Statistics.h include

static nsEUCStatistics gEUCTWStatistics = 
#include "EUCTWStatistics.h"
// end of EUCTWStatistics.h include

static nsEUCStatistics gGB2312Statistics = 
#include "GB2312Statistics.h"
// end of GB2312Statistics.h include

static nsEUCStatistics gEUCJPStatistics = 
#include "EUCJPStatistics.h"
// end of EUCJPStatistics.h include

static nsEUCStatistics gEUCKRStatistics = 
#include "EUCKRStatistics.h"
// end of EUCKRStatistics.h include
*/
00150 
00151 //==========================================================
/*
   This class won't detect x-euc-tw for now. It can only
   tell that a Big5 document is not x-euc-tw, but it cannot tell
   that an x-euc-tw document is not Big5 unless we hit characters
   defined in CNS 11643 plane 2.

   May need improvement ....
 */
00160 #define ZHTW_DETECTOR_NUM_VERIFIERS 7
00161 extern nsVerifier* const gZhTwVerifierSet[];
00162 extern nsEUCStatistics* const gZhTwStatisticsSet[];
00163 
00164 #define KO_DETECTOR_NUM_VERIFIERS 6
00165 extern nsVerifier* const gKoVerifierSet[];
00166 
00167 #define ZHCN_DETECTOR_NUM_VERIFIERS 8
00168 extern nsVerifier* const gZhCnVerifierSet[];
00169 
00170 #define JA_DETECTOR_NUM_VERIFIERS 7
00171 extern nsVerifier* const gJaVerifierSet[];
00172 
00173 #define ZH_DETECTOR_NUM_VERIFIERS 10
00174 extern nsVerifier* const gZhVerifierSet[];
00175 extern nsEUCStatistics* const gZhStatisticsSet[];
00176 
00177 #define CJK_DETECTOR_NUM_VERIFIERS 15
00178 extern nsVerifier* const gCJKVerifierSet[];
00179 extern nsEUCStatistics* const gCJKStatisticsSet[];
00180 
00181 class nsEUCSampler {
00182   public:
00183     nsEUCSampler() {
00184       mTotal =0;
00185       mThreshold = 200;
00186          mState = 0;
00187       PRInt32 i;
00188       for(i=0;i<94;i++)
00189           mFirstByteCnt[i] = mSecondByteCnt[i]=0;
00190     }
00191     PRBool EnoughData()  { return mTotal > mThreshold; }
00192     PRBool GetSomeData() { return mTotal > 1; }
00193     PRBool Sample(const char* aIn, PRUint32 aLen);
00194     void   CalFreq();
00195     float   GetScore(const float* aFirstByteFreq, float aFirstByteWeight,
00196                      const float* aSecondByteFreq, float aSecondByteWeight);
00197     float   GetScore(const float* array1, const float* array2);
00198   private:
00199     PRUint32 mTotal;
00200     PRUint32 mThreshold;
00201     PRInt8 mState;
00202     PRUint32 mFirstByteCnt[94];
00203     PRUint32 mSecondByteCnt[94];
00204     float mFirstByteFreq[94];
00205     float mSecondByteFreq[94];
00206    
00207 };
00208 
/*
 In the current design, we know the following combinations of verifiers
 are not good:

 1. Two or more of the following verifiers in one detector:
      nsEUCJPVerifier
      nsGB2312Verifier
      nsEUCKRVerifier
      nsEUCTWVerifier
      nsBIG5Verifier

 */
00221 //----------------------------------------------------------
00222 class nsPSMDetector {
00223 public :
00224    nsPSMDetector(PRUint8 aItems, nsVerifier* const * aVerifierSet, nsEUCStatistics* const * aStatisticsSet);
00225    virtual ~nsPSMDetector() {};
00226 
00227    virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
00228    virtual void   DataEnd();
00229  
00230 protected:
00231    virtual void Report(const char* charset) = 0;
00232 
00233    PRUint8 mItems;
00234    PRUint8 mClassItems;
00235    PRUint8 mState[MAX_VERIFIERS];
00236    PRUint8 mItemIdx[MAX_VERIFIERS];
00237    nsVerifier* const * mVerifier;
00238    nsEUCStatistics* const * mStatisticsData;
00239    PRBool mDone;
00240 
00241    PRBool mRunSampler;
00242    PRBool mClassRunSampler;
00243 protected:
00244    void Reset();
00245    void Sample(const char* aBuf, PRUint32 aLen, PRBool aLastChance=PR_FALSE);
00246 private:
00247 #ifdef DETECTOR_DEBUG
00248    PRUint32 mDbgTest;
00249    PRUint32 mDbgLen;
00250 #endif
00251    nsEUCSampler mSampler;
00252 
00253 };
00254 
00255 class nsXPCOMDetector : 
00256       private nsPSMDetector,
00257       public nsICharsetDetector // Implement the interface 
00258 {
00259   NS_DECL_ISUPPORTS
00260 public:
00261     nsXPCOMDetector(PRUint8 aItems, nsVerifier* const * aVer, nsEUCStatistics* const * aStatisticsSet);
00262     virtual ~nsXPCOMDetector();
00263   NS_IMETHOD Init(nsICharsetDetectionObserver* aObserver);
00264   NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe);
00265   NS_IMETHOD Done();
00266 
00267 protected:
00268   virtual void Report(const char* charset);
00269 
00270 private:
00271   nsCOMPtr<nsICharsetDetectionObserver> mObserver;
00272 };
00273 
00274 class nsXPCOMStringDetector : 
00275       private nsPSMDetector,
00276       public nsIStringCharsetDetector // Implement the interface 
00277 {
00278   NS_DECL_ISUPPORTS
00279 public:
00280     nsXPCOMStringDetector(PRUint8 aItems, nsVerifier* const * aVer, nsEUCStatistics* const * aStatisticsSet);
00281     virtual ~nsXPCOMStringDetector();
00282     NS_IMETHOD DoIt(const char* aBuf, PRUint32 aLen, 
00283                    const char** oCharset, 
00284                    nsDetectionConfident &oConfident);
00285 protected:
00286   virtual void Report(const char* charset);
00287 private:
00288   const char* mResult;
00289 };
00290 
00291 class nsJAPSMDetector : public nsXPCOMDetector
00292 {
00293 public:
00294   nsJAPSMDetector() 
00295     : nsXPCOMDetector(JA_DETECTOR_NUM_VERIFIERS, gJaVerifierSet, nsnull) {};
00296 };
00297 
00298 class nsJAStringPSMDetector : public nsXPCOMStringDetector
00299 {
00300 public:
00301   nsJAStringPSMDetector() 
00302     : nsXPCOMStringDetector(JA_DETECTOR_NUM_VERIFIERS - 3, gJaVerifierSet, nsnull) {};
00303 };
00304 
00305 class nsKOPSMDetector : public nsXPCOMDetector
00306 {
00307 public:
00308   nsKOPSMDetector() 
00309     : nsXPCOMDetector(KO_DETECTOR_NUM_VERIFIERS, gKoVerifierSet, nsnull){};
00310 };
00311 
00312 class nsKOStringPSMDetector : public nsXPCOMStringDetector
00313 {
00314 public:
00315   nsKOStringPSMDetector() 
00316     : nsXPCOMStringDetector(KO_DETECTOR_NUM_VERIFIERS - 3, gKoVerifierSet, nsnull) {};
00317 };
00318 
00319 class nsZHTWPSMDetector : public nsXPCOMDetector
00320 {
00321 public:
00322   nsZHTWPSMDetector() 
00323     : nsXPCOMDetector(ZHTW_DETECTOR_NUM_VERIFIERS, gZhTwVerifierSet, gZhTwStatisticsSet) {};
00324 };
00325 
00326 class nsZHTWStringPSMDetector : public nsXPCOMStringDetector
00327 {
00328 public:
00329   nsZHTWStringPSMDetector() 
00330     : nsXPCOMStringDetector(ZHTW_DETECTOR_NUM_VERIFIERS - 3, gZhTwVerifierSet, gZhTwStatisticsSet) {};
00331 };
00332 
00333 class nsZHCNPSMDetector : public nsXPCOMDetector
00334 {
00335 public:
00336   nsZHCNPSMDetector() 
00337     : nsXPCOMDetector(ZHCN_DETECTOR_NUM_VERIFIERS, gZhCnVerifierSet, nsnull) {};
00338 };
00339 
00340 class nsZHCNStringPSMDetector : public nsXPCOMStringDetector
00341 {
00342 public:
00343   nsZHCNStringPSMDetector() 
00344     : nsXPCOMStringDetector(ZHCN_DETECTOR_NUM_VERIFIERS - 3, gZhCnVerifierSet, nsnull) {};
00345 };
00346 
00347 class nsZHPSMDetector : public nsXPCOMDetector
00348 {
00349 public:
00350   nsZHPSMDetector() 
00351     : nsXPCOMDetector(ZH_DETECTOR_NUM_VERIFIERS, gZhVerifierSet, gZhStatisticsSet) {};
00352 };
00353 
00354 class nsZHStringPSMDetector : public nsXPCOMStringDetector
00355 {
00356 public:
00357   nsZHStringPSMDetector() 
00358     : nsXPCOMStringDetector(ZH_DETECTOR_NUM_VERIFIERS - 3, gZhVerifierSet, gZhStatisticsSet) {};
00359 };
00360 
00361 class nsCJKPSMDetector : public nsXPCOMDetector
00362 {
00363 public:
00364   nsCJKPSMDetector() 
00365     : nsXPCOMDetector(CJK_DETECTOR_NUM_VERIFIERS, gCJKVerifierSet, gCJKStatisticsSet) {};
00366 };
00367 
00368 class nsCJKStringPSMDetector : public nsXPCOMStringDetector
00369 {
00370 public:
00371   nsCJKStringPSMDetector() 
00372     : nsXPCOMStringDetector(CJK_DETECTOR_NUM_VERIFIERS - 3, gCJKVerifierSet, gCJKStatisticsSet) {};
00373 };
00374 
00375 #endif // nsPSMDetectors_h__