Back to index

lightning-sunbird  0.9+nobinonly
CharDistribution.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Communicator client code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *
00024  * Alternatively, the contents of this file may be used under the terms of
00025  * either the GNU General Public License Version 2 or later (the "GPL"), or
00026  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00027  * in which case the provisions of the GPL or the LGPL are applicable instead
00028  * of those above. If you wish to allow use of your version of this file only
00029  * under the terms of either the GPL or the LGPL, and not to allow others to
00030  * use your version of this file under the terms of the MPL, indicate your
00031  * decision by deleting the provisions above and replace them with the notice
00032  * and other provisions required by the GPL or the LGPL. If you do not delete
00033  * the provisions above, a recipient may use your version of this file under
00034  * the terms of any one of the MPL, the GPL or the LGPL.
00035  *
00036  * ***** END LICENSE BLOCK ***** */
00037 
00038 #ifndef CharDistribution_h__
00039 #define CharDistribution_h__
00040 
00041 #include "nscore.h"
00042 
00043 #define ENOUGH_DATA_THRESHOLD 1024 //GotEnoughData() reports true once more than this many characters have been counted
00044  
00045 class CharDistributionAnalysis
00046 {
00047 public:
00048   CharDistributionAnalysis() {Reset();};
00049 
00050   //feed a block of data and do distribution analysis
00051   void HandleData(const char* aBuf, PRUint32 aLen) {};
00052   
00053   //Feed a character with known length
00054   void HandleOneChar(const char* aStr, PRUint32 aCharLen)
00055   {
00056     PRInt32 order;
00057 
00058     //we only care about 2-bytes character in our distribution analysis
00059     order = (aCharLen == 2) ? GetOrder(aStr) : -1;
00060 
00061     if (order >= 0)
00062     {
00063       mTotalChars++;
00064       //order is valid
00065       if ((PRUint32)order < mTableSize)
00066       {
00067         if (512 > mCharToFreqOrder[order])
00068           mFreqChars++;
00069       }
00070     }
00071   };
00072 
00073   //return confidence base on existing data
00074   float GetConfidence();
00075 
00076   //Reset analyser, clear any state 
00077   void      Reset(void) 
00078   {
00079     mDone = PR_FALSE;
00080     mTotalChars = 0;
00081     mFreqChars = 0;
00082   };
00083 
00084   //This function is for future extension. Caller can use this function to control
00085   //analyser's behavior
00086   void      SetOpion(){};
00087 
00088   //It is not necessary to receive all data to draw conclusion. For charset detection,
00089   // certain amount of data is enough
00090   PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
00091 
00092 protected:
00093   //we do not handle character base on its original encoding string, but 
00094   //convert this encoding string to a number, here called order.
00095   //This allow multiple encoding of a language to share one frequency table 
00096   virtual PRInt32 GetOrder(const char* str) {return -1;};
00097   
00098   //If this flag is set to PR_TRUE, detection is done and conclusion has been made
00099   PRBool   mDone;
00100 
00101   //The number of characters whose frequency order is less than 512
00102   PRUint32 mFreqChars;
00103 
00104   //Total character encounted.
00105   PRUint32 mTotalChars;
00106 
00107   //Mapping table to get frequency order from char order (get from GetOrder())
00108   const PRInt16  *mCharToFreqOrder;
00109 
00110   //Size of above table
00111   PRUint32 mTableSize;
00112 
00113   //This is a constant value varies from language to language, it is used in 
00114   //calculating confidence. See my paper for further detail.
00115   float    mTypicalDistributionRatio;
00116 };
00117 
00118 
00119 class EUCTWDistributionAnalysis: public CharDistributionAnalysis
00120 {
00121 public:
00122   EUCTWDistributionAnalysis();
00123 protected:
00124 
00125   //for euc-TW encoding, we are interested 
00126   //  first  byte range: 0xc4 -- 0xfe
00127   //  second byte range: 0xa1 -- 0xfe
00128   //no validation needed here. State machine has done that
00129   PRInt32 GetOrder(const char* str) 
00130   { if ((unsigned char)*str >= (unsigned char)0xc4)  
00131       return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
00132     else
00133       return -1;
00134   };
00135 };
00136 
00137 
00138 class EUCKRDistributionAnalysis : public CharDistributionAnalysis
00139 {
00140 public:
00141   EUCKRDistributionAnalysis();
00142 protected:
00143   //for euc-KR encoding, we are interested 
00144   //  first  byte range: 0xb0 -- 0xfe
00145   //  second byte range: 0xa1 -- 0xfe
00146   //no validation needed here. State machine has done that
00147   PRInt32 GetOrder(const char* str) 
00148   { if ((unsigned char)*str >= (unsigned char)0xb0)  
00149       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
00150     else
00151       return -1;
00152   };
00153 };
00154 
00155 class GB2312DistributionAnalysis : public CharDistributionAnalysis
00156 {
00157 public:
00158   GB2312DistributionAnalysis();
00159 protected:
00160   //for GB2312 encoding, we are interested 
00161   //  first  byte range: 0xb0 -- 0xfe
00162   //  second byte range: 0xa1 -- 0xfe
00163   //no validation needed here. State machine has done that
00164   PRInt32 GetOrder(const char* str) 
00165   { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)  
00166       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
00167     else
00168       return -1;
00169   };
00170 };
00171 
00172 
00173 class Big5DistributionAnalysis : public CharDistributionAnalysis
00174 {
00175 public:
00176   Big5DistributionAnalysis();
00177 protected:
00178   //for big5 encoding, we are interested 
00179   //  first  byte range: 0xa4 -- 0xfe
00180   //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
00181   //no validation needed here. State machine has done that
00182   PRInt32 GetOrder(const char* str) 
00183   { if ((unsigned char)*str >= (unsigned char)0xa4)  
00184       if ((unsigned char)str[1] >= (unsigned char)0xa1)
00185         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
00186       else
00187         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
00188     else
00189       return -1;
00190   };
00191 };
00192 
00193 class SJISDistributionAnalysis : public CharDistributionAnalysis
00194 {
00195 public:
00196   SJISDistributionAnalysis();
00197 protected:
00198   //for sjis encoding, we are interested 
00199   //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
00200   //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
00201   //no validation needed here. State machine has done that
00202   PRInt32 GetOrder(const char* str) 
00203   { 
00204     PRInt32 order;
00205     if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)  
00206       order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
00207     else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)  
00208       order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
00209     else
00210       return -1;
00211     order += (unsigned char)*(str+1) - 0x40;
00212     if ((unsigned char)str[1] > (unsigned char)0x7f)
00213       order--;
00214     return order;
00215   };
00216 };
00217 
00218 class EUCJPDistributionAnalysis : public CharDistributionAnalysis
00219 {
00220 public:
00221   EUCJPDistributionAnalysis();
00222 protected:
00223   //for euc-JP encoding, we are interested 
00224   //  first  byte range: 0xa0 -- 0xfe
00225   //  second byte range: 0xa1 -- 0xfe
00226   //no validation needed here. State machine has done that
00227   PRInt32 GetOrder(const char* str) 
00228   { if ((unsigned char)*str >= (unsigned char)0xa0)  
00229       return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
00230     else
00231       return -1;
00232   };
00233 };
00234 
00235 #endif //CharDistribution_h__
00236