Back to index

lightning-sunbird  0.9+nobinonly
DetectCharset.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Pierre Phaneuf <pp@ludusdesign.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
#include "nsISupports.h"
#include "nsIComponentManager.h"
#include "nsICharsetDetector.h"
#include "nsICharsetDetectionObserver.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(XP_WIN) || defined(XP_OS2)
#include <io.h>
#endif
#ifdef XP_UNIX
#include <unistd.h>
#endif
00050 
00051 
00052 class nsStatis {
00053 public:
00054     nsStatis() { };
00055     virtual ~nsStatis() { };
00056     virtual PRBool HandleData(const char* aBuf, PRUint32 aLen) = 0;
00057     virtual void   DataEnd() = 0;
00058     virtual void Report()=0;
00059 };
00060 
00061 class nsBaseStatis : public nsStatis {
00062 public:
00063     nsBaseStatis(unsigned char aL, unsigned char aH, float aR) ;
00064     virtual ~nsBaseStatis() {};
00065     virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
00066     virtual void   DataEnd() ;
00067     virtual void Report();
00068 protected:
00069     unsigned char mLWordHi;
00070     unsigned char mLWordLo;
00071 private:
00072     PRUint32 mNumOf2Bytes;
00073     PRUint32 mNumOfLChar;
00074     PRUint32 mNumOfLWord;
00075     PRUint32 mLWordLength;
00076     PRUint32 mLWordLen[10]; 
00077     float    mR;
00078     PRBool mTailByte;
00079     PRBool mLastLChar;
00080 };
00081 nsBaseStatis::nsBaseStatis(unsigned char aL, unsigned char aH, float aR)
00082 {
00083     mNumOf2Bytes = mNumOfLWord = mLWordLength = mNumOfLChar= 0;
00084     mTailByte = mLastLChar = PR_FALSE;
00085     for(PRUint32 i =0;i < 20; i++)
00086        mLWordLen[i] = 0;
00087     mLWordHi = aH;
00088     mLWordLo = aL;
00089     mR = aR;
00090 }
00091 PRBool nsBaseStatis::HandleData(const char* aBuf, PRUint32 aLen)
00092 {
00093     for(PRUint32 i=0; i < aLen; i++)
00094     {
00095        if(mTailByte)
00096           mTailByte = PR_FALSE;
00097        else 
00098        {
00099           mTailByte = (0x80 == ( aBuf[i] & 0x80));
00100           if(mTailByte) 
00101           {
00102              mNumOf2Bytes++;
00103              unsigned char a = (unsigned char) aBuf[i];
00104              PRBool thisLChar = (( mLWordLo <= a) && (a <= mLWordHi));
00105              if(thisLChar)
00106              {
00107                 mNumOfLChar++;
00108                 mLWordLength++;
00109              } else {
00110                 if(mLastLChar) {
00111                   mNumOfLWord++;
00112                   mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
00113                   mLWordLength =0 ;
00114                 }
00115              }
00116              mLastLChar = thisLChar;
00117           } else {
00118              if(mLastLChar) {
00119                 mNumOfLWord++;
00120                 mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
00121                 mLWordLength =0 ;
00122                 mLastLChar = PR_FALSE;
00123              }
00124           }
00125        }
00126     }
00127     return PR_TRUE;
00128 }
00129 void nsBaseStatis::DataEnd()
00130 {
00131     if(mLastLChar) {
00132       mNumOfLWord++;
00133       mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
00134     }
00135 }
00136 void nsBaseStatis::Report()
00137 {
00138     if(mNumOf2Bytes > 0)
00139     {
00140 /*
00141       printf("LChar Ratio = %d : %d ( %5.3f)\n", 
00142                          mNumOfLChar,
00143                          mNumOf2Bytes,
00144                         ((float)mNumOfLChar / (float)mNumOf2Bytes) * 100);
00145 */
00146       float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
00147       float delta = (rate - mR) / mR;
00148       delta *= delta * 1000;
00149 #ifdef EXPERIMENT
00150       printf("Exp = %f \n",delta);
00151 #endif
00152     }
00153     
00154 /*
00155 
00156     if(mNumOfLChar > 0)
00157       printf("LWord Word = %d : %d (%5.3f)\n", 
00158                          mNumOfLWord,
00159                          mNumOfLChar,
00160                         ((float)mNumOfLWord / (float)mNumOfLChar) * 100);
00161     if(mNumOfLWord > 0)
00162     {
00163       PRUint32 ac =0;
00164       for(PRUint32 i=0;i<10;i++)
00165       {
00166        ac += mLWordLen[i];
00167        printf("LWord Word Length[%d]= %d -> %5.3f%% %5.3f%%\n", i+1, 
00168            mLWordLen[i],
00169            (((float)mLWordLen[i] / (float)mNumOfLWord) * 100),
00170            (((float)ac / (float)mNumOfLWord) * 100));
00171       }
00172     }
00173 */
00174 }
00175 
00176 
00177 class nsSimpleStatis : public nsStatis {
00178 public:
00179     nsSimpleStatis(unsigned char aL, unsigned char aH, float aR,const char* aCharset) ;
00180     virtual ~nsSimpleStatis() {};
00181     virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
00182     virtual void   DataEnd() ;
00183     virtual void Report();
00184 protected:
00185     unsigned char mLWordHi;
00186     unsigned char mLWordLo;
00187 private:
00188     PRUint32 mNumOf2Bytes;
00189     PRUint32 mNumOfLChar;
00190     float    mR;
00191     const char* mCharset;
00192     PRBool mTailByte;
00193 };
00194 nsSimpleStatis::nsSimpleStatis(unsigned char aL, unsigned char aH, float aR, const char* aCharset)
00195 {
00196     mNumOf2Bytes =  mNumOfLChar= 0;
00197     mTailByte =  PR_FALSE;
00198     mLWordHi = aH;
00199     mLWordLo = aL;
00200     mR = aR;
00201     mCharset = aCharset;
00202 }
00203 PRBool nsSimpleStatis::HandleData(const char* aBuf, PRUint32 aLen)
00204 {
00205     for(PRUint32 i=0; i < aLen; i++)
00206     {
00207        if(mTailByte)
00208           mTailByte = PR_FALSE;
00209        else 
00210        {
00211           mTailByte = (0x80 == ( aBuf[i] & 0x80));
00212           if(mTailByte) 
00213           {
00214              mNumOf2Bytes++;
00215              unsigned char a = (unsigned char) aBuf[i];
00216              PRBool thisLChar = (( mLWordLo <= a) && (a <= mLWordHi));
00217              if(thisLChar)
00218                 mNumOfLChar++;
00219           }
00220        }
00221     }
00222     return PR_TRUE;
00223 }
00224 void nsSimpleStatis::DataEnd()
00225 {
00226 }
00227 void nsSimpleStatis::Report()
00228 {
00229     if(mNumOf2Bytes > 0)
00230     {
00231       float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
00232       float delta = (rate - mR) / mR;
00233       delta = delta * delta * (float)100;
00234 #ifdef EXPERIMENT
00235       printf("Exp = %f \n",delta);
00236       if(delta < 1.0)
00237          printf("This is %s\n" ,mCharset);
00238 #endif
00239 
00240     }
00241 }
00242 //==========================================================
00243 
00244 
/* Largest block size accepted on the command line (8192 bytes). */
#define MAXBSIZE (1L << 13)

/**
 * Print the command-line synopsis, the known detector names and the
 * permitted block-size range to stdout.
 */
void usage() {
   fputs("Usage: DetectFile detector blocksize\n", stdout);
   fputs("     detector: "
         "ja_parallel_state_machine,"
         "ko_parallel_state_machine,"
         "zhcn_parallel_state_machine,"
         "zhtw_parallel_state_machine,"
         "zh_parallel_state_machine,"
         "cjk_parallel_state_machine,"
         "ruprob,"
         "ukprob,"
         "\n", stdout);
   printf("     blocksize: 1 ~ %ld\n", MAXBSIZE);
   fputs("  Data are passed in from STDIN\n", stdout);
}
00262 
00263 class nsReporter : public nsICharsetDetectionObserver 
00264 {
00265    NS_DECL_ISUPPORTS
00266  public:
00267    nsReporter() { };
00268    virtual ~nsReporter() { };
00269 
00270    NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf)
00271     {
00272         printf("RESULT CHARSET : %s\n", aCharset);
00273         printf("RESULT Confident : %d\n", aConf);
00274         return NS_OK;
00275     };
00276 };
00277 
00278 
00279 NS_IMPL_ISUPPORTS1(nsReporter, nsICharsetDetectionObserver)
00280 
00281 nsresult GetDetector(const char* key, nsICharsetDetector** det)
00282 {
00283   char buf[128];
00284   strcpy(buf, NS_CHARSET_DETECTOR_CONTRACTID_BASE);
00285   strcat(buf, key);
00286   return CallCreateInstance(buf, det);
00287 }
00288 
00289 
00290 nsresult GetObserver(nsICharsetDetectionObserver** aRes)
00291 {
00292   *aRes = nsnull;
00293   nsReporter* rep = new nsReporter();
00294   if(rep) {
00295      return rep->QueryInterface(NS_GET_IID(nsICharsetDetectionObserver) ,
00296                                 (void**)aRes);
00297   }
00298   return NS_ERROR_OUT_OF_MEMORY;
00299 }
00300 
00301 int main(int argc, char** argv) {
00302   char buf[MAXBSIZE];
00303   PRUint32 bs;
00304   if( 3 != argc )
00305   {
00306     usage();
00307     printf("Need 2 arguments\n");
00308     return(-1);
00309   }
00310   bs = atoi(argv[2]);
00311   if((bs <1)||(bs>MAXBSIZE))
00312   {
00313     usage();
00314     printf("blocksize out of range - %s\n", argv[2]);
00315     return(-1);
00316   }
00317   nsresult rev = NS_OK;
00318   nsICharsetDetector *det = nsnull;
00319   rev = GetDetector(argv[1], &det);
00320   if(NS_FAILED(rev) || (nsnull == det) ){
00321     usage();
00322     printf("Invalid Detector - %s\n", argv[1]);
00323     printf("XPCOM ERROR CODE = %x\n", rev);
00324     return(-1);
00325   }
00326   nsICharsetDetectionObserver *obs = nsnull;
00327   rev = GetObserver(&obs);
00328   if(NS_SUCCEEDED(rev)) {
00329     rev = det->Init(obs);
00330     NS_IF_RELEASE(obs);
00331     if(NS_FAILED(rev))
00332     {
00333       printf("XPCOM ERROR CODE = %x\n", rev);
00334       return(-1);
00335     }
00336   } else {
00337     printf("XPCOM ERROR CODE = %x\n", rev);
00338     return(-1);
00339   }
00340 
00341   size_t sz;
00342   PRBool done = PR_FALSE;
00343   nsSimpleStatis  ks(0xb0,0xc8, (float)0.95952, "EUC-KR");
00344   nsSimpleStatis  js(0xa4,0xa5, (float)0.45006, "EUC-JP");
00345   nsStatis* stat[2] = {&ks, &js};
00346   PRUint32 i;
00347   do
00348   {
00349     sz = read(0, buf, bs); 
00350     if(sz > 0) {
00351       if(! done) {
00352 printf("call DoIt %d\n",sz);
00353         rev = det->DoIt( buf, sz, &done);
00354 printf("DoIt return Done = %d\n",done);
00355         if(NS_FAILED(rev))
00356         {
00357           printf("XPCOM ERROR CODE = %x\n", rev);
00358           return(-1);
00359         }
00360       }
00361       for(i=0;i<2;i++)
00362         stat[i]->HandleData(buf, sz);
00363     }
00364   // } while((sz > 0) &&  (!done) );
00365   } while(sz > 0);
00366   if(!done)
00367   {
00368 printf("Done = %d\n",done);
00369 printf("call Done %d\n",sz);
00370     rev = det->Done();
00371     if(NS_FAILED(rev))
00372     {
00373       printf("XPCOM ERROR CODE = %x\n", rev);
00374       return(-1);
00375     }
00376   }
00377   for(i=0;i<2;i++) {
00378     stat[i]->DataEnd();
00379     stat[i]->Report();
00380   }
00381 printf( "Done\n");
00382   
00383   NS_IF_RELEASE(det);
00384 printf( "Done 2\n");
00385   return (0);
00386 }