Back to index

lightning-sunbird  0.9+nobinonly
nsClassicDetectors.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is Mozilla Communicator client code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 1998
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Pierre Phaneuf <pp@ludusdesign.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 
00040 #include "nsIFactory.h"
00041 #include "nsISupports.h"
00042 #include "nsClassicCharDetDll.h"
00043 #include "pratom.h"
00044 
00045 #include "nsICharsetDetector.h"
00046 #include "nsICharsetDetectionObserver.h"
00047 #include "nsIStringCharsetDetector.h"
00048 #include "nsClassicDetectors.h"
00049 
00050 //==========================================================
00051 
/*
 * Byte classification helpers and verdict codes for the Japanese
 * auto-detection routines in this file.  The callers in this file pass
 * unsigned char values to the Is*() tests.
 */

/* values for EUC shift chars */
#define SS2                 0x8E    /* Single Shift 2 */
#define SS3                 0x8F    /* Single Shift 3 */

/* 7-bit byte */
#define IsRoman(c)          ((c) < 0x80)

/* Shift_JIS second-byte tests: whole range, then the parts below / above 0xA1 */
#define IsSJIS2ndByte(c)    (((c) > 0x3F) && ((c) < 0xFD))
#define IsLoSJIS2ndByte(c)  (((c) > 0x3F) && ((c) < 0xA1))
#define IsHiSJIS2ndByte(c)  (((c) > 0xA0) && ((c) < 0xFD))

/* EUC-JP byte tests */
#define IsEUCJPKana(b1)     (((b1) > 0xA0) && ((b1) < 0xE0))
#define IsEUCJPKanji(b1or2) (((b1or2) > 0xA0) && ((b1or2) < 0xFF))

/* Tri-state verdicts returned by isSJIS() and isEUCJP() */
#define YES                 1
#define NO                  0
#define MAYBE               (-1)    /* parenthesized so it is safe inside any expression */
00065 
00066 static int
00067 isSJIS(const unsigned char *cp, PRInt32 len)
00068 {
00069        while (len) {
00070               if (IsRoman(*cp)) {
00071                      cp++, len--;
00072               } else if (*cp == 0x80) {          /* illegal SJIS 1st byte                  */
00073                      return NO;
00074               } else if ((*cp < 0xA0)) {         /* byte 1 of 2byte SJIS 1st range  */
00075                      if (len > 1) {
00076                             if (IsSJIS2ndByte(cp[1])) {
00077                                    if ((*cp != 0x8E && *cp != 0x8F) || (*(cp+1) <= 0xA0))
00078                                           return YES;
00079                                    cp += 2, len -= 2;   /* valid 2 byte SJIS                      */
00080                             } else {
00081                                    return NO;                  /* invalid SJIS      2nd byte                    */
00082                             }
00083                      } else
00084                             break;                                    /* buffer ended w/1of2 byte SJIS */
00085               } else if (*cp == 0xA0) {                 /* illegal EUCJP byte              */
00086 #if ALLOW_NBSP
00087                      cp++, len--; /* allow nbsp */
00088 #endif
00089               } else if (*cp < 0xE0) {           /* SJIS half-width kana                          */
00090                      cp++, len--;
00091               } else if (*cp < 0xF0) {           /* byte 1 of 2byte SJIS      2nd range    */
00092                      if (len > 1) {
00093                             if (IsSJIS2ndByte(cp[1])) {
00094                                    cp += 2, len -= 2;   /* valid 2 byte SJIS                      */
00095                             } else {
00096                                    return NO;                  /* invalid SJIS                                         */
00097                             }
00098                      } else
00099                             break;                             /* buffer ended w/1of2 byte SJIS   */
00100               } else {
00101                      return NO;                                /* invalid SJIS 1st byte                  */
00102               }
00103        }
00104        return MAYBE;                                    /* No illegal SJIS values found           */
00105 }
00106 
00107 static int
00108 isEUCJP(const unsigned char *cp, PRInt32 len)
00109 {
00110        while (len) {
00111               if (IsRoman(*cp)) {                /* Roman                                         */
00112                      cp++, len--;
00113               } else if (*cp == SS2) {           /* EUCJP JIS201 half-width kana */
00114                      if (len > 1) {
00115                             if (IsEUCJPKana(cp[1]))
00116                                    cp += 2, len -= 2;          /* valid half-width kana */
00117                             else
00118                                    return NO;                         /* invalid 2of3 byte EUC */ 
00119                      } else
00120                             break;                                    /* buffer ended w/1of2 byte EUC    */
00121               } else if (*cp == SS3) {                  /* EUCJP JIS212                                  */
00122                       if (len > 1) {
00123                             if (IsEUCJPKanji(cp[1])) {
00124                                    if (len > 2) {
00125                                           if (IsEUCJPKanji(cp[2]))
00126                                                  cp += 2, len -= 2;   /* valid 3 byte EUCJP              */
00127                                           else
00128                                                  return NO;           /* invalid 3of3 byte EUCJP  */
00129                                    } else
00130                                           break;                      /* buffer ended w/2of3 byte EUCJP */
00131                             } else
00132                                    return NO;                         /* invalid 2of3 byte EUCJP  */
00133                      } else
00134                             break;                                    /* buffer ended w/1of3 byte EUCJP */
00135               } else if (*cp == 0xA0) {                 /* illegal EUCJP byte              */
00136 #if ALLOW_NBSP
00137                      cp++, len--; /* allow nbsp */
00138 #else
00139                      return NO;
00140 #endif
00141               } else if (*cp < 0xF0) {           /* EUCJP JIS208 (overlaps SJIS)           */
00142                      if (len > 1) {
00143                             if (IsEUCJPKanji(cp[1]))
00144                                    cp += 2, len -= 2;          /* valid 2 byte EUCJP              */
00145                             else
00146                                    return NO;                         /* invalid 2of2 byte EUCJP  */
00147                      } else
00148                             break;                                    /* buffer ended w/1of2 byte EUCJP */
00149               } else if (*cp < 0xFF) {           /* EUCJP JIS208 only:                     */
00150                      if (len > 1) {
00151                             if (IsEUCJPKanji(cp[1]))
00152                                    return YES;                 /* valid 2 byte EUCJP, invalid SJIS       */
00153                             else
00154                                    return NO;                         /* invalid 2of2 byte EUCJP  */
00155                      } else
00156                             break;                                    /* buffer ended w/1of2 byte EUCJP */
00157               } else {
00158                      return NO;                                /* invalid EUCJP 1st byte: 0xFF    */
00159               }
00160        }
00161        return MAYBE;
00162 }
00163 
00165 
00166 static nsresult JA_AutoCharsetDetectBuffer(const char* aBuffer, const PRInt32 aLen, char* aCharset)
00167 {
00168   PRBool hasEsc = PR_FALSE;
00169   PRBool asciiOnly = PR_TRUE;
00170 
00171   PL_strcpy(aCharset, "ISO-8859-1");
00172 
00173   // check 8 bit or ESC
00174   for (int i = 0; i < aLen; i++) {
00175     if ((unsigned char) aBuffer[i] > 127 || aBuffer[i] == 0x1B) {
00176       if (aBuffer[i] == 0x1B) {
00177         hasEsc = PR_TRUE;
00178         break;
00179       }
00180       else {
00181         asciiOnly = PR_FALSE;
00182       }
00183     }
00184   }
00185 
00186   if (hasEsc) {
00187     PL_strcpy(aCharset, "ISO-2022-JP");
00188   }
00189   else if (!asciiOnly) {
00190     // use old japanese auto detect code
00191     int euc, sjis;
00192     euc = isEUCJP((unsigned char *) aBuffer, aLen);
00193     sjis = isSJIS((unsigned char *) aBuffer, aLen);
00194     if (YES == euc) {
00195       PL_strcpy(aCharset, "EUC-JP");
00196     }
00197     else if (YES == sjis) {
00198       PL_strcpy(aCharset, "Shift_JIS");
00199     }
00200     else if (MAYBE == euc && NO == sjis) {
00201       PL_strcpy(aCharset, "EUC-JP");
00202     }
00203     else if (MAYBE == sjis && NO == euc) {
00204       PL_strcpy(aCharset, "Shift_JIS");
00205     }
00206     else if (MAYBE == euc && MAYBE == sjis) {
00207       PL_strcpy(aCharset, "EUC-JP");
00208     }
00209   }
00210 
00211   return NS_OK;
00212 }
00213 
00214 //==========================================================
00215 NS_IMPL_ISUPPORTS1(nsClassicDetector, nsICharsetDetector)
00216 
00217 //----------------------------------------------------------
00218 nsClassicDetector::nsClassicDetector(const char* language)
00219 {
00220   mObserver = nsnull;
00221   PL_strcpy(mLanguage, language);
00222 }
00223 //----------------------------------------------------------
00224 nsClassicDetector::~nsClassicDetector()
00225 {
00226 }
00227 //----------------------------------------------------------
00228 NS_IMETHODIMP nsClassicDetector::Init(
00229   nsICharsetDetectionObserver* aObserver)
00230 {
00231   NS_ASSERTION(mObserver == nsnull , "Init twice");
00232   if(nsnull == aObserver)
00233      return NS_ERROR_ILLEGAL_VALUE;
00234 
00235   mObserver = aObserver;
00236 
00237   return NS_OK;
00238 }
00239 //----------------------------------------------------------
00240 NS_IMETHODIMP nsClassicDetector::DoIt(
00241   const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe)
00242 {
00243   NS_ASSERTION(mObserver != nsnull , "have not init yet");
00244 
00245   if((nsnull == aBuf) || (nsnull == oDontFeedMe))
00246      return NS_ERROR_ILLEGAL_VALUE;
00247 
00248   if (!PL_strcasecmp("ja", mLanguage) &&
00249       NS_SUCCEEDED(JA_AutoCharsetDetectBuffer(aBuf, (PRInt32) aLen, mCharset))) {
00250     mObserver->Notify(mCharset, eBestAnswer);
00251   }
00252   else {
00253     mObserver->Notify("", eNoAnswerMatch);
00254   }
00255 
00256   *oDontFeedMe = PR_TRUE;
00257 
00258   return NS_OK;
00259 }
00260 //----------------------------------------------------------
00261 NS_IMETHODIMP nsClassicDetector::Done()
00262 {
00263   NS_ASSERTION(mObserver != nsnull , "have not init yet");
00264   return NS_OK;
00265 }
00266 
00267 //==========================================================
00268 
00269 NS_IMPL_ISUPPORTS1(nsClassicStringDetector, nsIStringCharsetDetector)
00270 
00271 //----------------------------------------------------------
00272 nsClassicStringDetector::nsClassicStringDetector(const char* language)
00273 {
00274   PL_strcpy(mLanguage, language);
00275 }
00276 //----------------------------------------------------------
00277 nsClassicStringDetector::~nsClassicStringDetector()
00278 {
00279 }
00280 
00281 //----------------------------------------------------------
00282 NS_IMETHODIMP nsClassicStringDetector::DoIt(const char* aBuf, PRUint32 aLen, 
00283                                             const char** oCharset, 
00284                                             nsDetectionConfident &oConfident)
00285 {
00286   oConfident = eNoAnswerMatch;
00287   *oCharset = "";
00288 
00289   if (!PL_strcasecmp("ja", mLanguage) &&
00290       NS_SUCCEEDED(JA_AutoCharsetDetectBuffer(aBuf, (PRInt32) aLen, mCharset))) {
00291     *oCharset = mCharset;
00292     oConfident = eBestAnswer;
00293   }
00294 
00295   return NS_OK;
00296 }
00297