Back to index

lightning-sunbird  0.9+nobinonly
nsBayesianFilter.cpp
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2002
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Patrick C. Beard <beard@netscape.com>
00024  *   Seth Spitzer <sspitzer@netscape.com>
00025  *
00026  * Alternatively, the contents of this file may be used under the terms of
00027  * either of the GNU General Public License Version 2 or later (the "GPL"),
00028  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00029  * in which case the provisions of the GPL or the LGPL are applicable instead
00030  * of those above. If you wish to allow use of your version of this file only
00031  * under the terms of either the GPL or the LGPL, and not to allow others to
00032  * use your version of this file under the terms of the MPL, indicate your
00033  * decision by deleting the provisions above and replace them with the notice
00034  * and other provisions required by the GPL or the LGPL. If you do not delete
00035  * the provisions above, a recipient may use your version of this file under
00036  * the terms of any one of the MPL, the GPL or the LGPL.
00037  *
00038  * ***** END LICENSE BLOCK ***** */
00039 
00040 #ifdef MOZ_LOGGING
00041 // sorry, this has to be before the pre-compiled header
00042 #define FORCE_PR_LOG /* Allow logging in the release build */
00043 #endif
00044 
00045 #include "nsCRT.h"
00046 #include "nsBayesianFilter.h"
00047 #include "nsIInputStream.h"
00048 #include "nsIStreamListener.h"
00049 #include "nsNetUtil.h"
00050 #include "nsQuickSort.h"
00051 #include "nsIMsgMessageService.h"
00052 #include "nsMsgUtils.h" // for GetMessageServiceFromURI
00053 #include "prnetdb.h"
00054 #include "nsIMsgWindow.h"
00055 #include "prlog.h"
00056 #include "nsAppDirectoryServiceDefs.h"
00057 #include "nsUnicharUtils.h"
00058 
00059 #include "nsPrintfCString.h"
00060 #include "nsIMIMEHeaderParam.h"
00061 #include "nsNetCID.h"
00062 #include "nsIMimeHeaders.h"
00063 #include "nsMsgMimeCID.h"
00064 #include "nsIMsgMailNewsUrl.h"
00065 #include "nsIMimeMiscStatus.h"
00066 #include "nsIPrefService.h"
00067 #include "nsIPrefBranch.h"
00068 #include "nsIStringEnumerator.h"
00069 
00070 // needed to mark attachment flag on the db hdr
00071 #include "nsIMsgHdr.h"
00072 
00073 // needed to strip html out of the body
00074 #include "nsParserCIID.h"
00075 #include "nsIParser.h"
00076 #include "nsIHTMLContentSink.h"
00077 #include "nsIContentSerializer.h"
00078 #include "nsLayoutCID.h"
00079 #include "nsIHTMLToTextSink.h"
00080 #include "nsIDocumentEncoder.h" 
00081 
00082 #include "nsIncompleteGamma.h"
00083 #include <math.h>
00084 
// Log module for all bayesian-filter tracing; initialized lazily elsewhere.
static PRLogModuleInfo *BayesianFilterLogModule = nsnull;

// CIDs for the HTML parser and DTD used by Tokenizer::stripHTML().
static NS_DEFINE_CID(kParserCID, NS_PARSER_CID);
static NS_DEFINE_CID(kNavDTDCID, NS_CNAVDTD_CID);

#define kDefaultJunkThreshold .99 // we override this value via a pref
// Characters that separate words during tokenization (whitespace plus '.').
static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f.";
static int kMinLengthForToken = 3; // lower bound on the number of characters in a word before we treat it as a token
static int kMaxLengthForToken = 12; // upper bound on the number of characters in a word to be declared as a token

// Marker sendmail inserts into Received headers it believes are forged.
#define FORGED_RECEIVED_HEADER_HINT NS_LITERAL_CSTRING("may be forged")

// Math constants, in case <math.h> does not provide them on this platform.
#ifndef M_LN2
#define M_LN2 0.69314718055994530942
#endif

#ifndef M_E
#define M_E   2.7182818284590452354
#endif
00104 
// One distinct word observed during tokenization, stored directly as a
// PLDHashTable entry, together with its occurrence count and the cached
// statistics used while classifying.
struct Token : public PLDHashEntryHdr {
    const char* mWord;          // NUL-terminated key; owned by the word arena
    PRUint32 mLength;           // cached strlen(mWord)
    PRUint32 mCount;            // TODO:  put good/bad count values in same token object.
    double mProbability;        // TODO:  cache probabilities
    double mDistance;
};
00112 
00113 TokenEnumeration::TokenEnumeration(PLDHashTable* table)
00114     :   mEntrySize(table->entrySize),
00115         mEntryCount(table->entryCount),
00116         mEntryOffset(0),
00117         mEntryAddr(table->entryStore)
00118 {
00119     PRUint32 capacity = PL_DHASH_TABLE_SIZE(table);
00120     mEntryLimit = mEntryAddr + capacity * mEntrySize;
00121 }
00122     
00123 inline PRBool TokenEnumeration::hasMoreTokens()
00124 {
00125     return (mEntryOffset < mEntryCount);
00126 }
00127 
00128 inline Token* TokenEnumeration::nextToken()
00129 {
00130     Token* token = NULL;
00131     PRUint32 entrySize = mEntrySize;
00132     char *entryAddr = mEntryAddr, *entryLimit = mEntryLimit;
00133     while (entryAddr < entryLimit) {
00134         PLDHashEntryHdr* entry = (PLDHashEntryHdr*) entryAddr;
00135         entryAddr += entrySize;
00136         if (PL_DHASH_ENTRY_IS_LIVE(entry)) {
00137             token = NS_STATIC_CAST(Token*, entry);
00138             ++mEntryOffset;
00139             break;
00140         }
00141     }
00142     mEntryAddr = entryAddr;
00143     return token;
00144 }
00145 
// Bundles the caller's visitor callback with its context pointer so both can
// be threaded through PL_DHashTableEnumerate's single void* argument.
struct VisitClosure {
    PRBool (*f) (Token*, void*);  // return PR_FALSE to stop the enumeration
    void* data;                   // opaque context forwarded to f
};
00150 
00151 static PLDHashOperator PR_CALLBACK VisitEntry(PLDHashTable* table, PLDHashEntryHdr* entry,
00152                                               PRUint32 number, void* arg)
00153 {
00154     VisitClosure* closure = NS_REINTERPRET_CAST(VisitClosure*, arg);
00155     Token* token = NS_STATIC_CAST(Token*, entry);
00156     return (closure->f(token, closure->data) ? PL_DHASH_NEXT : PL_DHASH_STOP);
00157 }
00158 
// Shared PLDHashTable callbacks used by every Tokenizer's token table.
// Token-table vtable: entries are keyed by their C-string word, using the
// stock PLDHash string hashing/matching stubs and default memory management.
static const PLDHashTableOps gTokenTableOps = {
    PL_DHashAllocTable,
    PL_DHashFreeTable,
    PL_DHashGetKeyStub,
    PL_DHashStringKey,
    PL_DHashMatchStringKey,
    PL_DHashMoveEntryStub,
    PL_DHashClearEntryStub,
    PL_DHashFinalizeStub
};
00170 
00171 Tokenizer::Tokenizer()
00172 {
00173     PL_INIT_ARENA_POOL(&mWordPool, "Words Arena", 16384);
00174     PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
00175     NS_ASSERTION(ok, "mTokenTable failed to initialize");
00176     if (!ok)
00177       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize"));
00178 }
00179 
Tokenizer::~Tokenizer()
{
    // Only finish the table if its storage was actually allocated (init can fail).
    if (mTokenTable.entryStore)
        PL_DHashTableFinish(&mTokenTable);
    // Releases every word string copied into the arena in one shot.
    PL_FinishArenaPool(&mWordPool);
}
00186 
00187 nsresult Tokenizer::clearTokens()
00188 {
00189     // we re-use the tokenizer when classifying multiple messages, 
00190     // so this gets called after every message classification.
00191     PRBool ok = PR_TRUE;
00192     if (mTokenTable.entryStore)
00193     {
00194         PL_DHashTableFinish(&mTokenTable);
00195         PL_FreeArenaPool(&mWordPool);
00196         ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
00197         NS_ASSERTION(ok, "mTokenTable failed to initialize");
00198         if (!ok)
00199           PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize in clearTokens()"));
00200     }
00201     return (ok) ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
00202 }
00203 
00204 char* Tokenizer::copyWord(const char* word, PRUint32 len)
00205 {
00206     void* result;
00207     PRUint32 size = 1 + len;
00208     PL_ARENA_ALLOCATE(result, &mWordPool, size);
00209     if (result)
00210         memcpy(result, word, size);
00211     return NS_REINTERPRET_CAST(char*, result);
00212 }
00213 
00214 inline Token* Tokenizer::get(const char* word)
00215 {
00216     PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_LOOKUP);
00217     if (PL_DHASH_ENTRY_IS_BUSY(entry))
00218         return NS_STATIC_CAST(Token*, entry);
00219     return NULL;
00220 }
00221 
// Adds |count| occurrences of |word| to the table, creating the Token entry
// on first sight. Returns the entry, or NULL on allocation failure.
Token* Tokenizer::add(const char* word, PRUint32 count)
{
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("add word: %s (count=%d)", word, count));

    // PL_DHASH_ADD returns either the existing entry or a freshly-zeroed one
    // (NULL only on out-of-memory).
    PLDHashEntryHdr* entry = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
    Token* token = NS_STATIC_CAST(Token*, entry);
    if (token) {
        // A NULL mWord distinguishes a just-created entry from an existing one.
        if (token->mWord == NULL) {
            PRUint32 len = strlen(word);
            NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
            if (!len)
              PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding zero length word to tokenizer"));
            // The table does not own its key strings; copy into the arena.
            token->mWord = copyWord(word, len);
            NS_ASSERTION(token->mWord, "copyWord failed");
            if (!token->mWord) {
                PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("copyWord failed: %s (%d)", word, len));
                // Back out the half-initialized entry so the table stays consistent.
                PL_DHashTableRawRemove(&mTokenTable, entry);
                return NULL;
            }
            token->mLength = len;
            token->mCount = count;
            token->mProbability = 0;
            PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (len=%d) (count=%d)", word, len, count));
        } else {
            // Already present: just bump the occurrence count.
            token->mCount += count;
            PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count, token->mCount));
        }
    }
    return token;
}
00252 
00253 void Tokenizer::remove(const char* word, PRUint32 count)
00254 {
00255     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d)", word, count));
00256     Token* token = get(word);
00257     if (token) {
00258         NS_ASSERTION(token->mCount >= count, "token count underflow");
00259         if (token->mCount >= count) {
00260             PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d) (mCount=%d)", word, count, token->mCount));
00261             token->mCount -= count;
00262             if (token->mCount == 0)
00263                 PL_DHashTableRawRemove(&mTokenTable, token);
00264         }
00265         else {
00266           PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token count underflow: %s (count=%d) (mCount=%d)", word, count, token->mCount));
00267         }
00268     }
00269 }
00270 
00271 static PRBool isDecimalNumber(const char* word)
00272 {
00273     const char* p = word;
00274     if (*p == '-') ++p;
00275     char c;
00276     while ((c = *p++)) {
00277         if (!isdigit((unsigned char) c))
00278             return PR_FALSE;
00279     }
00280     return PR_TRUE;
00281 }
00282 
00283 static PRBool isASCII(const char* word)
00284 {
00285     const unsigned char* p = (const unsigned char*)word;
00286     unsigned char c;
00287     while ((c = *p++)) {
00288         if (c > 127)
00289             return PR_FALSE;
00290     }
00291     return PR_TRUE;
00292 }
00293 
00294 inline PRBool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
00295 
00296 static char* toLowerCase(char* str)
00297 {
00298     char c, *p = str;
00299     while ((c = *p++)) {
00300         if (isUpperCase(c))
00301             p[-1] = c + ('a' - 'A');
00302     }
00303     return str;
00304 }
00305 
// Records header-derived tokens of the form "<prefix>:<value>". The value is
// lower-cased in place. When |aTokenizeValue| is set, the value is split on
// the standard delimiters and each non-numeric ASCII word becomes its own
// "<prefix>:<word>" token; otherwise the whole value is a single token.
void Tokenizer::addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue)
{
  if (aValue.Length())
  {
    ToLowerCase(aValue);
    if (!aTokenizeValue)
      add(PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":") + aValue).get());
    else 
    {
      char* word;
      // nsCRT::strtok mutates its input, so it runs over the flattened
      // string's buffer (the const_cast is deliberate, if ugly).
      const nsPromiseFlatCString &flatValue = PromiseFlatCString(aValue);
      char* next = (char *) flatValue.get();
      while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) 
      {
          if (word[0] == '\0') continue;
          if (isDecimalNumber(word)) continue;
          if (isASCII(word))
              add(PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":") + nsDependentCString(word)).get());
      }
    }
  }
}
00328 
00329 void Tokenizer::tokenizeAttachment(const char * aContentType, const char * aFileName)
00330 {
00331   nsCAutoString contentType;
00332   nsCAutoString fileName;
00333   fileName.Assign(aFileName);
00334   contentType.Assign(aContentType);
00335 
00336   // normalize the content type and the file name
00337   ToLowerCase(fileName);
00338   ToLowerCase(contentType);
00339   addTokenForHeader("attachment/filename", fileName);
00340 
00341   addTokenForHeader("attachment/content-type", contentType);
00342 }
00343 
00344 void Tokenizer::tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues)
00345 {
00346   nsCString headerValue;
00347   nsCAutoString headerName; // we'll be normalizing all header names to lower case
00348   PRBool hasMore = PR_TRUE;
00349 
00350   while (hasMore)
00351   {
00352     aHeaderNames->GetNext(headerName);
00353     ToLowerCase(headerName); 
00354     aHeaderValues->GetNext(headerValue);
00355 
00356     switch (headerName.First())
00357     {
00358     case 'c':
00359         if (headerName.Equals("content-type"))
00360         {
00361           nsresult rv;
00362           nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar = do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
00363           if (NS_FAILED(rv))
00364             break;
00365 
00366           // extract the charset parameter
00367           nsXPIDLCString parameterValue;
00368           mimehdrpar->GetParameterInternal(headerValue.get(), "charset", nsnull, nsnull, getter_Copies(parameterValue));
00369           addTokenForHeader("charset", parameterValue);
00370 
00371           // create a token containing just the content type 
00372           mimehdrpar->GetParameterInternal(headerValue.get(), "type", nsnull, nsnull, getter_Copies(parameterValue));
00373           if (!parameterValue.Length())
00374             mimehdrpar->GetParameterInternal(headerValue.get(), nsnull /* use first unnamed param */, nsnull, nsnull, getter_Copies(parameterValue));
00375           addTokenForHeader("content-type/type", parameterValue);
00376 
00377           // XXX: should we add a token for the entire content-type header as well or just these parts we have extracted?
00378         }
00379         break;
00380     case 'r':
00381       if (headerName.Equals("received"))
00382       {
00383         // look for the string "may be forged" in the received headers. sendmail sometimes adds this hint
00384         // This does not compile on linux yet. Need to figure out why. Commenting out for now
00385         // if (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
00386         //   addTokenForHeader(headerName.get(), FORGED_RECEIVED_HEADER_HINT);
00387       }
00388       
00389       // leave out reply-to
00390       break;
00391     case 's':
00392         if (headerName.Equals("subject"))
00393         { 
00394           // we want to tokenize the subject
00395           addTokenForHeader(headerName.get(), headerValue, PR_TRUE);
00396         }
00397 
00398         // important: leave out sender field. Too strong of an indicator
00399         break;
00400     case 'x': // (2) X-Mailer / user-agent works best if it is untokenized, just fold the case and any leading/trailing white space
00401     case 'u': 
00402         addTokenForHeader(headerName.get(), headerValue); 
00403         break;
00404     default:
00405         addTokenForHeader(headerName.get(), headerValue); 
00406         break;
00407     } // end switch
00408 
00409     aHeaderNames->HasMore(&hasMore);
00410   }
00411 }
00412 
// Adds tokens for a single ASCII word (lower-cased in place). Words within
// [kMinLengthForToken, kMaxLengthForToken] are added verbatim. Longer words
// that look like email addresses yield separate "email name:"/"email addr:"
// tokens; other long words collapse into a "skip:" token keyed on the first
// character and the length rounded down to a multiple of ten.
void Tokenizer::tokenize_ascii_word(char * aWord)
{
  // always deal with normalized lower case strings
  toLowerCase(aWord);
  PRInt32 wordLength = strlen(aWord);

  // if the wordLength is within our accepted token limit, then add it
  if (wordLength >= kMinLengthForToken && wordLength <= kMaxLengthForToken)
    add(aWord);
  else if (wordLength > kMaxLengthForToken)
  {
    // don't skip over the word if it looks like an email address,
    // there is value in adding tokens for addresses
    nsDependentCString word (aWord, wordLength); // CHEAP, no allocation occurs here...

    // XXX: i think the 40 byte check is just for perf reasons...if the email address is longer than that then forget about it.
    if (wordLength < 40 && strchr(aWord, '.') && word.CountChar('@') == 1)
    {
      PRInt32 numBytesToSep = word.FindChar('@'); 
      if (numBytesToSep < wordLength - 1) // if the @ sign is the last character, it must not be an email address
      {
        // split the john@foo.com into john and foo.com, treat them as separate tokens
        // if i did my string foo correctly, none of this string magic should cause a heap based allocation...
        // NOTE: numBytesToSep++ ends the first Substring *before* the '@' and
        // starts the second one just after it.
        add(nsPrintfCString(256, "email name:%s", PromiseFlatCString(Substring(word, 0, numBytesToSep++)).get()).get());
        add(nsPrintfCString(256, "email addr:%s", PromiseFlatCString(Substring(word, numBytesToSep, wordLength - numBytesToSep)).get()).get());
        return;
      }
    }

    // there is value in generating a token indicating the number
    // of characters we are skipping. We'll round to the nearest 10
    add(nsPrintfCString("skip:%c %d", word[0], (wordLength/10) * 10).get()); 
  } 
}
00447 
// One subtract and one conditional jump should be faster than two conditional
// jumps on most recent systems.
#define IN_RANGE(x, low, high)  ((PRUint16)((x)-(low)) <= (high)-(low))

#define IS_JA_HIRAGANA(x)   IN_RANGE(x, 0x3040, 0x309F)
// swapping the range using xor operation to reduce conditional jump.
#define IS_JA_KATAKANA(x)   (IN_RANGE(x^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F)))
#define IS_JA_KANJI(x)      (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))
#define IS_JA_KUTEN(x)      (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E))
#define IS_JA_TOUTEN(x)     (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C))
#define IS_JA_SPACE(x)      ((x)==0x3000)
#define IS_JA_FWLATAIN(x)   IN_RANGE(x, 0xFF01, 0xFF5E)
#define IS_JA_FWNUMERAL(x)  IN_RANGE(x, 0xFF10, 0xFF19)

// Code units that only occur in Japanese text: kana blocks plus the
// full-width/half-width forms block.
#define IS_JAPANESE_SPECIFIC(x) (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))
00462 
// Character classes used when segmenting Japanese text into runs; note that
// getCharClass() below only ever produces a subset of these values
// (others/hiragana/katakana/kanji/kuten/touten/fwlatain).
enum char_class{
    others = 0,
    space,
    hiragana,
    katakana,
    kanji,
    kuten,
    touten,
    kigou,
    fwlatain,
    ascii
};
00475 
00476 char_class getCharClass(PRUnichar c)
00477 {
00478   char_class charClass = others;
00479 
00480   if(IS_JA_HIRAGANA(c))
00481     charClass = hiragana;
00482   else if(IS_JA_KATAKANA(c))
00483     charClass = katakana;
00484   else if(IS_JA_KANJI(c))
00485     charClass = kanji;
00486   else if(IS_JA_KUTEN(c))
00487     charClass = kuten;
00488   else if(IS_JA_TOUTEN(c))
00489     charClass = touten;
00490   else if(IS_JA_FWLATAIN(c))
00491     charClass = fwlatain;
00492 
00493   return charClass;
00494 }
00495 
00496 static PRBool isJapanese(const char* word)
00497 {
00498   nsString text = NS_ConvertUTF8toUCS2(word);
00499   PRUnichar* p = (PRUnichar*)text.get();
00500   PRUnichar c;
00501     
00502   // it is japanese chunk if it contains any hiragana or katakana.
00503   while((c = *p++))
00504     if( IS_JAPANESE_SPECIFIC(c)) 
00505       return PR_TRUE;
00506 
00507   return PR_FALSE;
00508 }
00509 
00510 PRBool isFWNumeral(const PRUnichar* p1, const PRUnichar* p2)
00511 {
00512   for(;p1<p2;p1++)
00513     if(!IS_JA_FWNUMERAL(*p1)) 
00514       return PR_FALSE;
00515 
00516   return PR_TRUE;
00517 }
00518 
00519 // The japanese tokenizer was added as part of Bug #277354
00520 void Tokenizer::tokenize_japanese_word(char* chunk)
00521 {
00522   PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("entering tokenize_japanese_word(%s)", chunk));
00523     
00524   nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
00525   const PRUnichar* p1 = srcStr.get();
00526   const PRUnichar* p2 = p1;
00527   if(!*p2) return;
00528   
00529   char_class cc = getCharClass(*p2);
00530   while(*(++p2))
00531   {
00532     if(cc == getCharClass(*p2)) 
00533       continue;
00534    
00535     nsCString token = NS_ConvertUCS2toUTF8(p1, p2-p1);
00536     if( (!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2)))      
00537       add(PromiseFlatCString(NS_LITERAL_CSTRING("JA:") + token).get());
00538         
00539     cc = getCharClass(*p2);
00540     p1 = p2;
00541   }
00542 }
00543 
// Converts HTML in |inString| to plain text in |outString| by running the
// HTML parser with a text-only content sink. Scripts, frames and anything
// outside the body are excluded from the output.
nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
{
  nsresult rv = NS_OK;
  // Create a parser
  nsCOMPtr<nsIParser> parser = do_CreateInstance(kParserCID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  // Create the appropriate output sink
  nsCOMPtr<nsIContentSink> sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID,&rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
  NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
  PRUint32 flags = nsIDocumentEncoder::OutputLFLineBreak 
                 | nsIDocumentEncoder::OutputNoScriptContent
                 | nsIDocumentEncoder::OutputNoFramesContent
                 | nsIDocumentEncoder::OutputBodyOnly;

  // 80-column wrap; the sink writes directly into outString.
  textSink->Initialize(&outString, flags, 80);

  parser->SetContentSink(sink);
  nsCOMPtr<nsIDTD> dtd = do_CreateInstance(kNavDTDCID,&rv);
  NS_ENSURE_SUCCESS(rv, rv);

  parser->RegisterDTD(dtd);

  // Synchronous parse of the whole string as text/html.
  return parser->Parse(inString, 0, NS_LITERAL_CSTRING("text/html"), PR_FALSE, PR_TRUE);
}
00572 
// Tokenizes a mutable UTF-8 text buffer: strips HTML, normalizes full-width
// spaces to ASCII spaces, then splits on the delimiter set and dispatches
// each word to the ASCII, Japanese, or generic I18N semantic-unit tokenizer.
// |aText| is modified in place by nsCRT::strtok.
void Tokenizer::tokenize(char* aText)
{
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize: %s", aText));

    // strip out HTML tags before we begin processing
    // uggh but first we have to blow up our string into UCS2
    // since that's what the document encoder wants. UTF8/UCS2, I wish we all
    // spoke the same language here..
    nsString text = NS_ConvertUTF8toUCS2(aText);
    nsString strippedUCS2;
    stripHTML(text, strippedUCS2);
    
    // convert 0x3000(full width space) into 0x0020
    nsString::iterator substr_start, substr_end;
    strippedUCS2.BeginWriting(substr_start);
    strippedUCS2.EndWriting(substr_end);
    while (substr_start != substr_end) {
        if (*substr_start == 0x3000)
            *substr_start = 0x0020;
        ++substr_start;
    }
    
    nsCString strippedStr = NS_ConvertUCS2toUTF8(strippedUCS2);
    char * strippedText = (char *) strippedStr.get(); // bleh
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize stripped html: %s", strippedText));

    char* word;
    char* next = strippedText;
    while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
        if (!*word) continue;
        if (isDecimalNumber(word)) continue;
        if (isASCII(word))
            tokenize_ascii_word(word);
        else if (isJapanese(word))
            tokenize_japanese_word(word);
        else {
            nsresult rv;
            // use I18N  scanner to break this word into meaningful semantic units.
            if (!mScanner) {
                // Scanner is created lazily and cached for the tokenizer's lifetime.
                mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
                NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
                if (NS_FAILED(rv)) {
                    return;
                }
            }
            if (mScanner) {
                mScanner->Start("UTF-8");
                // convert this word from UTF-8 into UCS2.
                NS_ConvertUTF8toUCS2 uword(word);
                ToLowerCase(uword);
                const PRUnichar* utext = uword.get();
                PRInt32 len = uword.Length(), pos = 0, begin, end;
                PRBool gotUnit;
                // Each scanner call yields the next semantic unit [begin, end).
                while (pos < len) {
                    rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
                    if (NS_SUCCEEDED(rv) && gotUnit) {
                        NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
                        add(utfUnit.get());
                        // advance to end of current unit.
                        pos = end;
                    } else {
                        break;
                    }
                }
            }
        }
    }
}
00641 
00642 void Tokenizer::tokenize(const char* str)
00643 {
00644     char* text = nsCRT::strdup(str);
00645     if (text) {
00646         tokenize(text);
00647         nsCRT::free(text);
00648     }
00649 }
00650 
// Invokes |f(token, data)| for every live token. Note the assertion fires by
// design if |f| returns PR_FALSE early, since visitCount then falls short of
// entryCount.
void Tokenizer::visit(PRBool (*f) (Token*, void*), void* data)
{
    VisitClosure closure = { f, data };
    PRUint32 visitCount = PL_DHashTableEnumerate(&mTokenTable, VisitEntry, &closure);
    NS_ASSERTION(visitCount == mTokenTable.entryCount, "visitCount != entryCount!");
    if (visitCount != mTokenTable.entryCount) {
      PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("visitCount != entryCount!: %d vs %d", visitCount, mTokenTable.entryCount));
    }
}
00660 
// Number of distinct tokens currently stored in the table.
inline PRUint32 Tokenizer::countTokens()
{
    return mTokenTable.entryCount;
}
00665 
00666 Token* Tokenizer::copyTokens()
00667 {
00668     PRUint32 count = countTokens();
00669     if (count > 0) {
00670         Token* tokens = new Token[count];
00671         if (tokens) {
00672             Token* tp = tokens;
00673             TokenEnumeration e(&mTokenTable);
00674             while (e.hasMoreTokens())
00675                 *tp++ = *e.nextToken();
00676         }
00677         return tokens;
00678     }
00679     return NULL;
00680 }
00681 
// Returns a fresh enumerator over the current token table; invalidated by
// any mutation of the table.
inline TokenEnumeration Tokenizer::getTokens()
{
    return TokenEnumeration(&mTokenTable);
}
00686 
// Abstract consumer of a tokenized message: subclasses implement
// analyzeTokens() to act on the token set once streaming completes.
class TokenAnalyzer {
public:
    virtual ~TokenAnalyzer() {}
    
    // Called with the fully-populated tokenizer when the message has been read.
    virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
    void setTokenListener(nsIStreamListener *aTokenListener)
    {
      mTokenListener = aTokenListener;
    }

    void setSource(const char *sourceURI) {mTokenSource = sourceURI;}

    nsCOMPtr<nsIStreamListener> mTokenListener;  // listener the message is streamed through
    nsCString mTokenSource;                      // URI of the message being analyzed

};
00703 
// Stream listener that feeds message data into a Tokenizer as it arrives and
// hands the result to a TokenAnalyzer. Also implements nsIMsgHeaderSink so
// headers and attachment metadata are tokenized during the download.
class TokenStreamListener : public nsIStreamListener, nsIMsgHeaderSink {
public:
    NS_DECL_ISUPPORTS
    NS_DECL_NSIREQUESTOBSERVER
    NS_DECL_NSISTREAMLISTENER
    NS_DECL_NSIMSGHEADERSINK
    
    // Takes ownership of |analyzer| (deleted in the destructor).
    TokenStreamListener(TokenAnalyzer* analyzer);
    virtual ~TokenStreamListener();
protected:
    TokenAnalyzer* mAnalyzer;    // owned
    char* mBuffer;               // lazily-allocated accumulation buffer
    PRUint32 mBufferSize;        // current buffer capacity
    PRUint32 mLeftOverCount;     // bytes carried over (word split across reads)
    Tokenizer mTokenizer;
    PRBool mSetAttachmentFlag;
};
00727 
00728 const PRUint32 kBufferSize = 16384;
00729 
// The buffer itself is allocated lazily in OnStartRequest; |analyzer| is
// owned by this listener from here on.
TokenStreamListener::TokenStreamListener(TokenAnalyzer* analyzer)
    :   mAnalyzer(analyzer),
        mBuffer(NULL), mBufferSize(kBufferSize), mLeftOverCount(0),
        mSetAttachmentFlag(PR_FALSE)
{
}
00736 
TokenStreamListener::~TokenStreamListener()
{
    delete[] mBuffer;   // array delete: allocated with new char[]
    delete mAnalyzer;   // listener owns the analyzer
}
00742 
00743 NS_IMPL_ISUPPORTS3(TokenStreamListener, nsIRequestObserver, nsIStreamListener, nsIMsgHeaderSink)
00744 
// nsIMsgHeaderSink: forward the parsed header name/value enumerators to the
// tokenizer (dontCollectAddress is ignored here).
NS_IMETHODIMP TokenStreamListener::ProcessHeaders(nsIUTF8StringEnumerator *aHeaderNames, nsIUTF8StringEnumerator *aHeaderValues, PRBool dontCollectAddress)
{
    mTokenizer.tokenizeHeaders(aHeaderNames, aHeaderValues);
    return NS_OK;
}
00750 
// nsIMsgHeaderSink: tokenize each attachment's content type and display name
// (converted from UTF-16 to UTF-8 for the tokenizer).
NS_IMETHODIMP TokenStreamListener::HandleAttachment(const char *contentType, const char *url, const PRUnichar *displayName, const char *uri, PRBool aIsExternalAttachment)
{
    mTokenizer.tokenizeAttachment(contentType, NS_ConvertUCS2toUTF8(displayName).get());
    return NS_OK;
}
00756 
// The following nsIMsgHeaderSink notifications carry no information the
// classifier needs; each is a successful no-op.
NS_IMETHODIMP TokenStreamListener::OnEndAllAttachments()
{
    return NS_OK;
}

NS_IMETHODIMP TokenStreamListener::OnEndMsgDownload(nsIMsgMailNewsUrl *url)
{
    return NS_OK;
}


NS_IMETHODIMP TokenStreamListener::OnMsgHasRemoteContent(nsIMsgDBHdr * aMsgHdr)
{
    return NS_OK;
}

NS_IMETHODIMP TokenStreamListener::OnEndMsgHeaders(nsIMsgMailNewsUrl *url)
{
    return NS_OK;
}
00777 
00778 
00779 NS_IMETHODIMP TokenStreamListener::GetSecurityInfo(nsISupports * *aSecurityInfo)
00780 {
00781     return NS_OK;
00782 }
// Security info is irrelevant to classification; accept and discard it.
NS_IMETHODIMP TokenStreamListener::SetSecurityInfo(nsISupports * aSecurityInfo)
{
    return NS_OK;
}
00787 
// Not supported by this sink implementation.
NS_IMETHODIMP TokenStreamListener::GetDummyMsgHeader(nsIMsgDBHdr **aMsgDBHdr)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
00792 
// Not supported by this sink implementation.
NS_IMETHODIMP TokenStreamListener::GetProperties(nsIWritablePropertyBag2 * *aProperties)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}
00797 
00798 /* void onStartRequest (in nsIRequest aRequest, in nsISupports aContext); */
00799 NS_IMETHODIMP TokenStreamListener::OnStartRequest(nsIRequest *aRequest, nsISupports *aContext)
00800 {
00801     mLeftOverCount = 0;
00802     if (!mTokenizer)
00803         return NS_ERROR_OUT_OF_MEMORY;
00804     if (!mBuffer)
00805     {
00806         mBuffer = new char[mBufferSize];
00807         if (!mBuffer)
00808             return NS_ERROR_OUT_OF_MEMORY;
00809     }
00810 
00811     // get the url for the channel and set our nsIMsgHeaderSink on it so we get notified 
00812     // about the headers and attachments
00813 
00814     nsCOMPtr<nsIChannel> channel (do_QueryInterface(aRequest));
00815     if (channel)
00816     {
00817         nsCOMPtr<nsIURI> uri;
00818         channel->GetURI(getter_AddRefs(uri));
00819         nsCOMPtr<nsIMsgMailNewsUrl> mailUrl = do_QueryInterface(uri);
00820         if (mailUrl)
00821             mailUrl->SetMsgHeaderSink(NS_STATIC_CAST(nsIMsgHeaderSink*, this));
00822     }
00823 
00824     return NS_OK;
00825 }
00826 
00827 /* void onDataAvailable (in nsIRequest aRequest, in nsISupports aContext, in nsIInputStream aInputStream, in unsigned long aOffset, in unsigned long aCount); */
// Incrementally tokenize the incoming stream.  Bytes are appended after any
// leftover partial token in mBuffer; everything up to the last token
// delimiter is tokenized and consumed, and the trailing partial token is kept
// (mLeftOverCount bytes) for the next call or for OnStopRequest.
NS_IMETHODIMP TokenStreamListener::OnDataAvailable(nsIRequest *aRequest, nsISupports *aContext, nsIInputStream *aInputStream, PRUint32 aOffset, PRUint32 aCount)
{
    nsresult rv = NS_OK;

    while (aCount > 0) {
        // read as much as fits after the leftover bytes, reserving one byte
        // for the NUL terminator appended below
        PRUint32 readCount, totalCount = (aCount + mLeftOverCount);
        if (totalCount >= mBufferSize) {
            readCount = mBufferSize - mLeftOverCount - 1;
        } else {
            readCount = aCount;
        }
        
        char* buffer = mBuffer;
        rv = aInputStream->Read(buffer + mLeftOverCount, readCount, &readCount);
        if (NS_FAILED(rv))
            break;

        if (readCount == 0) {
            // the stream promised more bytes (aCount > 0) but delivered none
            rv = NS_ERROR_UNEXPECTED;
            NS_WARNING("failed to tokenize");
            break;
        }
            
        aCount -= readCount;
        
        /* consume the tokens up to the last legal token delimiter in the buffer. */
        totalCount = (readCount + mLeftOverCount);
        buffer[totalCount] = '\0';
        char* lastDelimiter = NULL;
        // scan backwards so we never tokenize a partial word at buffer end
        char* scan = buffer + totalCount;
        while (scan > buffer) {
            if (strchr(kBayesianFilterTokenDelimiters, *--scan)) {
                lastDelimiter = scan;
                break;
            }
        }
        
        if (lastDelimiter) {
            *lastDelimiter = '\0';
            mTokenizer.tokenize(buffer);

            // shift the unconsumed tail down to the front of the buffer
            PRUint32 consumedCount = 1 + (lastDelimiter - buffer);
            mLeftOverCount = totalCount - consumedCount;
            if (mLeftOverCount)
                memmove(buffer, buffer + consumedCount, mLeftOverCount);
        } else {
            /* didn't find a delimiter, keep the whole buffer around. */
            mLeftOverCount = totalCount;
            // grow by doubling once the leftover fills half the buffer, so a
            // long delimiter-free run can eventually be held in one piece
            if (totalCount >= (mBufferSize / 2)) {
                PRUint32 newBufferSize = mBufferSize * 2;
                char* newBuffer = new char[newBufferSize];
                if (!newBuffer) return NS_ERROR_OUT_OF_MEMORY;
                memcpy(newBuffer, mBuffer, mLeftOverCount);
                delete[] mBuffer;
                mBuffer = newBuffer;
                mBufferSize = newBufferSize;
            }
        }
    }
    
    return rv;
}
00890 
00891 /* void onStopRequest (in nsIRequest aRequest, in nsISupports aContext, in nsresult aStatusCode); */
00892 NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISupports *aContext, nsresult aStatusCode)
00893 {
00894     if (mLeftOverCount) {
00895         /* assume final buffer is complete. */
00896         char* buffer = mBuffer;
00897         buffer[mLeftOverCount] = '\0';
00898         mTokenizer.tokenize(buffer);
00899     }
00900     
00901     /* finally, analyze the tokenized message. */
00902     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("analyze the tokenized message"));
00903     if (mAnalyzer)
00904         mAnalyzer->analyzeTokens(mTokenizer);
00905     
00906     return NS_OK;
00907 }
00908 
00909 /* Implementation file */
00910 NS_IMPL_ISUPPORTS2(nsBayesianFilter, nsIMsgFilterPlugin, nsIJunkMailPlugin)
00911 
00912 nsBayesianFilter::nsBayesianFilter()
00913     :   mGoodCount(0), mBadCount(0), mTrainingDataDirty(PR_FALSE)
00914 {
00915     if (!BayesianFilterLogModule)
00916       BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
00917     
00918     PRInt32 junkThreshold = 0;
00919     nsresult rv;
00920     nsCOMPtr<nsIPrefBranch> pPrefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
00921     if (pPrefBranch)
00922       pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold", &junkThreshold);
00923 
00924     mJunkProbabilityThreshold = ((double) junkThreshold) / 100;
00925     if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
00926       mJunkProbabilityThreshold = kDefaultJunkThreshold;
00927 
00928     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("junk probabilty threshold: %f", mJunkProbabilityThreshold));
00929 
00930     getTrainingFile(getter_AddRefs(mTrainingFile));
00931 
00932     PRBool ok = (mGoodTokens && mBadTokens);
00933     NS_ASSERTION(ok, "error allocating tokenizers");
00934     if (ok)
00935         readTrainingData();
00936     else {
00937       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("error allocating tokenizers"));
00938     }
00939     
00940     // get parameters for training data flushing, from the prefs
00941 
00942     nsCOMPtr<nsIPrefBranch> prefBranch;
00943     
00944     nsCOMPtr<nsIPrefService> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
00945     NS_ASSERTION(NS_SUCCEEDED(rv),"failed accessing preferences service");
00946     rv = prefs->GetBranch(nsnull, getter_AddRefs(prefBranch));
00947     NS_ASSERTION(NS_SUCCEEDED(rv),"failed getting preferences branch");
00948 
00949     rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.minimum_interval",&mMinFlushInterval);
00950     // it is not a good idea to allow a minimum interval of under 1 second
00951     if (NS_FAILED(rv) || (mMinFlushInterval <= 1000) )
00952         mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
00953 
00954     mTimer = do_CreateInstance(NS_TIMER_CONTRACTID, &rv);
00955     NS_ASSERTION(NS_SUCCEEDED(rv), "unable to create a timer; training data will only be written on exit");
00956     
00957     // the timer is not used on object construction, since for
00958     // the time being there are no dirying messages
00959     
00960 }
00961 
00962 void
00963 nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure)
00964 {
00965     // we will flush the training data to disk after enough time has passed
00966     // since the first time a message has been classified after the last flush
00967 
00968     nsBayesianFilter *filter = NS_STATIC_CAST(nsBayesianFilter *, aClosure);
00969     filter->writeTrainingData();
00970 }
00971 
00972 nsBayesianFilter::~nsBayesianFilter()
00973 {
00974     if (mTimer)
00975     {
00976         mTimer->Cancel();
00977         mTimer = nsnull;
00978     }
00979     // call shutdown when we are going away in case we need
00980     // to flush the training set to disk
00981     Shutdown();
00982 }
00983 
00984 // this object is used for one call to classifyMessage or classifyMessages(). 
00985 // So if we're classifying multiple messages, this object will be used for each message.
00986 // It's going to hold a reference to itself, basically, to stay in memory.
00987 class MessageClassifier : public TokenAnalyzer {
00988 public:
00989     MessageClassifier(nsBayesianFilter* aFilter, nsIJunkMailClassificationListener* aListener, 
00990       nsIMsgWindow *aMsgWindow,
00991       PRUint32 aNumMessagesToClassify, const char **aMessageURIs)
00992         :   mFilter(aFilter), mSupports(aFilter), mListener(aListener), mMsgWindow(aMsgWindow)
00993     {
00994       mCurMessageToClassify = 0;
00995       mNumMessagesToClassify = aNumMessagesToClassify;
00996       mMessageURIs = (char **) nsMemory::Alloc(sizeof(char *) * aNumMessagesToClassify);
00997       for (PRUint32 i = 0; i < aNumMessagesToClassify; i++)
00998         mMessageURIs[i] = PL_strdup(aMessageURIs[i]);
00999 
01000     }
01001     
01002     virtual ~MessageClassifier()
01003     {
01004        if (mMessageURIs)
01005        {
01006          NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(mNumMessagesToClassify, mMessageURIs);
01007        }
01008     }
01009     virtual void analyzeTokens(Tokenizer& tokenizer)
01010     {
01011         mFilter->classifyMessage(tokenizer, mTokenSource.get(), mListener);
01012         tokenizer.clearTokens();
01013         classifyNextMessage();
01014     }
01015 
01016     virtual void classifyNextMessage()
01017     {
01018       
01019       if (++mCurMessageToClassify < mNumMessagesToClassify && mMessageURIs[mCurMessageToClassify]) {
01020         PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("classifyNextMessage(%s)", mMessageURIs[mCurMessageToClassify]));
01021         mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify], mMsgWindow, this);
01022       }
01023       else
01024       {
01025         mTokenListener = nsnull; // this breaks the circular ref that keeps this object alive
01026                                  // so we will be destroyed as a result.
01027       }
01028     }
01029 
01030 private:
01031     nsBayesianFilter* mFilter;
01032     nsCOMPtr<nsISupports> mSupports;
01033     nsCOMPtr<nsIJunkMailClassificationListener> mListener;
01034     nsCOMPtr<nsIMsgWindow> mMsgWindow;
01035     PRInt32 mNumMessagesToClassify;
01036     PRInt32 mCurMessageToClassify; // 0-based index
01037     char **mMessageURIs;
01038 };
01039 
01040 nsresult nsBayesianFilter::tokenizeMessage(const char* aMessageURI, nsIMsgWindow *aMsgWindow, TokenAnalyzer* aAnalyzer)
01041 {
01042 
01043     nsCOMPtr <nsIMsgMessageService> msgService;
01044     nsresult rv = GetMessageServiceFromURI(aMessageURI, getter_AddRefs(msgService));
01045     NS_ENSURE_SUCCESS(rv, rv);
01046 
01047     aAnalyzer->setSource(aMessageURI);
01048     return msgService->StreamMessage(aMessageURI, aAnalyzer->mTokenListener, aMsgWindow,
01049                                           nsnull, PR_TRUE /* convert data */, 
01050                                                 "filter", nsnull);
01051 }
01052 
01053 PR_STATIC_CALLBACK(int) compareTokens(const void* p1, const void* p2, void* /* data */)
01054 {
01055     Token *t1 = (Token*) p1, *t2 = (Token*) p2;
01056     double delta = t1->mDistance - t2->mDistance;
01057     return (delta == 0.0 ? 0 : (delta > 0.0 ? 1 : -1));
01058 }
01059 
// larger of two doubles
inline double dmax(double x, double y)
{
    return (x > y) ? x : y;
}
// smaller of two doubles
inline double dmin(double x, double y)
{
    return (x < y) ? x : y;
}
01062 
01063 // Chi square functions are implemented by an incomplete gamma function.
01064 // Note that chi2P's callers multiply the arguments by 2 but chi2P
01065 // divides them by 2 again. Inlining chi2P gives the compiler a
01066 // chance to notice this.
01067 
01068 // Both chi2P and nsIncompleteGammaP set *error negative on domain
01069 // errors and nsIncompleteGammaP sets it posivive on internal errors.
01070 // This may be useful but the chi2P callers treat any error as fatal.
01071 
01072 // Note that converting unsigned ints to floating point can be slow on
01073 // some platforms (like Intel) so use signed quantities for the numeric
01074 // routines.
// Chi-square survival probability with nu degrees of freedom, computed via
// the incomplete gamma function.  On domain error, *error is set negative
// and a dummy 0.0 is returned (callers treat any error as fatal).
static inline double chi2P (double chi2, double nu, PRInt32 *error)
{
    // domain checks; set error and return a dummy value
    if (chi2 < 0.0 || nu <= 0.0)
    {
        *error = -1;
        return 0.0;
    }
    // reversing the arguments is intentional: the gamma shape parameter
    // comes first, and chi-square maps to (a = nu/2, x = chi2/2)
    return nsIncompleteGammaP (nu/2.0, chi2/2.0, error);
}
01086 
01087 void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* messageURI,
01088                                        nsIJunkMailClassificationListener* listener)
01089 {
01090     Token* tokens = tokenizer.copyTokens();
01091     if (!tokens) return;
01092   
01093     // the algorithm in "A Plan For Spam" assumes that you have a large good
01094     // corpus and a large junk corpus.
01095     // that won't be the case with users who first use the junk mail feature
01096     // so, we do certain things to encourage them to train.
01097     //
01098     // if there are no good tokens, assume the message is junk
01099     // this will "encourage" the user to train
01100     // and if there are no bad tokens, assume the message is not junk
01101     // this will also "encourage" the user to train
01102     // see bug #194238
01103     if (listener && !mGoodCount && !mGoodTokens.countTokens()) {
01104       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no good tokens, assume junk"));
01105       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::JUNK));
01106       return;
01107     }
01108     if (listener && !mBadCount && !mBadTokens.countTokens()) {
01109       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("no bad tokens, assume good"));
01110       listener->OnMessageClassified(messageURI, nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
01111       return;
01112     }
01113 
01114     /* this part is similar to the Graham algorithm with some adjustments. */
01115     PRUint32 i, goodclues=0, count = tokenizer.countTokens();
01116     double ngood = mGoodCount, nbad = mBadCount, prob;
01117 
01118     for (i = 0; i < count; ++i) 
01119     {
01120         Token& token = tokens[i];
01121         const char* word = token.mWord;
01122         Token* t = mGoodTokens.get(word);
01123       double hamcount = ((t != NULL) ? t->mCount : 0);
01124         t = mBadTokens.get(word);
01125        double spamcount = ((t != NULL) ? t->mCount : 0);
01126 
01127       // if hamcount and spam count are both 0, we could end up with a divide by 0 error, 
01128       // tread carefully here. (Bug #240819)
01129       double probDenom = (hamcount *nbad + spamcount*ngood);
01130       if (probDenom == 0.0) // nGood and nbad are known to be non zero or we wouldn't be here
01131         probDenom = nbad + ngood; // error case use a value of 1 for hamcount and spamcount if they are both zero.
01132 
01133       prob = (spamcount * ngood)/probDenom;
01134        double n = hamcount + spamcount;
01135        prob =  (0.225 + n * prob) / (.45 + n);
01136        double distance = PR_ABS(prob - 0.5);
01137        if (distance >= .1) 
01138        {
01139          goodclues++;
01140          token.mDistance = distance;
01141          token.mProbability = prob;
01142             PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token.mProbability (%s) is %f", word, token.mProbability));
01143         }
01144       else 
01145         token.mDistance = -1; //ignore clue
01146     }
01147     
01148     // sort the array by the token distances
01149         NS_QuickSort(tokens, count, sizeof(Token), compareTokens, NULL);
01150     PRUint32 first, last = count;
01151     first = (goodclues > 150) ? count - 150 : 0;
01152 
01153     double H = 1.0, S = 1.0;
01154     PRInt32 Hexp = 0, Sexp = 0;
01155     goodclues=0;
01156     int e;
01157 
01158     for (i = first; i < last; ++i) 
01159     {
01160       if (tokens[i].mDistance != -1) 
01161       {
01162         goodclues++;
01163         double value = tokens[i].mProbability;
01164         S *= (1.0 - value);
01165         H *= value;
01166         if ( S < 1e-200 ) 
01167         {
01168           S = frexp(S, &e);
01169           Sexp += e;
01170         }
01171         if ( H < 1e-200 ) 
01172         {
01173           H = frexp(H, &e);
01174           Hexp += e;
01175     }
01176     }
01177     }
01178 
01179     S = log(S) + Sexp * M_LN2;
01180     H = log(H) + Hexp * M_LN2;
01181 
01182     if (goodclues > 0) 
01183     {
01184         PRInt32 chi_error;
01185         S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
01186         if (!chi_error)
01187             H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
01188         // if any error toss the entire calculation
01189         if (!chi_error)
01190             prob = (S-H +1.0) / 2.0;
01191         else
01192             prob = 0.5;
01193     } 
01194     else 
01195         prob = 0.5;
01196 
01197     PRBool isJunk = (prob >= mJunkProbabilityThreshold);
01198     PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("%s is junk probability = (%f)  HAM SCORE:%f SPAM SCORE:%f", messageURI, prob,H,S));
01199 
01200     delete[] tokens;
01201 
01202     if (listener)
01203         listener->OnMessageClassified(messageURI, isJunk ? nsMsgJunkStatus(nsIJunkMailPlugin::JUNK) : nsMsgJunkStatus(nsIJunkMailPlugin::GOOD));
01204 }
01205 
01206 /* void shutdown (); */
// Flush unsaved training data to disk.  Called explicitly at shutdown and
// from the destructor, so a dirty in-memory corpus is never lost.
NS_IMETHODIMP nsBayesianFilter::Shutdown()
{
    if (mTrainingDataDirty)
        writeTrainingData();
    return NS_OK;
}
01213 
01214 /* readonly attribute boolean shouldDownloadAllHeaders; */
NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(PRBool *aShouldDownloadAllHeaders)
{
    // bayesian filters work on the whole msg body currently.
    // headers alone aren't used, so callers need not pre-fetch them all
    *aShouldDownloadAllHeaders = PR_FALSE;
    return NS_OK;
}
01221 
01222 /* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener aListener); */
01223 NS_IMETHODIMP nsBayesianFilter::ClassifyMessage(const char *aMessageURL, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
01224 {
01225     MessageClassifier* analyzer = new MessageClassifier(this, aListener, aMsgWindow, 1, &aMessageURL);
01226     if (!analyzer) return NS_ERROR_OUT_OF_MEMORY;
01227     TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
01228     analyzer->setTokenListener(tokenListener);
01229     return tokenizeMessage(aMessageURL, aMsgWindow, analyzer);
01230 }
01231 
01232 /* void classifyMessages (in unsigned long aCount, [array, size_is (aCount)] in string aMsgURLs, in nsIJunkMailClassificationListener aListener); */
01233 NS_IMETHODIMP nsBayesianFilter::ClassifyMessages(PRUint32 aCount, const char **aMsgURLs, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
01234 {
01235     TokenAnalyzer* analyzer = new MessageClassifier(this, aListener, aMsgWindow, aCount, aMsgURLs);
01236     if (!analyzer) return NS_ERROR_OUT_OF_MEMORY;
01237     TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
01238     analyzer->setTokenListener(tokenListener);
01239     return tokenizeMessage(aMsgURLs[0], aMsgWindow, analyzer);
01240 }
01241 
01242 class MessageObserver : public TokenAnalyzer {
01243 public:
01244     MessageObserver(nsBayesianFilter* filter,
01245                     nsMsgJunkStatus oldClassification,
01246                     nsMsgJunkStatus newClassification,
01247                     nsIJunkMailClassificationListener* listener)
01248         :   mFilter(filter), mSupports(filter), mListener(listener),
01249             mOldClassification(oldClassification),
01250             mNewClassification(newClassification)
01251     {
01252     }
01253     
01254     virtual void analyzeTokens(Tokenizer& tokenizer)
01255     {
01256         mFilter->observeMessage(tokenizer, mTokenSource.get(), mOldClassification,
01257                                 mNewClassification, mListener);
01258         // release reference to listener, which will allow us to go away as well.
01259         mTokenListener = nsnull;
01260 
01261     }
01262 
01263 private:
01264     nsBayesianFilter* mFilter;
01265     nsCOMPtr<nsISupports> mSupports;
01266     nsCOMPtr<nsIJunkMailClassificationListener> mListener;
01267     nsMsgJunkStatus mOldClassification;
01268     nsMsgJunkStatus mNewClassification;
01269 };
01270 
01271 static void forgetTokens(Tokenizer& corpus, TokenEnumeration tokens)
01272 {
01273     // if we are forgetting the tokens for a message, should only 
01274     // subtract 1 from the occurrence count for that token in the training set
01275     // because we assume we only bumped the training set count once per messages
01276     // containing the token. 
01277     while (tokens.hasMoreTokens()) {
01278         Token* token = tokens.nextToken();
01279         corpus.remove(token->mWord);
01280     }
01281 }
01282 
01283 static void rememberTokens(Tokenizer& corpus, TokenEnumeration tokens)
01284 {
01285     while (tokens.hasMoreTokens()) {
01286         Token* token = tokens.nextToken();
01287         corpus.add(token->mWord);
01288     }
01289 }
01290 
// Update the training corpora for a (re)classified message: remove its
// tokens from the old classification's corpus (when the classification
// actually changed), add them to the new one, notify the listener, and
// schedule a flush of training.dat if the data just became dirty.
void nsBayesianFilter::observeMessage(Tokenizer& tokenizer, const char* messageURL,
                                      nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification,
                                      nsIJunkMailClassificationListener* listener)
{
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("observeMessage(%s) old=%d new=%d", messageURL, oldClassification, newClassification));

    PRBool trainingDataWasDirty = mTrainingDataDirty;
    TokenEnumeration tokens = tokenizer.getTokens();

    // Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification.
    // the old code would have removed the tokens for this message then added them back. But this really hurts the message occurrence
    // count for tokens if you just removed training.dat and are re-training. See Bug #237095 for more details.
    // What can we do here? Well we can skip the token removal step if the classifications are the same and assume the user is
    // just re-training. But this then allows users to re-classify the same message on the same training set over and over again
    // leading to data skew. But that's all I can think to do right now to address this.....
    if (oldClassification != newClassification) 
    {
      // remove the tokens from the token set it is currently in
    switch (oldClassification) {
    case nsIJunkMailPlugin::JUNK:
        // remove tokens from junk corpus.
        if (mBadCount > 0) {
            --mBadCount;
            forgetTokens(mBadTokens, tokens);
            mTrainingDataDirty = PR_TRUE;
        }
        break;
    case nsIJunkMailPlugin::GOOD:
        // remove tokens from good corpus.
        if (mGoodCount > 0) {
            --mGoodCount;
            forgetTokens(mGoodTokens, tokens);
            mTrainingDataDirty = PR_TRUE;
        }
        break;
    }
    }

    // no default case: any other status value deliberately leaves both
    // corpora untouched
    switch (newClassification) {
    case nsIJunkMailPlugin::JUNK:
        // put tokens into junk corpus.
        ++mBadCount;
        rememberTokens(mBadTokens, tokens);
        mTrainingDataDirty = PR_TRUE;
        break;
    case nsIJunkMailPlugin::GOOD:
        // put tokens into good corpus.
        ++mGoodCount;
        rememberTokens(mGoodTokens, tokens);
        mTrainingDataDirty = PR_TRUE;
        break;
    }
    
    if (listener)
        listener->OnMessageClassified(messageURL, newClassification);
    
    if (mTrainingDataDirty && !trainingDataWasDirty && ( mTimer != nsnull ))
    {
        // if training data became dirty just now, schedule flush
        // mMinFlushInterval msec from now
        PR_LOG(
            BayesianFilterLogModule, PR_LOG_ALWAYS,
            ("starting training data flush timer %i msec", mMinFlushInterval));
        mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, this, mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
    }
}
01358 
01359 /*
01360     Format of the training file for version 1:
01361     [0xFEEDFACE]
01362     [number good messages][number bad messages]
01363     [number good tokens]
01364     [count][length of word]word
01365     ...
01366     [number bad tokens]
01367     [count][length of word]word
01368     ...
01369  */
01370 
01371 inline int writeUInt32(FILE* stream, PRUint32 value)
01372 {
01373     value = PR_htonl(value);
01374     return fwrite(&value, sizeof(PRUint32), 1, stream);
01375 }
01376 
01377 inline int readUInt32(FILE* stream, PRUint32* value)
01378 {
01379     int n = fread(value, sizeof(PRUint32), 1, stream);
01380     if (n == 1) {
01381         *value = PR_ntohl(*value);
01382     }
01383     return n;
01384 }
01385 
01386 static PRBool writeTokens(FILE* stream, Tokenizer& tokenizer)
01387 {
01388     PRUint32 tokenCount = tokenizer.countTokens();
01389     if (writeUInt32(stream, tokenCount) != 1)
01390         return PR_FALSE;
01391 
01392     if (tokenCount > 0) {
01393         TokenEnumeration tokens = tokenizer.getTokens();
01394         for (PRUint32 i = 0; i < tokenCount; ++i) {
01395             Token* token = tokens.nextToken();
01396             if (writeUInt32(stream, token->mCount) != 1)
01397                 break;
01398             PRUint32 tokenLength = token->mLength;
01399             if (writeUInt32(stream, tokenLength) != 1)
01400                 break;
01401             if (fwrite(token->mWord, tokenLength, 1, stream) != 1)
01402                 break;
01403         }
01404     }
01405     
01406     return PR_TRUE;
01407 }
01408 
// Deserialize one token corpus written by writeTokens().  fileSize bounds the
// declared word lengths so a corrupt length field cannot make us allocate or
// read past the end of the file.
static PRBool readTokens(FILE* stream, Tokenizer& tokenizer, PRInt64 fileSize)
{
    PRUint32 tokenCount;
    if (readUInt32(stream, &tokenCount) != 1)
        return PR_FALSE;

    PRInt64 fpos = ftell(stream);
    if (fpos < 0)
        return PR_FALSE;

    // scratch buffer for the token text, grown on demand below
    PRUint32 bufferSize = 4096;
    char* buffer = new char[bufferSize];
    if (!buffer) return PR_FALSE;

    for (PRUint32 i = 0; i < tokenCount; ++i) {
        PRUint32 count;
        if (readUInt32(stream, &count) != 1)
            break;
        PRUint32 size;
        if (readUInt32(stream, &size) != 1)
            break;
        // account for the two PRUint32s just consumed
        fpos += 8;
        if (size >= bufferSize) {
            // need room for the word plus its NUL terminator; first sanity
            // check the declared size against the bytes left in the file
            delete[] buffer;
            if (fpos + size > fileSize)
                return PR_FALSE;
            while (size >= bufferSize) {
                bufferSize *= 2;
                // overflow guard: an absurd size would wrap bufferSize to 0
                if (bufferSize == 0)
                    return PR_FALSE;
            }
            buffer = new char[bufferSize];
            if (!buffer) return PR_FALSE;
        }
        if (fread(buffer, size, 1, stream) != 1)
            break;
        fpos += size;
        // size < bufferSize is guaranteed above, so this NUL fits
        buffer[size] = '\0';
        tokenizer.add(buffer, count);
    }
    
    delete[] buffer;
    
    return PR_TRUE;
}
01454 
01455 
01456 nsresult nsBayesianFilter::getTrainingFile(nsILocalFile ** aTrainingFile)
01457 {
01458   // should we cache the profile manager's directory?
01459   nsCOMPtr<nsIFile> profileDir;
01460 
01461   nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir));
01462   NS_ENSURE_SUCCESS(rv, rv);
01463   rv = profileDir->Append(NS_LITERAL_STRING("training.dat"));
01464   NS_ENSURE_SUCCESS(rv, rv);
01465   
01466   return profileDir->QueryInterface(NS_GET_IID(nsILocalFile), (void **) aTrainingFile);
01467 }
01468 
01469 static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' };
01470 
01471 void nsBayesianFilter::writeTrainingData()
01472 {
01473   PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("writeTrainingData() entered"));
01474   if (!mTrainingFile) 
01475     return;
01476 
01477   // open the file, and write out training data
01478   FILE* stream;
01479   nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream);
01480   if (NS_FAILED(rv)) 
01481     return;
01482 
01483   if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) &&
01484         (writeUInt32(stream, mGoodCount) == 1) &&
01485         (writeUInt32(stream, mBadCount) == 1) &&
01486          writeTokens(stream, mGoodTokens) &&
01487          writeTokens(stream, mBadTokens))) 
01488   {
01489     NS_WARNING("failed to write training data.");
01490     fclose(stream);
01491     // delete the training data file, since it is potentially corrupt.
01492     mTrainingFile->Remove(PR_FALSE);
01493   } 
01494   else 
01495   {
01496     fclose(stream);
01497     mTrainingDataDirty = PR_FALSE;
01498   }
01499 }
01500 
01501 void nsBayesianFilter::readTrainingData()
01502 {
01503   if (!mTrainingFile) 
01504     return;
01505   
01506   PRBool exists;
01507   nsresult rv = mTrainingFile->Exists(&exists);
01508   if (NS_FAILED(rv) || !exists) 
01509     return;
01510 
01511   FILE* stream;
01512   rv = mTrainingFile->OpenANSIFileDesc("rb", &stream);
01513   if (NS_FAILED(rv)) 
01514     return;
01515 
01516   PRInt64 fileSize;
01517   rv = mTrainingFile->GetFileSize(&fileSize);
01518   if (NS_FAILED(rv)) 
01519     return;
01520 
01521   // FIXME:  should make sure that the tokenizers are empty.
01522   char cookie[4];
01523   if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
01524         (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
01525         (readUInt32(stream, &mGoodCount) == 1) &&
01526         (readUInt32(stream, &mBadCount) == 1) &&
01527          readTokens(stream, mGoodTokens, fileSize) &&
01528          readTokens(stream, mBadTokens, fileSize))) {
01529       NS_WARNING("failed to read training data.");
01530       PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("failed to read training data."));
01531   }
01532   
01533   fclose(stream);
01534 }
01535 
01536 NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(PRBool *aResult)
01537 {
01538   *aResult = (mGoodCount && mGoodTokens.countTokens() ||
01539               mBadCount && mBadTokens.countTokens());
01540   return NS_OK;
01541 }
01542 
01543 /* void setMessageClassification (in string aMsgURL, in long aOldClassification, in long aNewClassification); */
01544 NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(const char *aMsgURL,
01545                                                          nsMsgJunkStatus aOldClassification,
01546                                                          nsMsgJunkStatus aNewClassification,
01547                                                          nsIMsgWindow *aMsgWindow,
01548                                                          nsIJunkMailClassificationListener *aListener)
01549 {
01550     MessageObserver* analyzer = new MessageObserver(this, aOldClassification, aNewClassification, aListener);
01551     if (!analyzer) return NS_ERROR_OUT_OF_MEMORY;
01552     TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
01553     analyzer->setTokenListener(tokenListener);
01554     return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
01555 }
01556 
01557 NS_IMETHODIMP nsBayesianFilter::ResetTrainingData()
01558 {
01559   // clear out our in memory training tokens...
01560   if (mGoodCount && mGoodTokens.countTokens())
01561   {
01562     mGoodTokens.clearTokens();
01563     mGoodCount = 0;
01564   }
01565 
01566   if (mBadCount && mBadTokens.countTokens())
01567   {
01568     mBadTokens.clearTokens();
01569     mBadCount = 0;
01570   }
01571 
01572   // now remove training.dat
01573   if (mTrainingFile)
01574     mTrainingFile->Remove(PR_FALSE);
01575 
01576   return NS_OK;
01577 }
01578