Back to index

lightning-sunbird  0.9+nobinonly
nsBayesianFilter.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
00002 /* ***** BEGIN LICENSE BLOCK *****
00003  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
00004  *
00005  * The contents of this file are subject to the Mozilla Public License Version
00006  * 1.1 (the "License"); you may not use this file except in compliance with
00007  * the License. You may obtain a copy of the License at
00008  * http://www.mozilla.org/MPL/
00009  *
00010  * Software distributed under the License is distributed on an "AS IS" basis,
00011  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
00012  * for the specific language governing rights and limitations under the
00013  * License.
00014  *
00015  * The Original Code is mozilla.org code.
00016  *
00017  * The Initial Developer of the Original Code is
00018  * Netscape Communications Corporation.
00019  * Portions created by the Initial Developer are Copyright (C) 2002
00020  * the Initial Developer. All Rights Reserved.
00021  *
00022  * Contributor(s):
00023  *   Patrick C. Beard <beard@netscape.com>
00024  *
00025  * Alternatively, the contents of this file may be used under the terms of
00026  * either of the GNU General Public License Version 2 or later (the "GPL"),
00027  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
00028  * in which case the provisions of the GPL or the LGPL are applicable instead
00029  * of those above. If you wish to allow use of your version of this file only
00030  * under the terms of either the GPL or the LGPL, and not to allow others to
00031  * use your version of this file under the terms of the MPL, indicate your
00032  * decision by deleting the provisions above and replace them with the notice
00033  * and other provisions required by the GPL or the LGPL. If you do not delete
00034  * the provisions above, a recipient may use your version of this file under
00035  * the terms of any one of the MPL, the GPL or the LGPL.
00036  *
00037  * ***** END LICENSE BLOCK ***** */
00038 
00039 #ifndef nsBayesianFilter_h__
00040 #define nsBayesianFilter_h__
00041 
00042 #include "nsCOMPtr.h"
00043 #include "nsIMsgFilterPlugin.h"
00044 #include "nsISemanticUnitScanner.h"
00045 #include "pldhash.h"
00046 #include "nsITimer.h"
00047 
00048 // XXX can't simply byte align arenas, must at least 2-byte align.
00049 #define PL_ARENA_CONST_ALIGN_MASK 1
00050 #include "plarena.h"
00051 
// Default minimum interval between writes: 15*60*1000, i.e. 15 minutes when
// read as milliseconds. The expansion is parenthesized so the macro behaves as
// a single value inside larger expressions (e.g. `x / MACRO`, `MACRO % y`).
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES             (15*60*1000)
00053 
00054 struct Token;                  // forward declaration; this header only uses Token*
00055 class TokenEnumeration;        // forward declaration; the class itself is defined further down in this header
00056 class TokenAnalyzer;           // forward declaration; used as TokenAnalyzer* by nsBayesianFilter::tokenizeMessage
00057 class nsIMsgWindow;            // forward declaration; used as nsIMsgWindow* by nsBayesianFilter::tokenizeMessage
00058 class nsIMimeHeaders;          // forward declaration; not referenced in the visible part of this header
00059 class nsIUTF8StringEnumerator; // forward declaration; used as nsIUTF8StringEnumerator* by Tokenizer::tokenizeHeaders
00060 
00067 class TokenEnumeration {  // Hands out Token pointers one at a time; constructed from a PLDHashTable*.
00068 public:
00069     TokenEnumeration(PLDHashTable* table);  // table: the hash table whose entries are enumerated (per the class name); non-explicit single-argument constructor
00070     PRBool hasMoreTokens();                 // presumably true while nextToken() still has entries to return -- confirm in the .cpp
00071     Token* nextToken();                     // returns a Token*; what happens once the enumeration is exhausted is not visible in this header
00072     
00073 private:
00074     PRUint32 mEntrySize, mEntryCount, mEntryOffset;  // NOTE(review): look like entry size / entry count / current position bookkeeping -- confirm
00075     char *mEntryAddr, *mEntryLimit;                  // NOTE(review): look like the current address and end address of the entry storage -- confirm
00076 };
00077 
00078 class Tokenizer {  // Owns a hash table of tokens (mTokenTable), an arena pool (mWordPool) and a semantic-unit scanner (mScanner).
00079 public:
00080     Tokenizer();
00081     ~Tokenizer();
00082     // Conversion to int: nonzero exactly when mTokenTable.entryStore is non-NULL.
00083     operator int() { return mTokenTable.entryStore != NULL; }
00084     
00085     Token* get(const char* word);  // returns a Token* for word; presumably a lookup that does not insert -- confirm in the .cpp
00086 
00087     // The training set keeps an occurrence count on each word. This count
00088     // is supposed to count the # of messages the word occurs in.
00089     // NOTE(review): the original comment continued with "When add/remove is called while
00090     // tokenizing a message and NOT the training set," and stopped there; the intended rule is not stated.
00091     Token* add(const char* word, PRUint32 count = 1);    // count defaults to 1; returns a Token*
00092     void remove(const char* word, PRUint32 count = 1);   // count defaults to 1
00093     
00094     PRUint32 countTokens();        // returns a PRUint32; presumably the number of tokens held -- confirm
00095     Token* copyTokens();           // returns a Token*; ownership of the returned storage is not evident from this header
00096     TokenEnumeration getTokens();  // returns a TokenEnumeration by value
00097 
00101     nsresult clearTokens();        // returns an nsresult status code
00102 
00107     void tokenize(char* text);     // overload taking a mutable buffer; NOTE(review): presumably may modify text in place -- confirm
00108     
00112     void tokenize(const char* str);  // overload taking a read-only string
00113     
00117     void tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues);  // takes two enumerators: header names and header values
00118 
00119     void tokenizeAttachment(const char * aContentType, const char * aFileName);  // takes an attachment's content type and file name
00120 
00124     void visit(PRBool (*f) (Token*, void*), void* data);  // f receives a Token* and a void*; presumably data is forwarded to f and its PRBool result controls continuation -- confirm
00125 
00126 private:
00127     char* copyWord(const char* word, PRUint32 len);  // returns a char* for word/len; NOTE(review): presumably allocated from mWordPool -- confirm
00128     void tokenize_ascii_word(char * word);
00129     void tokenize_japanese_word(char* chunk);
00130     inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue, PRBool aTokenizeValue = false);  // aTokenizeValue defaults to false
00131     nsresult stripHTML(const nsAString& inString, nsAString& outString);  // reads inString, writes outString, returns an nsresult
00132 
00133 private:
00134     PLDHashTable mTokenTable;                    // hash table storage; its entryStore field is what operator int() tests
00135     PLArenaPool mWordPool;                       // arena pool (see the 2-byte alignment note near the plarena.h include)
00136     nsCOMPtr<nsISemanticUnitScanner> mScanner;   // reference-holding pointer to a semantic unit scanner
00137 };
00138 
00139 class nsBayesianFilter : public nsIJunkMailPlugin {  // nsIJunkMailPlugin implementation holding two Tokenizer sets (good / bad) plus training-data bookkeeping.
00140 public:
00141     NS_DECL_ISUPPORTS
00142     NS_DECL_NSIMSGFILTERPLUGIN
00143     NS_DECL_NSIJUNKMAILPLUGIN
00144     
00145     nsBayesianFilter();
00146     virtual ~nsBayesianFilter();
00147     
00148     nsresult tokenizeMessage(const char* messageURI, nsIMsgWindow *aMsgWindow, TokenAnalyzer* analyzer);  // takes a message URI, a message window and an analyzer; returns an nsresult
00149     void classifyMessage(Tokenizer& tokens, const char* messageURI, nsIJunkMailClassificationListener* listener);  // takes the message's tokens, its URI and a listener
00150     void observeMessage(Tokenizer& tokens, const char* messageURI, nsMsgJunkStatus oldClassification, nsMsgJunkStatus newClassification, 
00151                         nsIJunkMailClassificationListener* listener);  // takes both the old and the new junk classification for the message
00152 
00153     void writeTrainingData();
00154     void readTrainingData();
00155     nsresult getTrainingFile(nsILocalFile ** aFile);  // aFile is an out-parameter (nsILocalFile**); returns an nsresult
00156     
00157 protected:
00158 
00159     static void TimerCallback(nsITimer* aTimer, void* aClosure);  // static callback taking a timer and an opaque closure pointer
00160 
00161     Tokenizer mGoodTokens, mBadTokens;
00162     double   mJunkProbabilityThreshold;
00163     PRUint32 mGoodCount, mBadCount;
00164     PRPackedBool mTrainingDataDirty;
00165     PRInt32 mMinFlushInterval; // in milliseconds, must be positive
00166                                // and not too close to 0
00167     nsCOMPtr<nsITimer> mTimer;
00168     nsCOMPtr<nsILocalFile> mTrainingFile;
00169 };
00170 
00171 #endif // nsBayesianFilter_h__