Back to index

lightning-sunbird  0.9+nobinonly
Classes | Defines | Enumerations | Functions | Variables
nsBayesianFilter.cpp File Reference
#include "nsCRT.h"
#include "nsBayesianFilter.h"
#include "nsIInputStream.h"
#include "nsIStreamListener.h"
#include "nsNetUtil.h"
#include "nsQuickSort.h"
#include "nsIMsgMessageService.h"
#include "nsMsgUtils.h"
#include "prnetdb.h"
#include "nsIMsgWindow.h"
#include "prlog.h"
#include "nsAppDirectoryServiceDefs.h"
#include "nsUnicharUtils.h"
#include "nsPrintfCString.h"
#include "nsIMIMEHeaderParam.h"
#include "nsNetCID.h"
#include "nsIMimeHeaders.h"
#include "nsMsgMimeCID.h"
#include "nsIMsgMailNewsUrl.h"
#include "nsIMimeMiscStatus.h"
#include "nsIPrefService.h"
#include "nsIPrefBranch.h"
#include "nsIStringEnumerator.h"
#include "nsIMsgHdr.h"
#include "nsParserCIID.h"
#include "nsIParser.h"
#include "nsIHTMLContentSink.h"
#include "nsIContentSerializer.h"
#include "nsLayoutCID.h"
#include "nsIHTMLToTextSink.h"
#include "nsIDocumentEncoder.h"
#include "nsIncompleteGamma.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  Token
 A Token class for the ExprLexer. More...
struct  VisitClosure
class  TokenAnalyzer
class  TokenStreamListener
 This class downloads the raw content of an email message, buffering until complete segments are seen, that is until a linefeed is seen, although any of the valid token separators would do. More...
class  MessageClassifier
class  MessageObserver

Defines

#define kDefaultJunkThreshold   .99
#define FORGED_RECEIVED_HEADER_HINT   NS_LITERAL_CSTRING("may be forged")
#define M_LN2   0.69314718055994530942
#define M_E   2.7182818284590452354
#define IN_RANGE(x, low, high)   ((PRUint16)((x)-(low)) <= (high)-(low))
#define IS_JA_HIRAGANA(x)   IN_RANGE(x, 0x3040, 0x309F)
#define IS_JA_KATAKANA(x)   (IN_RANGE(x^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F)))
#define IS_JA_KANJI(x)   (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))
#define IS_JA_KUTEN(x)   (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E))
#define IS_JA_TOUTEN(x)   (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C))
#define IS_JA_SPACE(x)   ((x)==0x3000)
#define IS_JA_FWLATAIN(x)   IN_RANGE(x, 0xFF01, 0xFF5E)
#define IS_JA_FWNUMERAL(x)   IN_RANGE(x, 0xFF10, 0xFF19)
#define IS_JAPANESE_SPECIFIC(x)   (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))

Enumerations

enum  char_class {
  others = 0, space, hiragana, katakana,
  kanji, kuten, touten, kigou,
  fwlatain, ascii
}

Functions

static NS_DEFINE_CID (kParserCID, NS_PARSER_CID)
static NS_DEFINE_CID (kNavDTDCID, NS_CNAVDTD_CID)
static PLDHashOperator PR_CALLBACK VisitEntry (PLDHashTable *table, PLDHashEntryHdr *entry, PRUint32 number, void *arg)
static PRBool isDecimalNumber (const char *word)
static PRBool isASCII (const char *word)
PRBool isUpperCase (char c)
static char * toLowerCase (char *str)
char_class getCharClass (PRUnichar c)
static PRBool isJapanese (const char *word)
PRBool isFWNumeral (const PRUnichar *p1, const PRUnichar *p2)
 PR_STATIC_CALLBACK (int) compareTokens(const void *p1
 return (delta==0.0?0:(delta > 0.0?1:-1))
double dmax (double x, double y)
double dmin (double x, double y)
static double chi2P (double chi2, double nu, PRInt32 *error)
static void forgetTokens (Tokenizer &corpus, TokenEnumeration tokens)
static void rememberTokens (Tokenizer &corpus, TokenEnumeration tokens)
int writeUInt32 (FILE *stream, PRUint32 value)
int readUInt32 (FILE *stream, PRUint32 *value)
static PRBool writeTokens (FILE *stream, Tokenizer &tokenizer)
static PRBool readTokens (FILE *stream, Tokenizer &tokenizer, PRInt64 fileSize)

Variables

static PRLogModuleInfo * BayesianFilterLogModule = nsnull
static const char * kBayesianFilterTokenDelimiters = " \t\n\r\f."
static int kMinLengthForToken = 3
static int kMaxLengthForToken = 12
static const PLDHashTableOps gTokenTableOps
const PRUint32 kBufferSize = 16384
const void * p2
const void void* Token* t1 = (Token*) p1
const void void* Token* t2 = (Token*) p2
double delta = t1->mDistance - t2->mDistance
static const char kMagicCookie [] = { '\xFE', '\xED', '\xFA', '\xCE' }

Define Documentation

Definition at line 95 of file nsBayesianFilter.cpp.

#define IN_RANGE (   x,
  low,
  high 
)    ((PRUint16)((x)-(low)) <= (high)-(low))

Definition at line 449 of file nsBayesianFilter.cpp.

#define IS_JA_FWLATAIN (   x)    IN_RANGE(x, 0xFF01, 0xFF5E)

Definition at line 458 of file nsBayesianFilter.cpp.

#define IS_JA_FWNUMERAL (   x)    IN_RANGE(x, 0xFF10, 0xFF19)

Definition at line 459 of file nsBayesianFilter.cpp.

#define IS_JA_HIRAGANA (   x)    IN_RANGE(x, 0x3040, 0x309F)

Definition at line 451 of file nsBayesianFilter.cpp.

#define IS_JA_KANJI (   x)    (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))

Definition at line 454 of file nsBayesianFilter.cpp.

#define IS_JA_KATAKANA (   x)    (IN_RANGE(x^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F)))

Definition at line 453 of file nsBayesianFilter.cpp.

#define IS_JA_KUTEN (   x)    (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E))

Definition at line 455 of file nsBayesianFilter.cpp.

#define IS_JA_SPACE (   x)    ((x)==0x3000)

Definition at line 457 of file nsBayesianFilter.cpp.

#define IS_JA_TOUTEN (   x)    (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C))

Definition at line 456 of file nsBayesianFilter.cpp.

#define IS_JAPANESE_SPECIFIC (   x)    (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))

Definition at line 461 of file nsBayesianFilter.cpp.

Definition at line 90 of file nsBayesianFilter.cpp.

#define M_E   2.7182818284590452354

Definition at line 102 of file nsBayesianFilter.cpp.

#define M_LN2   0.69314718055994530942

Definition at line 98 of file nsBayesianFilter.cpp.


Enumeration Type Documentation

enum char_class
Enumerator:
others 
space 
hiragana 
katakana 
kanji 
kuten 
touten 
kigou 
fwlatain 
ascii 

Definition at line 463 of file nsBayesianFilter.cpp.


Function Documentation

static double chi2P ( double  chi2,
double  nu,
PRInt32 *  error 
) [inline, static]

Definition at line 1075 of file nsBayesianFilter.cpp.

{
    // Arguments outside the accepted domain: flag the error through the
    // out-parameter and hand back a placeholder value.
    if (nu <= 0.0 || chi2 < 0.0) {
        *error = -1;
        return 0.0;
    }

    // nu/2 goes first and chi2/2 second; this swap relative to our own
    // parameter order is deliberate.
    const double halfNu = nu / 2.0;
    const double halfChi2 = chi2 / 2.0;
    return nsIncompleteGammaP(halfNu, halfChi2, error);
}

Here is the call graph for this function:

Here is the caller graph for this function:

double dmax ( double  x,
double  y 
) [inline]

Definition at line 1060 of file nsBayesianFilter.cpp.

{
    // Larger of the two values; y is returned when x is not strictly greater.
    if (x > y)
        return x;
    return y;
}

Here is the caller graph for this function:

double dmin ( double  x,
double  y 
) [inline]

Definition at line 1061 of file nsBayesianFilter.cpp.

{
    // Smaller of the two values; y is returned when x is not strictly smaller.
    if (x < y)
        return x;
    return y;
}
static void forgetTokens ( Tokenizer &  corpus,
TokenEnumeration  tokens 
) [static]

Definition at line 1271 of file nsBayesianFilter.cpp.

{
    // Forgetting a message removes each of its tokens from the corpus once.
    // The training-set count is assumed to have been bumped only once per
    // message containing the token, so a single remove per token undoes it.
    for (; tokens.hasMoreTokens(); ) {
        Token* current = tokens.nextToken();
        corpus.remove(current->mWord);
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

char_class getCharClass ( PRUnichar  c)

Definition at line 476 of file nsBayesianFilter.cpp.

{
  // The range tests run in a fixed order and the first match decides the
  // class; anything that matches none of them is reported as 'others'.
  if (IS_JA_HIRAGANA(c))
    return hiragana;
  if (IS_JA_KATAKANA(c))
    return katakana;
  if (IS_JA_KANJI(c))
    return kanji;
  if (IS_JA_KUTEN(c))
    return kuten;
  if (IS_JA_TOUTEN(c))
    return touten;
  if (IS_JA_FWLATAIN(c))
    return fwlatain;

  return others;
}

Here is the caller graph for this function:

static PRBool isASCII ( const char *  word) [static]

Definition at line 283 of file nsBayesianFilter.cpp.

{
    // Walk the NUL-terminated string as unsigned bytes so that values above
    // 127 compare as such regardless of the signedness of plain char.
    for (const unsigned char* cursor = (const unsigned char*)word; *cursor; ++cursor) {
        if (*cursor > 127)
            return PR_FALSE;
    }
    return PR_TRUE;
}

Here is the caller graph for this function:

static PRBool isDecimalNumber ( const char *  word) [static]

Definition at line 271 of file nsBayesianFilter.cpp.

{
    // Skip a single optional leading minus sign.
    const char* cursor = (*word == '-') ? word + 1 : word;

    // Every remaining character must be a decimal digit.
    for (; *cursor; ++cursor) {
        if (!isdigit((unsigned char) *cursor))
            return PR_FALSE;
    }

    // Note: an empty string and a lone "-" also reach this point.
    return PR_TRUE;
}

Here is the caller graph for this function:

PRBool isFWNumeral ( const PRUnichar *  p1,
const PRUnichar *  p2 
)

Definition at line 510 of file nsBayesianFilter.cpp.

{
  // True only if every code unit in [p1, p2) satisfies IS_JA_FWNUMERAL;
  // an empty range yields PR_TRUE.
  while (p1 < p2) {
    if (!IS_JA_FWNUMERAL(*p1))
      return PR_FALSE;
    ++p1;
  }

  return PR_TRUE;
}

Here is the caller graph for this function:

static PRBool isJapanese ( const char *  word) [static]

Definition at line 496 of file nsBayesianFilter.cpp.

{
  // Widen the UTF-8 input so individual code units can be range-tested.
  nsString wide = NS_ConvertUTF8toUCS2(word);

  // The chunk is treated as Japanese as soon as one code unit falls into the
  // ranges covered by IS_JAPANESE_SPECIFIC.
  for (const PRUnichar* cursor = wide.get(); *cursor; ++cursor) {
    if (IS_JAPANESE_SPECIFIC(*cursor))
      return PR_TRUE;
  }

  return PR_FALSE;
}

Here is the call graph for this function:

Here is the caller graph for this function:

PRBool isUpperCase ( char  c) [inline]

Definition at line 294 of file nsBayesianFilter.cpp.

{
    // ASCII-only test: true for 'A' through 'Z', independent of locale.
    return (c >= 'A') && (c <= 'Z');
}

Here is the caller graph for this function:

static NS_DEFINE_CID ( kParserCID  ,
NS_PARSER_CID   
) [static]
static NS_DEFINE_CID ( kNavDTDCID  ,
NS_CNAVDTD_CID   
) [static]
PR_STATIC_CALLBACK ( int  ) const
static PRBool readTokens ( FILE *  stream,
Tokenizer &  tokenizer,
PRInt64  fileSize 
) [static]

Definition at line 1409 of file nsBayesianFilter.cpp.

{
    // Reads a block of token records from stream into tokenizer.
    // Layout: a 32-bit record count, then per record a 32-bit occurrence
    // count, a 32-bit word length, and the word bytes (no terminator).
    // fileSize is the total size of the file and is used to reject word
    // lengths that cannot possibly fit.
    // Returns PR_FALSE on a bad header, an impossible word length, or an
    // allocation failure. A short read in the middle of the records stops
    // reading but still returns PR_TRUE, keeping the tokens read so far.
    // NOTE(review): confirm that this best-effort behaviour is what callers want.
    PRUint32 tokenCount;
    if (readUInt32(stream, &tokenCount) != 1)
        return PR_FALSE;

    // Track the file position ourselves so each record can be bounds-checked.
    PRInt64 fpos = ftell(stream);
    if (fpos < 0)
        return PR_FALSE;

    PRUint32 bufferSize = 4096;
    char* buffer = new char[bufferSize];
    if (!buffer) return PR_FALSE;

    for (PRUint32 i = 0; i < tokenCount; ++i) {
        PRUint32 count;
        if (readUInt32(stream, &count) != 1)
            break;
        PRUint32 size;
        if (readUInt32(stream, &size) != 1)
            break;
        fpos += 8;

        // Validate the declared length for every record, not only when the
        // buffer has to grow: a length that runs past the end of the file
        // means the data is corrupt.
        if (fpos + size > fileSize) {
            delete[] buffer;
            return PR_FALSE;
        }

        // Grow the buffer (by doubling) until it holds the word plus its
        // terminating NUL.
        if (size >= bufferSize) {
            delete[] buffer;
            while (size >= bufferSize) {
                bufferSize *= 2;
                // Doubling wrapped around to zero: the length is unusable.
                if (bufferSize == 0)
                    return PR_FALSE;
            }
            buffer = new char[bufferSize];
            if (!buffer) return PR_FALSE;
        }
        if (fread(buffer, size, 1, stream) != 1)
            break;
        fpos += size;
        buffer[size] = '\0';
        tokenizer.add(buffer, count);
    }
    
    delete[] buffer;
    
    return PR_TRUE;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int readUInt32 ( FILE *  stream,
PRUint32 *  value 
) [inline]

Definition at line 1377 of file nsBayesianFilter.cpp.

{
    // Read exactly one 32-bit item; fread reports how many items it got.
    int itemsRead = fread(value, sizeof(PRUint32), 1, stream);
    if (itemsRead != 1)
        return itemsRead;

    // The value was read in network byte order; convert it to host order.
    *value = PR_ntohl(*value);
    return itemsRead;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void rememberTokens ( Tokenizer &  corpus,
TokenEnumeration  tokens 
) [static]

Definition at line 1283 of file nsBayesianFilter.cpp.

{
    // Add the word of every enumerated token to the corpus.
    for (; tokens.hasMoreTokens(); ) {
        Token* current = tokens.nextToken();
        corpus.add(current->mWord);
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

return (delta==0.0?0:(delta > 0.0?1:-1))
static char* toLowerCase ( char *  str) [static]

Definition at line 296 of file nsBayesianFilter.cpp.

{
    // Lower-case ASCII 'A'..'Z' in place; every other byte is left untouched.
    // Returns the same pointer it was given.
    for (char* cursor = str; *cursor; ++cursor) {
        if (isUpperCase(*cursor))
            *cursor = *cursor + ('a' - 'A');
    }
    return str;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static PLDHashOperator PR_CALLBACK VisitEntry ( PLDHashTable *  table,
PLDHashEntryHdr *  entry,
PRUint32  number,
void *  arg 
) [static]

Definition at line 151 of file nsBayesianFilter.cpp.

{
    // The hash entry is viewed as a Token, and arg carries the VisitClosure
    // holding the callback and its user data.
    Token* visited = NS_STATIC_CAST(Token*, entry);
    VisitClosure* ctx = NS_REINTERPRET_CAST(VisitClosure*, arg);

    // Keep enumerating while the callback returns a true value; stop otherwise.
    if (ctx->f(visited, ctx->data))
        return PL_DHASH_NEXT;
    return PL_DHASH_STOP;
}

Here is the caller graph for this function:

static PRBool writeTokens ( FILE *  stream,
Tokenizer &  tokenizer 
) [static]

Definition at line 1386 of file nsBayesianFilter.cpp.

{
    // Writes every token of tokenizer to stream.
    // Layout: a 32-bit record count, then per token a 32-bit occurrence
    // count, a 32-bit word length, and the word bytes (no terminator).
    // Returns PR_TRUE only if every write succeeded. A failed write returns
    // PR_FALSE so the caller never mistakes a truncated file for a complete one.
    PRUint32 tokenCount = tokenizer.countTokens();
    if (writeUInt32(stream, tokenCount) != 1)
        return PR_FALSE;

    if (tokenCount > 0) {
        TokenEnumeration tokens = tokenizer.getTokens();
        for (PRUint32 i = 0; i < tokenCount; ++i) {
            Token* token = tokens.nextToken();
            if (writeUInt32(stream, token->mCount) != 1)
                return PR_FALSE;
            PRUint32 tokenLength = token->mLength;
            if (writeUInt32(stream, tokenLength) != 1)
                return PR_FALSE;
            // fwrite is asked for one item of tokenLength bytes, so any
            // result other than 1 means the word was not fully written.
            if (fwrite(token->mWord, tokenLength, 1, stream) != 1)
                return PR_FALSE;
        }
    }
    
    return PR_TRUE;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int writeUInt32 ( FILE *  stream,
PRUint32  value 
) [inline]

Definition at line 1371 of file nsBayesianFilter.cpp.

{
    // Convert to network byte order, then write it as one 32-bit item.
    // The result is fwrite's item count (1 on success).
    const PRUint32 networkOrder = PR_htonl(value);
    return fwrite(&networkOrder, sizeof(PRUint32), 1, stream);
}

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

Definition at line 85 of file nsBayesianFilter.cpp.

double delta = t1->mDistance - t2->mDistance

Definition at line 1056 of file nsBayesianFilter.cpp.

const char* kBayesianFilterTokenDelimiters = " \t\n\r\f." [static]

Definition at line 91 of file nsBayesianFilter.cpp.

Definition at line 728 of file nsBayesianFilter.cpp.

const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' } [static]

Definition at line 1469 of file nsBayesianFilter.cpp.

int kMaxLengthForToken = 12 [static]

Definition at line 93 of file nsBayesianFilter.cpp.

int kMinLengthForToken = 3 [static]

Definition at line 92 of file nsBayesianFilter.cpp.

Definition at line 1053 of file nsBayesianFilter.cpp.

const void void* Token* t1 = (Token*) p1

Definition at line 1055 of file nsBayesianFilter.cpp.

Definition at line 1055 of file nsBayesianFilter.cpp.