Back to index

lightning-sunbird  0.9+nobinonly
Public Member Functions | Private Member Functions | Private Attributes
Tokenizer Class Reference

#include <nsBayesianFilter.h>

Collaboration diagram for Tokenizer:
Collaboration graph
[legend]

List of all members.

Public Member Functions

 Tokenizer ()
 ~Tokenizer ()
 operator int ()
Token * get (const char *word)
Token * add (const char *word, PRUint32 count=1)
void remove (const char *word, PRUint32 count=1)
PRUint32 countTokens ()
Token * copyTokens ()
TokenEnumeration getTokens ()
nsresult clearTokens ()
 Clears out the previous message tokens.
void tokenize (char *text)
 Assumes that text is mutable and can be nsCRT::strtok'd.
void tokenize (const char *str)
 Copies the string before tokenizing.
void tokenizeHeaders (nsIUTF8StringEnumerator *aHeaderNames, nsIUTF8StringEnumerator *aHeaderValues)
 Creates specific tokens based on the mime headers for the message being tokenized.
void tokenizeAttachment (const char *aContentType, const char *aFileName)
void visit (PRBool(*f)(Token *, void *), void *data)
 Calls passed-in function for each token in the table.

Private Member Functions

char * copyWord (const char *word, PRUint32 len)
void tokenize_ascii_word (char *word)
void tokenize_japanese_word (char *chunk)
void addTokenForHeader (const char *aTokenPrefix, nsACString &aValue, PRBool aTokenizeValue=false)
nsresult stripHTML (const nsAString &inString, nsAString &outString)

Private Attributes

PLDHashTable mTokenTable
PLArenaPool mWordPool
nsCOMPtr< nsISemanticUnitScanner > mScanner

Detailed Description

Definition at line 78 of file nsBayesianFilter.h.


Constructor & Destructor Documentation

Definition at line 171 of file nsBayesianFilter.cpp.

{
    // Set up the arena that backs the copied word strings, then the hash
    // table that maps each word to its Token entry.
    PL_INIT_ARENA_POOL(&mWordPool, "Words Arena", 16384);

    PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
    NS_ASSERTION(ok, "mTokenTable failed to initialize");
    if (ok)
        return;

    // A constructor cannot report failure, so the problem is only logged here.
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize"));
}

Here is the call graph for this function:

Definition at line 180 of file nsBayesianFilter.cpp.

Here is the call graph for this function:


Member Function Documentation

Token * Tokenizer::add ( const char *  word,
PRUint32  count = 1 
)

Definition at line 222 of file nsBayesianFilter.cpp.

{
    // Adds |count| occurrences of |word| to the token table and returns the
    // matching Token, or NULL when no entry could be obtained or the word
    // could not be copied into the arena.
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("add word: %s (count=%d)", word, count));

    PLDHashEntryHdr* slot = PL_DHashTableOperate(&mTokenTable, word, PL_DHASH_ADD);
    Token* token = NS_STATIC_CAST(Token*, slot);
    if (!token)
        return token;

    // An entry that already carries a word only needs its counter bumped.
    if (token->mWord != NULL) {
        token->mCount += count;
        PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (count=%d) (mCount=%d)", word, count, token->mCount));
        return token;
    }

    // Otherwise the entry has no word yet: give it its own copy of the text.
    PRUint32 len = strlen(word);
    NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
    if (len == 0) {
        PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding zero length word to tokenizer"));
    }

    token->mWord = copyWord(word, len);
    NS_ASSERTION(token->mWord, "copyWord failed");
    if (token->mWord == NULL) {
        // Do not leave a word-less entry behind in the table.
        PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("copyWord failed: %s (%d)", word, len));
        PL_DHashTableRawRemove(&mTokenTable, slot);
        return NULL;
    }

    token->mLength = len;
    token->mCount = count;
    token->mProbability = 0;
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("adding word to tokenizer: %s (len=%d) (count=%d)", word, len, count));
    return token;
}

Here is the call graph for this function:

Here is the caller graph for this function:

void Tokenizer::addTokenForHeader ( const char *  aTokenPrefix,
nsACString &  aValue,
PRBool  aTokenizeValue = false 
) [inline, private]

Definition at line 306 of file nsBayesianFilter.cpp.

{
  // Adds "<prefix>:<value>" tokens for one header. The value is lower-cased
  // in place first. When aTokenizeValue is set, the value is split on
  // kBayesianFilterTokenDelimiters and one token is added per ASCII,
  // non-numeric piece; otherwise the whole value becomes a single token.
  if (!aValue.Length())
    return;

  ToLowerCase(aValue);

  if (!aTokenizeValue)
  {
    add(PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":") + aValue).get());
    return;
  }

  // nsCRT::strtok is handed a writable pointer into the flattened value's
  // buffer, hence the cast that drops const.
  const nsPromiseFlatCString &flatValue = PromiseFlatCString(aValue);
  char* cursor = (char *) flatValue.get();
  for (char* piece = nsCRT::strtok(cursor, kBayesianFilterTokenDelimiters, &cursor);
       piece != NULL;
       piece = nsCRT::strtok(cursor, kBayesianFilterTokenDelimiters, &cursor))
  {
    // Skip empty pieces and plain decimal numbers; only ASCII pieces are added.
    if (piece[0] != '\0' && !isDecimalNumber(piece) && isASCII(piece))
    {
      add(PromiseFlatCString(nsDependentCString(aTokenPrefix) + NS_LITERAL_CSTRING(":") + nsDependentCString(piece)).get());
    }
  }
}

Here is the call graph for this function:

Here is the caller graph for this function:

Clears out the previous message tokens.

Definition at line 187 of file nsBayesianFilter.cpp.

{
    // The tokenizer is re-used across messages, so this runs after every
    // message classification to throw away the previous message's tokens.

    // No entry storage means there is nothing to release or rebuild.
    if (!mTokenTable.entryStore)
        return NS_OK;

    // Tear down the table and the arena holding the word copies, then
    // re-create an empty table with the same parameters as the constructor.
    PL_DHashTableFinish(&mTokenTable);
    PL_FreeArenaPool(&mWordPool);

    PRBool ok = PL_DHashTableInit(&mTokenTable, &gTokenTableOps, nsnull, sizeof(Token), 256);
    NS_ASSERTION(ok, "mTokenTable failed to initialize");
    if (ok)
        return NS_OK;

    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("mTokenTable failed to initialize in clearTokens()"));
    return NS_ERROR_OUT_OF_MEMORY;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 666 of file nsBayesianFilter.cpp.

{
    // Returns a new[]-allocated array holding a copy of every token in the
    // table, or NULL when the table is empty or the allocation failed.
    // NOTE(review): the caller presumably owns the array and must delete[] it.
    const PRUint32 total = countTokens();
    if (total == 0)
        return NULL;

    Token* snapshot = new Token[total];
    if (!snapshot)
        return snapshot;

    // Walk the table once, copying each entry into the next array slot.
    TokenEnumeration e(&mTokenTable);
    for (Token* out = snapshot; e.hasMoreTokens(); ++out)
        *out = *e.nextToken();

    return snapshot;
}

Here is the call graph for this function:

Here is the caller graph for this function:

char * Tokenizer::copyWord ( const char *  word,
PRUint32  len 
) [private]

Definition at line 204 of file nsBayesianFilter.cpp.

{
    // Copies |len| + 1 bytes of |word| into storage taken from mWordPool and
    // returns the copy, or NULL when the arena allocation failed. The extra
    // byte presumably carries the terminating NUL, so |word| must be readable
    // for len + 1 bytes.
    const PRUint32 bytes = len + 1;

    void* storage;
    PL_ARENA_ALLOCATE(storage, &mWordPool, bytes);

    char* copy = NS_REINTERPRET_CAST(char*, storage);
    if (copy)
        memcpy(copy, word, bytes);
    return copy;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 661 of file nsBayesianFilter.cpp.

{
    // Returns the token table's entryCount field, i.e. how many entries the
    // hash table currently holds.
    return mTokenTable.entryCount;
}

Here is the caller graph for this function:

Token * Tokenizer::get ( const char *  word) [inline]

Definition at line 214 of file nsBayesianFilter.cpp.

Here is the call graph for this function:

Here is the caller graph for this function:

Definition at line 682 of file nsBayesianFilter.cpp.

Here is the caller graph for this function:

Tokenizer::operator int ( ) [inline]

Definition at line 83 of file nsBayesianFilter.h.

{ return mTokenTable.entryStore != NULL; } // nonzero when the token table has entry storage, zero otherwise
void Tokenizer::remove ( const char *  word,
PRUint32  count = 1 
)

Definition at line 253 of file nsBayesianFilter.cpp.

{
    // Subtracts |count| occurrences of |word|. The entry is removed from the
    // table once its count reaches zero. Unknown words are ignored, and a
    // request larger than the stored count is logged and left untouched.
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d)", word, count));

    Token* token = get(word);
    if (!token)
        return;

    NS_ASSERTION(token->mCount >= count, "token count underflow");
    if (token->mCount < count) {
        PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("token count underflow: %s (count=%d) (mCount=%d)", word, count, token->mCount));
        return;
    }

    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("remove word: %s (count=%d) (mCount=%d)", word, count, token->mCount));
    token->mCount -= count;
    if (token->mCount == 0)
        PL_DHashTableRawRemove(&mTokenTable, token);
}

Here is the call graph for this function:

Here is the caller graph for this function:

nsresult Tokenizer::stripHTML ( const nsAString &  inString,
nsAString &  outString 
) [private]

Definition at line 544 of file nsBayesianFilter.cpp.

{
  // Runs inString through an HTML parser wired to a plain-text content sink
  // that writes into outString. Returns the result of parser->Parse(), or an
  // error code as soon as one of the required components cannot be created.
  nsresult rv = NS_OK;
  // Create a parser
  nsCOMPtr<nsIParser> parser = do_CreateInstance(kParserCID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  // Create the appropriate output sink
  nsCOMPtr<nsIContentSink> sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID,&rv);
  NS_ENSURE_SUCCESS(rv, rv);

  // The same sink object is configured through its nsIHTMLToTextSink interface;
  // a sink that does not implement it is reported as NS_ERROR_FAILURE.
  nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
  NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
  // Output options, named by their nsIDocumentEncoder constants: LF line
  // breaks, no script content, no frames content, body only.
  PRUint32 flags = nsIDocumentEncoder::OutputLFLineBreak 
                 | nsIDocumentEncoder::OutputNoScriptContent
                 | nsIDocumentEncoder::OutputNoFramesContent
                 | nsIDocumentEncoder::OutputBodyOnly;

  // NOTE(review): 80 is presumably the wrap column -- confirm against
  // nsIHTMLToTextSink. The return value of Initialize() is not checked.
  textSink->Initialize(&outString, flags, 80);

  parser->SetContentSink(sink);
  nsCOMPtr<nsIDTD> dtd = do_CreateInstance(kNavDTDCID,&rv);
  NS_ENSURE_SUCCESS(rv, rv);

  // The return value of RegisterDTD() is not checked either.
  parser->RegisterDTD(dtd);

  // Parse the input as "text/html"; whatever Parse() returns is handed back.
  return parser->Parse(inString, 0, NS_LITERAL_CSTRING("text/html"), PR_FALSE, PR_TRUE);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void Tokenizer::tokenize ( char *  text)

Assumes that text is mutable and can be nsCRT::strtok'd.

Definition at line 573 of file nsBayesianFilter.cpp.

{
    // Tokenizes one message body: strips HTML, normalizes full-width spaces,
    // splits the result on kBayesianFilterTokenDelimiters and routes each word
    // to the ASCII, Japanese or semantic-unit-scanner path.
    // Note: this body names its parameter aText.
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize: %s", aText));

    // strip out HTML tags before we begin processing
    // uggh but first we have to blow up our string into UCS2
    // since that's what the document encoder wants. UTF8/UCS2, I wish we all
    // spoke the same language here..
    nsString text = NS_ConvertUTF8toUCS2(aText);
    nsString strippedUCS2;
    // NOTE(review): the nsresult of stripHTML() is ignored; on failure the code
    // simply continues with whatever ended up in strippedUCS2.
    stripHTML(text, strippedUCS2);
    
    // convert 0x3000(full width space) into 0x0020
    nsString::iterator substr_start, substr_end;
    strippedUCS2.BeginWriting(substr_start);
    strippedUCS2.EndWriting(substr_end);
    while (substr_start != substr_end) {
        if (*substr_start == 0x3000)
            *substr_start = 0x0020;
        ++substr_start;
    }
    
    // Back to UTF-8. nsCRT::strtok below is given a non-const pointer into
    // strippedStr's own buffer, hence the cast that drops const.
    nsCString strippedStr = NS_ConvertUCS2toUTF8(strippedUCS2);
    char * strippedText = (char *) strippedStr.get(); // bleh
    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("tokenize stripped html: %s", strippedText));

    char* word;
    char* next = strippedText;
    while ((word = nsCRT::strtok(next, kBayesianFilterTokenDelimiters, &next)) != NULL) {
        // Empty words and plain decimal numbers never become tokens.
        if (!*word) continue;
        if (isDecimalNumber(word)) continue;
        if (isASCII(word))
            tokenize_ascii_word(word);
        else if (isJapanese(word))
            tokenize_japanese_word(word);
        else {
            nsresult rv;
            // use I18N scanner to break this word into meaningful semantic units.
            // The scanner is created on first use and kept in mScanner.
            if (!mScanner) {
                mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
                NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
                if (NS_FAILED(rv)) {
                    // Gives up on the whole text: the remaining words are not tokenized.
                    return;
                }
            }
            if (mScanner) {
                mScanner->Start("UTF-8");
                // convert this word from UTF-8 into UCS2.
                NS_ConvertUTF8toUCS2 uword(word);
                ToLowerCase(uword);
                const PRUnichar* utext = uword.get();
                PRInt32 len = uword.Length(), pos = 0, begin, end;
                PRBool gotUnit;
                // Ask the scanner for one unit at a time, starting at pos; stop
                // at the first failure or when no further unit is reported.
                while (pos < len) {
                    rv = mScanner->Next(utext, len, pos, PR_TRUE, &begin, &end, &gotUnit);
                    if (NS_SUCCEEDED(rv) && gotUnit) {
                        // Each unit [begin, end) is added as its own UTF-8 token.
                        NS_ConvertUCS2toUTF8 utfUnit(utext + begin, end - begin);
                        add(utfUnit.get());
                        // advance to end of current unit.
                        pos = end;
                    } else {
                        break;
                    }
                }
            }
        }
    }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void Tokenizer::tokenize ( const char *  str)

Copies the string before tokenizing.

Definition at line 642 of file nsBayesianFilter.cpp.

{
    // Tokenize a private, writable duplicate so the caller's constant string
    // is never handed to the mutable overload. If the duplication fails,
    // nothing is tokenized.
    char* scratch = nsCRT::strdup(str);
    if (!scratch)
        return;

    tokenize(scratch);
    nsCRT::free(scratch);
}

Here is the call graph for this function:

void Tokenizer::tokenize_ascii_word ( char *  word) [private]

Definition at line 413 of file nsBayesianFilter.cpp.

{
  // Lower-cases aWord in place and adds it as a token when its length lies in
  // [kMinLengthForToken, kMaxLengthForToken]. Shorter words are dropped.
  // Longer words become either an "email name:"/"email addr:" pair or a
  // single "skip:" token describing what was skipped.
  toLowerCase(aWord);
  PRInt32 wordLength = strlen(aWord);

  if (wordLength <= kMaxLengthForToken)
  {
    if (wordLength >= kMinLengthForToken)
      add(aWord);
    return;
  }

  // Over-long word: keep it if it looks like an email address, since there is
  // value in adding tokens for addresses.
  nsDependentCString word (aWord, wordLength); // wraps aWord without copying

  // XXX: the 40 byte limit is thought to be a perf guard; longer addresses are not split.
  const PRBool looksLikeEmail =
      wordLength < 40 && strchr(aWord, '.') && word.CountChar('@') == 1;
  if (looksLikeEmail)
  {
    const PRInt32 atPos = word.FindChar('@');
    // An '@' in the last position cannot be followed by a domain, so that is
    // not treated as an address.
    if (atPos < wordLength - 1)
    {
      // Split john@foo.com into "john" and "foo.com" and add them as two tokens.
      const PRInt32 domainStart = atPos + 1;
      add(nsPrintfCString(256, "email name:%s", PromiseFlatCString(Substring(word, 0, atPos)).get()).get());
      add(nsPrintfCString(256, "email addr:%s", PromiseFlatCString(Substring(word, domainStart, wordLength - domainStart)).get()).get());
      return;
    }
  }

  // Record the first character of the skipped word and its length rounded
  // DOWN to a multiple of 10 (integer division).
  add(nsPrintfCString("skip:%c %d", word[0], (wordLength/10) * 10).get());
}

Here is the call graph for this function:

Here is the caller graph for this function:

void Tokenizer::tokenize_japanese_word ( char *  chunk) [private]

Definition at line 520 of file nsBayesianFilter.cpp.

{
  // Splits a UTF-8 chunk into runs of characters that share the same
  // getCharClass() value and adds each run as a "JA:"-prefixed token.
  // Runs that are plain decimal numbers or full-width numerals are skipped.
  //
  // Fix: the previous loop, while(*(++p2)), only emitted a run when the NEXT
  // character changed class. The final run of every chunk was therefore
  // dropped, and a chunk made of a single class produced no token at all.
  // The loop below also flushes the run that ends at the terminator.
  PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("entering tokenize_japanese_word(%s)", chunk));

  nsString srcStr = NS_ConvertUTF8toUCS2(chunk);
  const PRUnichar* p1 = srcStr.get();   // start of the current run
  const PRUnichar* p2 = p1;             // scan position
  if(!*p2) return;

  char_class cc = getCharClass(*p2);
  for (;;)
  {
    ++p2;

    // Still inside the same run: keep scanning.
    if (*p2 && cc == getCharClass(*p2))
      continue;

    // The run [p1, p2) ended, either at a class change or at the end of the string.
    nsCString token = NS_ConvertUCS2toUTF8(p1, p2-p1);
    if( (!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2)))
      add(PromiseFlatCString(NS_LITERAL_CSTRING("JA:") + token).get());

    if (!*p2)
      break;

    // Start the next run at the character that changed class.
    cc = getCharClass(*p2);
    p1 = p2;
  }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void Tokenizer::tokenizeAttachment ( const char *  aContentType,
const char *  aFileName 
)

Definition at line 329 of file nsBayesianFilter.cpp.

{
  // Adds one token for the attachment's file name and one for its content
  // type, both folded to lower case. The file name token is added first.
  nsCAutoString lowerFileName;
  lowerFileName.Assign(aFileName);
  ToLowerCase(lowerFileName);

  nsCAutoString lowerContentType;
  lowerContentType.Assign(aContentType);
  ToLowerCase(lowerContentType);

  addTokenForHeader("attachment/filename", lowerFileName);
  addTokenForHeader("attachment/content-type", lowerContentType);
}

Here is the call graph for this function:

Creates specific tokens based on the mime headers for the message being tokenized.

Definition at line 344 of file nsBayesianFilter.cpp.

{
  // Walks the header name and value enumerators in lock step and adds tokens
  // per header:
  //   content-type -> "charset:<charset>" and "content-type/type:<type>"
  //   received     -> nothing (the forged-header hint is disabled below)
  //   subject      -> one token per word of the subject
  //   everything else handled here -> one untokenized "<name>:<value>" token
  //
  // Fix: the loop used to start with hasMore = PR_TRUE and only asked
  // HasMore() at the end of each pass, so GetNext() was called on an empty
  // enumerator. It also ignored every enumerator result, so a failing
  // HasMore() left hasMore true and the loop never ended. HasMore() is now
  // consulted before each GetNext(), and any enumerator failure stops the walk.
  nsCString headerValue;
  nsCAutoString headerName; // we'll be normalizing all header names to lower case
  PRBool hasMore = PR_FALSE;

  while (NS_SUCCEEDED(aHeaderNames->HasMore(&hasMore)) && hasMore)
  {
    if (NS_FAILED(aHeaderNames->GetNext(headerName)))
      break;
    ToLowerCase(headerName); 
    // Without a matching value we would pair this name with a stale value.
    if (NS_FAILED(aHeaderValues->GetNext(headerValue)))
      break;

    // Dispatch on the first letter so that only plausible candidates pay for
    // the full string comparison.
    switch (headerName.First())
    {
    case 'c':
        if (headerName.Equals("content-type"))
        {
          nsresult rv;
          nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar = do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
          if (NS_FAILED(rv))
            break;

          // extract the charset parameter
          nsXPIDLCString parameterValue;
          mimehdrpar->GetParameterInternal(headerValue.get(), "charset", nsnull, nsnull, getter_Copies(parameterValue));
          addTokenForHeader("charset", parameterValue);

          // create a token containing just the content type 
          mimehdrpar->GetParameterInternal(headerValue.get(), "type", nsnull, nsnull, getter_Copies(parameterValue));
          if (!parameterValue.Length())
            mimehdrpar->GetParameterInternal(headerValue.get(), nsnull /* use first unnamed param */, nsnull, nsnull, getter_Copies(parameterValue));
          addTokenForHeader("content-type/type", parameterValue);

          // XXX: should we add a token for the entire content-type header as well or just these parts we have extracted?
        }
        break;
    case 'r':
      if (headerName.Equals("received"))
      {
        // look for the string "may be forged" in the received headers. sendmail sometimes adds this hint
        // This does not compile on linux yet. Need to figure out why. Commenting out for now
        // if (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
        //   addTokenForHeader(headerName.get(), FORGED_RECEIVED_HEADER_HINT);
      }
      
      // leave out reply-to
      break;
    case 's':
        if (headerName.Equals("subject"))
        { 
          // we want to tokenize the subject
          addTokenForHeader(headerName.get(), headerValue, PR_TRUE);
        }

        // important: leave out sender field. Too strong of an indicator
        break;
    case 'x': // (2) X-Mailer / user-agent works best if it is untokenized, just fold the case and any leading/trailing white space
    case 'u': 
        addTokenForHeader(headerName.get(), headerValue); 
        break;
    default:
        addTokenForHeader(headerName.get(), headerValue); 
        break;
    } // end switch
  }
}

Here is the call graph for this function:

void Tokenizer::visit ( PRBool(*)(Token *, void *)  f,
void *  data 
)

Calls passed-in function for each token in the table.

Definition at line 651 of file nsBayesianFilter.cpp.

{
    // Bundle the callback and its user data so VisitEntry can forward them,
    // then enumerate every entry of the token table.
    VisitClosure forwarded = { f, data };
    PRUint32 visitCount = PL_DHashTableEnumerate(&mTokenTable, VisitEntry, &forwarded);

    // The enumeration is expected to report exactly one visit per stored entry.
    NS_ASSERTION(visitCount == mTokenTable.entryCount, "visitCount != entryCount!");
    if (visitCount == mTokenTable.entryCount)
        return;

    PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("visitCount != entryCount!: %d vs %d", visitCount, mTokenTable.entryCount));
}

Here is the call graph for this function:


Member Data Documentation

Definition at line 136 of file nsBayesianFilter.h.

Definition at line 134 of file nsBayesianFilter.h.

Definition at line 135 of file nsBayesianFilter.h.


The documentation for this class was generated from the following files: