Back to index

citadel  8.12
Classes | Defines | Typedefs | Functions
ft_wordbreaker.h File Reference
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  noise_word

Defines

#define FT_WORDBREAKER_ID   0x0021
#define WB_MIN   4
#define WB_MAX   40

Typedefs

typedef struct noise_word noise_word

Functions

void wordbreaker (const char *text, int *num_tokens, int **tokens)
void initialize_noise_words (void)
void noise_word_cleanup (void)

Class Documentation

struct noise_word

Definition at line 41 of file ft_wordbreaker.h.

Collaboration diagram for noise_word:
Class Members
unsigned int len
noise_word * next
char * word

Define Documentation

#define FT_WORDBREAKER_ID   0x0021

Definition at line 25 of file ft_wordbreaker.h.

#define WB_MAX   40

Definition at line 31 of file ft_wordbreaker.h.

#define WB_MIN   4

Definition at line 30 of file ft_wordbreaker.h.


Typedef Documentation

typedef struct noise_word noise_word

Definition at line 39 of file ft_wordbreaker.h.


Function Documentation

void initialize_noise_words ( void  )

Definition at line 134 of file ft_wordbreaker.c.

{
       /*
        * Build the noise-word lookup table.
        *
        * noise_words[] is cleared, then every string in noise_words_init[]
        * is duplicated onto the heap and pushed onto the front of the
        * linked list selected by its first letter ('a' -> bucket 0, ...).
        * Entries that cannot be stored (bad first character, or allocation
        * failure) are skipped instead of corrupting memory.
        */
       size_t i;
       int ch;
       noise_word *entry;

       memset(noise_words, 0, sizeof(noise_words));

       for (i = 0; i < (sizeof(noise_words_init) / sizeof(noise_words_init[0])); ++i)
       {
              /*
               * noise_word_cleanup() walks the table as 26 buckets, so an
               * index outside 0..25 would never be freed and may lie outside
               * the table altogether.
               */
              ch = noise_words_init[i][0] - 'a';
              if ((ch < 0) || (ch >= 26)) {
                     continue;
              }

              entry = malloc(sizeof *entry);
              if (entry == NULL) {
                     syslog(LOG_ERR, "Out of memory loading fulltext noise words.\n");
                     continue;
              }

              entry->word = strdup(noise_words_init[i]);
              if (entry->word == NULL) {
                     syslog(LOG_ERR, "Out of memory loading fulltext noise words.\n");
                     free(entry);
                     continue;
              }

              /* Cache the length so lookups can reject mismatches cheaply. */
              entry->len = (unsigned int) strlen(entry->word);

              /* Push onto the head of this letter's list. */
              entry->next = noise_words[ch];
              noise_words[ch] = entry;
       }
}

Here is the caller graph for this function:

void noise_word_cleanup ( void  )

Definition at line 157 of file ft_wordbreaker.c.

{
       /*
        * Release every node of the noise-word table.
        *
        * Each of the 26 per-letter lists is walked; the duplicated word
        * string and the node itself are freed.  The bucket head is then
        * reset to NULL so the table never holds dangling pointers: a later
        * lookup sees an empty list and a second cleanup call is harmless.
        */
       int i;
       noise_word *cur, *next;

       syslog(LOG_INFO, "Cleaning up fulltext noise words.\n");

       for (i = 0 ; i < 26 ; i++)
       {
              cur = noise_words[i];
              while (cur)
              {
                     /* Grab the successor before the node is released. */
                     next = cur->next;
                     free(cur->word);
                     free(cur);
                     cur = next;
              }
              noise_words[i] = NULL;
       }
}

Here is the caller graph for this function:

void wordbreaker ( const char *  text,
int *  num_tokens,
int **  tokens 
)

Definition at line 192 of file ft_wordbreaker.c.

                                                                  {
       /*
        * Split text into words and hand back one integer token per
        * distinct word.
        *
        * A word is a maximal run of isalnum() characters.  Words shorter
        * than WB_MIN or longer than WB_MAX are ignored, the rest are
        * lower-cased, dropped if they appear in the noise_words table, and
        * otherwise reduced to a token with CalcCRC16Bytes().  The tokens are
        * sorted with intcmp and duplicates are removed.
        *
        * Outputs:
        *   *num_tokens  number of entries stored in *tokens (0 if none)
        *   *tokens      array obtained from realloc(), or NULL when there
        *                are no tokens; ownership passes to the caller
        *                (presumably to be released with free()).
        *
        * If memory runs out while collecting tokens, scanning stops and the
        * tokens gathered so far are returned.
        */
       int wb_num_tokens = 0;
       int wb_num_alloc = 0;
       int *wb_tokens = NULL;
       int *grown;

       const char *ptr;
       const char *word_start;
       const char *word_end;
       const char *word_src;
       int word_len;
       char word[256];
       int i;
       int unique;
       int bucket;
       int is_noise;
       int word_crc;
       noise_word *noise;

       /* NULL or empty text yields no tokens. */
       if ((text == NULL) || (text[0] == 0)) {
              *num_tokens = 0;
              *tokens = NULL;
              return;
       }

       ptr = text;
       word_start = NULL;
       while (*ptr) {
              /* <ctype.h> functions need an unsigned char value; a negative
               * plain char would be undefined behaviour. */
              if (isalnum((unsigned char) *ptr) && (!word_start)) {
                     word_start = ptr;
              }
              ++ptr;

              /* Keep scanning until we stand just past the end of a word
               * (the terminating NUL also ends a word). */
              if ((!word_start) || isalnum((unsigned char) *ptr)) {
                     continue;
              }

              word_end = ptr;
              word_src = word_start;
              word_len = word_end - word_start;
              word_start = NULL;

              /* Too long for the local buffer: log it and move on. */
              if (word_len >= (int) sizeof word) {
                     syslog(LOG_DEBUG, "Invalid word length: %d\n", word_len);
                     continue;
              }

              /* are we ok with the length? */
              if ((word_len < WB_MIN) || (word_len > WB_MAX)) {
                     continue;
              }

              /* extract the word, lower-cased */
              safestrncpy(word, word_src, word_len + 1);
              word[word_len] = 0;
              for (i = 0; i < word_len; ++i) {
                     word[i] = (char) tolower((unsigned char) word[i]);
              }

              /*
               * disqualify noise words
               *
               * The table has one list per letter 'a'..'z'.  A word may
               * begin with a digit (isalnum accepts them), which would give
               * a negative index, so only consult the table when the first
               * character maps to a real bucket.
               */
              is_noise = 0;
              bucket = word[0] - 'a';
              if ((bucket >= 0) && (bucket < 26)) {
                     for (noise = noise_words[bucket]; noise != NULL; noise = noise->next) {
                            if ((noise->len == (unsigned int) word_len)
                               && (!strcmp(word, noise->word))) {
                                   is_noise = 1;
                                   break;
                            }
                     }
              }
              if (is_noise) {
                     continue;
              }

              word_crc = (int) CalcCRC16Bytes(word_len, word);

              /* Grow the token array in chunks of 512 entries.  The old
               * pointer is kept until realloc() is known to have succeeded
               * so nothing leaks on failure. */
              if (wb_num_tokens >= wb_num_alloc) {
                     grown = realloc(wb_tokens, sizeof(int) * (wb_num_alloc + 512));
                     if (grown == NULL) {
                            syslog(LOG_ERR, "wordbreaker: out of memory, token list truncated\n");
                            break;
                     }
                     wb_tokens = grown;
                     wb_num_alloc += 512;
              }
              wb_tokens[wb_num_tokens++] = word_crc;
       }

       /* sort and purge dups: after sorting, equal tokens are adjacent, so
        * one pass that keeps each value the first time it is seen suffices */
       if (wb_num_tokens > 1) {
              qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
              unique = 1;
              for (i = 1; i < wb_num_tokens; ++i) {
                     if (wb_tokens[i] != wb_tokens[unique - 1]) {
                            wb_tokens[unique++] = wb_tokens[i];
                     }
              }
              wb_num_tokens = unique;
       }

       *num_tokens = wb_num_tokens;
       *tokens = wb_tokens;
}

Here is the call graph for this function:

Here is the caller graph for this function: