Back to index

citadel  8.12
ft_wordbreaker.c
Go to the documentation of this file.
00001 /*
00002  * Default wordbreaker module for full text indexing.
00003  *
00004  * Copyright (c) 2005-2012 by the citadel.org team
00005  *
00006  *  This program is open source software; you can redistribute it and/or modify
00007  *  it under the terms of the GNU General Public License version 3.
00008  *  
00009  *  
00010  *
00011  *  This program is distributed in the hope that it will be useful,
00012  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *  GNU General Public License for more details.
00015  *
00016  *  
00017  *  
00018  *  
00019  */
00020 
00021 
00022 #include "sysdep.h"
00023 #include <stdlib.h>
00024 #include <unistd.h>
00025 #include <stdio.h>
00026 #include <fcntl.h>
00027 #include <signal.h>
00028 #include <pwd.h>
00029 #include <errno.h>
00030 #include <sys/types.h>
00031 
00032 #if TIME_WITH_SYS_TIME
00033 # include <sys/time.h>
00034 # include <time.h>
00035 #else
00036 # if HAVE_SYS_TIME_H
00037 #  include <sys/time.h>
00038 # else
00039 #  include <time.h>
00040 # endif
00041 #endif
00042 
00043 #include <sys/wait.h>
00044 #include <ctype.h>
00045 #include <string.h>
00046 #include <limits.h>
00047 #include <libcitadel.h>
00048 #include "citadel.h"
00049 #include "server.h"
00050 #include "sysdep_decls.h"
00051 #include "citserver.h"
00052 #include "support.h"
00053 #include "config.h"
00054 #include "database.h"
00055 #include "msgbase.h"
00056 #include "control.h"
00057 #include "ft_wordbreaker.h"
00058 #include "crc16.h"
00059 #include "ctdl_module.h"
00060 
00061 /*
00062  * Noise words are not included in search indices.
00063  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
00064  * must also be changed, so that the index is rebuilt.
00065  */
00066 
/*
 * Lookup table of noise words: one singly linked list per bucket, where a
 * word's bucket is its first character minus 'a'.  Filled in by
 * initialize_noise_words() and released by noise_word_cleanup().
 */
noise_word *noise_words[26];
00068 
/*
 * Master list of noise words, used only to seed the noise_words[] lookup
 * table.  Every entry is lower case and begins with a letter in 'a'..'z',
 * because the first character selects the bucket in noise_words[].
 * The table and the literals it points to are read-only.
 */
static const char *const noise_words_init[] = {
	"about",
	"after",
	"also",
	"another",
	"because",
	"been",
	"before",
	"being",
	"between",
	"both",
	"came",
	"come",
	"could",
	"each",
	"from",
	"have",
	"here",
	"himself",
	"into",
	"like",
	"make",
	"many",
	"might",
	"more",
	"most",
	"much",
	"must",
	"never",
	"only",
	"other",
	"over",
	"said",
	"same",
	"should",
	"since",
	"some",
	"still",
	"such",
	"take",
	"than",
	"that",
	"their",
	"them",
	"then",
	"there",
	"these",
	"they",
	"this",
	"those",
	"through",
	"under",
	"very",
	"well",
	"were",
	"what",
	"where",
	"which",
	"while",
	"with",
	"would",
	"your"
};
00132 
00133 
00134 void initialize_noise_words(void)
00135 {
00136        int i;
00137        int len;
00138        int ch;
00139        noise_word *next;
00140        
00141        memset (noise_words, 0, sizeof(noise_words));
00142        
00143        for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
00144        {
00145               ch = noise_words_init[i][0] - 'a';
00146               len = strlen(noise_words_init[i]);
00147               
00148               next = malloc(sizeof(noise_word));
00149               next->len = len;
00150               next->word = strdup(noise_words_init[i]);
00151               next->next = noise_words[ch];
00152               noise_words[ch] = next;
00153        }
00154 }
00155 
00156 
00157 void noise_word_cleanup(void)
00158 {
00159        int i;
00160        noise_word *cur, *next;
00161        
00162        syslog(LOG_INFO, "Cleaning up fulltext noise words.\n");
00163        
00164        for (i = 0 ; i < 26 ; i++)
00165        {
00166               cur = noise_words[i];
00167               while (cur)
00168               {
00169                      next = cur->next;
00170                      free(cur->word);
00171                      free(cur);
00172                      cur = next;
00173               }
00174        }
00175 }
00176 
/*
 * qsort()-style comparison callback for two int values.
 *
 * rec1, rec2	pointers to the ints being compared
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller,
 * and 0 if both are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *)rec1;
	const int rhs = *(const int *)rec2;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
00190 
00191 
00192 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
00193 
00194        int wb_num_tokens = 0;
00195        int wb_num_alloc = 0;
00196        int *wb_tokens = NULL;
00197 
00198        const char *ptr;
00199        const char *word_start;
00200        const char *word_end;
00201        char ch;
00202        int word_len;
00203        char word[256];
00204        int i;
00205        int word_crc;
00206        noise_word *noise;
00207        
00208        
00209        if (text == NULL) {         /* no NULL text please */
00210               *num_tokens = 0;
00211               *tokens = NULL;
00212               return;
00213        }
00214 
00215        if (text[0] == 0) {         /* no empty text either */
00216               *num_tokens = 0;
00217               *tokens = NULL;
00218               return;
00219        }
00220 
00221        ptr = text;
00222        word_start = NULL;
00223        while (*ptr) {
00224               ch = *ptr;
00225               if (isalnum(ch)) {
00226                      if (!word_start) {
00227                             word_start = ptr;
00228                      }
00229               }
00230               ++ptr;
00231               ch = *ptr;
00232               if ( (!isalnum(ch)) && (word_start) ) {
00233                      word_end = ptr;
00234 //                   --word_end;
00235 
00236                      /* extract the word */
00237                      word_len = word_end - word_start;
00238                      if (word_len >= sizeof word) {
00239                             syslog(LOG_DEBUG, "Invalid word length: %d\n", word_len);
00240                             safestrncpy(word, word_start, sizeof word);
00241                             word[(sizeof word) - 1] = 0;
00242                      }
00243                      else {
00244                             safestrncpy(word, word_start, word_len+1);
00245                             word[word_len] = 0;
00246                      }
00247                      word_start = NULL;
00248 
00249                      /* are we ok with the length? */
00250                      if ( (word_len >= WB_MIN)
00251                         && (word_len <= WB_MAX) ) {
00252                             for (i=0; i<word_len; ++i) {
00253                                    word[i] = tolower(word[i]);
00254                             }
00255                             /* disqualify noise words */
00256                             noise = noise_words[(int) (word[0]-'a')];
00257                             while (noise)
00258                             {
00259                                    if (noise->len == word_len)
00260                                    {
00261                                           if (!strcmp(word, noise->word)) 
00262                                           {
00263                                                  word_len = 0;
00264                                                  break;
00265                                           }
00266                                    }
00267                                    noise = noise->next;
00268                             }
00269                             if (word_len == 0)
00270                                    continue;
00271 
00272                             word_crc = (int)
00273                                    CalcCRC16Bytes(word_len, word);
00274 
00275                             ++wb_num_tokens;
00276                             if (wb_num_tokens > wb_num_alloc) {
00277                                    wb_num_alloc += 512;
00278                                    wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
00279                             }
00280                             wb_tokens[wb_num_tokens - 1] = word_crc;
00281                      }
00282               }
00283        }
00284 
00285        /* sort and purge dups */
00286        if (wb_num_tokens > 1) {
00287               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
00288               for (i=0; i<(wb_num_tokens-1); ++i) {
00289                      if (wb_tokens[i] == wb_tokens[i+1]) {
00290                             memmove(&wb_tokens[i], &wb_tokens[i+1],
00291                                    ((wb_num_tokens - i - 1)*sizeof(int)));
00292                             --wb_num_tokens;
00293                             --i;
00294                      }
00295               }
00296        }
00297 
00298        *num_tokens = wb_num_tokens;
00299        *tokens = wb_tokens;
00300 }
00301