moin 1.9.0~rc2
tokenizer.py
# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - A text analyzer for wiki syntax

    @copyright: 2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details.
"""

import re
import xapian

from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
from MoinMoin import config


class WikiAnalyzer(object):
    """ A text analyzer for wiki syntax

    The purpose of this class is to analyze texts/pages in wiki syntax
    and yield single terms to feed into the xapian database.
    """

    singleword = r"[%(u)s][%(l)s]+" % {
                     'u': config.chars_upper,
                     'l': config.chars_lower,
                 }

    singleword_re = re.compile(singleword, re.U)
    wikiword_re = re.compile(WikiParser.word_rule, re.UNICODE|re.VERBOSE)

    token_re = re.compile(
        r"(?P<company>\w+[&@]\w+)|" +       # company names like AT&T and Excite@Home
        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +  # email addresses
        r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
        r"(?P<word>\w+)",                   # words (including WikiWords)
        re.U)
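
    # For illustration (made-up sample inputs): the named groups above are
    # tried left to right, so e.g.:
    #   u"AT&T"                  matches as <company>
    #   u"bob.smith@example.com" matches as <email>
    #   u"U.S.A."                matches as <acronym>
    #   u"WikiName"              matches as <word>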

    dot_re = re.compile(r"[-_/,.]")
    mail_re = re.compile(r"[-_/,.]|(@)")
    alpha_num_re = re.compile(r"\d+|\D+")
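
    # For illustration (made-up sample values): alpha_num_re splits a mixed
    # word into digit and non-digit runs, e.g.
    #     re.findall(alpha_num_re, u"Foo42") == [u'Foo', u'42']
    # mail_re splits an address at separators; because only "@" is inside a
    # capturing group, re.split() keeps it (and inserts None for the other
    # separators):
    #     mail_re.split(u"bob.smith@example.com")
    #     == [u'bob', None, u'smith', u'@', u'example', None, u'com']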

    def __init__(self, request=None, language=None):
        """
        @param request: current request
        @param language: if given, the language in which to stem words
        """
        self.stemmer = None
        if request and request.cfg.xapian_stemming and language:
            try:
                stemmer = xapian.Stem(language)
                # we need this wrapper because the stemmer returns a utf-8
                # encoded string even when it is fed unicode objects:
                self.stemmer = lambda word: stemmer(word).decode('utf-8')
            except xapian.InvalidArgumentError:
                # lang is not stemmable or not available
                pass
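
    # For illustration (assuming a xapian build that ships the English
    # Snowball stemmer; sample word made up): xapian.Stem('english') maps
    # 'stemming' to the UTF-8 encoded string 'stem', which the wrapper
    # above decodes back to u'stem'.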

    def raw_tokenize_word(self, word, pos):
        """ try to further tokenize some word starting at pos """
        yield (word, pos)
        if self.wikiword_re.match(word):
            # if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word
            for m in re.finditer(self.singleword_re, word):
                mw, mp = m.group(), pos + m.start()
                for w, p in self.raw_tokenize_word(mw, mp):
                    yield (w, p)
        else:
            # if we have Foo42, yield Foo and 42
            for m in re.finditer(self.alpha_num_re, word):
                mw, mp = m.group(), pos + m.start()
                if mw != word:
                    for w, p in self.raw_tokenize_word(mw, mp):
                        yield (w, p)
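
    # For illustration (made-up inputs), raw_tokenize_word yields the word
    # itself first, then its parts:
    #   raw_tokenize_word(u"WikiName", 0) -> (u'WikiName', 0), (u'Wiki', 0), (u'Name', 4)
    #   raw_tokenize_word(u"Foo42", 0)    -> (u'Foo42', 0), (u'Foo', 0), (u'42', 3)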

    def raw_tokenize(self, value):
        """ Yield a stream of (word, position) tuples from a string.

        @param value: string to split, must be a unicode object or a list of
                      unicode objects
        """
        if isinstance(value, list): # used for page links
            for v in value:
                yield (v, 0)
        else:
            tokenstream = re.finditer(self.token_re, value)
            for m in tokenstream:
                if m.group("acronym"):
                    yield (m.group("acronym").replace('.', ''), m.start())
                elif m.group("company"):
                    yield (m.group("company"), m.start())
                elif m.group("email"):
                    displ = 0
                    for word in self.mail_re.split(m.group("email")):
                        if word:
                            yield (word, m.start() + displ)
                            displ += len(word) + 1
                elif m.group("word"):
                    for word, pos in self.raw_tokenize_word(m.group("word"), m.start()):
                        yield word, pos
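
    # For illustration (made-up input): raw_tokenize(u"See U.S.A. and AT&T")
    # yields (u'See', 0), (u'USA', 4), (u'and', 11), (u'AT&T', 15); the
    # acronym loses its dots, the company name stays a single token.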

    def tokenize(self, value):
        """
        Yield a stream of (lowercased word, stemmed word) pairs from a
        string; the stemmed entry is '' if stemming is disabled or did
        not change the word.

        @param value: string to split, must be a unicode object or a list of
                      unicode objects
        """
        if self.stemmer:

            def stemmer(value):
                stemmed = self.stemmer(value)
                if stemmed != value:
                    return stemmed
                else:
                    return ''
        else:
            stemmer = lambda v: ''

        for word, pos in self.raw_tokenize(value):
            # Xapian stemmer expects lowercase input
            word = word.lower()
            yield word, stemmer(word)
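
# For illustration (hypothetical session; with no request object there is no
# stemming, so the second element of each pair is the empty string):
#   >>> list(WikiAnalyzer().tokenize(u'CamelCase'))
#   [(u'camelcase', ''), (u'camel', ''), (u'case', '')]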