Back to index

moin  1.9.0~rc2
expressions.py
Go to the documentation of this file.
00001 # -*- coding: iso-8859-1 -*-
00002 """
00003     MoinMoin - search query expressions
00004 
00005     @copyright: 2005 MoinMoin:FlorianFesti,
00006                 2005 MoinMoin:NirSoffer,
00007                 2005 MoinMoin:AlexanderSchremmer,
00008                 2006-2008 MoinMoin:ThomasWaldmann,
00009                 2006 MoinMoin:FranzPletz,
00010                 2009 MoinMoin:DmitrijsMilajevs
00011     @license: GNU GPL, see COPYING for details
00012 """
00013 
00014 import re
00015 
00016 from MoinMoin import log
00017 logging = log.getLogger(__name__)
00018 
00019 from MoinMoin import config, wikiutil
00020 from MoinMoin.search.results import Match, TitleMatch, TextMatch
00021 
00022 try:
00023     from MoinMoin.search import Xapian
00024     from MoinMoin.search.Xapian import Query
00025 
00026     OP_AND = Query.OP_AND
00027     OP_OR = Query.OP_OR
00028     OP_AND_NOT = Query.OP_AND_NOT
00029 
00030 except ImportError:
00031     pass
00032 
00033 
class BaseExpression(object):
    """ Base class for all search terms

    A term keeps the raw pattern (_pattern), the pattern that was
    actually compiled (pattern) and the compiled regular expression
    (search_re). Sub classes have to implement _get_matches().
    """

    # costs is estimated time to calculate this term.
    # Number is relative to other terms and has no real unit.
    # It allows to do the fast searches first.
    costs = 0
    # prefix rendered in front of the pattern by __unicode__
    _tag = ""

    def __init__(self, pattern, use_re=False, case=False):
        """ Init a text search

        @param pattern: pattern to search for, ascii string or unicode
        @param use_re: treat pattern as re of plain text, bool
        @param case: do case sensitive search, bool
        """
        self._pattern = unicode(pattern)
        self.negated = 0
        self.use_re = use_re
        self.case = case

        # += rebinds _tag on the instance, the class attribute stays untouched
        if use_re:
            self._tag += 're:'
        if case:
            self._tag += 'case:'

        self.pattern, self.search_re = self._build_re(self._pattern, use_re=use_re, case=case)

    def __str__(self):
        return unicode(self).encode(config.charset, 'replace')

    def negate(self):
        """ Negate the result of this term """
        self.negated = 1

    def pageFilter(self):
        """ Return a page filtering function

        This function is used to filter page list before we search
        it. Return a function that get a page name, and return bool.

        The default expression does not have any filter function and
        return None. Sub class may define custom filter functions.
        """
        return None

    def _get_matches(self, page):
        """ Return the matches of this term in page, sub classes must override

        @param page: the page instance to search
        """
        raise NotImplementedError

    def search(self, page):
        """ Search a page

        Returns the (non-empty) list of matches found by _get_matches()
        or None if the term didn't find anything. If negate() was
        called it is the other way round: None if something was found,
        a list holding one plain Match() otherwise.
        Terms containing other terms must call this method to aggregate
        the results.

        @param page: the page instance to search
        """
        # let the logger do the formatting, so nothing is formatted
        # when debug logging is switched off
        logging.debug("%s searching page %r for (negated = %r) %r",
                      self.__class__, page.page_name, self.negated, self._pattern)

        matches = self._get_matches(page)

        # Decide what to do with the results.
        if self.negated:
            if matches:
                result = None
            else:
                result = [Match()] # represents "matched" (but as it was a negative match, we have nothing to show)
        else: # not negated
            if matches:
                result = matches
            else:
                result = None
        logging.debug("%s returning %r", self.__class__, result)
        return result

    def highlight_re(self):
        """ Return a regular expression of what the term searches for

        Used to display the needle in the page.
        """
        return u''

    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
        """ Make a regular expression out of a text pattern

        The pattern is compiled as it is; only if that fails it is
        escaped and compiled as a literal text.

        @param pattern: the text pattern
        @param use_re: accepted for the callers, not evaluated here
        @param case: if true compile case sensitive, else ignore case
        @param stemmed: accepted for the callers, not evaluated here
        @return: tuple (pattern that was compiled, compiled re object)
        """
        flags = re.U
        if not case:
            flags |= re.I

        try:
            search_re = re.compile(pattern, flags)
        except re.error:
            pattern = re.escape(pattern)
            search_re = re.compile(pattern, flags)

        return pattern, search_re

    def _get_query_for_search_re(self, connection, field_to_check=None):
        """
        Return a query which satisfy self.search_re for field values.
        If field_to_check is given check values only for that field.

        @param connection: index connection, has to offer
                           get_all_documents() and query_field()
        @param field_to_check: name of the only field to look at, or
                               None to look at all fields of a document
        @return: OP_OR query of all field values search_re matches
        """
        queries = []

        for document in connection.get_all_documents():
            data = document.data
            if field_to_check:
                # Check only field with given name
                if field_to_check in data:
                    candidates = [(field_to_check, data[field_to_check])]
                else:
                    candidates = []
            else:
                # Check all fields
                candidates = data.iteritems()

            for field, terms in candidates:
                for term in terms:
                    if self.search_re.match(term):
                        # query the field the term was found in
                        # (field_to_check is None when all fields are checked)
                        queries.append(connection.query_field(field, term))

        return Query(OP_OR, queries)

    def xapian_need_postproc(self):
        """ Return the case flag of this term """
        return self.case

    def __unicode__(self):
        if self.negated:
            neg = '-'
        else:
            neg = ''
        return u'%s%s"%s"' % (neg, self._tag, unicode(self._pattern))
00159 
00160 
class AndExpression(BaseExpression):
    """ A term that matches only if all of its sub terms match """

    operator = ' '

    def __init__(self, *terms):
        self.negated = 0
        self._subterms = list(terms)

    def append(self, expression):
        """ Add one more sub term at the end """
        self._subterms.append(expression)

    def subterms(self):
        """ Return the list of sub terms (the list itself, not a copy) """
        return self._subterms

    @property
    def costs(self):
        """ Costs of this term: the costs of all sub terms added up """
        total = 0
        for term in self._subterms:
            total += term.costs
        return total

    def __unicode__(self):
        rendered = [unicode(term) for term in self._subterms]
        return u'[' + self.operator.join(rendered) + u']'

    def _filter(self, terms, name):
        """ Ask the page filter of every term, stop at the first False

        @return: False as soon as one filter says False, True if at
                 least one said True (and none False), else None
        """
        verdict = None
        for term in terms:
            answer = term.pageFilter()(name)
            if answer is False:
                verdict = False
                break
            if answer is True:
                verdict = True
        logging.debug("pageFilter AND returns %r" % verdict)
        return verdict

    def pageFilter(self):
        """ Return a page filtering function

        The filter is built from the TitleSearch sub terms only.

        @return: function taking a page name, or None if there is no
                 TitleSearch among the sub terms
        """
        # cheap terms first, then pick the title searches
        self.sortByCost()
        title_terms = [term for term in self._subterms if isinstance(term, TitleSearch)]
        if not title_terms:
            return None
        return lambda name: self._filter(title_terms, name)

    def sortByCost(self):
        """ Sort the sub terms in place, cheapest first """
        def cost_of(term):
            return term.costs

        self._subterms.sort(key=cost_of)

    def search(self, page):
        """ Search page with every sub term, cheapest first

        @return: matches of all sub terms, or None as soon as one sub
                 term does not find anything
        """
        self.sortByCost()
        collected = []
        for term in self._subterms:
            found = term.search(page)
            if found:
                collected.extend(found)
            else:
                return None
        return collected

    def highlight_re(self):
        """ Join the non-empty highlight patterns of the sub terms with | """
        patterns = [term.highlight_re() for term in self._subterms]
        return u'|'.join([pattern for pattern in patterns if pattern])

    def xapian_need_postproc(self):
        """ True if at least one sub term needs post processing """
        for term in self._subterms:
            if term.xapian_need_postproc():
                return True
        return False

    def xapian_term(self, request, connection):
        """ Build the query: (AND of plain terms) AND_NOT (OR of negated terms) """
        positive = []
        negative = []
        for term in self._subterms:
            bucket = negative
            if not term.negated:
                bucket = positive
            bucket.append(term.xapian_term(request, connection))

        # query for the terms that are not negated
        if positive:
            wanted = Query(OP_AND, positive)
        else:
            wanted = Query('') # MatchAll

        # query for the negated terms
        if negative:
            unwanted = Query(OP_OR, negative)
        else:
            unwanted = Query()

        return Query(OP_AND_NOT, wanted, unwanted)
00267 
00268 
class OrExpression(AndExpression):
    """ A term that matches if at least one of its sub terms matches """

    operator = ' or '

    def _filter(self, terms, name):
        """ Ask the page filter of every term, stop at the first True

        @return: True as soon as one filter says True, False if at
                 least one said False (and none True), else None
        """
        verdict = None
        for term in terms:
            answer = term.pageFilter()(name)
            if answer is True:
                verdict = True
                break
            if answer is False:
                verdict = False
        logging.debug("pageFilter OR returns %r" % verdict)
        return verdict

    def search(self, page):
        """ Search page with all sub terms

        @param page: the page instance
        @return: matches of all sub terms that found something (the
                 list is empty if none did)
        """
        # No sorting by cost here: every sub term is searched anyway.
        collected = []
        for term in self._subterms:
            collected.extend(term.search(page) or [])
        return collected

    def xapian_term(self, request, connection):
        """ Build an OP_OR query of the sub term queries """
        # XXX: negated terms managed by _moinSearch?
        sub_queries = []
        for term in self._subterms:
            sub_queries.append(term.xapian_term(request, connection))
        return Query(OP_OR, sub_queries)
00307 
00308 
class BaseTextFieldSearch(BaseExpression):
    """ Base class for terms searching a tokenized text field of the index """

    # name of the index field, set by the sub classes
    _field_to_search = None

    def xapian_term(self, request, connection):
        """ Build the index query for this term

        A regex term is resolved by _get_query_for_search_re(). For a
        plain term every whitespace separated word of the pattern gives
        (word OR (AND of its tokens)); all those are combined by AND.
        """
        field = self._field_to_search

        if self.use_re:
            return Query(OP_AND, [self._get_query_for_search_re(connection, field)])

        queries = []
        stemmed = [] # nothing is ever added to this list here
        analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default)

        for word in self._pattern.split():
            word_query = connection.query_field(field, word)
            tokens = analyzer.tokenize(word)
            use_stemming = request.cfg.xapian_stemming
            lowered = word.lower()

            token_queries = []
            for token, stem in tokens:
                if token == lowered:
                    # the word itself is already covered by word_query
                    continue
                token_query = connection.query_field(field, token)
                if use_stemming and stem:
                    # accept the token or its stemmed form
                    token_query = Query(OP_OR, [token_query, connection.query_field(field, stem)])
                token_queries.append(token_query)

            queries.append(Query(OP_OR, [word_query, Query(OP_AND, token_queries)]))

        # XXX broken wrong regexp is built!
        # (stemmed is always empty, so this is not executed)
        if not self.case and stemmed:
            new_pat = ' '.join(stemmed)
            self._pattern = new_pat
            self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True)

        return Query(OP_AND, queries)
00350 
00351 
class TextSearch(BaseTextFieldSearch):
    """ A term that does a normal text search

    The page title is searched as well as the page content; for the
    title a TitleSearch term with the same settings is used.
    """

    costs = 10000
    _field_to_search = 'content'

    def highlight_re(self):
        """ Return the search pattern as a regex group """
        return u"(%s)" % self.pattern

    def _get_matches(self, page):
        """ Get matches in page name (first) and in page body """
        title_term = TitleSearch(self._pattern, use_re=self.use_re, case=self.case)
        matches = list(title_term._get_matches(page) or [])

        raw_body = page.get_raw_body()
        matches.extend([TextMatch(re_match=hit) for hit in self.search_re.finditer(raw_body)])
        return matches

    def xapian_term(self, request, connection):
        """ Build the query: title query OR content query """
        content_query = super(TextSearch, self).xapian_term(request, connection)
        title_term = TitleSearch(self._pattern, use_re=self.use_re, case=self.case)
        title_query = title_term.xapian_term(request, connection)

        return Query(OP_OR, [title_query, content_query])
00386 
00387 
class TitleSearch(BaseTextFieldSearch):
    """ A term that searches the pattern in the page title only """

    _tag = 'title:'
    costs = 100
    _field_to_search = 'title'

    def pageFilter(self):
        """ Return the page filter function of this title search

        The function returned takes a page name and tells whether the
        name is accepted: it has to contain a match of the pattern, or
        must not contain one if the term is negated.
        """
        def title_filter(name):
            found = self.search_re.search(name)
            accepted = bool(self.negated) != bool(found)
            logging.debug("pageFilter title returns %r (%r)" % (accepted, self.pattern))
            return accepted

        return title_filter

    def _get_matches(self, page):
        """ Return a TitleMatch for every match in the page name """
        return [TitleMatch(re_match=hit) for hit in self.search_re.finditer(page.page_name)]
00413 
00414 
class BaseFieldSearch(BaseExpression):
    """ Base class for terms querying one index field with the whole pattern """

    # name of the index field, set by the sub classes
    _field_to_search = None

    def xapian_term(self, request, connection):
        """ Build the index query for this term

        A plain pattern is queried as it is, a regex pattern is
        resolved by _get_query_for_search_re().
        """
        field = self._field_to_search
        if not self.use_re:
            return connection.query_field(field, self._pattern)
        return self._get_query_for_search_re(connection, field)
00424 
00425 
class LinkSearch(BaseFieldSearch):
    """ A term that searches the pattern in the links of a page """

    _tag = 'linkto:'
    _field_to_search = 'linkto'
    costs = 5000 # cheaper than a TextSearch

    def __init__(self, pattern, use_re=False, case=True):
        """ Init a link search

        @param pattern: pattern to search for, ascii string or unicode
        @param use_re: treat pattern as re of plain text, bool
        @param case: do case sensitive search, bool
        """
        super(LinkSearch, self).__init__(pattern, use_re, case)

        # regex for the search in the page text: the parts of the
        # pattern separated by / become alternatives
        alternatives = pattern.replace('/', '|')
        self._textpattern = '(' + alternatives + ')'
        self.textsearch = TextSearch(self._textpattern, use_re=True, case=case)

    def highlight_re(self):
        """ Return the text pattern as a regex group """
        return u"(%s)" % self._textpattern

    def _get_matches(self, page):
        """ Get matches for a page that has a link matching the pattern

        @return: empty list if no link of the page matches; else the
                 matches of the text search, or a single TextMatch(0, 0)
                 if the text search does not find anything
        """
        for link in page.getPageLinks(page.request):
            if self.search_re.match(link):
                break
        else:
            # went through all links without a match
            return []

        # Search in page text
        results = self.textsearch.search(page)
        if results:
            return list(results)
        # This happens e.g. for pages that use navigation macros
        return [TextMatch(0, 0)]
00469 
00470 
class LanguageSearch(BaseFieldSearch):
    """ A term that searches for pages written in a language """

    _tag = 'language:'
    _field_to_search = 'lang'
    costs = 5000 # cheaper than a TextSearch

    def __init__(self, pattern, use_re=False, case=False):
        """ Init a language search

        @param pattern: pattern to search for, ascii string or unicode
        @param use_re: treat pattern as re of plain text, bool
        @param case: ignored, the search is never case sensitive
        """
        # iso language code, always lowercase and not case-sensitive
        language = pattern.lower()
        super(LanguageSearch, self).__init__(language, use_re, case=False)

    def _get_matches(self, page):
        """ Return [Match()] if the page's language pi equals the pattern """
        same_language = self.pattern == page.pi['language']
        if not same_language:
            return []
        return [Match()]
00494 
00495 
class CategorySearch(BaseFieldSearch):
    """ Search the pages belonging to a category

    The index query is built by the inherited
    BaseFieldSearch.xapian_term() using the 'category' field.
    """

    _tag = 'category:'
    _field_to_search = 'category'
    costs = 5000 # cheaper than a TextSearch

    def _get_matches(self, page):
        """ match categories like this:
            ... some page text ...
            ----
            ## optionally some comments, e.g. about possible categories:
            ## CategoryFoo
            CategoryTheRealAndOnly

            Note: there might be multiple comment lines, but all real categories
                  must be on a single line either directly below the ---- or
                  directly below some comment lines.

        @param page: the page instance to search
        @return: list of TextMatch objects, one per match in the raw body
        """
        pattern = r'(?m)(^-----*\s*\r?\n)(^##.*\r?\n)*^(?!##)(.*)\b%s\b' % self.pattern
        search_re = self._build_re(pattern, use_re=self.use_re, case=self.case)[1] # we need only a regexp, but not a pattern

        body = page.get_raw_body()
        return [TextMatch(re_match=match) for match in search_re.finditer(body)]

    def highlight_re(self):
        """ Return a regex matching the category as a whole word

        self.pattern is used (like in _get_matches), as it is the
        pattern that was compiled by _build_re(); the raw
        self._pattern might not be a valid regular expression.
        """
        return u'(\\b%s\\b)' % self.pattern
00538 
00539 
class MimetypeSearch(BaseFieldSearch):
    """ A term that searches for items of a specific mimetype """

    _tag = 'mimetype:'
    _field_to_search = 'mimetype'
    costs = 5000 # cheaper than a TextSearch

    def __init__(self, pattern, use_re=False, case=False):
        """ Init a mimetype search

        @param pattern: pattern to search for, ascii string or unicode
        @param use_re: treat pattern as re of plain text, bool
        @param case: ignored, the search is never case sensitive
        """
        # always lowercase and not case-sensitive
        mimetype = pattern.lower()
        super(MimetypeSearch, self).__init__(mimetype, use_re, case=False)

    def _get_matches(self, page):
        """ Return [Match()] if the pattern is found in text/<format pi of the page> """
        page_mimetype = u'text/%s' % page.pi['format']
        if not self.search_re.search(page_mimetype):
            return []
        return [Match()]
00565 
00566 
class DomainSearch(BaseFieldSearch):
    """ A term that searches for pages of a specific domain """

    _tag = 'domain:'
    _field_to_search = 'domain'
    costs = 5000 # cheaper than a TextSearch

    def __init__(self, pattern, use_re=False, case=False):
        """ Init a domain search

        @param pattern: pattern to search for, ascii string or unicode
        @param use_re: treat pattern as re of plain text, bool
        @param case: ignored, the search is never case sensitive
        """
        # always lowercase and not case-sensitive
        domain = pattern.lower()
        super(DomainSearch, self).__init__(domain, use_re, case=False)

    def _get_matches(self, page):
        """ Return [Match()] if the check for the domain succeeds

        Known domains are 'underlay', 'standard' and 'system'; any
        other pattern gives no match.
        """
        def is_system_page():
            return wikiutil.isSystemPage(page.request, page.page_name)

        checks = {
            'underlay': page.isUnderlayPage,
            'standard': page.isStandardPage,
            'system': is_system_page,
        }

        try:
            in_domain = checks[self.pattern]()
        except KeyError:
            in_domain = False

        if not in_domain:
            return []
        return [Match()]
00599