moin 1.9.0~rc2
builtin.py
# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - search engine internals

    @copyright: 2005 MoinMoin:FlorianFesti,
                2005 MoinMoin:NirSoffer,
                2005 MoinMoin:AlexanderSchremmer,
                2006-2009 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details
"""

import sys, os, time, errno, codecs

from MoinMoin import log
logging = log.getLogger(__name__)

from MoinMoin import wikiutil, config, caching
from MoinMoin.Page import Page
from MoinMoin.search.results import getSearchResults, Match, TextMatch, TitleMatch

##############################################################################
# Search Engine Abstraction
##############################################################################


class IndexerQueue(object):
    """
    Represents a locked on-disk queue with jobs for the xapian indexer

    Each job is a tuple like: (PAGENAME, ATTACHMENTNAME, REVNO)
    PAGENAME: page name (unicode)
    ATTACHMENTNAME: attachment name (unicode) or None (for pages)
    REVNO: revision number (int) - meaning "look at that revision",
           or None - meaning "look at all revisions"
    """

    def __init__(self, request, xapian_dir, queuename, timeout=10.0):
        """
        @param request: request object
        @param xapian_dir: the xapian main directory
        @param queuename: name of the queue (used for caching key)
        @param timeout: lock acquire timeout
        """
        self.request = request
        self.xapian_dir = xapian_dir
        self.queuename = queuename
        self.timeout = timeout

    def get_cache(self, locking):
        return caching.CacheEntry(self.request, self.xapian_dir, self.queuename,
                                  scope='dir', use_pickle=True, do_locking=locking)

    def _queue(self, cache):
        try:
            queue = cache.content()
        except caching.CacheError:
            # likely nothing there yet
            queue = []
        return queue

    def put(self, pagename, attachmentname=None, revno=None):
        """ Put an entry into the queue (append at end)

        @param pagename: page name [unicode]
        @param attachmentname: attachment name [unicode]
        @param revno: revision number (int) or None (all revs)
        """
        cache = self.get_cache(locking=False) # we lock manually
        cache.lock('w', 60.0)
        try:
            queue = self._queue(cache)
            entry = (pagename, attachmentname, revno)
            queue.append(entry)
            cache.update(queue)
        finally:
            cache.unlock()

    def get(self):
        """ Get (and remove) the first entry from the queue.

        Raises IndexError if the queue is empty.
        """
        cache = self.get_cache(locking=False) # we lock manually
        cache.lock('w', 60.0)
        try:
            queue = self._queue(cache)
            entry = queue.pop(0)
            cache.update(queue)
        finally:
            cache.unlock()
        return entry

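# Usage sketch (illustrative; `request` and `xapian_dir` are assumed to come
# from the caller's environment, not part of the original module):
#
#   queue = IndexerQueue(request, xapian_dir, 'indexer-queue')
#   queue.put(u'FrontPage')                  # all revisions of a page
#   queue.put(u'FrontPage', u'logo.png')     # one attachment
#   queue.put(u'FrontPage', None, 42)        # only revision 42
#   try:
#       while True:
#           pagename, attachmentname, revno = queue.get()
#           # ... hand the job to the xapian indexer ...
#   except IndexError:
#       pass  # queue drained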

class BaseIndex(object):
    """ Represents a search engine index """

    def __init__(self, request):
        """
        @param request: current request
        """
        self.request = request
        self.main_dir = self._main_dir()
        if not os.path.exists(self.main_dir):
            os.makedirs(self.main_dir)
        self.update_queue = IndexerQueue(request, self.main_dir, 'indexer-queue')

    def _main_dir(self):
        raise NotImplementedError()

    def exists(self):
        """ Check if the index exists """
        raise NotImplementedError()

    def mtime(self):
        """ Modification time of the index """
        raise NotImplementedError()

    def touch(self):
        """ Touch the index """
        raise NotImplementedError()

    def _search(self, query):
        """ Actually perform the search

        @param query: the search query objects tree
        """
        raise NotImplementedError()

    def search(self, query, **kw):
        """ Search for items in the index

        @param query: the search query object tree to pass to the index
        """
        return self._search(query, **kw)

    def update_item(self, pagename, attachmentname=None, revno=None, now=True):
        """ Update a single item (page or attachment) in the index

        @param pagename: the name of the page to update
        @param attachmentname: the name of the attachment to update
        @param revno: a specific revision number (int) or None (all revs)
        @param now: do all updates now (default: True)
        """
        self.update_queue.put(pagename, attachmentname, revno)
        if now:
            self.do_queued_updates()

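    # Usage sketch (illustrative; `index` stands for any concrete subclass
    # instance): queue several items cheaply, then process the queue once.
    #
    #   index.update_item(u'PageOne', now=False)
    #   index.update_item(u'PageTwo', u'chart.png', now=False)
    #   index.do_queued_updates()  # default amount=-1 processes everything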
    def indexPages(self, files=None, mode='update', pages=None):
        """ Index pages (and files, if given)

        @param files: iterator or list of files to index additionally
        @param mode: mode of indexing the pages, either 'update' or 'add'
        @param pages: list of pages to index; if not given, all pages are indexed
        """
        start = time.time()
        request = self._indexingRequest(self.request)
        self._index_pages(request, files, mode, pages=pages)
        logging.info("indexing completed successfully in %0.2f seconds." %
                    (time.time() - start))

    def _index_pages(self, request, files=None, mode='update', pages=None):
        """ Index all pages (and all given files)

        This should be called from indexPages only!

        @param request: current request
        @param files: iterator or list of files to index additionally
        @param mode: mode of indexing the pages, either 'update' or 'add'
        @param pages: list of pages to index; if not given, all pages are indexed
        """
        raise NotImplementedError()

    def do_queued_updates(self, amount=-1):
        """ Perform updates queued in the indexer queue

        @keyword amount: how many updates to perform at once (default: -1 == all)
        """
        raise NotImplementedError()

    def optimize(self):
        """ Optimize the index if possible """
        raise NotImplementedError()

    def contentfilter(self, filename):
        """ Get a filter for the content of filename and return unicode content.

        @param filename: name of the file
        """
        request = self.request
        mt = wikiutil.MimeType(filename=filename)
        for modulename in mt.module_name():
            try:
                execute = wikiutil.importPlugin(request.cfg, 'filter', modulename)
                break
            except wikiutil.PluginMissingError:
                pass
        else:
            logging.info("Cannot load filter for mimetype %s" % modulename)
        try:
            data = execute(self, filename)
            logging.debug("Filter %s returned %d characters for file %s" % (modulename, len(data), filename))
        except (OSError, IOError), err:
            data = ''
            logging.warning("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename))
        return mt.mime_type(), data

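    # Usage sketch (illustrative; the path is hypothetical): extract the
    # indexable text of a file through the filter plugins.
    #
    #   mimetype, data = index.contentfilter('/wiki/data/attachments/report.pdf')
    #   # mimetype is e.g. 'application/pdf'; data is unicode text,
    #   # or '' if the filter failed with an OSError/IOError.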
    def _indexingRequest(self, request):
        """ Return a new request that can be used for index building.

        This request uses a security policy that lets the current user
        read any page. Without this policy some pages would not render,
        which would result in a broken pagelinks index.

        @param request: current request
        """
        import copy
        from MoinMoin.security import Permissions
        from MoinMoin.logfile import editlog

        class SecurityPolicy(Permissions):

            def read(self, *args, **kw):
                return True

        r = copy.copy(request)
        r.user.may = SecurityPolicy(r.user)
        r.editlog = editlog.EditLog(r)
        return r


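# Subclass sketch (illustrative, not part of this module): a concrete index
# supplies the storage-specific pieces that BaseIndex leaves abstract; the
# xapian backend shipped with moin is the real implementation. All names
# below are assumptions.
#
#   class MyIndex(BaseIndex):
#       def _main_dir(self):
#           return os.path.join(self.request.cfg.cache_dir, 'myindex')
#       def exists(self):
#           return os.path.exists(os.path.join(self.main_dir, 'db'))
#       def mtime(self):
#           return os.path.getmtime(os.path.join(self.main_dir, 'db'))
#       def touch(self):
#           os.utime(os.path.join(self.main_dir, 'db'), None)
#       def _search(self, query):
#           return []  # query the backend, return hit tuples
#       def _index_pages(self, request, files=None, mode='update', pages=None):
#           pass  # walk pages (and files) and write them to the backend
#       def do_queued_updates(self, amount=-1):
#           pass  # pop up to `amount` jobs from self.update_queue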
##############################################################################
### Searching
##############################################################################


class BaseSearch(object):
    """ A search run """

    def __init__(self, request, query, sort='weight', mtime=None, historysearch=0):
        """
        @param request: current request
        @param query: search query objects tree
        @keyword sort: the sorting of the results (default: 'weight')
        @keyword mtime: only show items newer than this timestamp (default: None)
        @keyword historysearch: whether to show old revisions of a page (default: 0)
        """
        self.request = request
        self.query = query
        self.sort = sort
        self.mtime = mtime
        self.historysearch = historysearch
        self.filtered = False
        self.fs_rootpage = "FS" # XXX FS hardcoded

    def run(self):
        """ Perform the search and return a results object """

        start = time.time()
        hits, estimated_hits = self._search()

        # important - filter deleted pages or pages the user may not read!
        if not self.filtered:
            hits = self._filter(hits)
            logging.debug("after filtering: %d hits" % len(hits))

        return self._get_search_results(hits, start, estimated_hits)

    def _search(self):
        """
        Search pages.

        Return a list of tuples (wikiname, page object, attachment,
        matches, revision) and the estimated number of search results
        (if there is no estimate, None should be returned).

        The list may contain deleted pages or pages the user may not read.
        """
        raise NotImplementedError()

    def _filter(self, hits):
        """
        Filter out deleted or ACL-protected pages

        @param hits: list of hits
        """
        userMayRead = self.request.user.may.read
        fs_rootpage = self.fs_rootpage + "/"
        thiswiki = (self.request.cfg.interwikiname, 'Self')
        filtered = [(wikiname, page, attachment, match, rev)
                for wikiname, page, attachment, match, rev in hits
                    if (wikiname not in thiswiki or
                       (page.exists() and userMayRead(page.page_name)) or
                       page.page_name.startswith(fs_rootpage)) and
                       (not self.mtime or self.mtime <= page.mtime_usecs()/1000000)]
        return filtered

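    # Worked example (illustrative): a local hit ('Self') on an existing but
    # ACL-protected page evaluates as
    #   wikiname not in thiswiki               -> False  (it is a local hit)
    #   page.exists() and userMayRead(...)     -> False  (read denied)
    #   page_name.startswith('FS/')            -> False
    # so the hit is dropped. Hits from other wikis always pass this part;
    # the mtime clause then additionally drops items older than self.mtime.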
    def _get_search_results(self, hits, start, estimated_hits):
        return getSearchResults(self.request, self.query, hits, start, self.sort, estimated_hits)

    def _get_match(self, page=None, uid=None):
        """
        Get all matches

        @param page: the current page instance
        @param uid: unique id of the search hit (used by backend subclasses, ignored here)
        """
        if page:
            return self.query.search(page)

    def _getHits(self, pages):
        """ Get the hit tuples in pages through _get_match """
        logging.debug("_getHits searching in %d pages ..." % len(pages))
        hits = []
        revisionCache = {}
        fs_rootpage = self.fs_rootpage
        for hit in pages:

            uid = hit.get('uid')
            wikiname = hit['wikiname']
            pagename = hit['pagename']
            attachment = hit['attachment']
            revision = int(hit.get('revision', 0))

            logging.debug("_getHits processing %r %r %d %r" % (wikiname, pagename, revision, attachment))

            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
                page = Page(self.request, pagename, rev=revision)

                if not self.historysearch and revision:
                    revlist = page.getRevList()
                    # revlist can be empty if the page was nuked/renamed since it was indexed
                    if not revlist or revlist[0] != revision:
                        # nothing there at all, or not the current revision
                        logging.debug("no history search, skipping non-current revision...")
                        continue

                if attachment:
                    # revision is currently always 0 for attachments
                    if pagename == fs_rootpage: # not really an attachment
                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
                        hits.append((wikiname, page, None, None, revision))
                    else:
                        matches = self._get_match(page=None, uid=uid)
                        hits.append((wikiname, page, attachment, matches, revision))
                else:
                    matches = self._get_match(page=page, uid=uid)
                    logging.debug("self._get_match %r" % matches)
                    if matches:
                        if not self.historysearch and pagename in revisionCache and revisionCache[pagename][0] < revision:
                            hits.remove(revisionCache[pagename][1])
                            del revisionCache[pagename]
                        hits.append((wikiname, page, attachment, matches, revision))
                        revisionCache[pagename] = (revision, hits[-1])

            else: # other wiki
                hits.append((wikiname, pagename, attachment, None, revision))
        logging.debug("_getHits returning %r." % hits)
        return hits

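    # Shape of the returned hit tuples (illustrative values):
    #   local page:        ('Self', <Page>, '', [TextMatch, ...], 0)
    #   local attachment:  ('Self', <Page>, u'logo.png', matches, 0)
    #   FS pseudo page:    ('Self', <Page>, None, None, 0)
    #   other wiki:        (u'OtherWiki', u'PageName', '', None, 0)
    # Note that for hits from other wikis the second field is the page name
    # (unicode), not a Page object.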

class MoinSearch(BaseSearch):

    def __init__(self, request, query, sort='weight', mtime=None, historysearch=0, pages=None):
        super(MoinSearch, self).__init__(request, query, sort, mtime, historysearch)

        self.pages = pages

    def _search(self):
        """
        Search pages using moin's built-in full text search

        The list may contain deleted pages or pages the user may not
        read.

        If self.pages is not None, search only in those pages.
        """
        self.request.clock.start('_moinSearch')

        # if self.pages is None, we make a full pagelist, but don't
        # search attachments (thus attachment name = '')
        pages = self.pages or [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]

        hits = self._getHits(pages)
        self.request.clock.stop('_moinSearch')

        return hits, None

    def _getPageList(self):
        """ Get the list of pages to search in

        If the query has a page filter, use it to filter pages before
        searching. If not, get an unfiltered page list. The filtering
        will then happen later on the hits, which is faster with the
        current slow storage.
        """
        filter_ = self.query.pageFilter()
        if filter_:
            # There is no need to filter the results again.
            self.filtered = True
            return self.request.rootpage.getPageList(filter=filter_)
        else:
            return self.request.rootpage.getPageList(user='', exists=0)
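
# Usage sketch (illustrative): MoinSearch is normally driven through the
# public helper MoinMoin.search.searchPages, which parses the query string
# and picks the search implementation. Driving it directly looks roughly
# like this; the query construction is an assumption, see
# MoinMoin.search.queryparser for the real API:
#
#   from MoinMoin.search.queryparser import QueryParser
#   query = QueryParser().parse_query(u'CategoryHomepage')
#   results = MoinSearch(request, query, sort='weight').run()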