Back to index

moin  1.9.0~rc2
datastructures.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright (C) 2007 Lemur Consulting Ltd
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 # GNU General Public License for more details.
00014 # 
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00018 r"""datastructures.py: Datastructures for search engine core.
00019 
00020 """
00021 __docformat__ = "restructuredtext en"
00022 
00023 import errors
00024 from replaylog import log
00025 import xapian
00026 import cPickle
00027 
class Field(object):
    """A single named value belonging to a document.

    A Field is a plain container with two attributes, `name` and `value`.
    Both are stored exactly as supplied; no validation or conversion is
    performed here.

    """

    # Very many Field objects are expected to exist in typical usage, so
    # __slots__ is used to avoid a per-instance attribute dictionary.
    __slots__ = ('name', 'value')

    def __init__(self, name, value):
        """Create a field called `name` holding `value`."""
        self.name, self.value = name, value

    def __repr__(self):
        """Return a constructor-style representation of the field."""
        contents = (self.name, self.value)
        return 'Field(%r, %r)' % contents
00039 
class UnprocessedDocument(object):
    """An unprocessed document, ready to be handed to the indexer.

    An instance describes an item which is to be processed and stored in
    the search engine.  The indexer turns each UnprocessedDocument into a
    ProcessedDocument, and it is the ProcessedDocument which is stored in
    the search engine index.

    Some of the information held in an UnprocessedDocument is not carried
    over to the ProcessedDocument, so an UnprocessedDocument cannot be
    retrieved back from the search engine index.

    The class is a simple container with two attributes:

     - `fields`: a list of Field objects, or an iterator returning Field
       objects.
     - `id`: a string holding a unique identifier for the document, or None
       to have the database allocate a unique identifier automatically when
       the document is added.

    """

    __slots__ = ('id', 'fields')

    def __init__(self, id=None, fields=None):
        """Create a document with the given `id` and `fields`.

        When `fields` is omitted (or None), the document starts out with its
        own new, empty list of fields.

        """
        if fields is None:
            # Build the list per instance: a shared default would leak
            # fields between documents.
            fields = []
        self.id = id
        self.fields = fields

    def __repr__(self):
        """Return a constructor-style representation of the document."""
        contents = (self.id, self.fields)
        return 'UnprocessedDocument(%r, %r)' % contents
00071 
class ProcessedDocument(object):
    """A processed document, as stored in the index.

    This represents an item which is ready to be stored in the search engine,
    or which has been returned by the search engine.

    The object wraps a Xapian document (`_doc`), the field configuration used
    to translate field names into term prefixes and value slots
    (`_fieldmappings`), and a lazily unpickled copy of the document data
    (`_data`, None until the `data` property is first read or assigned).

    """

    __slots__ = '_doc', '_fieldmappings', '_data',

    def __init__(self, fieldmappings, xapdoc=None):
        """Create a ProcessedDocument.

        `fieldmappings` is the configuration from a database connection, used
        to look up the configuration to use to store each field.

        If supplied, `xapdoc` is a Xapian document to store in the processed
        document.  Otherwise, a new Xapian document is created.

        """
        if xapdoc is None:
            self._doc = log(xapian.Document)
        else:
            self._doc = xapdoc
        self._fieldmappings = fieldmappings
        self._data = None

    def add_term(self, field, term, wdfinc=1, positions=None):
        """Add a term to the document.

        Terms are the main unit of information used for performing searches.

        - `field` is the field to add the term to.
        - `term` is the term to add.
        - `wdfinc` is the value to increase the within-document-frequency
          measure for the term by.
        - `positions` is the positional information to add for the term.
          This may be None to indicate that there is no positional information,
          or may be an integer to specify one position, or may be a sequence of
          integers to specify several positions.  (Note that the wdf is not
          increased automatically for each position: if you add a term at 7
          positions, and the wdfinc value is 2, the total wdf for the term will
          only be increased by 2, not by 14.)

        Raises errors.IndexerError if the prefixed term is longer than 220
        characters.

        """
        prefix = self._fieldmappings.get_prefix(field)
        if len(term) > 0:
            # We use the following check, rather than "isupper()" to ensure
            # that we match the check performed by the queryparser, regardless
            # of our locale.
            first = ord(term[0])
            if ord('A') <= first <= ord('Z'):
                # A separator is added between the prefix and a term which
                # starts with an ASCII capital letter.
                prefix = prefix + ':'

        # Build the prefixed term once; it is needed by the length check and
        # by every call made on the underlying document below.
        fullterm = prefix + term

        # Note - xapian currently restricts term lengths to about 248
        # characters - except that zero bytes are encoded in two bytes, so
        # in practice a term of length 125 characters could be too long.
        # Xapian will give an error when commit() is called after such
        # documents have been added to the database.
        # As a simple workaround, we give an error here for terms over 220
        # characters, which will catch most occurrences of the error early.
        #
        # In future, it might be good to change to a hashing scheme in this
        # situation (or for terms over, say, 64 characters), where the
        # characters after position 64 are hashed (we obviously need to do this
        # hashing at search time, too).
        if len(fullterm) > 220:
            raise errors.IndexerError("Field %r is too long: maximum length "
                                       "220 - was %d (%r)" %
                                       (field, len(fullterm), fullterm))

        if positions is None:
            self._doc.add_term(fullterm, wdfinc)
        elif isinstance(positions, int):
            self._doc.add_posting(fullterm, positions, wdfinc)
        else:
            # Apply the wdf increment exactly once, then record each position
            # with a zero increment so that the wdf does not scale with the
            # number of positions.
            self._doc.add_term(fullterm, wdfinc)
            for pos in positions:
                self._doc.add_posting(fullterm, pos, 0)

    def add_value(self, field, value, purpose=''):
        """Add a value to the document.

        Values are additional units of information used when performing
        searches.  Note that values are _not_ intended to be used to store
        information for display in the search results - use the document data
        for that.  The intention is that as little information as possible is
        stored in values, so that they can be accessed as quickly as possible
        during the search operation.

        Unlike terms, each document may have at most one value in each field
        (whereas there may be an arbitrary number of terms in a given field).
        If an attempt to add multiple values to a single field is made, only
        the last value added will be stored.

        - `field` is the field to store the value for.
        - `value` is the value to store.
        - `purpose` is passed, together with `field`, to the field mappings
          to select the slot in which the value is stored.

        """
        slot = self._fieldmappings.get_slot(field, purpose)
        self._doc.add_value(slot, value)

    def get_value(self, field, purpose=''):
        """Get a value from the document.

        The slot to read is looked up from `field` and `purpose` in the same
        way as in `add_value`; whatever the underlying document holds in that
        slot is returned.

        """
        slot = self._fieldmappings.get_slot(field, purpose)
        return self._doc.get_value(slot)

    def prepare(self):
        """Prepare the document for adding to a xapian database.

        This updates the internal xapian document with any changes which have
        been made, and then returns it.

        """
        if self._data is not None:
            # Serialise the cached data (pickle protocol 2) into the document
            # and drop the cache; it is reloaded on the next read of `data`.
            self._doc.set_data(cPickle.dumps(self._data, 2))
            self._data = None
        return self._doc

    def _get_data(self):
        """Return the document data, unpickling it on first access."""
        if self._data is None:
            rawdata = self._doc.get_data()
            if rawdata == '':
                self._data = {}
            else:
                # NOTE(review): unpickling can execute arbitrary code, so this
                # is only safe while the database contents are trusted.
                self._data = cPickle.loads(rawdata)
        return self._data

    def _set_data(self, data):
        """Replace the document data; `data` must be a dict."""
        if not isinstance(data, dict):
            raise TypeError("Cannot set data to any type other than a dict")
        self._data = data

    data = property(_get_data, _set_data, doc=
    """The data stored in this processed document.

    This data is a dictionary of entries, where the key is a fieldname, and the
    value is a list of strings.

    """)

    def _get_id(self):
        """Return the document ID, or None if no ID term is present.

        The ID is stored as a term with the prefix 'Q'; the prefix is stripped
        from the returned value.

        """
        tl = self._doc.termlist()
        try:
            term = tl.skip_to('Q').term
        except StopIteration:
            # No term at or after 'Q' exists in the document.
            return None
        if len(term) == 0 or term[0] != 'Q':
            # skip_to() landed on a later term which is not an ID term.
            return None
        return term[1:]

    def _set_id(self, id):
        """Set the document ID, replacing any existing ID term.

        Passing None removes the existing ID term without adding a new one.

        """
        tl = self._doc.termlist()
        try:
            term = tl.skip_to('Q').term
        except StopIteration:
            term = ''
        if len(term) != 0 and term[0] == 'Q':
            self._doc.remove_term(term)
        if id is not None:
            # The ID term is added with a wdf increment of 0.
            self._doc.add_term('Q' + id, 0)

    id = property(_get_id, _set_id, doc=
    """The unique ID for this document.

    """)

    def __repr__(self):
        """Return a short representation showing the document ID."""
        # A one-element tuple is required so that the ID is always formatted
        # as a single value, whatever its type.
        return '<ProcessedDocument(%r)>' % (self.id,)
00235 
if __name__ == '__main__':
    # When run as a script, execute any doctests embedded in this module.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])