Back to index

moin  1.9.0~rc2
fieldactions.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright (C) 2007 Lemur Consulting Ltd
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 # GNU General Public License for more details.
00014 # 
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00018 r"""fieldactions.py: Definitions and implementations of field actions.
00019 
00020 """
00021 __docformat__ = "restructuredtext en"
00022 
00023 import _checkxapian
00024 import errors
00025 import marshall
00026 from replaylog import log
00027 import xapian
00028 import parsedate
00029 
00030 def _act_store_content(fieldname, doc, value, context):
00031     """Perform the STORE_CONTENT action.
00032     
00033     """
00034     try:
00035         fielddata = doc.data[fieldname]
00036     except KeyError:
00037         fielddata = []
00038         doc.data[fieldname] = fielddata
00039     fielddata.append(value)
00040 
00041 def _act_index_exact(fieldname, doc, value, context):
00042     """Perform the INDEX_EXACT action.
00043     
00044     """
00045     doc.add_term(fieldname, value, 0)
00046 
00047 def _act_tag(fieldname, doc, value, context):
00048     """Perform the TAG action.
00049     
00050     """
00051     doc.add_term(fieldname, value.lower(), 0)
00052 
def _act_facet(fieldname, doc, value, context, type=None):
    """Perform the FACET action.

    - `type` selects how the facet value is stored.  With None or 'string'
      the lower-cased value is added as a term and appended to the string
      list kept in the field's 'facet' value.  Any other type is converted
      with the marshalling function for that type and stored directly as
      the 'facet' value.

    `context` is not used.

    """
    is_string_facet = type is None or type == 'string'

    if not is_string_facet:
        # Typed facet: store the marshalled representation only.
        marshall_fn = SortableMarshaller().get_marshall_function(fieldname,
                                                                 type)
        doc.add_value(fieldname, marshall_fn(fieldname, value), 'facet')
        return

    term = value.lower()
    doc.add_term(fieldname, term, 0)

    # Extend the list of facet strings already stored for this field.
    existing = doc.get_value(fieldname, 'facet')
    string_list = log(xapian.StringListSerialiser, existing)
    string_list.append(term)
    doc.add_value(fieldname, string_list.get(), 'facet')
00068 
def _act_index_freetext(fieldname, doc, value, context, weight=1, 
                        language=None, stop=None, spell=False,
                        nopos=False,
                        allow_field_specific=True,
                        search_by_default=True):
    """Perform the INDEX_FREETEXT action.

    Splits `value` into terms with a xapian TermGenerator and adds them to
    the document wrapped by `doc`.

    - `weight` is the multiplier passed to the term generator for each
      indexed term.
    - `language`, if not None, selects the stemmer to use.
    - `stop`, if not None, is an iterable of stopwords.
    - `spell`, if true, also feeds the text to the spelling data of
      `context.index`.
    - `nopos`, if true, indexes the text without positional information.
    - `allow_field_specific`, if true, indexes a copy of the text with the
      field's prefix (when the field has a non-empty prefix).
    - `search_by_default`, if true, indexes a copy of the text without a
      prefix.

    On return `context.current_position` has been advanced past this field
    instance plus a gap of 10 positions.

    """
    termgen = log(xapian.TermGenerator)
    if language is not None:
        termgen.set_stemmer(log(xapian.Stem, language))

    if stop is not None:
        stopper = log(xapian.SimpleStopper)
        for term in stop:
            stopper.add(term)
        termgen.set_stopper(stopper)

    if spell:
        termgen.set_database(context.index)
        termgen.set_flags(termgen.FLAG_SPELLING)

    termgen.set_document(doc._doc)

    # Pick the indexing method once; both copies of the text use the same one.
    if nopos:
        index_text = termgen.index_text_without_positions
    else:
        index_text = termgen.index_text

    # Always seed the generator with the position reached by earlier fields.
    # Previously this only happened when some text was actually indexed, so
    # a call which indexed nothing overwrote context.current_position with
    # the fresh generator's own counter plus the gap, discarding the
    # accumulated position.
    start_position = context.current_position
    termgen.set_termpos(start_position)

    if search_by_default:
        # Store a copy of the field without a prefix, for non-field-specific
        # searches.
        index_text(value, weight, '')

    if allow_field_specific:
        # Store a second copy of the term with a prefix, for field-specific
        # searches.
        prefix = doc._fieldmappings.get_prefix(fieldname)
        if len(prefix) != 0:
            # Rewind so that both copies share the same term positions.
            termgen.set_termpos(start_position)
            index_text(value, weight, prefix)

    # Add a gap between each field instance, so that phrase searches don't
    # match across instances.
    termgen.increase_termpos(10)
    context.current_position = termgen.get_termpos()
00117 
class SortableMarshaller(object):
    """Implementation of marshalling for sortable values.

    Each `marshall_*` method takes the name of the field (used only in error
    messages) and the value to convert, and returns the value to store.
    Invalid values are reported with `errors.IndexerError` when the
    marshaller was created for indexing, and with `errors.SearchError`
    otherwise.

    """
    def __init__(self, indexing=True):
        """Create a marshaller.

        - `indexing` selects the exception class raised for bad input:
          IndexerError if true (the default), SearchError if false.

        """
        if indexing:
            self._err = errors.IndexerError
        else:
            self._err = errors.SearchError

    def marshall_string(self, fieldname, value):
        """Marshall a value for sorting in lexicographical order.

        This returns the input as the output, since strings already sort in
        lexicographical order.

        """
        return value

    def marshall_float(self, fieldname, value):
        """Marshall a value for sorting as a floating point value.

        Raises the marshaller's error class if `value` cannot be converted
        by float().

        """
        # float() raises ValueError for malformed strings but TypeError for
        # unsupported types such as None; report both as an invalid value
        # rather than letting a bare TypeError escape.
        try:
            number = float(value)
        except (ValueError, TypeError):
            raise self._err("Value supplied to field %r must be a "
                            "valid floating point number: was %r" %
                            (fieldname, value))
        return marshall.float_to_string(number)

    def marshall_date(self, fieldname, value):
        """Marshall a value for sorting as a date.

        Raises the marshaller's error class if parsing `value` fails with a
        ValueError.

        """
        try:
            parsed = parsedate.date_from_string(value)
        except ValueError:
            # Fetch the active exception via sys.exc_info() so this parses
            # on every Python version: "except E, e" is Python 2 only and
            # "except E as e" needs 2.6 or later.
            import sys
            reason = sys.exc_info()[1]
            raise self._err("Value supplied to field %r must be a "
                            "valid date: was %r: error is '%s'" %
                            (fieldname, value, str(reason)))
        return marshall.date_to_string(parsed)

    def get_marshall_function(self, fieldname, sorttype):
        """Get a function used to marshall values of a given sorttype.

        `sorttype` may be None, 'string', 'float' or 'date'; None is treated
        like 'string'.  Any other value raises the marshaller's error class.
        The returned callable takes (fieldname, value).

        """
        marshallers = {
            None: self.marshall_string,
            'string': self.marshall_string,
            'float': self.marshall_float,
            'date': self.marshall_date,
        }
        try:
            return marshallers[sorttype]
        except KeyError:
            raise self._err("Unknown sort type %r for field %r" %
                            (sorttype, fieldname))
00176 
00177 
def _act_sort_and_collapse(fieldname, doc, value, context, type=None):
    """Perform the combined SORTABLE / COLLAPSE action.

    Converts `value` with the marshalling function for `type` (None is
    handled like 'string') and stores the result as the field's 'collsort'
    value in `doc`.  `context` is not used.

    """
    marshall_fn = SortableMarshaller().get_marshall_function(fieldname, type)
    doc.add_value(fieldname, marshall_fn(fieldname, value), 'collsort')
00186 
class ActionContext(object):
    """State shared between the actions performed on a document.

    An instance carries the index being written to, the term position
    reached so far, and a current language slot, so that successive actions
    can pass these along.

    """
    def __init__(self, index):
        # The index the actions operate against.
        self.index = index
        # Term position at which the next free text field starts.
        self.current_position = 0
        # No language is selected initially.
        self.current_language = None
00198 
class FieldActions(object):
    """An object describing the actions to be performed on a field.

    The supported actions are:

    - `STORE_CONTENT`: store the unprocessed content of the field in the search
      engine database.  All fields which need to be displayed or used when
      displaying the search results need to be given this action.

    - `INDEX_EXACT`: index the exact content of the field as a single search
      term.  Fields whose contents need to be searchable as an "exact match"
      need to be given this action.

    - `INDEX_FREETEXT`: index the content of this field as text.  The content
      will be split into terms, allowing free text searching of the field.  The
      following optional parameters may be supplied:

      - 'weight' is a multiplier to apply to the importance of the field.  This
        must be an integer, and the default value is 1.
      - 'language' is the language to use when processing the field.  This can
        be expressed as an ISO 2-letter language code.  The supported languages
        are those supported by the xapian core in use.
      - 'stop' is an iterable of stopwords to filter out of the generated
        terms.  Note that due to Xapian design, only non-positional terms are
        affected, so this is of limited use.
      - 'spell' is a boolean flag - if true, the contents of the field will be
        used for spelling correction.
      - 'nopos' is a boolean flag - if true, positional information is not
        stored.
      - 'allow_field_specific' is a boolean flag - if False, prevents terms
        with the field prefix being generated.  This means that searches
        specific to this field will not work, and thus should only be used
        when only non-field specific searches are desired.  Defaults to True.
      - 'search_by_default' is a boolean flag - if False, the field will not be
        searched by non-field specific searches.  If True, or omitted, the
        field will be included in searches for non field-specific searches.

    - `SORTABLE`: index the content of the field such that it can be used to
      sort result sets.  It also allows result sets to be restricted to those
      documents with a field value in a given range.  One optional parameter
      may be supplied:

      - 'type' is a value indicating how to sort the field.  It has several
        possible values:

        - 'string' - sort in lexicographic (ie, alphabetical) order.
          This is the default, used if no type is set.
        - 'float' - treat the values as (decimal representations of) floating
          point numbers, and sort in numerical order.  The values in the field
          must be valid floating point numbers (according to Python's float()
          function).
        - 'date' - sort in date order.  The values must be valid dates (either
          Python datetime.date objects, or ISO 8601 format, ie, YYYYMMDD or
          YYYY-MM-DD).

    - `COLLAPSE`: index the content of the field such that it can be used to
      "collapse" result sets, such that only the highest result with each value
      of the field will be returned.

    - `TAG`: the field contains tags; these are strings, which will be matched
      in a case insensitive way, but otherwise must be exact matches.  Tag
      fields can be searched for by making an explicit query (ie, using
      query_field(), but not with query_parse()).  A list of the most frequent
      tags in a result set can also be accessed easily.

    - `FACET`: the field represents a classification facet; these are strings
      which will be matched exactly, but a list of all the facets present in
      the result set can also be accessed easily - in addition, a suitable
      subset of the facets, and a selection of the facet values, present in the
      result set can be calculated.  One optional parameter may be supplied:

      - 'type' is a value indicating the type of facet contained in the field:

        - 'string' - the facet values are exact binary strings.
        - 'float' - the facet values are floating point numbers.

    """

    # Public action identifiers; their meanings are given in the class
    # docstring.
    STORE_CONTENT = 1
    INDEX_EXACT = 2
    INDEX_FREETEXT = 3
    SORTABLE = 4
    COLLAPSE = 5
    TAG = 6
    FACET = 7

    # Internal identifier under which both SORTABLE and COLLAPSE are
    # recorded: each stores the data in a value whose format depends on the
    # sort type, so they are handled as a single action.
    SORT_AND_COLLAPSE = -1

    # Actions which the xapian release in use cannot support.
    _unsupported_actions = []

    if 'tags' in _checkxapian.missing_features:
        _unsupported_actions.append(TAG)
    if 'facets' in _checkxapian.missing_features:
        _unsupported_actions.append(FACET)

    def __init__(self, fieldname):
        """Create an empty set of actions for the field `fieldname`."""
        self._fieldname = fieldname
        # Maps an action identifier to the list of parameter dictionaries
        # registered for that action.
        self._actions = {}

    def add(self, field_mappings, action, **kwargs):
        """Add an action to perform on a field.

        - `field_mappings` is the object on which the prefix and value slots
          needed by the action are registered.
        - `action` is one of the public action constants of this class.
        - `kwargs` are the parameters for the action; only the names listed
          for the action in `_action_info` are accepted.

        Raises `errors.IndexerError` if the action is unsupported or unknown,
        if a parameter name is not recognised, if the field would be indexed
        both as exact and as free text, or if the field is already sortable
        with a different sort type.

        """
        if action in self._unsupported_actions:
            raise errors.IndexerError("Action unsupported with this release of xapian")

        public_actions = (
            FieldActions.STORE_CONTENT,
            FieldActions.INDEX_EXACT,
            FieldActions.INDEX_FREETEXT,
            FieldActions.SORTABLE,
            FieldActions.COLLAPSE,
            FieldActions.TAG,
            FieldActions.FACET,
        )
        if action not in public_actions:
            raise errors.IndexerError("Unknown field action: %r" % action)

        # Looked up before SORTABLE/COLLAPSE are remapped below, so the
        # parameter check uses the rules of the action the caller named.
        name, allowed_params, _handler, requirements = \
            self._action_info[action]

        # Reject parameters the action does not understand.
        for key in kwargs:
            if key not in allowed_params:
                raise errors.IndexerError("Unknown parameter name for action %r: %r" % (name, key))

        # A field cannot be indexed both with "EXACT" and "FREETEXT": whilst
        # this could be implemented, the query parser wouldn't know what to
        # do with searches.
        if (action == FieldActions.INDEX_EXACT and
                FieldActions.INDEX_FREETEXT in self._actions):
            raise errors.IndexerError("Field %r is already marked for indexing "
                                      "as free text: cannot mark for indexing "
                                      "as exact text as well" % self._fieldname)
        if (action == FieldActions.INDEX_FREETEXT and
                FieldActions.INDEX_EXACT in self._actions):
            raise errors.IndexerError("Field %r is already marked for indexing "
                                      "as exact text: cannot mark for indexing "
                                      "as free text as well" % self._fieldname)

        # A field cannot be "SORTABLE" with more than one type: that would
        # need a different prefix for each sortable type, and even then the
        # search end wouldn't know what to sort on.  If the field is also
        # marked "COLLAPSE", the value must be stored in the format required
        # by the "SORTABLE" type.
        if action in (FieldActions.SORTABLE, FieldActions.COLLAPSE):
            if action == FieldActions.COLLAPSE:
                # COLLAPSE has no type of its own.
                sorttype = None
            else:
                sorttype = kwargs.get('type', 'string')
            kwargs['type'] = sorttype
            action = FieldActions.SORT_AND_COLLAPSE

            previous = self._actions.get(FieldActions.SORT_AND_COLLAPSE, ())
            if len(previous) > 0:
                # Only the most recently registered entry is consulted.
                oldsorttype = previous[-1]['type']

                if sorttype == oldsorttype or oldsorttype is None:
                    # Use new type
                    self._actions[action] = []
                elif sorttype is None:
                    # Use old type
                    return
                else:
                    raise errors.IndexerError("Field %r is already marked for "
                                              "sorting, with a different "
                                              "sort type" % self._fieldname)

        # Register whatever prefix and slots the action requires.
        if 'prefix' in requirements:
            field_mappings.add_prefix(self._fieldname)
        if 'slot' in requirements:
            purposes = requirements['slot']
            if isinstance(purposes, basestring):
                field_mappings.add_slot(self._fieldname, purposes)
            else:
                # Several purposes: reuse the first slot already allocated
                # to any of them, and register every purpose with it.
                shared_slot = None
                for purpose in purposes:
                    shared_slot = field_mappings.get_slot(self._fieldname,
                                                          purpose)
                    if shared_slot is not None:
                        break
                for purpose in purposes:
                    field_mappings.add_slot(self._fieldname, purpose,
                                            slotnum=shared_slot)

        # Record the parameters, unless an identical entry already exists.
        registered = self._actions.setdefault(action, [])
        if kwargs not in registered:
            registered.append(kwargs)

    def perform(self, doc, value, context):
        """Perform the actions on the field.

        - `doc` is a ProcessedDocument to store the result of the actions in.
        - `value` is a string holding the value of the field.
        - `context` is an ActionContext object used to keep state in.

        """
        for action, param_sets in self._actions.items():
            handler = self._action_info[action][2]
            for params in param_sets:
                handler(self._fieldname, doc, value, context, **params)

    # For each action: (name, accepted parameter names, function performing
    # the action, requirements on the field mappings).
    _action_info = {
        STORE_CONTENT: (
            'STORE_CONTENT', (), _act_store_content, {},
        ),
        INDEX_EXACT: (
            'INDEX_EXACT', (), _act_index_exact, {'prefix': True},
        ),
        INDEX_FREETEXT: (
            'INDEX_FREETEXT',
            ('weight', 'language', 'stop', 'spell', 'nopos',
             'allow_field_specific', 'search_by_default', ),
            _act_index_freetext,
            {'prefix': True, },
        ),
        SORTABLE: (
            'SORTABLE', ('type', ), None, {'slot': 'collsort', },
        ),
        COLLAPSE: (
            'COLLAPSE', (), None, {'slot': 'collsort', },
        ),
        TAG: (
            'TAG', (), _act_tag, {'prefix': True, },
        ),
        FACET: (
            'FACET', ('type', ), _act_facet,
            {'prefix': True, 'slot': 'facet', },
        ),

        SORT_AND_COLLAPSE: (
            'SORT_AND_COLLAPSE', ('type', ), _act_sort_and_collapse,
            {'slot': 'collsort', },
        ),
    }
00429 
if __name__ == '__main__':
    # When executed as a script, run the doctests found in this module.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)