moin 1.9.0~rc2
searchconnection.py
#!/usr/bin/env python
#
# Copyright (C) 2007 Lemur Consulting Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
r"""searchconnection.py: A connection to the search engine for searching.

"""
__docformat__ = "restructuredtext en"

import _checkxapian
import os as _os
import cPickle as _cPickle
import math

import xapian as _xapian
from datastructures import *
from fieldactions import *
import fieldmappings as _fieldmappings
import highlight as _highlight
import errors as _errors
import indexerconnection as _indexerconnection
import re as _re
from replaylog import log as _log

class SearchResult(ProcessedDocument):
    """A result from a search.

    As well as being a ProcessedDocument representing the document in the
    database, the result has several members which may be used to get
    information about how well the document matches the search:

     - `rank`: The rank of the document in the search results, starting at 0
       (ie, 0 is the "top" result, 1 is the second result, etc).

     - `weight`: A floating point number indicating the weight of the result
       document.  The value is only meaningful relative to other results for a
       given search - a different search, or the same search with a different
       database, may give an entirely different scale to the weights.  This
       should not usually be displayed to users, but may be useful if trying to
       perform advanced reweighting operations on search results.

     - `percent`: A percentage value for the weight of a document.  This is
       just a rescaled form of the `weight` member.  It doesn't represent any
       kind of probability value; the only real meaning of the numbers is that,
       within a single set of results, a document with a higher percentage
       corresponds to a better match.  Because the percentage doesn't really
       represent a probability, or a confidence value, it is probably unhelpful
       to display it to most users, since they tend to place an over-emphasis
       on its meaning.  However, it is included because it may be useful
       occasionally.

    """
    def __init__(self, msetitem, results):
        ProcessedDocument.__init__(self, results._fieldmappings, msetitem.document)
        self.rank = msetitem.rank
        self.weight = msetitem.weight
        self.percent = msetitem.percent
        self._results = results

    def _get_language(self, field):
        """Get the language that should be used for a given field.

        Raises a KeyError if the field is not known.

        """
        actions = self._results._conn._field_actions[field]._actions
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.INDEX_FREETEXT:
                for kwargs in kwargslist:
                    try:
                        return kwargs['language']
                    except KeyError:
                        pass
        return 'none'

    def summarise(self, field, maxlen=600, hl=('<b>', '</b>'), query=None):
        """Return a summarised version of the field specified.

        This will return a summary of the contents of the field stored in the
        search result, with words which match the query highlighted.

        The maximum length of the summary (in characters) may be set using the
        maxlen parameter.

        The return value will be a string holding the summary, with
        highlighting applied.  If there are multiple instances of the field in
        the document, the instances will be joined with a newline character.

        To turn off highlighting, set hl to None.  Each highlight will consist
        of the first entry in the `hl` list being placed before the word, and
        the second entry in the `hl` list being placed after the word.

        Any XML or HTML style markup tags in the field will be stripped before
        the summarisation algorithm is applied.

        If `query` is supplied, it should contain a Query object, as returned
        from SearchConnection.query_parse() or related methods, which will be
        used as the basis of the summarisation and highlighting rather than the
        query which was used for the search.

        Raises KeyError if the field is not known.

        """
        highlighter = _highlight.Highlighter(language_code=self._get_language(field))
        field = self.data[field]
        text = '\n'.join(field)
        if query is None:
            query = self._results._query
        return highlighter.makeSample(text, query, maxlen, hl)

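    # Usage sketch (commented out; illustrative only): given a `results`
    # object returned by SearchConnection.search(), and assuming the index
    # has a freetext field called 'text', a summary with custom highlighting
    # could be produced like this:
    #
    #   for result in results:
    #       print result.summarise('text', maxlen=200, hl=('<em>', '</em>'))
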
    def highlight(self, field, hl=('<b>', '</b>'), strip_tags=False, query=None):
        """Return a highlighted version of the field specified.

        This will return all the contents of the field stored in the search
        result, with words which match the query highlighted.

        The return value will be a list of strings (corresponding to the list
        of strings which is the raw field data).

        Each highlight will consist of the first entry in the `hl` list being
        placed before the word, and the second entry in the `hl` list being
        placed after the word.

        If `strip_tags` is True, any XML or HTML style markup tags in the field
        will be stripped before highlighting is applied.

        If `query` is supplied, it should contain a Query object, as returned
        from SearchConnection.query_parse() or related methods, which will be
        used as the basis of the summarisation and highlighting rather than the
        query which was used for the search.

        Raises KeyError if the field is not known.

        """
        highlighter = _highlight.Highlighter(language_code=self._get_language(field))
        field = self.data[field]
        results = []
        if query is None:
            query = self._results._query
        for text in field:
            results.append(highlighter.highlight(text, query, hl, strip_tags))
        return results

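    # Usage sketch (illustrative only): unlike summarise(), highlight()
    # returns one string per stored instance of the field, rather than a
    # single joined summary.  Assuming the same example 'text' field:
    #
    #   for result in results:
    #       for fragment in result.highlight('text', strip_tags=True):
    #           print fragment
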
    def __repr__(self):
        return ('<SearchResult(rank=%d, id=%r, data=%r)>' %
                (self.rank, self.id, self.data))


class SearchResultIter(object):
    """An iterator over a set of results from a search.

    """
    def __init__(self, results, order):
        self._results = results
        self._order = order
        if self._order is None:
            self._iter = iter(results._mset)
        else:
            self._iter = iter(self._order)

    def next(self):
        if self._order is None:
            msetitem = self._iter.next()
        else:
            index = self._iter.next()
            msetitem = self._results._mset.get_hit(index)
        return SearchResult(msetitem, self._results)


def _get_significant_digits(value, lower, upper):
    """Get the significant digits of value which are constrained by the
    (inclusive) lower and upper bounds.

    If there are no significant digits which are definitely within the
    bounds, exactly one significant digit will be returned in the result.

    >>> _get_significant_digits(15,15,15)
    15
    >>> _get_significant_digits(15,15,17)
    20
    >>> _get_significant_digits(4777,208,6000)
    5000
    >>> _get_significant_digits(4777,4755,4790)
    4800
    >>> _get_significant_digits(4707,4695,4710)
    4700
    >>> _get_significant_digits(4719,4717,4727)
    4720
    >>> _get_significant_digits(0,0,0)
    0
    >>> _get_significant_digits(9,9,10)
    9
    >>> _get_significant_digits(9,9,100)
    9

    """
    assert(lower <= value)
    assert(value <= upper)
    diff = upper - lower

    # Get the first power of 10 greater than the difference.
    # This corresponds to the magnitude of the smallest significant digit.
    if diff == 0:
        pos_pow_10 = 1
    else:
        pos_pow_10 = int(10 ** math.ceil(math.log10(diff)))

    # Special case for situation where we don't have any significant digits:
    # get the magnitude of the most significant digit in value.
    if pos_pow_10 > value:
        if value == 0:
            pos_pow_10 = 1
        else:
            pos_pow_10 = int(10 ** math.floor(math.log10(value)))

    # Return the value, rounded to the nearest multiple of pos_pow_10
    return ((value + pos_pow_10 // 2) // pos_pow_10) * pos_pow_10

class SearchResults(object):
    """A set of results of a search.

    """
    def __init__(self, conn, enq, query, mset, fieldmappings, tagspy,
                 tagfields, facetspy, facetfields, facethierarchy,
                 facetassocs):
        self._conn = conn
        self._enq = enq
        self._query = query
        self._mset = mset
        self._mset_order = None
        self._fieldmappings = fieldmappings
        self._tagspy = tagspy
        if tagfields is None:
            self._tagfields = None
        else:
            self._tagfields = set(tagfields)
        self._facetspy = facetspy
        self._facetfields = facetfields
        self._facethierarchy = facethierarchy
        self._facetassocs = facetassocs
        self._numeric_ranges_built = {}

    def _cluster(self, num_clusters, maxdocs, fields=None):
        """Cluster results based on similarity.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        The number of clusters is specified by num_clusters: unless there are
        too few results, there will be exactly this number of clusters in the
        result.

        """
        clusterer = _xapian.ClusterSingleLink()
        xapclusters = _xapian.ClusterAssignments()
        docsim = _xapian.DocSimCosine()
        source = _xapian.MSetDocumentSource(self._mset, maxdocs)

        if fields is None:
            clusterer.cluster(self._conn._index, xapclusters, docsim, source, num_clusters)
        else:
            decider = self._make_expand_decider(fields)
            clusterer.cluster(self._conn._index, xapclusters, docsim, source, decider, num_clusters)

        newid = 0
        idmap = {}
        clusters = {}
        for item in self._mset:
            docid = item.docid
            clusterid = xapclusters.cluster(docid)
            if clusterid not in idmap:
                idmap[clusterid] = newid
                newid += 1
            clusterid = idmap[clusterid]
            if clusterid not in clusters:
                clusters[clusterid] = []
            clusters[clusterid].append(item.rank)
        return clusters

    def _reorder_by_clusters(self, clusters):
        """Reorder the mset based on some clusters.

        """
        if self.startrank != 0:
            raise _errors.SearchError("startrank must be zero to reorder by clusters")
        tophits = []
        nottophits = []

        clusterstarts = dict(((c[0], None) for c in clusters.itervalues()))
        for i in xrange(self.endrank):
            if i in clusterstarts:
                tophits.append(i)
            else:
                nottophits.append(i)
        self._mset_order = tophits
        self._mset_order.extend(nottophits)

    def _make_expand_decider(self, fields):
        """Make an expand decider which accepts only terms in the specified
        field.

        """
        prefixes = {}
        if isinstance(fields, basestring):
            fields = [fields]
        for field in fields:
            try:
                actions = self._conn._field_actions[field]._actions
            except KeyError:
                continue
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
                    prefixes['Z' + prefix] = None
                if action in (FieldActions.INDEX_EXACT,
                              FieldActions.TAG,
                              FieldActions.FACET,):
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
        prefix_re = _re.compile('|'.join([_re.escape(x) + '[^A-Z]' for x in prefixes.keys()]))
        class decider(_xapian.ExpandDecider):
            def __call__(self, term):
                return prefix_re.match(term) is not None
        return decider()

    def _reorder_by_similarity(self, count, maxcount, max_similarity,
                               fields=None):
        """Reorder results based on similarity.

        The top `count` documents will be chosen such that they are relatively
        dissimilar.  `maxcount` documents will be considered for moving around,
        and `max_similarity` is a value between 0 and 1 indicating the maximum
        similarity to the previous document before a document is moved down the
        result set.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        """
        if self.startrank != 0:
            raise _errors.SearchError("startrank must be zero to reorder by similarity")
        ds = _xapian.DocSimCosine()
        ds.set_termfreqsource(_xapian.DatabaseTermFreqSource(self._conn._index))

        if fields is not None:
            ds.set_expand_decider(self._make_expand_decider(fields))

        tophits = []
        nottophits = []
        full = False
        reordered = False

        sim_count = 0
        new_order = []
        end = min(self.endrank, maxcount)
        for i in xrange(end):
            if full:
                new_order.append(i)
                continue
            hit = self._mset.get_hit(i)
            if len(tophits) == 0:
                tophits.append(hit)
                continue

            # Compare the incoming hit to the most recently accepted tophit.
            maxsim = 0.0
            for tophit in tophits[-1:]:
                sim_count += 1
                sim = ds.similarity(hit.document, tophit.document)
                if sim > maxsim:
                    maxsim = sim

            # If it's not similar to an existing hit, add to tophits.
            if maxsim < max_similarity:
                tophits.append(hit)
            else:
                nottophits.append(hit)
                reordered = True

            # Once we have enough tophits, flush both lists to new_order and
            # pass the remaining hits through unmodified.
            if len(tophits) >= count:
                for hit in tophits:
                    new_order.append(hit.rank)
                for hit in nottophits:
                    new_order.append(hit.rank)
                full = True
        if not full:
            for hit in tophits:
                new_order.append(hit.rank)
            for hit in nottophits:
                new_order.append(hit.rank)
        if end != self.endrank:
            new_order.extend(range(end, self.endrank))
        assert len(new_order) == self.endrank
        if reordered:
            self._mset_order = new_order
        else:
            assert new_order == range(self.endrank)

    def __repr__(self):
        return ("<SearchResults(startrank=%d, "
                "endrank=%d, "
                "more_matches=%s, "
                "matches_lower_bound=%d, "
                "matches_upper_bound=%d, "
                "matches_estimated=%d, "
                "estimate_is_exact=%s)>" %
                (
                 self.startrank,
                 self.endrank,
                 self.more_matches,
                 self.matches_lower_bound,
                 self.matches_upper_bound,
                 self.matches_estimated,
                 self.estimate_is_exact,
                ))

    def _get_more_matches(self):
        # This check relies on us having asked for at least one more result
        # than retrieved to be checked.
        return (self.matches_lower_bound > self.endrank)
    more_matches = property(_get_more_matches, doc=
    """Check whether there are further matches after those in this result set.

    """)

    def _get_startrank(self):
        return self._mset.get_firstitem()
    startrank = property(_get_startrank, doc=
    """Get the rank of the first item in the search results.

    This corresponds to the "startrank" parameter passed to the search() method.

    """)

    def _get_endrank(self):
        return self._mset.get_firstitem() + len(self._mset)
    endrank = property(_get_endrank, doc=
    """Get the rank of the item after the end of the search results.

    If there are sufficient results in the index, this corresponds to the
    "endrank" parameter passed to the search() method.

    """)

    def _get_lower_bound(self):
        return self._mset.get_matches_lower_bound()
    matches_lower_bound = property(_get_lower_bound, doc=
    """Get a lower bound on the total number of matching documents.

    """)

    def _get_upper_bound(self):
        return self._mset.get_matches_upper_bound()
    matches_upper_bound = property(_get_upper_bound, doc=
    """Get an upper bound on the total number of matching documents.

    """)

    def _get_human_readable_estimate(self):
        lower = self._mset.get_matches_lower_bound()
        upper = self._mset.get_matches_upper_bound()
        est = self._mset.get_matches_estimated()
        return _get_significant_digits(est, lower, upper)
    matches_human_readable_estimate = property(_get_human_readable_estimate,
                                               doc=
    """Get a human readable estimate of the number of matching documents.

    This consists of the value returned by the "matches_estimated" property,
    rounded to an appropriate number of significant digits (as determined by
    the values of the "matches_lower_bound" and "matches_upper_bound"
    properties).

    """)

    def _get_estimated(self):
        return self._mset.get_matches_estimated()
    matches_estimated = property(_get_estimated, doc=
    """Get an estimate for the total number of matching documents.

    """)

    def _estimate_is_exact(self):
        return self._mset.get_matches_lower_bound() == \
               self._mset.get_matches_upper_bound()
    estimate_is_exact = property(_estimate_is_exact, doc=
    """Check whether the estimated number of matching documents is exact.

    If this returns true, the estimate given by the `matches_estimated`
    property is guaranteed to be correct.

    If this returns false, it is possible that the actual number of matching
    documents is different from the number given by the `matches_estimated`
    property.

    """)

    def get_hit(self, index):
        """Get the hit with a given index.

        """
        if self._mset_order is None:
            msetitem = self._mset.get_hit(index)
        else:
            msetitem = self._mset.get_hit(self._mset_order[index])
        return SearchResult(msetitem, self)
    __getitem__ = get_hit

    def __iter__(self):
        """Get an iterator over the hits in the search result.

        The iterator returns the results in increasing order of rank.

        """
        return SearchResultIter(self, self._mset_order)

    def __len__(self):
        """Get the number of hits in the search result.

        Note that this is not (usually) the number of matching documents for
        the search.  If startrank is non-zero, it's not even the rank of the
        last document in the search result.  It's simply the number of hits
        stored in the search result.

        It is, however, the number of items returned by the iterator produced
        by calling iter() on this SearchResults object.

        """
        return len(self._mset)

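    # Usage sketch (illustrative only): SearchResults supports iteration,
    # indexing and len(), so hits can be walked in rank order or fetched
    # individually (`conn` and `query` are assumed to exist):
    #
    #   results = conn.search(query, 0, 10)
    #   print len(results), "hits of roughly", results.matches_human_readable_estimate
    #   first = results[0]                 # equivalent to results.get_hit(0)
    #   for result in results:
    #       print result.rank, result.id
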
    def get_top_tags(self, field, maxtags):
        """Get the most frequent tags in a given field.

         - `field` - the field to get tags for.  This must have been specified
           in the "gettags" argument of the search() call.
         - `maxtags` - the maximum number of tags to return.

        Returns a sequence of 2-item tuples, in which the first item in the
        tuple is the tag, and the second is the frequency of the tag in the
        matches seen (as an integer).

        """
        if 'tags' in _checkxapian.missing_features:
            raise _errors.SearchError("Tags unsupported with this release of xapian")
        if self._tagspy is None or field not in self._tagfields:
            raise _errors.SearchError("Field %r was not specified for getting tags" % field)
        prefix = self._conn._field_mappings.get_prefix(field)
        return self._tagspy.get_top_terms(prefix, maxtags)

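    # Usage sketch (illustrative only): the field must have been listed in
    # the "gettags" argument of the search() call; 'tag' is an assumed
    # example field:
    #
    #   results = conn.search(query, 0, 10, gettags=['tag'])
    #   for tag, frequency in results.get_top_tags('tag', 5):
    #       print tag, frequency
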
    def get_suggested_facets(self, maxfacets=5, desired_num_of_categories=7,
                             required_facets=None):
        """Get a suggested set of facets, to present to the user.

        This returns a list, in descending order of the usefulness of the
        facet, in which each item is a tuple holding:

         - fieldname of facet.
         - sequence of 2-tuples holding the suggested values or ranges for that
           field:

           For facets of type 'string', the first item in the 2-tuple will
           simply be the string supplied when the facet value was added to its
           document.  For facets of type 'float', it will be a 2-tuple, holding
           floats giving the start and end of the suggested value range.

           The second item in the 2-tuple will be the frequency of the facet
           value or range in the result set.

        If required_facets is not None, it must be a field name, or a sequence
        of field names.  Any field names mentioned in required_facets will be
        returned if there are any facet values at all in the search results for
        that field.  The facet will only be omitted if there are no facet
        values at all for the field.

        The value of maxfacets will be respected as far as possible; the
        exception is that if there are too many fields listed in
        required_facets with at least one value in the search results, extra
        facets will be returned (ie, obeying the required_facets parameter is
        considered more important than the maxfacets parameter).

        If facet_hierarchy was indicated when search() was called, and the
        query included facets, then only subfacets of those query facets and
        top-level facets will be included in the returned list.  Furthermore,
        top-level facets will only be returned if there are remaining places
        in the list after it has been filled with subfacets.  Note that
        required_facets is still respected regardless of the facet hierarchy.

        If a query type was specified when search() was called, and the query
        included facets, then facets with an association of Never to the
        query type are never returned, even if mentioned in required_facets.
        Facets with an association of Preferred are listed before others in
        the returned list.

        """
        if 'facets' in _checkxapian.missing_features:
            raise _errors.SearchError("Facets unsupported with this release of xapian")
        if self._facetspy is None:
            raise _errors.SearchError("Facet selection wasn't enabled when the search was run")
        if isinstance(required_facets, basestring):
            required_facets = [required_facets]
        scores = []
        facettypes = {}
        for field, slot, kwargslist in self._facetfields:
            type = None
            for kwargs in kwargslist:
                type = kwargs.get('type', None)
                if type is not None: break
            if type is None: type = 'string'

            if type == 'float':
                if field not in self._numeric_ranges_built:
                    self._facetspy.build_numeric_ranges(slot, desired_num_of_categories)
                    self._numeric_ranges_built[field] = None
            facettypes[field] = type
            score = self._facetspy.score_categorisation(slot, desired_num_of_categories)
            scores.append((score, field, slot))

        # Sort on whether facet is top-level ahead of score (use subfacets first),
        # and on whether facet is preferred for the query type ahead of anything else
        if self._facethierarchy:
            # Note, tuple[-2] is the value of 'field' in a scores tuple
            scores = [(tuple[-2] not in self._facethierarchy,) + tuple for tuple in scores]
        if self._facetassocs:
            preferred = _indexerconnection.IndexerConnection.FacetQueryType_Preferred
            scores = [(self._facetassocs.get(tuple[-2]) != preferred,) + tuple for tuple in scores]
        scores.sort()
        if self._facethierarchy:
            index = 1
        else:
            index = 0
        if self._facetassocs:
            index += 1
        if index > 0:
            scores = [tuple[index:] for tuple in scores]

        results = []
        required_results = []
        for score, field, slot in scores:
            # Check if the facet is required
            required = False
            if required_facets is not None:
                required = field in required_facets

            # If we've got enough facets, and the field isn't required, skip it
            if not required and len(results) + len(required_results) >= maxfacets:
                continue

            # Get the values
            values = self._facetspy.get_values_as_dict(slot)
            if field in self._numeric_ranges_built:
                if '' in values:
                    del values['']

            # Required facets must occur at least once, other facets must occur
            # at least twice.
            if required:
                if len(values) < 1:
                    continue
            else:
                if len(values) <= 1:
                    continue

            newvalues = []
            if facettypes[field] == 'float':
                # Convert numbers to python numbers, and number ranges to a
                # python tuple of two numbers.
                for value, frequency in values.iteritems():
                    if len(value) <= 9:
                        value1 = _log(_xapian.sortable_unserialise, value)
                        value2 = value1
                    else:
                        value1 = _log(_xapian.sortable_unserialise, value[:9])
                        value2 = _log(_xapian.sortable_unserialise, value[9:])
                    newvalues.append(((value1, value2), frequency))
            else:
                for value, frequency in values.iteritems():
                    newvalues.append((value, frequency))

            newvalues.sort()
            if required:
                required_results.append((score, field, newvalues))
            else:
                results.append((score, field, newvalues))

        # Throw away any excess results if we have more required_results to
        # insert.
        maxfacets = maxfacets - len(required_results)
        if maxfacets <= 0:
            results = required_results
        else:
            results = results[:maxfacets]
            results.extend(required_results)
            results.sort()

        # Throw away the scores because they're not meaningful outside this
        # algorithm.
        results = [(field, newvalues) for (score, field, newvalues) in results]
        return results

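    # Usage sketch (illustrative only): presenting suggested facets to a
    # user.  This requires the search to have been run with facet collection
    # enabled; 'price' and 'colour' stand in for real facet fields, and
    # float facets yield (start, end) tuples where string facets yield
    # plain values:
    #
    #   for field, values in results.get_suggested_facets(maxfacets=3):
    #       print field
    #       for value, frequency in values:
    #           print '  ', value, frequency
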

class SearchConnection(object):
    """A connection to the search engine for searching.

    The connection will access a view of the database.

    """
    _qp_flags_base = _xapian.QueryParser.FLAG_LOVEHATE
    _qp_flags_phrase = _xapian.QueryParser.FLAG_PHRASE
    _qp_flags_synonym = (_xapian.QueryParser.FLAG_AUTO_SYNONYMS |
                         _xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS)
    _qp_flags_bool = _xapian.QueryParser.FLAG_BOOLEAN

    _index = None

    def __init__(self, indexpath):
        """Create a new connection to the index for searching.

        There may be an arbitrary number of search connections open for a
        particular database at a given time (regardless of whether there is a
        connection for indexing open as well).

        If the database doesn't exist, an exception will be raised.

        """
        self._index = _log(_xapian.Database, indexpath)
        self._indexpath = indexpath

        # Read the actions.
        self._load_config()

        self._close_handlers = []

    def __del__(self):
        self.close()

    def append_close_handler(self, handler, userdata=None):
        """Append a callback to the list of close handlers.

        These will be called when the SearchConnection is closed.  This happens
        when the close() method is called, or when the SearchConnection object
        is deleted.  The callback will be passed two arguments: the path of the
        index which the SearchConnection was connected to, and the userdata
        supplied to this method.

        The handlers will be called in the order in which they were added.

        The handlers will be called after the connection has been closed, so
        cannot prevent it closing: their return value will be ignored.  In
        addition, they should not raise any exceptions.

        """
        self._close_handlers.append((handler, userdata))

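    # Usage sketch (illustrative only): a close handler can be used to tidy
    # up external resources tied to the index; `release_lock` is a
    # hypothetical callback:
    #
    #   def release_lock(indexpath, userdata):
    #       print "closed connection to", indexpath, "with", userdata
    #
    #   conn.append_close_handler(release_lock, userdata='mylock')
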
    def _get_sort_type(self, field):
        """Get the sort type that should be used for a given field.

        """
        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            actions = {}
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.SORT_AND_COLLAPSE:
                for kwargs in kwargslist:
                    return kwargs['type']

    def _load_config(self):
        """Load the configuration for the database.

        """
        # Note: this code is basically duplicated in the IndexerConnection
        # class.  Move it to a shared location.
        assert self._index is not None

        config_str = _log(self._index.get_metadata, '_xappy_config')
        if len(config_str) == 0:
            self._field_actions = {}
            self._field_mappings = _fieldmappings.FieldMappings()
            self._facet_hierarchy = {}
            self._facet_query_table = {}
            return

        try:
            (self._field_actions, mappings, self._facet_hierarchy, self._facet_query_table, self._next_docid) = _cPickle.loads(config_str)
        except ValueError:
            # Backwards compatibility - configuration used to lack
            # _facet_hierarchy and _facet_query_table.
            (self._field_actions, mappings, self._next_docid) = _cPickle.loads(config_str)
            self._facet_hierarchy = {}
            self._facet_query_table = {}
        self._field_mappings = _fieldmappings.FieldMappings(mappings)

    def reopen(self):
        """Reopen the connection.

        This updates the revision of the index which the connection references
        to the latest flushed revision.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        self._index.reopen()
        # Re-read the actions.
        self._load_config()

    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected to ensure that the connection is cleaned up promptly.

        No other methods may be called on the connection after this has been
        called.  (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes since
        the last call to flush may be lost.

        """
        if self._index is None:
            return

        # Remember the index path
        indexpath = self._indexpath

        # There is currently no "close()" method for xapian databases, so
        # we have to rely on the garbage collector.  Since we never copy
        # the _index property out of this class, there should be no cycles,
        # so the standard python implementation should garbage collect
        # _index straight away.  A close() method is planned to be added to
        # xapian at some point - when it is, we should call it here to make
        # the code more robust.
        self._index = None
        self._indexpath = None
        self._field_actions = None
        self._field_mappings = None

        # Call the close handlers.
        for handler, userdata in self._close_handlers:
            try:
                handler(indexpath, userdata)
            except Exception, e:
                import sys, traceback
                print >>sys.stderr, "WARNING: unhandled exception in handler called by SearchConnection.close(): %s" % traceback.format_exception_only(type(e), e)

    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed but
        not yet flushed.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return self._index.get_doccount()

    OP_AND = _xapian.Query.OP_AND
    OP_OR = _xapian.Query.OP_OR
    def query_composite(self, operator, queries):
        """Build a composite query from a list of queries.

        The queries are combined with the supplied operator, which is either
        SearchConnection.OP_AND or SearchConnection.OP_OR.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _log(_xapian.Query, operator, list(queries))

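    # Usage sketch (illustrative only): combining subqueries with OR.  The
    # field names are assumed examples:
    #
    #   q = conn.query_composite(conn.OP_OR,
    #                            [conn.query_field('author', 'richard'),
    #                             conn.query_parse('search engines')])
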
    def query_multweight(self, query, multiplier):
        """Build a query which modifies the weights of a subquery.

        This produces a query which returns the same documents as the subquery,
        and in the same order, but with the weights assigned to each document
        multiplied by the value of "multiplier".  "multiplier" may be any floating
        point value, but negative values will be clipped to 0, since Xapian
        doesn't support negative weights.

        This can be useful when producing queries to be combined with
        query_composite, because it allows the relative importance of parts of
        the query to be adjusted.

        """
        return _log(_xapian.Query, _xapian.Query.OP_SCALE_WEIGHT, query, multiplier)

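    # Usage sketch (illustrative only): doubling the weight of title
    # matches relative to body matches ('title' and 'body' are assumed
    # example fields):
    #
    #   title_q = conn.query_multweight(conn.query_field('title', 'xapian'), 2.0)
    #   q = conn.query_composite(conn.OP_OR,
    #                            [title_q, conn.query_field('body', 'xapian')])
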
    def query_filter(self, query, filter, exclude=False):
        """Filter a query with another query.

        If exclude is False (or not specified), documents will only match the
        resulting query if they match both the first and second query: the
        results of the first query are "filtered" to only include those which
        also match the second query.

        If exclude is True, documents will only match the resulting query if
        they match the first query, but not the second query: the results of
        the first query are "filtered" to only include those which do not match
        the second query.

        Documents will always be weighted according to only the first query.

        - `query`: The query to filter.
        - `filter`: The filter to apply to the query.
        - `exclude`: If True, the sense of the filter is reversed - only
          documents which do not match the second query will be returned.

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if not isinstance(filter, _xapian.Query):
            raise _errors.SearchError("Filter must be a Xapian Query object")
        if exclude:
            return _log(_xapian.Query, _xapian.Query.OP_AND_NOT, query, filter)
        else:
            return _log(_xapian.Query, _xapian.Query.OP_FILTER, query, filter)

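    # Usage sketch (illustrative only): restricting a free-text query to a
    # category without letting the category term affect the weights
    # ('category' is an assumed INDEX_EXACT field):
    #
    #   main_q = conn.query_parse('laptop')
    #   cat_q = conn.query_field('category', 'hardware')
    #   q = conn.query_filter(main_q, cat_q)                 # match both
    #   q2 = conn.query_filter(main_q, cat_q, exclude=True)  # exclude category
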
    def query_adjust(self, primary, secondary):
        """Adjust the weights of one query with a secondary query.

        Documents will be returned from the resulting query if and only if they
        match the primary query (specified by the "primary" parameter).
        However, the weights (and hence, the relevance rankings) of the
        documents will be adjusted by adding weights from the secondary query
        (specified by the "secondary" parameter).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, primary, secondary)

    def query_range(self, field, begin, end):
        """Create a query for a range search.

        This creates a query which matches only those documents which have a
        field value in the specified range.

        Begin and end must be appropriate values for the field, according to
        the 'type' parameter supplied to the SORTABLE action for the field.

        The begin and end values are both inclusive - any documents with a
        value equal to begin or end will be returned (unless end is less than
        begin, in which case no documents will be returned).

        Begin or end may be set to None in order to create an open-ended
        range.  (They may also both be set to None, which will generate a query
        which matches all documents containing any value for the field.)

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")

        if begin is None and end is None:
            # Return a "match everything" query
            return _log(_xapian.Query, '')

        try:
            slot = self._field_mappings.get_slot(field, 'collsort')
        except KeyError:
            # Return a "match nothing" query
            return _log(_xapian.Query)

        sorttype = self._get_sort_type(field)
        marshaller = SortableMarshaller(False)
        fn = marshaller.get_marshall_function(field, sorttype)

        if begin is not None:
            begin = fn(field, begin)
        if end is not None:
            end = fn(field, end)

        if begin is None:
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_LE, slot, end)

        if end is None:
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_GE, slot, begin)

        return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)

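    # Usage sketch (illustrative only): range queries over a sortable field
    # ('price' is an assumed float-sortable field); None leaves the
    # corresponding end of the range open:
    #
    #   q = conn.query_range('price', 10.0, 99.99)      # 10.0 <= price <= 99.99
    #   q_cheap = conn.query_range('price', None, 10.0) # price <= 10.0
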
    def query_facet(self, field, val):
        """Create a query for a facet value.

        This creates a query which matches only those documents which have a
        facet value in the specified range.

        For a numeric range facet, val should be a tuple holding the start and
        end of the range, or a comma separated string holding two floating
        point values.  For other facets, val should be the value to look
        for.

        The start and end values are both inclusive - any documents with a
        value equal to start or end will be returned (unless end is less than
        start, in which case no documents will be returned).

        """
        if self._index is None:
            raise _errors.SearchError("SearchConnection has been closed")
        if 'facets' in _checkxapian.missing_features:
            raise _errors.SearchError("Facets unsupported with this release of xapian")

        try:
            actions = self._field_actions[field]._actions
        except KeyError:
            actions = {}
        facettype = None
        for action, kwargslist in actions.iteritems():
            if action == FieldActions.FACET:
                for kwargs in kwargslist:
                    facettype = kwargs.get('type', None)
                    if facettype is not None:
                        break
            if facettype is not None:
                break

        if facettype == 'float':
            if isinstance(val, basestring):
                val = [float(v) for v in val.split(',', 2)]
            assert(len(val) == 2)
            try:
                slot = self._field_mappings.get_slot(field, 'facet')
            except KeyError:
                return _log(_xapian.Query)
            # FIXME - check that sorttype == self._get_sort_type(field)
            sorttype = 'float'
            marshaller = SortableMarshaller(False)
            fn = marshaller.get_marshall_function(field, sorttype)
            begin = fn(field, val[0])
            end = fn(field, val[1])
            return _log(_xapian.Query, _xapian.Query.OP_VALUE_RANGE, slot, begin, end)
        else:
            assert(facettype == 'string' or facettype is None)
            prefix = self._field_mappings.get_prefix(field)
            return _log(_xapian.Query, prefix + val.lower())

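    # Usage sketch (illustrative only): facet queries are typically used to
    # "drill down" after get_suggested_facets(); 'colour' (string) and
    # 'price' (float) are assumed facet fields:
    #
    #   q_str = conn.query_facet('colour', 'blue')
    #   q_num = conn.query_facet('price', (10.0, 20.0))  # or the string "10,20"
    #   q = conn.query_filter(main_q, q_str)
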
01047 
01048     def _prepare_queryparser(self, allow, deny, default_op, default_allow,
01049                              default_deny):
01050         """Prepare (and return) a query parser using the specified fields and
01051         operator.
01052 
01053         """
01054         if self._index is None:
01055             raise _errors.SearchError("SearchConnection has been closed")
01056 
01057         if isinstance(allow, basestring):
01058             allow = (allow, )
01059         if isinstance(deny, basestring):
01060             deny = (deny, )
01061         if allow is not None and len(allow) == 0:
01062             allow = None
01063         if deny is not None and len(deny) == 0:
01064             deny = None
01065         if allow is not None and deny is not None:
01066             raise _errors.SearchError("Cannot specify both `allow` and `deny` "
01067                                       "(got %r and %r)" % (allow, deny))
01068 
01069         if isinstance(default_allow, basestring):
01070             default_allow = (default_allow, )
01071         if isinstance(default_deny, basestring):
01072             default_deny = (default_deny, )
01073         if default_allow is not None and len(default_allow) == 0:
01074             default_allow = None
01075         if default_deny is not None and len(default_deny) == 0:
01076             default_deny = None
01077         if default_allow is not None and default_deny is not None:
01078             raise _errors.SearchError("Cannot specify both `default_allow` and `default_deny` "
01079                                       "(got %r and %r)" % (default_allow, default_deny))
01080 
01081         qp = _log(_xapian.QueryParser)
01082         qp.set_database(self._index)
01083         qp.set_default_op(default_op)
01084 
01085         if allow is None:
01086             allow = [key for key in self._field_actions]
01087         if deny is not None:
01088             allow = [key for key in allow if key not in deny]
01089 
01090         for field in allow:
01091             try:
01092                 actions = self._field_actions[field]._actions
01093             except KeyError:
01094                 actions = {}
01095             for action, kwargslist in actions.iteritems():
01096                 if action == FieldActions.INDEX_EXACT:
01097                     # FIXME - need patched version of xapian to add exact prefixes
01098                     #qp.add_exact_prefix(field, self._field_mappings.get_prefix(field))
01099                     qp.add_prefix(field, self._field_mappings.get_prefix(field))
01100                 if action == FieldActions.INDEX_FREETEXT:
01101                     allow_field_specific = True
01102                     for kwargs in kwargslist:
01103                         allow_field_specific = allow_field_specific or kwargs.get('allow_field_specific', True)
01104                     if not allow_field_specific:
01105                         continue
01106                     qp.add_prefix(field, self._field_mappings.get_prefix(field))
01107                     for kwargs in kwargslist:
01108                         try:
01109                             lang = kwargs['language']
01110                             my_stemmer = _log(_xapian.Stem, lang)
01111                             qp.my_stemmer = my_stemmer
01112                             qp.set_stemmer(my_stemmer)
01113                             qp.set_stemming_strategy(qp.STEM_SOME)
01114                         except KeyError:
01115                             pass
01116 
01117         if default_allow is not None or default_deny is not None:
01118             if default_allow is None:
01119                 default_allow = [key for key in self._field_actions]
01120             if default_deny is not None:
01121                 default_allow = [key for key in default_allow if key not in default_deny]
01122             for field in default_allow:
01123                 try:
01124                     actions = self._field_actions[field]._actions
01125                 except KeyError:
01126                     actions = {}
01127                 for action, kwargslist in actions.iteritems():
01128                     if action == FieldActions.INDEX_FREETEXT:
01129                         qp.add_prefix('', self._field_mappings.get_prefix(field))
01130                         # FIXME - set stemming options for the default prefix
01131 
01132         return qp
01133 
01134     def _query_parse_with_prefix(self, qp, string, flags, prefix):
01135         """Parse a query, with an optional prefix.
01136 
01137         """
01138         if prefix is None:
01139             return qp.parse_query(string, flags)
01140         else:
01141             return qp.parse_query(string, flags, prefix)
01142 
01143     def _query_parse_with_fallback(self, qp, string, prefix=None):
01144         """Parse a query with various flags.
01145         
01146         If the initial boolean pass fails, fall back to not using boolean
01147         operators.
01148 
01149         """
01150         try:
01151             q1 = self._query_parse_with_prefix(qp, string,
01152                                                self._qp_flags_base |
01153                                                self._qp_flags_phrase |
01154                                                self._qp_flags_synonym |
01155                                                self._qp_flags_bool,
01156                                                prefix)
01157         except _xapian.QueryParserError, e:
01158             # If we got a parse error, retry without boolean operators (since
01159             # these are the usual cause of the parse error).
01160             q1 = self._query_parse_with_prefix(qp, string,
01161                                                self._qp_flags_base |
01162                                                self._qp_flags_phrase |
01163                                                self._qp_flags_synonym,
01164                                                prefix)
01165 
01166         qp.set_stemming_strategy(qp.STEM_NONE)
01167         try:
01168             q2 = self._query_parse_with_prefix(qp, string,
01169                                                self._qp_flags_base |
01170                                                self._qp_flags_bool,
01171                                                prefix)
01172         except _xapian.QueryParserError, e:
01173             # If we got a parse error, retry without boolean operators (since
01174             # these are the usual cause of the parse error).
01175             q2 = self._query_parse_with_prefix(qp, string,
01176                                                self._qp_flags_base,
01177                                                prefix)
01178 
01179         return _log(_xapian.Query, _xapian.Query.OP_AND_MAYBE, q1, q2)
01180 
01181     def query_parse(self, string, allow=None, deny=None, default_op=OP_AND,
01182                     default_allow=None, default_deny=None):
01183         """Parse a query string.
01184 
01185         This is intended for parsing queries entered by a user.  If you wish to
01186         combine structured queries, it is generally better to use the other
01187         query building methods, such as `query_composite` (though you may wish
01188         to create parts of the query to combine with such methods with this
01189         method).
01190 
01191         The string passed to this method can have various operators in it.  In
01192         particular, it may contain field specifiers (ie, field names, followed
01193         by a colon, followed by some text to search for in that field).  For
01194         example, if "author" is a field in the database, the search string
01195         could contain "author:richard", and this would be interpreted as
01196         "search for richard in the author field".  By default, any fields in
01197         the database which are indexed with INDEX_EXACT or INDEX_FREETEXT will
01198         be available for field specific searching in this way - however, this
01199         can be modified using the "allow" or "deny" parameters, and also by the
01200         allow_field_specific tag on INDEX_FREETEXT fields.
01201 
01202         Any text which isn't prefixed by a field specifier is used to search
01203         the "default set" of fields.  By default, this is the full set of
01204         fields in the database which are indexed with INDEX_FREETEXT and for
01205         which the search_by_default flag set (ie, if the text is found in any
01206         of those fields, the query will match).  However, this may be modified
01207         with the "default_allow" and "default_deny" parameters.  (Note that
01208         fields which are indexed with INDEX_EXACT aren't allowed to be used in
01209         the default list of fields.)
01210 
01211         - `string`: The string to parse.
01212         - `allow`: A list of fields to allow in the query.
01213         - `deny`: A list of fields not to allow in the query.
01214         - `default_op`: The default operator to combine query terms with.
01215         - `default_allow`: A list of fields to search for by default.
01216         - `default_deny`: A list of fields not to search for by default.
01217 
01218         Only one of `allow` and `deny` may be specified.
01219 
01220         Only one of `default_allow` and `default_deny` may be specified.
01221 
01222         If any of the entries in `allow` are not present in the configuration
01223         for the database, or are not specified for indexing (either as
01224         INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
01225         entries in `deny` are not present in the configuration for the
01226         database, they will be ignored.
01227 
01228         Returns a Query object, which may be passed to the search() method, or
01229         combined with other queries.
01230 
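        For example, a minimal sketch of typical usage (illustrative only;
        it assumes an existing database at 'db' whose "author" field was
        indexed with INDEX_FREETEXT)::

            conn = SearchConnection('db')
            query = conn.query_parse('author:richard toolkit')
            results = conn.search(query, 0, 10)
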
01231         """
01232         qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
01233                                        default_deny)
01234         return self._query_parse_with_fallback(qp, string)
01235 
01236     def query_field(self, field, value, default_op=OP_AND):
01237         """A query for a single field.
01238 
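        A sketch of usage (illustrative; assumes `conn` is an open
        SearchConnection, and that "category" was indexed with
        INDEX_EXACT)::

            query = conn.query_field('category', 'fiction')
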
01239         """
01240         if self._index is None:
01241             raise _errors.SearchError("SearchConnection has been closed")
01242         try:
01243             actions = self._field_actions[field]._actions
01244         except KeyError:
01245             actions = {}
01246 
01247         # Check the field type, and stem / split as appropriate.
01248         for action, kwargslist in actions.iteritems():
01249             if action in (FieldActions.INDEX_EXACT,
01250                           FieldActions.TAG,
01251                           FieldActions.FACET,):
01252                 prefix = self._field_mappings.get_prefix(field)
01253                 if len(value) > 0:
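                    # Xapian convention: if the value starts with a capital
                    # letter, a ':' is inserted between the prefix and the
                    # value, so the end of the prefix remains unambiguous.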
01254                     chval = ord(value[0])
01255                     if chval >= ord('A') and chval <= ord('Z'):
01256                         prefix = prefix + ':'
01257                 return _log(_xapian.Query, prefix + value)
01258             if action == FieldActions.INDEX_FREETEXT:
01259                 qp = _log(_xapian.QueryParser)
01260                 qp.set_default_op(default_op)
01261                 prefix = self._field_mappings.get_prefix(field)
01262                 for kwargs in kwargslist:
01263                     try:
01264                         lang = kwargs['language']
01265                         qp.set_stemmer(_log(_xapian.Stem, lang))
01266                         qp.set_stemming_strategy(qp.STEM_SOME)
01267                     except KeyError:
01268                         pass
01269                 return self._query_parse_with_fallback(qp, value, prefix)
01270 
01271         return _log(_xapian.Query)
01272 
01273     def query_similar(self, ids, allow=None, deny=None, simterms=10):
01274         """Get a query which returns documents which are similar to others.
01275 
01276         The list of document IDs to base the similarity search on is given in
01277         `ids`.  This should be an iterable of strings.  If
01278         any of the supplied IDs cannot be found in the database, they will be
01279         ignored.  (If no IDs can be found in the database, the resulting query
01280         will not match any documents.)
01281 
01282         By default, all fields which have been indexed for freetext searching
01283         will be used for the similarity calculation.  The list of fields used
01284         for this can be customised using the `allow` and `deny` parameters
01285         (only one of which may be specified):
01286 
01287         - `allow`: A list of fields to base the similarity calculation on.
01288         - `deny`: A list of fields not to base the similarity calculation on.
01289         - `simterms`: Number of terms to use for the similarity calculation.
01290 
01291         For convenience, any of `ids`, `allow`, or `deny` may be strings, which
01292         will be treated the same as a list of length 1.
01293 
01294         Regardless of the setting of `allow` and `deny`, only fields which have
01295         been indexed for freetext searching will be used for the similarity
01296         measure - all other fields will always be ignored for this purpose.
01297 
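        A sketch of usage (illustrative; assumes `conn` is an open
        SearchConnection, and 'doc1' and 'doc2' are existing document IDs)::

            query = conn.query_similar(['doc1', 'doc2'], simterms=20)
            results = conn.search(query, 0, 10)
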
01298         """
01299         eterms, prefixes = self._get_eterms(ids, allow, deny, simterms)
01300 
01301         # Use the "elite set" operator, which chooses the terms with the
01302         # highest query weight to use.
01303         q = _log(_xapian.Query, _xapian.Query.OP_ELITE_SET, eterms, simterms)
01304         return q
01305 
01306     def significant_terms(self, ids, maxterms=10, allow=None, deny=None):
01307         """Get a set of "significant" terms for a document, or documents.
01308 
01309         This has a similar interface to query_similar(): it takes a list of
01310         ids, and an optional specification of a set of fields to consider.
01311         Instead of returning a query, it returns a list of terms from the
01312         document (or documents), which appear "significant".  Roughly
01313         speaking, "significant" here means that the terms occur more
01314         frequently in the specified documents than in the rest of the corpus.
01315 
01316         The list is in decreasing order of "significance".
01317 
01318         By default, all terms related to fields which have been indexed for
01319         freetext searching will be considered for the list of significant
01320         terms.  The list of fields used for this can be customised using the
01321         `allow` and `deny` parameters (only one of which may be specified):
01322 
01323         - `allow`: A list of fields to consider.
01324         - `deny`: A list of fields not to consider.
01325 
01326         For convenience, any of `ids`, `allow`, or `deny` may be strings, which
01327         will be treated the same as a list of length 1.
01328 
01329         Regardless of the setting of `allow` and `deny`, only fields which have
01330         been indexed for freetext searching will be considered - all other
01331         fields will always be ignored for this purpose.
01332 
01333         The maximum number of terms to return may be specified by the
01334         `maxterms` parameter.
01335 
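        A sketch of usage (illustrative; assumes `conn` is an open
        SearchConnection and 'doc1' is an existing document ID)::

            for field, value in conn.significant_terms('doc1', maxterms=5):
                print field, value
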
01336         """
01337         eterms, prefixes = self._get_eterms(ids, allow, deny, maxterms)
01338         terms = []
01339         for term in eterms:
01340             pos = 0
01341             for char in term:
01342                 if not char.isupper():
01343                     break
01344                 pos += 1
01345             field = prefixes[term[:pos]]
01346             value = term[pos:]
01347             terms.append((field, value))
01348         return terms
01349 
01350     def _get_eterms(self, ids, allow, deny, simterms):
01351         """Get a set of terms for an expand
01352 
01353         """
01354         if self._index is None:
01355             raise _errors.SearchError("SearchConnection has been closed")
01356         if allow is not None and deny is not None:
01357             raise _errors.SearchError("Cannot specify both `allow` and `deny`")
01358 
01359         if isinstance(ids, basestring):
01360             ids = (ids, )
01361         if isinstance(allow, basestring):
01362             allow = (allow, )
01363         if isinstance(deny, basestring):
01364             deny = (deny, )
01365 
01366         # Set "allow" to contain a list of all the fields to use.
01367         if allow is None:
01368             allow = [key for key in self._field_actions]
01369         if deny is not None:
01370             allow = [key for key in allow if key not in deny]
01371 
01372         # Set "prefixes" to contain a list of all the prefixes to use.
01373         prefixes = {}
01374         for field in allow:
01375             try:
01376                 actions = self._field_actions[field]._actions
01377             except KeyError:
01378                 actions = {}
01379             for action, kwargslist in actions.iteritems():
01380                 if action == FieldActions.INDEX_FREETEXT:
01381                     prefixes[self._field_mappings.get_prefix(field)] = field
01382 
01383         # Repeat the expand until we don't get a DatabaseModifiedError
01384         while True:
01385             try:
01386                 eterms = self._perform_expand(ids, prefixes, simterms)
01387                 break
01388             except _xapian.DatabaseModifiedError, e:
01389                 self.reopen()
01390         return eterms, prefixes
01391 
01392     class ExpandDecider(_xapian.ExpandDecider):
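        """An expand decider which accepts only terms whose (upper case)
        prefix is one of a given set, used to restrict the expand
        operation to terms from the chosen freetext fields.

        """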
01393         def __init__(self, prefixes):
01394             _xapian.ExpandDecider.__init__(self)
01395             self._prefixes = prefixes
01396 
01397         def __call__(self, term):
01398             pos = 0
01399             for char in term:
01400                 if not char.isupper():
01401                     break
01402                 pos += 1
01403             if term[:pos] in self._prefixes:
01404                 return True
01405             return False
01406 
01407     def _perform_expand(self, ids, prefixes, simterms):
01408         """Perform an expand operation to get the terms for a similarity
01409         search, given a set of ids (and a set of prefixes to restrict the
01410         similarity operation to).
01411 
01412         """
01413         # Set idquery to be a query which returns the documents listed in
01414         # "ids".
01415         idquery = _log(_xapian.Query, _xapian.Query.OP_OR, ['Q' + id for id in ids])
01416 
01417         enq = _log(_xapian.Enquire, self._index)
01418         enq.set_query(idquery)
01419         rset = _log(_xapian.RSet)
01420         for id in ids:
01421             pl = self._index.postlist('Q' + id)
01422             try:
01423                 xapid = pl.next()
01424                 rset.add_document(xapid.docid)
01425             except StopIteration:
01426                 pass
01427 
01428         expanddecider = _log(self.ExpandDecider, prefixes)
01429         eset = enq.get_eset(simterms, rset, 0, 1.0, expanddecider)
01430         return [term.term for term in eset]
01431 
01432     def query_all(self):
01433         """A query which matches all the documents in the database.
01434 
01435         """
01436         return _log(_xapian.Query, '')
01437 
01438     def query_none(self):
01439         """A query which matches no documents in the database.
01440 
01441         This may be useful as a placeholder in various situations.
01442 
01443         """
01444         return _log(_xapian.Query)
01445 
01446     def spell_correct(self, querystr, allow=None, deny=None, default_op=OP_AND,
01447                       default_allow=None, default_deny=None):
01448         """Correct a query spelling.
01449 
01450         This returns a version of the query string `querystr` with any
01451         misspelt words corrected.
01452 
01453         - `allow`: A list of fields to allow in the query.
01454         - `deny`: A list of fields not to allow in the query.
01455         - `default_op`: The default operator to combine query terms with.
01456         - `default_allow`: A list of fields to search for by default.
01457         - `default_deny`: A list of fields not to search for by default.
01458 
01459         Only one of `allow` and `deny` may be specified.
01460 
01461         Only one of `default_allow` and `default_deny` may be specified.
01462 
01463         If any of the entries in `allow` are not present in the configuration
01464         for the database, or are not specified for indexing (either as
01465         INDEX_EXACT or INDEX_FREETEXT), they will be ignored.  If any of the
01466         entries in `deny` are not present in the configuration for the
01467         database, they will be ignored.
01468 
01469         Note that it is possible that the resulting spell-corrected query will
01470         still match no documents - the caller should usually check that some
01471         documents are matched by the corrected query before suggesting it to
01472         users.
01473 
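        A sketch of usage (illustrative; assumes `conn` is an open
        SearchConnection)::

            corrected = conn.spell_correct('beter serch')
            if corrected != 'beter serch':
                pass  # e.g. offer a "Did you mean ...?" suggestion
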
01474         """
01475         qp = self._prepare_queryparser(allow, deny, default_op, default_allow,
01476                                        default_deny)
01477         try:
01478             qp.parse_query(querystr,
01479                            self._qp_flags_base |
01480                            self._qp_flags_phrase |
01481                            self._qp_flags_synonym |
01482                            self._qp_flags_bool |
01483                            qp.FLAG_SPELLING_CORRECTION)
01484         except _xapian.QueryParserError:
01485             qp.parse_query(querystr,
01486                            self._qp_flags_base |
01487                            self._qp_flags_phrase |
01488                            self._qp_flags_synonym |
01489                            qp.FLAG_SPELLING_CORRECTION)
01490         corrected = qp.get_corrected_query_string()
01491         if len(corrected) == 0:
01492             if isinstance(querystr, unicode):
01493                 # Encode as UTF-8 for consistency - this happens automatically
01494                 # to values passed to Xapian.
01495                 return querystr.encode('utf-8')
01496             return querystr
01497         return corrected
01498 
01499     def can_collapse_on(self, field):
01500         """Check if this database supports collapsing on a specified field.
01501 
01502         """
01503         if self._index is None:
01504             raise _errors.SearchError("SearchConnection has been closed")
01505         try:
01506             self._field_mappings.get_slot(field, 'collsort')
01507         except KeyError:
01508             return False
01509         return True
01510 
01511     def can_sort_on(self, field):
01512         """Check if this database supports sorting on a specified field.
01513 
01514         """
01515         if self._index is None:
01516             raise _errors.SearchError("SearchConnection has been closed")
01517         try:
01518             self._field_mappings.get_slot(field, 'collsort')
01519         except KeyError:
01520             return False
01521         return True
01522         
01523     def _get_prefix_from_term(self, term):
01524         """Get the prefix of a term.
01525    
01526         Prefixes are any initial capital letters, with the exception that R always
01527         ends a prefix, even if followed by capital letters.
01528         
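        For example, the prefix of 'XAfoo' is 'XA', while the prefix of
        'XARFoo' is 'XAR': the 'R' ends the prefix even though a capital
        letter follows it.
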
01529         """
01530         for p in xrange(len(term)):
01531             if term[p].islower():
01532                 return term[:p]
01533             elif term[p] == 'R':
01534                 return term[:p+1]
01535         return term
01536 
01537     def _facet_query_never(self, facet, query_type):
01538         """Check if a facet must never be returned by a particular query type.
01539 
01540         Returns True if the facet must never be returned.
01541 
01542         Returns False if the facet may be returned - either because there is no
01543         entry for the query type, or because the entry is not
01544         FacetQueryType_Never.
01545 
01546         """
01547         if query_type is None:
01548             return False
01549         if query_type not in self._facet_query_table:
01550             return False
01551         if facet not in self._facet_query_table[query_type]:
01552             return False
01553         return self._facet_query_table[query_type][facet] == _indexerconnection.IndexerConnection.FacetQueryType_Never
01554 
01555     def search(self, query, startrank, endrank,
01556                checkatleast=0, sortby=None, collapse=None,
01557                gettags=None,
01558                getfacets=None, allowfacets=None, denyfacets=None, usesubfacets=None,
01559                percentcutoff=None, weightcutoff=None,
01560                query_type=None):
01561         """Perform a search, for documents matching a query.
01562 
01563         - `query` is the query to perform.
01564         - `startrank` is the rank of the start of the range of matching
01565           documents to return (ie, the result with this rank will be returned).
01566           Ranks start at 0, which represents the "best" matching document.
01567         - `endrank` is the rank at the end of the range of matching documents
01568           to return.  This is exclusive, so the result with this rank will not
01569           be returned.
01570         - `checkatleast` is the minimum number of results to check for: the
01571           estimate of the total number of matches will always be exact if
01572           the number of matches is less than `checkatleast`.  A value of ``-1``
01573           can be specified for the checkatleast parameter - this has the
01574           special meaning of "check all matches", and is equivalent to passing
01575           the result of get_doccount().
01576         - `sortby` is the name of a field to sort by.  It may be preceded by a
01577           '+' or a '-' to indicate ascending or descending order
01578           (respectively).  If the first character is neither '+' nor '-', the
01579           sort will be in ascending order.
01580         - `collapse` is the name of a field to collapse the result documents
01581           on.  If this is specified, there will be at most one result in the
01582           result set for each value of the field.
01583         - `gettags` is the name of a field to count tag occurrences in, or a
01584           list of fields to do so.
01585         - `getfacets` is a boolean - if True, the matching documents will be
01586           examined to build up a list of the facet values contained in them.
01587         - `allowfacets` is a list of the fieldnames of facets to consider.
01588         - `denyfacets` is a list of fieldnames of facets which will not be
01589           considered.
01590         - `usesubfacets` is a boolean - if True, only top-level facets and
01591           subfacets of facets appearing in the query are considered (taking
01592           precedence over `allowfacets` and `denyfacets`).
01593         - `percentcutoff` is the minimum percentage a result must have to be
01594           returned.
01595         - `weightcutoff` is the minimum weight a result must have to be
01596           returned.
01597         - `query_type` is a value indicating the type of query being
01598           performed. If not None, the value is used to influence which facets
01599           are returned by the get_suggested_facets() function. If the
01600           value of `getfacets` is False, it has no effect.
01601 
01602         If neither 'allowfacets' nor 'denyfacets' is specified, all fields
01603         holding facets will be considered (but see 'usesubfacets').
01604 
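        For example, a typical paginated search (a sketch; it assumes
        `query` was built with one of the query_* methods, and that a
        "price" field was indexed for sorting)::

            results = conn.search(query, 0, 10, sortby='-price')
            for result in results:
                print result.rank, result.id
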
01605         """
01606         if self._index is None:
01607             raise _errors.SearchError("SearchConnection has been closed")
01608         if 'facets' in _checkxapian.missing_features:
01609             if getfacets is not None or \
01610                allowfacets is not None or \
01611                denyfacets is not None or \
01612                usesubfacets is not None or \
01613                query_type is not None:
01614                 raise errors.SearchError("Facets unsupported with this release of xapian")
01615         if 'tags' in _checkxapian.missing_features:
01616             if gettags is not None:
01617                 raise errors.SearchError("Tags unsupported with this release of xapian")
01618         if checkatleast == -1:
01619             checkatleast = self._index.get_doccount()
01620 
01621         enq = _log(_xapian.Enquire, self._index)
01622         enq.set_query(query)
01623 
01624         if sortby is not None:
01625             asc = True
01626             if sortby[0] == '-':
01627                 asc = False
01628                 sortby = sortby[1:]
01629             elif sortby[0] == '+':
01630                 sortby = sortby[1:]
01631 
01632             try:
01633                 slotnum = self._field_mappings.get_slot(sortby, 'collsort')
01634             except KeyError:
01635                 raise _errors.SearchError("Field %r was not indexed for sorting" % sortby)
01636 
01637             # Note: we invert the "asc" parameter, because xapian treats
01638             # "ascending" as meaning "higher values are better"; in other
01639             # words, it considers "ascending" to mean return results in
01640             # descending order.
01641             enq.set_sort_by_value_then_relevance(slotnum, not asc)
01642 
01643         if collapse is not None:
01644             try:
01645                 slotnum = self._field_mappings.get_slot(collapse, 'collsort')
01646             except KeyError:
01647                 raise _errors.SearchError("Field %r was not indexed for collapsing" % collapse)
01648             enq.set_collapse_key(slotnum)
01649 
01650         maxitems = max(endrank - startrank, 0)
01651         # Always check for at least one more result, so we can report whether
01652         # there are more matches.
01653         checkatleast = max(checkatleast, endrank + 1)
01654 
01655         # Build the matchspy.
01656         matchspies = []
01657 
01658         # First, add a matchspy for any gettags fields
01659         if isinstance(gettags, basestring):
01660             if len(gettags) != 0:
01661                 gettags = [gettags]
01662         tagspy = None
01663         if gettags is not None and len(gettags) != 0:
01664             tagspy = _log(_xapian.TermCountMatchSpy)
01665             for field in gettags:
01666                 try:
01667                     prefix = self._field_mappings.get_prefix(field)
01668                     tagspy.add_prefix(prefix)
01669                 except KeyError:
01670                     raise _errors.SearchError("Field %r was not indexed for tagging" % field)
01671             matchspies.append(tagspy)
01672 
01673 
01674         # Next, add a matchspy for facet selection.
01675         facetspy = None
01676         facetfields = []
01677         if getfacets:
01678             if allowfacets is not None and denyfacets is not None:
01679                 raise _errors.SearchError("Cannot specify both `allowfacets` and `denyfacets`")
01680             if allowfacets is None:
01681                 allowfacets = [key for key in self._field_actions]
01682             if denyfacets is not None:
01683                 allowfacets = [key for key in allowfacets if key not in denyfacets]
01684 
01685             # include None in queryfacets so a top-level facet will
01686             # satisfy self._facet_hierarchy.get(field) in queryfacets
01687             # (i.e. always include top-level facets)
01688             queryfacets = set([None])
01689             if usesubfacets:
01690                 # add facets used in the query to queryfacets
01691                 termsiter = query.get_terms_begin()
01692                 termsend = query.get_terms_end()
01693                 while termsiter != termsend:
01694                     prefix = self._get_prefix_from_term(termsiter.get_term())
01695                     field = self._field_mappings.get_fieldname_from_prefix(prefix)
01696                     if field and FieldActions.FACET in self._field_actions[field]._actions:
01697                         queryfacets.add(field)
01698                     termsiter.next()
01699 
01700             for field in allowfacets:
01701                 try:
01702                     actions = self._field_actions[field]._actions
01703                 except KeyError:
01704                     actions = {}
01705                 for action, kwargslist in actions.iteritems():
01706                     if action == FieldActions.FACET:
01707                         # filter out non-top-level facets that aren't subfacets
01708                         # of a facet in the query
01709                         if usesubfacets and self._facet_hierarchy.get(field) not in queryfacets:
01710                             continue
01711                         # filter out facets that should never be returned for the query type
01712                         if self._facet_query_never(field, query_type):
01713                             continue
01714                         slot = self._field_mappings.get_slot(field, 'facet')
01715                         if facetspy is None:
01716                             facetspy = _log(_xapian.CategorySelectMatchSpy)
01717                         facettype = None
01718                         for kwargs in kwargslist:
01719                             facettype = kwargs.get('type', None)
01720                             if facettype is not None:
01721                                 break
01722                         if facettype is None or facettype == 'string':
01723                             facetspy.add_slot(slot, True)
01724                         else:
01725                             facetspy.add_slot(slot)
01726                         facetfields.append((field, slot, kwargslist))
01727 
01728             if facetspy is None:
01729                 # Set facetspy to False, to distinguish from no facet
01730                 # calculation being performed.  (This will prevent an
01731                 # error being thrown when the list of suggested facets is
01732                 # requested - instead, an empty list will be returned.)
01733                 facetspy = False
01734             else:
01735                 matchspies.append(facetspy)
01736 
01737 
01738         # Finally, build a single matchspy to pass to get_mset().
01739         if len(matchspies) == 0:
01740             matchspy = None
01741         elif len(matchspies) == 1:
01742             matchspy = matchspies[0]
01743         else:
01744             matchspy = _log(_xapian.MultipleMatchDecider)
01745             for spy in matchspies:
01746                 matchspy.append(spy)
01747 
01748         enq.set_docid_order(enq.DONT_CARE)
01749 
01750         # Set percentage and weight cutoffs
01751         if percentcutoff is not None or weightcutoff is not None:
01752             if percentcutoff is None:
01753                 percentcutoff = 0
01754             if weightcutoff is None:
01755                 weightcutoff = 0
01756             enq.set_cutoff(percentcutoff, weightcutoff)
01757 
01758         # Repeat the search until we don't get a DatabaseModifiedError
01759         while True:
01760             try:
01761                 if matchspy is None:
01762                     mset = enq.get_mset(startrank, maxitems, checkatleast)
01763                 else:
01764                     mset = enq.get_mset(startrank, maxitems, checkatleast,
01765                                         None, None, matchspy)
01766                 break
01767             except _xapian.DatabaseModifiedError, e:
01768                 self.reopen()
01769         facet_hierarchy = None
01770         if usesubfacets:
01771             facet_hierarchy = self._facet_hierarchy
01772             
01773         return SearchResults(self, enq, query, mset, self._field_mappings,
01774                              tagspy, gettags, facetspy, facetfields,
01775                              facet_hierarchy,
01776                              self._facet_query_table.get(query_type))
01777 
01778     def iterids(self):
01779         """Get an iterator which returns all the ids in the database.
01780 
01781         The unique IDs are currently returned in binary lexicographical sort
01782         order, but this should not be relied on.
01783 
01784         Note that the iterator returned by this method may raise a
01785         xapian.DatabaseModifiedError exception if modifications are committed
01786         to the database while the iteration is in progress.  If this happens,
01787         the search connection must be reopened (by calling reopen) and the
01788         iteration restarted.
01789 
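        A sketch of the restart pattern described above (assumes the
        caller has done "import xapian")::

            while True:
                try:
                    for id in conn.iterids():
                        pass  # process each id here
                    break
                except xapian.DatabaseModifiedError:
                    conn.reopen()
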
01790         """
01791         if self._index is None:
01792             raise _errors.SearchError("SearchConnection has been closed")
01793         return _indexerconnection.PrefixedTermIter('Q', self._index.allterms())
01794 
01795     def get_document(self, id):
01796         """Get the document with the specified unique ID.
01797 
01798         Raises a KeyError if there is no such document.  Otherwise, it returns
01799         a ProcessedDocument.
01800 
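        A sketch of usage (illustrative; 'doc1' is an assumed unique ID)::

            try:
                doc = conn.get_document('doc1')
            except KeyError:
                doc = None  # no document with that ID
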
01801         """
01802         if self._index is None:
01803             raise _errors.SearchError("SearchConnection has been closed")
01804         while True:
01805             try:
01806                 postlist = self._index.postlist('Q' + id)
01807                 try:
01808                     plitem = postlist.next()
01809                 except StopIteration:
01810                     # Unique ID not found
01811                     raise KeyError('Unique ID %r not found' % id)
01812                 try:
01813                     postlist.next()
01814                     raise _errors.IndexerError("Multiple documents " #pragma: no cover
01815                                                "found with same unique ID")
01816                 except StopIteration:
01817                     # Only one instance of the unique ID found, as it should be.
01818                     pass
01819 
01820                 result = ProcessedDocument(self._field_mappings)
01821                 result.id = id
01822                 result._doc = self._index.get_document(plitem.docid)
01823                 return result
01824             except _xapian.DatabaseModifiedError, e:
01825                 self.reopen()
01826 
01827     def iter_synonyms(self, prefix=""):
01828         """Get an iterator over the synonyms.
01829 
01830          - `prefix`: if specified, only synonym keys with this prefix will be
01831            returned.
01832 
01833         The iterator returns 2-tuples, in which the first item is the key (ie,
01834         a 2-tuple holding the term or terms which will be synonym expanded,
01835         followed by the fieldname specified (or None if no fieldname)), and the
01836         second item is a tuple of strings holding the synonyms for the first
01837         item.
01838 
01839         These return values are suitable for the dict() builtin, so you can
01840         write things like:
01841 
01842          >>> conn = _indexerconnection.IndexerConnection('foo')
01843          >>> conn.add_synonym('foo', 'bar')
01844          >>> conn.add_synonym('foo bar', 'baz')
01845          >>> conn.add_synonym('foo bar', 'foo baz')
01846          >>> conn.flush()
01847          >>> conn = SearchConnection('foo')
01848          >>> dict(conn.iter_synonyms())
01849          {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}
01850 
01851         """
01852         if self._index is None:
01853             raise _errors.SearchError("SearchConnection has been closed")
01854         return _indexerconnection.SynonymIter(self._index, self._field_mappings, prefix)
01855 
01856     def get_metadata(self, key):
01857         """Get an item of metadata stored in the connection.
01858 
01859         This returns a value stored by a previous call to
01860         IndexerConnection.set_metadata.
01861 
01862         If the value is not found, this will return the empty string.
01863 
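        A sketch of usage (the metadata key is illustrative)::

            last_update = conn.get_metadata('last-update')
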
01864         """
01865         if self._index is None:
01866             raise _errors.IndexerError("SearchConnection has been closed")
01867         if not hasattr(self._index, 'get_metadata'):
01868             raise _errors.IndexerError("Version of xapian in use does not support metadata")
01869         return _log(self._index.get_metadata, key)
01870 
01871 if __name__ == '__main__':
01872     import doctest, sys
01873     doctest.testmod(sys.modules[__name__])