Back to index

moin  1.9.0~rc2
indexerconnection.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright (C) 2007 Lemur Consulting Ltd
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 # GNU General Public License for more details.
00014 # 
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00018 r"""indexerconnection.py: A connection to the search engine for indexing.
00019 
00020 """
00021 __docformat__ = "restructuredtext en"
00022 
00023 import _checkxapian
00024 import cPickle
00025 import xapian
00026 
00027 from datastructures import *
00028 import errors
00029 from fieldactions import *
00030 import fieldmappings
00031 import memutils
00032 from replaylog import log
00033 
00034 class IndexerConnection(object):
00035     """A connection to the search engine for indexing.
00036 
00037     """
00038 
    def __init__(self, indexpath):
        """Create a new connection to the index.

        There may only be one indexer connection for a particular database open
        at a given time.  Therefore, if a connection to the database is already
        open, this will raise a xapian.DatabaseLockError.

        If the database doesn't already exist, it will be created.

        """
        # Open (or create) the underlying xapian database; `log` wraps the
        # call for replay logging.
        self._index = log(xapian.WritableDatabase, indexpath, xapian.DB_CREATE_OR_OPEN)
        self._indexpath = indexpath

        # Read existing actions.
        self._field_actions = {}      # fieldname -> FieldActions
        self._field_mappings = fieldmappings.FieldMappings()
        self._facet_hierarchy = {}    # subfacet -> parent facet
        self._facet_query_table = {}  # query type -> {facet: association}
        self._next_docid = 0
        self._config_modified = False
        self._load_config()

        # Set management of the memory used.
        # This can be removed once Xapian implements this itself.
        self._mem_buffered = 0
        self.set_max_mem_use()
00065 
00066     def set_max_mem_use(self, max_mem=None, max_mem_proportion=None):
00067         """Set the maximum memory to use.
00068 
00069         This call allows the amount of memory to use to buffer changes to be
00070         set.  This will affect the speed of indexing, but should not result in
00071         other changes to the indexing.
00072 
00073         Note: this is an approximate measure - the actual amount of memory used
00074         max exceed the specified amount.  Also, note that future versions of
00075         xapian are likely to implement this differently, so this setting may be
00076         entirely ignored.
00077 
00078         The absolute amount of memory to use (in bytes) may be set by setting
00079         max_mem.  Alternatively, the proportion of the available memory may be
00080         set by setting max_mem_proportion (this should be a value between 0 and
00081         1).
00082 
00083         Setting too low a value will result in excessive flushing, and very
00084         slow indexing.  Setting too high a value will result in excessive
00085         buffering, leading to swapping, and very slow indexing.
00086 
00087         A reasonable default for max_mem_proportion for a system which is
00088         dedicated to indexing is probably 0.5: if other tasks are also being
00089         performed on the system, the value should be lowered.
00090 
00091         """
00092         if self._index is None:
00093             raise errors.IndexerError("IndexerConnection has been closed")
00094         if max_mem is not None and max_mem_proportion is not None:
00095             raise errors.IndexerError("Only one of max_mem and "
00096                                        "max_mem_proportion may be specified")
00097 
00098         if max_mem is None and max_mem_proportion is None:
00099             self._max_mem = None
00100 
00101         if max_mem_proportion is not None:
00102             physmem = memutils.get_physical_memory()
00103             if physmem is not None:
00104                 max_mem = int(physmem * max_mem_proportion)
00105 
00106         self._max_mem = max_mem
00107 
00108     def _store_config(self):
00109         """Store the configuration for the database.
00110 
00111         Currently, this stores the configuration in a file in the database
00112         directory, so changes to it are not protected by transactions.  When
00113         support is available in xapian for storing metadata associated with
00114         databases. this will be used instead of a file.
00115 
00116         """
00117         assert self._index is not None
00118 
00119         config_str = cPickle.dumps((
00120                                      self._field_actions,
00121                                      self._field_mappings.serialise(),
00122                                      self._facet_hierarchy,
00123                                      self._facet_query_table,
00124                                      self._next_docid,
00125                                     ), 2)
00126         log(self._index.set_metadata, '_xappy_config', config_str)
00127 
00128         self._config_modified = False
00129 
    def _load_config(self):
        """Load the configuration for the database.

        Reads the pickled configuration stored under the '_xappy_config'
        metadata key (the counterpart of _store_config()) and unpacks it into
        the corresponding attributes.  If no configuration has been stored
        yet (a freshly created database), the defaults set by __init__ are
        left untouched.

        """
        assert self._index is not None

        config_str = log(self._index.get_metadata, '_xappy_config')
        if len(config_str) == 0:
            # No stored configuration: keep the defaults from __init__.
            return

        try:
            (self._field_actions, mappings, self._facet_hierarchy, self._facet_query_table, self._next_docid) = cPickle.loads(config_str)
        except ValueError:
            # Backwards compatibility - configuration used to lack _facet_hierarchy and _facet_query_table
            # (unpacking the old 3-tuple into 5 names raises ValueError).
            (self._field_actions, mappings, self._next_docid) = cPickle.loads(config_str)
            self._facet_hierarchy = {}
            self._facet_query_table = {}
        self._field_mappings = fieldmappings.FieldMappings(mappings)

        self._config_modified = False
00150 
00151     def _allocate_id(self):
00152         """Allocate a new ID.
00153 
00154         """
00155         while True:
00156             idstr = "%x" % self._next_docid
00157             self._next_docid += 1
00158             if not self._index.term_exists('Q' + idstr):
00159                 break
00160         self._config_modified = True
00161         return idstr
00162 
00163     def add_field_action(self, fieldname, fieldtype, **kwargs):
00164         """Add an action to be performed on a field.
00165 
00166         Note that this change to the configuration will not be preserved on
00167         disk until the next call to flush().
00168 
00169         """
00170         if self._index is None:
00171             raise errors.IndexerError("IndexerConnection has been closed")
00172         if fieldname in self._field_actions:
00173             actions = self._field_actions[fieldname]
00174         else:
00175             actions = FieldActions(fieldname)
00176             self._field_actions[fieldname] = actions
00177         actions.add(self._field_mappings, fieldtype, **kwargs)
00178         self._config_modified = True
00179 
00180     def clear_field_actions(self, fieldname):
00181         """Clear all actions for the specified field.
00182 
00183         This does not report an error if there are already no actions for the
00184         specified field.
00185 
00186         Note that this change to the configuration will not be preserved on
00187         disk until the next call to flush().
00188 
00189         """
00190         if self._index is None:
00191             raise errors.IndexerError("IndexerConnection has been closed")
00192         if fieldname in self._field_actions:
00193             del self._field_actions[fieldname]
00194             self._config_modified = True
00195 
00196     def get_fields_with_actions(self):
00197         """Get a list of field names which have actions defined.
00198 
00199         """
00200         if self._index is None:
00201             raise errors.IndexerError("IndexerConnection has been closed")
00202         return self._field_actions.keys()
00203 
00204     def process(self, document):
00205         """Process an UnprocessedDocument with the settings in this database.
00206 
00207         The resulting ProcessedDocument is returned.
00208 
00209         Note that this processing will be automatically performed if an
00210         UnprocessedDocument is supplied to the add() or replace() methods of
00211         IndexerConnection.  This method is exposed to allow the processing to
00212         be performed separately, which may be desirable if you wish to manually
00213         modify the processed document before adding it to the database, or if
00214         you want to split processing of documents from adding documents to the
00215         database for performance reasons.
00216 
00217         """
00218         if self._index is None:
00219             raise errors.IndexerError("IndexerConnection has been closed")
00220         result = ProcessedDocument(self._field_mappings)
00221         result.id = document.id
00222         context = ActionContext(self._index)
00223 
00224         for field in document.fields:
00225             try:
00226                 actions = self._field_actions[field.name]
00227             except KeyError:
00228                 # If no actions are defined, just ignore the field.
00229                 continue
00230             actions.perform(result, field.value, context)
00231 
00232         return result
00233 
00234     def _get_bytes_used_by_doc_terms(self, xapdoc):
00235         """Get an estimate of the bytes used by the terms in a document.
00236 
00237         (This is a very rough estimate.)
00238 
00239         """
00240         count = 0
00241         for item in xapdoc.termlist():
00242             # The term may also be stored in the spelling correction table, so
00243             # double the amount used.
00244             count += len(item.term) * 2
00245 
00246             # Add a few more bytes for holding the wdf, and other bits and
00247             # pieces.
00248             count += 8
00249 
00250         # Empirical observations indicate that about 5 times as much memory as
00251         # the above calculation predicts is used for buffering in practice.
00252         return count * 5
00253 
00254     def add(self, document):
00255         """Add a new document to the search engine index.
00256 
00257         If the document has a id set, and the id already exists in
00258         the database, an exception will be raised.  Use the replace() method
00259         instead if you wish to overwrite documents.
00260 
00261         Returns the id of the newly added document (making up a new
00262         unique ID if no id was set).
00263 
00264         The supplied document may be an instance of UnprocessedDocument, or an
00265         instance of ProcessedDocument.
00266 
00267         """
00268         if self._index is None:
00269             raise errors.IndexerError("IndexerConnection has been closed")
00270         if not hasattr(document, '_doc'):
00271             # It's not a processed document.
00272             document = self.process(document)
00273 
00274         # Ensure that we have a id
00275         orig_id = document.id
00276         if orig_id is None:
00277             id = self._allocate_id()
00278             document.id = id
00279         else:
00280             id = orig_id
00281             if self._index.term_exists('Q' + id):
00282                 raise errors.IndexerError("Document ID of document supplied to add() is not unique.")
00283             
00284         # Add the document.
00285         xapdoc = document.prepare()
00286         self._index.add_document(xapdoc)
00287 
00288         if self._max_mem is not None:
00289             self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
00290             if self._mem_buffered > self._max_mem:
00291                 self.flush()
00292 
00293         if id is not orig_id:
00294             document.id = orig_id
00295         return id
00296 
00297     def replace(self, document):
00298         """Replace a document in the search engine index.
00299 
00300         If the document does not have a id set, an exception will be
00301         raised.
00302 
00303         If the document has a id set, and the id does not already
00304         exist in the database, this method will have the same effect as add().
00305 
00306         """
00307         if self._index is None:
00308             raise errors.IndexerError("IndexerConnection has been closed")
00309         if not hasattr(document, '_doc'):
00310             # It's not a processed document.
00311             document = self.process(document)
00312 
00313         # Ensure that we have a id
00314         id = document.id
00315         if id is None:
00316             raise errors.IndexerError("No document ID set for document supplied to replace().")
00317 
00318         xapdoc = document.prepare()
00319         self._index.replace_document('Q' + id, xapdoc)
00320 
00321         if self._max_mem is not None:
00322             self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
00323             if self._mem_buffered > self._max_mem:
00324                 self.flush()
00325 
00326     def _make_synonym_key(self, original, field):
00327         """Make a synonym key (ie, the term or group of terms to store in
00328         xapian).
00329 
00330         """
00331         if field is not None:
00332             prefix = self._field_mappings.get_prefix(field)
00333         else:
00334             prefix = ''
00335         original = original.lower()
00336         # Add the prefix to the start of each word.
00337         return ' '.join((prefix + word for word in original.split(' ')))
00338 
00339     def add_synonym(self, original, synonym, field=None,
00340                     original_field=None, synonym_field=None):
00341         """Add a synonym to the index.
00342 
00343          - `original` is the word or words which will be synonym expanded in
00344            searches (if multiple words are specified, each word should be
00345            separated by a single space).
00346          - `synonym` is a synonym for `original`.
00347          - `field` is the field which the synonym is specific to.  If no field
00348            is specified, the synonym will be used for searches which are not
00349            specific to any particular field.
00350 
00351         """
00352         if self._index is None:
00353             raise errors.IndexerError("IndexerConnection has been closed")
00354         if original_field is None:
00355             original_field = field
00356         if synonym_field is None:
00357             synonym_field = field
00358         key = self._make_synonym_key(original, original_field)
00359         # FIXME - this only works for exact fields which have no upper case
00360         # characters, or single words
00361         value = self._make_synonym_key(synonym, synonym_field)
00362         self._index.add_synonym(key, value)
00363 
00364     def remove_synonym(self, original, synonym, field=None):
00365         """Remove a synonym from the index.
00366 
00367          - `original` is the word or words which will be synonym expanded in
00368            searches (if multiple words are specified, each word should be
00369            separated by a single space).
00370          - `synonym` is a synonym for `original`.
00371          - `field` is the field which this synonym is specific to.  If no field
00372            is specified, the synonym will be used for searches which are not
00373            specific to any particular field.
00374 
00375         """
00376         if self._index is None:
00377             raise errors.IndexerError("IndexerConnection has been closed")
00378         key = self._make_synonym_key(original, field)
00379         self._index.remove_synonym(key, synonym.lower())
00380 
00381     def clear_synonyms(self, original, field=None):
00382         """Remove all synonyms for a word (or phrase).
00383 
00384          - `field` is the field which this synonym is specific to.  If no field
00385            is specified, the synonym will be used for searches which are not
00386            specific to any particular field.
00387 
00388         """
00389         if self._index is None:
00390             raise errors.IndexerError("IndexerConnection has been closed")
00391         key = self._make_synonym_key(original, field)
00392         self._index.clear_synonyms(key)
00393 
00394     def _assert_facet(self, facet):
00395         """Raise an error if facet is not a declared facet field.
00396 
00397         """
00398         for action in self._field_actions[facet]._actions:
00399             if action == FieldActions.FACET:
00400                 return
00401         raise errors.IndexerError("Field %r is not indexed as a facet" % facet)
00402 
00403     def add_subfacet(self, subfacet, facet):
00404         """Add a subfacet-facet relationship to the facet hierarchy.
00405         
00406         Any existing relationship for that subfacet is replaced.
00407 
00408         Raises a KeyError if either facet or subfacet is not a field,
00409         and an IndexerError if either facet or subfacet is not a facet field.
00410         """
00411         if self._index is None:
00412             raise errors.IndexerError("IndexerConnection has been closed")
00413         self._assert_facet(facet)
00414         self._assert_facet(subfacet)
00415         self._facet_hierarchy[subfacet] = facet
00416         self._config_modified = True
00417 
00418     def remove_subfacet(self, subfacet):
00419         """Remove any existing facet hierarchy relationship for a subfacet.
00420 
00421         """
00422         if self._index is None:
00423             raise errors.IndexerError("IndexerConnection has been closed")
00424         if subfacet in self._facet_hierarchy:
00425             del self._facet_hierarchy[subfacet]
00426             self._config_modified = True
00427 
    def get_subfacets(self, facet):
        """Get a list of subfacets of a facet.

        Returns the (unordered) names of every subfacet whose recorded parent
        in the facet hierarchy is `facet`.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        return [k for k, v in self._facet_hierarchy.iteritems() if v == facet]
00435 
00436     FacetQueryType_Preferred = 1;
00437     FacetQueryType_Never = 2;
00438     def set_facet_for_query_type(self, query_type, facet, association):
00439         """Set the association between a query type and a facet.
00440 
00441         The value of `association` must be one of
00442         IndexerConnection.FacetQueryType_Preferred,
00443         IndexerConnection.FacetQueryType_Never or None. A value of None removes
00444         any previously set association.
00445 
00446         """
00447         if self._index is None:
00448             raise errors.IndexerError("IndexerConnection has been closed")
00449         if query_type is None:
00450             raise errors.IndexerError("Cannot set query type information for None")
00451         self._assert_facet(facet)
00452         if query_type not in self._facet_query_table:
00453             self._facet_query_table[query_type] = {}
00454         if association is None:
00455             if facet in self._facet_query_table[query_type]:
00456                 del self._facet_query_table[query_type][facet]
00457         else:
00458             self._facet_query_table[query_type][facet] = association;
00459         if self._facet_query_table[query_type] == {}:
00460             del self._facet_query_table[query_type]
00461         self._config_modified = True
00462 
    def get_facets_for_query_type(self, query_type, association):
        """Get the set of facets associated with a query type.

        Only those facets associated with the query type in the specified
        manner are returned; `association` must be one of
        IndexerConnection.FacetQueryType_Preferred or
        IndexerConnection.FacetQueryType_Never.

        If the query type has no facets associated with it, None is returned.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        if query_type not in self._facet_query_table:
            return None
        facet_dict = self._facet_query_table[query_type]
        # Filter the facet -> association mapping down to the requested kind.
        return set([facet for facet, assoc in facet_dict.iteritems() if assoc == association])
00480 
00481     def set_metadata(self, key, value):
00482         """Set an item of metadata stored in the connection.
00483 
00484         The value supplied will be returned by subsequent calls to
00485         get_metadata() which use the same key.
00486 
00487         Keys with a leading underscore are reserved for internal use - you
00488         should not use such keys unless you really know what you are doing.
00489 
00490         This will store the value supplied in the database.  It will not be
00491         visible to readers (ie, search connections) until after the next flush.
00492 
00493         The key is limited to about 200 characters (the same length as a term
00494         is limited to).  The value can be several megabytes in size.
00495 
00496         To remove an item of metadata, simply call this with a `value`
00497         parameter containing an empty string.
00498 
00499         """
00500         if self._index is None:
00501             raise errors.IndexerError("IndexerConnection has been closed")
00502         if not hasattr(self._index, 'set_metadata'):
00503             raise errors.IndexerError("Version of xapian in use does not support metadata")
00504         log(self._index.set_metadata, key, value)
00505 
00506     def get_metadata(self, key):
00507         """Get an item of metadata stored in the connection.
00508 
00509         This returns a value stored by a previous call to set_metadata.
00510 
00511         If the value is not found, this will return the empty string.
00512 
00513         """
00514         if self._index is None:
00515             raise errors.IndexerError("IndexerConnection has been closed")
00516         if not hasattr(self._index, 'get_metadata'):
00517             raise errors.IndexerError("Version of xapian in use does not support metadata")
00518         return log(self._index.get_metadata, key)
00519 
00520     def delete(self, id):
00521         """Delete a document from the search engine index.
00522 
00523         If the id does not already exist in the database, this method
00524         will have no effect (and will not report an error).
00525 
00526         """
00527         if self._index is None:
00528             raise errors.IndexerError("IndexerConnection has been closed")
00529         self._index.delete_document('Q' + id)
00530 
00531     def flush(self):
00532         """Apply recent changes to the database.
00533 
00534         If an exception occurs, any changes since the last call to flush() may
00535         be lost.
00536 
00537         """
00538         if self._index is None:
00539             raise errors.IndexerError("IndexerConnection has been closed")
00540         if self._config_modified:
00541             self._store_config()
00542         self._index.flush()
00543         self._mem_buffered = 0
00544 
00545     def close(self):
00546         """Close the connection to the database.
00547 
00548         It is important to call this method before allowing the class to be
00549         garbage collected, because it will ensure that any un-flushed changes
00550         will be flushed.  It also ensures that the connection is cleaned up
00551         promptly.
00552 
00553         No other methods may be called on the connection after this has been
00554         called.  (It is permissible to call close() multiple times, but
00555         only the first call will have any effect.)
00556 
00557         If an exception occurs, the database will be closed, but changes since
00558         the last call to flush may be lost.
00559 
00560         """
00561         if self._index is None:
00562             return
00563         try:
00564             self.flush()
00565         finally:
00566             # There is currently no "close()" method for xapian databases, so
00567             # we have to rely on the garbage collector.  Since we never copy
00568             # the _index property out of this class, there should be no cycles,
00569             # so the standard python implementation should garbage collect
00570             # _index straight away.  A close() method is planned to be added to
00571             # xapian at some point - when it is, we should call it here to make
00572             # the code more robust.
00573             self._index = None
00574             self._indexpath = None
00575             self._field_actions = None
00576             self._config_modified = False
00577 
00578     def get_doccount(self):
00579         """Count the number of documents in the database.
00580 
00581         This count will include documents which have been added or removed but
00582         not yet flushed().
00583 
00584         """
00585         if self._index is None:
00586             raise errors.IndexerError("IndexerConnection has been closed")
00587         return self._index.get_doccount()
00588 
    def iterids(self):
        """Get an iterator which returns all the ids in the database.

        The unique ids are currently returned in binary lexicographical sort
        order, but this should not be relied on.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Unique-id terms are stored with a 'Q' prefix; iterate just those.
        return PrefixedTermIter('Q', self._index.allterms())
00599 
    def get_document(self, id):
        """Get the document with the specified unique ID.

        Raises a KeyError if there is no such document.  Otherwise, it returns
        a ProcessedDocument.

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Look up the posting list for the unique-id term; it should contain
        # exactly one entry.
        postlist = self._index.postlist('Q' + id)
        try:
            plitem = postlist.next()
        except StopIteration:
            # Unique ID not found
            raise KeyError('Unique ID %r not found' % id)
        try:
            # A second entry would mean the unique-id invariant is broken.
            postlist.next()
            raise errors.IndexerError("Multiple documents " #pragma: no cover
                                       "found with same unique ID")
        except StopIteration:
            # Only one instance of the unique ID found, as it should be.
            pass

        result = ProcessedDocument(self._field_mappings)
        result.id = id
        result._doc = self._index.get_document(plitem.docid)
        return result
00627 
    def iter_synonyms(self, prefix=""):
        """Get an iterator over the synonyms.

         - `prefix`: if specified, only synonym keys with this prefix will be
           returned.

        The iterator returns 2-tuples, in which the first item is the key (ie,
        a 2-tuple holding the term or terms which will be synonym expanded,
        followed by the fieldname specified (or None if no fieldname)), and the
        second item is a tuple of strings holding the synonyms for the first
        item.

        These return values are suitable for the dict() builtin, so you can
        write things like:

         >>> conn = IndexerConnection('foo')
         >>> conn.add_synonym('foo', 'bar')
         >>> conn.add_synonym('foo bar', 'baz')
         >>> conn.add_synonym('foo bar', 'foo baz')
         >>> dict(conn.iter_synonyms())
         {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Iteration over the xapian synonym table is delegated to the
        # SynonymIter helper defined elsewhere in this module.
        return SynonymIter(self._index, self._field_mappings, prefix)
00654 
    def iter_subfacets(self):
        """Get an iterator over the facet hierarchy.

        The iterator returns 2-tuples, in which the first item is the
        subfacet and the second item is its parent facet.

        The return values are suitable for the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.add_subfacet('foo', 'bar')
         >>> conn.add_subfacet('baz', 'bar')
         >>> dict(conn.iter_subfacets())
         {'foo': 'bar', 'baz': 'bar'}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Facet support depends on the installed xapian version.
        if 'facets' in _checkxapian.missing_features:
            raise errors.IndexerError("Facets unsupported with this release of xapian")
        return self._facet_hierarchy.iteritems()
00678 
    def iter_facet_query_types(self, association):
        """Get an iterator over query types and their associated facets.

        Only facets associated with the query types in the specified manner
        are returned; `association` must be one of IndexerConnection.FacetQueryType_Preferred
        or IndexerConnection.FacetQueryType_Never.

        The iterator returns 2-tuples, in which the first item is the query
        type and the second item is the associated set of facets.

        The return values are suitable for the dict() builtin, for example:

         >>> conn = IndexerConnection('db')
         >>> conn.add_field_action('foo', FieldActions.FACET)
         >>> conn.add_field_action('bar', FieldActions.FACET)
         >>> conn.add_field_action('baz', FieldActions.FACET)
         >>> conn.set_facet_for_query_type('type1', 'foo', conn.FacetQueryType_Preferred)
         >>> conn.set_facet_for_query_type('type1', 'bar', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type1', 'baz', conn.FacetQueryType_Never)
         >>> conn.set_facet_for_query_type('type2', 'bar', conn.FacetQueryType_Preferred)
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Preferred))
         {'type1': set(['foo']), 'type2': set(['bar'])}
         >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Never))
         {'type1': set(['bar', 'baz'])}

        """
        if self._index is None:
            raise errors.IndexerError("IndexerConnection has been closed")
        # Facet support depends on the installed xapian version.
        if 'facets' in _checkxapian.missing_features:
            raise errors.IndexerError("Facets unsupported with this release of xapian")
        return FacetQueryTypeIter(self._facet_query_table, association)
00710 
class PrefixedTermIter(object):
    """Iterator over every term carrying a given prefix.

    Each matching term is yielded with the prefix stripped off.

    """
    def __init__(self, prefix, termiter):
        """Initialise the prefixed term iterator.

        - `prefix` is the prefix to return terms for.
        - `termiter` is a xapian TermIterator, which should be at its start.

        """

        # The skip/compare logic in next() only copes with single-character
        # prefixes.  Supporting a multicharacter prefix would additionally
        # require skipping terms whose actual prefix merely starts with the
        # target prefix (signalled by a following uppercase alphabetic
        # character).  Not hard, but not needed yet - so assert instead.
        assert(len(prefix) == 1)

        self._prefix = prefix
        self._prefixlen = len(prefix)
        self._termiter = termiter
        self._started = False

    def __iter__(self):
        return self

    def next(self):
        """Return the next term with the prefix stripped, or stop iterating.

        """
        if self._started:
            term = self._termiter.next().term
        else:
            # First call: jump straight to the first candidate term.
            term = self._termiter.skip_to(self._prefix).term
            self._started = True
        # Stop as soon as we run past the prefixed region of the term list.
        if not term.startswith(self._prefix):
            raise StopIteration
        return term[self._prefixlen:]
00752 
00753 
class SynonymIter(object):
    """Iterator over the synonyms stored in an index.

    Yields ((terms, fieldname), synonyms) 2-tuples for each synonym key.

    """
    def __init__(self, index, field_mappings, prefix):
        """Initialise the synonym iterator.

         - `index` is the index to get the synonyms from.
         - `field_mappings` is the FieldMappings object for the iterator.
         - `prefix` is the prefix to restrict the returned synonyms to.

        """
        self._index = index
        self._field_mappings = field_mappings
        self._syniter = self._index.synonym_keys(prefix)

    def __iter__(self):
        return self

    def next(self):
        """Return the next ((terms, fieldname), synonyms) tuple.

        """
        synkey = self._syniter.next()
        # Count the leading uppercase characters: they form the field prefix.
        pos = 0
        while pos < len(synkey) and synkey[pos].isupper():
            pos += 1
        if pos:
            prefix = synkey[:pos]
            fieldname = self._field_mappings.get_fieldname_from_prefix(prefix)
            # Strip the prefix length from each space-separated word of the key.
            terms = ' '.join((term[pos:] for term in synkey.split(' ')))
        else:
            # No prefix: the key is an unfielded synonym entry.
            fieldname = None
            terms = synkey
        synval = tuple(self._index.synonyms(synkey))
        return ((terms, fieldname), synval)
00791 
class FacetQueryTypeIter(object):
    """Iterate through all the query types and their associated facets.

    """
    def __init__(self, facet_query_table, association):
        """Initialise the query type facet iterator.

        Only facets associated with each query type in the specified
        manner are returned (`association` must be one of
        IndexerConnection.FacetQueryType_Preferred or
        IndexerConnection.FacetQueryType_Never).

        """
        self._table_iter = facet_query_table.iteritems()
        self._association = association

    def __iter__(self):
        return self

    def next(self):
        """Return the next (query type, facet set) 2-tuple.

        """
        # Loop until a query type with at least one matching facet is found;
        # query types whose facets all have a different association are
        # skipped.  StopIteration from the table iterator ends the loop.
        while True:
            query_type, facet_dict = self._table_iter.next()
            matching = set(facet for facet, assoc in facet_dict.iteritems()
                           if assoc == self._association)
            if matching:
                return (query_type, matching)
00820 
if __name__ == '__main__':
    # Run this module's doctests when executed directly.
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])