Back to index

moin 1.9.0~rc2
highlight.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright (C) 2007 Lemur Consulting Ltd
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License as published by
00007 # the Free Software Foundation; either version 2 of the License, or
00008 # (at your option) any later version.
00009 #
00010 # This program is distributed in the hope that it will be useful,
00011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013 # GNU General Public License for more details.
00014 #
00015 # You should have received a copy of the GNU General Public License along
00016 # with this program; if not, write to the Free Software Foundation, Inc.,
00017 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00018 r"""highlight.py: Highlight and summarise text.
00019 
00020 """
00021 __docformat__ = "restructuredtext en"
00022 
00023 import re
00024 import xapian
00025 
class Highlighter(object):
    """Class for highlighting text and creating contextual summaries.

    >>> hl = Highlighter("en")
    >>> hl.makeSample('Hello world.', ['world'])
    'Hello world.'
    >>> hl.highlight('Hello world', ['world'], ('<', '>'))
    'Hello <world>'

    """

    # Tokeniser: splits text into words, runs of whitespace, runs of
    # punctuation, and (opening or closing) markup tags.  Apostrophes count
    # as word characters, so "world's" stays one token.  Because every
    # character lands in some token, ''.join(tokens) reconstructs the input.
    _split_re = re.compile(r'<\w+[^>]*>|</\w+>|[\w\']+|\s+|[^\w\'\s<>/]+')

    def __init__(self, language_code='en', stemmer=None):
        """Create a new highlighter for the specified language.

        - `language_code` is a language code accepted by xapian.Stem; it is
          used to build a stemmer when `stemmer` is not supplied.
        - `stemmer` is an optional callable mapping a word to its stem; if
          given, `language_code` is ignored.

        """
        if stemmer is not None:
            self.stem = stemmer
        else:
            self.stem = xapian.Stem(language_code)

    def _split_text(self, text, strip_tags=False):
        """Split some text into words and non-words.

        - `text` is the text to process.  It may be a unicode object or a utf-8
          encoded simple string.
        - `strip_tags` is a flag - False to keep tags, True to strip all tags
          from the output.

        Returns a list of utf-8 encoded simple strings.  Whitespace and
        punctuation runs are kept as separate tokens, so joining the result
        (minus any stripped tags) reproduces the input text.

        """
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        words = self._split_re.findall(text)
        if strip_tags:
            # Markup tags are the only tokens that start with '<'.
            return [w for w in words if w[0] != '<']
        else:
            return words

    def _strip_prefix(self, term):
        """Strip the prefix off a term.

        Prefixes are any initial capital letters, with the exception that R always
        ends a prefix, even if followed by capital letters.

        >>> hl = Highlighter("en")
        >>> print hl._strip_prefix('hello')
        hello
        >>> print hl._strip_prefix('Rhello')
        hello
        >>> print hl._strip_prefix('XARHello')
        Hello
        >>> print hl._strip_prefix('XAhello')
        hello
        >>> print hl._strip_prefix('XAh')
        h
        >>> print hl._strip_prefix('XA')
        <BLANKLINE>

        """
        # Scan until the first lowercase character (start of the real term),
        # or an 'R', which terminates the prefix and is itself discarded.
        for p in xrange(len(term)):
            if term[p].islower():
                return term[p:]
            elif term[p] == 'R':
                return term[p+1:]
        # Term was all prefix (e.g. 'XA'): nothing left.
        return ''

    def _query_to_stemmed_words(self, query):
        """Convert a query to a list of stemmed words.

        - `query` is the query to parse: it may be xapian.Query object, or a
          sequence of terms.

        Terms taken from a xapian.Query are assumed to be stemmed already, so
        only their prefixes are stripped; plain strings are lowercased and
        stemmed here.

        """
        if isinstance(query, xapian.Query):
            return [self._strip_prefix(t) for t in query]
        else:
            return [self.stem(q.lower()) for q in query]


    def makeSample(self, text, query, maxlen=600, hl=None):
        """Make a contextual summary from the supplied text.

        This basically works by splitting the text into phrases, counting the query
        terms in each, and keeping those with the most.

        Any markup tags in the text will be stripped.

        `text` is the source text to summarise.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `maxlen` is the maximum length of the generated summary.
        `hl` is a pair of strings to insert around highlighted terms, e.g. ('<b>', '</b>')

        Returns the summary as a utf-8 encoded simple string; omitted phrases
        are replaced by '..'.  Returns '' if the text yields no phrases.

        """

        # coerce maxlen into an int, otherwise truncation doesn't happen
        maxlen = int(maxlen)

        words = self._split_text(text, True)
        terms = self._query_to_stemmed_words(query)
        
        # build blocks delimited by punctuation, and count matching words in each block
        # blocks[n] is a block [firstword, endword, charcount, termcount, selected]
        # where firstword/endword index into `words` (endword exclusive).
        blocks = []
        start = end = count = blockchars = 0

        while end < len(words):
            blockchars += len(words[end])
            if words[end].isalnum():
                # A word token: count it if it stems to a query term.
                if self.stem(words[end].lower()) in terms:
                    count += 1
                end += 1
            elif words[end] in ',.;:?!\n':
                # Phrase-ending punctuation closes the current block.
                end += 1
                blocks.append([start, end, blockchars, count, False])
                start = end
                blockchars = 0
                count = 0
            else:
                # Whitespace or other punctuation: stays inside the block.
                end += 1
        # Flush any trailing block with no closing punctuation (regression:
        # text without punctuation used to produce no output at all).
        if start != end:
            blocks.append([start, end, blockchars, count, False])
        if len(blocks) == 0:
            return ''

        # select high-scoring blocks first, down to zero-scoring
        # NOTE(review): a block selected on an earlier pass satisfies
        # `b[3] >= count` again on later passes, so its length is added to
        # `chars` more than once.  That overstates the running total and can
        # stop selection early; the documented outputs below depend on this
        # behaviour, so confirm before changing it.
        chars = 0
        for count in xrange(3, -1, -1):
            for b in blocks:
                if b[3] >= count:
                    b[4] = True
                    chars += b[2]
                    if chars >= maxlen: break
            if chars >= maxlen: break

        # assemble summary: emit selected blocks in document order, inserting
        # '..' wherever unselected blocks were skipped over.
        words2 = []
        lastblock = -1
        for i, b in enumerate(blocks):
            if b[4]:
                if i != lastblock + 1:
                    words2.append('..')
                words2.extend(words[b[0]:b[1]])
                lastblock = i

        # If the final block was dropped, show that the text continues.
        if not blocks[-1][4]:
            words2.append('..')

        # trim down to maxlen
        l = 0
        for i in xrange (len (words2)):
            l += len (words2[i])
            if l >= maxlen:
                words2[i:] = ['..']
                break

        if hl is None:
            return ''.join(words2)
        else:
            return self._hl(words2, terms, hl)

    def highlight(self, text, query, hl, strip_tags=False):
        """Add highlights (string prefix/postfix) to a string.

        `text` is the source to highlight.
        `query` is either a Xapian query object or a list of (unstemmed) term strings.
        `hl` is a pair of highlight strings, e.g. ('<i>', '</i>')
        `strip_tags` strips HTML markup iff True

        Returns the text with each matching word wrapped in the `hl` pair.

        >>> hl = Highlighter()
        >>> qp = xapian.QueryParser()
        >>> q = qp.parse_query('cat dog')
        >>> tags = ('[[', ']]')
        >>> hl.highlight('The cat went Dogging; but was <i>dog tired</i>.', q, tags)
        'The [[cat]] went [[Dogging]]; but was <i>[[dog]] tired</i>.'

        """
        words = self._split_text(text, strip_tags)
        terms = self._query_to_stemmed_words(query)
        return self._hl(words, terms, hl)

    def _hl(self, words, terms, hl):
        """Add highlights to a list of words.

        `words` is the list of words and non-words to be highlighted.
        `terms` is the list of stemmed words to look for.
        `hl` is a pair of strings to wrap around each matching word.

        Returns the joined, highlighted string.  `words` is modified in place.

        """
        for i, w in enumerate(words):
            # HACK - more forgiving about stemmed terms: accept either an
            # exact (lowercased) match or a match after stemming.
            wl = w.lower()
            if wl in terms or self.stem (wl) in terms:
                words[i] = ''.join((hl[0], w, hl[1]))

        return ''.join(words)
00225 
00226 
# Additional doctests, collected by doctest.testmod() via the module-level
# __test__ mapping (test name -> doctest source string).  These pin the
# highlighter's observable behaviour; the strings must not be edited casually.
__test__ = {
    'no_punc': r'''

    Test the highlighter's behaviour when there is no punctuation in the sample
    text (regression test - used to return no output):
    >>> hl = Highlighter("en")
    >>> hl.makeSample('Hello world', ['world'])
    'Hello world'

    ''',

    'stem_levels': r'''

    Test highlighting of words, and how it works with stemming:
    >>> hl = Highlighter("en")

    # "word" and "wording" stem to "word", so the following 4 calls all return
    # the same thing
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['word'], hl='<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['wording'], hl='<>')
    'Hello. <word>. <wording>. wordinging.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['wording'], '<>')
    'Hello. <word>. <wording>. wordinging.'

    # "wordinging" stems to "wording", so only the last two words are
    # highlighted for this one.
    >>> hl.makeSample('Hello. word. wording. wordinging.', ['wordinging'], hl='<>')
    'Hello. word. <wording>. <wordinging>.'
    >>> hl.highlight('Hello. word. wording. wordinging.', ['wordinging'], '<>')
    'Hello. word. <wording>. <wordinging>.'
    ''',

    'supplied_stemmer': r'''

    Test behaviour if we pass in our own stemmer:
    >>> stem = xapian.Stem('en')
    >>> hl = Highlighter(stemmer=stem)
    >>> hl.highlight('Hello. word. wording. wordinging.', ['word'], '<>')
    'Hello. <word>. <wording>. wordinging.'

    ''',

    'unicode': r'''

    Test behaviour if we pass in unicode input:
    >>> hl = Highlighter('en')
    >>> hl.highlight(u'Hello\xf3. word. wording. wordinging.', ['word'], '<>')
    'Hello\xc3\xb3. <word>. <wording>. wordinging.'

    ''',

    'no_sample': r'''

    Test behaviour if we pass in unicode input:
    >>> hl = Highlighter('en')
    >>> hl.makeSample(u'', ['word'])
    ''

    ''',

    'short_samples': r'''

    >>> hl = Highlighter('en')
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['hello'], 20, ('<', '>'))
    '..  <Hello> world ..'
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['hello'], 40, ('<', '>'))
    'A boring start.  <Hello> world indeed...'
    >>> hl.makeSample("A boring start.  Hello world indeed.  A boring end.", ['boring'], 40, ('<', '>'))
    'A <boring> start...  A <boring> end.'

    ''',

    'apostrophes': r'''

    >>> hl = Highlighter('en')
    >>> hl.makeSample("A boring start.  Hello world's indeed.  A boring end.", ['world'], 40, ('<', '>'))
    "A boring start.  Hello <world's> indeed..."

    ''',

}
00311 
# Running the module directly executes its doctests (docstrings plus the
# __test__ mapping above).
if __name__ == '__main__':
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])