Back to index

python-biopython  1.60
triefind.py
Go to the documentation of this file.
00001 """
00002 Given a trie, find all occurrences of a word in the trie in a string.
00003 
00004 Like searching a string for a substring, except that the substring is
00005 any word in a trie.
00006 
00007 Functions:
00008 match         Find longest key in a trie matching the beginning of the string.
00009 match_all     Find all keys in a trie matching the beginning of the string.
00010 find          Find keys in a trie matching anywhere in a string.
00011 find_words    Find keys in a trie matching whole words in a string.
00012 
00013 """
00014 import string
00015 import re
00016 
00017 def match(string, trie):
00018     """match(string, trie) -> longest key or None
00019 
00020     Find the longest key in the trie that matches the beginning of the
00021     string.
00022 
00023     """
00024     longest = None
00025     for i in range(len(string)):
00026         substr = string[:i+1]
00027         if not trie.has_prefix(substr):
00028             break
00029         if trie.has_key(substr):
00030             longest = substr
00031     return longest
00032 
00033 def match_all(string, trie):
00034     """match_all(string, trie) -> list of keys
00035 
00036     Find all the keys in the trie that matches the beginning of the
00037     string.
00038 
00039     """
00040     matches = []
00041     for i in range(len(string)):
00042         substr = string[:i+1]
00043         if not trie.has_prefix(substr):
00044             break
00045         if trie.has_key(substr):
00046             matches.append(substr)
00047     return matches
00048 
00049 def find(string, trie):
00050     """find(string, trie) -> list of tuples (key, start, end)
00051 
00052     Find all the keys in the trie that match anywhere in the string.
00053 
00054     """
00055     results = []
00056     start = 0     # index to start the search
00057     while start < len(string):
00058         # Look for a match.
00059         keys = match_all(string[start:], trie)
00060         for key in keys:
00061             results.append((key, start, start+len(key)))
00062         start += 1
00063     return results
00064 
00065 DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
00066 
00067 def find_words(string, trie):
00068     """find_words(string, trie) -> list of tuples (key, start, end)
00069 
00070     Find all the keys in the trie that match full words in the string.
00071     Word boundaries are defined as any punctuation or whitespace.
00072 
00073     """
00074     _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
00075         
00076     results = []
00077     start = 0     # index of word boundary
00078     while start < len(string):
00079         # Look for a match.
00080         keys = match_all(string[start:], trie)
00081         for key in keys:
00082             l = len(key)
00083             # Make sure it ends at a boundary.
00084             if start+l == len(string) or \
00085                _boundary_re.match(string[start+l]):
00086                 results.append((key, start, start+l))
00087         # Move forward to the next boundary.
00088         m = _boundary_re.search(string, start)
00089         if m is None:
00090             break
00091         start = m.end()
00092     return results