Back to index

python-biopython  1.60
ParserSupport.py
Go to the documentation of this file.
00001 # Copyright 1999 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Code to support writing parsers (OBSOLETE).
00007 
00008 
00009 
00010 Classes:
00011 AbstractParser         Base class for parsers.
00012 AbstractConsumer       Base class of all Consumers.
00013 TaggingConsumer        Consumer that tags output with its event.  For debugging
00014 SGMLStrippingConsumer  Consumer that strips SGML tags from output.
00015 EventGenerator         Generate Biopython Events from Martel XML output
00016                        (note that Martel is now DEPRECATED)
00017 
00018 Functions:
00019 safe_readline          Read a line from a handle, with check for EOF.
00020 safe_peekline          Peek at next line, with check for EOF.
00021 read_and_call          Read a line from a handle and pass it to a method.
00022 read_and_call_while    Read many lines, as long as a condition is met.
00023 read_and_call_until    Read many lines, until a condition is met.
00024 attempt_read_and_call  Like read_and_call, but forgiving of errors.
00025 is_blank_line          Test whether a line is blank.
00026 
00027 """
00028 
00029 
00030 import warnings
00031 warnings.warn("The module Bio.ParserSupport is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
00032 
00033 
00034 import sys
00035 import traceback
00036 from types import *
00037 import StringIO
00038 
00039 from Bio import File
00040 
00041 # XML from python 2.0
00042 try:
00043     from xml.sax import handler
00044     xml_support = 1
00045 except ImportError:
00046     sys.stderr.write("Warning: Could not import SAX for dealing with XML.\n" +
00047                      "This causes problems with some ParserSupport modules\n")
00048     xml_support = 0
00049 
class AbstractParser(object):
    """Common base class for parsers.

    Subclasses provide parse(); the string and file helpers below only
    turn their argument into a handle and delegate to it.

    """
    def parse(self, handle):
        """Parse the data read from handle.  Must be overridden."""
        raise NotImplementedError("Please implement in a derived class")

    def parse_str(self, string):
        """Wrap string in a file-like handle and return parse() of it."""
        string_handle = StringIO.StringIO(string)
        return self.parse(string_handle)

    def parse_file(self, filename):
        """Open filename, return parse() of it, and always close the file."""
        infile = open(filename)
        try:
            return self.parse(infile)
        finally:
            # Runs whether parse() returned or raised.
            infile.close()
00067 
class AbstractConsumer(object):
    """Base class for other Consumers.

    Derive Consumers from this class and implement appropriate
    methods for each event that you want to receive.  Events without a
    matching method are silently ignored.

    """
    def _unhandled_section(self):
        """Ignore a section event (start_* / end_*); takes no data."""
        pass

    def _unhandled(self, data):
        """Ignore an information event together with its data."""
        pass

    def __getattr__(self, attr):
        """Return a do-nothing handler for an event that has no method.

        Python calls this only after normal attribute lookup has failed.
        Names beginning with 'start_' or 'end_' get the no-argument
        section handler; every other name gets the one-argument handler.

        Raises AttributeError for special names (__xxx__).
        """
        # Special names are protocol probes (copy and pickle look for
        # __setstate__, __getstate__, ...), not parser events.  Answering
        # them with a do-nothing handler makes copy.copy() lose or choke
        # on the instance state, so report them as missing instead.
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        if attr.startswith('start_') or attr.startswith('end_'):
            return self._unhandled_section
        return self._unhandled
00085 
00086 class TaggingConsumer(AbstractConsumer):
00087     """A Consumer that tags the data stream with the event and
00088     prints it to a handle.  Useful for debugging.
00089 
00090     """
00091     def __init__(self, handle=None, colwidth=15, maxwidth=80):
00092         """TaggingConsumer(handle=sys.stdout, colwidth=15, maxwidth=80)"""
00093         # I can't assign sys.stdout to handle in the argument list.
00094         # If I do that, handle will be assigned the value of sys.stdout
00095         # the first time this function is called.  This will fail if
00096         # the user has assigned sys.stdout to some other file, which may
00097         # be closed or invalid at a later time.
00098         if handle is None:
00099             handle = sys.stdout
00100         self._handle = handle
00101         self._colwidth = colwidth
00102         self._maxwidth = maxwidth
00103 
00104     def unhandled_section(self):
00105         self._print_name('unhandled_section')
00106 
00107     def unhandled(self, data):
00108         self._print_name('unhandled', data)
00109 
00110     def _print_name(self, name, data=None):
00111         if data is None:
00112             # Write the name of a section.
00113             self._handle.write("%s %s\n" % ("*"*self._colwidth, name))
00114         else:
00115             # Write the tag and line.
00116             self._handle.write("%-*s: %s\n" % (
00117                 self._colwidth, name[:self._colwidth],
00118                 data[:self._maxwidth-self._colwidth-2].rstrip()))
00119 
00120     def __getattr__(self, attr):
00121         if attr[:6] == 'start_' or attr[:4] == 'end_':
00122             method = lambda a=attr, s=self: s._print_name(a)
00123         else:
00124             method = lambda x, a=attr, s=self: s._print_name(a, x)
00125         return method
00126 
class SGMLStrippingConsumer(object):
    """A consumer that strips off SGML tags.

    This is meant to be used as a decorator for other consumers: it
    forwards every attribute lookup to the wrapped consumer, and passes
    the data of information events through an SGML stripper first.

    """
    def __init__(self, consumer):
        """Wrap consumer, which must be an instance rather than a class.

        Issues a BiopythonDeprecationWarning.  Raises ValueError if a
        class is passed.
        """
        import inspect
        import Bio
        warnings.warn("SGMLStrippingConsumer is deprecated, and is likely to be removed in a future version of Biopython", Bio.BiopythonDeprecationWarning)
        # The old test (type(consumer) is InstanceType) is only true for
        # instances of old-style classes, so it rejected every consumer
        # derived from a new-style class.  Reject classes instead.
        if inspect.isclass(consumer):
            raise ValueError("consumer should be an instance")
        self._consumer = consumer
        self._prev_attr = None
        self._stripper = File.SGMLStripper()

    def _apply_clean_data(self, data):
        """Strip data and pass it to the most recently looked-up method."""
        clean = self._stripper.strip(data)
        self._prev_attr(clean)

    def __getattr__(self, name):
        """Look name up on the wrapped consumer.

        Non-method attributes and section methods (start_* / end_*) are
        returned unchanged.  Any other method is treated as an information
        event: a function is returned that strips its data argument and
        then calls that method.
        """
        # This hook only runs when normal lookup failed, so reaching it
        # for one of our own attributes means it was never set.  Looking
        # it up again through getattr(self, ...) would recurse forever.
        if name in ('_consumer', '_prev_attr', '_stripper'):
            raise AttributeError(name)
        attr = getattr(self._consumer, name)
        # If this is not a method, then return it as is.
        if not isinstance(attr, MethodType):
            return attr
        # If it's a section method, then return it.
        if name[:6] == 'start_' or name[:4] == 'end_':
            return attr
        # Otherwise, it's an info event.  Still record it for
        # _apply_clean_data, but bind this particular method into the
        # returned function so that a handler fetched earlier keeps
        # calling its own method after later lookups.
        self._prev_attr = attr

        def apply_clean_data(data):
            attr(self._stripper.strip(data))
        return apply_clean_data
00159 
# only use the Event Generator if XML handling is okay
if xml_support:
    class EventGenerator(handler.ContentHandler):
        """Handler to generate events associated with a Martel parsed file.

        This acts like a normal SAX handler, and accepts XML generated by
        Martel during parsing. These events are then converted into
        'Biopython events', which can then be caught by a standard
        biopython consumer.

        Note that Martel is now DEPRECATED.
        """
        def __init__(self, consumer, interest_tags, callback_finalizer = None,
                     exempt_tags = None):
            """Initialize to begin catching and firing off events.

            Arguments:
            o consumer - The consumer that we'll send Biopython events to.

            o interest_tags - A listing of all the tags we are interested in.

            o callback_finalizer - A function to deal with the collected
            information before passing it on to the consumer. By default
            the collected information is a list of all of the lines read
            for a particular tag -- if there are multiple tags in a row
            like:

            <some_info>Spam</some_info>
            <some_info>More Spam</some_info>

            In this case the list of information would be:

            ['Spam', 'More Spam']

            This list of lines will be passed to the callback finalizer if
            it is present. Otherwise the consumer will be called with the
            list of content information.

            o exempt_tags - A listing of particular tags that are exempt from
            being processed by the callback_finalizer. This allows you to
            use a finalizer to deal with most tags, but leave those you don't
            want touched. Defaults to no exempt tags.
            """
            handler.ContentHandler.__init__(self)
            self._consumer = consumer
            self.interest_tags = interest_tags
            self._finalizer = callback_finalizer
            # None stands for "no exempt tags"; a fresh list per instance
            # avoids sharing one default list between all generators.
            if exempt_tags is None:
                exempt_tags = []
            self._exempt_tags = exempt_tags

            # a dictionary of content for each tag of interest
            # the information for each tag is held as a list of the lines.
            # This allows us to collect information from multiple tags
            # in a row, and return it all at once.
            self.info = {}
            for tag in self.interest_tags:
                self.info[tag] = []

            # the previous tag we were collecting information for.
            # We set a delay in sending info to the consumer so that we can
            # collect a bunch of tags in a row and append all of the info
            # together.
            self._previous_tag = ''

            # the current character information for a tag
            self._cur_content = []
            # whether we should be collecting information
            self._collect_characters = 0

        def startElement(self, name, attrs):
            """Determine if we should collect characters from this tag.
            """
            if name in self.interest_tags:
                self._collect_characters = 1

        def characters(self, content):
            """Extract the information if we are interested in it.
            """
            if self._collect_characters:
                self._cur_content.append(content)

        def endElement(self, name):
            """Send the information to the consumer.

            Once we've got the end element we've collected up all of the
            character information we need, and we need to send this on to
            the consumer to do something with it.

            We have a delay of one tag on doing this, so that we can collect
            all of the info from multiple calls to the same element at once.
            """
            # only deal with the tag if it is something we are
            # interested in and potentially have information for.
            # A tag we are not interested in may close while we are still
            # collecting for an enclosing tag of interest; there is no
            # entry in self.info for it, so keep collecting until the
            # tag of interest itself closes.
            if self._collect_characters and name in self.interest_tags:
                # add all of the information collected inside this tag
                self.info[name].append("".join(self._cur_content))
                # reset our information and flags
                self._cur_content = []
                self._collect_characters = 0

                # if we are at a new tag, pass on the info from the last tag
                if self._previous_tag and self._previous_tag != name:
                    self._make_callback(self._previous_tag)

                # set this tag as the next to be passed
                self._previous_tag = name

        def _make_callback(self, name):
            """Call the callback function with the info with the given name.
            """
            # the consumer method to call has the same name as the tag
            callback_function = getattr(self._consumer, name)

            # --- pass back the information
            # if there is a finalizer, use that
            if self._finalizer is not None and name not in self._exempt_tags:
                info_to_pass = self._finalizer(self.info[name])
            # otherwise pass back the entire list of information
            else:
                info_to_pass = self.info[name]

            callback_function(info_to_pass)

            # reset the information for the tag
            self.info[name] = []

        def endDocument(self):
            """Make sure all of our information has been passed.

            This just flushes out any stored tags that need to be passed.
            """
            if self._previous_tag:
                self._make_callback(self._previous_tag)
00291 
def read_and_call(uhandle, method, **keywds):
    """read_and_call(uhandle, method[, start][, end][, contains][, blank][, has_re])

    Read one line from uhandle and hand it to method, provided that it
    satisfies every condition given.  A ValueError describing the failed
    condition is raised otherwise.

    The optional conditions are start and end (text the line must begin
    or end with; trailing whitespace, and so the EOL, is ignored for
    end), contains (a substring that must occur in the line), blank
    (true: the line must be blank; false: it must not be) and has_re
    (a compiled regular expression that must match somewhere in the
    line).

    """
    current = safe_readline(uhandle)
    problem = _fails_conditions(current, **keywds)
    if problem is None:
        method(current)
    else:
        raise ValueError(problem)
00312 
def read_and_call_while(uhandle, method, **keywds):
    """read_and_call_while(uhandle, method[, start][, end][, contains][, blank][, has_re]) -> number of lines

    Keep reading lines from uhandle and passing them to method for as
    long as they satisfy the conditions.  The first line that does not
    is pushed back onto uhandle with saveline().  Returns the number of
    lines passed to method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    current = safe_readline(uhandle)
    while not _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The line that ended the loop has not been consumed; give it back.
    uhandle.saveline(current)
    return count
00332 
def read_and_call_until(uhandle, method, **keywds):
    """read_and_call_until(uhandle, method,
    start=None, end=None, contains=None, blank=None) -> number of lines

    Keep reading lines from uhandle and passing them to method until a
    line satisfies the conditions.  That line is pushed back onto uhandle
    with saveline().  Returns the number of lines passed to method.

    See the docstring for read_and_call for a description of the parameters.

    """
    count = 0
    current = safe_readline(uhandle)
    while _fails_conditions(current, **keywds):
        method(current)
        count += 1
        current = safe_readline(uhandle)
    # The line that met the condition has not been consumed; give it back.
    uhandle.saveline(current)
    return count
00353 
def attempt_read_and_call(uhandle, method, **keywds):
    """attempt_read_and_call(uhandle, method, **keywds) -> boolean

    Like read_and_call, except that a line failing the checks is not an
    error: it is pushed back onto uhandle with saveline() and False is
    returned.  A line passing the checks is given to method and True is
    returned.

    See docs for read_and_call for a description of the function
    arguments.

    """
    current = safe_readline(uhandle)
    if _fails_conditions(current, **keywds):
        uhandle.saveline(current)
        return False
    method(current)
    return True
00372 
def _fails_conditions(line, start=None, end=None, contains=None, blank=None,
                      has_re=None):
    """Check line against every condition that is not None.

    start    - text the line must begin with.
    end      - text the line must end with, after trailing whitespace
               (which includes the EOL) has been stripped.
    contains - text that must occur somewhere in the line.
    blank    - if true the line must be blank, if false it must not be
               (as judged by is_blank_line).
    has_re   - compiled regular expression that must match somewhere.

    Returns None if all conditions hold, otherwise a message describing
    the first one that failed.
    """
    if start is not None and not line.startswith(start):
        return "Line does not start with '%s':\n%s" % (start, line)
    # endswith() rather than slicing with [-len(end):]: for an empty end
    # that slice is [-0:], i.e. the whole line, which never equals ''.
    if end is not None and not line.rstrip().endswith(end):
        return "Line does not end with '%s':\n%s" % (end, line)
    if contains is not None and contains not in line:
        return "Line does not contain '%s':\n%s" % (contains, line)
    if blank is not None:
        if blank:
            if not is_blank_line(line):
                return "Expected blank line, but got:\n%s" % line
        else:
            if is_blank_line(line):
                return "Expected non-blank line, but got a blank one"
    if has_re is not None and has_re.search(line) is None:
        return "Line does not match regex '%s':\n%s" % (
            has_re.pattern, line)
    return None
00396 
def is_blank_line(line, allow_spaces=0):
    """is_blank_line(line, allow_spaces=0) -> boolean

    Tell whether line is blank.  An empty string always counts as blank.
    With a true allow_spaces, a line made up only of whitespace and
    end-of-line characters is blank too; otherwise the line must start
    with an end-of-line character.

    """
    if not line:
        return 1
    if not allow_spaces:
        # Strict mode: only the first character is examined.
        return line[0] in ('\n', '\r')
    return line.rstrip() == ''
00411 
def safe_readline(handle):
    """safe_readline(handle) -> line

    Return the next line from handle.readline().  An empty result means
    there is nothing left to read, which is reported as a ValueError.

    """
    text = handle.readline()
    if text:
        return text
    raise ValueError("Unexpected end of stream.")
00423 
def safe_peekline(handle):
    """safe_peekline(handle) -> line

    Return the result of handle.peekline().  An empty result means there
    is nothing left to peek at, which is reported as a ValueError.

    """
    text = handle.peekline()
    if text:
        return text
    raise ValueError("Unexpected end of stream.")