Back to index

python-biopython  1.60
debug_blast_parser.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 # To do:
00004 # - Let user specify the parser class on the command line.
00005 # - Let user specify a sequence file to BLAST on the net.
00006 # - Script should help debug connection to NCBI website.
00007 
00008 import os, sys
00009 import re
00010 import getopt
00011 import traceback
00012 
00013 from Bio import ParserSupport
00014 from Bio.Blast import NCBIStandalone, NCBIWWW
00015 
# Number of lines of context shown on each side of the offending line when
# the failing region of the input file is printed.
CONTEXT = 5

# Help text; the %s placeholder is filled in with the script name
# (sys.argv[0]) when the module is loaded.
USAGE = """%s [-h] [-v] [-p] [-n] [-o] <testfile>

This script helps diagnose problems with the BLAST parser.

OPTIONS:

-h    Show this help file.

-v    Verbose output.

-p    <testfile> is a protein file.

-n    <testfile> is a nucleotide file.

-o    <testfile> is a BLAST output file.

""" % sys.argv[0]
00035 
class DebuggingConsumer:
    """Consumer wrapper that counts data callbacks as it forwards them.

    Any attribute that is not found on this object is treated as the name
    of a consumer callback and forwarded to the decorated consumer:

    - names starting with 'start_' or 'end_' are section callbacks; they
      are forwarded without arguments and leave ``linenum`` untouched;
    - every other name is a data callback; it is forwarded with its single
      ``data`` argument and ``linenum`` is incremented once the decorated
      call has returned.

    Because the counter only advances after a successful call, when a data
    callback raises, ``linenum`` holds the number of data callbacks that
    completed before it.  test_blast_output uses that value as the
    zero-based index of the offending line.

    Arguments:
    decorated -- the consumer to forward to.  Defaults to a new
                 ParserSupport.AbstractConsumer.
    """

    def __init__(self, decorated=None):
        self.linenum = 0
        if decorated is None:
            decorated = ParserSupport.AbstractConsumer()
        self.decorated = decorated
        # Name of the most recently requested callback (kept for debugging).
        self._prev_attr = None

    def __getattr__(self, attr):
        """Return a forwarding function for the callback called attr."""
        # Do not pretend to implement special methods: on old-style classes
        # str(), truth testing, etc. look them up through __getattr__, and
        # answering would make those operations fail in confusing ways.
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        self._prev_attr = attr

        # Each forwarder closes over its own name, so a callback that is
        # looked up and only called later still reaches the right method.
        # The lookup on the decorated consumer is deferred to call time.
        if attr.startswith('start_') or attr.startswith('end_'):
            def forward_section():
                getattr(self.decorated, attr)()
            return forward_section

        def forward_data(data):
            getattr(self.decorated, attr)(data)
            # Only reached when the decorated callback did not raise.
            self.linenum += 1
        return forward_data
00054 
def chomp(line):
    """Return line with all trailing carriage returns and newlines removed."""
    return line.rstrip("\r\n")
00057 
def choose_parser(outfile):
    """Guess which BLAST parser class matches the contents of a file.

    The whole file is read and searched, ignoring case, for marker text:

    - "<html>" or "<pre>" selects NCBIWWW.BlastParser;
    - "results from round" or "converged!" selects
      NCBIStandalone.PSIBlastParser;
    - anything else selects NCBIStandalone.BlastParser.

    Arguments:
    outfile -- path of the BLAST output file to inspect.

    Returns the parser class itself, not an instance.
    """
    handle = open(outfile)
    try:
        ldata = handle.read().lower()
    finally:
        # Close explicitly instead of relying on garbage collection.
        handle.close()

    if "<html>" in ldata or "<pre>" in ldata:
        return NCBIWWW.BlastParser
    if "results from round" in ldata or "converged!" in ldata:
        return NCBIStandalone.PSIBlastParser
    return NCBIStandalone.BlastParser
00066 
00067 def test_blast_output(outfile):
00068     # Try to auto-detect the format
00069     if 1:
00070         print "No parser specified.  I'll try to choose one for you based"
00071         print "on the format of the output file."
00072         print
00073         
00074         parser_class = choose_parser(outfile)
00075         print "It looks like you have given output that should be parsed"
00076         print "with %s.%s.  If I'm wrong, you can select the correct parser" %\
00077               (parser_class.__module__, parser_class.__name__)
00078         print "on the command line of this script (NOT IMPLEMENTED YET)."
00079     else:
00080         raise NotImplementedError
00081         parser_class = NCBIWWW.BlastParser
00082         print "Using %s to parse the file." % parser_class.__name__
00083     print
00084 
00085     scanner_class = parser_class()._scanner.__class__
00086     consumer_class = parser_class()._consumer.__class__
00087 
00088     #parser_class()._scanner.feed(
00089     #    open(outfile), ParserSupport.TaggingConsumer())
00090     print "I'm going to run the data through the parser to see what happens..."
00091     parser = parser_class()
00092     try:
00093         rec = parser.parse_file(outfile)
00094     except KeyboardInterrupt, SystemExit:
00095         raise
00096     except Exception, x:
00097         exception_info = str(x)
00098         print "Dang, the parsing failed."
00099     else:
00100         print "Parsing succeeded, no problems detected."
00101         print "However, you should check to make sure the following scanner"
00102         print "trace looks reasonable."
00103         print
00104         parser_class()._scanner.feed(
00105             open(outfile), ParserSupport.TaggingConsumer())
00106         return 0
00107     print
00108 
00109     print "Alright.  Let me try and figure out where in the parser the"
00110     print "problem occurred..."
00111     etype, value, tb = sys.exc_info()
00112     ftb = traceback.extract_tb(tb)
00113     ftb.reverse()
00114     class_found = None
00115     for err_file, err_line, err_function, err_text in ftb:
00116         if hasattr(consumer_class, err_function):
00117             class_found = consumer_class
00118             break
00119         elif hasattr(scanner_class, err_function):
00120             class_found = scanner_class
00121             break
00122     if class_found is None:
00123         print "Sorry, I could not pinpoint the error to the parser."
00124         print "There's nothing more I can tell you."
00125         print "Here's the traceback:"
00126         traceback.print_exception(etype, value, tb)
00127         return 1
00128     else:
00129         print "I found the problem in %s.%s.%s, line %d:" % \
00130               (class_found.__module__, class_found.__name__,
00131                err_function, err_line)
00132         print "    %s" % err_text
00133         print "This output caused an %s to be raised with the" % etype
00134         print "information %r." % exception_info
00135     print
00136 
00137     print "Let me find the line in the file that triggers the problem..."
00138     parser = parser_class()
00139     scanner, consumer = parser._scanner, parser._consumer
00140     consumer = DebuggingConsumer(consumer)
00141     try:
00142         scanner.feed(open(outfile), consumer)
00143     except etype, x:
00144         pass
00145     else:
00146         print "Odd, the exception disappeared!  What happened?"
00147         return 3
00148     print "It's caused by line %d:" % consumer.linenum
00149     lines = open(outfile).readlines()
00150     start, end = consumer.linenum-CONTEXT, consumer.linenum+CONTEXT+1
00151     if start < 0:
00152         start = 0
00153     if end > len(lines):
00154         end = len(lines)
00155     ndigits = len(str(end))
00156     for linenum in range(start, end):
00157         line = chomp(lines[linenum])
00158         if linenum == consumer.linenum:
00159             prefix = '*'
00160         else:
00161             prefix = ' '
00162         
00163         s = "%s%*d %s" % (prefix, ndigits, linenum, line)
00164         s = s[:80]
00165         print s
00166     print
00167 
00168     if class_found == scanner_class:
00169         print "Problems in %s are most likely caused by changed formats." % \
00170               class_found.__name__
00171         print "You can start to fix this by going to line %d in module %s." % \
00172               (err_line, class_found.__module__)
00173         print "Perhaps the scanner needs to be made more lenient by accepting"
00174         print "the changed format?"
00175         print
00176 
00177         if VERBOSITY <= 0:
00178             print "For more help, you can run this script in verbose mode"
00179             print "to see detailed information about how the scanner"
00180             print "identifies each line."
00181         else:
00182             print "OK, let's see what the scanner's doing!"
00183             print
00184             print "*"*20 + " BEGIN SCANNER TRACE " + "*"*20
00185             try:
00186                 parser_class()._scanner.feed(
00187                     open(outfile), ParserSupport.TaggingConsumer())
00188             except etype, x:
00189                 pass
00190             print "*"*20 + " END SCANNER TRACE " + "*"*20
00191         print
00192             
00193     elif class_found == consumer_class:
00194         print "Problems in %s can be caused by two things:" % \
00195               class_found.__name__
00196         print "    - The format of the line parsed by '%s' changed." % \
00197               err_function
00198         print "    - The scanner misidentified the line."
00199         print "Check to make sure '%s' should parse the line:" % \
00200               err_function
00201         s = "    %s" % chomp(lines[consumer.linenum])
00202         s = s[:80]
00203         print s
00204         print "If so, debug %s.%s.  Otherwise, debug %s." % \
00205               (class_found.__name__, err_function, scanner_class.__name__)
00206     
00207 
00208 VERBOSITY = 0
00209 if __name__ == '__main__':
00210     try:
00211         optlist, args = getopt.getopt(sys.argv[1:], "hpnov")
00212     except getopt.error, x:
00213         print >>sys.stderr, x
00214         sys.exit(-1)
00215     if len(args) != 1:
00216         print >>sys.stderr, USAGE
00217         sys.exit(-1)
00218     TESTFILE, = args
00219     if not os.path.exists(TESTFILE):
00220         print >>sys.stderr, "I could not find file: %s" % TESTFILE
00221         sys.exit(-1)
00222 
00223     PROTEIN = NUCLEOTIDE = OUTPUT = None
00224     for opt, arg in optlist:
00225         if opt == '-h':
00226             print USAGE
00227             sys.exit(0)
00228         elif opt == '-p':
00229             PROTEIN = 1
00230         elif opt == '-n':
00231             NUCLEOTIDE = 1
00232         elif opt == '-o':
00233             OUTPUT = 1
00234         elif opt == '-v':
00235             VERBOSITY += 1
00236 
00237     if len([x for x in (PROTEIN, NUCLEOTIDE, OUTPUT) if x is not None]) != 1:
00238         OUTPUT = 1
00239         #print >>sys.stderr, "Exactly one of -p, -n, or -o should be specified."
00240         #sys.exit(-1)
00241     if PROTEIN or NUCLEOTIDE:
00242         print >>sys.stderr, "-p and -n not implemented yet"
00243         sys.exit(-1)
00244     test_blast_output(TESTFILE)
00245