Back to index

python-biopython  1.60
NCBIStandalone.py
Go to the documentation of this file.
00001 # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 # Patches by Mike Poidinger to support multiple databases.
00006 # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15
00007 
00008 """Code for calling standalone BLAST and parsing plain text output (OBSOLETE).
00009 
00010 Rather than parsing the human readable plain text BLAST output (which seems to
00011 change with every update to BLAST), we and the NBCI recommend you parse the
00012 XML output instead. The plain text parser in this module still works at the
00013 time of writing, but is considered obsolete and updating it to cope with the
00014 latest versions of BLAST is not a priority for us.
00015 
00016 This module also provides code to work with the "legacy" standalone version of
00017 NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of
00018 the same name. These functions are very limited for dealing with the output as
00019 files rather than handles, for which the wrappers in Bio.Blast.Applications are
00020 prefered. Furthermore, the NCBI themselves regard these command line tools as
00021 "legacy", and encourage using the new BLAST+ tools instead. Biopython has
00022 wrappers for these under Bio.Blast.Applications (see the tutorial).
00023 
00024 Classes:
00025 LowQualityBlastError     Except that indicates low quality query sequences.
00026 BlastParser              Parses output from blast.
00027 BlastErrorParser         Parses output and tries to diagnose possible errors.
00028 PSIBlastParser           Parses output from psi-blast.
00029 Iterator                 Iterates over a file of blast results.
00030 
00031 _Scanner                 Scans output from standalone BLAST.
00032 _BlastConsumer           Consumes output from blast.
00033 _PSIBlastConsumer        Consumes output from psi-blast.
00034 _HeaderConsumer          Consumes header information.
00035 _DescriptionConsumer     Consumes description information.
00036 _AlignmentConsumer       Consumes alignment information.
00037 _HSPConsumer             Consumes hsp information.
00038 _DatabaseReportConsumer  Consumes database report information.
00039 _ParametersConsumer      Consumes parameters information.
00040 
00041 Functions:
00042 blastall        Execute blastall (OBSOLETE).
00043 blastpgp        Execute blastpgp (OBSOLETE).
00044 rpsblast        Execute rpsblast (OBSOLETE).
00045 
00046 For calling the BLAST command line tools, we encourage you to use the
00047 command line wrappers in Bio.Blast.Applications - the three functions
00048 blastall, blastpgp and rpsblast are considered to be obsolete now, and
00049 are likely to be deprecated and then removed in future releases.
00050 """
00051 
00052 import warnings
00053 warnings.warn("The plain text parser in this module still works at the time of writing, but is considered obsolete and updating it to cope with the latest versions of BLAST is not a priority for us.", PendingDeprecationWarning)
00054 
00055 import os
00056 import re
00057 import StringIO
00058 
00059 from Bio import File
00060 from Bio.ParserSupport import *
00061 from Bio.Blast import Record
00062 from Bio.Application import _escape_filename
00063 
00064 class LowQualityBlastError(Exception):
00065     """Error caused by running a low quality sequence through BLAST.
00066 
00067     When low quality sequences (like GenBank entries containing only
00068     stretches of a single nucleotide) are BLASTed, they will result in
00069     BLAST generating an error and not being able to perform the BLAST.
00070     search. This error should be raised for the BLAST reports produced
00071     in this case.
00072     """
00073     pass
00074 
00075 class ShortQueryBlastError(Exception):
00076     """Error caused by running a short query sequence through BLAST.
00077 
00078     If the query sequence is too short, BLAST outputs warnings and errors:
00079     Searching[blastall] WARNING:  [000.000]  AT1G08320: SetUpBlastSearch failed.
00080     [blastall] ERROR:  [000.000]  AT1G08320: Blast: 
00081     [blastall] ERROR:  [000.000]  AT1G08320: Blast: Query must be at least wordsize
00082     done
00083 
00084     This exception is raised when that condition is detected.
00085 
00086     """
00087     pass
00088     
00089 
00090 class _Scanner:
00091     """Scan BLAST output from blastall or blastpgp.
00092 
00093     Tested with blastall and blastpgp v2.0.10, v2.0.11
00094 
00095     Methods:
00096     feed     Feed data into the scanner.
00097     
00098     """
00099     def feed(self, handle, consumer):
00100         """S.feed(handle, consumer)
00101 
00102         Feed in a BLAST report for scanning.  handle is a file-like
00103         object that contains the BLAST report.  consumer is a Consumer
00104         object that will receive events as the report is scanned.
00105 
00106         """
00107         if isinstance(handle, File.UndoHandle):
00108             uhandle = handle
00109         else:
00110             uhandle = File.UndoHandle(handle)
00111 
00112         # Try to fast-forward to the beginning of the blast report.
00113         read_and_call_until(uhandle, consumer.noevent, contains='BLAST')
00114         # Now scan the BLAST report.
00115         self._scan_header(uhandle, consumer)
00116         self._scan_rounds(uhandle, consumer)
00117         self._scan_database_report(uhandle, consumer)
00118         self._scan_parameters(uhandle, consumer)
00119 
00120     def _scan_header(self, uhandle, consumer):
00121         # BLASTP 2.0.10 [Aug-26-1999]
00122         # 
00123         # 
00124         # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf
00125         # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 
00126         # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea
00127         # programs",  Nucleic Acids Res. 25:3389-3402.
00128         # 
00129         # Query= test
00130         #          (140 letters)
00131         # 
00132         # Database: sdqib40-1.35.seg.fa
00133         #            1323 sequences; 223,339 total letters
00134         #
00135         # ========================================================
00136         # This next example is from the online version of Blast,
00137         # note there are TWO references, an RID line, and also
00138         # the database is BEFORE the query line.
00139         # Note there possibleuse of non-ASCII in the author names.
00140         # ========================================================
00141         #
00142         # BLASTP 2.2.15 [Oct-15-2006]
00143         # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 
00144         # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 
00145         # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 
00146         # protein database search programs", Nucleic Acids Res. 25:3389-3402.
00147         #
00148         # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 
00149         # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 
00150         # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 
00151         # protein database searches with composition-based statistics 
00152         # and other refinements", Nucleic Acids Res. 29:2994-3005. 
00153         #
00154         # RID: 1166022616-19998-65316425856.BLASTQ1
00155         # 
00156         #
00157         # Database: All non-redundant GenBank CDS
00158         # translations+PDB+SwissProt+PIR+PRF excluding environmental samples
00159         #            4,254,166 sequences; 1,462,033,012 total letters
00160         # Query=  gi:16127998
00161         # Length=428
00162         #
00163 
00164         consumer.start_header()
00165 
00166         read_and_call(uhandle, consumer.version, contains='BLAST')
00167         read_and_call_while(uhandle, consumer.noevent, blank=1)
00168 
00169         # There might be a <pre> line, for qblast output.
00170         attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
00171 
00172         # Read the reference(s)
00173         while attempt_read_and_call(uhandle,
00174                                 consumer.reference, start='Reference'):
00175             # References are normally multiline terminated by a blank line
00176             # (or, based on the old code, the RID line)
00177             while 1:
00178                 line = uhandle.readline()
00179                 if is_blank_line(line):
00180                     consumer.noevent(line)
00181                     break
00182                 elif line.startswith("RID"):
00183                     break
00184                 else:
00185                     #More of the reference
00186                     consumer.reference(line)
00187 
00188         #Deal with the optional RID: ...
00189         read_and_call_while(uhandle, consumer.noevent, blank=1)
00190         attempt_read_and_call(uhandle, consumer.reference, start="RID:")
00191         read_and_call_while(uhandle, consumer.noevent, blank=1)
00192 
00193         # blastpgp may have a reference for compositional score matrix
00194         # adjustment (see Bug 2502):
00195         if attempt_read_and_call(
00196             uhandle, consumer.reference, start="Reference"):
00197             read_and_call_until(uhandle, consumer.reference, blank=1)
00198             read_and_call_while(uhandle, consumer.noevent, blank=1)
00199 
00200         # blastpgp has a Reference for composition-based statistics.
00201         if attempt_read_and_call(
00202             uhandle, consumer.reference, start="Reference"):
00203             read_and_call_until(uhandle, consumer.reference, blank=1)
00204             read_and_call_while(uhandle, consumer.noevent, blank=1)
00205 
00206         line = uhandle.peekline()
00207         assert line.strip() != ""
00208         assert not line.startswith("RID:")
00209         if line.startswith("Query="):
00210             #This is an old style query then database...
00211 
00212             # Read the Query lines and the following blank line.
00213             read_and_call(uhandle, consumer.query_info, start='Query=')
00214             read_and_call_until(uhandle, consumer.query_info, blank=1)
00215             read_and_call_while(uhandle, consumer.noevent, blank=1)
00216 
00217             # Read the database lines and the following blank line.
00218             read_and_call_until(uhandle, consumer.database_info, end='total letters')
00219             read_and_call(uhandle, consumer.database_info, contains='sequences')
00220             read_and_call_while(uhandle, consumer.noevent, blank=1)
00221         elif line.startswith("Database:"):
00222             #This is a new style database then query...
00223             read_and_call_until(uhandle, consumer.database_info, end='total letters')
00224             read_and_call(uhandle, consumer.database_info, contains='sequences')
00225             read_and_call_while(uhandle, consumer.noevent, blank=1)
00226 
00227             # Read the Query lines and the following blank line.
00228             # Or, on BLAST 2.2.22+ there is no blank link - need to spot
00229             # the "... Score     E" line instead.
00230             read_and_call(uhandle, consumer.query_info, start='Query=')
00231             # BLAST 2.2.25+ has a blank line before Length=
00232             read_and_call_until(uhandle, consumer.query_info, start='Length=')
00233             while True:
00234                 line = uhandle.peekline()
00235                 if not line.strip() : break
00236                 if "Score     E" in line : break
00237                 #It is more of the query (and its length)
00238                 read_and_call(uhandle, consumer.query_info)
00239             read_and_call_while(uhandle, consumer.noevent, blank=1)
00240         else:
00241             raise ValueError("Invalid header?")
00242 
00243         consumer.end_header()
00244 
00245     def _scan_rounds(self, uhandle, consumer):
00246         # Scan a bunch of rounds.
00247         # Each round begins with either a "Searching......" line
00248         # or a 'Score     E' line followed by descriptions and alignments.
00249         # The email server doesn't give the "Searching....." line.
00250         # If there is no 'Searching.....' line then you'll first see a 
00251         # 'Results from round' line
00252 
00253         while not self._eof(uhandle):
00254             line = safe_peekline(uhandle)
00255             if (not line.startswith('Searching') and
00256                 not line.startswith('Results from round') and
00257                 re.search(r"Score +E", line) is None and
00258                 line.find('No hits found') == -1):
00259                 break
00260             self._scan_descriptions(uhandle, consumer)
00261             self._scan_alignments(uhandle, consumer)
00262 
00263     def _scan_descriptions(self, uhandle, consumer):
00264         # Searching..................................................done
00265         # Results from round 2
00266         # 
00267         # 
00268         #                                                                    Sc
00269         # Sequences producing significant alignments:                        (b
00270         # Sequences used in model and found again:
00271         # 
00272         # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ...   
00273         # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B...   
00274         # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)]        
00275         # 
00276         # Sequences not found previously or not previously below threshold:
00277         # 
00278         # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia]              
00279         # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb...   
00280         #
00281 
00282         # If PSI-BLAST, may also have:
00283         #
00284         # CONVERGED!
00285 
00286         consumer.start_descriptions()
00287 
00288         # Read 'Searching'
00289         # This line seems to be missing in BLASTN 2.1.2 (others?)
00290         attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
00291 
00292         # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here.
00293         # If this happens, the handle will yield no more information.
00294         if not uhandle.peekline():
00295             raise ValueError("Unexpected end of blast report.  " + \
00296                   "Looks suspiciously like a PSI-BLAST crash.")
00297 
00298         # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here:
00299         # Searching[blastall] WARNING:  [000.000]  AT1G08320: SetUpBlastSearch 
00300         # [blastall] ERROR:  [000.000]  AT1G08320: Blast: 
00301         # [blastall] ERROR:  [000.000]  AT1G08320: Blast: Query must be at leas
00302         # done 
00303         # Reported by David Weisman.
00304         # Check for these error lines and ignore them for now.  Let
00305         # the BlastErrorParser deal with them.
00306         line = uhandle.peekline()
00307         if line.find("ERROR:") != -1 or line.startswith("done"):
00308             read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
00309             read_and_call(uhandle, consumer.noevent, start="done")
00310 
00311         # Check to see if this is PSI-BLAST.
00312         # If it is, the 'Searching' line will be followed by:
00313         # (version 2.0.10)
00314         #     Searching.............................
00315         #     Results from round 2
00316         # or (version 2.0.11)
00317         #     Searching.............................
00318         #
00319         #
00320         #     Results from round 2
00321         
00322         # Skip a bunch of blank lines.
00323         read_and_call_while(uhandle, consumer.noevent, blank=1)
00324         # Check for the results line if it's there.
00325         if attempt_read_and_call(uhandle, consumer.round, start='Results'):
00326             read_and_call_while(uhandle, consumer.noevent, blank=1)
00327         
00328         # Three things can happen here:
00329         # 1.  line contains 'Score     E'
00330         # 2.  line contains "No hits found"
00331         # 3.  no descriptions
00332         # The first one begins a bunch of descriptions.  The last two
00333         # indicates that no descriptions follow, and we should go straight
00334         # to the alignments.
00335         if not attempt_read_and_call(
00336             uhandle, consumer.description_header,
00337             has_re=re.compile(r'Score +E')):
00338             # Either case 2 or 3.  Look for "No hits found".
00339             attempt_read_and_call(uhandle, consumer.no_hits,
00340                                   contains='No hits found')
00341             try:
00342                 read_and_call_while(uhandle, consumer.noevent, blank=1)
00343             except ValueError, err:
00344                 if str(err) != "Unexpected end of stream." : raise err
00345 
00346             consumer.end_descriptions()
00347             # Stop processing.
00348             return
00349 
00350         # Read the score header lines
00351         read_and_call(uhandle, consumer.description_header,
00352                       start='Sequences producing')
00353 
00354         # If PSI-BLAST, read the 'Sequences used in model' line.
00355         attempt_read_and_call(uhandle, consumer.model_sequences,
00356                               start='Sequences used in model')
00357         read_and_call_while(uhandle, consumer.noevent, blank=1)
00358 
00359         # In BLAT, rather than a "No hits found" line, we just
00360         # get no descriptions (and no alignments). This can be
00361         # spotted because the next line is the database block:
00362         if safe_peekline(uhandle).startswith("  Database:"):
00363             consumer.end_descriptions()
00364             # Stop processing.
00365             return
00366 
00367         # Read the descriptions and the following blank lines, making
00368         # sure that there are descriptions.
00369         if not uhandle.peekline().startswith('Sequences not found'):
00370             read_and_call_until(uhandle, consumer.description, blank=1)
00371             read_and_call_while(uhandle, consumer.noevent, blank=1)
00372 
00373         # If PSI-BLAST, read the 'Sequences not found' line followed
00374         # by more descriptions.  However, I need to watch out for the
00375         # case where there were no sequences not found previously, in
00376         # which case there will be no more descriptions.
00377         if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
00378                                  start='Sequences not found'):
00379             # Read the descriptions and the following blank lines.
00380             read_and_call_while(uhandle, consumer.noevent, blank=1)
00381             l = safe_peekline(uhandle)
00382             # Brad -- added check for QUERY. On some PSI-BLAST outputs
00383             # there will be a 'Sequences not found' line followed by no
00384             # descriptions. Check for this case since the first thing you'll
00385             # get is a blank line and then 'QUERY'
00386             if not l.startswith('CONVERGED') and l[0] != '>' \
00387                     and not l.startswith('QUERY'):
00388                 read_and_call_until(uhandle, consumer.description, blank=1)
00389                 read_and_call_while(uhandle, consumer.noevent, blank=1)
00390 
00391         attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
00392         read_and_call_while(uhandle, consumer.noevent, blank=1)
00393 
00394         consumer.end_descriptions()
00395 
00396     def _scan_alignments(self, uhandle, consumer):
00397         if self._eof(uhandle) : return
00398         
00399         # qblast inserts a helpful line here.
00400         attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS")
00401 
00402         # First, check to see if I'm at the database report.
00403         line = safe_peekline(uhandle)
00404         if not line:
00405             #EOF
00406             return
00407         elif line.startswith('  Database') or line.startswith("Lambda"):
00408             return
00409         elif line[0] == '>':
00410             # XXX make a better check here between pairwise and masterslave
00411             self._scan_pairwise_alignments(uhandle, consumer)
00412         else:
00413             # XXX put in a check to make sure I'm in a masterslave alignment
00414             self._scan_masterslave_alignment(uhandle, consumer)
00415 
00416     def _scan_pairwise_alignments(self, uhandle, consumer):
00417         while not self._eof(uhandle):
00418             line = safe_peekline(uhandle)
00419             if line[0] != '>':
00420                 break
00421             self._scan_one_pairwise_alignment(uhandle, consumer)
00422 
00423     def _scan_one_pairwise_alignment(self, uhandle, consumer):
00424         if self._eof(uhandle) : return
00425         consumer.start_alignment()
00426 
00427         self._scan_alignment_header(uhandle, consumer)
00428 
00429         # Scan a bunch of score/alignment pairs.
00430         while 1:
00431             if self._eof(uhandle):
00432                 #Shouldn't have issued that _scan_alignment_header event...
00433                 break
00434             line = safe_peekline(uhandle)
00435             if not line.startswith(' Score'):
00436                 break
00437             self._scan_hsp(uhandle, consumer)
00438         consumer.end_alignment()
00439 
00440     def _scan_alignment_header(self, uhandle, consumer):
00441         # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus
00442         #           stearothermophilus]
00443         #           Length = 81
00444         #
00445         # Or, more recently with different white space:
00446         #
00447         # >gi|15799684|ref|NP_285696.1| threonine synthase ...
00448         #  gi|15829258|ref|NP_308031.1| threonine synthase 
00449         #  ...
00450         # Length=428
00451         read_and_call(uhandle, consumer.title, start='>')
00452         while 1:
00453             line = safe_readline(uhandle)
00454             if line.lstrip().startswith('Length =') \
00455             or line.lstrip().startswith('Length='):
00456                 consumer.length(line)
00457                 break
00458             elif is_blank_line(line):
00459                 # Check to make sure I haven't missed the Length line
00460                 raise ValueError("I missed the Length in an alignment header")
00461             consumer.title(line)
00462 
00463         # Older versions of BLAST will have a line with some spaces.
00464         # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line.
00465         if not attempt_read_and_call(uhandle, consumer.noevent,
00466                                      start='          '):
00467             read_and_call(uhandle, consumer.noevent, blank=1)
00468 
00469     def _scan_hsp(self, uhandle, consumer):
00470         consumer.start_hsp()
00471         self._scan_hsp_header(uhandle, consumer)
00472         self._scan_hsp_alignment(uhandle, consumer)
00473         consumer.end_hsp()
00474         
00475     def _scan_hsp_header(self, uhandle, consumer):
00476         #  Score = 22.7 bits (47), Expect = 2.5
00477         #  Identities = 10/36 (27%), Positives = 18/36 (49%)
00478         #  Strand = Plus / Plus
00479         #  Frame = +3
00480         #
00481 
00482         read_and_call(uhandle, consumer.score, start=' Score')
00483         read_and_call(uhandle, consumer.identities, start=' Identities')
00484         # BLASTN
00485         attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
00486         # BLASTX, TBLASTN, TBLASTX
00487         attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
00488         read_and_call(uhandle, consumer.noevent, blank=1)
00489 
00490     def _scan_hsp_alignment(self, uhandle, consumer):
00491         # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF
00492         #           GRGVS+         TC    Y  + + V GGG+ + EE   L     +   I R+
00493         # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG
00494         # 
00495         # Query: 64 AEKILIKR 71
00496         #              I +K 
00497         # Sbjct: 70 PNIIQLKD 77
00498         # 
00499 
00500         while 1:
00501             # Blastn adds an extra line filled with spaces before Query
00502             attempt_read_and_call(uhandle, consumer.noevent, start='     ')
00503             read_and_call(uhandle, consumer.query, start='Query')
00504             read_and_call(uhandle, consumer.align, start='     ')
00505             read_and_call(uhandle, consumer.sbjct, start='Sbjct')
00506             try:
00507                 read_and_call_while(uhandle, consumer.noevent, blank=1)
00508             except ValueError, err:
00509                 if str(err) != "Unexpected end of stream." : raise err
00510                 # End of File (well, it looks like it with recent versions
00511                 # of BLAST for multiple queries after the Iterator class
00512                 # has broken up the whole file into chunks).
00513                 break
00514             line = safe_peekline(uhandle)
00515             # Alignment continues if I see a 'Query' or the spaces for Blastn.
00516             if not (line.startswith('Query') or line.startswith('     ')):
00517                 break
00518  
00519     def _scan_masterslave_alignment(self, uhandle, consumer):
00520         consumer.start_alignment()
00521         while 1:
00522             line = safe_readline(uhandle)
00523             # Check to see whether I'm finished reading the alignment.
00524             # This is indicated by 1) database section, 2) next psi-blast
00525             # round, which can also be a 'Results from round' if no 
00526             # searching line is present
00527             # patch by chapmanb
00528             if line.startswith('Searching') or \
00529                     line.startswith('Results from round'):
00530                 uhandle.saveline(line)
00531                 break
00532             elif line.startswith('  Database'):
00533                 uhandle.saveline(line)
00534                 break
00535             elif is_blank_line(line):
00536                 consumer.noevent(line)
00537             else:
00538                 consumer.multalign(line)
00539         read_and_call_while(uhandle, consumer.noevent, blank=1)
00540         consumer.end_alignment()
00541 
00542     def _eof(self, uhandle):
00543         try:
00544             line = safe_peekline(uhandle)
00545         except ValueError, err:
00546             if str(err) != "Unexpected end of stream." : raise err
00547             line = ""
00548         return not line
00549 
00550     def _scan_database_report(self, uhandle, consumer):
00551         #   Database: sdqib40-1.35.seg.fa
00552         #     Posted date:  Nov 1, 1999  4:25 PM
00553         #   Number of letters in database: 223,339
00554         #   Number of sequences in database:  1323
00555         #   
00556         # Lambda     K      H
00557         #    0.322    0.133    0.369 
00558         #
00559         # Gapped
00560         # Lambda     K      H
00561         #    0.270   0.0470    0.230 
00562         #
00563         ##########################################
00564         # Or, more recently Blast 2.2.15 gives less blank lines
00565         ##########################################
00566         #   Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 
00567         # environmental samples
00568         #     Posted date:  Dec 12, 2006  5:51 PM
00569         #   Number of letters in database: 667,088,753
00570         #   Number of sequences in database:  2,094,974
00571         # Lambda     K      H
00572         #    0.319    0.136    0.395 
00573         # Gapped
00574         # Lambda     K      H
00575         #    0.267   0.0410    0.140
00576 
00577         if self._eof(uhandle) : return
00578 
00579         consumer.start_database_report()
00580         
00581         # Subset of the database(s) listed below
00582         #    Number of letters searched: 562,618,960
00583         #    Number of sequences searched:  228,924
00584         if attempt_read_and_call(uhandle, consumer.noevent, start="  Subset"):
00585             read_and_call(uhandle, consumer.noevent, contains="letters")
00586             read_and_call(uhandle, consumer.noevent, contains="sequences")
00587             read_and_call(uhandle, consumer.noevent, start="  ")
00588 
00589         # Sameet Mehta reported seeing output from BLASTN 2.2.9 that
00590         # was missing the "Database" stanza completely.
00591         while attempt_read_and_call(uhandle, consumer.database,
00592                 start='  Database'):
00593             # BLAT output ends abruptly here, without any of the other
00594             # information.  Check to see if this is the case.  If so,
00595             # then end the database report here gracefully.
00596             if not uhandle.peekline().strip() \
00597             or uhandle.peekline().startswith("BLAST"):
00598                 consumer.end_database_report()
00599                 return
00600             
00601             # Database can span multiple lines.
00602             read_and_call_until(uhandle, consumer.database, start='    Posted')
00603             read_and_call(uhandle, consumer.posted_date, start='    Posted')
00604             read_and_call(uhandle, consumer.num_letters_in_database,
00605                        start='  Number of letters')
00606             read_and_call(uhandle, consumer.num_sequences_in_database,
00607                        start='  Number of sequences')
00608             #There may not be a line starting with spaces...
00609             attempt_read_and_call(uhandle, consumer.noevent, start='  ')
00610 
00611             line = safe_readline(uhandle)
00612             uhandle.saveline(line)
00613             if line.find('Lambda') != -1:
00614                 break
00615 
00616         read_and_call(uhandle, consumer.noevent, start='Lambda')
00617         read_and_call(uhandle, consumer.ka_params)
00618 
00619         #This blank line is optional:
00620         attempt_read_and_call(uhandle, consumer.noevent, blank=1)
00621 
00622         # not BLASTP
00623         attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
00624         # not TBLASTX
00625         if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
00626             read_and_call(uhandle, consumer.ka_params_gap)
00627             
00628         # Blast 2.2.4 can sometimes skip the whole parameter section.
00629         # Thus, I need to be careful not to read past the end of the
00630         # file.
00631         try:
00632             read_and_call_while(uhandle, consumer.noevent, blank=1)
00633         except ValueError, x:
00634             if str(x) != "Unexpected end of stream.":
00635                 raise
00636         consumer.end_database_report()
00637 
00638     def _scan_parameters(self, uhandle, consumer):
00639         # Matrix: BLOSUM62
00640         # Gap Penalties: Existence: 11, Extension: 1
00641         # Number of Hits to DB: 50604
00642         # Number of Sequences: 1323
00643         # Number of extensions: 1526
00644         # Number of successful extensions: 6
00645         # Number of sequences better than 10.0: 5
00646         # Number of HSP's better than 10.0 without gapping: 5
00647         # Number of HSP's successfully gapped in prelim test: 0
00648         # Number of HSP's that attempted gapping in prelim test: 1
00649         # Number of HSP's gapped (non-prelim): 5
00650         # length of query: 140
00651         # length of database: 223,339
00652         # effective HSP length: 39
00653         # effective length of query: 101
00654         # effective length of database: 171,742
00655         # effective search space: 17345942
00656         # effective search space used: 17345942
00657         # T: 11
00658         # A: 40
00659         # X1: 16 ( 7.4 bits)
00660         # X2: 38 (14.8 bits)
00661         # X3: 64 (24.9 bits)
00662         # S1: 41 (21.9 bits)
00663         # S2: 42 (20.8 bits)
00664         ##########################################
00665         # Or, more recently Blast(x) 2.2.15 gives
00666         ##########################################
00667         # Matrix: BLOSUM62
00668         # Gap Penalties: Existence: 11, Extension: 1
00669         # Number of Sequences: 4535438
00670         # Number of Hits to DB: 2,588,844,100
00671         # Number of extensions: 60427286
00672         # Number of successful extensions: 126433
00673         # Number of sequences better than  2.0: 30
00674         # Number of HSP's gapped: 126387
00675         # Number of HSP's successfully gapped: 35
00676         # Length of query: 291
00677         # Length of database: 1,573,298,872
00678         # Length adjustment: 130
00679         # Effective length of query: 161
00680         # Effective length of database: 983,691,932
00681         # Effective search space: 158374401052
00682         # Effective search space used: 158374401052
00683         # Neighboring words threshold: 12
00684         # Window for multiple hits: 40
00685         # X1: 16 ( 7.3 bits)
00686         # X2: 38 (14.6 bits)
00687         # X3: 64 (24.7 bits)
00688         # S1: 41 (21.7 bits)
00689         # S2: 32 (16.9 bits)
00690 
00691 
00692         # Blast 2.2.4 can sometimes skip the whole parameter section.
00693         # BLAT also skips the whole parameter section.
00694         # Thus, check to make sure that the parameter section really
00695         # exists.
00696         if not uhandle.peekline().strip():
00697             return
00698 
00699         # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and
00700         # "Number of Sequences" lines.
00701         consumer.start_parameters()
00702 
00703         # Matrix line may be missing in BLASTN 2.2.9
00704         attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
00705         # not TBLASTX
00706         attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
00707 
00708         attempt_read_and_call(uhandle, consumer.num_sequences,
00709                               start='Number of Sequences')
00710         attempt_read_and_call(uhandle, consumer.num_hits,
00711                       start='Number of Hits')
00712         attempt_read_and_call(uhandle, consumer.num_sequences,
00713                               start='Number of Sequences')
00714         attempt_read_and_call(uhandle, consumer.num_extends,
00715                       start='Number of extensions')
00716         attempt_read_and_call(uhandle, consumer.num_good_extends,
00717                       start='Number of successful')
00718 
00719         attempt_read_and_call(uhandle, consumer.num_seqs_better_e,
00720                       start='Number of sequences')
00721 
00722         # not BLASTN, TBLASTX
00723         if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
00724                                  start="Number of HSP's better"):
00725             # BLASTN 2.2.9
00726             if attempt_read_and_call(uhandle, consumer.noevent,
00727                                      start="Number of HSP's gapped:"):
00728                 read_and_call(uhandle, consumer.noevent,
00729                               start="Number of HSP's successfully")
00730                 #This is ommitted in 2.2.15
00731                 attempt_read_and_call(uhandle, consumer.noevent,
00732                               start="Number of extra gapped extensions")
00733             else:
00734                 read_and_call(uhandle, consumer.hsps_prelim_gapped,
00735                               start="Number of HSP's successfully")
00736                 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
00737                               start="Number of HSP's that")
00738                 read_and_call(uhandle, consumer.hsps_gapped,
00739                               start="Number of HSP's gapped")
00740         #e.g. BLASTX 2.2.15 where the "better" line is missing
00741         elif attempt_read_and_call(uhandle, consumer.noevent,
00742                                      start="Number of HSP's gapped"):
00743             read_and_call(uhandle, consumer.noevent,
00744                           start="Number of HSP's successfully")
00745 
00746         # not in blastx 2.2.1
00747         attempt_read_and_call(uhandle, consumer.query_length,
00748                               has_re=re.compile(r"[Ll]ength of query"))
00749         # Not in BLASTX 2.2.22+
00750         attempt_read_and_call(uhandle, consumer.database_length,
00751                           has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
00752 
00753         # BLASTN 2.2.9
00754         attempt_read_and_call(uhandle, consumer.noevent,
00755                               start="Length adjustment")
00756         attempt_read_and_call(uhandle, consumer.effective_hsp_length,
00757                               start='effective HSP')
00758         # Not in blastx 2.2.1
00759         attempt_read_and_call(
00760             uhandle, consumer.effective_query_length,
00761             has_re=re.compile(r'[Ee]ffective length of query'))
00762 
00763         # This is not in BLASTP 2.2.15
00764         attempt_read_and_call(
00765             uhandle, consumer.effective_database_length,
00766             has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
00767         # Not in blastx 2.2.1, added a ':' to distinguish between
00768         # this and the 'effective search space used' line
00769         attempt_read_and_call(
00770             uhandle, consumer.effective_search_space,
00771             has_re=re.compile(r'[Ee]ffective search space:'))
00772         # Does not appear in BLASTP 2.0.5
00773         attempt_read_and_call(
00774             uhandle, consumer.effective_search_space_used,
00775             has_re=re.compile(r'[Ee]ffective search space used'))
00776 
00777         # BLASTX, TBLASTN, TBLASTX
00778         attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
00779 
00780         # not in BLASTN 2.2.9
00781         attempt_read_and_call(uhandle, consumer.threshold, start='T')
00782         # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12"
00783         attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
00784 
00785         # not in BLASTX 2.2.15
00786         attempt_read_and_call(uhandle, consumer.window_size, start='A')
00787         # get this instead: "Window for multiple hits: 40"
00788         attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
00789 
00790         # not in BLASTX 2.2.22+        
00791         attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
00792         # not TBLASTN
00793         attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
00794 
00795         # not BLASTN, TBLASTX
00796         attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
00797                               start='X3')
00798 
00799         # not TBLASTN
00800         attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1')
00801         # not in blastx 2.2.1
00802         # first we make sure we have additional lines to work with, if
00803         # not then the file is done and we don't have a final S2
00804         if not is_blank_line(uhandle.peekline(), allow_spaces=1):
00805             read_and_call(uhandle, consumer.blast_cutoff, start='S2')
00806 
00807         consumer.end_parameters()
00808 
00809 class BlastParser(AbstractParser):
00810     """Parses BLAST data into a Record.Blast object.
00811 
00812     """
00813     def __init__(self):
00814         """__init__(self)"""
00815         self._scanner = _Scanner()
00816         self._consumer = _BlastConsumer()
00817 
00818     def parse(self, handle):
00819         """parse(self, handle)"""
00820         self._scanner.feed(handle, self._consumer)
00821         return self._consumer.data
00822 
00823 class PSIBlastParser(AbstractParser):
00824     """Parses BLAST data into a Record.PSIBlast object.
00825 
00826     """
00827     def __init__(self):
00828         """__init__(self)"""
00829         self._scanner = _Scanner()
00830         self._consumer = _PSIBlastConsumer()
00831 
00832     def parse(self, handle):
00833         """parse(self, handle)"""
00834         self._scanner.feed(handle, self._consumer)
00835         return self._consumer.data
00836 
00837 class _HeaderConsumer:
00838     def start_header(self):
00839         self._header = Record.Header()
00840         
00841     def version(self, line):
00842         c = line.split()
00843         self._header.application = c[0]
00844         self._header.version = c[1]
00845         if len(c) > 2:
00846             #The date is missing in the new C++ output from blastx 2.2.22+
00847             #Just get "BLASTX 2.2.22+\n" and that's all.
00848             self._header.date = c[2][1:-1]
00849 
00850     def reference(self, line):
00851         if line.startswith('Reference: '):
00852             self._header.reference = line[11:]
00853         else:
00854             self._header.reference = self._header.reference + line
00855             
00856     def query_info(self, line):
00857         if line.startswith('Query= '):
00858             self._header.query = line[7:].lstrip()
00859         elif line.startswith('Length='):
00860             #New style way to give the query length in BLAST 2.2.22+ (the C++ code)
00861             self._header.query_letters = _safe_int(line[7:].strip())
00862         elif not line.startswith('       '):  # continuation of query_info
00863             self._header.query = "%s%s" % (self._header.query, line)
00864         else:
00865             #Hope it is the old style way to give the query length:
00866             letters, = _re_search(
00867                 r"([0-9,]+) letters", line,
00868                 "I could not find the number of letters in line\n%s" % line)
00869             self._header.query_letters = _safe_int(letters)
00870                 
00871     def database_info(self, line):
00872         line = line.rstrip()
00873         if line.startswith('Database: '):
00874             self._header.database = line[10:]
00875         elif not line.endswith('total letters'):
00876             if self._header.database:
00877                 #Need to include a space when merging multi line datase descr
00878                 self._header.database = self._header.database + " " + line.strip()
00879             else:
00880                 self._header.database = line.strip()                
00881         else:
00882             sequences, letters =_re_search(
00883                 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
00884                 "I could not find the sequences and letters in line\n%s" %line)
00885             self._header.database_sequences = _safe_int(sequences)
00886             self._header.database_letters = _safe_int(letters)
00887 
00888     def end_header(self):
00889         # Get rid of the trailing newlines
00890         self._header.reference = self._header.reference.rstrip()
00891         self._header.query = self._header.query.rstrip()
00892 
00893 class _DescriptionConsumer:
00894     def start_descriptions(self):
00895         self._descriptions = []
00896         self._model_sequences = []
00897         self._nonmodel_sequences = []
00898         self._converged = 0
00899         self._type = None
00900         self._roundnum = None
00901 
00902         self.__has_n = 0   # Does the description line contain an N value?
00903 
00904     def description_header(self, line):
00905         if line.startswith('Sequences producing'):
00906             cols = line.split()
00907             if cols[-1] == 'N':
00908                 self.__has_n = 1
00909     
00910     def description(self, line):
00911         dh = self._parse(line)
00912         if self._type == 'model':
00913             self._model_sequences.append(dh)
00914         elif self._type == 'nonmodel':
00915             self._nonmodel_sequences.append(dh)
00916         else:
00917             self._descriptions.append(dh)
00918 
00919     def model_sequences(self, line):
00920         self._type = 'model'
00921 
00922     def nonmodel_sequences(self, line):
00923         self._type = 'nonmodel'
00924 
00925     def converged(self, line):
00926         self._converged = 1
00927 
00928     def no_hits(self, line):
00929         pass
00930 
00931     def round(self, line):
00932         if not line.startswith('Results from round'):
00933             raise ValueError("I didn't understand the round line\n%s" % line)
00934         self._roundnum = _safe_int(line[18:].strip())
00935 
00936     def end_descriptions(self):
00937         pass
00938 
00939     def _parse(self, description_line):
00940         line = description_line  # for convenience
00941         dh = Record.Description()
00942         
00943         # I need to separate the score and p-value from the title.
00944         # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77
00945         # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77  1
00946         # special cases to handle:
00947         #   - title must be preserved exactly (including whitespaces)
00948         #   - score could be equal to e-value (not likely, but what if??)
00949         #   - sometimes there's an "N" score of '1'.
00950         cols = line.split()
00951         if len(cols) < 3:
00952             raise ValueError( \
00953                   "Line does not appear to contain description:\n%s" % line)
00954         if self.__has_n:
00955             i = line.rfind(cols[-1])        # find start of N
00956             i = line.rfind(cols[-2], 0, i)  # find start of p-value
00957             i = line.rfind(cols[-3], 0, i)  # find start of score
00958         else:
00959             i = line.rfind(cols[-1])        # find start of p-value
00960             i = line.rfind(cols[-2], 0, i)  # find start of score
00961         if self.__has_n:
00962             dh.title, dh.score, dh.e, dh.num_alignments = \
00963                       line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
00964         else:
00965             dh.title, dh.score, dh.e, dh.num_alignments = \
00966                       line[:i].rstrip(), cols[-2], cols[-1], 1
00967         dh.num_alignments = _safe_int(dh.num_alignments)
00968         dh.score = _safe_int(dh.score)
00969         dh.e = _safe_float(dh.e)
00970         return dh
00971 
00972 class _AlignmentConsumer:
00973     # This is a little bit tricky.  An alignment can either be a
00974     # pairwise alignment or a multiple alignment.  Since it's difficult
00975     # to know a-priori which one the blast record will contain, I'm going
00976     # to make one class that can parse both of them.
00977     def start_alignment(self):
00978         self._alignment = Record.Alignment()
00979         self._multiple_alignment = Record.MultipleAlignment()
00980 
00981     def title(self, line):
00982         if self._alignment.title:
00983             self._alignment.title += " "
00984         self._alignment.title += line.strip()
00985 
00986     def length(self, line):
00987         #e.g. "Length = 81" or more recently, "Length=428"
00988         parts = line.replace(" ","").split("=")
00989         assert len(parts)==2, "Unrecognised format length line"
00990         self._alignment.length = parts[1]
00991         self._alignment.length = _safe_int(self._alignment.length)
00992 
00993     def multalign(self, line):
00994         # Standalone version uses 'QUERY', while WWW version uses blast_tmp.
00995         if line.startswith('QUERY') or line.startswith('blast_tmp'):
00996             # If this is the first line of the multiple alignment,
00997             # then I need to figure out how the line is formatted.
00998             
00999             # Format of line is:
01000             # QUERY 1   acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
01001             try:
01002                 name, start, seq, end = line.split()
01003             except ValueError:
01004                 raise ValueError("I do not understand the line\n%s" % line)
01005             self._start_index = line.index(start, len(name))
01006             self._seq_index = line.index(seq,
01007                                          self._start_index+len(start))
01008             # subtract 1 for the space
01009             self._name_length = self._start_index - 1
01010             self._start_length = self._seq_index - self._start_index - 1
01011             self._seq_length = line.rfind(end) - self._seq_index - 1
01012             
01013             #self._seq_index = line.index(seq)
01014             ## subtract 1 for the space
01015             #self._seq_length = line.rfind(end) - self._seq_index - 1
01016             #self._start_index = line.index(start)
01017             #self._start_length = self._seq_index - self._start_index - 1
01018             #self._name_length = self._start_index
01019 
01020         # Extract the information from the line
01021         name = line[:self._name_length]
01022         name = name.rstrip()
01023         start = line[self._start_index:self._start_index+self._start_length]
01024         start = start.rstrip()
01025         if start:
01026             start = _safe_int(start)
01027         end = line[self._seq_index+self._seq_length:].rstrip()
01028         if end:
01029             end = _safe_int(end)
01030         seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
01031         # right pad the sequence with spaces if necessary
01032         if len(seq) < self._seq_length:
01033             seq = seq + ' '*(self._seq_length-len(seq))
01034             
01035         # I need to make sure the sequence is aligned correctly with the query.
01036         # First, I will find the length of the query.  Then, if necessary,
01037         # I will pad my current sequence with spaces so that they will line
01038         # up correctly.
01039 
01040         # Two possible things can happen:
01041         # QUERY
01042         # 504
01043         #
01044         # QUERY
01045         # 403
01046         #
01047         # Sequence 504 will need padding at the end.  Since I won't know
01048         # this until the end of the alignment, this will be handled in
01049         # end_alignment.
01050         # Sequence 403 will need padding before being added to the alignment.
01051 
01052         align = self._multiple_alignment.alignment  # for convenience
01053         align.append((name, start, seq, end))
01054 
01055         # This is old code that tried to line up all the sequences
01056         # in a multiple alignment by using the sequence title's as
01057         # identifiers.  The problem with this is that BLAST assigns
01058         # different HSP's from the same sequence the same id.  Thus,
01059         # in one alignment block, there may be multiple sequences with
01060         # the same id.  I'm not sure how to handle this, so I'm not
01061         # going to.
01062         
01063         # # If the sequence is the query, then just add it.
01064         # if name == 'QUERY':
01065         #     if len(align) == 0:
01066         #         align.append((name, start, seq))
01067         #     else:
01068         #         aname, astart, aseq = align[0]
01069         #         if name != aname:
01070         #             raise ValueError, "Query is not the first sequence"
01071         #         aseq = aseq + seq
01072         #         align[0] = aname, astart, aseq
01073         # else:
01074         #     if len(align) == 0:
01075         #         raise ValueError, "I could not find the query sequence"
01076         #     qname, qstart, qseq = align[0]
01077         #     
01078         #     # Now find my sequence in the multiple alignment.
01079         #     for i in range(1, len(align)):
01080         #         aname, astart, aseq = align[i]
01081         #         if name == aname:
01082         #             index = i
01083         #             break
01084         #     else:
01085         #         # If I couldn't find it, then add a new one.
01086         #         align.append((None, None, None))
01087         #         index = len(align)-1
01088         #         # Make sure to left-pad it.
01089         #         aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq))
01090         # 
01091         #     if len(qseq) != len(aseq) + len(seq):
01092         #         # If my sequences are shorter than the query sequence,
01093         #         # then I will need to pad some spaces to make them line up.
01094         #         # Since I've already right padded seq, that means aseq
01095         #         # must be too short.
01096         #         aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq))
01097         #     aseq = aseq + seq
01098         #     if astart is None:
01099         #         astart = start
01100         #     align[index] = aname, astart, aseq
01101 
01102     def end_alignment(self):
01103         # Remove trailing newlines
01104         if self._alignment:
01105             self._alignment.title = self._alignment.title.rstrip()
01106 
01107         # This code is also obsolete.  See note above.
01108         # If there's a multiple alignment, I will need to make sure
01109         # all the sequences are aligned.  That is, I may need to
01110         # right-pad the sequences.
01111         # if self._multiple_alignment is not None:
01112         #     align = self._multiple_alignment.alignment
01113         #     seqlen = None
01114         #     for i in range(len(align)):
01115         #         name, start, seq = align[i]
01116         #         if seqlen is None:
01117         #             seqlen = len(seq)
01118         #         else:
01119         #             if len(seq) < seqlen:
01120         #                 seq = seq + ' '*(seqlen - len(seq))
01121         #                 align[i] = name, start, seq
01122         #             elif len(seq) > seqlen:
01123         #                 raise ValueError, \
01124         #                       "Sequence %s is longer than the query" % name
01125         
01126         # Clean up some variables, if they exist.
01127         try:
01128             del self._seq_index
01129             del self._seq_length
01130             del self._start_index
01131             del self._start_length
01132             del self._name_length
01133         except AttributeError:
01134             pass
01135 
01136 class _HSPConsumer:
01137     def start_hsp(self):
01138         self._hsp = Record.HSP()
01139 
01140     def score(self, line):
01141         self._hsp.bits, self._hsp.score = _re_search(
01142             r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
01143             "I could not find the score in line\n%s" % line)
01144         self._hsp.score = _safe_float(self._hsp.score)
01145         self._hsp.bits = _safe_float(self._hsp.bits)
01146 
01147         x, y = _re_search(
01148             r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
01149             "I could not find the expect in line\n%s" % line)
01150         if x:
01151             self._hsp.num_alignments = _safe_int(x)
01152         else:
01153             self._hsp.num_alignments = 1
01154         self._hsp.expect = _safe_float(y)
01155 
01156     def identities(self, line):
01157         x, y = _re_search(
01158             r"Identities = (\d+)\/(\d+)", line,
01159             "I could not find the identities in line\n%s" % line)
01160         self._hsp.identities = _safe_int(x), _safe_int(y)
01161         self._hsp.align_length = _safe_int(y)
01162 
01163         if line.find('Positives') != -1:
01164             x, y = _re_search(
01165                 r"Positives = (\d+)\/(\d+)", line,
01166                 "I could not find the positives in line\n%s" % line)
01167             self._hsp.positives = _safe_int(x), _safe_int(y)
01168             assert self._hsp.align_length == _safe_int(y)
01169 
01170         if line.find('Gaps') != -1:
01171             x, y = _re_search(
01172                 r"Gaps = (\d+)\/(\d+)", line,
01173                 "I could not find the gaps in line\n%s" % line)
01174             self._hsp.gaps = _safe_int(x), _safe_int(y)
01175             assert self._hsp.align_length == _safe_int(y)
01176 
01177         
01178     def strand(self, line):
01179         self._hsp.strand = _re_search(
01180             r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line,
01181             "I could not find the strand in line\n%s" % line)
01182 
01183     def frame(self, line):
01184         # Frame can be in formats:
01185         # Frame = +1
01186         # Frame = +2 / +2
01187         if line.find('/') != -1:
01188             self._hsp.frame = _re_search(
01189                 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line,
01190                 "I could not find the frame in line\n%s" % line)
01191         else:
01192             self._hsp.frame = _re_search(
01193                 r"Frame = ([-+][123])", line,
01194                 "I could not find the frame in line\n%s" % line)
01195 
01196     # Match a space, if one is available.  Masahir Ishikawa found a
01197     # case where there's no space between the start and the sequence:
01198     # Query: 100tt 101
01199     # line below modified by Yair Benita, Sep 2004
01200     # Note that the colon is not always present. 2006
01201     _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
01202     def query(self, line):
01203         m = self._query_re.search(line)
01204         if m is None:
01205             raise ValueError("I could not find the query in line\n%s" % line)
01206         
01207         # line below modified by Yair Benita, Sep 2004.
01208         # added the end attribute for the query
01209         colon, start, seq, end = m.groups()
01210         self._hsp.query = self._hsp.query + seq
01211         if self._hsp.query_start is None:
01212             self._hsp.query_start = _safe_int(start)
01213 
01214         # line below added by Yair Benita, Sep 2004.
01215         # added the end attribute for the query
01216         self._hsp.query_end = _safe_int(end)
01217 
01218         #Get index for sequence start (regular expression element 3)
01219         self._query_start_index = m.start(3)
01220         self._query_len = len(seq)
01221 
01222     def align(self, line):
01223         seq = line[self._query_start_index:].rstrip()
01224         if len(seq) < self._query_len:
01225             # Make sure the alignment is the same length as the query
01226             seq = seq + ' ' * (self._query_len-len(seq))
01227         elif len(seq) < self._query_len:
01228             raise ValueError("Match is longer than the query in line\n%s" \
01229                              % line)
01230         self._hsp.match = self._hsp.match + seq
01231 
01232     # To match how we do the query, cache the regular expression.
01233     # Note that the colon is not always present.
01234     _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
01235     def sbjct(self, line):
01236         m = self._sbjct_re.search(line)
01237         if m is None:
01238             raise ValueError("I could not find the sbjct in line\n%s" % line)
01239         colon, start, seq, end = m.groups()
01240         #mikep 26/9/00
01241         #On occasion, there is a blast hit with no subject match
01242         #so far, it only occurs with 1-line short "matches"
01243         #I have decided to let these pass as they appear
01244         if not seq.strip():
01245             seq = ' ' * self._query_len
01246         self._hsp.sbjct = self._hsp.sbjct + seq
01247         if self._hsp.sbjct_start is None:
01248             self._hsp.sbjct_start = _safe_int(start)
01249 
01250         self._hsp.sbjct_end = _safe_int(end)
01251         if len(seq) != self._query_len:
01252             raise ValueError( \
01253                   "QUERY and SBJCT sequence lengths don't match in line\n%s" \
01254                   % line)
01255 
01256         del self._query_start_index   # clean up unused variables
01257         del self._query_len
01258 
01259     def end_hsp(self):
01260         pass
01261 
01262 class _DatabaseReportConsumer:
01263 
01264     def start_database_report(self):
01265         self._dr = Record.DatabaseReport()
01266 
01267     def database(self, line):
01268         m = re.search(r"Database: (.+)$", line)
01269         if m:
01270             self._dr.database_name.append(m.group(1))
01271         elif self._dr.database_name:
01272             # This must be a continuation of the previous name.
01273             self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
01274                                                    line.strip())
01275 
01276     def posted_date(self, line):
01277         self._dr.posted_date.append(_re_search(
01278             r"Posted date:\s*(.+)$", line,
01279             "I could not find the posted date in line\n%s" % line))
01280 
01281     def num_letters_in_database(self, line):
01282         letters, = _get_cols(
01283             line, (-1,), ncols=6, expected={2:"letters", 4:"database:"})
01284         self._dr.num_letters_in_database.append(_safe_int(letters))
01285 
01286     def num_sequences_in_database(self, line):
01287         sequences, = _get_cols(
01288             line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"})
01289         self._dr.num_sequences_in_database.append(_safe_int(sequences))
01290 
01291     def ka_params(self, line):
01292         x = line.split()
01293         self._dr.ka_params = map(_safe_float, x)
01294 
01295     def gapped(self, line):
01296         self._dr.gapped = 1
01297 
01298     def ka_params_gap(self, line):
01299         x = line.split()
01300         self._dr.ka_params_gap = map(_safe_float, x)
01301 
01302     def end_database_report(self):
01303         pass
01304     
01305 class _ParametersConsumer:
01306     def start_parameters(self):
01307         self._params = Record.Parameters()
01308 
01309     def matrix(self, line):
01310         self._params.matrix = line[8:].rstrip()
01311 
01312     def gap_penalties(self, line):
01313         x = _get_cols(
01314             line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"})
01315         self._params.gap_penalties = map(_safe_float, x)
01316 
01317     def num_hits(self, line):
01318         if line.find('1st pass') != -1:
01319             x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
01320             self._params.num_hits = _safe_int(x)
01321         else:
01322             x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
01323             self._params.num_hits = _safe_int(x)
01324 
01325     def num_sequences(self, line):
01326         if line.find('1st pass') != -1:
01327             x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
01328             self._params.num_sequences = _safe_int(x)
01329         else:
01330             x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
01331             self._params.num_sequences = _safe_int(x)
01332 
01333     def num_extends(self, line):
01334         if line.find('1st pass') != -1:
01335             x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
01336             self._params.num_extends = _safe_int(x)
01337         else:
01338             x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
01339             self._params.num_extends = _safe_int(x)
01340 
01341     def num_good_extends(self, line):
01342         if line.find('1st pass') != -1:
01343             x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
01344             self._params.num_good_extends = _safe_int(x)
01345         else:
01346             x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
01347             self._params.num_good_extends = _safe_int(x)
01348         
01349     def num_seqs_better_e(self, line):
01350         self._params.num_seqs_better_e, = _get_cols(
01351             line, (-1,), ncols=7, expected={2:"sequences"})
01352         self._params.num_seqs_better_e = _safe_int(
01353             self._params.num_seqs_better_e)
01354 
01355     def hsps_no_gap(self, line):
01356         self._params.hsps_no_gap, = _get_cols(
01357             line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"})
01358         self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
01359 
01360     def hsps_prelim_gapped(self, line):
01361         self._params.hsps_prelim_gapped, = _get_cols(
01362             line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"})
01363         self._params.hsps_prelim_gapped = _safe_int(
01364             self._params.hsps_prelim_gapped)
01365 
01366     def hsps_prelim_gapped_attempted(self, line):
01367         self._params.hsps_prelim_gapped_attempted, = _get_cols(
01368             line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"})
01369         self._params.hsps_prelim_gapped_attempted = _safe_int(
01370             self._params.hsps_prelim_gapped_attempted)
01371 
01372     def hsps_gapped(self, line):
01373         self._params.hsps_gapped, = _get_cols(
01374             line, (-1,), ncols=6, expected={3:"gapped"})
01375         self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
01376         
01377     def query_length(self, line):
01378         self._params.query_length, = _get_cols(
01379             line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"})
01380         self._params.query_length = _safe_int(self._params.query_length)
01381         
01382     def database_length(self, line):
01383         self._params.database_length, = _get_cols(
01384             line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"})
01385         self._params.database_length = _safe_int(self._params.database_length)
01386 
01387     def effective_hsp_length(self, line):
01388         self._params.effective_hsp_length, = _get_cols(
01389             line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"})
01390         self._params.effective_hsp_length = _safe_int(
01391             self._params.effective_hsp_length)
01392 
01393     def effective_query_length(self, line):
01394         self._params.effective_query_length, = _get_cols(
01395             line, (-1,), ncols=5, expected={1:"length", 3:"query:"})
01396         self._params.effective_query_length = _safe_int(
01397             self._params.effective_query_length)
01398 
01399     def effective_database_length(self, line):
01400         self._params.effective_database_length, = _get_cols(
01401             line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"})
01402         self._params.effective_database_length = _safe_int(
01403             self._params.effective_database_length)
01404         
01405     def effective_search_space(self, line):
01406         self._params.effective_search_space, = _get_cols(
01407             line, (-1,), ncols=4, expected={1:"search"})
01408         self._params.effective_search_space = _safe_int(
01409             self._params.effective_search_space)
01410 
01411     def effective_search_space_used(self, line):
01412         self._params.effective_search_space_used, = _get_cols(
01413             line, (-1,), ncols=5, expected={1:"search", 3:"used:"})
01414         self._params.effective_search_space_used = _safe_int(
01415             self._params.effective_search_space_used)
01416 
01417     def frameshift(self, line):
01418         self._params.frameshift = _get_cols(
01419            line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
01420 
01421     def threshold(self, line):
01422         if line[:2] == "T:":
01423             #Assume its an old stlye line like "T: 123"
01424             self._params.threshold, = _get_cols(
01425                 line, (1,), ncols=2, expected={0:"T:"})
01426         elif line[:28] == "Neighboring words threshold:":
01427             self._params.threshold, = _get_cols(
01428                 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
01429         else:
01430             raise ValueError("Unrecognised threshold line:\n%s" % line)
01431         self._params.threshold = _safe_int(self._params.threshold)
01432         
01433     def window_size(self, line):
01434         if line[:2] == "A:":
01435             self._params.window_size, = _get_cols(
01436                 line, (1,), ncols=2, expected={0:"A:"})
01437         elif line[:25] == "Window for multiple hits:":
01438             self._params.window_size, = _get_cols(
01439                 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
01440         else:
01441             raise ValueError("Unrecognised window size line:\n%s" % line)
01442         self._params.window_size = _safe_int(self._params.window_size)
01443         
01444     def dropoff_1st_pass(self, line):
01445         score, bits = _re_search(
01446             r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line,
01447             "I could not find the dropoff in line\n%s" % line)
01448         self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
01449         
01450     def gap_x_dropoff(self, line):
01451         score, bits = _re_search(
01452             r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line,
01453             "I could not find the gap dropoff in line\n%s" % line)
01454         self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
01455         
01456     def gap_x_dropoff_final(self, line):
01457         score, bits = _re_search(
01458             r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line,
01459             "I could not find the gap dropoff final in line\n%s" % line)
01460         self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
01461 
01462     def gap_trigger(self, line):
01463         score, bits = _re_search(
01464             r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line,
01465             "I could not find the gap trigger in line\n%s" % line)
01466         self._params.gap_trigger = _safe_int(score), _safe_float(bits)
01467         
01468     def blast_cutoff(self, line):
01469         score, bits = _re_search(
01470             r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line,
01471             "I could not find the blast cutoff in line\n%s" % line)
01472         self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
01473         
01474     def end_parameters(self):
01475         pass
01476     
01477 
01478 class _BlastConsumer(AbstractConsumer,
01479                      _HeaderConsumer,
01480                      _DescriptionConsumer,
01481                      _AlignmentConsumer,
01482                      _HSPConsumer,
01483                      _DatabaseReportConsumer,
01484                      _ParametersConsumer
01485                      ):
01486     # This Consumer is inherits from many other consumer classes that handle
01487     # the actual dirty work.  An alternate way to do it is to create objects
01488     # of those classes and then delegate the parsing tasks to them in a
01489     # decorator-type pattern.  The disadvantage of that is that the method
01490     # names will need to be resolved in this classes.  However, using
01491     # a decorator will retain more control in this class (which may or
01492     # may not be a bad thing).  In addition, having each sub-consumer as
01493     # its own object prevents this object's dictionary from being cluttered
01494     # with members and reduces the chance of member collisions.
01495     def __init__(self):
01496         self.data = None
01497 
01498     def round(self, line):
01499         # Make sure nobody's trying to pass me PSI-BLAST data!
01500         raise ValueError("This consumer doesn't handle PSI-BLAST data")
01501         
01502     def start_header(self):
01503         self.data = Record.Blast()
01504         _HeaderConsumer.start_header(self)
01505 
01506     def end_header(self):
01507         _HeaderConsumer.end_header(self)
01508         self.data.__dict__.update(self._header.__dict__)
01509 
01510     def end_descriptions(self):
01511         self.data.descriptions = self._descriptions
01512 
01513     def end_alignment(self):
01514         _AlignmentConsumer.end_alignment(self)
01515         if self._alignment.hsps:
01516             self.data.alignments.append(self._alignment)
01517         if self._multiple_alignment.alignment:
01518             self.data.multiple_alignment = self._multiple_alignment
01519 
01520     def end_hsp(self):
01521         _HSPConsumer.end_hsp(self)
01522         try:
01523             self._alignment.hsps.append(self._hsp)
01524         except AttributeError:
01525             raise ValueError("Found an HSP before an alignment")
01526 
01527     def end_database_report(self):
01528         _DatabaseReportConsumer.end_database_report(self)
01529         self.data.__dict__.update(self._dr.__dict__)
01530 
01531     def end_parameters(self):
01532         _ParametersConsumer.end_parameters(self)
01533         self.data.__dict__.update(self._params.__dict__)
01534 
01535 class _PSIBlastConsumer(AbstractConsumer,
01536                         _HeaderConsumer,
01537                         _DescriptionConsumer,
01538                         _AlignmentConsumer,
01539                         _HSPConsumer,
01540                         _DatabaseReportConsumer,
01541                         _ParametersConsumer
01542                         ):
01543     def __init__(self):
01544         self.data = None
01545 
01546     def start_header(self):
01547         self.data = Record.PSIBlast()
01548         _HeaderConsumer.start_header(self)
01549 
01550     def end_header(self):
01551         _HeaderConsumer.end_header(self)
01552         self.data.__dict__.update(self._header.__dict__)
01553 
01554     def start_descriptions(self):
01555         self._round = Record.Round()
01556         self.data.rounds.append(self._round)
01557         _DescriptionConsumer.start_descriptions(self)
01558 
01559     def end_descriptions(self):
01560         _DescriptionConsumer.end_descriptions(self)
01561         self._round.number = self._roundnum
01562         if self._descriptions:
01563             self._round.new_seqs.extend(self._descriptions)
01564         self._round.reused_seqs.extend(self._model_sequences)
01565         self._round.new_seqs.extend(self._nonmodel_sequences)
01566         if self._converged:
01567             self.data.converged = 1
01568 
01569     def end_alignment(self):
01570         _AlignmentConsumer.end_alignment(self)
01571         if self._alignment.hsps:
01572             self._round.alignments.append(self._alignment)
01573         if self._multiple_alignment:
01574             self._round.multiple_alignment = self._multiple_alignment
01575 
01576     def end_hsp(self):
01577         _HSPConsumer.end_hsp(self)
01578         try:
01579             self._alignment.hsps.append(self._hsp)
01580         except AttributeError:
01581             raise ValueError("Found an HSP before an alignment")
01582 
01583     def end_database_report(self):
01584         _DatabaseReportConsumer.end_database_report(self)
01585         self.data.__dict__.update(self._dr.__dict__)
01586 
01587     def end_parameters(self):
01588         _ParametersConsumer.end_parameters(self)
01589         self.data.__dict__.update(self._params.__dict__)
01590 
01591 class Iterator(object):
01592     """Iterates over a file of multiple BLAST results.
01593 
01594     Methods:
01595     next   Return the next record from the stream, or None.
01596 
01597     """
01598     def __init__(self, handle, parser=None):
01599         """__init__(self, handle, parser=None)
01600 
01601         Create a new iterator.  handle is a file-like object.  parser
01602         is an optional Parser object to change the results into another form.
01603         If set to None, then the raw contents of the file will be returned.
01604 
01605         """
01606         try:
01607             handle.readline
01608         except AttributeError:
01609             raise ValueError(
01610                 "I expected a file handle or file-like object, got %s"
01611                 % type(handle))
01612         self._uhandle = File.UndoHandle(handle)
01613         self._parser = parser
01614         self._header = []
01615 
01616     def next(self):
01617         """next(self) -> object
01618 
01619         Return the next Blast record from the file.  If no more records,
01620         return None.
01621 
01622         """
01623         lines = []
01624         query = False
01625         while 1:
01626             line = self._uhandle.readline()
01627             if not line:
01628                 break
01629             # If I've reached the next one, then put the line back and stop.
01630             if lines and (line.startswith('BLAST')
01631                           or line.startswith('BLAST', 1)
01632                           or line.startswith('<?xml ')):
01633                 self._uhandle.saveline(line)
01634                 break
01635             # New style files ommit the BLAST line to mark a new query:
01636             if line.startswith("Query="):
01637                 if not query:
01638                     if not self._header:
01639                         self._header = lines[:]
01640                     query = True
01641                 else:
01642                     #Start of another record
01643                     self._uhandle.saveline(line)
01644                     break
01645             lines.append(line)
01646 
01647         if query and "BLAST" not in lines[0]:
01648             #Cheat and re-insert the header
01649             #print "-"*50
01650             #print "".join(self._header)
01651             #print "-"*50
01652             #print "".join(lines)
01653             #print "-"*50
01654             lines = self._header + lines
01655             
01656         if not lines:
01657             return None
01658             
01659         data = ''.join(lines)
01660         if self._parser is not None:
01661             return self._parser.parse(StringIO.StringIO(data))
01662         return data
01663 
01664     def __iter__(self):
01665         return iter(self.next, None)
01666 
01667 def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
01668     """Execute and retrieve data from standalone BLASTPALL as handles (OBSOLETE).
01669     
01670     NOTE - This function is obsolete, you are encouraged to the command
01671     line wrapper Bio.Blast.Applications.BlastallCommandline instead.
01672 
01673     Execute and retrieve data from blastall.  blastcmd is the command
01674     used to launch the 'blastall' executable.  program is the blast program
01675     to use, e.g. 'blastp', 'blastn', etc.  database is the path to the database
01676     to search against.  infile is the path to the file containing
01677     the sequence to search with.
01678 
01679     The return values are two handles, for standard output and standard error.
01680 
01681     You may pass more parameters to **keywds to change the behavior of
01682     the search.  Otherwise, optional values will be chosen by blastall.
01683     The Blast output is by default in XML format. Use the align_view keyword
01684     for output in a different format.
01685     
01686         Scoring
01687     matrix              Matrix to use.
01688     gap_open            Gap open penalty.
01689     gap_extend          Gap extension penalty.
01690     nuc_match           Nucleotide match reward.  (BLASTN)
01691     nuc_mismatch        Nucleotide mismatch penalty.  (BLASTN)
01692     query_genetic_code  Genetic code for Query.
01693     db_genetic_code     Genetic code for database.  (TBLAST[NX])
01694 
01695         Algorithm
01696     gapped              Whether to do a gapped alignment. T/F (not for TBLASTX)
01697     expectation         Expectation value cutoff.
01698     wordsize            Word size.
01699     strands             Query strands to search against database.([T]BLAST[NX])
01700     keep_hits           Number of best hits from a region to keep.
01701     xdrop               Dropoff value (bits) for gapped alignments.
01702     hit_extend          Threshold for extending hits.
01703     region_length       Length of region used to judge hits.
01704     db_length           Effective database length.
01705     search_length       Effective length of search space.
01706 
01707         Processing
01708     filter              Filter query sequence for low complexity (with SEG)?  T/F
01709     believe_query       Believe the query defline.  T/F
01710     restrict_gi         Restrict search to these GI's.
01711     nprocessors         Number of processors to use.
01712     oldengine           Force use of old engine T/F
01713 
01714         Formatting
01715     html                Produce HTML output?  T/F
01716     descriptions        Number of one-line descriptions.
01717     alignments          Number of alignments.
01718     align_view          Alignment view.  Integer 0-11,
01719                         passed as a string or integer.
01720     show_gi             Show GI's in deflines?  T/F
01721     seqalign_file       seqalign file to output.
01722     outfile             Output file for report.  Filename to write to, if
01723                         ommitted standard output is used (which you can access
01724                         from the returned handles).
01725     """
01726 
01727     _security_check_parameters(keywds)
01728 
01729     att2param = {
01730         'matrix' : '-M',
01731         'gap_open' : '-G',
01732         'gap_extend' : '-E',
01733         'nuc_match' : '-r',
01734         'nuc_mismatch' : '-q',
01735         'query_genetic_code' : '-Q',
01736         'db_genetic_code' : '-D',
01737 
01738         'gapped' : '-g',
01739         'expectation' : '-e',
01740         'wordsize' : '-W',
01741         'strands' : '-S',
01742         'keep_hits' : '-K',
01743         'xdrop' : '-X',
01744         'hit_extend' : '-f',
01745         'region_length' : '-L',
01746         'db_length' : '-z',
01747         'search_length' : '-Y',
01748         
01749         'program' : '-p',
01750         'database' : '-d',
01751         'infile' : '-i',
01752         'filter' : '-F',
01753         'believe_query' : '-J',
01754         'restrict_gi' : '-l',
01755         'nprocessors' : '-a',
01756         'oldengine' : '-V',
01757 
01758         'html' : '-T',
01759         'descriptions' : '-v',
01760         'alignments' : '-b',
01761         'align_view' : '-m',
01762         'show_gi' : '-I',
01763         'seqalign_file' : '-O',
01764         'outfile' : '-o',
01765         }
01766     import warnings
01767     warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastallCommandline instead.", PendingDeprecationWarning)
01768     from Applications import BlastallCommandline
01769     cline = BlastallCommandline(blastcmd)
01770     cline.set_parameter(att2param['program'], program)
01771     cline.set_parameter(att2param['database'], database)
01772     cline.set_parameter(att2param['infile'], infile)
01773     cline.set_parameter(att2param['align_view'], str(align_view))
01774     for key, value in keywds.iteritems():
01775         cline.set_parameter(att2param[key], str(value))
01776     return _invoke_blast(cline)
01777 
01778 
01779 def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
01780     """Execute and retrieve data from standalone BLASTPGP as handles (OBSOLETE).
01781 
01782     NOTE - This function is obsolete, you are encouraged to the command
01783     line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.
01784     
01785     Execute and retrieve data from blastpgp.  blastcmd is the command
01786     used to launch the 'blastpgp' executable.  database is the path to the
01787     database to search against.  infile is the path to the file containing
01788     the sequence to search with.
01789 
01790     The return values are two handles, for standard output and standard error.
01791 
01792     You may pass more parameters to **keywds to change the behavior of
01793     the search.  Otherwise, optional values will be chosen by blastpgp.
01794     The Blast output is by default in XML format. Use the align_view keyword
01795     for output in a different format.
01796 
01797         Scoring
01798     matrix              Matrix to use.
01799     gap_open            Gap open penalty.
01800     gap_extend          Gap extension penalty.
01801     window_size         Multiple hits window size.
01802     npasses             Number of passes.
01803     passes              Hits/passes.  Integer 0-2.
01804 
01805         Algorithm
01806     gapped              Whether to do a gapped alignment.  T/F
01807     expectation         Expectation value cutoff.
01808     wordsize            Word size.
01809     keep_hits           Number of beset hits from a region to keep.
01810     xdrop               Dropoff value (bits) for gapped alignments.
01811     hit_extend          Threshold for extending hits.
01812     region_length       Length of region used to judge hits.
01813     db_length           Effective database length.
01814     search_length       Effective length of search space.
01815     nbits_gapping       Number of bits to trigger gapping.
01816     pseudocounts        Pseudocounts constants for multiple passes.
01817     xdrop_final         X dropoff for final gapped alignment.
01818     xdrop_extension     Dropoff for blast extensions.
01819     model_threshold     E-value threshold to include in multipass model.
01820     required_start      Start of required region in query.
01821     required_end        End of required region in query.
01822 
01823         Processing
01824     XXX should document default values
01825     program             The blast program to use. (PHI-BLAST)
01826     filter              Filter query sequence  for low complexity (with SEG)?  T/F
01827     believe_query       Believe the query defline?  T/F
01828     nprocessors         Number of processors to use.
01829 
01830         Formatting
01831     html                Produce HTML output?  T/F
01832     descriptions        Number of one-line descriptions.
01833     alignments          Number of alignments.
01834     align_view          Alignment view.  Integer 0-11,
01835                         passed as a string or integer.
01836     show_gi             Show GI's in deflines?  T/F
01837     seqalign_file       seqalign file to output.
01838     align_outfile       Output file for alignment.
01839     checkpoint_outfile  Output file for PSI-BLAST checkpointing.
01840     restart_infile      Input file for PSI-BLAST restart.
01841     hit_infile          Hit file for PHI-BLAST.
01842     matrix_outfile      Output file for PSI-BLAST matrix in ASCII.
01843     align_outfile       Output file for alignment.  Filename to write to, if
01844                         ommitted standard output is used (which you can access
01845                         from the returned handles).
01846 
01847     align_infile        Input alignment file for PSI-BLAST restart.
01848     
01849     """
01850 
01851     import warnings
01852     warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastpgpCommandline instead.", PendingDeprecationWarning)
01853     _security_check_parameters(keywds)
01854 
01855     att2param = {
01856         'matrix' : '-M',
01857         'gap_open' : '-G',
01858         'gap_extend' : '-E',
01859         'window_size' : '-A',
01860         'npasses' : '-j',
01861         'passes' : '-P',
01862 
01863         'gapped' : '-g',
01864         'expectation' : '-e',
01865         'wordsize' : '-W',
01866         'keep_hits' : '-K',
01867         'xdrop' : '-X',
01868         'hit_extend' : '-f',
01869         'region_length' : '-L',
01870         'db_length' : '-Z',
01871         'search_length' : '-Y',
01872         'nbits_gapping' : '-N',
01873         'pseudocounts' : '-c',
01874         'xdrop_final' : '-Z',
01875         'xdrop_extension' : '-y',
01876         'model_threshold' : '-h',
01877         'required_start' : '-S',
01878         'required_end' : '-H',
01879 
01880         'program' : '-p',
01881         'database' : '-d',
01882         'infile' : '-i',
01883         'filter' : '-F',
01884         'believe_query' : '-J',
01885         'nprocessors' : '-a',
01886 
01887         'html' : '-T',
01888         'descriptions' : '-v',
01889         'alignments' : '-b',
01890         'align_view' : '-m',
01891         'show_gi' : '-I',
01892         'seqalign_file' : '-O',
01893         'align_outfile' : '-o',
01894         'checkpoint_outfile' : '-C',
01895         'restart_infile' : '-R',
01896         'hit_infile' : '-k',
01897         'matrix_outfile' : '-Q',
01898         'align_infile' : '-B',
01899         }
01900     from Applications import BlastpgpCommandline
01901     cline = BlastpgpCommandline(blastcmd)
01902     cline.set_parameter(att2param['database'], database)
01903     cline.set_parameter(att2param['infile'], infile)
01904     cline.set_parameter(att2param['align_view'], str(align_view))
01905     for key, value in keywds.iteritems():
01906         cline.set_parameter(att2param[key], str(value))
01907     return _invoke_blast(cline)
01908 
01909 
01910 def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
01911     """Execute and retrieve data from standalone RPS-BLAST as handles (OBSOLETE).
01912     
01913     NOTE - This function is obsolete, you are encouraged to the command
01914     line wrapper Bio.Blast.Applications.RpsBlastCommandline instead.
01915 
01916     Execute and retrieve data from standalone RPS-BLAST.  blastcmd is the
01917     command used to launch the 'rpsblast' executable.  database is the path
01918     to the database to search against.  infile is the path to the file
01919     containing the sequence to search with.
01920 
01921     The return values are two handles, for standard output and standard error.
01922 
01923     You may pass more parameters to **keywds to change the behavior of
01924     the search.  Otherwise, optional values will be chosen by rpsblast.
01925 
01926     Please note that this function will give XML output by default, by
01927     setting align_view to seven (i.e. command line option -m 7).
01928     You should use the NCBIXML.parse() function to read the resulting output.
01929     This is because NCBIStandalone.BlastParser() does not understand the
01930     plain text output format from rpsblast.
01931 
01932     WARNING - The following text and associated parameter handling has not
01933     received extensive testing.  Please report any errors we might have made...
01934 
01935         Algorithm/Scoring
01936     gapped              Whether to do a gapped alignment.  T/F
01937     multihit            0 for multiple hit (default), 1 for single hit
01938     expectation         Expectation value cutoff.
01939     range_restriction   Range restriction on query sequence (Format: start,stop) blastp only
01940                         0 in 'start' refers to the beginning of the sequence
01941                         0 in 'stop' refers to the end of the sequence
01942                         Default = 0,0
01943     xdrop               Dropoff value (bits) for gapped alignments.
01944     xdrop_final         X dropoff for final gapped alignment (in bits).
01945     xdrop_extension     Dropoff for blast extensions (in bits).
01946     search_length       Effective length of search space.
01947     nbits_gapping       Number of bits to trigger gapping.
01948     protein             Query sequence is protein.  T/F
01949     db_length           Effective database length.
01950 
01951         Processing
01952     filter              Filter query sequence for low complexity?  T/F
01953     case_filter         Use lower case filtering of FASTA sequence T/F, default F
01954     believe_query       Believe the query defline.  T/F
01955     nprocessors         Number of processors to use.
01956     logfile             Name of log file to use, default rpsblast.log
01957 
01958         Formatting
01959     html                Produce HTML output?  T/F
01960     descriptions        Number of one-line descriptions.
01961     alignments          Number of alignments.
01962     align_view          Alignment view.  Integer 0-11,
01963                         passed as a string or integer.
01964     show_gi             Show GI's in deflines?  T/F
01965     seqalign_file       seqalign file to output.
01966     align_outfile       Output file for alignment.  Filename to write to, if
01967                         ommitted standard output is used (which you can access
01968                         from the returned handles).
01969     """
01970 
01971     import warnings
01972     warnings.warn("This function is obsolete, you are encouraged to the command line wrapper Bio.Blast.Applications.BlastrpsCommandline instead.", PendingDeprecationWarning)
01973     _security_check_parameters(keywds)
01974     
01975     att2param = {
01976         'multihit' : '-P',
01977         'gapped' : '-g',
01978         'expectation' : '-e',
01979         'range_restriction' : '-L',
01980         'xdrop' : '-X',
01981         'xdrop_final' : '-Z',
01982         'xdrop_extension' : '-y',
01983         'search_length' : '-Y',
01984         'nbits_gapping' : '-N',
01985         'protein' : '-p',
01986         'db_length' : '-z',
01987 
01988         'database' : '-d',
01989         'infile' : '-i',
01990         'filter' : '-F',
01991         'case_filter' : '-U',
01992         'believe_query' : '-J',
01993         'nprocessors' : '-a',
01994         'logfile' : '-l',
01995 
01996         'html' : '-T',
01997         'descriptions' : '-v',
01998         'alignments' : '-b',
01999         'align_view' : '-m',
02000         'show_gi' : '-I',
02001         'seqalign_file' : '-O',
02002         'align_outfile' : '-o',
02003         }
02004 
02005     from Applications import RpsBlastCommandline
02006     cline = RpsBlastCommandline(blastcmd)
02007     cline.set_parameter(att2param['database'], database)
02008     cline.set_parameter(att2param['infile'], infile)
02009     cline.set_parameter(att2param['align_view'], str(align_view))
02010     for key, value in keywds.iteritems():
02011         cline.set_parameter(att2param[key], str(value))
02012     return _invoke_blast(cline)
02013 
02014 
02015 def _re_search(regex, line, error_msg):
02016     m = re.search(regex, line)
02017     if not m:
02018         raise ValueError(error_msg)
02019     return m.groups()
02020 
02021 def _get_cols(line, cols_to_get, ncols=None, expected={}):
02022     cols = line.split()
02023 
02024     # Check to make sure number of columns is correct
02025     if ncols is not None and len(cols) != ncols:
02026         raise ValueError("I expected %d columns (got %d) in line\n%s" \
02027                          % (ncols, len(cols), line))
02028 
02029     # Check to make sure columns contain the correct data
02030     for k in expected:
02031         if cols[k] != expected[k]:
02032             raise ValueError("I expected '%s' in column %d in line\n%s" \
02033                              % (expected[k], k, line))
02034 
02035     # Construct the answer tuple
02036     results = []
02037     for c in cols_to_get:
02038         results.append(cols[c])
02039     return tuple(results)
02040 
02041 
02042 def _safe_int(str):
02043     try:
02044         return int(str)
02045     except ValueError:
02046         # Something went wrong.  Try to clean up the string.
02047         # Remove all commas from the string
02048         str = str.replace(',', '')
02049     # try again after removing commas.
02050     # Note int() will return a long rather than overflow
02051     try:
02052         return int(str)
02053     except ValueError:
02054         pass
02055     # Call float to handle things like "54.3", note could lose precision, e.g.
02056     # >>> int("5399354557888517312")
02057     # 5399354557888517312
02058     # >>> int(float("5399354557888517312"))
02059     # 5399354557888517120
02060     return int(float(str))
02061 
02062 
02063 def _safe_float(str):
02064     # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that
02065     # float('e-172') does not produce an error on his platform.  Thus,
02066     # we need to check the string for this condition.
02067     
02068     # Sometimes BLAST leaves of the '1' in front of an exponent.
02069     if str and str[0] in ['E', 'e']:
02070         str = '1' + str
02071     try:
02072         return float(str)
02073     except ValueError:
02074         # Remove all commas from the string
02075         str = str.replace(',', '')
02076     # try again.
02077     return float(str)
02078 
02079 
02080 def _invoke_blast(cline):
02081     """Start BLAST and returns handles for stdout and stderr (PRIVATE).
02082 
02083     Expects a command line wrapper object from Bio.Blast.Applications
02084     """
02085     import subprocess, sys
02086     blast_cmd = cline.program_name
02087     if not os.path.exists(blast_cmd):
02088         raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
02089     #We don't need to supply any piped input, but we setup the
02090     #standard input pipe anyway as a work around for a python
02091     #bug if this is called from a Windows GUI program.  For
02092     #details, see http://bugs.python.org/issue1124861
02093     blast_process = subprocess.Popen(str(cline),
02094                                      stdin=subprocess.PIPE,
02095                                      stdout=subprocess.PIPE,
02096                                      stderr=subprocess.PIPE,
02097                                      universal_newlines=True,
02098                                      shell=(sys.platform!="win32"))
02099     blast_process.stdin.close()
02100     return blast_process.stdout, blast_process.stderr
02101 
02102 
02103 def _security_check_parameters(param_dict):
02104     """Look for any attempt to insert a command into a parameter.
02105 
02106     e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
02107 
02108     Looks for ";" or "&&" in the strings (Unix and Windows syntax
02109     for appending a command line), or ">", "<" or "|" (redirection)
02110     and if any are found raises an exception.
02111     """
02112     for key, value in param_dict.iteritems():
02113         str_value = str(value) # Could easily be an int or a float
02114         for bad_str in [";", "&&", ">", "<", "|"]:
02115             if bad_str in str_value:
02116                 raise ValueError("Rejecting suspicious argument for %s" % key)
02117 
02118 class _BlastErrorConsumer(_BlastConsumer):
02119     def __init__(self):
02120         _BlastConsumer.__init__(self)
02121     def noevent(self, line):
02122         if line.find("Query must be at least wordsize") != -1:
02123             raise ShortQueryBlastError("Query must be at least wordsize")
02124         # Now pass the line back up to the superclass.
02125         method = getattr(_BlastConsumer, 'noevent',
02126                          _BlastConsumer.__getattr__(self, 'noevent'))
02127         method(line)
02128 
02129 class BlastErrorParser(AbstractParser):
02130     """Attempt to catch and diagnose BLAST errors while parsing.
02131 
02132     This utilizes the BlastParser module but adds an additional layer
02133     of complexity on top of it by attempting to diagnose ValueErrors
02134     that may actually indicate problems during BLAST parsing.
02135 
02136     Current BLAST problems this detects are:
02137     o LowQualityBlastError - When BLASTing really low quality sequences
02138     (ie. some GenBank entries which are just short streches of a single
02139     nucleotide), BLAST will report an error with the sequence and be
02140     unable to search with this. This will lead to a badly formatted
02141     BLAST report that the parsers choke on. The parser will convert the
02142     ValueError to a LowQualityBlastError and attempt to provide useful
02143     information.
02144     
02145     """
02146     def __init__(self, bad_report_handle = None):
02147         """Initialize a parser that tries to catch BlastErrors.
02148 
02149         Arguments:
02150         o bad_report_handle - An optional argument specifying a handle
02151         where bad reports should be sent. This would allow you to save
02152         all of the bad reports to a file, for instance. If no handle
02153         is specified, the bad reports will not be saved.
02154         """
02155         self._bad_report_handle = bad_report_handle
02156         
02157         #self._b_parser = BlastParser()
02158         self._scanner = _Scanner()
02159         self._consumer = _BlastErrorConsumer()
02160 
02161     def parse(self, handle):
02162         """Parse a handle, attempting to diagnose errors.
02163         """
02164         results = handle.read()
02165 
02166         try:
02167             self._scanner.feed(StringIO.StringIO(results), self._consumer)
02168         except ValueError, msg:
02169             # if we have a bad_report_file, save the info to it first
02170             if self._bad_report_handle:
02171                 # send the info to the error handle
02172                 self._bad_report_handle.write(results)
02173 
02174             # now we want to try and diagnose the error
02175             self._diagnose_error(
02176                 StringIO.StringIO(results), self._consumer.data)
02177 
02178             # if we got here we can't figure out the problem
02179             # so we should pass along the syntax error we got
02180             raise
02181         return self._consumer.data
02182 
02183     def _diagnose_error(self, handle, data_record):
02184         """Attempt to diagnose an error in the passed handle.
02185 
02186         Arguments:
02187         o handle - The handle potentially containing the error
02188         o data_record - The data record partially created by the consumer.
02189         """
02190         line = handle.readline()
02191 
02192         while line:
02193             # 'Searchingdone' instead of 'Searching......done' seems
02194             # to indicate a failure to perform the BLAST due to
02195             # low quality sequence
02196             if line.startswith('Searchingdone'):
02197                 raise LowQualityBlastError("Blast failure occured on query: ",
02198                                            data_record.query)
02199             line = handle.readline()