Back to index

python-biopython  1.60
Record.py
Go to the documentation of this file.
00001 # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Record classes to hold BLAST output.
00007 
00008 Classes:
00009 Blast              Holds all the information from a blast search.
00010 PSIBlast           Holds all the information from a psi-blast search.
00011 
00012 Header             Holds information from the header.
00013 Description        Holds information about one hit description.
00014 Alignment          Holds information about one alignment hit.
00015 HSP                Holds information about one HSP.
00016 MultipleAlignment  Holds information about a multiple alignment.
00017 DatabaseReport     Holds information from the database report.
00018 Parameters         Holds information from the parameters.
00019 
00020 """
00021 # XXX finish printable BLAST output
00022 
00023 from Bio.Align import Generic
00024 
class Header(object):
    """Container for the fields found in a BLAST report header.

    Members:
    application         Name of the BLAST flavor that produced the data.
    version             Version of BLAST used.
    date                Date on which the data was generated.
    reference           Reference for BLAST.

    query               Name of the query sequence.
    query_letters       Number of letters in the query sequence.  (int)

    database            Name of the database.
    database_sequences  Number of sequences in the database.  (int)
    database_letters    Number of letters in the database.  (int)

    """
    def __init__(self):
        # Program identification: all text fields start out empty.
        self.application = self.version = self.date = self.reference = ''

        # Query details: the letter count stays None until it is assigned.
        self.query, self.query_letters = '', None

        # Database details: the counts stay None until they are assigned.
        self.database = ''
        self.database_sequences = self.database_letters = None
00054 
class Description(object):
    """One hit as listed in the descriptions section of a report.

    Members:
    title           Title of the hit.
    score           Number of bits.  (int)
    bits            Bit score. (float)
    e               E value.  (float)
    num_alignments  Number of alignments for the same subject.  (int)

    """
    def __init__(self):
        self.title = ''
        # Numeric members remain None until a value is assigned.
        self.score = self.bits = self.e = self.num_alignments = None

    def __str__(self):
        """Return a one-line summary: left-padded title, score and E value."""
        summary = (self.title, self.score, self.e)
        return "%-66s %5s  %s" % summary
00074 
class Alignment(object):
    """One hit as listed in the alignments section of a report.

    Members:
    title      Name.
    hit_id     Hit identifier. (str)
    hit_def    Hit definition. (str)
    length     Length.  (int)
    hsps       A list of HSP objects.

    """
    def __init__(self):
        self.title = self.hit_id = self.hit_def = ''
        self.length = None
        # Each instance gets its own, initially empty, list of HSPs.
        self.hsps = []

    def __str__(self):
        """Return the title lines followed by a "Length = ..." line.

        The title is split on newlines; every line after the first one,
        including the trailing length line, is indented with spaces.
        """
        rows = self.title.split('\n') + ["Length = %s\n" % self.length]
        return '\n           '.join(rows)
00096 
class HSP(object):
    """Stores information about one hsp in an alignment hit.

    Members:
    score           BLAST score of hit.  (float)
    bits            Number of bits for that score.  (float)
    expect          Expect value.  (float)
    num_alignments  Number of alignments for same subject.  (int)
    identities      Number of identities (int) if using the XML parser.
                    Tuple of number of identities/total aligned (int, int)
                    if using the (obsolete) plain text parser.
    positives       Number of positives (int) if using the XML parser.
                    Tuple of number of positives/total aligned (int, int)
                    if using the (obsolete) plain text parser.
    gaps            Number of gaps (int) if using the XML parser.
                    Tuple of number of gaps/total aligned (int, int) if
                    using the (obsolete) plain text parser.
    align_length    Length of the alignment. (int)
    strand          Tuple of (query, target) strand.
    frame           Tuple of 1 or 2 frame shifts, depending on the flavor.

    query           The query sequence.
    query_start     The start residue for the query sequence.  (1-based)
    query_end       The end residue for the query sequence.  (1-based)
    match           The match sequence.
    sbjct           The sbjct sequence.
    sbjct_start     The start residue for the sbjct sequence.  (1-based)
    sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Not all flavors of BLAST return values for every attribute:
              score     expect     identities   positives    strand  frame
    BLASTP     X          X            X            X
    BLASTN     X          X            X            X          X
    BLASTX     X          X            X            X                  X
    TBLASTN    X          X            X            X                  X
    TBLASTX    X          X            X            X                 X/X

    Note: for BLASTX, the query sequence is shown as a protein sequence,
    but the numbering is based on the nucleotides.  Thus, the numbering
    is 3x larger than the number of amino acid residues.  A similar effect
    can be seen for the sbjct sequence in TBLASTN, and for both sequences
    in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.

    """
    def __init__(self):
        self.score = None
        self.bits = None
        self.expect = None
        self.num_alignments = None
        self.identities = (None, None)
        self.positives = (None, None)
        self.gaps = (None, None)
        self.align_length = None
        self.strand = (None, None)
        self.frame = ()

        self.query = ''
        self.query_start = None
        self.query_end = None
        self.match = ''
        self.sbjct = ''
        self.sbjct_start = None
        self.sbjct_end = None

    @staticmethod
    def _format_number(spec, value):
        """Apply the %-format spec to value, or return "None" if it is unset."""
        if value is None:
            return "None"
        return spec % value

    def __str__(self):
        """Return a four-line summary of the HSP.

        The first line reports score, bits, expectation and alignment
        length; the following lines show the query, match and sbjct
        sequences.  Sequences are printed in full when the alignment is
        shorter than 50 columns, otherwise as the first 45 characters,
        "..." and the last 3 characters.

        Numeric members that have not been set (None) are printed as
        "None" instead of raising TypeError.  When align_length is unset,
        the length of the query string decides whether to abbreviate.
        """
        fmt = self._format_number
        lines = ["Score %s (%s bits), expectation %s, alignment length %s"
                 % (fmt("%i", self.score),
                    fmt("%i", self.bits),
                    fmt("%0.1e", self.expect),
                    fmt("%i", self.align_length))]

        query = str(self.query)
        match = str(self.match)
        sbjct = str(self.sbjct)

        length = self.align_length
        if length is None:
            # No recorded alignment length: the query row is the best
            # available indication of how wide the alignment is.
            length = len(query)
        if length >= 50:
            # Abbreviate long rows to "<first 45>...<last 3>".
            query, match, sbjct = ["%s...%s" % (text[:45], text[-3:])
                                   for text in (query, match, sbjct)]

        lines.append("Query:%s %s %s" % (str(self.query_start).rjust(8),
                                         query,
                                         str(self.query_end)))
        # Fifteen spaces line the match row up under the sequences.
        lines.append("               %s" % match)
        lines.append("Sbjct:%s %s %s" % (str(self.sbjct_start).rjust(8),
                                         sbjct,
                                         str(self.sbjct_end)))
        return "\n".join(lines)
00191 
class MultipleAlignment(object):
    """Stores the rows of a multiple alignment.

    Members:
    alignment  A list of tuples (name, start residue, sequence, end residue).

    Start residues are 1-based.  The start residue may be blank when the
    sequence is not aligned in the multiple alignment.

    """
    def __init__(self):
        self.alignment = []

    def to_generic(self, alphabet):
        """Build a Bio.Align.Generic Alignment from the stored tuples.

        The tuples in self.alignment are read in order.  A row named
        'QUERY' marks the start of a new alignment block.  Rows of the
        first block each create a new named sequence; rows of every later
        block are appended, by position within the block, to the sequences
        created by the first block.

        alphabet is the alphabet for the sequences (for example
        IUPAC.IUPACProtein); it is passed to Generic.Alignment.

        Returns the populated Generic.Alignment object.

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        pieces = []
        names = []
        block_count = 0
        row = 0
        for name, _start, seq, _end in self.alignment:
            if name == 'QUERY':
                # QUERY is the first row of each alignment block.
                block_count += 1
                row = 0

            if block_count != 1:
                # Later blocks extend the sequence at the same position.
                pieces[row] += seq
                row += 1
                continue

            # First block: register a new sequence and its name.
            pieces.append(seq)
            names.append(name)

        generic = Generic.Alignment(alphabet)
        for idx, name in enumerate(names):
            generic.add_sequence(name, pieces[idx])

        return generic
00239 
class Round(object):
    """Results belonging to a single PSI-BLAST round.

    Members:
    number       Round number.  (int)
    reused_seqs  Sequences in model, found again.  List of Description objects.
    new_seqs     Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.

    """
    def __init__(self):
        self.number = None
        # Three independent, initially empty lists.
        self.reused_seqs, self.new_seqs, self.alignments = [], [], []
        self.multiple_alignment = None
00257 
class DatabaseReport(object):
    """Container for the values found in a database report.

    Members:
    database_name              List of database names.  (can have multiple dbs)
    num_letters_in_database    Number of letters in the database; initialised
                               to an empty list.
    num_sequences_in_database  List of number of sequences in the database.
    posted_date                List of the dates the databases were posted.
    ka_params                  A tuple of (lambda, k, h) values.  (floats)
    gapped                     # XXX this isn't set right!
    ka_params_gap              A tuple of (lambda, k, h) values.  (floats)

    """
    def __init__(self):
        # Placeholder for a (lambda, k, h) triple that has not been set.
        unset_ka = (None, None, None)

        # Four independent, initially empty lists.
        self.database_name, self.posted_date = [], []
        self.num_letters_in_database, self.num_sequences_in_database = [], []

        self.ka_params = unset_ka
        self.gapped = 0
        self.ka_params_gap = unset_ka
00279 
class Parameters(object):
    """Container for the values found in the parameters section.

    Members:
    matrix              Name of the matrix.
    gap_penalties       Tuple of (open, extend) penalties.  (floats)
    sc_match            Match score for nucleotide-nucleotide comparison
    sc_mismatch         Mismatch penalty for nucleotide-nucleotide comparison
    num_hits            Number of hits to the database.  (int)
    num_sequences       Number of sequences.  (int)
    num_good_extends    Number of extensions.  (int)
    num_seqs_better_e   Number of sequences better than e-value.  (int)
    hsps_no_gap         Number of HSP's better, without gapping.  (int)
    hsps_prelim_gapped  Number of HSP's gapped in prelim test.  (int)
    hsps_prelim_gapped_attemped  Number of HSP's attempted in prelim.  (int)
    hsps_gapped         Total number of HSP's gapped.  (int)
    query_length        Length of the query.  (int)
    query_id            Identifier of the query sequence. (str)
    database_length     Number of letters in the database.  (int)
    effective_hsp_length         Effective HSP length.  (int)
    effective_query_length       Effective length of query.  (int)
    effective_database_length    Effective length of database.  (int)
    effective_search_space       Effective search space.  (int)
    effective_search_space_used  Effective search space used.  (int)
    frameshift          Frameshift window.  Tuple of (int, float)
    threshold           Threshold.  (int)
    window_size         Window size.  (int)
    dropoff_1st_pass    Tuple of (score, bits).  (int, float)
    gap_x_dropoff       Tuple of (score, bits).  (int, float)
    gap_x_dropoff_final Tuple of (score, bits).  (int, float)
    gap_trigger         Tuple of (score, bits).  (int, float)
    blast_cutoff        Tuple of (score, bits).  (int, float)
    """
    def __init__(self):
        # Placeholder for any two-element tuple that has not been set.
        unset_pair = (None, None)

        # Scoring scheme.
        self.matrix = ''
        self.gap_penalties = unset_pair
        self.sc_match = self.sc_mismatch = None

        # Search statistics.
        self.num_hits = self.num_sequences = None
        self.num_good_extends = self.num_seqs_better_e = None

        # HSP counters.
        self.hsps_no_gap = self.hsps_prelim_gapped = None
        self.hsps_prelim_gapped_attemped = self.hsps_gapped = None

        # Query and database sizes.
        self.query_id = self.query_length = self.database_length = None
        self.effective_hsp_length = None
        self.effective_query_length = self.effective_database_length = None
        self.effective_search_space = None
        self.effective_search_space_used = None

        # Thresholds and cutoffs.
        self.frameshift = unset_pair
        self.threshold = self.window_size = None
        self.dropoff_1st_pass = unset_pair
        self.gap_x_dropoff = self.gap_x_dropoff_final = unset_pair
        self.gap_trigger = self.blast_cutoff = unset_pair
00342 
00343 #TODO - Add a friendly __str__ method to BLAST results    
class Blast(Header, DatabaseReport, Parameters):
    """Holds the complete results of a blast search.

    Members:
    descriptions        A list of Description objects.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    + members inherited from base classes

    """
    def __init__(self):
        # Initialise each base explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.descriptions, self.alignments = [], []
        self.multiple_alignment = None
00361 
class PSIBlast(Header, DatabaseReport, Parameters):
    """Holds the complete results of a blastpgp search.

    Members:
    rounds       A list of Round objects.
    converged    Whether the search converged.
    + members inherited from base classes

    """
    def __init__(self):
        # Initialise each base explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.rounds = []
        self.converged = 0
00376         self.converged = 0