Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2004 by James Casbon.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """
00007 Code to deal with COMPASS output, a program for profile/profile comparison.
00008 
00009 Compass is described in:
00010 
00011 Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
00012 alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
00013 7;326(1):317-36.
00014 
00015 Tested with COMPASS 1.24.
00016 
00017 Functions:
00018 read          Reads a COMPASS file containing one COMPASS record
00019 parse         Iterates over records in a COMPASS file.
00020 
00021 Classes:
00022 Record        One result of a COMPASS file
00023 """
00024 import re
00025 
00026 def read(handle):
00027     record = None
00028     try:
00029         line = handle.next()
00030         record = Record()
00031         __read_names(record, line)
00032         line = handle.next()
00033         __read_threshold(record, line)
00034         line = handle.next()
00035         __read_lengths(record, line)
00036         line = handle.next()
00037         __read_profilewidth(record, line)
00038         line = handle.next()
00039         __read_scores(record, line)
00040     except StopIteration:
00041         if not record:
00042             raise ValueError("No record found in handle")
00043         else:
00044             raise ValueError("Unexpected end of stream.")
00045     for line in handle:
00046         if not line.strip(): # skip empty lines
00047             continue
00048         __read_query_alignment(record, line)
00049         try:
00050             line = handle.next()
00051             __read_positive_alignment(record, line)
00052             line = handle.next()
00053             __read_hit_alignment(record, line)
00054         except StopIteration:
00055             raise ValueError("Unexpected end of stream.")
00056     return record
00057 
00058 def parse(handle):
00059     record = None
00060     try:
00061         line = handle.next()
00062     except StopIteration:
00063         return
00064     while True:
00065         try:
00066             record = Record()
00067             __read_names(record, line)
00068             line = handle.next()
00069             __read_threshold(record, line)
00070             line = handle.next()
00071             __read_lengths(record, line)
00072             line = handle.next()
00073             __read_profilewidth(record, line)
00074             line = handle.next()
00075             __read_scores(record, line)
00076         except StopIteration:
00077             raise ValueError("Unexpected end of stream.")
00078         for line in handle:
00079             if not line.strip():
00080                 continue
00081             if "Ali1:" in line:
00082                 yield record
00083                 break
00084             __read_query_alignment(record, line)
00085             try:
00086                 line = handle.next()
00087                 __read_positive_alignment(record, line)
00088                 line = handle.next()
00089                 __read_hit_alignment(record, line)
00090             except StopIteration:
00091                 raise ValueError("Unexpected end of stream.")
00092         else:
00093             yield record
00094             break
00095 
00096 class Record(object):
00097     """
00098     Hold information from one compass hit.
00099     Ali1 one is the query, Ali2 the hit.
00100     """
00101 
00102     def __init__(self):
00103         self.query=''
00104         self.hit=''
00105         self.gap_threshold=0
00106         self.query_length=0
00107         self.query_filtered_length=0
00108         self.query_nseqs=0
00109         self.query_neffseqs=0
00110         self.hit_length=0
00111         self.hit_filtered_length=0
00112         self.hit_nseqs=0
00113         self.hit_neffseqs=0
00114         self.sw_score=0
00115         self.evalue=-1
00116         self.query_start=-1
00117         self.hit_start=-1
00118         self.query_aln=''
00119         self.hit_aln=''
00120         self.positives=''
00121 
00122 
00123     def query_coverage(self):
00124         """Return the length of the query covered in alignment"""
00125         s = self.query_aln.replace("=", "")
00126         return len(s)
00127 
00128     def hit_coverage(self):
00129         """Return the length of the hit covered in the alignment"""
00130         s = self.hit_aln.replace("=", "")
00131         return len(s)
00132 
00133 # Everything below is private
00134 
00135 __regex = {"names": re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"),
00136            "threshold": re.compile("Threshold of effective gap content in columns: (\S+)"),
00137            "lengths": re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)\s+filtered_length2=(\S+)"),
00138            "profilewidth": re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)"),
00139            "scores": re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)"),
00140            "start": re.compile("(\d+)"),
00141            "align": re.compile("^.{15}(\S+)"),
00142            "positive_alignment": re.compile("^.{15}(.+)"),
00143           }
00144 
00145 def __read_names(record, line):
00146     """
00147     Ali1: 60456.blo.gz.aln  Ali2: allscop//14984.blo.gz.aln
00148           ------query-----        -------hit-------------
00149     """        
00150     if not "Ali1:" in line:
00151         raise ValueError("Line does not contain 'Ali1:':\n%s" % line)
00152     m = __regex["names"].search(line)
00153     record.query = m.group(1)
00154     record.hit = m.group(2)
00155 
00156 def __read_threshold(record,line):
00157     if not line.startswith("Threshold"):
00158         raise ValueError("Line does not start with 'Threshold':\n%s" % line)
00159     m = __regex["threshold"].search(line)
00160     record.gap_threshold = float(m.group(1))
00161 
00162 def __read_lengths(record, line):
00163     if not line.startswith("length1="):
00164         raise ValueError("Line does not start with 'length1=':\n%s" % line)
00165     m = __regex["lengths"].search(line)
00166     record.query_length = int(m.group(1))
00167     record.query_filtered_length = float(m.group(2))
00168     record.hit_length = int(m.group(3))
00169     record.hit_filtered_length = float(m.group(4))
00170 
00171 def __read_profilewidth(record, line):
00172     if not "Nseqs1" in line:
00173         raise ValueError("Line does not contain 'Nseqs1':\n%s" % line)
00174     m = __regex["profilewidth"].search(line)
00175     record.query_nseqs = int(m.group(1))
00176     record.query_neffseqs = float(m.group(2))
00177     record.hit_nseqs = int(m.group(3))
00178     record.hit_neffseqs = float(m.group(4))
00179 
00180 def __read_scores(record, line):
00181     if not line.startswith("Smith-Waterman"):
00182         raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line)
00183     m = __regex["scores"].search(line)
00184     if m:
00185         record.sw_score = int(m.group(1))
00186         record.evalue = float(m.group(2))
00187     else:
00188         record.sw_score = 0
00189         record.evalue = -1.0
00190 
00191 def __read_query_alignment(record, line):
00192     m = __regex["start"].search(line)
00193     if m:
00194         record.query_start = int(m.group(1))
00195     m = __regex["align"].match(line)
00196     assert m!=None, "invalid match"
00197     record.query_aln += m.group(1)
00198 
00199 def __read_positive_alignment(record, line):
00200     m = __regex["positive_alignment"].match(line)
00201     assert m!=None, "invalid match"
00202     record.positives += m.group(1)
00203 
00204 def __read_hit_alignment(record, line):
00205     m = __regex["start"].search(line)
00206     if m:
00207         record.hit_start = int(m.group(1))
00208     m = __regex["align"].match(line)
00209     assert m!=None, "invalid match"
00210     record.hit_aln += m.group(1)