Back to index

python-biopython  1.60
Public Member Functions | Private Attributes
Bio.AlignIO.EmbossIO.EmbossIterator Class Reference
Inheritance diagram for Bio.AlignIO.EmbossIO.EmbossIterator:
Inheritance graph
[legend]
Collaboration diagram for Bio.AlignIO.EmbossIO.EmbossIterator:
Collaboration graph
[legend]

List of all members.

Public Member Functions

def next

Private Attributes

 _header

Detailed Description

Emboss alignment iterator.

For reading the (pairwise) alignments from EMBOSS tools in what they
call the "pairs" and "simple" formats.

Definition at line 60 of file EmbossIO.py.


Member Function Documentation

def Bio.AlignIO.EmbossIO.EmbossIterator.next(self)

Definition at line 67 of file EmbossIO.py.

def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    One alignment consists of a header opened by a ``#====`` line, which
    must give the number of aligned sequences, their identifiers and the
    alignment length, followed by blocks of sequence lines.  Parsing stops
    at the next ``#----`` or ``#====`` line, which is kept in
    ``self._header`` so the following call can resume from it.

    Returns:
        A MultipleSeqAlignment holding one SeqRecord per identifier
        listed in the header, in header order.

    Raises:
        StopIteration: no further ``#====`` header line is found.
        ValueError: the header lacks the sequence count or length, the
            count differs from ``self.records_per_alignment``, a line
            cannot be interpreted, or a parsed sequence does not have the
            declared length.
        AssertionError: the internal consistency checks on identifiers
            and start/end coordinates fail.
    """
    handle = self.handle

    try:
        # Delimiter line saved while parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        raise StopIteration

    # Skip everything up to the line that opens the next header.
    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            raise StopIteration

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # Read the rest of this alignment header to discover the number of
    # records and their length.  startswith() is used instead of line[0]
    # because readline() returns "" at end of file; a truncated header
    # then falls through to the "missing" errors below.
    while line.startswith("#"):
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # The record identifiers follow, numbered from one.
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        elif key == "length":
            length_of_seqs = int(parts[1].strip())

        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
    and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    seq_starts = []
    index = 0  # which record the next sequence line belongs to

    # Parse the sequence lines.
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                seq_id, start = id_start
                seq, end = seq_end
                letters = seq.replace("-", "")
                if start == end:
                    # Special case, either a single letter is present,
                    # or no letters at all.
                    if letters == "":
                        start = int(start)
                    else:
                        start = int(start) - 1
                    end = int(end)
                else:
                    assert letters != ""
                    start = int(start) - 1  # python counting
                    end = int(end)

                # The identifier on a sequence line may be truncated.
                assert 0 <= index and index < number_of_seqs, \
                       "Expected index %i in range [0,%i)" \
                       % (index, number_of_seqs)
                assert ids[index].startswith(seq_id)

                if len(seq_starts) == index:
                    # First line for this record: remember its start.
                    seq_starts.append(start)

                # Check the start...
                if start == end:
                    assert letters == "", line
                else:
                    assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                        % (len(seqs[index].replace("-", "")), index, seq_id, repr(seqs[index]),
                           start, line)

                seqs[index] += seq

                # Check the end ...
                assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                        % (len(seqs[index].replace("-", "")), index, seq_id, repr(seqs[index]),
                           seq_starts[index], end, line)

                index += 1
                if index >= number_of_seqs:
                    index = 0
            # Otherwise the line is skipped (the original author's note:
            # "just a start value, this is just alignment annotation (?)").
        elif line.strip() != "":
            # Not blank and too short to be a sequence line.  Raise
            # rather than print-and-assert, so the failure is reported
            # even when assertions are disabled.
            raise ValueError("Unrecognised EMBOSS pairwise line: %r" % line)

        line = handle.readline()
        if line.rstrip() in ("#---------------------------------------",
                             "#======================================="):
            # End of alignment; keep the delimiter for the next call.
            self._header = line
            break

    # Every block of sequence lines must have covered all the records.
    assert index == 0

    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (len(ids), self.records_per_alignment))

    records = []
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse.  This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        records.append(SeqRecord(Seq(seq, self.alphabet),
                                 id=seq_id, description=seq_id))
    return MultipleSeqAlignment(records, self.alphabet)

Here is the caller graph for this function:


Member Data Documentation

Bio.AlignIO.EmbossIO.EmbossIterator._header [private]

Definition at line 197 of file EmbossIO.py.


The documentation for this class was generated from the following file: