Back to index

python-biopython  1.60
Public Member Functions | Private Attributes
Bio.AlignIO.ClustalIO.ClustalIterator Class Reference
Inheritance diagram for Bio.AlignIO.ClustalIO.ClustalIterator:
Inheritance graph
[legend]
Collaboration diagram for Bio.AlignIO.ClustalIO.ClustalIterator:
Collaboration graph
[legend]

List of all members.

Public Member Functions

def next

Private Attributes

 _header

Detailed Description

Clustalw alignment iterator.

Definition at line 81 of file ClustalIO.py.


Member Function Documentation

def Bio.AlignIO.ClustalIO.ClustalIterator.next(self)

Definition at line 84 of file ClustalIO.py.

00084 
def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    One alignment consists of a header line whose first word is a known
    program name, followed by one or more blocks of
    ``identifier sequence [letter-count]`` lines.  Each block may be
    followed by a consensus line (a line starting with a space).  When a
    further known header is met while reading blocks, that line is kept in
    ``self._header`` so that the following call starts from it.

    Returns:
        A ``MultipleSeqAlignment`` built from the parsed records.  The
        version found in the header (if any) is stored on it as
        ``_version`` and the concatenated consensus (if any) as
        ``_star_info``.

    Raises:
        StopIteration: at end of input, or when a header is not followed
            by any sequence lines.
        ValueError: when the header is unknown or the alignment body is
            malformed (bad field count, identifiers out of order, wrong
            letter count, misplaced consensus, truncated block, or a
            record count different from ``self.records_per_alignment``).
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        raise StopIteration

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
    header_words = line.split()
    # A whitespace-only first line has no first word; report it as a bad
    # header instead of failing with IndexError.
    first_word = header_words[0] if header_words else repr(line)
    if first_word not in known_headers:
        raise ValueError("%s is not a known CLUSTAL header: %s"
                         % (first_word, ", ".join(known_headers)))

    # Find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
        # "()" strips down to an empty string, so test for that first.
        if word and word[0] in '0123456789':
            version = word
            break

    # There should be two blank lines after the header line.  readline()
    # returns "" at end of file, so stop there rather than looping forever.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header with no alignment body: nothing to return.
        raise StopIteration

    # Identifiers and sequences are kept in parallel lists and matched by
    # position, so entries sharing an identifier stay separate records.
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Columns holding the sequence; used for the consensus

    # Use the first block to get the sequence identifiers
    while line:
        if line[0] != " " and line.strip() != "":
            # Sequence identifier line...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) \
                        + line[len(fields[0]):].find(fields[1])
                seq_cols = slice(start, start + len(fields[1]))
            if fields[1] != line[seq_cols]:
                raise ValueError("Could not parse line, sequence is not in "
                                 "the expected columns:\n%s" % line)

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            if not ids:
                raise ValueError("Unexpected line:\n%s" % repr(line))
            consensus = line[seq_cols]
            if line[:seq_cols.start].strip() or line[seq_cols.stop:].strip():
                raise ValueError("Consensus line extends outside the "
                                 "sequence columns:\n%s" % repr(line))
            # Check for blank line (or end of file)
            line = handle.readline()
            if line.strip() != "":
                raise ValueError("Expected a blank line after the "
                                 "consensus, got:\n%s" % repr(line))
            break
        else:
            # Blank line: end of the block, no consensus
            break
        line = handle.readline()

    # Internal invariants: every way out of the loop above leaves a blank
    # (or empty) line and at least one parsed sequence line.
    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        if len(s) != len(seqs[0]):
            raise ValueError("Sequences in the first block differ in length")
    if consensus and len(consensus) != len(seqs[0]):
        raise ValueError("Consensus length is %i, sequence length is %i"
                         % (len(consensus), len(seqs[0])))

    # Loop over any remaining blocks...
    while True:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment; keep its header for next time.
            self._header = line
            break

        for i in range(len(ids)):
            if not line:
                # File ended part way through a block.
                raise ValueError("Unexpected end of file, expected a line "
                                 "for '%s'" % ids[i])
            if line[0] == " ":
                raise ValueError("Unexpected line:\n%s" % repr(line))
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError("Identifiers out of order? Got '%s' but expected '%s'"
                                 % (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # Typically the final, shorter block: same start column
                # but a different end column.
                start = len(fields[0]) \
                        + line[len(fields[0]):].find(fields[1])
                if start != seq_cols.start:
                    raise ValueError('Old location %s -> %i:XX'
                                     % (seq_cols, start))
                seq_cols = slice(start, start + len(fields[1]))

            # Append the sequence
            seqs[i] += fields[1]
            if len(seqs[i]) != len(seqs[0]):
                raise ValueError("Sequence for '%s' has length %i, "
                                 "expected %i"
                                 % (ids[i], len(seqs[i]), len(seqs[0])))

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)

            # Read in the next line
            line = handle.readline()

        # There should now be a consensus line
        if consensus:
            # line[:1] is "" at end of file, so this also catches truncation.
            if line[:1] != " ":
                raise ValueError("Expected a consensus line, got:\n%s"
                                 % repr(line))
            consensus += line[seq_cols]
            if len(consensus) != len(seqs[0]):
                raise ValueError("Consensus length is %i, sequence length "
                                 "is %i" % (len(consensus), len(seqs[0])))
            if line[:seq_cols.start].strip() or line[seq_cols.stop:].strip():
                raise ValueError("Consensus line extends outside the "
                                 "sequence columns:\n%s" % repr(line))
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        raise StopIteration

    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    records = (SeqRecord(Seq(seq, self.alphabet), id=ident, description=ident)
               for (ident, seq) in zip(ids, seqs))
    alignment = MultipleSeqAlignment(records, self.alphabet)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        alignment_length = len(seqs[0])
        # Already validated block by block above; kept as a sanity check.
        assert len(consensus) == alignment_length, \
               "Alignment length is %i, consensus length is %i, '%s'" \
               % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment

Here is the call graph for this function:

Here is the caller graph for this function:


Member Data Documentation

Bio.AlignIO.ClustalIO.ClustalIterator._header [private]

Definition at line 196 of file ClustalIO.py.


The documentation for this class was generated from the following file: