Back to index

python-biopython  1.60
NexusIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2010 by Peter Cock.  All rights reserved.
00002 #
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """
00007 Bio.AlignIO support for the "nexus" file format.
00008 
00009 You are expected to use this module via the Bio.AlignIO functions (or the
00010 Bio.SeqIO functions if you want to work directly with the gapped sequences).
00011 
00012 See also the Bio.Nexus module (which this code calls internally),
00013 as this offers more than just accessing the alignment or its
00014 sequences as SeqRecord objects.
00015 """
00016 
00017 from Bio.Seq import Seq
00018 from Bio.SeqRecord import SeqRecord  
00019 from Bio.Nexus import Nexus
00020 from Bio.Align import MultipleSeqAlignment
00021 from Interfaces import AlignmentWriter
00022 from Bio import Alphabet
00023 
00024 #You can get a couple of example files here:
00025 #http://www.molecularevolution.org/resources/fileformats/
00026     
00027 #This is a generator function!
00028 def NexusIterator(handle, seq_count=None):
00029     """Returns SeqRecord objects from a Nexus file.
00030 
00031     Thus uses the Bio.Nexus module to do the hard work.
00032 
00033     You are expected to call this function via Bio.SeqIO or Bio.AlignIO
00034     (and not use it directly).
00035 
00036     NOTE - We only expect ONE alignment matrix per Nexus file,
00037     meaning this iterator will only yield one MultipleSeqAlignment.
00038     """
00039     n = Nexus.Nexus(handle)
00040     if not n.matrix:
00041         #No alignment found
00042         raise StopIteration
00043 
00044     #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
00045     #The original names and the modified names are kept in these two lists:
00046     assert len(n.unaltered_taxlabels) == len(n.taxlabels)
00047     
00048     if seq_count and seq_count != len(n.unaltered_taxlabels):
00049         raise ValueError("Found %i sequences, but seq_count=%i" \
00050                % (len(n.unaltered_taxlabels), seq_count))
00051 
00052     #ToDo - Can we extract any annotation too?
00053     records = (SeqRecord(n.matrix[new_name], id=new_name, \
00054                          name=old_name, description="") \
00055                for old_name, new_name \
00056                in zip (n.unaltered_taxlabels, n.taxlabels))
00057     #All done
00058     yield MultipleSeqAlignment(records, n.alphabet)
00059 
00060 class NexusWriter(AlignmentWriter):
00061     """Nexus alignment writer.
00062 
00063     Note that Nexus files are only expected to hold ONE alignment
00064     matrix.
00065 
00066     You are expected to call this class via the Bio.AlignIO.write() or
00067     Bio.SeqIO.write() functions.
00068     """
00069     def write_file(self, alignments):
00070         """Use this to write an entire file containing the given alignments.
00071 
00072         alignments - A list or iterator returning MultipleSeqAlignment objects.
00073                      This should hold ONE and only one alignment.
00074         """
00075         align_iter = iter(alignments) #Could have been a list
00076         try:
00077             first_alignment = align_iter.next()
00078         except StopIteration:
00079             first_alignment = None
00080         if first_alignment is None:
00081             #Nothing to write!
00082             return 0
00083         
00084         #Check there is only one alignment...
00085         try:
00086             second_alignment = align_iter.next()
00087         except StopIteration:
00088             second_alignment = None
00089         if second_alignment is not None:
00090             raise ValueError("We can only write one Alignment to a Nexus file.")
00091 
00092         #Good.  Actually write the single alignment,
00093         self.write_alignment(first_alignment)
00094         return 1 #we only support writing one alignment!
00095 
00096     def write_alignment(self, alignment):
00097         #Creates an empty Nexus object, adds the sequences,
00098         #and then gets Nexus to prepare the output.
00099         if len(alignment) == 0:
00100             raise ValueError("Must have at least one sequence")
00101         if alignment.get_alignment_length() == 0:
00102             raise ValueError("Non-empty sequences are required")
00103         minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \
00104                          + "format datatype=%s; end;"  \
00105                          % self._classify_alphabet_for_nexus(alignment._alphabet)
00106         n = Nexus.Nexus(minimal_record)
00107         n.alphabet = alignment._alphabet
00108         for record in alignment:
00109             n.add_sequence(record.id, record.seq.tostring())
00110         n.write_nexus_data(self.handle)
00111     
00112     def _classify_alphabet_for_nexus(self, alphabet):
00113         """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE).
00114 
00115         Raises an exception if this is not possible."""
00116         #Get the base alphabet (underneath any Gapped or StopCodon encoding)
00117         a = Alphabet._get_base_alphabet(alphabet)
00118 
00119         if not isinstance(a, Alphabet.Alphabet):
00120             raise TypeError("Invalid alphabet")
00121         elif isinstance(a, Alphabet.ProteinAlphabet):
00122             return "protein"
00123         elif isinstance(a, Alphabet.DNAAlphabet):
00124             return "dna"
00125         elif isinstance(a, Alphabet.RNAAlphabet):
00126             return "rna"
00127         else:
00128             #Must be something like NucleotideAlphabet or
00129             #just the generic Alphabet (default for fasta files)
00130             raise ValueError("Need a DNA, RNA or Protein alphabet")
00131 
00132 if __name__ == "__main__":
00133     from StringIO import StringIO
00134     print "Quick self test"
00135     print
00136     print "Repeated names without a TAXA block"
00137     handle = StringIO("""#NEXUS
00138     [TITLE: NoName]
00139 
00140     begin data;
00141     dimensions ntax=4 nchar=50;
00142     format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
00143 
00144     matrix
00145     CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 
00146     ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 
00147     CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
00148     CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
00149     ;
00150     end; 
00151     """)
00152     for a in NexusIterator(handle):
00153         print a
00154         for r in a:
00155             print repr(r.seq), r.name, r.id
00156     print "Done"
00157 
00158     print
00159     print "Repeated names with a TAXA block"
00160     handle = StringIO("""#NEXUS
00161     [TITLE: NoName]
00162 
00163     begin taxa
00164     CYS1_DICDI
00165     ALEU_HORVU
00166     CATH_HUMAN
00167     CYS1_DICDI;
00168     end;
00169 
00170     begin data;
00171     dimensions ntax=4 nchar=50;
00172     format interleave datatype=protein   gap=- symbols="FSTNKEYVQMCLAWPHDRIG";
00173 
00174     matrix
00175     CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 
00176     ALEU_HORVU          MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 
00177     CATH_HUMAN          ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK----
00178     CYS1_DICDI          -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X
00179     ;
00180     end; 
00181     """)
00182     for a in NexusIterator(handle):
00183         print a
00184         for r in a:
00185             print repr(r.seq), r.name, r.id
00186     print "Done"
00187     print
00188     print "Reading an empty file"
00189     assert 0 == len(list(NexusIterator(StringIO())))
00190     print "Done"
00191     print
00192     print "Writing..."
00193     
00194     handle = StringIO()
00195     NexusWriter(handle).write_file([a])
00196     handle.seek(0)
00197     print handle.read()
00198 
00199     handle = StringIO()
00200     try:
00201         NexusWriter(handle).write_file([a,a])
00202         assert False, "Should have rejected more than one alignment!"
00203     except ValueError:
00204         pass