Back to index

python-biopython  1.60
Namespaces | Functions | Variables
Bio.AlignIO Namespace Reference

Namespaces

namespace  ClustalIO
namespace  EmbossIO
namespace  FastaIO
namespace  Interfaces
namespace  NexusIO
namespace  PhylipIO
namespace  StockholmIO

Functions

def write
def _SeqIO_to_alignment_iterator
def _force_alphabet
def parse
def read
def convert
def _test

Variables

string __docformat__ = "epytext en"
dictionary _FormatToIterator
dictionary _FormatToWriter

Function Documentation

def Bio.AlignIO._force_alphabet (   alignment_iterator,
  alphabet 
) [private]
Iterate over alignments, over-riding the alphabet (PRIVATE).

Definition at line 273 of file __init__.py.

00273 
def _force_alphabet(alignment_iterator, alphabet):
    """Iterate over alignments, over-riding the alphabet (PRIVATE).

    Arguments:
     - alignment_iterator - iterable yielding alignment objects.
     - alphabet           - alphabet to stamp onto every alignment and onto
                            the sequence of every record it contains.

    Each alignment is modified in place and then yielded.

    Raises ValueError if the base alphabet found on an alignment, or on one
    of its records, is not an instance of the base alphabet class of the
    requested alphabet.
    """
    #The caller is expected to have validated the alphabet argument already
    wanted_class = _get_base_alphabet(alphabet).__class__
    clash_message = "Specified alphabet %s clashes with " \
                    "that determined from the file, %s"
    for alignment in alignment_iterator:
        #Check the alignment level alphabet first...
        alignment_base = _get_base_alphabet(alignment._alphabet)
        if not isinstance(alignment_base, wanted_class):
            raise ValueError(clash_message
                             % (repr(alphabet), repr(alignment._alphabet)))
        #...then check and replace the alphabet of each record in turn
        for entry in alignment:
            entry_base = _get_base_alphabet(entry.seq.alphabet)
            if not isinstance(entry_base, wanted_class):
                raise ValueError(clash_message
                                 % (repr(alphabet), repr(entry.seq.alphabet)))
            entry.seq.alphabet = alphabet
        alignment._alphabet = alphabet
        yield alignment

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.AlignIO._SeqIO_to_alignment_iterator (   handle,
  format,
  alphabet = None,
  seq_count = None 
) [private]
Uses Bio.SeqIO to create a MultipleSeqAlignment iterator (PRIVATE).

Arguments:
 - handle    - handle to the file.
 - format    - string describing the file format.
 - alphabet  - optional Alphabet object, useful when the sequence type
               cannot be automatically inferred from the file itself
               (e.g. fasta, phylip, clustal)
 - seq_count - Optional integer, number of sequences expected in each
               alignment.  Recommended for fasta format files.

If seq_count is omitted (default) then all the sequences in the file are
combined into a single MultipleSeqAlignment.

Definition at line 235 of file __init__.py.

00235 
def _SeqIO_to_alignment_iterator(handle, format, alphabet=None, seq_count=None):
    """Uses Bio.SeqIO to create a MultipleSeqAlignment iterator (PRIVATE).

    Arguments:
     - handle    - handle to the file.
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    If seq_count is omitted (default) then all the sequences in the file are
    combined into a single MultipleSeqAlignment.  Nothing is yielded if the
    file holds no records.

    Raises ValueError if seq_count is given and records are left over after
    the last complete batch of seq_count records.
    """
    from Bio import SeqIO
    #Internal sanity check only, the format should already have been
    #checked against the SeqIO formats before this function is used.
    assert format in SeqIO._FormatToIterator

    if seq_count:
        #Use the count to split the records into batches.
        records = []
        for record in SeqIO.parse(handle, format, alphabet):
            records.append(record)
            if len(records) == seq_count:
                yield MultipleSeqAlignment(records, alphabet)
                records = []
        if records:
            #A partial batch is left over
            raise ValueError("Check seq_count argument, not enough sequences?")
    else:
        #Must assume that there is a single alignment using all
        #the SeqRecord objects:
        records = list(SeqIO.parse(handle, format, alphabet))
        if records:
            yield MultipleSeqAlignment(records, alphabet)
    #The generator simply ends here.  An explicit "raise StopIteration" is
    #redundant, and is turned into a RuntimeError by PEP 479 (Python 3.7+).

Here is the caller graph for this function:

def Bio.AlignIO._test ( ) [private]
Run the Bio.AlignIO module's doctests.

This will try and locate the unit tests directory, and run the doctests
from there in order that the relative paths used in the examples work.

Definition at line 459 of file __init__.py.

00459 
def _test():
    """Run the Bio.AlignIO module's doctests.

    This will try and locate the unit tests directory, and run the doctests
    from there in order that the relative paths used in the examples work.

    Two locations are tried relative to the current directory, first
    "../../Tests" and then "Tests" (the latter only if "Tests/Fasta" exists).
    If neither is found nothing is run.  The original working directory is
    restored afterwards, even if running the doctests raises an exception.
    """
    import doctest
    import os
    if os.path.isdir(os.path.join("..", "..", "Tests")):
        tests_dir = os.path.join("..", "..", "Tests")
    elif os.path.isdir(os.path.join("Tests", "Fasta")):
        tests_dir = os.path.join("Tests")
    else:
        #No unit test directory found, so nothing to do
        return
    #Single argument in parentheses prints the same on Python 2 and 3
    print("Runing doctests...")
    cur_dir = os.path.abspath(os.curdir)
    os.chdir(tests_dir)
    try:
        doctest.testmod()
    finally:
        #Always go back, otherwise a failure leaves the caller stranded
        #in the tests directory.
        os.chdir(cur_dir)
    print("Done")

def Bio.AlignIO.convert (   in_file,
  in_format,
  out_file,
  out_format,
  alphabet = None 
)
Convert between two alignment files, returns number of alignments.

 - in_file - an input handle or filename
 - in_format - input file format, lower case string
 - out_file - an output handle or filename
 - out_format - output file format, lower case string
 - alphabet - optional alphabet to assume

NOTE - If you provide an output filename, it will be opened which will
overwrite any existing file without warning. This may happen even if the
conversion is aborted (e.g. an invalid out_format name is given).

Definition at line 433 of file __init__.py.

00433 
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two alignment files, returns number of alignments.

     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    NOTE - If you provide an output filename, it will be opened which will
    overwrite any existing file without warning. This may happen even if the
    conversion is aborted (e.g. an invalid out_format name is given).
    """
    #TODO - Add optimised versions of important conversions
    #For now the work is simply handed over to parse() and write()
    with as_handle(in_file, 'rU') as source:
        #No seq_count is given (None), only the optional alphabet
        parsed = parse(source, in_format, None, alphabet)

        #The output is opened before write() gets to check its arguments,
        #so a bad out_format is only reported after the file is opened.
        with as_handle(out_file, 'w') as sink:
            written = write(parsed, sink, out_format)

    return written

Here is the call graph for this function:

def Bio.AlignIO.parse (   handle,
  format,
  seq_count = None,
  alphabet = None 
)
Iterate over an alignment file as MultipleSeqAlignment objects.

Arguments:
 - handle    - handle to the file, or the filename as a string
               (note older versions of Biopython only took a handle).
 - format    - string describing the file format.
 - alphabet  - optional Alphabet object, useful when the sequence type
               cannot be automatically inferred from the file itself
               (e.g. fasta, phylip, clustal)
 - seq_count - Optional integer, number of sequences expected in each
               alignment.  Recommended for fasta format files.

If you have the file name in a string 'filename', use:

>>> from Bio import AlignIO
>>> filename = "Emboss/needle.txt"
>>> format = "emboss"
>>> for alignment in AlignIO.parse(filename, format):
...     print "Alignment of length", alignment.get_alignment_length()
Alignment of length 124
Alignment of length 119
Alignment of length 120
Alignment of length 118
Alignment of length 125

If you have a string 'data' containing the file contents, use:

from Bio import AlignIO
from StringIO import StringIO
my_iterator = AlignIO.parse(StringIO(data), format)

Use the Bio.AlignIO.read() function when you expect a single record only.

Definition at line 293 of file __init__.py.

00293 
def parse(handle, format, seq_count=None, alphabet=None):
    """Iterate over an alignment file as MultipleSeqAlignment objects.

    Arguments:
     - handle    - handle to the file, or the filename as a string
                   (note older versions of Biopython only took a handle).
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    This is a generator function, so the argument checks described below
    only run once the first alignment is requested.

    Raises TypeError if format is not a string or seq_count is not an
    integer, and ValueError if format is empty, not lower case or unknown,
    or if alphabet is not an Alphabet or AlphabetEncoder object.

    If you have the file name in a string 'filename', use:

    >>> from Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> for alignment in AlignIO.parse(filename, format):
    ...     print "Alignment of length", alignment.get_alignment_length()
    Alignment of length 124
    Alignment of length 119
    Alignment of length 120
    Alignment of length 118
    Alignment of length 125

    If you have a string 'data' containing the file contents, use:

    from Bio import AlignIO
    from StringIO import StringIO
    my_iterator = AlignIO.parse(StringIO(data), format)

    Use the Bio.AlignIO.read() function when you expect a single record only.
    """
    from Bio import SeqIO

    #Check the arguments up front in order to give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None:
        if not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
            raise ValueError("Invalid alphabet, %s" % repr(alphabet))
    if not (seq_count is None or isinstance(seq_count, int)):
        raise TypeError("Need integer for seq_count (sequences per alignment)")

    with as_handle(handle, 'rU') as stream:
        if format in _FormatToIterator:
            #One of the alignment specific parsers handles this format
            make_iterator = _FormatToIterator[format]
            if alphabet is not None:
                try:
                    #Optimistically pass the alphabet straight through...
                    alignments = make_iterator(stream, seq_count,
                                               alphabet=alphabet)
                except TypeError:
                    #...the parser does not take an alphabet argument, so
                    #apply the alphabet to its output afterwards instead.
                    alignments = _force_alphabet(
                        make_iterator(stream, seq_count), alphabet)
            else:
                alignments = make_iterator(stream, seq_count)
        elif format in SeqIO._FormatToIterator:
            #Let the existing SeqIO parser do the dirty work
            alignments = _SeqIO_to_alignment_iterator(stream, format,
                                                      alphabet=alphabet,
                                                      seq_count=seq_count)
        else:
            raise ValueError("Unknown format '%s'" % format)

        #Yield from inside the with block, so the handle stays under the
        #control of as_handle for as long as alignments are being read.
        for alignment in alignments:
            yield alignment

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.AlignIO.read (   handle,
  format,
  seq_count = None,
  alphabet = None 
)
Turns an alignment file into a single MultipleSeqAlignment object.

Arguments:
 - handle    - handle to the file, or the filename as a string
               (note older versions of Biopython only took a handle).
 - format    - string describing the file format.
 - alphabet  - optional Alphabet object, useful when the sequence type
               cannot be automatically inferred from the file itself
               (e.g. fasta, phylip, clustal)
 - seq_count - Optional integer, number of sequences expected in each
               alignment.  Recommended for fasta format files.

If the handle contains no alignments, or more than one alignment,
an exception is raised.  For example, using a Clustal W file
containing one alignment:

>>> from Bio import AlignIO
>>> filename = "Clustalw/protein.aln"
>>> format = "clustal"
>>> alignment = AlignIO.read(filename, format)
>>> print "Alignment of length", alignment.get_alignment_length()
Alignment of length 411

If however you want the first alignment from a file containing
multiple alignments this function would raise an exception.

>>> from Bio import AlignIO
>>> filename = "Emboss/needle.txt"
>>> format = "emboss"
>>> alignment = AlignIO.read(filename, format)
Traceback (most recent call last):
    ...
ValueError: More than one record found in handle

Instead use:

>>> from Bio import AlignIO
>>> filename = "Emboss/needle.txt"
>>> format = "emboss"
>>> alignment = AlignIO.parse(filename, format).next()
>>> print "First alignment has length", alignment.get_alignment_length()
First alignment has length 124

You must use the Bio.AlignIO.parse() function if you want to read multiple
records from the handle.

Definition at line 369 of file __init__.py.

00369 
def read(handle, format, seq_count=None, alphabet=None):
    """Turns an alignment file into a single MultipleSeqAlignment object.

    Arguments:
     - handle    - handle to the file, or the filename as a string
                   (note older versions of Biopython only took a handle).
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    If the handle contains no alignments, or more than one alignment,
    a ValueError is raised.  For example, using a Clustal W file
    containing one alignment:

    >>> from Bio import AlignIO
    >>> filename = "Clustalw/protein.aln"
    >>> format = "clustal"
    >>> alignment = AlignIO.read(filename, format)
    >>> print "Alignment of length", alignment.get_alignment_length()
    Alignment of length 411

    If however you want the first alignment from a file containing
    multiple alignments this function would raise an exception.

    >>> from Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> alignment = AlignIO.read(filename, format)
    Traceback (most recent call last):
        ...
    ValueError: More than one record found in handle

    Instead use:

    >>> from Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> alignment = AlignIO.parse(filename, format).next()
    >>> print "First alignment has length", alignment.get_alignment_length()
    First alignment has length 124

    You must use the Bio.AlignIO.parse() function if you want to read multiple
    records from the handle.
    """
    alignments = parse(handle, format, seq_count, alphabet)

    def next_or_none():
        """Return the next alignment, or None once there are no more."""
        try:
            return alignments.next()
        except StopIteration:
            return None

    #There must be a first alignment...
    only = next_or_none()
    if only is None:
        raise ValueError("No records found in handle")
    #...and there must not be a second one.
    if next_or_none() is not None:
        raise ValueError("More than one record found in handle")
    #Sanity check, only meaningful when a (non-zero) seq_count was given
    assert not seq_count or len(only)==seq_count
    return only

Here is the call graph for this function:

def Bio.AlignIO.write (   alignments,
  handle,
  format 
)
Write complete set of alignments to a file.

Arguments:
 - alignments - A list (or iterator) of Alignment objects (ideally the
               new MultipleSeqAlignment objects), or (if using Biopython
               1.54 or later) a single alignment object.
 - handle    - File handle object to write to, or filename as string
               (note older versions of Biopython only took a handle).
 - format    - lower case string describing the file format to write.

You should close the handle after calling this function.

Returns the number of alignments written (as an integer).

Definition at line 178 of file __init__.py.

00178 
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
                   new MultipleSeqAlignment objects), or (if using Biopython
                   1.54 or later) a single alignment object.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).

    Raises TypeError if format is not a string, or (for formats written via
    Bio.SeqIO) if an item in alignments is not an Alignment object.  Raises
    ValueError if format is empty, not lower case, can be read but not
    written, or is unknown.
    """
    from Bio import SeqIO

    #Check the format argument first in order to give helpful errors:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(alignments, Alignment):
        #A single alignment is accepted too (older versions of Biopython
        #raised an exception here), treat it as a list of one.
        alignments = [alignments]

    with as_handle(handle, 'w') as stream:
        if format in _FormatToWriter:
            #An alignment specific writer class exists for this format
            count = _FormatToWriter[format](stream).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            #Let the existing SeqIO writer do the dirty work, one call
            #per alignment so that the alignments can be counted.
            #TODO - Can we make one call to SeqIO.write() and count the alignments?
            count = 0
            for item in alignments:
                if not isinstance(item, Alignment):
                    raise TypeError(
                        "Expect a list or iterator of Alignment objects.")
                SeqIO.write(item, stream, format)
                count += 1
        elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

    assert isinstance(count, int), (
        "Internal error - the underlying %s "
        "writer should have returned the alignment count, not %s"
        % (format, repr(count)))

    return count
00234 
#This is a generator function!

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

string Bio.AlignIO.__docformat__ = "epytext en"

Definition at line 126 of file __init__.py.

dictionary Bio.AlignIO._FormatToIterator

Initial value:
00001 {#"fasta" is done via Bio.SeqIO
00002                      "clustal" : ClustalIO.ClustalIterator,
00003                      "emboss" : EmbossIO.EmbossIterator,
00004                      "fasta-m10" : FastaIO.FastaM10Iterator,
00005                      "nexus" : NexusIO.NexusIterator,
00006                      "phylip" : PhylipIO.PhylipIterator,
00007                      "phylip-sequential" : PhylipIO.SequentialPhylipIterator,
00008                      "phylip-relaxed" : PhylipIO.RelaxedPhylipIterator,
00009                      "stockholm" : StockholmIO.StockholmIterator,
00010                      }

Definition at line 157 of file __init__.py.

dictionary Bio.AlignIO._FormatToWriter

Initial value:
00001 {#"fasta" is done via Bio.SeqIO
00002                    #"emboss" : EmbossIO.EmbossWriter, (unfinished)
00003                    "nexus" : NexusIO.NexusWriter,
00004                    "phylip" : PhylipIO.PhylipWriter,
00005                    "phylip-sequential" : PhylipIO.SequentialPhylipWriter,
00006                    "phylip-relaxed" : PhylipIO.RelaxedPhylipWriter,
00007                    "stockholm" : StockholmIO.StockholmWriter,
00008                    "clustal" : ClustalIO.ClustalWriter,
00009                    }

Definition at line 168 of file __init__.py.