Back to index

python-biopython  1.60
Namespaces | Functions | Variables
Bio.SeqIO Namespace Reference

Namespaces

namespace  _convert
namespace  _index
namespace  AbiIO
namespace  AceIO
namespace  FastaIO
namespace  IgIO
namespace  InsdcIO
namespace  Interfaces
namespace  PhdIO
namespace  PirIO
namespace  QualityIO
namespace  SeqXmlIO
namespace  SffIO
namespace  SwissIO
namespace  TabIO
namespace  UniprotIO

Functions

def write
def parse
def _force_alphabet
def read
def to_dict
def index
def index_db
def convert
def _test

Variables

string __docformat__ = "epytext en"
dictionary _FormatToIterator
dictionary _FormatToWriter
list _BinaryFormats = ["sff", "sff-trim", "abi", "abi-trim"]

Function Documentation

def Bio.SeqIO._force_alphabet (   record_iterator,
  alphabet 
) [private]
Iterate over records, over-riding the alphabet (PRIVATE).

Definition at line 540 of file __init__.py.

00540 
00541 def _force_alphabet(record_iterator, alphabet):
00542     """Iterate over records, over-riding the alphabet (PRIVATE)."""
00543     #Assume the alphabet argument has been pre-validated
00544     given_base_class = _get_base_alphabet(alphabet).__class__
00545     for record in record_iterator:
00546         if isinstance(_get_base_alphabet(record.seq.alphabet),
00547                       given_base_class):
00548             record.seq.alphabet = alphabet
00549             yield record
00550         else:
00551             raise ValueError("Specified alphabet %s clashes with "\
00552                              "that determined from the file, %s" \
00553                              % (repr(alphabet), repr(record.seq.alphabet)))

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO._test ( ) [private]
Run the Bio.SeqIO module's doctests.

This will try and locate the unit tests directory, and run the doctests
from there in order that the relative paths used in the examples work.

Definition at line 915 of file __init__.py.

00915 
00916 def _test():
00917     """Run the Bio.SeqIO module's doctests.
00918 
00919     This will try and locate the unit tests directory, and run the doctests
00920     from there in order that the relative paths used in the examples work.
00921     """
00922     import doctest
00923     import os
00924     if os.path.isdir(os.path.join("..", "..", "Tests")):
00925         print "Runing doctests..."
00926         cur_dir = os.path.abspath(os.curdir)
00927         os.chdir(os.path.join("..", "..", "Tests"))
00928         doctest.testmod()
00929         os.chdir(cur_dir)
00930         del cur_dir
00931         print "Done"
00932     elif os.path.isdir(os.path.join("Tests", "Fasta")):
00933         print "Runing doctests..."
00934         cur_dir = os.path.abspath(os.curdir)
00935         os.chdir(os.path.join("Tests"))
00936         doctest.testmod()
00937         os.chdir(cur_dir)
00938         del cur_dir
00939         print "Done"

def Bio.SeqIO.convert (   in_file,
  in_format,
  out_file,
  out_format,
  alphabet = None 
)
Convert between two sequence file formats, return number of records.

 - in_file - an input handle or filename
 - in_format - input file format, lower case string
 - out_file - an output handle or filename
 - out_format - output file format, lower case string
 - alphabet - optional alphabet to assume

NOTE - If you provide an output filename, it will be opened which will
overwrite any existing file without warning. This may happen even if
the conversion is aborted (e.g. an invalid out_format name is given).

For example, going from a filename to a handle:

>>> from Bio import SeqIO
>>> from StringIO import StringIO
>>> handle = StringIO("")
>>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
3
>>> print handle.getvalue()
>EAS54_6_R1_2_1_413_324
CCCTTCTTGTCTTCAGCGTTTCTCC
>EAS54_6_R1_2_1_540_792
TTGGCAGGCCAAGGCCGATGGATCA
>EAS54_6_R1_2_1_443_348
GTTGCTTCTGGCGTGGGTGGGGGGG
<BLANKLINE>

Definition at line 864 of file __init__.py.

00864 
00865 def convert(in_file, in_format, out_file, out_format, alphabet=None):
00866     """Convert between two sequence file formats, return number of records.
00867 
00868      - in_file - an input handle or filename
00869      - in_format - input file format, lower case string
00870      - out_file - an output handle or filename
00871      - out_format - output file format, lower case string
00872      - alphabet - optional alphabet to assume
00873 
00874     NOTE - If you provide an output filename, it will be opened which will
00875     overwrite any existing file without warning. This may happen if even
00876     the conversion is aborted (e.g. an invalid out_format name is given).
00877 
00878     For example, going from a filename to a handle:
00879 
00880     >>> from Bio import SeqIO
00881     >>> from StringIO import StringIO
00882     >>> handle = StringIO("")
00883     >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
00884     3
00885     >>> print handle.getvalue()
00886     >EAS54_6_R1_2_1_413_324
00887     CCCTTCTTGTCTTCAGCGTTTCTCC
00888     >EAS54_6_R1_2_1_540_792
00889     TTGGCAGGCCAAGGCCGATGGATCA
00890     >EAS54_6_R1_2_1_443_348
00891     GTTGCTTCTGGCGTGGGTGGGGGGG
00892     <BLANKLINE>
00893     """
00894     #Hack for SFF, will need to make this more general in future
00895     if in_format in _BinaryFormats :
00896         in_mode = 'rb'
00897     else :
00898         in_mode = 'rU'
00899 
00900     #Don't open the output file until we've checked the input is OK?
00901     if out_format in ["sff", "sff_trim"] :
00902         out_mode = 'wb'
00903     else :
00904         out_mode = 'w'
00905 
00906     #This will check the arguments and issue error messages,
00907     #after we have opened the file which is a shame.
00908     from _convert import _handle_convert #Lazy import
00909     with as_handle(in_file, in_mode) as in_handle:
00910         with as_handle(out_file, out_mode) as out_handle:
00911             count = _handle_convert(in_handle, in_format,
00912                                     out_handle, out_format,
00913                                     alphabet)
00914     return count

Here is the call graph for this function:

def Bio.SeqIO.index (   filename,
  format,
  alphabet = None,
  key_function = None 
)
Indexes a sequence file and returns a dictionary like object.

 - filename - string giving name of file to be indexed
 - format   - lower case string describing the file format
 - alphabet - optional Alphabet object, useful when the sequence type
              cannot be automatically inferred from the file itself
              (e.g. format="fasta" or "tab")
 - key_function - Optional callback function which when given a
              SeqRecord identifier string should return a unique
              key for the dictionary.

This indexing function will return a dictionary like object, giving the
SeqRecord objects as values:

>>> from Bio import SeqIO
>>> records = SeqIO.index("Quality/example.fastq", "fastq")
>>> len(records)
3
>>> sorted(records)
['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
>>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
>EAS54_6_R1_2_1_540_792
TTGGCAGGCCAAGGCCGATGGATCA
<BLANKLINE>
>>> "EAS54_6_R1_2_1_540_792" in records
True
>>> print records.get("Missing", None)
None

If the file is BGZF compressed, this is detected automatically. Ordinary
GZIP files are not supported:

>>> from Bio import SeqIO
>>> records = SeqIO.index("Quality/example.fastq.bgz", "fastq")
>>> len(records)
3
>>> print records["EAS54_6_R1_2_1_540_792"].seq
TTGGCAGGCCAAGGCCGATGGATCA

Note that this pseudo dictionary will not support all the methods of a
true Python dictionary, for example values() is not defined since this
would require loading all of the records into memory at once.

When you call the index function, it will scan through the file, noting
the location of each record. When you access a particular record via the
dictionary methods, the code will jump to the appropriate part of the
file and then parse that section into a SeqRecord.

Note that not all the input formats supported by Bio.SeqIO can be used
with this index function. It is designed to work only with sequential
file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
interlaced file format (e.g. alignment formats such as "clustal").

For small files, it may be more efficient to use an in memory Python
dictionary, e.g.

>>> from Bio import SeqIO
>>> records = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fastq"), "fastq"))
>>> len(records)
3
>>> sorted(records)
['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
>>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
>EAS54_6_R1_2_1_540_792
TTGGCAGGCCAAGGCCGATGGATCA
<BLANKLINE>

As with the to_dict() function, by default the id string of each record
is used as the key. You can specify a callback function to transform
this (the record identifier string) into your preferred key. For example:

>>> from Bio import SeqIO
>>> def make_tuple(identifier):
...     parts = identifier.split("_")
...     return int(parts[-2]), int(parts[-1])
>>> records = SeqIO.index("Quality/example.fastq", "fastq",
...                       key_function=make_tuple)
>>> len(records)
3
>>> sorted(records)
[(413, 324), (443, 348), (540, 792)]
>>> print records[(540, 792)].format("fasta")
>EAS54_6_R1_2_1_540_792
TTGGCAGGCCAAGGCCGATGGATCA
<BLANKLINE>
>>> (540, 792) in records
True
>>> "EAS54_6_R1_2_1_540_792" in records
False
>>> print records.get("Missing", None)
None

Another common use case would be indexing an NCBI style FASTA file,
where you might want to extract the GI number from the FASTA identifier
to use as the dictionary key.

Notice that unlike the to_dict() function, here the key_function does
not get given the full SeqRecord to use to generate the key. Doing so
would impose a severe performance penalty as it would require the file
to be completely parsed while building the index. Right now this is
usually avoided.

See also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()

Definition at line 672 of file __init__.py.

00672 
00673 def index(filename, format, alphabet=None, key_function=None):
00674     """Indexes a sequence file and returns a dictionary like object.
00675 
00676      - filename - string giving name of file to be indexed
00677      - format   - lower case string describing the file format
00678      - alphabet - optional Alphabet object, useful when the sequence type
00679                   cannot be automatically inferred from the file itself
00680                   (e.g. format="fasta" or "tab")
00681      - key_function - Optional callback function which when given a
00682                   SeqRecord identifier string should return a unique
00683                   key for the dictionary.
00684 
00685     This indexing function will return a dictionary like object, giving the
00686     SeqRecord objects as values:
00687 
00688     >>> from Bio import SeqIO
00689     >>> records = SeqIO.index("Quality/example.fastq", "fastq")
00690     >>> len(records)
00691     3
00692     >>> sorted(records)
00693     ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
00694     >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
00695     >EAS54_6_R1_2_1_540_792
00696     TTGGCAGGCCAAGGCCGATGGATCA
00697     <BLANKLINE>
00698     >>> "EAS54_6_R1_2_1_540_792" in records
00699     True
00700     >>> print records.get("Missing", None)
00701     None
00702 
00703     If the file is BGZF compressed, this is detected automatically. Ordinary
00704     GZIP files are not supported:
00705 
00706     >>> from Bio import SeqIO
00707     >>> records = SeqIO.index("Quality/example.fastq.bgz", "fastq")
00708     >>> len(records)
00709     3
00710     >>> print records["EAS54_6_R1_2_1_540_792"].seq
00711     TTGGCAGGCCAAGGCCGATGGATCA
00712 
00713     Note that this psuedo dictionary will not support all the methods of a
00714     true Python dictionary, for example values() is not defined since this
00715     would require loading all of the records into memory at once.
00716 
00717     When you call the index function, it will scan through the file, noting
00718     the location of each record. When you access a particular record via the
00719     dictionary methods, the code will jump to the appropriate part of the
00720     file and then parse that section into a SeqRecord.
00721 
00722     Note that not all the input formats supported by Bio.SeqIO can be used
00723     with this index function. It is designed to work only with sequential
00724     file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
00725     interlaced file format (e.g. alignment formats such as "clustal").
00726 
00727     For small files, it may be more efficient to use an in memory Python
00728     dictionary, e.g.
00729 
00730     >>> from Bio import SeqIO
00731     >>> records = SeqIO.to_dict(SeqIO.parse(open("Quality/example.fastq"), "fastq"))
00732     >>> len(records)
00733     3
00734     >>> sorted(records)
00735     ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
00736     >>> print records["EAS54_6_R1_2_1_540_792"].format("fasta")
00737     >EAS54_6_R1_2_1_540_792
00738     TTGGCAGGCCAAGGCCGATGGATCA
00739     <BLANKLINE>
00740 
00741     As with the to_dict() function, by default the id string of each record
00742     is used as the key. You can specify a callback function to transform
00743     this (the record identifier string) into your prefered key. For example:
00744 
00745     >>> from Bio import SeqIO
00746     >>> def make_tuple(identifier):
00747     ...     parts = identifier.split("_")
00748     ...     return int(parts[-2]), int(parts[-1])
00749     >>> records = SeqIO.index("Quality/example.fastq", "fastq",
00750     ...                       key_function=make_tuple)
00751     >>> len(records)
00752     3
00753     >>> sorted(records)
00754     [(413, 324), (443, 348), (540, 792)]
00755     >>> print records[(540, 792)].format("fasta")
00756     >EAS54_6_R1_2_1_540_792
00757     TTGGCAGGCCAAGGCCGATGGATCA
00758     <BLANKLINE>
00759     >>> (540, 792) in records
00760     True
00761     >>> "EAS54_6_R1_2_1_540_792" in records
00762     False
00763     >>> print records.get("Missing", None)
00764     None
00765 
00766     Another common use case would be indexing an NCBI style FASTA file,
00767     where you might want to extract the GI number from the FASTA identifer
00768     to use as the dictionary key.
00769 
00770     Notice that unlike the to_dict() function, here the key_function does
00771     not get given the full SeqRecord to use to generate the key. Doing so
00772     would impose a severe performance penalty as it would require the file
00773     to be completely parsed while building the index. Right now this is
00774     usually avoided.
00775 
00776     See also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()
00777     """
00778     #Try and give helpful error messages:
00779     if not isinstance(filename, basestring):
00780         raise TypeError("Need a filename (not a handle)")
00781     if not isinstance(format, basestring):
00782         raise TypeError("Need a string for the file format (lower case)")
00783     if not format:
00784         raise ValueError("Format required (lower case string)")
00785     if format != format.lower():
00786         raise ValueError("Format string '%s' should be lower case" % format)
00787     if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
00788                                      isinstance(alphabet, AlphabetEncoder)):
00789         raise ValueError("Invalid alphabet, %s" % repr(alphabet))
00790 
00791     #Map the file format to a sequence iterator:
00792     import _index #Lazy import
00793     return _index._IndexedSeqFileDict(filename, format, alphabet, key_function)

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.index_db (   index_filename,
  filenames = None,
  format = None,
  alphabet = None,
  key_function = None 
)
Index several sequence files and return a dictionary like object.

The index is stored in an SQLite database rather than in memory (as in the
Bio.SeqIO.index(...) function).

 - index_filename - Where to store the SQLite index
 - filenames - list of strings specifying file(s) to be indexed, or when
              indexing a single file this can be given as a string.
              (optional if reloading an existing index, but must match)
 - format   - lower case string describing the file format
              (optional if reloading an existing index, but must match)
 - alphabet - optional Alphabet object, useful when the sequence type
              cannot be automatically inferred from the file itself
              (e.g. format="fasta" or "tab")
 - key_function - Optional callback function which when given a
              SeqRecord identifier string should return a unique
              key for the dictionary.

This indexing function will return a dictionary like object, giving the
SeqRecord objects as values:

>>> from Bio.Alphabet import generic_protein
>>> from Bio import SeqIO
>>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
>>> def get_gi(name):
...     parts = name.split("|")
...     i = parts.index("gi")
...     assert i != -1
...     return parts[i+1]
>>> idx_name = ":memory:" #use an in memory SQLite DB for this test
>>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi)
>>> len(records)
95
>>> records["7525076"].description
'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
>>> records["45478717"].description
'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'

In this example the two files contain 85 and 10 records respectively.

BGZF compressed files are supported, and detected automatically. Ordinary
GZIP compressed files are not supported.

See also: Bio.SeqIO.index() and Bio.SeqIO.to_dict()

Definition at line 795 of file __init__.py.

00795 def index_db(index_filename, filenames=None, format=None, alphabet=None,
00796                key_function=None):
00797     """Index several sequence files and return a dictionary like object.
00798 
00799     The index is stored in an SQLite database rather than in memory (as in the
00800     Bio.SeqIO.index(...) function).
00801 
00802      - index_filename - Where to store the SQLite index
00803      - filenames - list of strings specifying file(s) to be indexed, or when
00804                   indexing a single file this can be given as a string.
00805                   (optional if reloading an existing index, but must match)
00806      - format   - lower case string describing the file format
00807                   (optional if reloading an existing index, but must match)
00808      - alphabet - optional Alphabet object, useful when the sequence type
00809                   cannot be automatically inferred from the file itself
00810                   (e.g. format="fasta" or "tab")
00811      - key_function - Optional callback function which when given a
00812                   SeqRecord identifier string should return a unique
00813                   key for the dictionary.
00814 
00815     This indexing function will return a dictionary like object, giving the
00816     SeqRecord objects as values:
00817 
00818     >>> from Bio.Alphabet import generic_protein
00819     >>> from Bio import SeqIO
00820     >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
00821     >>> def get_gi(name):
00822     ...     parts = name.split("|")
00823     ...     i = parts.index("gi")
00824     ...     assert i != -1
00825     ...     return parts[i+1]
00826     >>> idx_name = ":memory:" #use an in memory SQLite DB for this test
00827     >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi)
00828     >>> len(records)
00829     95
00830     >>> records["7525076"].description
00831     'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
00832     >>> records["45478717"].description
00833     'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'
00834 
00835     In this example the two files contain 85 and 10 records respectively.
00836 
00837     BGZF compressed files are supported, and detected automatically. Ordinary
00838     GZIP compressed files are not supported.
00839 
00840     See also: Bio.SeqIO.index() and Bio.SeqIO.to_dict()
00841     """
00842     #Try and give helpful error messages:
00843     if not isinstance(index_filename, basestring):
00844         raise TypeError("Need a string for the index filename")
00845     if isinstance(filenames, basestring):
00846         #Make the API a little more friendly, and more similar
00847         #to Bio.SeqIO.index(...) for indexing just one file.
00848         filenames = [filenames]
00849     if filenames is not None and not isinstance(filenames, list):
00850         raise TypeError("Need a list of filenames (as strings), or one filename")
00851     if format is not None and not isinstance(format, basestring):
00852         raise TypeError("Need a string for the file format (lower case)")
00853     if format and format != format.lower():
00854         raise ValueError("Format string '%s' should be lower case" % format)
00855     if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
00856                                      isinstance(alphabet, AlphabetEncoder)):
00857         raise ValueError("Invalid alphabet, %s" % repr(alphabet))
00858 
00859     #Map the file format to a sequence iterator:
00860     import _index #Lazy import
00861     return _index._SQLiteManySeqFilesDict(index_filename, filenames, format,
00862                                           alphabet, key_function)
00863 

Here is the caller graph for this function:

def Bio.SeqIO.parse (   handle,
  format,
  alphabet = None 
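)
Turns a sequence file into an iterator returning SeqRecords.

 - handle   - handle to the file, or the filename as a string
              (note older versions of Biopython only took a handle).
 - format   - lower case string describing the file format.
 - alphabet - optional Alphabet object, useful when the sequence type
              cannot be automatically inferred from the file itself
              (e.g. format="fasta" or "tab")

Use the Bio.SeqIO.read(...) function when you expect a single record
only.

Definition at line 446 of file __init__.py.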

00446 
00447 def parse(handle, format, alphabet=None):
00448     r"""Turns a sequence file into an iterator returning SeqRecords.
00449 
00450      - handle   - handle to the file, or the filename as a string
00451                   (note older verions of Biopython only took a handle).
00452      - format   - lower case string describing the file format.
00453      - alphabet - optional Alphabet object, useful when the sequence type
00454                   cannot be automatically inferred from the file itself
00455                   (e.g. format="fasta" or "tab")
00456 
00457     Typical usage, opening a file to read in, and looping over the record(s):
00458 
00459     >>> from Bio import SeqIO
00460     >>> filename = "Fasta/sweetpea.nu"
00461     >>> for record in SeqIO.parse(filename, "fasta"):
00462     ...    print "ID", record.id
00463     ...    print "Sequence length", len(record)
00464     ...    print "Sequence alphabet", record.seq.alphabet
00465     ID gi|3176602|gb|U78617.1|LOU78617
00466     Sequence length 309
00467     Sequence alphabet SingleLetterAlphabet()
00468 
00469     For file formats like FASTA where the alphabet cannot be determined, it
00470     may be useful to specify the alphabet explicitly:
00471 
00472     >>> from Bio import SeqIO
00473     >>> from Bio.Alphabet import generic_dna
00474     >>> filename = "Fasta/sweetpea.nu"
00475     >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
00476     ...    print "ID", record.id
00477     ...    print "Sequence length", len(record)
00478     ...    print "Sequence alphabet", record.seq.alphabet
00479     ID gi|3176602|gb|U78617.1|LOU78617
00480     Sequence length 309
00481     Sequence alphabet DNAAlphabet()
00482 
00483     If you have a string 'data' containing the file contents, you must
00484     first turn this into a handle in order to parse it:
00485 
00486     >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
00487     >>> from Bio import SeqIO
00488     >>> from StringIO import StringIO
00489     >>> for record in SeqIO.parse(StringIO(data), "fasta"):
00490     ...     print record.id, record.seq
00491     Alpha ACCGGATGTA
00492     Beta AGGCTCGGTTA
00493 
00494     Use the Bio.SeqIO.read(...) function when you expect a single record
00495     only.
00496     """
00497     #NOTE - The above docstring has some raw \n characters needed
00498     #for the StringIO example, hense the whole docstring is in raw
00499     #string mode (see the leading r before the opening quote).
00500     from Bio import AlignIO
00501 
00502     #Hack for SFF, will need to make this more general in future
00503     if format in _BinaryFormats :
00504         mode = 'rb'
00505     else:
00506         mode = 'rU'
00507 
00508     #Try and give helpful error messages:
00509     if not isinstance(format, basestring):
00510         raise TypeError("Need a string for the file format (lower case)")
00511     if not format:
00512         raise ValueError("Format required (lower case string)")
00513     if format != format.lower():
00514         raise ValueError("Format string '%s' should be lower case" % format)
00515     if alphabet is not None and not (isinstance(alphabet, Alphabet) or \
00516                                      isinstance(alphabet, AlphabetEncoder)):
00517         raise ValueError("Invalid alphabet, %s" % repr(alphabet))
00518 
00519     with as_handle(handle, mode) as fp:
00520         #Map the file format to a sequence iterator:
00521         if format in _FormatToIterator:
00522             iterator_generator = _FormatToIterator[format]
00523             if alphabet is None:
00524                 i = iterator_generator(fp)
00525             else:
00526                 try:
00527                     i = iterator_generator(fp, alphabet=alphabet)
00528                 except TypeError:
00529                     i = _force_alphabet(iterator_generator(fp), alphabet)
00530         elif format in AlignIO._FormatToIterator:
00531             #Use Bio.AlignIO to read in the alignments
00532             i = (r for alignment in AlignIO.parse(fp, format,
00533                                                   alphabet=alphabet)
00534                  for r in alignment)
00535         else:
00536             raise ValueError("Unknown format '%s'" % format)
00537         #This imposes some overhead... wait until we drop Python 2.4 to fix it
00538         for r in i:
00539             yield r

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.read (   handle,
  format,
  alphabet = None 
)
Turns a sequence file into a single SeqRecord.

 - handle   - handle to the file, or the filename as a string
              (note older versions of Biopython only took a handle).
 - format   - string describing the file format.
 - alphabet - optional Alphabet object, useful when the sequence type
              cannot be automatically inferred from the file itself
              (e.g. format="fasta" or "tab")

This function is for use parsing sequence files containing
exactly one record.  For example, reading a GenBank file:

>>> from Bio import SeqIO
>>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
>>> print "ID", record.id
ID AC007323.5
>>> print "Sequence length", len(record)
Sequence length 86436
>>> print "Sequence alphabet", record.seq.alphabet
Sequence alphabet IUPACAmbiguousDNA()

If the handle contains no records, or more than one record,
an exception is raised.  For example:

>>> from Bio import SeqIO
>>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
Traceback (most recent call last):
    ...
ValueError: More than one record found in handle

If however you want the first record from a file containing
multiple records this function would raise an exception (as
shown in the example above).  Instead use:

>>> from Bio import SeqIO
>>> record = SeqIO.parse("GenBank/cor6_6.gb", "genbank").next()
>>> print "First record's ID", record.id
First record's ID X55053.1

Use the Bio.SeqIO.parse(handle, format) function if you want
to read multiple records from the handle.

Definition at line 554 of file __init__.py.

00554 
00555 def read(handle, format, alphabet=None):
00556     """Turns a sequence file into a single SeqRecord.
00557 
00558      - handle   - handle to the file, or the filename as a string
00559                   (note older verions of Biopython only took a handle).
00560      - format   - string describing the file format.
00561      - alphabet - optional Alphabet object, useful when the sequence type
00562                   cannot be automatically inferred from the file itself
00563                   (e.g. format="fasta" or "tab")
00564 
00565     This function is for use parsing sequence files containing
00566     exactly one record.  For example, reading a GenBank file:
00567 
00568     >>> from Bio import SeqIO
00569     >>> record = SeqIO.read("GenBank/arab1.gb", "genbank")
00570     >>> print "ID", record.id
00571     ID AC007323.5
00572     >>> print "Sequence length", len(record)
00573     Sequence length 86436
00574     >>> print "Sequence alphabet", record.seq.alphabet
00575     Sequence alphabet IUPACAmbiguousDNA()
00576 
00577     If the handle contains no records, or more than one record,
00578     an exception is raised.  For example:
00579 
00580     >>> from Bio import SeqIO
00581     >>> record = SeqIO.read("GenBank/cor6_6.gb", "genbank")
00582     Traceback (most recent call last):
00583         ...
00584     ValueError: More than one record found in handle
00585 
00586     If however you want the first record from a file containing
00587     multiple records this function would raise an exception (as
00588     shown in the example above).  Instead use:
00589 
00590     >>> from Bio import SeqIO
00591     >>> record = SeqIO.parse("GenBank/cor6_6.gb", "genbank").next()
00592     >>> print "First record's ID", record.id
00593     First record's ID X55053.1
00594 
00595     Use the Bio.SeqIO.parse(handle, format) function if you want
00596     to read multiple records from the handle.
00597     """
00598     iterator = parse(handle, format, alphabet)
00599     try:
00600         first = iterator.next()
00601     except StopIteration:
00602         first = None
00603     if first is None:
00604         raise ValueError("No records found in handle")
00605     try:
00606         second = iterator.next()
00607     except StopIteration:
00608         second = None
00609     if second is not None:
00610         raise ValueError("More than one record found in handle")
00611     return first

Here is the call graph for this function:

def Bio.SeqIO.to_dict (   sequences,
  key_function = None 
)
Turns a sequence iterator or list into a dictionary.

 - sequences  - An iterator that returns SeqRecord objects,
                or simply a list of SeqRecord objects.
 - key_function - Optional callback function which when given a
                SeqRecord should return a unique key for the dictionary.

e.g. key_function = lambda rec : rec.name
or,  key_function = lambda rec : rec.description.split()[0]

If key_function is omitted then record.id is used, on the assumption
that the record objects returned are SeqRecords with a unique id.

If there are duplicate keys, an error is raised.

Example usage, defaulting to using the record.id as key:

>>> from Bio import SeqIO
>>> filename = "GenBank/cor6_6.gb"
>>> format = "genbank"
>>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
>>> print sorted(id_dict)
['AF297471.1', 'AJ237582.1', 'L31939.1', 'M81224.1', 'X55053.1', 'X62281.1']
>>> print id_dict["L31939.1"].description
Brassica rapa (clone bif72) kin mRNA, complete cds.

A more complex example, using the key_function argument in order to
use a sequence checksum as the dictionary key:

>>> from Bio import SeqIO
>>> from Bio.SeqUtils.CheckSum import seguid
>>> filename = "GenBank/cor6_6.gb"
>>> format = "genbank"
>>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
...               key_function = lambda rec : seguid(rec.seq))
>>> for key, record in sorted(seguid_dict.iteritems()):
...     print key, record.id
/wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1

This approach is not suitable for very large sets of sequences, as all
the SeqRecord objects are held in memory. Instead, consider using the
Bio.SeqIO.index() function (if it supports your particular file format).

Definition at line 612 of file __init__.py.

00612 
def to_dict(sequences, key_function=None):
    """Turns a sequence iterator or list into a dictionary.

     - sequences  - An iterator that returns SeqRecord objects,
                    or simply a list of SeqRecord objects.
     - key_function - Optional callback function which when given a
                    SeqRecord should return a unique key for the dictionary.

    e.g. key_function = lambda rec : rec.name
    or,  key_function = lambda rec : rec.description.split()[0]

    If key_function is omitted then record.id is used, on the assumption
    that the record objects returned are SeqRecords with a unique id.

    Returns a dict mapping each key to its record.

    If there are duplicate keys, a ValueError is raised. Keys must be
    hashable, since they are used as dictionary keys.

    Example usage, defaulting to using the record.id as key:

    >>> from Bio import SeqIO
    >>> filename = "GenBank/cor6_6.gb"
    >>> format = "genbank"
    >>> id_dict = SeqIO.to_dict(SeqIO.parse(filename, format))
    >>> print sorted(id_dict)
    ['AF297471.1', 'AJ237582.1', 'L31939.1', 'M81224.1', 'X55053.1', 'X62281.1']
    >>> print id_dict["L31939.1"].description
    Brassica rapa (clone bif72) kin mRNA, complete cds.

    A more complex example, using the key_function argument in order to
    use a sequence checksum as the dictionary key:

    >>> from Bio import SeqIO
    >>> from Bio.SeqUtils.CheckSum import seguid
    >>> filename = "GenBank/cor6_6.gb"
    >>> format = "genbank"
    >>> seguid_dict = SeqIO.to_dict(SeqIO.parse(filename, format),
    ...               key_function = lambda rec : seguid(rec.seq))
    >>> for key, record in sorted(seguid_dict.iteritems()):
    ...     print key, record.id
    /wQvmrl87QWcm9llO4/efg23Vgg AJ237582.1
    BUg6YxXSKWEcFFH0L08JzaLGhQs L31939.1
    SabZaA4V2eLE9/2Fm5FnyYy07J4 X55053.1
    TtWsXo45S3ZclIBy4X/WJc39+CY M81224.1
    l7gjJFE6W/S1jJn5+1ASrUKW/FA X62281.1
    uVEYeAQSV5EDQOnFoeMmVea+Oow AF297471.1

    This approach is not suitable for very large sets of sequences, as all
    the SeqRecord objects are held in memory. Instead, consider using the
    Bio.SeqIO.index() function (if it supports your particular file format).
    """
    records_by_key = {}
    for record in sequences:
        if key_function is None:
            key = record.id
        else:
            key = key_function(record)
        if key in records_by_key:
            #Wrap the key in a one element tuple so that a tuple valued key
            #is shown as-is, rather than being unpacked as the format
            #arguments (which raised TypeError instead of ValueError).
            raise ValueError("Duplicate key '%s'" % (key,))
        records_by_key[key] = record
    return records_by_key

def Bio.SeqIO.write (   sequences,
  handle,
  format 
)
Write complete set of sequences to a file.

 - sequences - A list (or iterator) of SeqRecord objects, or (if using
               Biopython 1.54 or later) a single SeqRecord.
 - handle    - File handle object to write to, or filename as string
               (note older versions of Biopython only took a handle).
 - format    - lower case string describing the file format to write.

You should close the handle after calling this function.

Returns the number of records written (as an integer).

Definition at line 387 of file __init__.py.

00387 
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, read-only, or unknown.
    """
    from Bio import AlignIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #Reject formats we cannot write *before* opening the output. When given
    #a filename, as_handle presumably opens it in write mode, which would
    #create or truncate the file even though nothing gets written.
    use_seqio_writer = format in _FormatToWriter
    if not use_seqio_writer and format not in AlignIO._FormatToWriter:
        if format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError("Reading format '%s' is supported, but not writing"
                             % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(sequences, SeqRecord):
        #This raised an exception in older versions of Biopython
        sequences = [sequences]

    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        if use_seqio_writer:
            #Map the file format to a writer class
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        else:
            #Try and turn all the records into a single alignment,
            #and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                    "Internal error - the underlying writer " \
                    "should have returned 1, not %s" % repr(alignment_count)
            count = len(alignment)

        assert isinstance(count, int), "Internal error - the underlying %s " \
               "writer should have returned the record count, not %s" \
               % (format, repr(count))

    return count

Here is the call graph for this function:


Variable Documentation

string Bio.SeqIO.__docformat__ = "epytext en"

Definition at line 265 of file __init__.py.

list Bio.SeqIO._BinaryFormats = ["sff", "sff-trim", "abi", "abi-trim"]

Definition at line 384 of file __init__.py.

dictionary Bio.SeqIO._FormatToIterator

Initial value:
00001 {"fasta" : FastaIO.FastaIterator,
00002                      "gb" : InsdcIO.GenBankIterator,
00003                      "genbank" : InsdcIO.GenBankIterator,
00004                      "genbank-cds" : InsdcIO.GenBankCdsFeatureIterator,
00005                      "embl" : InsdcIO.EmblIterator,
00006                      "embl-cds" : InsdcIO.EmblCdsFeatureIterator,
00007                      "imgt" : InsdcIO.ImgtIterator,
00008                      "ig" : IgIO.IgIterator,
00009                      "swiss" : SwissIO.SwissIterator,
00010                      "phd" : PhdIO.PhdIterator,
00011                      "ace" : AceIO.AceIterator,
00012                      "tab" : TabIO.TabIterator,
00013                      "pir" : PirIO.PirIterator,
00014                      "fastq" : QualityIO.FastqPhredIterator,
00015                      "fastq-sanger" : QualityIO.FastqPhredIterator,
00016                      "fastq-solexa" : QualityIO.FastqSolexaIterator,
00017                      "fastq-illumina" : QualityIO.FastqIlluminaIterator,
00018                      "qual" : QualityIO.QualPhredIterator,
00019                      "sff": SffIO.SffIterator,
00020                      #Not sure about this in the long run:
00021                      "sff-trim": SffIO._SffTrimIterator,
00022                      "uniprot-xml": UniprotIO.UniprotIterator,
00023                      "seqxml" : SeqXmlIO.SeqXmlIterator,
00024                      "abi": AbiIO.AbiIterator,
00025                      "abi-trim": AbiIO._AbiTrimIterator,
00026                      }

Definition at line 341 of file __init__.py.

dictionary Bio.SeqIO._FormatToWriter

Initial value:
00001 {"fasta" : FastaIO.FastaWriter,
00002                    "gb" : InsdcIO.GenBankWriter,
00003                    "genbank" : InsdcIO.GenBankWriter,
00004                    "embl" : InsdcIO.EmblWriter,
00005                    "imgt" : InsdcIO.ImgtWriter,
00006                    "tab" : TabIO.TabWriter,
00007                    "fastq" : QualityIO.FastqPhredWriter,
00008                    "fastq-sanger" : QualityIO.FastqPhredWriter,
00009                    "fastq-solexa" : QualityIO.FastqSolexaWriter,
00010                    "fastq-illumina" : QualityIO.FastqIlluminaWriter,
00011                    "phd" : PhdIO.PhdWriter,
00012                    "qual" : QualityIO.QualPhredWriter,
00013                    "sff" : SffIO.SffWriter,
00014                    "seqxml" : SeqXmlIO.SeqXmlWriter,
00015                    }

Definition at line 368 of file __init__.py.