Back to index

python-biopython  1.60
Functions
Bio.SeqIO.AceIO Namespace Reference

Functions

def AceIterator
def _test

Function Documentation

def Bio.SeqIO.AceIO._test ( ) [private]
Run the Bio.SeqIO module's doctests.

This will try and locate the unit tests directory, and run the doctests
from there in order that the relative paths used in the examples work.

Definition at line 112 of file AceIO.py.

00112 
00113 def _test():
00114     """Run the Bio.SeqIO module's doctests.
00115 
00116     This will try and locate the unit tests directory, and run the doctests
00117     from there in order that the relative paths used in the examples work.
00118     """
00119     import doctest
00120     import os
00121     if os.path.isdir(os.path.join("..", "..", "Tests", "Ace")):
00122         print "Runing doctests..."
00123         cur_dir = os.path.abspath(os.curdir)
00124         os.chdir(os.path.join("..", "..", "Tests"))
00125         assert os.path.isfile("Ace/consed_sample.ace")
00126         doctest.testmod()
00127         os.chdir(cur_dir)
00128         del cur_dir
00129         print "Done"
00130     elif os.path.isdir(os.path.join("Tests", "Ace")):
00131         print "Runing doctests..."
00132         cur_dir = os.path.abspath(os.curdir)
00133         os.chdir(os.path.join("Tests"))
00134         doctest.testmod()
00135         os.chdir(cur_dir)
00136         del cur_dir
00137         print "Done"
        
def Bio.SeqIO.AceIO.AceIterator (   handle)
Returns SeqRecord objects from an ACE file.

This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
by iterating over the file in a single pass, we are forced to ignore any
WA, CT, RT or WR footer tags.

Ace files include the base quality for each position, which are taken
to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
letter_annotations dictionary under the "phred_quality" key.

>>> from Bio import SeqIO
>>> handle = open("Ace/consed_sample.ace", "rU")
>>> for record in SeqIO.parse(handle, "ace"):
...     print record.id, record.seq[:10]+"...", len(record)
...     print max(record.letter_annotations["phred_quality"])
Contig1 agccccgggc... 1475
90

However, ACE files do not include a base quality for any gaps in the
consensus sequence, and these are represented in Biopython with a quality
of zero. Using zero is perhaps misleading as there may be very strong
evidence to support the gap in the consensus. Previous versions of
Biopython therefore used None instead, but this complicated usage, and
prevented output of the gapped sequence as FASTQ format.

>>> from Bio import SeqIO
>>> handle = open("Ace/contig1.ace", "rU")
>>> for record in SeqIO.parse(handle, "ace"):
...     print record.id, "..." + record.seq[85:95]+"..."
...     print record.letter_annotations["phred_quality"][85:95]
...     print max(record.letter_annotations["phred_quality"])
Contig1 ...AGAGG-ATGC...
[57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
90
Contig2 ...GAATTACTAT...
[68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
90

Definition at line 20 of file AceIO.py.

00020 
00021 def AceIterator(handle):
00022     """Returns SeqRecord objects from an ACE file.
00023 
00024     This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
00025     by iterating over the file in a single pass, we are forced to ignore any
00026     WA, CT, RT or WR footer tags.
00027 
00028     Ace files include the base quality for each position, which are taken
00029     to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
00030     using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
00031     letter_annotations dictionary under the "phred_quality" key.
00032 
00033     >>> from Bio import SeqIO
00034     >>> handle = open("Ace/consed_sample.ace", "rU")
00035     >>> for record in SeqIO.parse(handle, "ace"):
00036     ...     print record.id, record.seq[:10]+"...", len(record)
00037     ...     print max(record.letter_annotations["phred_quality"])
00038     Contig1 agccccgggc... 1475
00039     90
00040 
00041     However, ACE files do not include a base quality for any gaps in the
00042     consensus sequence, and these are represented in Biopython with a quality
00043     of zero. Using zero is perhaps misleading as there may be very strong
00044     evidence to support the gap in the consensus. Previous versions of
00045     Biopython therefore used None instead, but this complicated usage, and
00046     prevented output of the gapped sequence as FASTQ format.
00047 
00048     >>> from Bio import SeqIO
00049     >>> handle = open("Ace/contig1.ace", "rU")
00050     >>> for record in SeqIO.parse(handle, "ace"):
00051     ...     print record.id, "..." + record.seq[85:95]+"..."
00052     ...     print record.letter_annotations["phred_quality"][85:95]
00053     ...     print max(record.letter_annotations["phred_quality"])
00054     Contig1 ...AGAGG-ATGC...
00055     [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
00056     90
00057     Contig2 ...GAATTACTAT...
00058     [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
00059     90
00060 
00061     """
00062     for ace_contig in Ace.parse(handle):
00063         #Convert the ACE contig record into a SeqRecord...
00064         consensus_seq_str = ace_contig.sequence
00065         #Assume its DNA unless there is a U in it,
00066         if "U" in consensus_seq_str:
00067             if "T" in consensus_seq_str:
00068                 #Very odd! Error?
00069                 alpha = generic_nucleotide
00070             else:
00071                 alpha = generic_rna
00072         else:
00073             alpha = generic_dna
00074             
00075         if "*" in consensus_seq_str:
00076             #For consistency with most other file formats, map
00077             #any * gaps into - gaps.
00078             assert "-" not in consensus_seq_str
00079             consensus_seq = Seq(consensus_seq_str.replace("*","-"),
00080                                 Gapped(alpha, gap_char="-"))
00081         else:
00082             consensus_seq = Seq(consensus_seq_str, alpha)
00083 
00084         #TODO? - Base segments (BS lines) which indicates which read
00085         #phrap has chosen to be the consensus at a particular position.
00086         #Perhaps as SeqFeature objects?
00087 
00088         #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
00089         #Perhaps as SeqFeature objects?
00090             
00091         seq_record = SeqRecord(consensus_seq,
00092                                id = ace_contig.name,
00093                                name = ace_contig.name)
00094 
00095         #Consensus base quality (BQ lines).  Note that any gaps (originally
00096         #as * characters) in the consensus do not get a quality entry, so
00097         #we assign a quality of None (zero would be missleading as there may
00098         #be excelent support for having a gap here).
00099         quals = []
00100         i = 0
00101         for base in consensus_seq:
00102             if base == "-":
00103                 quals.append(0)
00104             else:
00105                 quals.append(ace_contig.quality[i])
00106                 i += 1
00107         assert i == len(ace_contig.quality)
00108         seq_record.letter_annotations["phred_quality"] = quals
00109 
00110         yield seq_record 
00111     #All done