Back to index

python-biopython  1.60
AceIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2010 by Peter Cock.  All rights reserved.
00002 #
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 
00007 """Bio.SeqIO support for the "ace" file format.
00008 
00009 You are expected to use this module via the Bio.SeqIO functions.
00010 See also the Bio.Sequencing.Ace module which offers more than just accessing
00011 the contig consensus sequences in an ACE file as SeqRecord objects.
00012 """
00013 
00014 from Bio.Seq import Seq
00015 from Bio.SeqRecord import SeqRecord
00016 from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped
00017 from Bio.Sequencing import Ace
00018 
00019 #This is a generator function!
00020 def AceIterator(handle):
00021     """Returns SeqRecord objects from an ACE file.
00022 
00023     This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
00024     by iterating over the file in a single pass, we are forced to ignore any
00025     WA, CT, RT or WR footer tags.
00026 
00027     Ace files include the base quality for each position, which are taken
00028     to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
00029     using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
00030     letter_annotations dictionary under the "phred_quality" key.
00031 
00032     >>> from Bio import SeqIO
00033     >>> handle = open("Ace/consed_sample.ace", "rU")
00034     >>> for record in SeqIO.parse(handle, "ace"):
00035     ...     print record.id, record.seq[:10]+"...", len(record)
00036     ...     print max(record.letter_annotations["phred_quality"])
00037     Contig1 agccccgggc... 1475
00038     90
00039 
00040     However, ACE files do not include a base quality for any gaps in the
00041     consensus sequence, and these are represented in Biopython with a quality
00042     of zero. Using zero is perhaps misleading as there may be very strong
00043     evidence to support the gap in the consensus. Previous versions of
00044     Biopython therefore used None instead, but this complicated usage, and
00045     prevented output of the gapped sequence as FASTQ format.
00046 
00047     >>> from Bio import SeqIO
00048     >>> handle = open("Ace/contig1.ace", "rU")
00049     >>> for record in SeqIO.parse(handle, "ace"):
00050     ...     print record.id, "..." + record.seq[85:95]+"..."
00051     ...     print record.letter_annotations["phred_quality"][85:95]
00052     ...     print max(record.letter_annotations["phred_quality"])
00053     Contig1 ...AGAGG-ATGC...
00054     [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
00055     90
00056     Contig2 ...GAATTACTAT...
00057     [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
00058     90
00059 
00060     """
00061     for ace_contig in Ace.parse(handle):
00062         #Convert the ACE contig record into a SeqRecord...
00063         consensus_seq_str = ace_contig.sequence
00064         #Assume its DNA unless there is a U in it,
00065         if "U" in consensus_seq_str:
00066             if "T" in consensus_seq_str:
00067                 #Very odd! Error?
00068                 alpha = generic_nucleotide
00069             else:
00070                 alpha = generic_rna
00071         else:
00072             alpha = generic_dna
00073             
00074         if "*" in consensus_seq_str:
00075             #For consistency with most other file formats, map
00076             #any * gaps into - gaps.
00077             assert "-" not in consensus_seq_str
00078             consensus_seq = Seq(consensus_seq_str.replace("*","-"),
00079                                 Gapped(alpha, gap_char="-"))
00080         else:
00081             consensus_seq = Seq(consensus_seq_str, alpha)
00082 
00083         #TODO? - Base segments (BS lines) which indicates which read
00084         #phrap has chosen to be the consensus at a particular position.
00085         #Perhaps as SeqFeature objects?
00086 
00087         #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
00088         #Perhaps as SeqFeature objects?
00089             
00090         seq_record = SeqRecord(consensus_seq,
00091                                id = ace_contig.name,
00092                                name = ace_contig.name)
00093 
00094         #Consensus base quality (BQ lines).  Note that any gaps (originally
00095         #as * characters) in the consensus do not get a quality entry, so
00096         #we assign a quality of None (zero would be missleading as there may
00097         #be excelent support for having a gap here).
00098         quals = []
00099         i = 0
00100         for base in consensus_seq:
00101             if base == "-":
00102                 quals.append(0)
00103             else:
00104                 quals.append(ace_contig.quality[i])
00105                 i += 1
00106         assert i == len(ace_contig.quality)
00107         seq_record.letter_annotations["phred_quality"] = quals
00108 
00109         yield seq_record 
00110     #All done
00111 
00112 def _test():
00113     """Run the Bio.SeqIO module's doctests.
00114 
00115     This will try and locate the unit tests directory, and run the doctests
00116     from there in order that the relative paths used in the examples work.
00117     """
00118     import doctest
00119     import os
00120     if os.path.isdir(os.path.join("..", "..", "Tests", "Ace")):
00121         print "Runing doctests..."
00122         cur_dir = os.path.abspath(os.curdir)
00123         os.chdir(os.path.join("..", "..", "Tests"))
00124         assert os.path.isfile("Ace/consed_sample.ace")
00125         doctest.testmod()
00126         os.chdir(cur_dir)
00127         del cur_dir
00128         print "Done"
00129     elif os.path.isdir(os.path.join("Tests", "Ace")):
00130         print "Runing doctests..."
00131         cur_dir = os.path.abspath(os.curdir)
00132         os.chdir(os.path.join("Tests"))
00133         doctest.testmod()
00134         os.chdir(cur_dir)
00135         del cur_dir
00136         print "Done"
00137         
00138 if __name__ == "__main__":
00139     _test()