Back to index

python-biopython  1.60
PhdIO.py
Go to the documentation of this file.
00001 # Copyright 2008-2010 by Peter Cock.  All rights reserved.
00002 # Revisions copyright 2009 by Cymon J. Cox.  All rights reserved.
00003 #
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """Bio.SeqIO support for the "phd" file format.
00009 
00010 PHD files are output by PHRED and used by PHRAP and CONSED.
00011 
00012 You are expected to use this module via the Bio.SeqIO functions, under the
00013 format name "phd". See also the underlying Bio.Sequencing.Phd module.
00014 
00015 For example, using Bio.SeqIO we can read in one of the example PHRED files
00016 from the Biopython unit tests:
00017 
00018     >>> from Bio import SeqIO
00019     >>> for record in SeqIO.parse(open("Phd/phd1"), "phd"):
00020     ...     print record.id
00021     ...     print record.seq[:10], "..."
00022     ...     print record.letter_annotations["phred_quality"][:10], "..."
00023     34_222_(80-A03-19).b.ab1
00024     ctccgtcgga ...
00025     [9, 9, 10, 19, 22, 37, 28, 28, 24, 22] ...
00026     425_103_(81-A03-19).g.ab1
00027     cgggatccca ...
00028     [14, 17, 22, 10, 10, 10, 15, 8, 8, 9] ...
00029     425_7_(71-A03-19).b.ab1
00030     acataaatca ...
00031     [10, 10, 10, 10, 8, 8, 6, 6, 6, 6] ...
00032 
00033 Since PHRED files contain quality scores, you can save them as FASTQ or as
00034 QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
00035 method of the SeqRecord object:
00036 
00037     >>> print record[:50].format("fastq")
00038     @425_7_(71-A03-19).b.ab1
00039     acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
00040     +
00041     ++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
00042     <BLANKLINE>
00043 
00044 Or,
00045 
00046     >>> print record[:50].format("qual")
00047     >425_7_(71-A03-19).b.ab1
00048     10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
00049     10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
00050     <BLANKLINE>
00051 
00052 Note these examples only show the first 50 bases to keep the output short.
00053 """
00054 
00055 from Bio.SeqRecord import SeqRecord
00056 from Bio.Sequencing import Phd
00057 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
00058 from Bio.SeqIO import QualityIO
00059     
00060 #This is a generator function!
00061 def PhdIterator(handle):
00062     """Returns SeqRecord objects from a PHD file.
00063 
00064     This uses the Bio.Sequencing.Phd module to do the hard work.
00065     """
00066     phd_records = Phd.parse(handle)
00067     for phd_record in phd_records:
00068         #Convert the PHY record into a SeqRecord...
00069         #The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
00070         #from unit test example file phd_solexa.
00071         #This will cause problems if used as the record identifier
00072         #(e.g. output for FASTQ format).
00073         name = phd_record.file_name.split(None, 1)[0]
00074         seq_record = SeqRecord(phd_record.seq,
00075                                id = name, name = name,
00076                                description= phd_record.file_name)
00077         #Just re-use the comments dictionary as the SeqRecord's annotations
00078         seq_record.annotations = phd_record.comments
00079         #And store the qualities and peak locations as per-letter-annotation
00080         seq_record.letter_annotations["phred_quality"] = \
00081                 [int(site[1]) for site in phd_record.sites]
00082         try:
00083             seq_record.letter_annotations["peak_location"] = \
00084                     [int(site[2]) for site in phd_record.sites]
00085         except IndexError:
00086             # peak locations are not always there according to
00087             # David Gordon (the Consed author)
00088             pass
00089         yield seq_record 
00090     #All done
00091 
00092 class PhdWriter(SequentialSequenceWriter):
00093     """Class to write Phd format files"""
00094 
00095     def __init__(self, handle):
00096         SequentialSequenceWriter.__init__(self, handle)
00097 
00098     def write_record(self, record):
00099         """Write a single Phd record to the file."""
00100         assert record.seq, "No sequence present in SeqRecord"
00101         # This method returns the 'phred_quality' scores or converted
00102         # 'solexa_quality' scores if present, else raises a value error
00103         phred_qualities = QualityIO._get_phred_quality(record)
00104         peak_locations = record.letter_annotations.get("peak_location", None)
00105         assert len(record.seq) == len(phred_qualities), "Number of " + \
00106                 "phd quality scores does not match length of sequence"
00107         if peak_locations:
00108             assert len(record.seq) == len(peak_locations), "Number " + \
00109                     "of peak location scores does not match length of sequence"
00110         if None in phred_qualities:
00111             raise ValueError("A quality value of None was found")
00112         if record.description.startswith("%s " % record.id):
00113             title = record.description
00114         else:
00115             title = "%s %s" % (record.id, record.description)
00116         self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n" \
00117                           % self.clean(title))
00118         for annot in [k.lower() for k in Phd.CKEYWORDS]:
00119             value = None
00120             if annot == "trim":
00121                 if record.annotations.get("trim", None):
00122                     value = "%s %s %.4f" % record.annotations["trim"]
00123             elif annot == "trace_peak_area_ratio":
00124                 if record.annotations.get("trace_peak_area_ratio", None):
00125                     value = "%.4f" % record.annotations["trace_peak_area_ratio"]
00126             else:
00127                 value = record.annotations.get(annot, None)
00128             if value or value == 0:
00129                 self.handle.write("%s: %s\n" % (annot.upper(), value))
00130 
00131         self.handle.write("END_COMMENT\nBEGIN_DNA\n")
00132         for i, site in enumerate(record.seq):
00133             if peak_locations:
00134                 self.handle.write("%s %i %i\n" % (
00135                         site,
00136                         round(phred_qualities[i]),
00137                         peak_locations[i])
00138                         )
00139             else:
00140                 self.handle.write("%s %i\n" % (
00141                         site,
00142                         round(phred_qualities[i]))
00143                         )
00144 
00145         self.handle.write("END_DNA\nEND_SEQUENCE\n")
00146 
00147 def _test():
00148     """Run the Bio.SeqIO.PhdIO module's doctests.
00149 
00150     This will try and locate the unit tests directory, and run the doctests
00151     from there in order that the relative paths used in the examples work.
00152     """
00153     import doctest
00154     import os
00155     if os.path.isdir(os.path.join("..", "..", "Tests")):
00156         print "Runing doctests..."
00157         cur_dir = os.path.abspath(os.curdir)
00158         os.chdir(os.path.join("..", "..", "Tests"))
00159         assert os.path.isfile("Phd/phd1")
00160         doctest.testmod()
00161         os.chdir(cur_dir)
00162         del cur_dir
00163         print "Done"
00164         
00165 if __name__ == "__main__":
00166     _test()
00167 
00168         
00169