Back to index

python-biopython  1.60
Phd.py
Go to the documentation of this file.
00001 # Copyright 2004 by Cymon J. Cox and Frank Kauff.  All rights reserved.
00002 # Copyright 2008 by Michiel de Hoon.  All rights reserved.
00003 # Revisions copyright 2009 by Cymon J. Cox.  All rights reserved.
00004 # Revisions copyright 2009 by Peter Cock.  All rights reserved.
00005 # 
00006 # This code is part of the Biopython distribution and governed by its
00007 # license.  Please see the LICENSE file that should have been included
00008 # as part of this package.
00009 """
00010 Parser for PHD files output by PHRED and used by PHRAP and CONSED.
00011 
00012 This module can be used used directly which will return Record objects
00013 which should contain all the original data in the file.
00014 
00015 Alternatively, using Bio.SeqIO with the "phd" format will call this module
00016 internally.  This will give SeqRecord objects for each contig sequence.
00017 """
00018 
00019 from Bio import Seq
00020 from Bio.Alphabet import generic_dna
00021 
00022 CKEYWORDS=['CHROMAT_FILE','ABI_THUMBPRINT','PHRED_VERSION','CALL_METHOD',\
00023         'QUALITY_LEVELS','TIME','TRACE_ARRAY_MIN_INDEX','TRACE_ARRAY_MAX_INDEX',\
00024         'TRIM','TRACE_PEAK_AREA_RATIO','CHEM','DYE']
00025 
00026 class Record(object):
00027     """Hold information from a PHD file."""
00028     def __init__(self):
00029         self.file_name = ''
00030         self.comments = {}
00031         for kw in CKEYWORDS:
00032             self.comments[kw.lower()] = None
00033         self.sites = []
00034         self.seq = ''
00035         self.seq_trimmed = ''
00036 
00037 
00038 def read(handle):
00039     """Reads the next PHD record from the file, returning it as a Record object.
00040 
00041     This function reads PHD file data line by line from the handle,
00042     and returns a single Record object.
00043     """
00044     for line in handle:
00045         if line.startswith("BEGIN_SEQUENCE"):
00046             record = Record()
00047             record.file_name = line[15:].rstrip() 
00048             break
00049     else:
00050         return # No record found
00051 
00052     for line in handle:
00053         if line.startswith("BEGIN_COMMENT"):
00054             break
00055     else:
00056         raise ValueError("Failed to find BEGIN_COMMENT line")
00057        
00058     for line in handle:
00059         line = line.strip()
00060         if not line:
00061             continue
00062         if line=="END_COMMENT":
00063             break
00064         keyword, value = line.split(":", 1)
00065         keyword = keyword.lower()
00066         value = value.strip()
00067         if keyword in ('chromat_file',
00068                        'phred_version',
00069                        'call_method',
00070                        'chem',
00071                        'dye',
00072                        'time',
00073                        'basecaller_version',
00074                        'trace_processor_version'):
00075             record.comments[keyword] = value
00076         elif keyword in ('abi_thumbprint',
00077                          'quality_levels',
00078                          'trace_array_min_index',
00079                          'trace_array_max_index'):
00080             record.comments[keyword] = int(value)
00081         elif keyword=='trace_peak_area_ratio':
00082             record.comments[keyword] = float(value)
00083         elif keyword=='trim':
00084             first, last, prob = value.split()
00085             record.comments[keyword] = (int(first), int(last), float(prob))
00086     else:
00087         raise ValueError("Failed to find END_COMMENT line")
00088 
00089     for line in handle:
00090         if line.startswith('BEGIN_DNA'):
00091             break
00092     else:
00093         raise ValueError("Failed to find BEGIN_DNA line")
00094 
00095     for line in handle:
00096         if line.startswith('END_DNA'):
00097             break
00098         else:
00099             # Line is: "site quality peak_location"
00100             # Peak location is optional according to
00101             # David Gordon (the Consed author)
00102             parts = line.split()
00103             if len(parts) in [2,3]:
00104                 record.sites.append(tuple(parts))
00105             else:
00106                 raise ValueError("DNA line must contain a base and quality "
00107                                  "score, and optionally a peak location.")
00108 
00109     for line in handle:
00110         if line.startswith("END_SEQUENCE"):
00111             break
00112     else:
00113         raise ValueError("Failed to find END_SEQUENCE line")
00114 
00115     record.seq = Seq.Seq(''.join([n[0] for n in record.sites]), generic_dna)
00116     if record.comments['trim'] is not None:
00117         first, last = record.comments['trim'][:2]
00118         record.seq_trimmed = record.seq[first:last]
00119 
00120     return record
00121 
00122 def parse(handle):
00123     """Iterates over a file returning multiple PHD records.
00124 
00125     The data is read line by line from the handle. The handle can be a list
00126     of lines, an open file, or similar; the only requirement is that we can
00127     iterate over the handle to retrieve lines from it.
00128 
00129     Typical usage:
00130 
00131     records = parse(handle)
00132     for record in records:
00133         # do something with the record object
00134     """
00135     while True:
00136         record = read(handle)
00137         if not record:
00138             return
00139         yield record