Back to index

python-biopython  1.60
SffIO.py
Go to the documentation of this file.
00001 # Copyright 2009-2010 by Peter Cock.  All rights reserved.
00002 # Based on code contributed and copyright 2009 by Jose Blanca (COMAV-UPV).
00003 #
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
00008 
00009 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
00010 Biomedical Research and the Wellcome Trust Sanger Institute. You are expected
00011 to use this module via the Bio.SeqIO functions under the format name "sff" (or
00012 "sff-trim" as described below).
00013 
00014 For example, to iterate over the records in an SFF file,
00015 
00016     >>> from Bio import SeqIO
00017     >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
00018     ...     print record.id, len(record), record.seq[:20]+"..."
00019     E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
00020     E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
00021     E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
00022     E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
00023     E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
00024     E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
00025     E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
00026     E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
00027     E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
00028     E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
00029 
00030 Each SeqRecord object will contain all the annotation from the SFF file,
00031 including the PHRED quality scores.
00032 
00033     >>> print record.id, len(record)
00034     E3MFGYR02F7Z7G 219
00035     >>> print record.seq[:10], "..."
00036     tcagAATCAT ...
00037     >>> print record.letter_annotations["phred_quality"][:10], "..."
00038     [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
00039 
00040 Notice that the sequence is given in mixed case, the central upper case region
00041 corresponds to the trimmed sequence. This matches the output of the Roche
00042 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
00043 
00044     >>> print record.annotations["clip_qual_left"]
00045     4
00046     >>> print record.annotations["clip_qual_right"]
00047     134
00048     >>> print record.seq[:4]
00049     tcag
00050     >>> print record.seq[4:20], "...", record.seq[120:134]
00051     AATCATCCACTTTTTA ... CAAAACACAAACAG
00052     >>> print record.seq[134:]
00053     atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
00054 
00055 The annotations dictionary also contains any adapter clip positions
00056 (usually zero), and information about the flows. e.g.
00057 
00058     >>> len(record.annotations)
00059     11
00060     >>> print record.annotations["flow_key"]
00061     TCAG
00062     >>> print record.annotations["flow_values"][:10], "..."
00063     (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
00064     >>> print len(record.annotations["flow_values"])
00065     400
00066     >>> print record.annotations["flow_index"][:10], "..."
00067     (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
00068     >>> print len(record.annotations["flow_index"])
00069     219
00070 
00071 Note that to convert from a raw reading in flow_values to the corresponding
00072 homopolymer stretch estimate, the value should be rounded to the nearest 100:
00073 
00074     >>> print [int(round(value, -2)) // 100
00075     ...        for value in record.annotations["flow_values"][:10]], '...'
00076     [1, 0, 1, 0, 0, 1, 0, 1, 0, 2] ...
00077 
00078 If a read name is exactly 14 alphanumeric characters, the annotations
00079 dictionary will also contain meta-data about the read extracted by
00080 interpreting the name as a 454 Sequencing System "Universal" Accession
00081 Number. Note that if a read name happens to be exactly 14 alphanumeric
00082 characters but was not generated automatically, these annotation records
00083 will contain nonsense information.
00084 
00085     >>> print record.annotations["region"]
00086     2
00087     >>> print record.annotations["time"]
00088     [2008, 1, 9, 16, 16, 0]
00089     >>> print record.annotations["coords"]
00090     (2434, 1658)
00091 
00092 As a convenience method, you can read the file with SeqIO format name "sff-trim"
00093 instead of "sff" to get just the trimmed sequences (without any annotation
00094 except for the PHRED quality scores and anything encoded in the read names):
00095 
00096     >>> from Bio import SeqIO
00097     >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
00098     ...     print record.id, len(record), record.seq[:20]+"..."
00099     E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
00100     E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
00101     E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
00102     E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
00103     E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
00104     E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
00105     E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
00106     E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
00107     E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
00108     E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
00109 
00110 Looking at the final record in more detail, note how this differs to the
00111 example above:
00112 
00113     >>> print record.id, len(record)
00114     E3MFGYR02F7Z7G 130
00115     >>> print record.seq[:10], "..."
00116     AATCATCCAC ...
00117     >>> print record.letter_annotations["phred_quality"][:10], "..."
00118     [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
00119     >>> len(record.annotations)
00120     3
00121     >>> print record.annotations["region"]
00122     2
00123     >>> print record.annotations["coords"]
00124     (2434, 1658)
00125     >>> print record.annotations["time"]
00126     [2008, 1, 9, 16, 16, 0]
00127 
00128 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
00129 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
00130 
00131     >>> from Bio import SeqIO
00132     >>> from StringIO import StringIO
00133     >>> out_handle = StringIO()
00134     >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
00135     ...                       out_handle, "fastq")
00136     >>> print "Converted %i records" % count
00137     Converted 10 records
00138 
00139 The output FASTQ file would start like this:
00140 
00141     >>> print "%s..." % out_handle.getvalue()[:50]
00142     @E3MFGYR02JWQ7T
00143     tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
00144 
00145 Bio.SeqIO.index() provides memory efficient random access to the reads in an
00146 SFF file by name. SFF files can include an index within the file, which can
00147 be read in making this very fast. If the index is missing (or in a format not
00148 yet supported in Biopython) the file is indexed by scanning all the reads -
00149 which is a little slower. For example,
00150 
00151     >>> from Bio import SeqIO
00152     >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
00153     >>> record = reads["E3MFGYR02JHD4H"]
00154     >>> print record.id, len(record), record.seq[:20]+"..."
00155     E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
00156 
00157 Or, using the trimmed reads:
00158 
00159     >>> from Bio import SeqIO
00160     >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
00161     >>> record = reads["E3MFGYR02JHD4H"]
00162     >>> print record.id, len(record), record.seq[:20]+"..."
00163     E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
00164 
00165 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
00166 that this requires all the flow information etc, and thus is probably only
00167 useful for SeqRecord objects originally from reading another SFF file (and
00168 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
00169 
00170 As an example, let's pretend this example SFF file represents some DNA which
00171 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
00172 produce a sub-file containing all those reads whose post-quality clipping
00173 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
00174 degenerate bit of this pretend primer):
00175 
00176     >>> from Bio import SeqIO
00177     >>> records = (record for record in
00178     ...            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff")
00179     ...            if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
00180     >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
00181     >>> print "Selected %i records" % count
00182     Selected 2 records
00183 
00184 Of course, for an assembly you would probably want to remove these primers.
00185 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
00186 if you want SFF output we have to preserve all the flow information - the trick
00187 is just to adjust the left clip position!
00188 
00189     >>> from Bio import SeqIO
00190     >>> def filter_and_trim(records, primer):
00191     ...     for record in records:
00192     ...         if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
00193     ...             record.annotations["clip_qual_left"] += len(primer)
00194     ...             yield record
00195     >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
00196     >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
00197     ...                     "temp_filtered.sff", "sff")
00198     >>> print "Selected %i records" % count
00199     Selected 2 records
00200 
00201 We can check the results, note the lower case clipped region now includes the "AAAGA"
00202 sequence:
00203 
00204     >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
00205     ...     print record.id, len(record), record.seq[:20]+"..."
00206     E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
00207     E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
00208     >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
00209     ...     print record.id, len(record), record.seq[:20]+"..."
00210     E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
00211     E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
00212     >>> import os
00213     >>> os.remove("temp_filtered.sff")
00214 
00215 For a description of the file format, please see the Roche manuals and:
00216 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
00217 
00218 """
00219 from Bio.SeqIO.Interfaces import SequenceWriter
00220 from Bio import Alphabet
00221 from Bio.Seq import Seq
00222 from Bio.SeqRecord import SeqRecord
00223 import struct
00224 import sys
00225 import re
00226 
00227 from Bio._py3k import _bytes_to_string, _as_bytes
00228 _null = _as_bytes("\0")
00229 _sff = _as_bytes(".sff")
00230 _hsh = _as_bytes(".hsh")
00231 _srt = _as_bytes(".srt")
00232 _mft = _as_bytes(".mft")
00233 _flag = _as_bytes("\xff")
00234 
def _sff_file_header(handle):
    """Read in an SFF file header (PRIVATE).

    Assumes the handle is at the start of the file, will read forwards
    through the header and leave the handle pointing at the first record.
    Returns a tuple of values from the header (header_length, index_offset,
    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

    Raises a ValueError if the handle's mode is unsuitable, the file is
    empty or truncated, or the header is not a supported SFF v1 header.

    >>> handle = open("Roche/greek.sff", "rb")
    >>> values = _sff_file_header(handle)
    >>> print values[0]
    840
    >>> print values[1]
    65040
    >>> print values[2]
    256
    >>> print values[3]
    24
    >>> print values[4]
    800
    >>> values[-1]
    'TCAG'

    """
    if hasattr(handle,"mode") and "U" in handle.mode.upper():
        raise ValueError("SFF files must NOT be opened in universal new "
                         "lines mode. Binary mode is recommended (although "
                         "on Unix the default mode is also fine).")
    elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
    and sys.platform == "win32":
        raise ValueError("SFF files must be opened in binary mode on Windows")
    #file header (part one)
    #use big endian encoding    >
    #magic_number               I
    #version                    4B
    #index_offset               Q
    #index_length               I
    #number_of_reads            I
    #header_length              H
    #key_length                 H
    #number_of_flows_per_read   H
    #flowgram_format_code       B
    #[rest of file header depends on the number of flows and how many keys]
    fmt = '>4s4BQIIHHHB'
    assert 31 == struct.calcsize(fmt)
    data = handle.read(31)
    if not data:
        raise ValueError("Empty file.")
    elif len(data) < 31:
        #Must check for the full 31 bytes: struct.unpack requires an
        #exact size match, so a short read (a truncated file) would
        #otherwise fail below with an unhelpful struct.error.
        raise ValueError("File too small to hold a valid SFF header.")
    magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
    number_of_reads, header_length, key_length, number_of_flows_per_read, \
    flowgram_format = struct.unpack(fmt, data)
    if magic_number in [_hsh, _srt, _mft]:
        #Probably user error, calling Bio.SeqIO.parse() twice!
        raise ValueError("Handle seems to be at SFF index block, not start")
    if magic_number != _sff: # 779314790
        raise ValueError("SFF file did not start '.sff', but %s" \
                         % repr(magic_number))
    if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
        raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
                         % (ver0, ver1, ver2, ver3))
    if flowgram_format != 1:
        raise ValueError("Flowgram format code %i not supported" \
                         % flowgram_format)
    #The index offset and length must be both zero (no index) or both
    #non-zero (index present); exactly one being set is malformed.
    if (index_offset!=0) ^ (index_length!=0):
        raise ValueError("Index offset %i but index length %i" \
                         % (index_offset, index_length))
    flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
    key_sequence = _bytes_to_string(handle.read(key_length))
    #According to the spec, the header_length field should be the total number
    #of bytes required by this set of header fields, and should be equal to
    #"31 + number_of_flows_per_read + key_length" rounded up to the next value
    #divisible by 8.
    assert header_length % 8 == 0
    padding = header_length - number_of_flows_per_read - key_length - 31
    assert 0 <= padding < 8, padding
    if handle.read(padding).count(_null) != padding:
        raise ValueError("Post header %i byte padding region contained data" \
                         % padding)
    return header_length, index_offset, index_length, \
           number_of_reads, number_of_flows_per_read, \
           flow_chars, key_sequence
00318 
#This is a generator function!
def _sff_do_slow_index(handle):
    """Generates an index by scanning through all the reads in an SFF file (PRIVATE).

    This is a slow but generic approach if we can't parse the provided index
    (if present).

    Will use the handle seek/tell functions.

    Yields tuples of (read name, record offset in bytes from the start of
    the file), one per read, in file order.
    """
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
    number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    #Now on to the reads...
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    #NOTE - assuming flowgram_format==1, which means struct type H
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert read_header_size % 8 == 0 #Important for padding calc later!
    for read in range(number_of_reads):
        record_offset = handle.tell()
        if record_offset == index_offset:
            #Found index block within reads, ignore it:
            #(skip past it, rounding the end up to the next 8 byte boundary)
            offset = index_offset + index_length
            if offset % 8:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            record_offset = offset
        #assert record_offset%8 == 0 #Worth checking, but slow
        #First the fixed header
        data = handle.read(read_header_size)
        read_header_length, name_length, seq_len, clip_qual_left, \
        clip_qual_right, clip_adapter_left, clip_adapter_right \
            = struct.unpack(read_header_fmt, data)
        if read_header_length < 10 or read_header_length % 8 != 0:
            raise ValueError("Malformed read header, says length is %i:\n%s" \
                             % (read_header_length, repr(data)))
        #now the name and any padding (remainder of header)
        name = _bytes_to_string(handle.read(name_length))
        padding = read_header_length - read_header_size - name_length
        if handle.read(padding).count(_null) != padding:
            raise ValueError("Post name %i byte padding region contained data" \
                             % padding)
        assert record_offset + read_header_length == handle.tell()
        #now the flowgram values, flowgram index, bases and qualities
        #(flow values are 2 bytes each; flow index, bases and quality
        #scores are 1 byte per base - hence 3*seq_len) - skip them all:
        size = read_flow_size + 3*seq_len
        handle.seek(size, 1)
        #now any padding...
        padding = size % 8
        if padding:
            padding = 8 - padding
            if handle.read(padding).count(_null) != padding:
                raise ValueError("Post quality %i byte padding region contained data" \
                                 % padding)
        #print read, name, record_offset
        yield name, record_offset
    if handle.tell() % 8 != 0:
        raise ValueError("After scanning reads, did not end on a multiple of 8")
00382 
def _sff_find_roche_index(handle):
    """Locate any existing Roche style XML meta data and read index (PRIVATE).

    Makes a number of hard coded assumptions based on reverse engineered SFF
    files from Roche 454 machines.

    Returns a tuple of (number of reads, file header length, SFF "index"
    offset and size, XML manifest offset and size, and the actual read
    index offset and size). The XML offset/size are zero for plain ".srt"
    indexes which have no manifest.

    Raises a ValueError for unsupported or non-Roche index blocks.
    """
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
    number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    assert handle.tell() == header_length
    #Both fields must be non-zero for an index to be present (note the
    #original code here mistakenly tested index_offset twice, so a file
    #reporting an offset with zero length slipped through this check).
    if not index_offset or not index_length:
        raise ValueError("No index present in this SFF file")
    #Now jump to the header...
    handle.seek(index_offset)
    fmt = ">4s4B"
    fmt_size = struct.calcsize(fmt)
    data = handle.read(fmt_size)
    if not data:
        raise ValueError("Premature end of file? Expected index of size %i at offset %i, found nothing" \
                         % (index_length, index_offset))
    if len(data) < fmt_size:
        raise ValueError("Premature end of file? Expected index of size %i at offset %i, found %s" \
                         % (index_length, index_offset, repr(data)))
    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
    if magic_number == _mft: # 778921588
        #Roche 454 manifest index
        #This is typical from raw Roche 454 SFF files (2009), and includes
        #both an XML manifest and the sorted index.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            #This is "1.00" as a string
            raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
                             % (ver0, ver1, ver2, ver3))
        fmt2 = ">LL"
        fmt2_size = struct.calcsize(fmt2)
        xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
        #Sanity check: the declared index length must account for the two
        #headers plus the XML manifest plus the read index data.
        if index_length != fmt_size + fmt2_size + xml_size + data_size:
            raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
                             % (index_length, fmt_size, fmt2_size, xml_size, data_size))
        return number_of_reads, header_length, \
               index_offset, index_length, \
               index_offset + fmt_size + fmt2_size, xml_size, \
               index_offset + fmt_size + fmt2_size + xml_size, data_size
    elif magic_number == _srt: #779317876
        #Roche 454 sorted index
        #I've had this from Roche tool sfffile when the read identifiers
        #had nonstandard lengths and there was no XML manifest.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            #This is "1.00" as a string
            raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
                             % (ver0, ver1, ver2, ver3))
        data = handle.read(4)
        if data != _null*4:
            raise ValueError("Did not find expected null four bytes in .srt index")
        #No XML manifest in this index style (zeros for offset and size):
        return number_of_reads, header_length, \
               index_offset, index_length, \
               0, 0, \
               index_offset + fmt_size + 4, index_length - fmt_size - 4
    elif magic_number == _hsh:
        raise ValueError("Hash table style indexes (.hsh) in SFF files are "
                         "not (yet) supported")
    else:
        raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
                         % (repr(magic_number), repr(data)))
00452 
def _sff_read_roche_index_xml(handle):
    """Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE, DEPRECATED).

    Will use the handle seek/tell functions. Returns a string.

    Deprecated private helper kept only because some third party code is
    believed to call it; it now simply warns and delegates to the public
    replacement, ReadRocheXmlManifest.
    """
    import warnings
    message = ("Private function _sff_read_roche_index_xml is deprecated. "
               "Use new public function ReadRocheXmlManifest instead")
    warnings.warn(message, DeprecationWarning)
    return ReadRocheXmlManifest(handle)
00467 
00468 
def ReadRocheXmlManifest(handle):
    """Reads any Roche style XML manifest data in the SFF "index".

    The SFF file format allows for multiple different index blocks, and Roche
    took advantage of this to define their own index block which also embeds
    an XML manifest string. This is not a publicly documented extension to
    the SFF file format, this was reverse engineered.

    The handle should be to an SFF file opened in binary mode. This function
    will use the handle seek/tell functions and leave the handle in an
    arbitrary location.

    Any XML manifest found is returned as a Python string, which you can then
    parse as appropriate, or reuse when writing out SFF files with the
    SffWriter class.

    Returns a string, or raises a ValueError if a Roche manifest could not be
    found.
    """
    number_of_reads, header_length, index_offset, index_length, xml_offset, \
    xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
    #Plain ".srt" style indexes have no manifest; _sff_find_roche_index
    #reports their XML offset and size as zero.
    if not xml_offset or not xml_size:
        raise ValueError("No XML manifest found")
    handle.seek(xml_offset)
    #Read the raw manifest bytes and decode them into a Python string:
    return _bytes_to_string(handle.read(xml_size))
00494 
00495 
#This is a generator function!
def _sff_read_roche_index(handle):
    """Reads any existing Roche style read index provided in the SFF file (PRIVATE).

    Will use the handle seek/tell functions.

    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.

    Roche SFF indices use base 255 not 256, meaning we see bytes in the
    range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
    can be used as a marker character to separate entries (required if the
    read name lengths vary).

    Note that since only four bytes are used for the read offset, this is
    limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
    tool to combine SFF files beyond this limit, they issue a warning and
    omit the index (and manifest).

    Yields tuples of (read name, record offset in bytes) per index entry.
    """
    number_of_reads, header_length, index_offset, index_length, xml_offset, \
    xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
    #Now parse the read index...
    handle.seek(read_index_offset)
    fmt = ">5B"
    for read in range(number_of_reads):
        #TODO - Be more aware of when the index should end?
        #Each entry is the read name, five offset bytes, then a 0xFF flag;
        #read six bytes up front, then one at a time until the flag:
        data = handle.read(6)
        while True:
            more = handle.read(1)
            if not more:
                raise ValueError("Premature end of file!")
            data += more
            if more == _flag: break
        assert data[-1:] == _flag, data[-1:]
        #Strip the trailing six bytes (five offset bytes plus the flag)
        #to recover the variable length read name:
        name = _bytes_to_string(data[:-6])
        off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
        #Base 255 offset: 65025 = 255**2, 16581375 = 255**3
        offset = off0 + 255*off1 + 65025*off2 + 16581375*off3
        if off4:
            #Could in theory be used as a fifth piece of offset information,
            #i.e. offset += 4228250625L*off4, but testing the Roche tools this
            #is not the case. They simply don't support such large indexes.
            raise ValueError("Expected a null terminator to the read name.")
        yield name, offset
    if handle.tell() != read_index_offset + read_index_size:
        raise ValueError("Problem with index length? %i vs %i" \
                         % (handle.tell(), read_index_offset + read_index_size))
00541 
#Matches 454 "Universal" Accession Number style read names (14 chars)
_valid_UAN_read_name = re.compile(r'^[a-zA-Z0-9]{14}$')
def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
                         key_sequence, alphabet, trim=False):
    """Parse the next read in the file, return data as a SeqRecord (PRIVATE).

    Assumes the handle is positioned at the start of a read record, and
    that the flowgram format code is 1 (two bytes per flow value). With
    trim=False the full read is returned in mixed case (clipped regions
    in lower case) with the flow and clipping data in the annotations
    dictionary; with trim=True just the upper case clipped region is
    returned and those annotations are omitted.
    """
    #Now on to the reads...
    #the read header format (fixed part):
    #read_header_length     H
    #name_length            H
    #seq_len                I
    #clip_qual_left         H
    #clip_qual_right        H
    #clip_adapter_left      H
    #clip_adapter_right     H
    #[rest of read header depends on the name length etc]
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)

    read_header_length, name_length, seq_len, clip_qual_left, \
    clip_qual_right, clip_adapter_left, clip_adapter_right \
        = struct.unpack(read_header_fmt, handle.read(read_header_size))
    #SFF clip positions are one based; convert the left clips (right
    #clips are already usable as Python slice end points):
    if clip_qual_left:
        clip_qual_left -= 1 #python counting
    if clip_adapter_left:
        clip_adapter_left -= 1 #python counting
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError("Malformed read header, says length is %i" \
                         % read_header_length)
    #now the name and any padding (remainder of header)
    name = _bytes_to_string(handle.read(name_length))
    padding = read_header_length - read_header_size - name_length
    if handle.read(padding).count(_null) != padding:
        raise ValueError("Post name %i byte padding region contained data" \
                         % padding)
    #now the flowgram values, flowgram index, bases and qualities
    #NOTE - assuming flowgram_format==1, which means struct type H
    flow_values = handle.read(read_flow_size) #unpack later if needed
    temp_fmt = ">%iB" % seq_len # used for flow index and quals
    flow_index = handle.read(seq_len) #unpack later if needed
    seq = _bytes_to_string(handle.read(seq_len)) #TODO - Use bytes in Seq?
    quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
    #now any padding... (records are padded to 8 byte boundaries)
    padding = (read_flow_size + seq_len*3)%8
    if padding:
        padding = 8 - padding
        if handle.read(padding).count(_null) != padding:
            raise ValueError("Post quality %i byte padding region contained data" \
                             % padding)
    #Follow Roche and apply most aggressive of qual and adapter clipping.
    #Note Roche seems to ignore adapter clip fields when writing SFF,
    #and uses just the quality clipping values for any clipping.
    clip_left = max(clip_qual_left, clip_adapter_left)
    #Right clipping of zero means no clipping
    if clip_qual_right:
        if clip_adapter_right:
            clip_right = min(clip_qual_right, clip_adapter_right)
        else:
            #Typical case with Roche SFF files
            clip_right = clip_qual_right
    elif clip_adapter_right:
        clip_right = clip_adapter_right
    else:
        clip_right = seq_len
    #Now build a SeqRecord
    if trim:
        seq = seq[clip_left:clip_right].upper()
        quals = quals[clip_left:clip_right]
        #Don't record the clipping values, flow etc, they make no sense now:
        annotations = {}
    else:
        #This use of mixed case mimics the Roche SFF tool's FASTA output
        seq = seq[:clip_left].lower() + \
              seq[clip_left:clip_right].upper() + \
              seq[clip_right:].lower()
        annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
                       "flow_index":struct.unpack(temp_fmt, flow_index),
                       "flow_chars":flow_chars,
                       "flow_key":key_sequence,
                       "clip_qual_left":clip_qual_left,
                       "clip_qual_right":clip_qual_right,
                       "clip_adapter_left":clip_adapter_left,
                       "clip_adapter_right":clip_adapter_right}
    #Read names which look like 454 Universal Accession Numbers carry
    #extra meta-data (run time, region number and well coordinates):
    if re.match(_valid_UAN_read_name, name):
        annotations["time"] = _get_read_time(name)
        annotations["region"] = _get_read_region(name)
        annotations["coords"] = _get_read_xy(name)
    record = SeqRecord(Seq(seq, alphabet),
                       id=name,
                       name=name,
                       description="",
                       annotations=annotations)
    #Dirty trick to speed up this line:
    #record.letter_annotations["phred_quality"] = quals
    dict.__setitem__(record._per_letter_annotations,
                     "phred_quality", quals)
    #Return the record and then continue...
    return record
00640 
00641 _powers_of_36 = [36**i for i in range(6)]
00642 def _string_as_base_36(string):
00643     """Interpret a string as a base-36 number as per 454 manual."""
00644     total = 0
00645     for c, power in zip(string[::-1], _powers_of_36):
00646         # For reference: ord('0') = 48, ord('9') = 57
00647         # For reference: ord('A') = 65, ord('Z') = 90
00648         # For reference: ord('a') = 97, ord('z') = 122
00649         if 48 <= ord(c) <= 57:
00650             val = ord(c) - 22 # equivalent to: - ord('0') + 26
00651         elif 65 <= ord(c) <= 90:
00652             val = ord(c) - 65
00653         elif 97 <= ord(c) <= 122:
00654             val = ord(c) - 97
00655         else:
00656             # Invalid character
00657             val = 0
00658         total += val * power 
00659     return total
00660 
def _get_read_xy(read_name):
    """Extract coordinates from last 5 characters of read name.

    Returns an (x, y) tuple; the base-36 encoded value is x * 4096 + y.
    """
    encoded = _string_as_base_36(read_name[9:])
    x, y = divmod(encoded, 4096)
    return x, y
00665 
#Mixed-radix place values for [month, day, hour, minute, second] - note
#the 454 scheme allows for 13 months and 32 days (i.e. 1-based fields):
_time_denominators = [13 * 32 * 24 * 60 * 60,
                      32 * 24 * 60 * 60,
                      24 * 60 * 60,
                      60 * 60,
                      60]
def _get_read_time(read_name):
    """Extract time from first 6 characters of read name.

    Returns a list [year, month, day, hour, minute, second], with the
    year offset by 2000 as per the 454 naming convention.
    """
    fields = []
    value = _string_as_base_36(read_name[:6])
    for divisor in _time_denominators:
        quotient, value = divmod(value, divisor)
        fields.append(quotient)
    fields.append(value)
    fields[0] += 2000
    return fields
00681 
00682 def _get_read_region(read_name):
00683     """Extract region from read name."""
00684     return int(read_name[8])
00685 
def _sff_read_raw_record(handle, number_of_flows_per_read):
    """Extract the next read in the file as a raw (bytes) string (PRIVATE)."""
    fixed_fmt = '>2HI'
    fixed_size = struct.calcsize(fixed_fmt)
    flow_fmt = ">%iH" % number_of_flows_per_read
    flow_size = struct.calcsize(flow_fmt)

    #Fixed part of the read header: total header length, name length,
    #and number of bases in the read:
    data = handle.read(fixed_size)
    read_header_length, name_length, seq_len = struct.unpack(fixed_fmt, data)
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError("Malformed read header, says length is %i" \
                         % read_header_length)
    #Next come the four clip values (4H = 8 bytes) plus the read name:
    data += handle.read(8 + name_length)
    #The remainder of the header must be null padding:
    padding = read_header_length - fixed_size - 8 - name_length
    pad = handle.read(padding)
    if pad.count(_null) != padding:
        raise ValueError("Post name %i byte padding region contained data" \
                         % padding)
    data += pad
    #Flowgram values (2 bytes each), then flowgram index, bases and
    #qualities (1 byte per base each):
    body_size = flow_size + seq_len*3
    data += handle.read(body_size)
    #Finally any null padding out to an 8 byte boundary:
    padding = body_size % 8
    if padding:
        padding = 8 - padding
        pad = handle.read(padding)
        if pad.count(_null) != padding:
            raise ValueError("Post quality %i byte padding region contained data" \
                             % padding)
        data += pad
    #Return the raw bytes
    return data
00721 
00722 class _AddTellHandle(object):
00723     """Wrapper for handles which do not support the tell method (PRIVATE).
00724 
00725     Intended for use with things like network handles where tell (and reverse
00726     seek) are not supported. The SFF file needs to track the current offset in
00727     order to deal with the index block.
00728     """
00729     def __init__(self, handle):
00730         self._handle = handle
00731         self._offset = 0
00732 
00733     def read(self, length):
00734         data = self._handle.read(length)
00735         self._offset += len(data)
00736         return data
00737 
00738     def tell(self):
00739         return self._offset
00740 
00741     def seek(self, offset):
00742         if offset < self._offset:
00743             raise RunTimeError("Can't seek backwards")
00744         self._handle.read(offset - self._offset)
00745 
00746     def close(self):
00747         return self._handle.close()
00748 
00749 
#This is a generator function!
def SffIterator(handle, alphabet=Alphabet.generic_dna, trim=False):
    """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

    handle - input file, an SFF file, e.g. from Roche 454 sequencing.
             This must NOT be opened in universal read lines mode!
    alphabet - optional alphabet, defaults to generic DNA.
    trim - should the sequences be trimmed?

    The resulting SeqRecord objects should match those from a paired FASTA
    and QUAL file converted from the SFF file using the Roche 454 tool
    sffinfo. i.e. The sequence will be mixed case, with the trim regions
    shown in lower case.

    A ValueError is raised for protein or RNA alphabets, and if trailing
    data is found after the final read.

    This function is used internally via the Bio.SeqIO functions:

    >>> from Bio import SeqIO
    >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
    >>> for record in SeqIO.parse(handle, "sff"):
    ...     print record.id, len(record)
    E3MFGYR02JWQ7T 265
    E3MFGYR02JA6IL 271
    E3MFGYR02JHD4H 310
    E3MFGYR02GFKUC 299
    E3MFGYR02FTGED 281
    E3MFGYR02FR9G7 261
    E3MFGYR02GAZMS 278
    E3MFGYR02HHZ8O 221
    E3MFGYR02GPGB1 269
    E3MFGYR02F7Z7G 219
    >>> handle.close()

    You can also call it directly:

    >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
    >>> for record in SffIterator(handle):
    ...     print record.id, len(record)
    E3MFGYR02JWQ7T 265
    E3MFGYR02JA6IL 271
    E3MFGYR02JHD4H 310
    E3MFGYR02GFKUC 299
    E3MFGYR02FTGED 281
    E3MFGYR02FR9G7 261
    E3MFGYR02GAZMS 278
    E3MFGYR02HHZ8O 221
    E3MFGYR02GPGB1 269
    E3MFGYR02F7Z7G 219
    >>> handle.close()

    Or, with the trim option:

    >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
    >>> for record in SffIterator(handle, trim=True):
    ...     print record.id, len(record)
    E3MFGYR02JWQ7T 260
    E3MFGYR02JA6IL 265
    E3MFGYR02JHD4H 292
    E3MFGYR02GFKUC 295
    E3MFGYR02FTGED 277
    E3MFGYR02FR9G7 256
    E3MFGYR02GAZMS 271
    E3MFGYR02HHZ8O 150
    E3MFGYR02GPGB1 221
    E3MFGYR02F7Z7G 130
    >>> handle.close()

    """
    #SFF files only ever hold DNA, so reject incompatible alphabets up front:
    if isinstance(Alphabet._get_base_alphabet(alphabet),
                  Alphabet.ProteinAlphabet):
        raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
    if isinstance(Alphabet._get_base_alphabet(alphabet),
                  Alphabet.RNAAlphabet):
        raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
    try:
        assert 0 == handle.tell()
    except AttributeError:
        #Probably a network handle or something like that; wrap it so we
        #can still track the current file offset (needed for the index).
        handle = _AddTellHandle(handle)
    header_length, index_offset, index_length, number_of_reads, \
    number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    #Now on to the reads...
    #the read header format (fixed part):
    #read_header_length     H
    #name_length            H
    #seq_len                I
    #clip_qual_left         H
    #clip_qual_right        H
    #clip_adapter_left      H
    #clip_adapter_right     H
    #[rest of read header depends on the name length etc]
    read_header_fmt = '>2HI4H'
    read_header_size = struct.calcsize(read_header_fmt)
    read_flow_fmt = ">%iH" % number_of_flows_per_read
    read_flow_size = struct.calcsize(read_flow_fmt)
    #Sanity check the struct sizes our padding arithmetic relies on:
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert read_header_size % 8 == 0 #Important for padding calc later!
    #The spec allows for the index block to be before or even in the middle
    #of the reads. We can check that if we keep track of our position
    #in the file...
    for read in range(number_of_reads):
        if index_offset and handle.tell() == index_offset:
            #We've hit the index block; skip over it (rounded up to the
            #next 8 byte boundary) to reach the following read.
            offset = index_offset + index_length
            if offset % 8:
                offset += 8 - (offset % 8)
            assert offset % 8 == 0
            handle.seek(offset)
            #Now that we've done this, we don't need to do it again. Clear
            #the index_offset so we can skip extra handle.tell() calls:
            index_offset = 0
        yield _sff_read_seq_record(handle,
                                   number_of_flows_per_read,
                                   flow_chars,
                                   key_sequence,
                                   alphabet,
                                   trim)
    #The following is not essential, but avoids confusing error messages
    #for the user if they try and re-parse the same handle.
    if index_offset and handle.tell() == index_offset:
        #Index block sits after the final read; skip it as above.
        offset = index_offset + index_length
        if offset % 8:
            offset += 8 - (offset % 8)
        assert offset % 8 == 0
        handle.seek(offset)
    #Should now be at the end of the file...
    if handle.read(1):
        raise ValueError("Additional data at end of SFF file")
00879 
00880 
#Note: unlike SffIterator this is a plain function - it simply returns
#the generator produced by SffIterator with trimming turned on.
def _SffTrimIterator(handle, alphabet=Alphabet.generic_dna):
    """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
    return SffIterator(handle, alphabet, trim=True)
00885 
00886 
class SffWriter(SequenceWriter):
    """SFF file writer.

    Writes the SFF header, each read record, and (optionally) a Roche
    style index block at the end of the file. Because the read count and
    index location are only known once all records have been written, the
    header is written again (in place) at the end when needed.
    """

    def __init__(self, handle, index=True, xml=None):
        """Creates the writer object.

        handle - Output handle, ideally in binary write mode.
        index - Boolean argument, should we try and write an index?
        xml - Optional string argument, xml manifest to be recorded in the index
              block (see function ReadRocheXmlManifest for reading this data).
        """
        if hasattr(handle,"mode") and "U" in handle.mode.upper():
            raise ValueError("SFF files must NOT be opened in universal new "
                             "lines mode. Binary mode is required")
        elif hasattr(handle,"mode") and "B" not in handle.mode.upper():
            raise ValueError("SFF files must be opened in binary mode")
        self.handle = handle
        self._xml = xml
        #self._index is a list of (name, offset) pairs, or None if no
        #index block is wanted (or possible on this handle):
        if index:
            self._index = []
        else:
            self._index = None

    def write_file(self, records):
        """Use this to write an entire file containing the given records.

        records - an iterable of SeqRecord objects carrying the SFF flow
                  and clipping annotations (see write_record).

        Returns the number of records written. Raises ValueError if there
        are no records, or if flow information is missing/inconsistent.
        """
        try:
            self._number_of_reads = len(records)
        except TypeError:
            #An iterator/generator - we won't know the count until the end,
            #so we will have to come back and rewrite the header then:
            self._number_of_reads = 0 #dummy value
            if not hasattr(self.handle, "seek") \
            or not hasattr(self.handle, "tell"):
                raise ValueError("A handle with a seek/tell methods is "
                                 "required in order to record the total "
                                 "record count in the file header (once it "
                                 "is known at the end).")
        if self._index is not None and \
        not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
            import warnings
            warnings.warn("A handle with a seek/tell methods is required in "
                          "order to record an SFF index.")
            self._index = None
        self._index_start = 0
        self._index_length = 0
        if not hasattr(records, "next"):
            records = iter(records)
        #Get the first record in order to find the flow information
        #we will need for the header.
        try:
            record = records.next()
        except StopIteration:
            record = None
        if record is None:
            #No records -> empty SFF file (or an error)?
            #We can't write a header without the flow information.
            #return 0
            raise ValueError("Must have at least one sequence")
        try:
            self._key_sequence = _as_bytes(record.annotations["flow_key"])
            self._flow_chars = _as_bytes(record.annotations["flow_chars"])
            self._number_of_flows_per_read = len(self._flow_chars)
        except KeyError:
            raise ValueError("Missing SFF flow information")
        self.write_header()
        self.write_record(record)
        count = 1
        for record in records:
            self.write_record(record)
            count += 1
        if self._number_of_reads == 0:
            #Must go back and record the record count...
            offset = self.handle.tell()
            self.handle.seek(0)
            self._number_of_reads = count
            self.write_header()
            self.handle.seek(offset) #not essential?
        else:
            assert count == self._number_of_reads
        if self._index is not None:
            self._write_index()
        return count

    def _write_index(self):
        """Write a Roche style index block at the current offset (PRIVATE).

        Sorts the collected (name, offset) pairs by name, writes the index
        header (magic number, version, XML manifest) followed by one entry
        per read, then rewrites both the index header and the file header
        so they record the final index location and size.
        """
        assert len(self._index)==self._number_of_reads
        handle = self.handle
        self._index.sort()
        self._index_start = handle.tell() #need for header
        #XML...
        if self._xml is not None:
            xml = _as_bytes(self._xml)
        else:
            #No manifest supplied; record a minimal placeholder comment:
            from Bio import __version__
            xml = "<!-- This file was output with Biopython %s -->\n" % __version__
            xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
            xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
            xml = _as_bytes(xml)
        xml_len = len(xml)
        #Write to the file...
        fmt = ">I4BLL"
        fmt_size = struct.calcsize(fmt)
        handle.write(_null*fmt_size + xml) #will come back later to fill this
        fmt2 = ">6B"
        assert 6 == struct.calcsize(fmt2)
        self._index.sort()
        index_len = 0 #don't know yet!
        for name, offset in self._index:
            #Roche files record the offsets using base 255 not 256.
            #See comments for parsing the index block. There may be a faster
            #way to code this, but we can't easily use shifts due to odd base
            off3 = offset
            off0 = off3 % 255
            off3 -= off0
            off1 = off3 % 65025
            off3 -= off1
            off2 = off3 % 16581375
            off3 -= off2
            assert offset == off0 + off1 + off2 + off3, \
                   "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
            off3, off2, off1, off0 = off3//16581375, off2//65025, \
                                     off1//255, off0
            assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
                   "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
            #Each entry is the read name, a null, the four offset digits,
            #and a 255 terminator byte:
            handle.write(name + struct.pack(fmt2, 0, \
                                            off3, off2, off1, off0, 255))
            index_len += len(name) + 6
        #Note any padding is not included:
        self._index_length = fmt_size + xml_len + index_len #need for header
        #Pad out to an 8 byte boundary (although I have noticed some
        #real Roche SFF files neglect to do this despite their manual
        #suggesting this padding should be there):
        if self._index_length % 8:
            padding = 8 - (self._index_length%8)
            handle.write(_null*padding)
        else:
            padding = 0
        offset = handle.tell()
        assert offset == self._index_start + self._index_length + padding, \
               "%i vs %i + %i + %i"  % (offset, self._index_start, \
                                        self._index_length, padding)
        #Must now go back and update the index header with index size...
        handle.seek(self._index_start)
        handle.write(struct.pack(fmt, 778921588, #magic number
                                 49,46,48,48, #Roche index version, "1.00"
                                 xml_len, index_len) + xml)
        #Must now go back and update the header...
        handle.seek(0)
        self.write_header()
        handle.seek(offset) #not essential?

    def write_header(self):
        """Write (or re-write) the SFF file header at the current offset.

        Called once before the records, and possibly again at the end once
        the read count and/or index position are known (hence it must
        always produce a header of exactly the same length).
        """
        #Do header...
        key_length = len(self._key_sequence)
        #file header (part one)
        #use big endian encoding    >
        #magic_number               I
        #version                    4B
        #index_offset               Q
        #index_length               I
        #number_of_reads            I
        #header_length              H
        #key_length                 H
        #number_of_flows_per_read   H
        #flowgram_format_code       B
        #[rest of file header depends on the number of flows and how many keys]
        fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
        #According to the spec, the header_length field should be the total
        #number of bytes required by this set of header fields, and should be
        #equal to "31 + number_of_flows_per_read + key_length" rounded up to
        #the next value divisible by 8.
        if struct.calcsize(fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(fmt) % 8)
        header_length = struct.calcsize(fmt) + padding
        assert header_length % 8 == 0
        header = struct.pack(fmt, 779314790, #magic number 0x2E736666
                             0, 0, 0, 1, #version
                             self._index_start, self._index_length,
                             self._number_of_reads,
                             header_length, key_length,
                             self._number_of_flows_per_read,
                             1, #the only flowgram format code we support
                             self._flow_chars, self._key_sequence)
        self.handle.write(header + _null*padding)

    def write_record(self, record):
        """Write a single additional record to the output file.

        This assumes the header has been done.

        record - SeqRecord with "phred_quality" letter annotations plus
                 "flow_values", "flow_index", "flow_key", "flow_chars" and
                 the four clip values in record.annotations.
        """
        #Basics
        name = _as_bytes(record.id)
        name_len = len(name)
        seq = _as_bytes(str(record.seq).upper())
        seq_len = len(seq)
        #Qualities
        try:
            quals = record.letter_annotations["phred_quality"]
        except KeyError:
            raise ValueError("Missing PHRED qualities information")
        #Flow
        try:
            flow_values = record.annotations["flow_values"]
            flow_index = record.annotations["flow_index"]
            #All records in one SFF file must share the same key/flow setup:
            if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
            or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
                raise ValueError("Records have inconsistent SFF flow data")
        except KeyError:
            raise ValueError("Missing SFF flow information")
        except AttributeError:
            raise ValueError("Header not written yet?")
        #Clipping
        #The +1 presumably converts the left clips back from Python style
        #0-based slice coordinates to the SFF 1-based convention, while
        #zero continues to mean "no clipping" - TODO confirm against the
        #parser's matching adjustment.
        try:
            clip_qual_left = record.annotations["clip_qual_left"]
            if clip_qual_left:
                clip_qual_left += 1
            clip_qual_right = record.annotations["clip_qual_right"]
            clip_adapter_left = record.annotations["clip_adapter_left"]
            if clip_adapter_left:
                clip_adapter_left += 1
            clip_adapter_right = record.annotations["clip_adapter_right"]
        except KeyError:
            raise ValueError("Missing SFF clipping information")

        #Capture information for index
        if self._index is not None:
            offset = self.handle.tell()
            #Check the position of the final record (before sort by name)
            #Using a four-digit base 255 number, so the upper bound is
            #254*(1)+254*(255)+254*(255**2)+254*(255**3) = 4228250624
            #or equivalently it overflows at 255**4 = 4228250625
            if offset > 4228250624:
                import warnings
                warnings.warn("Read %s has file offset %i, which is too large "
                              "to store in the Roche SFF index structure. No "
                              "index block will be recorded." % (name, offset))
                #No point recording the offsets now
                self._index = None
            else:
                self._index.append((name, self.handle.tell()))

        #the read header format (fixed part):
        #read_header_length     H
        #name_length            H
        #seq_len                I
        #clip_qual_left         H
        #clip_qual_right        H
        #clip_adapter_left      H
        #clip_adapter_right     H
        #[rest of read header depends on the name length etc]
        #name
        #flow values
        #flow index
        #sequence
        #padding
        read_header_fmt = '>2HI4H%is' % name_len
        if struct.calcsize(read_header_fmt) % 8 == 0:
            padding = 0
        else:
            padding = 8 - (struct.calcsize(read_header_fmt) % 8)
        read_header_length = struct.calcsize(read_header_fmt) + padding
        assert read_header_length % 8 == 0
        data = struct.pack(read_header_fmt,
                           read_header_length,
                           name_len, seq_len,
                           clip_qual_left, clip_qual_right,
                           clip_adapter_left, clip_adapter_right,
                           name) + _null*padding
        assert len(data) == read_header_length
        #now the flowgram values, flowgram index, bases and qualities
        #NOTE - assuming flowgram_format==1, which means struct type H
        read_flow_fmt = ">%iH" % self._number_of_flows_per_read
        read_flow_size = struct.calcsize(read_flow_fmt)
        temp_fmt = ">%iB" % seq_len # used for flow index and quals
        data += struct.pack(read_flow_fmt, *flow_values) \
                + struct.pack(temp_fmt, *flow_index) \
                + seq \
                + struct.pack(temp_fmt, *quals)
        #now any final padding...
        padding = (read_flow_size + seq_len*3)%8
        if padding:
            padding = 8 - padding
        self.handle.write(data + _null*padding)
01169 
01170 
01171 if __name__ == "__main__":
01172     print "Running quick self test"
01173     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
01174     metadata = ReadRocheXmlManifest(open(filename, "rb"))
01175     index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
01176     index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
01177     assert index1 == index2
01178     assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
01179     from StringIO import StringIO
01180     try:
01181         #This is in Python 2.6+, and is essential on Python 3
01182         from io import BytesIO
01183     except ImportError:
01184         BytesIO = StringIO
01185     assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"rb").read()))))
01186 
01187     if sys.platform != "win32":
01188         assert len(index1) == len(list(SffIterator(open(filename, "r"))))
01189         index2 = sorted(_sff_read_roche_index(open(filename)))
01190         assert index1 == index2
01191         index2 = sorted(_sff_do_slow_index(open(filename)))
01192         assert index1 == index2
01193         assert len(index1) == len(list(SffIterator(open(filename))))
01194         assert len(index1) == len(list(SffIterator(BytesIO(open(filename,"r").read()))))
01195         assert len(index1) == len(list(SffIterator(BytesIO(open(filename).read()))))
01196 
01197     sff = list(SffIterator(open(filename, "rb")))
01198 
01199     sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
01200     assert len(sff) == len(sff2)
01201     for old, new in zip(sff, sff2):
01202         assert old.id == new.id
01203         assert str(old.seq) == str(new.seq)
01204 
01205     sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
01206     assert len(sff) == len(sff2)
01207     for old, new in zip(sff, sff2):
01208         assert old.id == new.id
01209         assert str(old.seq) == str(new.seq)
01210 
01211     sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
01212     assert len(sff) == len(sff2)
01213     for old, new in zip(sff, sff2):
01214         assert old.id == new.id
01215         assert str(old.seq) == str(new.seq)
01216 
01217     sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
01218     assert len(sff) == len(sff2)
01219     for old, new in zip(sff, sff2):
01220         assert old.id == new.id
01221         assert str(old.seq) == str(new.seq)
01222 
01223     sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
01224     assert len(sff) == len(sff2)
01225     for old, new in zip(sff, sff2):
01226         assert old.id == new.id
01227         assert str(old.seq) == str(new.seq)
01228 
01229     sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
01230 
01231     print ReadRocheXmlManifest(open(filename, "rb"))
01232 
01233     from Bio import SeqIO
01234     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
01235     fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
01236     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
01237     qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
01238 
01239     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
01240     fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
01241     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
01242     qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
01243 
01244     for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
01245                                    qual_no_trim, fasta_trim, qual_trim):
01246         #print
01247         print s.id
01248         #print s.seq
01249         #print s.letter_annotations["phred_quality"]
01250 
01251         assert s.id == f.id == q.id
01252         assert str(s.seq) == str(f.seq)
01253         assert s.letter_annotations["phred_quality"] == q.letter_annotations["phred_quality"]
01254 
01255         assert s.id == sT.id == fT.id == qT.id
01256         assert str(sT.seq) == str(fT.seq)
01257         assert sT.letter_annotations["phred_quality"] == qT.letter_annotations["phred_quality"]
01258 
01259 
01260     print "Writing with a list of SeqRecords..."
01261     handle = StringIO()
01262     w = SffWriter(handle, xml=metadata)
01263     w.write_file(sff) #list
01264     data = handle.getvalue()
01265     print "And again with an iterator..."
01266     handle = StringIO()
01267     w = SffWriter(handle, xml=metadata)
01268     w.write_file(iter(sff))
01269     assert data == handle.getvalue()
01270     #Check 100% identical to the original:
01271     filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
01272     original = open(filename,"rb").read()
01273     assert len(data) == len(original)
01274     assert data == original
01275     del data
01276     handle.close()
01277 
01278     print "-"*50
01279     filename = "../../Tests/Roche/greek.sff"
01280     for record in SffIterator(open(filename,"rb")):
01281         print record.id
01282     index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
01283     index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
01284     assert index1 == index2
01285     try:
01286         print ReadRocheXmlManifest(open(filename, "rb"))
01287         assert False, "Should fail!"
01288     except ValueError:
01289         pass
01290 
01291 
01292     handle = open(filename, "rb")
01293     for record in SffIterator(handle):
01294         pass
01295     try:
01296         for record in SffIterator(handle):
01297             print record.id
01298         assert False, "Should have failed"
01299     except ValueError, err:
01300         print "Checking what happens on re-reading a handle:"
01301         print err
01302 
01303 
    # NOTE: the triple-quoted string below is deliberately dead code — the
    # one-off scripts once used to generate the alternative-index test
    # fixtures (E3MFGYR02_alt_index_at_start / _in_middle / _at_end.sff).
    # It is kept verbatim as a reference and is never executed.
    """
    #Ugly code to make test files...
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding:
        padding = 8 - padding
    index += chr(0)*padding
    assert len(index)%8 == 0

    #Ugly bit of code to make a fake index at start
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding:
        padding = 8 - padding
    index += chr(0)*padding
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    out_handle.seek(0)
    w.write_header() #this time with index info
    w.handle.write(index)
    for record in records:
        w.write_record(record)
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
    for old, new in zip(records, records2):
        assert str(old.seq)==str(new.seq)
    i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))

    #Ugly bit of code to make a fake index in middle
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index)%8
    if padding:
        padding = 8 - padding
    index += chr(0)*padding
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    for record in records[:5]:
        w.write_record(record)
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    w.handle.write(index)
    for record in records[5:]:
        w.write_record(record)
    out_handle.seek(0)
    w.write_header() #this time with index info
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
    for old, new in zip(records, records2):
        assert str(old.seq)==str(new.seq)
    j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))

    #Ugly bit of code to make a fake index at end
    records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
    w = SffWriter(out_handle, index=False, xml=None)
    #Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()
    for record in records:
        w.write_record(record)
    w._index_start = out_handle.tell()
    w._index_length = len(index)
    out_handle.write(index)
    out_handle.seek(0)
    w.write_header() #this time with index info
    out_handle.close()
    records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
    for old, new in zip(records, records2):
        assert str(old.seq)==str(new.seq)
    try:
        print ReadRocheXmlManifest(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
        assert False, "Should fail!"
    except ValueError:
        pass
    k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
    """

    # All self-tests above completed without tripping an assertion.
    print "Done"