Back to index

python-biopython  1.60
Classes | Functions | Variables
Bio.SeqIO.SffIO Namespace Reference

Classes

class  _AddTellHandle
class  SffWriter

Functions

def _sff_file_header
def _sff_do_slow_index
def _sff_find_roche_index
def _sff_read_roche_index_xml
def ReadRocheXmlManifest
def _sff_read_roche_index
def _sff_read_seq_record
def _string_as_base_36
def _get_read_xy
def _get_read_time
def _get_read_region
def _sff_read_raw_record
def SffIterator
def _SffTrimIterator

Variables

tuple _null = _as_bytes("\0")
tuple _sff = _as_bytes(".sff")
tuple _hsh = _as_bytes(".hsh")
tuple _srt = _as_bytes(".srt")
tuple _mft = _as_bytes(".mft")
tuple _flag = _as_bytes("\xff")
tuple _valid_UAN_read_name = re.compile(r'^[a-zA-Z0-9]{14}$')
list _powers_of_36 = [36**i for i in range(6)]
list _time_denominators
string filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
tuple metadata = ReadRocheXmlManifest(open(filename, "rb"))
tuple index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
tuple index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
 BytesIO = StringIO
tuple sff = list(SffIterator(open(filename, "rb")))
tuple sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
tuple sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
tuple fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
tuple qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
tuple fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
tuple qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
tuple handle = StringIO()
tuple w = SffWriter(handle, xml=metadata)
tuple data = handle.getvalue()
tuple original = open(filename,"rb")

Function Documentation

def Bio.SeqIO.SffIO._get_read_region (   read_name) [private]
Extract region from read name.

Definition at line 682 of file SffIO.py.

00682 
def _get_read_region(read_name):
    """Extract the region number from a read name.

    The region is taken from the two characters at positions 7 and 8
    (zero based), i.e. the characters between the six character time
    prefix plus one further character, and the x,y suffix which starts at
    position 9.  When both characters are decimal digits they are parsed
    together, so that a region such as "12" is returned as 12 rather than
    being cut down to its last digit.

    If the pair is not purely numeric this falls back to the historical
    behaviour of converting just the character at position 8, raising
    IndexError for names that are too short and ValueError when that
    character is not a digit.
    """
    region = read_name[7:9]
    if len(region) == 2 and region.isdigit():
        return int(region)
    #Historical single character behaviour (and its exceptions)
    return int(read_name[8])

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._get_read_time (   read_name) [private]
Extract time from first 6 characters of read name.

Definition at line 671 of file SffIO.py.

00671 
def _get_read_time(read_name):
    """Decode the time stamp held in the first six characters of a read name.

    The six characters are converted with _string_as_base_36 and the
    resulting integer is split, most significant field first, using each
    entry of _time_denominators in turn; whatever is left over becomes the
    final field.  2000 is added to the first field before the list of
    integers is returned.
    """
    leftover = _string_as_base_36(read_name[:6])
    fields = []
    for unit in _time_denominators:
        quotient, leftover = divmod(leftover, unit)
        fields.append(quotient)
    fields.append(leftover)
    #The leading field is stored relative to the year 2000
    fields[0] = fields[0] + 2000
    return fields

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._get_read_xy (   read_name) [private]
Extract coordinates from last 5 characters of read name.

Definition at line 661 of file SffIO.py.

00661 
def _get_read_xy(read_name):
    """Decode the coordinates held in the tail of a read name.

    Everything from position 9 onwards is converted with
    _string_as_base_36, and the resulting integer is returned as the tuple
    (value // 4096, value % 4096).
    """
    packed = _string_as_base_36(read_name[9:])
    x, y = divmod(packed, 4096)
    return x, y

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._sff_do_slow_index (   handle) [private]
Generates an index by scanning through all the reads in an SFF file (PRIVATE).

This is a slow but generic approach if we can't parse the provided index
(if present).

Will use the handle seek/tell functions.

Definition at line 320 of file SffIO.py.

00320 
def _sff_do_slow_index(handle):
    """Build an index by walking over every read in an SFF file (PRIVATE).

    This is the slow, generic fallback for when any index block in the
    file cannot be parsed.  It rewinds the handle, reads the file header
    and then steps through the reads using the handle's seek/tell methods.

    This is a generator yielding (read name, record offset) tuples, one per
    read.  ValueError is raised for a malformed read header, for padding
    which is not all null bytes, or if the scan does not finish on a
    multiple of eight bytes.
    """
    handle.seek(0)
    (header_length, index_offset, index_length, number_of_reads,
     number_of_flows_per_read, flow_chars, key_sequence) \
        = _sff_file_header(handle)
    #Fixed size part of each read header
    fixed_fmt = '>2HI4H'
    fixed_size = struct.calcsize(fixed_fmt)
    #NOTE - assuming flowgram_format==1, which means struct type H
    flow_size = struct.calcsize(">%iH" % number_of_flows_per_read)
    assert 1 == struct.calcsize(">B")
    assert 1 == struct.calcsize(">s")
    assert 1 == struct.calcsize(">c")
    assert fixed_size % 8 == 0 #Important for padding calc later!
    for read in range(number_of_reads):
        start = handle.tell()
        if start == index_offset:
            #The index block sits between reads, hop over it (and over
            #whatever is needed to get back onto an eight byte boundary)
            start = index_offset + index_length
            start += -start % 8
            assert start % 8 == 0
            handle.seek(start)
        #Fixed part of the read header; the clip values are not needed
        raw = handle.read(fixed_size)
        fields = struct.unpack(fixed_fmt, raw)
        read_header_length, name_length, seq_len = fields[:3]
        if read_header_length < 10 or read_header_length % 8 != 0:
            raise ValueError("Malformed read header, says length is %i:\n%s" \
                             % (read_header_length, repr(raw)))
        #Variable part of the read header: the name, then null padding
        name = _bytes_to_string(handle.read(name_length))
        gap = read_header_length - fixed_size - name_length
        if handle.read(gap).count(_null) != gap:
            raise ValueError("Post name %i byte padding region contained data" \
                             % gap)
        assert start + read_header_length == handle.tell()
        #Skip the flowgram values, flowgram index, bases and qualities
        body_size = flow_size + 3*seq_len
        handle.seek(body_size, 1)
        #...and check any padding up to the next eight byte boundary
        gap = -body_size % 8
        if gap:
            if handle.read(gap).count(_null) != gap:
                raise ValueError("Post quality %i byte padding region contained data" \
                                 % gap)
        yield name, start
    if handle.tell() % 8 != 0:
        raise ValueError("After scanning reads, did not end on a multiple of 8")

Here is the call graph for this function:

def Bio.SeqIO.SffIO._sff_file_header (   handle) [private]
Read in an SFF file header (PRIVATE).

Assumes the handle is at the start of the file, will read forwards
through the header and leave the handle pointing at the first record.
Returns a tuple of values from the header (header_length, index_offset,
index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

>>> handle = open("Roche/greek.sff", "rb")
>>> values = _sff_file_header(handle)
>>> print values[0]
840
>>> print values[1]
65040
>>> print values[2]
256
>>> print values[3]
24
>>> print values[4]
800
>>> values[-1]
'TCAG'

Definition at line 235 of file SffIO.py.

00235 
def _sff_file_header(handle):
    """Read in an SFF file header (PRIVATE).

    Assumes the handle is at the start of the file, will read forwards
    through the header and leave the handle pointing at the first record.
    Returns a tuple of values from the header (header_length, index_offset,
    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

    Raises ValueError if the handle's mode is unsuitable, if the file is
    empty or too short to hold the fixed 31 byte part of the header, if the
    magic number, version or flowgram format are not supported, if only one
    of the index offset and index length is set, or if the header padding
    is not all null bytes.

    >>> handle = open("Roche/greek.sff", "rb")
    >>> values = _sff_file_header(handle)
    >>> print values[0]
    840
    >>> print values[1]
    65040
    >>> print values[2]
    256
    >>> print values[3]
    24
    >>> print values[4]
    800
    >>> values[-1]
    'TCAG'

    """
    if hasattr(handle,"mode") and "U" in handle.mode.upper():
        raise ValueError("SFF files must NOT be opened in universal new "
                         "lines mode. Binary mode is recommended (although "
                         "on Unix the default mode is also fine).")
    elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
    and sys.platform == "win32":
        raise ValueError("SFF files must be opened in binary mode on Windows")
    #file header (part one)
    #use big endian encoding    >
    #magic_number               I
    #version                    4B
    #index_offset               Q
    #index_length               I
    #number_of_reads            I
    #header_length              H
    #key_length                 H
    #number_of_flows_per_read   H
    #flowgram_format_code       B
    #[rest of file header depends on the number of flows and how many keys]
    fmt = '>4s4BQIIHHHB'
    fmt_size = struct.calcsize(fmt)
    assert 31 == fmt_size
    data = handle.read(fmt_size)
    if not data:
        raise ValueError("Empty file.")
    elif len(data) < fmt_size:
        #Anything shorter than the full fixed part cannot be unpacked, so
        #report it here rather than letting struct.unpack fail below.
        raise ValueError("File too small to hold a valid SFF header.")
    magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
    number_of_reads, header_length, key_length, number_of_flows_per_read, \
    flowgram_format = struct.unpack(fmt, data)
    if magic_number in [_hsh, _srt, _mft]:
        #Probably user error, calling Bio.SeqIO.parse() twice!
        raise ValueError("Handle seems to be at SFF index block, not start")
    if magic_number != _sff: # 779314790
        raise ValueError("SFF file did not start '.sff', but %s" \
                         % repr(magic_number))
    if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
        raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
                         % (ver0, ver1, ver2, ver3))
    if flowgram_format != 1:
        raise ValueError("Flowgram format code %i not supported" \
                         % flowgram_format)
    #The offset and length must be both zero (no index) or both non-zero
    if (index_offset!=0) ^ (index_length!=0):
        raise ValueError("Index offset %i but index length %i" \
                         % (index_offset, index_length))
    flow_chars = _bytes_to_string(handle.read(number_of_flows_per_read))
    key_sequence = _bytes_to_string(handle.read(key_length))
    #According to the spec, the header_length field should be the total number
    #of bytes required by this set of header fields, and should be equal to
    #"31 + number_of_flows_per_read + key_length" rounded up to the next value
    #divisible by 8.
    assert header_length % 8 == 0
    padding = header_length - number_of_flows_per_read - key_length - 31
    assert 0 <= padding < 8, padding
    if handle.read(padding).count(_null) != padding:
        raise ValueError("Post header %i byte padding region contained data" \
                         % padding)
    return header_length, index_offset, index_length, \
           number_of_reads, number_of_flows_per_read, \
           flow_chars, key_sequence
00319 
#This is a generator function!

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._sff_find_roche_index (   handle) [private]
Locate any existing Roche style XML meta data and read index (PRIVATE).

Makes a number of hard coded assumptions based on reverse engineered SFF
files from Roche 454 machines.

Returns a tuple of read count, SFF "index" offset and size, XML offset
and size, and the actual read index offset and size.

Raises a ValueError for unsupported or non-Roche index blocks.

Definition at line 383 of file SffIO.py.

00383 
def _sff_find_roche_index(handle):
    """Locate any existing Roche style XML meta data and read index (PRIVATE).

    Makes a number of hard coded assumptions based on reverse engineered SFF
    files from Roche 454 machines.

    Rewinds the handle, reads the file header, then seeks to the index block
    and inspects its eight byte magic number and version.

    Returns a tuple of eight values: the read count, the file header length,
    the SFF "index" offset and size, the XML offset and size (both zero for
    a ".srt" index, which has no manifest), and the actual read index offset
    and size.

    Raises a ValueError if there is no index, if the index header is
    truncated, or for unsupported or non-Roche index blocks.
    """
    handle.seek(0)
    header_length, index_offset, index_length, number_of_reads, \
    number_of_flows_per_read, flow_chars, key_sequence \
        = _sff_file_header(handle)
    assert handle.tell() == header_length
    #Need both an offset and a length for there to be an index block
    if not index_offset or not index_length:
        raise ValueError("No index present in this SFF file")
    #Now jump to the header...
    handle.seek(index_offset)
    fmt = ">4s4B"
    fmt_size = struct.calcsize(fmt)
    data = handle.read(fmt_size)
    if not data:
        raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
                         % (index_length, index_offset))
    if len(data) < fmt_size:
        raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
                         % (index_length, index_offset, repr(data)))
    magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
    if magic_number == _mft: # 778921588
        #Roche 454 manifest index
        #This is typical from raw Roche 454 SFF files (2009), and includes
        #both an XML manifest and the sorted index.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            #This is "1.00" as a string
            raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
                             % (ver0, ver1, ver2, ver3))
        #Two unsigned integers give the XML size and the read index size
        fmt2 = ">LL"
        fmt2_size = struct.calcsize(fmt2)
        xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
        if index_length != fmt_size + fmt2_size + xml_size + data_size:
            raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
                             % (index_length, fmt_size, fmt2_size, xml_size, data_size))
        return number_of_reads, header_length, \
               index_offset, index_length, \
               index_offset + fmt_size + fmt2_size, xml_size, \
               index_offset + fmt_size + fmt2_size + xml_size, data_size
    elif magic_number == _srt: #779317876
        #Roche 454 sorted index
        #I've had this from Roche tool sfffile when the read identifiers
        #had nonstandard lengths and there was no XML manifest.
        if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
            #This is "1.00" as a string
            raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
                             % (ver0, ver1, ver2, ver3))
        data = handle.read(4)
        if data != _null*4:
            raise ValueError("Did not find expected null four bytes in .srt index")
        return number_of_reads, header_length, \
               index_offset, index_length, \
               0, 0, \
               index_offset + fmt_size + 4, index_length - fmt_size - 4
    elif magic_number == _hsh:
        raise ValueError("Hash table style indexes (.hsh) in SFF files are "
                         "not (yet) supported")
    else:
        raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
                         % (repr(magic_number), repr(data)))

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._sff_read_raw_record (   handle,
  number_of_flows_per_read 
) [private]
Extract the next read in the file as a raw (bytes) string (PRIVATE).

Definition at line 686 of file SffIO.py.

00686 
def _sff_read_raw_record(handle, number_of_flows_per_read):
    """Return the next read in the file as one raw (bytes) string (PRIVATE).

    The handle must be positioned at the start of a read record.  The read
    header, name, flowgram values, flowgram index, bases, qualities and all
    padding bytes are returned unparsed, in file order.

    Raises ValueError for a malformed read header length, or if either
    padding region holds anything other than null bytes.
    """
    lead_fmt = '>2HI'
    lead_size = struct.calcsize(lead_fmt)
    flow_size = struct.calcsize(">%iH" % number_of_flows_per_read)

    #Start with the header length, name length and sequence length
    raw = handle.read(lead_size)
    read_header_length, name_length, seq_len = struct.unpack(lead_fmt, raw)
    if read_header_length < 10 or read_header_length % 8 != 0:
        raise ValueError("Malformed read header, says length is %i" \
                         % read_header_length)
    #Next the four clip values (4H = 8 bytes) together with the read name
    raw += handle.read(8 + name_length)
    #The rest of the read header should be null padding
    gap = read_header_length - lead_size - 8 - name_length
    filler = handle.read(gap)
    if filler.count(_null) != gap:
        raise ValueError("Post name %i byte padding region contained data" \
                         % gap)
    raw += filler
    #Then the flowgram values, flowgram index, bases and qualities
    body_size = flow_size + seq_len*3
    raw += handle.read(body_size)
    #Finally any null padding up to the next eight byte boundary
    gap = -body_size % 8
    if gap:
        filler = handle.read(gap)
        if filler.count(_null) != gap:
            raise ValueError("Post quality %i byte padding region contained data" \
                             % gap)
        raw += filler
    return raw

def Bio.SeqIO.SffIO._sff_read_roche_index (   handle) [private]
Reads any existing Roche style read index provided in the SFF file (PRIVATE).

Will use the handle seek/tell functions.

This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.

Roche SFF indices use base 255 not 256, meaning we see bytes in the
range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
can be used as a marker character to separate entries (required if the
read name lengths vary).

Note that since only four bytes are used for the read offset, this is
limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
tool to combine SFF files beyond this limit, it issues a warning and
omits the index (and manifest).

Definition at line 497 of file SffIO.py.

00497 
def _sff_read_roche_index(handle):
    """Read any Roche style read index stored in the SFF file (PRIVATE).

    Will use the handle seek/tell functions.

    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks,
    which are located using _sff_find_roche_index.

    Each index entry is the read name, five bytes, then the marker byte
    0xFF.  The first of the five bytes must be zero; the remaining four are
    the read offset as base 255 digits, most significant first.  Using base
    255 keeps every digit in the range 0 to 254, leaving 0xFF free to mark
    the end of an entry (needed when read name lengths vary).

    With only four digits the offset is limited to 255^4 bytes (nearly
    4GB).

    This is a generator yielding (read name, offset) tuples.  ValueError is
    raised if the file ends early, if the first of the five bytes is not
    zero, or if the entries do not fill the index exactly.
    """
    (number_of_reads, header_length, index_offset, index_length,
     xml_offset, xml_size, read_index_offset, read_index_size) \
        = _sff_find_roche_index(handle)
    #Now parse the read index...
    handle.seek(read_index_offset)
    digits_fmt = ">5B"
    for read in range(number_of_reads):
        #TODO - Be more aware of when the index should end?
        #Every entry holds at least six bytes, then extend it one byte at
        #a time until the marker byte is reached
        entry = handle.read(6)
        terminated = False
        while not terminated:
            byte = handle.read(1)
            if not byte:
                raise ValueError("Premature end of file!")
            entry += byte
            terminated = (byte == _flag)
        assert entry[-1:] == _flag, entry[-1:]
        name = _bytes_to_string(entry[:-6])
        terminator, d3, d2, d1, d0 = struct.unpack(digits_fmt, entry[-6:-1])
        #Base 255 digits, i.e. d0 + 255*d1 + 65025*d2 + 16581375*d3
        offset = ((d3*255 + d2)*255 + d1)*255 + d0
        if terminator:
            #Could in theory be used as a fifth digit of the offset (worth
            #255**4 = 4228250625), but testing the Roche tools this is not
            #the case. They simply don't support such large indexes.
            raise ValueError("Expected a null terminator to the read name.")
        yield name, offset
    end = read_index_offset + read_index_size
    if handle.tell() != end:
        raise ValueError("Problem with index length? %i vs %i" \
                         % (handle.tell(), end))

Here is the call graph for this function:

def Bio.SeqIO.SffIO._sff_read_roche_index_xml (   handle) [private]
Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE, DEPRECATED).

Will use the handle seek/tell functions. Returns a string.

This has been replaced by ReadRocheXmlManifest. We would normally just
delete an old private function without warning, but I believe some people
are using this so we'll handle this with a deprecation warning.

Definition at line 453 of file SffIO.py.

00453 
def _sff_read_roche_index_xml(handle):
    """Return the Roche XML manifest from the SFF "index" (PRIVATE, DEPRECATED).

    Thin wrapper which issues a DeprecationWarning and then hands the
    handle to ReadRocheXmlManifest, returning whatever that returns.

    Old private functions are normally deleted without notice, but this
    one is believed to be in use, hence the deprecation warning instead.
    """
    import warnings
    message = ("Private function _sff_read_roche_index_xml is deprecated. "
               "Use new public function ReadRocheXmlManifest instead")
    warnings.warn(message, DeprecationWarning)
    return ReadRocheXmlManifest(handle)
00468 

Here is the call graph for this function:

def Bio.SeqIO.SffIO._sff_read_seq_record (   handle,
  number_of_flows_per_read,
  flow_chars,
  key_sequence,
  alphabet,
  trim = False 
) [private]
Parse the next read in the file, return data as a SeqRecord (PRIVATE).

Definition at line 544 of file SffIO.py.

00544 
00545                          key_sequence, alphabet, trim=False):
00546     """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
00547     #Now on to the reads...
00548     #the read header format (fixed part):
00549     #read_header_length     H
00550     #name_length            H
00551     #seq_len                I
00552     #clip_qual_left         H
00553     #clip_qual_right        H
00554     #clip_adapter_left      H
00555     #clip_adapter_right     H
00556     #[rest of read header depends on the name length etc]
00557     read_header_fmt = '>2HI4H'
00558     read_header_size = struct.calcsize(read_header_fmt)
00559     read_flow_fmt = ">%iH" % number_of_flows_per_read
00560     read_flow_size = struct.calcsize(read_flow_fmt)
00561 
00562     read_header_length, name_length, seq_len, clip_qual_left, \
00563     clip_qual_right, clip_adapter_left, clip_adapter_right \
00564         = struct.unpack(read_header_fmt, handle.read(read_header_size))
00565     if clip_qual_left:
00566         clip_qual_left -= 1 #python counting
00567     if clip_adapter_left:
00568         clip_adapter_left -= 1 #python counting
00569     if read_header_length < 10 or read_header_length % 8 != 0:
00570         raise ValueError("Malformed read header, says length is %i" \
00571                          % read_header_length)
00572     #now the name and any padding (remainder of header)
00573     name = _bytes_to_string(handle.read(name_length))
00574     padding = read_header_length - read_header_size - name_length
00575     if handle.read(padding).count(_null) != padding:
00576         raise ValueError("Post name %i byte padding region contained data" \
00577                          % padding)
00578     #now the flowgram values, flowgram index, bases and qualities
00579     #NOTE - assuming flowgram_format==1, which means struct type H
00580     flow_values = handle.read(read_flow_size) #unpack later if needed
00581     temp_fmt = ">%iB" % seq_len # used for flow index and quals
00582     flow_index = handle.read(seq_len) #unpack later if needed
00583     seq = _bytes_to_string(handle.read(seq_len)) #TODO - Use bytes in Seq?
00584     quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
00585     #now any padding...
00586     padding = (read_flow_size + seq_len*3)%8
00587     if padding:
00588         padding = 8 - padding
00589         if handle.read(padding).count(_null) != padding:
00590             raise ValueError("Post quality %i byte padding region contained data" \
00591                              % padding)
00592     #Follow Roche and apply most aggressive of qual and adapter clipping.
00593     #Note Roche seems to ignore adapter clip fields when writing SFF,
00594     #and uses just the quality clipping values for any clipping.
00595     clip_left = max(clip_qual_left, clip_adapter_left)
00596     #Right clipping of zero means no clipping
00597     if clip_qual_right:
00598         if clip_adapter_right:
00599             clip_right = min(clip_qual_right, clip_adapter_right)
00600         else:
00601             #Typical case with Roche SFF files
00602             clip_right = clip_qual_right
00603     elif clip_adapter_right:
00604         clip_right = clip_adapter_right
00605     else:
00606         clip_right = seq_len
00607     #Now build a SeqRecord
00608     if trim:
00609         seq = seq[clip_left:clip_right].upper()
00610         quals = quals[clip_left:clip_right]
00611         #Don't record the clipping values, flow etc, they make no sense now:
00612         annotations = {}
00613     else:
00614         #This use of mixed case mimics the Roche SFF tool's FASTA output
00615         seq = seq[:clip_left].lower() + \
00616               seq[clip_left:clip_right].upper() + \
00617               seq[clip_right:].lower()
00618         annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
00619                        "flow_index":struct.unpack(temp_fmt, flow_index),
00620                        "flow_chars":flow_chars,
00621                        "flow_key":key_sequence,
00622                        "clip_qual_left":clip_qual_left,
00623                        "clip_qual_right":clip_qual_right,
00624                        "clip_adapter_left":clip_adapter_left,
00625                        "clip_adapter_right":clip_adapter_right}
00626     if re.match(_valid_UAN_read_name, name):
00627         annotations["time"] = _get_read_time(name)
00628         annotations["region"] = _get_read_region(name)
00629         annotations["coords"] = _get_read_xy(name)
00630     record = SeqRecord(Seq(seq, alphabet),
00631                        id=name,
00632                        name=name,
00633                        description="",
00634                        annotations=annotations)
00635     #Dirty trick to speed up this line:
00636     #record.letter_annotations["phred_quality"] = quals
00637     dict.__setitem__(record._per_letter_annotations,
00638                      "phred_quality", quals)
00639     #Return the record and then continue...
00640     return record

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.SffIO._SffTrimIterator (   handle,
  alphabet = Alphabet.generic_dna 
) [private]
Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE).

Definition at line 882 of file SffIO.py.

00882 
def _SffTrimIterator(handle, alphabet=Alphabet.generic_dna):
    """Iterate over trimmed SFF reads as SeqRecord objects (PRIVATE).

    Convenience wrapper which calls SffIterator with trim=True.
    """
    trimmed_reads = SffIterator(handle, alphabet, trim=True)
    return trimmed_reads
00886 

Here is the call graph for this function:

def Bio.SeqIO.SffIO._string_as_base_36 (   string) [private]
Interpret a string as a base-36 number as per 454 manual.

Definition at line 642 of file SffIO.py.

00642 
def _string_as_base_36(string):
    """Interpret a string as a base-36 number as per 454 manual.

    Letters (either case) are worth 0 to 25 and the digits '0' to '9' are
    worth 26 to 35.  Any other character counts as zero.  The right-most
    character is the least significant, and only as many characters as
    there are entries in _powers_of_36 contribute to the result.
    """
    total = 0
    for char, weight in zip(reversed(string), _powers_of_36):
        code = ord(char)
        # For reference: ord('A') = 65, ord('Z') = 90
        # For reference: ord('a') = 97, ord('z') = 122
        # For reference: ord('0') = 48, ord('9') = 57
        if 65 <= code <= 90:
            digit = code - 65
        elif 97 <= code <= 122:
            digit = code - 97
        elif 48 <= code <= 57:
            digit = code - 48 + 26
        else:
            # Invalid character
            digit = 0
        total += digit * weight
    return total

Here is the caller graph for this function:

def Bio.SeqIO.SffIO.ReadRocheXmlManifest (   handle)
Reads any Roche style XML manifest data in the SFF "index".

The SFF file format allows for multiple different index blocks, and Roche
took advantage of this to define their own index block which also embeds
an XML manifest string. This is not a publicly documented extension to
the SFF file format; it was reverse engineered.

The handle should be to an SFF file opened in binary mode. This function
will use the handle seek/tell functions and leave the handle in an
arbitrary location.

Any XML manifest found is returned as a Python string, which you can then
parse as appropriate, or reuse when writing out SFF files with the
SffWriter class.

Returns a string, or raises a ValueError if a Roche manifest could not be
found.

Definition at line 469 of file SffIO.py.

00469 
def ReadRocheXmlManifest(handle):
    """Reads any Roche style XML manifest data in the SFF "index".

    The SFF file format allows for multiple different index blocks, and
    Roche took advantage of this to define their own index block which
    also embeds an XML manifest string. This is not a publicly documented
    extension to the SFF file format; it was reverse engineered.

    The handle should be to an SFF file opened in binary mode. This function
    will use the handle seek/tell functions and leave the handle in an
    arbitrary location.

    Any XML manifest found is returned as a Python string, which you can then
    parse as appropriate, or reuse when writing out SFF files with the
    SffWriter class.

    Returns a string, or raises a ValueError if a Roche manifest could not
    be found.
    """
    (number_of_reads, header_length, index_offset, index_length,
     xml_offset, xml_size, read_index_offset, read_index_size) \
        = _sff_find_roche_index(handle)
    #Both values are zero when the index block carries no manifest
    if not (xml_offset and xml_size):
        raise ValueError("No XML manifest found")
    handle.seek(xml_offset)
    manifest = handle.read(xml_size)
    return _bytes_to_string(manifest)
00495 
00496 
#This is a generator function!

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqIO.SffIO.SffIterator (   handle,
  alphabet = Alphabet.generic_dna,
  trim = False 
)
Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

handle - input file, an SFF file, e.g. from Roche 454 sequencing.
         This must NOT be opened in universal read lines mode!
alphabet - optional alphabet, defaults to generic DNA.
trim - should the sequences be trimmed?

The resulting SeqRecord objects should match those from a paired FASTA
and QUAL file converted from the SFF file using the Roche 454 tool
sffinfo. i.e. The sequence will be mixed case, with the trim regions
shown in lower case.

This function is used internally via the Bio.SeqIO functions:

>>> from Bio import SeqIO
>>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
>>> for record in SeqIO.parse(handle, "sff"):
...     print record.id, len(record)
E3MFGYR02JWQ7T 265
E3MFGYR02JA6IL 271
E3MFGYR02JHD4H 310
E3MFGYR02GFKUC 299
E3MFGYR02FTGED 281
E3MFGYR02FR9G7 261
E3MFGYR02GAZMS 278
E3MFGYR02HHZ8O 221
E3MFGYR02GPGB1 269
E3MFGYR02F7Z7G 219
>>> handle.close()

You can also call it directly:

>>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
>>> for record in SffIterator(handle):
...     print record.id, len(record)
E3MFGYR02JWQ7T 265
E3MFGYR02JA6IL 271
E3MFGYR02JHD4H 310
E3MFGYR02GFKUC 299
E3MFGYR02FTGED 281
E3MFGYR02FR9G7 261
E3MFGYR02GAZMS 278
E3MFGYR02HHZ8O 221
E3MFGYR02GPGB1 269
E3MFGYR02F7Z7G 219
>>> handle.close()

Or, with the trim option:

>>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
>>> for record in SffIterator(handle, trim=True):
...     print record.id, len(record)
E3MFGYR02JWQ7T 260
E3MFGYR02JA6IL 265
E3MFGYR02JHD4H 292
E3MFGYR02GFKUC 295
E3MFGYR02FTGED 277
E3MFGYR02FR9G7 256
E3MFGYR02GAZMS 271
E3MFGYR02HHZ8O 150
E3MFGYR02GPGB1 221
E3MFGYR02F7Z7G 130
>>> handle.close()

Definition at line 751 of file SffIO.py.

00751 
00752 def SffIterator(handle, alphabet=Alphabet.generic_dna, trim=False):
00753     """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
00754 
00755     handle - input file, an SFF file, e.g. from Roche 454 sequencing.
00756              This must NOT be opened in universal read lines mode!
00757     alphabet - optional alphabet, defaults to generic DNA.
00758     trim - should the sequences be trimmed?
00759 
00760     The resulting SeqRecord objects should match those from a paired FASTA
00761     and QUAL file converted from the SFF file using the Roche 454 tool
00762     sffinfo. i.e. The sequence will be mixed case, with the trim regions
00763     shown in lower case.
00764 
00765     This function is used internally via the Bio.SeqIO functions:
00766 
00767     >>> from Bio import SeqIO
00768     >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
00769     >>> for record in SeqIO.parse(handle, "sff"):
00770     ...     print record.id, len(record)
00771     E3MFGYR02JWQ7T 265
00772     E3MFGYR02JA6IL 271
00773     E3MFGYR02JHD4H 310
00774     E3MFGYR02GFKUC 299
00775     E3MFGYR02FTGED 281
00776     E3MFGYR02FR9G7 261
00777     E3MFGYR02GAZMS 278
00778     E3MFGYR02HHZ8O 221
00779     E3MFGYR02GPGB1 269
00780     E3MFGYR02F7Z7G 219
00781     >>> handle.close()
00782 
00783     You can also call it directly:
00784 
00785     >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
00786     >>> for record in SffIterator(handle):
00787     ...     print record.id, len(record)
00788     E3MFGYR02JWQ7T 265
00789     E3MFGYR02JA6IL 271
00790     E3MFGYR02JHD4H 310
00791     E3MFGYR02GFKUC 299
00792     E3MFGYR02FTGED 281
00793     E3MFGYR02FR9G7 261
00794     E3MFGYR02GAZMS 278
00795     E3MFGYR02HHZ8O 221
00796     E3MFGYR02GPGB1 269
00797     E3MFGYR02F7Z7G 219
00798     >>> handle.close()
00799 
00800     Or, with the trim option:
00801 
00802     >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
00803     >>> for record in SffIterator(handle, trim=True):
00804     ...     print record.id, len(record)
00805     E3MFGYR02JWQ7T 260
00806     E3MFGYR02JA6IL 265
00807     E3MFGYR02JHD4H 292
00808     E3MFGYR02GFKUC 295
00809     E3MFGYR02FTGED 277
00810     E3MFGYR02FR9G7 256
00811     E3MFGYR02GAZMS 271
00812     E3MFGYR02HHZ8O 150
00813     E3MFGYR02GPGB1 221
00814     E3MFGYR02F7Z7G 130
00815     >>> handle.close()
00816 
00817     """
00818     if isinstance(Alphabet._get_base_alphabet(alphabet),
00819                   Alphabet.ProteinAlphabet):
00820         raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
00821     if isinstance(Alphabet._get_base_alphabet(alphabet),
00822                   Alphabet.RNAAlphabet):
00823         raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
00824     try:
00825         assert 0 == handle.tell()
00826     except AttributeError:
00827         #Probably a network handle or something like that
00828         handle = _AddTellHandle(handle)
00829     header_length, index_offset, index_length, number_of_reads, \
00830     number_of_flows_per_read, flow_chars, key_sequence \
00831         = _sff_file_header(handle)
00832     #Now on to the reads...
00833     #the read header format (fixed part):
00834     #read_header_length     H
00835     #name_length            H
00836     #seq_len                I
00837     #clip_qual_left         H
00838     #clip_qual_right        H
00839     #clip_adapter_left      H
00840     #clip_adapter_right     H
00841     #[rest of read header depends on the name length etc]
00842     read_header_fmt = '>2HI4H'
00843     read_header_size = struct.calcsize(read_header_fmt)
00844     read_flow_fmt = ">%iH" % number_of_flows_per_read
00845     read_flow_size = struct.calcsize(read_flow_fmt)
00846     assert 1 == struct.calcsize(">B")
00847     assert 1 == struct.calcsize(">s")
00848     assert 1 == struct.calcsize(">c")
00849     assert read_header_size % 8 == 0 #Important for padding calc later!
00850     #The spec allows for the index block to be before or even in the middle
00851     #of the reads. We can check that if we keep track of our position
00852     #in the file...
00853     for read in range(number_of_reads):
00854         if index_offset and handle.tell() == index_offset:
00855             offset = index_offset + index_length
00856             if offset % 8:
00857                 offset += 8 - (offset % 8)
00858             assert offset % 8 == 0
00859             handle.seek(offset)
00860             #Now that we've done this, we don't need to do it again. Clear
00861             #the index_offset so we can skip extra handle.tell() calls:
00862             index_offset = 0
00863         yield _sff_read_seq_record(handle,
00864                                    number_of_flows_per_read,
00865                                    flow_chars,
00866                                    key_sequence,
00867                                    alphabet,
00868                                    trim)
00869     #The following is not essential, but avoids confusing error messages
00870     #for the user if they try and re-parse the same handle.
00871     if index_offset and handle.tell() == index_offset:
00872         offset = index_offset + index_length
00873         if offset % 8:
00874             offset += 8 - (offset % 8)
00875         assert offset % 8 == 0
00876         handle.seek(offset)
00877     #Should now be at the end of the file...
00878     if handle.read(1):
00879         raise ValueError("Additional data at end of SFF file")
00880 
00881 
#This is a generator function!

Here is the call graph for this function:

Here is the caller graph for this function:


Variable Documentation

tuple Bio.SeqIO.SffIO._flag = _as_bytes("\xff")

Definition at line 233 of file SffIO.py.

tuple Bio.SeqIO.SffIO._hsh = _as_bytes(".hsh")

Definition at line 230 of file SffIO.py.

tuple Bio.SeqIO.SffIO._mft = _as_bytes(".mft")

Definition at line 232 of file SffIO.py.

tuple Bio.SeqIO.SffIO._null = _as_bytes("\0")

Definition at line 228 of file SffIO.py.

list Bio.SeqIO.SffIO._powers_of_36 = [36**i for i in range(6)]

Definition at line 641 of file SffIO.py.

tuple Bio.SeqIO.SffIO._sff = _as_bytes(".sff")

Definition at line 229 of file SffIO.py.

tuple Bio.SeqIO.SffIO._srt = _as_bytes(".srt")

Definition at line 231 of file SffIO.py.

list Bio.SeqIO.SffIO._time_denominators

Initial value:
00001 [13 * 32 * 24 * 60 * 60,
00002                       32 * 24 * 60 * 60,
00003                       24 * 60 * 60,
00004                       60 * 60,
00005                       60]

Definition at line 666 of file SffIO.py.

tuple Bio.SeqIO.SffIO._valid_UAN_read_name = re.compile(r'^[a-zA-Z0-9]{14}$')

Definition at line 542 of file SffIO.py.

Bio.SeqIO.SffIO.BytesIO = StringIO

Definition at line 1184 of file SffIO.py.

tuple Bio.SeqIO.SffIO.data = handle.getvalue()

Definition at line 1264 of file SffIO.py.

tuple Bio.SeqIO.SffIO.fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))

Definition at line 1235 of file SffIO.py.

tuple Bio.SeqIO.SffIO.fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))

Definition at line 1240 of file SffIO.py.

string Bio.SeqIO.SffIO.filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"

Definition at line 1173 of file SffIO.py.

tuple Bio.SeqIO.SffIO.handle = StringIO()

Definition at line 1261 of file SffIO.py.

tuple Bio.SeqIO.SffIO.index1 = sorted(_sff_read_roche_index(open(filename, "rb")))

Definition at line 1175 of file SffIO.py.

tuple Bio.SeqIO.SffIO.index2 = sorted(_sff_do_slow_index(open(filename, "rb")))

Definition at line 1176 of file SffIO.py.

tuple Bio.SeqIO.SffIO.metadata = ReadRocheXmlManifest(open(filename, "rb"))

Definition at line 1174 of file SffIO.py.

tuple Bio.SeqIO.SffIO.original = open(filename,"rb")

Definition at line 1272 of file SffIO.py.

tuple Bio.SeqIO.SffIO.qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))

Definition at line 1237 of file SffIO.py.

tuple Bio.SeqIO.SffIO.qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))

Definition at line 1242 of file SffIO.py.

tuple Bio.SeqIO.SffIO.sff = list(SffIterator(open(filename, "rb")))

Definition at line 1197 of file SffIO.py.

tuple Bio.SeqIO.SffIO.sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))

Definition at line 1199 of file SffIO.py.

tuple Bio.SeqIO.SffIO.sff_trim = list(SffIterator(open(filename, "rb"), trim=True))

Definition at line 1229 of file SffIO.py.

tuple Bio.SeqIO.SffIO.w = SffWriter(handle, xml=metadata)

Definition at line 1262 of file SffIO.py.