Back to index

python-biopython  1.60
Public Member Functions | Public Attributes | Private Member Functions | Private Attributes
Bio.SeqIO.SffIO.SffWriter Class Reference
Inheritance diagram for Bio.SeqIO.SffIO.SffWriter:
Inheritance graph
[legend]
Collaboration diagram for Bio.SeqIO.SffIO.SffWriter:
Collaboration graph
[legend]

List of all members.

Public Member Functions

def __init__
def write_file
def write_header
def write_record
def clean

Public Attributes

 handle

Private Member Functions

def _write_index

Private Attributes

 _xml
 _index
 _number_of_reads
 _index_start
 _index_length
 _key_sequence
 _flow_chars
 _number_of_flows_per_read

Detailed Description

SFF file writer.

Definition at line 887 of file SffIO.py.


Constructor & Destructor Documentation

def Bio.SeqIO.SffIO.SffWriter.__init__ (   self,
  handle,
  index = True,
  xml = None 
)
Creates the writer object.

handle - Output handle, ideally in binary write mode.
index - Boolean argument, should we try and write an index?
xml - Optional string argument, xml manifest to be recorded in the index
      block (see function ReadRocheXmlManifest for reading this data).

Definition at line 890 of file SffIO.py.

00890 
00891     def __init__(self, handle, index=True, xml=None):
00892         """Creates the writer object.
00893 
00894         handle - Output handle, ideally in binary write mode.
00895         index - Boolean argument, should we try and write an index?
00896         xml - Optional string argument, xml manifest to be recorded in the index
00897               block (see function ReadRocheXmlManifest for reading this data).
00898         """
00899         if hasattr(handle,"mode") and "U" in handle.mode.upper():
00900             raise ValueError("SFF files must NOT be opened in universal new "
00901                              "lines mode. Binary mode is required")
00902         elif hasattr(handle,"mode") and "B" not in handle.mode.upper():
00903             raise ValueError("SFF files must be opened in binary mode")
00904         self.handle = handle
00905         self._xml = xml
00906         if index:
00907             self._index = []
00908         else:
00909             self._index = None


Member Function Documentation

def Bio.SeqIO.SffIO.SffWriter._write_index (   self) [private]

Definition at line 968 of file SffIO.py.

00968 
00969     def _write_index(self):
00970         assert len(self._index)==self._number_of_reads
00971         handle = self.handle
00972         self._index.sort()
00973         self._index_start = handle.tell() #need for header
00974         #XML...
00975         if self._xml is not None:
00976             xml = _as_bytes(self._xml)
00977         else:
00978             from Bio import __version__
00979             xml = "<!-- This file was output with Biopython %s -->\n" % __version__
00980             xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
00981             xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
00982             xml = _as_bytes(xml)
00983         xml_len = len(xml)
00984         #Write to the file...
00985         fmt = ">I4BLL"
00986         fmt_size = struct.calcsize(fmt)
00987         handle.write(_null*fmt_size + xml) #will come back later to fill this
00988         fmt2 = ">6B"
00989         assert 6 == struct.calcsize(fmt2)
00990         self._index.sort()
00991         index_len = 0 #don't know yet!
00992         for name, offset in self._index:
00993             #Roche files record the offsets using base 255 not 256.
00994             #See comments for parsing the index block. There may be a faster
00995             #way to code this, but we can't easily use shifts due to odd base
00996             off3 = offset
00997             off0 = off3 % 255
00998             off3 -= off0
00999             off1 = off3 % 65025
01000             off3 -= off1
01001             off2 = off3 % 16581375
01002             off3 -= off2
01003             assert offset == off0 + off1 + off2 + off3, \
01004                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
01005             off3, off2, off1, off0 = off3//16581375, off2//65025, \
01006                                      off1//255, off0
01007             assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
01008                    "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
01009             handle.write(name + struct.pack(fmt2, 0, \
01010                                             off3, off2, off1, off0, 255))
01011             index_len += len(name) + 6
01012         #Note any padding in not included:
01013         self._index_length = fmt_size + xml_len + index_len #need for header
01014         #Pad out to an 8 byte boundary (although I have noticed some
01015         #real Roche SFF files neglect to do this depsite their manual
01016         #suggesting this padding should be there):
01017         if self._index_length % 8:
01018             padding = 8 - (self._index_length%8)
01019             handle.write(_null*padding)
01020         else:
01021             padding = 0
01022         offset = handle.tell()
01023         assert offset == self._index_start + self._index_length + padding, \
01024                "%i vs %i + %i + %i"  % (offset, self._index_start, \
01025                                         self._index_length, padding)
01026         #Must now go back and update the index header with index size...
01027         handle.seek(self._index_start)
01028         handle.write(struct.pack(fmt, 778921588, #magic number
01029                                  49,46,48,48, #Roche index version, "1.00"
01030                                  xml_len, index_len) + xml)
01031         #Must now go back and update the header...
01032         handle.seek(0)
01033         self.write_header()
01034         handle.seek(offset) #not essential?

Here is the call graph for this function:

def Bio.SeqIO.Interfaces.SequenceWriter.clean (   self,
  text 
) [inherited]
Use this to avoid getting newlines in the output.

Definition at line 166 of file Interfaces.py.

00166 
00167     def clean(self, text):
00168         """Use this to avoid getting newlines in the output."""
00169         return text.replace("\n", " ").replace("\r", " ").replace("  ", " ")
    

Here is the caller graph for this function:

def Bio.SeqIO.SffIO.SffWriter.write_file (   self,
  records 
)
Use this to write an entire file containing the given records.

Reimplemented from Bio.SeqIO.Interfaces.SequenceWriter.

Definition at line 910 of file SffIO.py.

00910 
00911     def write_file(self, records):
00912         """Use this to write an entire file containing the given records."""
00913         try:
00914             self._number_of_reads = len(records)
00915         except TypeError:
00916             self._number_of_reads = 0 #dummy value
00917             if not hasattr(self.handle, "seek") \
00918             or not hasattr(self.handle, "tell"):
00919                 raise ValueError("A handle with a seek/tell methods is "
00920                                  "required in order to record the total "
00921                                  "record count in the file header (once it "
00922                                  "is known at the end).")
00923         if self._index is not None and \
00924         not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
00925             import warnings
00926             warnings.warn("A handle with a seek/tell methods is required in "
00927                           "order to record an SFF index.")
00928             self._index = None
00929         self._index_start = 0
00930         self._index_length = 0
00931         if not hasattr(records, "next"):
00932             records = iter(records)
00933         #Get the first record in order to find the flow information
00934         #we will need for the header.
00935         try:
00936             record = records.next()
00937         except StopIteration:
00938             record = None
00939         if record is None:
00940             #No records -> empty SFF file (or an error)?
00941             #We can't write a header without the flow information.
00942             #return 0
00943             raise ValueError("Must have at least one sequence")
00944         try:
00945             self._key_sequence = _as_bytes(record.annotations["flow_key"])
00946             self._flow_chars = _as_bytes(record.annotations["flow_chars"])
00947             self._number_of_flows_per_read = len(self._flow_chars)
00948         except KeyError:
00949             raise ValueError("Missing SFF flow information")
00950         self.write_header()
00951         self.write_record(record)
00952         count = 1
00953         for record in records:
00954             self.write_record(record)
00955             count += 1
00956         if self._number_of_reads == 0:
00957             #Must go back and record the record count...
00958             offset = self.handle.tell()
00959             self.handle.seek(0)
00960             self._number_of_reads = count
00961             self.write_header()
00962             self.handle.seek(offset) #not essential?
00963         else:
00964             assert count == self._number_of_reads
00965         if self._index is not None:
00966             self._write_index()
00967         return count

def Bio.SeqIO.SffIO.SffWriter.write_header (   self)

Definition at line 1035 of file SffIO.py.

01035 
01036     def write_header(self):
01037         #Do header...
01038         key_length = len(self._key_sequence)
01039         #file header (part one)
01040         #use big endiean encdoing   >
01041         #magic_number               I
01042         #version                    4B
01043         #index_offset               Q
01044         #index_length               I
01045         #number_of_reads            I
01046         #header_length              H
01047         #key_length                 H
01048         #number_of_flows_per_read   H
01049         #flowgram_format_code       B
01050         #[rest of file header depends on the number of flows and how many keys]
01051         fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
01052         #According to the spec, the header_length field should be the total
01053         #number of bytes required by this set of header fields, and should be
01054         #equal to "31 + number_of_flows_per_read + key_length" rounded up to
01055         #the next value divisible by 8.
01056         if struct.calcsize(fmt) % 8 == 0:
01057             padding = 0
01058         else:
01059             padding = 8 - (struct.calcsize(fmt) % 8)
01060         header_length = struct.calcsize(fmt) + padding
01061         assert header_length % 8 == 0
01062         header = struct.pack(fmt, 779314790, #magic number 0x2E736666
01063                              0, 0, 0, 1, #version
01064                              self._index_start, self._index_length,
01065                              self._number_of_reads,
01066                              header_length, key_length,
01067                              self._number_of_flows_per_read,
01068                              1, #the only flowgram format code we support
01069                              self._flow_chars, self._key_sequence)
01070         self.handle.write(header + _null*padding)

Here is the caller graph for this function:

def Bio.SeqIO.SffIO.SffWriter.write_record (   self,
  record 
)
Write a single additional record to the output file.

This assumes the header has been done.

Definition at line 1071 of file SffIO.py.

01071 
01072     def write_record(self, record):
01073         """Write a single additional record to the output file.
01074 
01075         This assumes the header has been done.
01076         """
01077         #Basics
01078         name = _as_bytes(record.id)
01079         name_len = len(name)
01080         seq = _as_bytes(str(record.seq).upper())
01081         seq_len = len(seq)
01082         #Qualities
01083         try:
01084             quals = record.letter_annotations["phred_quality"]
01085         except KeyError:
01086             raise ValueError("Missing PHRED qualities information")
01087         #Flow
01088         try:
01089             flow_values = record.annotations["flow_values"]
01090             flow_index = record.annotations["flow_index"]
01091             if self._key_sequence != _as_bytes(record.annotations["flow_key"]) \
01092             or self._flow_chars != _as_bytes(record.annotations["flow_chars"]):
01093                 raise ValueError("Records have inconsistent SFF flow data")
01094         except KeyError:
01095             raise ValueError("Missing SFF flow information")
01096         except AttributeError:
01097             raise ValueError("Header not written yet?")
01098         #Clipping
01099         try:
01100             clip_qual_left = record.annotations["clip_qual_left"]
01101             if clip_qual_left:
01102                 clip_qual_left += 1
01103             clip_qual_right = record.annotations["clip_qual_right"]
01104             clip_adapter_left = record.annotations["clip_adapter_left"]
01105             if clip_adapter_left:
01106                 clip_adapter_left += 1
01107             clip_adapter_right = record.annotations["clip_adapter_right"]
01108         except KeyError:
01109             raise ValueError("Missing SFF clipping information")
01110 
01111         #Capture information for index
01112         if self._index is not None:
01113             offset = self.handle.tell()
01114             #Check the position of the final record (before sort by name)
01115             #Using a four-digit base 255 number, so the upper bound is
01116             #254*(1)+254*(255)+254*(255**2)+254*(255**3) = 4228250624
01117             #or equivalently it overflows at 255**4 = 4228250625
01118             if offset > 4228250624:
01119                 import warnings
01120                 warnings.warn("Read %s has file offset %i, which is too large "
01121                               "to store in the Roche SFF index structure. No "
01122                               "index block will be recorded." % (name, offset))
01123                 #No point recoring the offsets now
01124                 self._index = None
01125             else:
01126                 self._index.append((name, self.handle.tell()))
01127 
01128         #the read header format (fixed part):
01129         #read_header_length     H
01130         #name_length            H
01131         #seq_len                I
01132         #clip_qual_left         H
01133         #clip_qual_right        H
01134         #clip_adapter_left      H
01135         #clip_adapter_right     H
01136         #[rest of read header depends on the name length etc]
01137         #name
01138         #flow values
01139         #flow index
01140         #sequence
01141         #padding
01142         read_header_fmt = '>2HI4H%is' % name_len
01143         if struct.calcsize(read_header_fmt) % 8 == 0:
01144             padding = 0
01145         else:
01146             padding = 8 - (struct.calcsize(read_header_fmt) % 8)
01147         read_header_length = struct.calcsize(read_header_fmt) + padding
01148         assert read_header_length % 8 == 0
01149         data = struct.pack(read_header_fmt,
01150                            read_header_length,
01151                            name_len, seq_len,
01152                            clip_qual_left, clip_qual_right,
01153                            clip_adapter_left, clip_adapter_right,
01154                            name) + _null*padding
01155         assert len(data) == read_header_length
01156         #now the flowgram values, flowgram index, bases and qualities
01157         #NOTE - assuming flowgram_format==1, which means struct type H
01158         read_flow_fmt = ">%iH" % self._number_of_flows_per_read
01159         read_flow_size = struct.calcsize(read_flow_fmt)
01160         temp_fmt = ">%iB" % seq_len # used for flow index and quals
01161         data += struct.pack(read_flow_fmt, *flow_values) \
01162                 + struct.pack(temp_fmt, *flow_index) \
01163                 + seq \
01164                 + struct.pack(temp_fmt, *quals)
01165         #now any final padding...
01166         padding = (read_flow_size + seq_len*3)%8
01167         if padding:
01168             padding = 8 - padding
01169         self.handle.write(data + _null*padding)
01170 

Here is the call graph for this function:


Member Data Documentation

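Bio.SeqIO.SffIO.SffWriter._flow_chars [private]

Definition at line 945 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._index [private]

Definition at line 906 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._index_length [private]

Definition at line 929 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._index_start [private]

Definition at line 928 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._key_sequence [private]

Definition at line 944 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._number_of_flows_per_read [private]

Definition at line 946 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._number_of_reads [private]

Definition at line 913 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter._xml [private]

Definition at line 904 of file SffIO.py.

Bio.SeqIO.SffIO.SffWriter.handle

Reimplemented from Bio.SeqIO.Interfaces.SequenceWriter.

Definition at line 903 of file SffIO.py.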


The documentation for this class was generated from the following file: