Back to index

python-biopython  1.60
Classes | Functions | Variables
Bio.bgzf Namespace Reference

Classes

class  BgzfReader
class  BgzfWriter

Functions

def open
def make_virtual_offset
def split_virtual_offset
def BgzfBlocks
def _load_bgzf_block

Variables

tuple _bgzf_magic = _as_bytes("\x1f\x8b\x08\x04")
tuple _bgzf_header
tuple _bgzf_eof
tuple _bytes_BC = _as_bytes("BC")
tuple _empty_bytes_string = _as_bytes("")
tuple _bytes_newline = _as_bytes("\n")
tuple w = BgzfWriter(fileobj=sys.stdout)
tuple data = sys.stdin.read(65536)

Function Documentation

def Bio.bgzf._load_bgzf_block (   handle,
  text_mode = False 
) [private]
Internal function to load the next BGZF function (PRIVATE).

Definition at line 366 of file bgzf.py.

00366 
00367 def _load_bgzf_block(handle, text_mode=False):
00368     """Internal function to load the next BGZF function (PRIVATE)."""
00369     magic = handle.read(4)
00370     if not magic:
00371         #End of file
00372         raise StopIteration
00373     if magic != _bgzf_magic:
00374         raise ValueError(r"A BGZF (e.g. a BAM file) block should start with "
00375                          r"%r, not %r; handle.tell() now says %r"
00376                          % (_bgzf_magic, magic, handle.tell()))
00377     gzip_mod_time, gzip_extra_flags, gzip_os, extra_len = \
00378         struct.unpack("<LBBH", handle.read(8))
00379         
00380     block_size = None
00381     x_len = 0
00382     while x_len < extra_len:
00383         subfield_id = handle.read(2)
00384         subfield_len = struct.unpack("<H", handle.read(2))[0] #uint16_t
00385         subfield_data = handle.read(subfield_len)
00386         x_len += subfield_len + 4
00387         if subfield_id == _bytes_BC:
00388             assert subfield_len == 2, "Wrong BC payload length"
00389             assert block_size is None, "Two BC subfields?"
00390             block_size = struct.unpack("<H", subfield_data)[0]+1 #uint16_t
00391     assert x_len == extra_len, (x_len, extra_len)
00392     assert block_size is not None, "Missing BC, this isn't a BGZF file!"
00393     #Now comes the compressed data, CRC, and length of uncompressed data.
00394     deflate_size = block_size - 1 - extra_len - 19
00395     d = zlib.decompressobj(-15) #Negative window size means no headers
00396     data = d.decompress(handle.read(deflate_size)) + d.flush()
00397     expected_crc = handle.read(4)
00398     expected_size = struct.unpack("<I", handle.read(4))[0]
00399     assert expected_size == len(data), \
00400            "Decompressed to %i, not %i" % (len(data), expected_size)
00401     #Should cope with a mix of Python platforms...
00402     crc = zlib.crc32(data)
00403     if crc < 0:
00404         crc = struct.pack("<i", crc)
00405     else:
00406         crc = struct.pack("<I", crc)
00407     assert expected_crc == crc, \
00408            "CRC is %s, not %s" % (crc, expected_crc)
00409     if text_mode:
00410         return block_size, _as_string(data)
00411     else:
00412         return block_size, data
00413 

Here is the caller graph for this function:

def Bio.bgzf.BgzfBlocks (   handle)
Low level debugging function to inspect BGZF blocks.

Returns the block start offset (see virtual offsets), the block
length (add these for the start of the next block), and the
decompressed length of the blocks contents (limited to 65536 in
BGZF).

>>> from __builtin__ import open
>>> handle = open("SamBam/ex1.bam", "rb")
>>> for values in BgzfBlocks(handle):
...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
Raw start 0, raw length 18239; data start 0, data length 65536
Raw start 18239, raw length 18223; data start 65536, data length 65536
Raw start 36462, raw length 18017; data start 131072, data length 65536
Raw start 54479, raw length 17342; data start 196608, data length 65536
Raw start 71821, raw length 17715; data start 262144, data length 65536
Raw start 89536, raw length 17728; data start 327680, data length 65536
Raw start 107264, raw length 17292; data start 393216, data length 63398
Raw start 124556, raw length 28; data start 456614, data length 0
>>> handle.close()

Indirectly we can tell this file came from an old version of
samtools because all the blocks (except the final one and the
dummy empty EOF marker block) are 65536 bytes.  Later versions
avoid splitting a read between two blocks, and give the header
its own block (useful to speed up replacing the header). You
can see this in ex1_refresh.bam created using samtools 0.1.18:

samtools view -b ex1.bam > ex1_refresh.bam

>>> handle = open("SamBam/ex1_refresh.bam", "rb")
>>> for values in BgzfBlocks(handle):
...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
Raw start 0, raw length 53; data start 0, data length 38
Raw start 53, raw length 18195; data start 38, data length 65434
Raw start 18248, raw length 18190; data start 65472, data length 65409
Raw start 36438, raw length 18004; data start 130881, data length 65483
Raw start 54442, raw length 17353; data start 196364, data length 65519
Raw start 71795, raw length 17708; data start 261883, data length 65411
Raw start 89503, raw length 17709; data start 327294, data length 65466
Raw start 107212, raw length 17390; data start 392760, data length 63854
Raw start 124602, raw length 28; data start 456614, data length 0
>>> handle.close()

The above example has no embedded SAM header (thus the first block
is very small at just 38 bytes of decompressed data), while the next
example does (a larger block of 103 bytes). Notice that the rest of
the blocks show the same sizes (they contain the same read data):

>>> handle = open("SamBam/ex1_header.bam", "rb")
>>> for values in BgzfBlocks(handle):
...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
Raw start 0, raw length 104; data start 0, data length 103
Raw start 104, raw length 18195; data start 103, data length 65434
Raw start 18299, raw length 18190; data start 65537, data length 65409
Raw start 36489, raw length 18004; data start 130946, data length 65483
Raw start 54493, raw length 17353; data start 196429, data length 65519
Raw start 71846, raw length 17708; data start 261948, data length 65411
Raw start 89554, raw length 17709; data start 327359, data length 65466
Raw start 107263, raw length 17390; data start 392825, data length 63854
Raw start 124653, raw length 28; data start 456679, data length 0
>>> handle.close()

Definition at line 291 of file bgzf.py.

00291 
00292 def BgzfBlocks(handle):
00293     """Low level debugging function to inspect BGZF blocks.
00294 
00295     Returns the block start offset (see virtual offsets), the block
00296     length (add these for the start of the next block), and the
00297     decompressed length of the blocks contents (limited to 65536 in
00298     BGZF).
00299 
00300     >>> from __builtin__ import open
00301     >>> handle = open("SamBam/ex1.bam", "rb")
00302     >>> for values in BgzfBlocks(handle):
00303     ...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
00304     Raw start 0, raw length 18239; data start 0, data length 65536
00305     Raw start 18239, raw length 18223; data start 65536, data length 65536
00306     Raw start 36462, raw length 18017; data start 131072, data length 65536
00307     Raw start 54479, raw length 17342; data start 196608, data length 65536
00308     Raw start 71821, raw length 17715; data start 262144, data length 65536
00309     Raw start 89536, raw length 17728; data start 327680, data length 65536
00310     Raw start 107264, raw length 17292; data start 393216, data length 63398
00311     Raw start 124556, raw length 28; data start 456614, data length 0
00312     >>> handle.close()
00313 
00314     Indirectly we can tell this file came from an old version of
00315     samtools because all the blocks (except the final one and the
00316     dummy empty EOF marker block) are 65536 bytes.  Later versions
00317     avoid splitting a read between two blocks, and give the header
00318     its own block (useful to speed up replacing the header). You
00319     can see this in ex1_refresh.bam created using samtools 0.1.18:
00320 
00321     samtools view -b ex1.bam > ex1_refresh.bam
00322 
00323     >>> handle = open("SamBam/ex1_refresh.bam", "rb")
00324     >>> for values in BgzfBlocks(handle):
00325     ...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
00326     Raw start 0, raw length 53; data start 0, data length 38
00327     Raw start 53, raw length 18195; data start 38, data length 65434
00328     Raw start 18248, raw length 18190; data start 65472, data length 65409
00329     Raw start 36438, raw length 18004; data start 130881, data length 65483
00330     Raw start 54442, raw length 17353; data start 196364, data length 65519
00331     Raw start 71795, raw length 17708; data start 261883, data length 65411
00332     Raw start 89503, raw length 17709; data start 327294, data length 65466
00333     Raw start 107212, raw length 17390; data start 392760, data length 63854
00334     Raw start 124602, raw length 28; data start 456614, data length 0
00335     >>> handle.close()
00336 
00337     The above example has no embedded SAM header (thus the first block
00338     is very small at just 38 bytes of decompressed data), while the next
00339     example does (a larger block of 103 bytes). Notice that the rest of
00340     the blocks show the same sizes (they contain the same read data):
00341 
00342     >>> handle = open("SamBam/ex1_header.bam", "rb")
00343     >>> for values in BgzfBlocks(handle):
00344     ...     print "Raw start %i, raw length %i; data start %i, data length %i" % values
00345     Raw start 0, raw length 104; data start 0, data length 103
00346     Raw start 104, raw length 18195; data start 103, data length 65434
00347     Raw start 18299, raw length 18190; data start 65537, data length 65409
00348     Raw start 36489, raw length 18004; data start 130946, data length 65483
00349     Raw start 54493, raw length 17353; data start 196429, data length 65519
00350     Raw start 71846, raw length 17708; data start 261948, data length 65411
00351     Raw start 89554, raw length 17709; data start 327359, data length 65466
00352     Raw start 107263, raw length 17390; data start 392825, data length 63854
00353     Raw start 124653, raw length 28; data start 456679, data length 0
00354     >>> handle.close()
00355 
00356     """
00357     data_start = 0
00358     while True:
00359         start_offset = handle.tell()
00360         #This may raise StopIteration which is perfect here
00361         block_length, data = _load_bgzf_block(handle)
00362         data_len = len(data)
00363         yield start_offset, block_length, data_start, data_len
00364         data_start += data_len
00365 

Here is the call graph for this function:

def Bio.bgzf.make_virtual_offset (   block_start_offset,
  within_block_offset 
)
Compute a BGZF virtual offset from block start and within block offsets.

The BAM indexing scheme records read positions using a 64 bit
'virtual offset', comprising in C terms:

block_start_offset<<16 | within_block_offset

Here block_start_offset is the file offset of the BGZF block
start (unsigned integer using up to 64-16 = 48 bits), and
within_block_offset within the (decompressed) block (unsigned
16 bit integer).

>>> make_virtual_offset(0,0)
0
>>> make_virtual_offset(0,1)
1
>>> make_virtual_offset(0, 2**16 - 1)
65535
>>> make_virtual_offset(0, 2**16)
Traceback (most recent call last):
...
ValueError: Require 0 <= within_block_offset < 2**16, got 65536

>>> 65536 == make_virtual_offset(1,0)
True
>>> 65537 == make_virtual_offset(1,1)
True
>>> 131071 == make_virtual_offset(1, 2**16 - 1)
True

>>> 6553600000 == make_virtual_offset(100000,0)
True
>>> 6553600001 == make_virtual_offset(100000,1)
True
>>> 6553600010 == make_virtual_offset(100000,10)
True

>>> make_virtual_offset(2**48,0)
Traceback (most recent call last):
...
ValueError: Require 0 <= block_start_offset < 2**48, got 281474976710656

Definition at line 229 of file bgzf.py.

00229 
00230 def make_virtual_offset(block_start_offset, within_block_offset):
00231     """Compute a BGZF virtual offset from block start and within block offsets.
00232 
00233     The BAM indexing scheme records read positions using a 64 bit
00234     'virtual offset', comprising in C terms:
00235 
00236     block_start_offset<<16 | within_block_offset
00237 
00238     Here block_start_offset is the file offset of the BGZF block
00239     start (unsigned integer using up to 64-16 = 48 bits), and
00240     within_block_offset within the (decompressed) block (unsigned
00241     16 bit integer).
00242 
00243     >>> make_virtual_offset(0,0)
00244     0
00245     >>> make_virtual_offset(0,1)
00246     1
00247     >>> make_virtual_offset(0, 2**16 - 1)
00248     65535
00249     >>> make_virtual_offset(0, 2**16)
00250     Traceback (most recent call last):
00251     ...
00252     ValueError: Require 0 <= within_block_offset < 2**16, got 65536
00253 
00254     >>> 65536 == make_virtual_offset(1,0)
00255     True
00256     >>> 65537 == make_virtual_offset(1,1)
00257     True
00258     >>> 131071 == make_virtual_offset(1, 2**16 - 1)
00259     True
00260 
00261     >>> 6553600000 == make_virtual_offset(100000,0)
00262     True
00263     >>> 6553600001 == make_virtual_offset(100000,1)
00264     True
00265     >>> 6553600010 == make_virtual_offset(100000,10)
00266     True
00267 
00268     >>> make_virtual_offset(2**48,0)
00269     Traceback (most recent call last):
00270     ...
00271     ValueError: Require 0 <= block_start_offset < 2**48, got 281474976710656
00272 
00273     """
00274     if within_block_offset < 0 or within_block_offset >= 65536:
00275         raise ValueError("Require 0 <= within_block_offset < 2**16, got %i" % within_block_offset)
00276     if block_start_offset < 0 or block_start_offset >= 281474976710656:
00277         raise ValueError("Require 0 <= block_start_offset < 2**48, got %i" % block_start_offset)
00278     return (block_start_offset<<16) | within_block_offset

Here is the caller graph for this function:

def Bio.bgzf.open (   filename,
  mode = "rb" 
)
Open a BGZF file for reading, writing or appending.

Definition at line 220 of file bgzf.py.

00220 
00221 def open(filename, mode="rb"):
00222     """Open a BGZF file for reading, writing or appending."""
00223     if "r" in mode.lower():
00224         return BgzfReader(filename, mode)
00225     elif "w" in mode.lower() or "a" in mode.lower():
00226         return BgzfWriter(filename, mode)
00227     else:
00228         raise ValueError("Bad mode %r" % mode)

def Bio.bgzf.split_virtual_offset (   virtual_offset)
Divides a 64-bit BGZF virtual offset into block start & within block offsets.

>>> (100000, 0) == split_virtual_offset(6553600000)
True
>>> (100000, 10) == split_virtual_offset(6553600010)
True

Definition at line 279 of file bgzf.py.

00279 
00280 def split_virtual_offset(virtual_offset):
00281     """Divides a 64-bit BGZF virtual offset into block start & within block offsets.
00282 
00283     >>> (100000, 0) == split_virtual_offset(6553600000)
00284     True
00285     >>> (100000, 10) == split_virtual_offset(6553600010)
00286     True
00287 
00288     """
00289     start = virtual_offset>>16
00290     return start, virtual_offset ^ (start<<16)


Variable Documentation

Initial value:
00001 _as_bytes("\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" + \
00002                       "\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00")

Definition at line 213 of file bgzf.py.

Initial value:
00001 _as_bytes("\x1f\x8b\x08\x04\x00\x00\x00\x00"
00002                          "\x00\xff\x06\x00\x42\x43\x02\x00")

Definition at line 211 of file bgzf.py.

tuple Bio.bgzf._bgzf_magic = _as_bytes("\x1f\x8b\x08\x04")

Definition at line 210 of file bgzf.py.

tuple Bio.bgzf._bytes_BC = _as_bytes("BC")

Definition at line 215 of file bgzf.py.

tuple Bio.bgzf._bytes_newline = _as_bytes("\n")

Definition at line 217 of file bgzf.py.

tuple Bio.bgzf._empty_bytes_string = _as_bytes("")

Definition at line 216 of file bgzf.py.

tuple Bio.bgzf.data = sys.stdin.read(65536)

Definition at line 766 of file bgzf.py.

tuple Bio.bgzf.w = BgzfWriter(fileobj=sys.stdout)

Definition at line 764 of file bgzf.py.