Back to index

python-biopython  1.60
_index.py
Go to the documentation of this file.
00001 # Copyright 2009-2011 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Dictionary like indexing of sequence files (PRIVATE).
00006 
00007 You are not expected to access this module, or any of its code, directly. This
00008 is all handled internally by the Bio.SeqIO.index(...) function which is the
00009 public interface for this functionality.
00010 
00011 The basic idea is that we scan over a sequence file, looking for new record
00012 markers. We then try and extract the string that Bio.SeqIO.parse/read would
00013 use as the record id, ideally without actually parsing the full record. We
00014 then use a subclassed Python dictionary to record the file offset for the
00015 record start against the record id.
00016 
00017 Note that this means full parsing is on demand, so any invalid or problem
00018 record may not trigger an exception until it is accessed. This is by design.
00019 
00020 This means our dictionary like objects have in memory ALL the keys (all the
00021 record identifiers), which shouldn't be a problem even with second generation
00022 sequencing. If this is an issue later on, storing the keys and offsets in a
00023 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
00024 """
00025 
00026 import os
00027 try:
00028     from collections import UserDict as _dict_base
00029 except ImportError:
00030     from UserDict import DictMixin as _dict_base
00031 import re
00032 import gzip
00033 import itertools
00034 from StringIO import StringIO
00035 
00036 try:
00037     from sqlite3 import dbapi2 as _sqlite
00038     from sqlite3 import IntegrityError as _IntegrityError
00039     from sqlite3 import OperationalError as _OperationalError
00040 except ImportError:
00041     #Not expected to be present on Python 2.4, ignore it
00042     #and at least offer Bio.SeqIO.index() functionality
00043     _sqlite = None
00044     pass
00045 
00046 from Bio._py3k import _bytes_to_string, _as_bytes, _as_string
00047 
00048 from Bio import SeqIO
00049 from Bio import Alphabet
00050 from Bio import bgzf
00051 
00052 class _IndexedSeqFileDict(_dict_base):
00053     """Read only dictionary interface to a sequential sequence file.
00054 
00055     Keeps the keys and associated file offsets in memory, reads the file to
00056     access entries as SeqRecord objects using Bio.SeqIO for parsing them.
00057     This approach is memory limited, but will work even with millions of
00058     sequences.
00059 
00060     Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
00061     (record identifiers by default) are not allowed. If this happens,
00062     a ValueError exception is raised.
00063 
00064     By default the SeqRecord's id string is used as the dictionary
00065     key. This can be changed by suppling an optional key_function,
00066     a callback function which will be given the record id and must
00067     return the desired key. For example, this allows you to parse
00068     NCBI style FASTA identifiers, and extract the GI number to use
00069     as the dictionary key.
00070 
00071     Note that this dictionary is essentially read only. You cannot
00072     add or change values, pop values, nor clear the dictionary.
00073     """
00074     def __init__(self, filename, format, alphabet, key_function):
00075         #Use key_function=None for default value
00076         try:
00077             proxy_class = _FormatToRandomAccess[format]
00078         except KeyError:
00079             raise ValueError("Unsupported format '%s'" % format)
00080         random_access_proxy = proxy_class(filename, format, alphabet)
00081         self._proxy = random_access_proxy
00082         self._key_function = key_function
00083         if key_function:
00084             offset_iter = ((key_function(k),o,l) for (k,o,l) in random_access_proxy)
00085         else:
00086             offset_iter = random_access_proxy
00087         offsets = {}
00088         for key, offset, length in offset_iter:
00089             #Note - we don't store the length because I want to minimise the
00090             #memory requirements. With the SQLite backend the length is kept
00091             #and is used to speed up the get_raw method (by about 3 times).
00092             #The length should be provided by all the current backends except
00093             #SFF where there is an existing Roche index we can reuse (very fast
00094             #but lacks the record lengths)
00095             #assert length or format in ["sff", "sff-trim"], \
00096             #       "%s at offset %i given length %r (%s format %s)" \
00097             #       % (key, offset, length, filename, format)
00098             if key in offsets:
00099                 self._proxy._handle.close()
00100                 raise ValueError("Duplicate key '%s'" % key)
00101             else:
00102                 offsets[key] = offset
00103         self._offsets = offsets
00104     
00105     def __repr__(self):
00106         return "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
00107                % (self._proxy._handle.name, self._proxy._format,
00108                   self._proxy._alphabet, self._key_function)
00109 
00110     def __str__(self):
00111         if self:
00112             return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
00113         else:
00114             return "{}"
00115 
00116     def __contains__(self, key) :
00117         return key in self._offsets
00118         
00119     def __len__(self):
00120         """How many records are there?"""
00121         return len(self._offsets)
00122 
00123     if hasattr(dict, "iteritems"):
00124         #Python 2, use iteritems but not items etc
00125         def values(self):
00126             """Would be a list of the SeqRecord objects, but not implemented.
00127 
00128             In general you can be indexing very very large files, with millions
00129             of sequences. Loading all these into memory at once as SeqRecord
00130             objects would (probably) use up all the RAM. Therefore we simply
00131             don't support this dictionary method.
00132             """
00133             raise NotImplementedError("Due to memory concerns, when indexing a "
00134                                       "sequence file you cannot access all the "
00135                                       "records at once.")
00136 
00137         def items(self):
00138             """Would be a list of the (key, SeqRecord) tuples, but not implemented.
00139 
00140             In general you can be indexing very very large files, with millions
00141             of sequences. Loading all these into memory at once as SeqRecord
00142             objects would (probably) use up all the RAM. Therefore we simply
00143             don't support this dictionary method.
00144             """
00145             raise NotImplementedError("Due to memory concerns, when indexing a "
00146                                       "sequence file you cannot access all the "
00147                                       "records at once.")
00148 
00149         def keys(self) :
00150             """Return a list of all the keys (SeqRecord identifiers)."""
00151             #TODO - Stick a warning in here for large lists? Or just refuse?
00152             return self._offsets.keys()
00153 
00154         def itervalues(self):
00155             """Iterate over the SeqRecord) items."""
00156             for key in self.__iter__():
00157                 yield self.__getitem__(key)
00158 
00159         def iteritems(self):
00160             """Iterate over the (key, SeqRecord) items."""
00161             for key in self.__iter__():
00162                 yield key, self.__getitem__(key)
00163         
00164         def iterkeys(self):
00165             """Iterate over the keys."""
00166             return self.__iter__()
00167 
00168     else:
00169         #Python 3 - define items and values as iterators
00170         def items(self):
00171             """Iterate over the (key, SeqRecord) items."""
00172             for key in self.__iter__():
00173                 yield key, self.__getitem__(key)
00174 
00175         def values(self):
00176             """Iterate over the SeqRecord items."""
00177             for key in self.__iter__():
00178                 yield self.__getitem__(key)
00179 
00180         def keys(self):
00181             """Iterate over the keys."""
00182             return self.__iter__()
00183 
00184     def __iter__(self):
00185         """Iterate over the keys."""
00186         return iter(self._offsets)
00187         
00188     def __getitem__(self, key):
00189         """x.__getitem__(y) <==> x[y]"""
00190         #Pass the offset to the proxy
00191         record = self._proxy.get(self._offsets[key])
00192         if self._key_function:
00193             key2 = self._key_function(record.id)
00194         else:
00195             key2 = record.id
00196         if key != key2:
00197             raise ValueError("Key did not match (%s vs %s)" % (key, key2))
00198         return record
00199 
00200     def get(self, k, d=None):
00201         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
00202         try:
00203             return self.__getitem__(k)
00204         except KeyError:
00205             return d
00206 
00207     def get_raw(self, key):
00208         """Similar to the get method, but returns the record as a raw string.
00209 
00210         If the key is not found, a KeyError exception is raised.
00211 
00212         Note that on Python 3 a bytes string is returned, not a typical
00213         unicode string.
00214 
00215         NOTE - This functionality is not supported for every file format.
00216         """
00217         #Pass the offset to the proxy
00218         return self._proxy.get_raw(self._offsets[key])
00219 
00220     def __setitem__(self, key, value):
00221         """Would allow setting or replacing records, but not implemented."""
00222         raise NotImplementedError("An indexed a sequence file is read only.")
00223     
00224     def update(self, *args, **kwargs):
00225         """Would allow adding more values, but not implemented."""
00226         raise NotImplementedError("An indexed a sequence file is read only.")
00227 
00228     
00229     def pop(self, key, default=None):
00230         """Would remove specified record, but not implemented."""
00231         raise NotImplementedError("An indexed a sequence file is read only.")
00232     
00233     def popitem(self):
00234         """Would remove and return a SeqRecord, but not implemented."""
00235         raise NotImplementedError("An indexed a sequence file is read only.")
00236 
00237     
00238     def clear(self):
00239         """Would clear dictionary, but not implemented."""
00240         raise NotImplementedError("An indexed a sequence file is read only.")
00241 
00242     def fromkeys(self, keys, value=None):
00243         """A dictionary method which we don't implement."""
00244         raise NotImplementedError("An indexed a sequence file doesn't "
00245                                   "support this.")
00246 
00247     def copy(self):
00248         """A dictionary method which we don't implement."""
00249         raise NotImplementedError("An indexed a sequence file doesn't "
00250                                   "support this.")
00251 
00252 
00253 class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
00254     """Read only dictionary interface to many sequential sequence files.
00255 
00256     Keeps the keys, file-numbers and offsets in an SQLite database. To access
00257     a record by key, reads from the offset in the approapriate file using
00258     Bio.SeqIO for parsing.
00259     
00260     There are OS limits on the number of files that can be open at once,
00261     so a pool are kept. If a record is required from a closed file, then
00262     one of the open handles is closed first.
00263     """
00264     def __init__(self, index_filename, filenames, format, alphabet,
00265                  key_function, max_open=10):
00266         random_access_proxies = {}
00267         #TODO? - Don't keep filename list in memory (just in DB)?
00268         #Should save a chunk of memory if dealing with 1000s of files.
00269         #Furthermore could compare a generator to the DB on reloading
00270         #(no need to turn it into a list)
00271         if not _sqlite:
00272             #Hack for Python 2.4 (of if Python is compiled without it)
00273             from Bio import MissingPythonDependencyError
00274             raise MissingPythonDependencyError("Requires sqlite3, which is "
00275                                                "included Python 2.5+")
00276         if filenames is not None:
00277             filenames = list(filenames) #In case it was a generator
00278         if os.path.isfile(index_filename):
00279             #Reuse the index.
00280             con = _sqlite.connect(index_filename)
00281             self._con = con
00282             #Check the count...
00283             try:
00284                 count, = con.execute("SELECT value FROM meta_data WHERE key=?;",
00285                                      ("count",)).fetchone()
00286                 self._length = int(count)
00287                 if self._length == -1:
00288                     con.close()
00289                     raise ValueError("Unfinished/partial database")
00290                 count, = con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()
00291                 if self._length <> int(count):
00292                     con.close()
00293                     raise ValueError("Corrupt database? %i entries not %i" \
00294                                      % (int(count), self._length))
00295                 self._format, = con.execute("SELECT value FROM meta_data WHERE key=?;",
00296                                            ("format",)).fetchone()
00297                 if format and format != self._format:
00298                     con.close()
00299                     raise ValueError("Index file says format %s, not %s" \
00300                                      % (self._format, format))
00301                 self._filenames = [row[0] for row in \
00302                                   con.execute("SELECT name FROM file_data "
00303                                               "ORDER BY file_number;").fetchall()]
00304                 if filenames and len(filenames) != len(self._filenames):
00305                     con.close()
00306                     raise ValueError("Index file says %i files, not %i" \
00307                                      % (len(self._filenames), len(filenames)))
00308                 if filenames and filenames != self._filenames:
00309                     con.close()
00310                     raise ValueError("Index file has different filenames")
00311             except _OperationalError, err:
00312                 con.close()
00313                 raise ValueError("Not a Biopython index database? %s" % err)
00314             #Now we have the format (from the DB if not given to us),
00315             try:
00316                 proxy_class = _FormatToRandomAccess[self._format]
00317             except KeyError:
00318                 con.close()
00319                 raise ValueError("Unsupported format '%s'" % self._format)
00320         else:
00321             self._filenames = filenames
00322             self._format = format
00323             if not format or not filenames:
00324                 raise ValueError("Filenames to index and format required")
00325             try:
00326                 proxy_class = _FormatToRandomAccess[format]
00327             except KeyError:
00328                 raise ValueError("Unsupported format '%s'" % format)
00329             #Create the index
00330             con = _sqlite.connect(index_filename)
00331             self._con = con
00332             #print "Creating index"
00333             # Sqlite PRAGMA settings for speed
00334             con.execute("PRAGMA synchronous='OFF'")
00335             con.execute("PRAGMA locking_mode=EXCLUSIVE")
00336             #Don't index the key column until the end (faster)
00337             #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
00338             # "offset INTEGER);")
00339             con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
00340             con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
00341                         ("count", -1))
00342             con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
00343                         ("format", format))
00344             #TODO - Record the alphabet?
00345             #TODO - Record the file size and modified date?
00346             con.execute("CREATE TABLE file_data (file_number INTEGER, name TEXT);")
00347             con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
00348             count = 0
00349             for i, filename in enumerate(filenames):
00350                 con.execute("INSERT INTO file_data (file_number, name) VALUES (?,?);",
00351                             (i, filename))
00352                 random_access_proxy = proxy_class(filename, format, alphabet)
00353                 if key_function:
00354                     offset_iter = ((key_function(k),i,o,l) for (k,o,l) in random_access_proxy)
00355                 else:
00356                     offset_iter = ((k,i,o,l) for (k,o,l) in random_access_proxy)
00357                 while True:
00358                     batch = list(itertools.islice(offset_iter, 100))
00359                     if not batch: break
00360                     #print "Inserting batch of %i offsets, %s ... %s" \
00361                     # % (len(batch), batch[0][0], batch[-1][0])
00362                     con.executemany("INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
00363                                     batch)
00364                     con.commit()
00365                     count += len(batch)
00366                 if len(random_access_proxies) < max_open:
00367                     random_access_proxies[i] = random_access_proxy
00368                 else:
00369                     random_access_proxy._handle.close()
00370             self._length = count
00371             #print "About to index %i entries" % count
00372             try:
00373                 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
00374                             "key_index ON offset_data(key);")
00375             except _IntegrityError, err:
00376                 self._proxies = random_access_proxies
00377                 self.close()
00378                 con.close()
00379                 raise ValueError("Duplicate key? %s" % err)
00380             con.execute("PRAGMA locking_mode=NORMAL")
00381             con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
00382                         (count, "count"))
00383             con.commit()
00384             #print "Index created"
00385         self._proxies = random_access_proxies
00386         self._max_open = max_open
00387         self._index_filename = index_filename
00388         self._alphabet = alphabet
00389         self._key_function = key_function
00390     
00391     def __repr__(self):
00392         return "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \
00393                % (self._index_filename, self._filenames, self._format,
00394                   self._alphabet, self._key_function)
00395 
00396     def __contains__(self, key):
00397         return bool(self._con.execute("SELECT key FROM offset_data WHERE key=?;",
00398                                       (key,)).fetchone())
00399 
00400     def __len__(self):
00401         """How many records are there?"""
00402         return self._length
00403         #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]
00404 
00405     def __iter__(self):
00406         """Iterate over the keys."""
00407         for row in self._con.execute("SELECT key FROM offset_data;"):
00408             yield str(row[0])
00409 
00410     if hasattr(dict, "iteritems"):
00411         #Python 2, use iteritems but not items etc
00412         #Just need to override this...
00413         def keys(self) :
00414             """Return a list of all the keys (SeqRecord identifiers)."""
00415             return [str(row[0]) for row in \
00416                     self._con.execute("SELECT key FROM offset_data;").fetchall()]
00417 
00418     def __getitem__(self, key):
00419         """x.__getitem__(y) <==> x[y]"""
00420         #Pass the offset to the proxy
00421         row = self._con.execute("SELECT file_number, offset FROM offset_data WHERE key=?;",
00422                                 (key,)).fetchone()
00423         if not row: raise KeyError
00424         file_number, offset = row
00425         proxies = self._proxies
00426         if file_number in proxies:
00427             record = proxies[file_number].get(offset)
00428         else:
00429             if len(proxies) >= self._max_open:
00430                 #Close an old handle...
00431                 proxies.popitem()[1]._handle.close()
00432             #Open a new handle...
00433             proxy = _FormatToRandomAccess[self._format]( \
00434                         self._filenames[file_number],
00435                         self._format, self._alphabet)
00436             record = proxy.get(offset)
00437             proxies[file_number] = proxy
00438         if self._key_function:
00439             key2 = self._key_function(record.id)
00440         else:
00441             key2 = record.id
00442         if key != key2:
00443             raise ValueError("Key did not match (%s vs %s)" % (key, key2))
00444         return record
00445 
00446     def get(self, k, d=None):
00447         """D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None."""
00448         try:
00449             return self.__getitem__(k)
00450         except KeyError:
00451             return d
00452 
00453     def get_raw(self, key):
00454         """Similar to the get method, but returns the record as a raw string.
00455 
00456         If the key is not found, a KeyError exception is raised.
00457 
00458         Note that on Python 3 a bytes string is returned, not a typical
00459         unicode string.
00460 
00461         NOTE - This functionality is not supported for every file format.
00462         """
00463         #Pass the offset to the proxy
00464         row = self._con.execute("SELECT file_number, offset, length FROM offset_data WHERE key=?;",
00465                                 (key,)).fetchone()
00466         if not row: raise KeyError
00467         file_number, offset, length = row
00468         proxies = self._proxies
00469         if file_number in proxies:
00470             if length:
00471                 #Shortcut if we have the length
00472                 h = proxies[file_number]._handle
00473                 h.seek(offset)
00474                 return h.read(length)
00475             else:
00476                 return proxies[file_number].get_raw(offset)
00477         else:
00478             #This code is duplicated from __getitem__ to avoid a function call
00479             if len(proxies) >= self._max_open:
00480                 #Close an old handle...
00481                 proxies.popitem()[1]._handle.close()
00482             #Open a new handle...
00483             proxy = _FormatToRandomAccess[self._format]( \
00484                         self._filenames[file_number],
00485                         self._format, self._alphabet)
00486             proxies[file_number] = proxy
00487             if length:
00488                 #Shortcut if we have the length
00489                 h = proxy._handle
00490                 h.seek(offset)
00491                 return h.read(length)
00492             else:
00493                 return proxy.get_raw(offset)
00494 
00495     def close(self):
00496         """Close any open file handles."""
00497         proxies = self._proxies
00498         while proxies:
00499             proxies.popitem()[1]._handle.close()
00500         
00501 
00502 ##############################################################################
00503 
class SeqFileRandomAccess(object):
    """Base class for random access to records within a sequence file (PRIVATE).

    Opens the file in binary mode (transparently falling back from BGZF
    compressed input to a plain handle) and prepares a single-record parser
    callback for the requested format and alphabet. Subclasses implement
    __iter__ (and usually get_raw) for their particular file format.
    """
    def __init__(self, filename, format, alphabet):
        h = open(filename, "rb")
        try:
            #Try wrapping the handle as a BGZF compressed file first:
            self._handle = bgzf.BgzfReader(mode="rb", fileobj=h)
        except ValueError, e:
            assert "BGZF" in str(e)
            #Not a BGZF file, rewind and use the plain handle instead
            h.seek(0)
            self._handle = h
        self._alphabet = alphabet
        self._format = format
        #Load the parser class/function once to avoid the dict lookup in each
        #__getitem__ call:
        i = SeqIO._FormatToIterator[format]
        #The following alphabet code is a bit nasty... duplicates logic in
        #Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return i(handle).next()
        else:
            #TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    #Not all parsers accept an alphabet argument...
                    return i(handle, alphabet=alphabet).next()
                except TypeError:
                    #...so fall back on forcing the alphabet afterwards
                    return SeqIO._force_alphabet(i(handle),
                                                 alphabet).next()
        self._parse = _parse

    def __iter__(self):
        """Returns (id, offset, length) tuples."""
        raise NotImplementedError("Subclass should implement this")

    def get(self, offset):
        """Returns SeqRecord parsed from the given offset."""
        #Should be overridden for binary file formats etc:
        return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))

    def get_raw(self, offset):
        """Returns bytes string (if implemented for this file format)."""
        #Should be done by each sub-class (if possible)
        raise NotImplementedError("Not available for this file format.")
00549 
00550 
00551 
00552 
00553 ####################
00554 # Special indexers #
00555 ####################
00556 
00557 # Anything where the records cannot be read simply by parsing from
00558 # the record start. For example, anything requiring information from
00559 # a file header - e.g. SFF files where we would need to know the
00560 # number of flows.
00561 
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""
    def __init__(self, filename, format, alphabet):
        #Parse the SFF file header to record the per-file values needed
        #when reading individual records (flows, flow chars, key sequence):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        header_length, index_offset, index_length, number_of_reads, \
        self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE).

        Yields (name, offset, 0) tuples - the record length is not known
        here, hence the zero placeholder.
        """
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        #Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
        self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            #There is an index provided, try this the fast way:
            count = 0
            try :
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) :
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                       "Indexed %i records, expected %i" \
                       % (count, number_of_reads)
                return
            except ValueError, err :
                import warnings
                warnings.warn("Could not parse the SFF index: %s" % err)
                assert count==0, "Partially populated index"
                handle.seek(0)
        #We used to give a warning in this case, but Ion Torrent's
        #SFF files don't have an index so that would be annoying.
        #Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) :
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
               "Indexed %i records, expected %i" % (count, number_of_reads)

    def get(self, offset) :
        """Return the SeqRecord starting at the given offset."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        """Return the raw record starting at the given offset as bytes."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
00619 
00620 
class SffTrimedRandomAccess(SffRandomAccess):
    """Random access to an SFF file, returning trimmed SeqRecord objects."""
    def get(self, offset):
        """Return the SeqRecord at this offset, with trimming applied."""
        h = self._handle
        h.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(h,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)
00631 
00632 
00633 ###################
00634 # Simple indexers #
00635 ###################
00636 
class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    """Random access to text files where each record starts with a marker.

    Handles the simple sequential formats where every record begins with
    a known format-specific marker string at the start of a line, and
    runs until the next marker line (or the end of the file).
    """
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        #Line prefix marking the start of a new record for each format:
        marker = {"ace" : "CO ",
                  "embl" : "ID ",
                  "fasta" : ">",
                  "genbank" : "LOCUS ",
                  "gb": "LOCUS ",
                  "imgt" : "ID ",
                  "phd" : "BEGIN_SEQUENCE",
                  "pir" : ">..;",
                  "qual": ">",
                  "swiss" : "ID ",
                  "uniprot-xml" : "<entry ",
                   }[format]
        self._marker = marker
        self._marker_re = re.compile(_as_bytes("^%s" % marker))
        
    def __iter__(self):
        """Returns (id, offset, length) tuples."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        #Skip any header before the first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #Here we can assume the record.id is the first word after the
            #marker. This is generally fine... but not for GenBank, EMBL, Swiss
            rec_id = line[marker_offset:].strip().split(None, 1)[0]
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(rec_id), start_offset, length
                    start_offset = end_offset
                    break
                else:
                    #Track this explicitly as can't do file offset difference on BGZF
                    length += len(line)
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        #For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                #End of file, or start of next record => end of this record
                break
            lines.append(line)
        return _as_bytes("").join(lines)
00700 
00701 
00702 #######################################
00703 # Fiddly indexers: GenBank, EMBL, ... #
00704 #######################################
00705 
class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to a GenBank file."""
    def __iter__(self):
        """Returns (id, offset, length) tuples for each record.

        The key is taken from the ACCESSION line, or from the VERSION line
        when its first entry looks like a valid accession.version pair
        (whichever qualifying line is seen last wins); a record with
        neither raises ValueError.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re  #matches the "LOCUS " record marker
        dot_char = _as_bytes(".")
        accession_marker = _as_bytes("ACCESSION ")
        version_marker = _as_bytes("VERSION ")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We cannot assume the record.id is the first word after LOCUS,
            #normally the first entry on the VERSION or ACCESSION line is used.
            key = None
            #Sum line lengths explicitly - offsets can't be subtracted on BGZF
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError("Did not find ACCESSION/VERSION lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    key = line.rstrip().split()[1]
                elif line.startswith(version_marker):
                    version_id = line.rstrip().split()[1]
                    #Only use VERSION's value if it looks like accession.version
                    if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
                        #This should mimic the GenBank parser...
                        key = version_id
                length += len(line)
        assert not line, repr(line)
00745 
00746 
class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to an EMBL file.

    The key is normally accession.version built from the ID line (new
    style) or from a later SV line; otherwise the first word of the ID
    line is used.
    """
    def __iter__(self):
        """Returns (id, offset, length) tuples for each record."""
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re  #matches the "ID " record marker
        semi_char = _as_bytes(";")
        dot_char = _as_bytes(".")
        sv_marker = _as_bytes("SV ")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            #We cannot assume the record.id is the first word after ID,
            #normally the SV line is used.
            length = len(line)
            if line[2:].count(semi_char) == 6:
                #Looks like the semi colon separated style introduced in 2006
                parts = line[3:].rstrip().split(semi_char)
                if parts[1].strip().startswith(sv_marker):
                    #The SV bit gives the version
                    key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
                else:
                    key = parts[0].strip()
            elif line[2:].count(semi_char) == 3:
                #Looks like the pre 2006 style, take first word only
                key = line[3:].strip().split(None,1)[0]
            else:
                raise ValueError('Did not recognise the ID line layout:\n' + line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    #end_offset was recorded BEFORE reading this line, so it
                    #already points at the start of the next record.  The old
                    #code recomputed it as handle.tell() - len(line), which is
                    #redundant on plain handles and wrong for BGZF virtual
                    #offsets (offset arithmetic is invalid there).
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(sv_marker):
                    key = line.rstrip().split()[1]
                length += len(line)
        assert not line, repr(line)
00792 
00793 
class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""
    def __iter__(self):
        """Returns (id, offset, length) tuples for each record."""
        handle = self._handle
        marker_re = self._marker_re
        semi = _as_bytes(";")
        ac_prefix = _as_bytes("AC ")
        handle.seek(0)
        #Scan past any header text to the first ID line (or end of file)
        while True:
            record_start = handle.tell()
            line = handle.readline()
            if not line or marker_re.match(line):
                break
        #Each iteration handles one record starting at an ID line
        while marker_re.match(line):
            record_len = len(line)
            #The record.id comes not from the ID line but from the AC line
            #which immediately follows it; use the first accession listed.
            line = handle.readline()
            record_len += len(line)
            assert line.startswith(ac_prefix)
            key = line[3:].strip().split(semi)[0].strip()
            #Consume the rest of the record, tracking its length explicitly
            while True:
                next_start = handle.tell()
                line = handle.readline()
                if not line or marker_re.match(line):
                    yield _bytes_to_string(key), record_start, record_len
                    record_start = next_start
                    break
                record_len += len(line)
        assert not line, repr(line)
00825 
00826 
class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""
    def __iter__(self):
        """Returns (id, offset, length) tuples for each <entry>.

        The key is the text of the first <accession>...</accession>
        element within the entry (tag expected on a single line).
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re  #matches the "<entry " record marker
        start_acc_marker = _as_bytes("<accession>")
        end_acc_marker = _as_bytes("</accession>")
        end_entry_marker = _as_bytes("</entry>")
        less_than = _as_bytes("<")
        #Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        #Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            #We expect the next line to be <accession>xxx</accession>
            #(possibly with leading spaces)
            #but allow it to be later on within the <entry>
            key = None
            done = False  #NOTE(review): never used, left for byte-compatibility
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    #11 is len("<accession>"); take the text up to the next "<"
                    key = line[line.find(start_acc_marker)+11:].split(less_than,1)[0]
                    length += len(line)
                elif end_entry_marker in line:
                    #8 is len("</entry>"), so end_offset points just past the
                    #closing tag (offset before this line plus position in it)
                    end_offset = handle.tell() - len(line) \
                               + line.find(end_entry_marker) + 8
                    break
                elif marker_re.match(line) or not line:
                    #Start of next record or end of file
                    raise ValueError("Didn't find end of record")
                else:
                    length += len(line)
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i" \
                                 % (start_offset, end_offset))
            yield _bytes_to_string(key), start_offset, length
            #Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = _as_bytes("</entry>")
        handle.seek(offset)
        data = [handle.readline()]
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                #Keep up to and including the "</entry>" tag (8 chars long)
                data.append(line[:i+8])
                break
            if marker_re.match(line) or not line:
                #End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data.append(line)
        return _as_bytes("").join(data)

    def get(self, offset) :
        """Parse the <entry> at this offset and return it as a SeqRecord."""
        #TODO - Can we handle this directly in the parser?
        #This is a hack - use get_raw for <entry>...</entry> and wrap it with
        #the apparently required XML header and footer.
        data = """<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        %s
        </uniprot>
        """ % _bytes_to_string(self.get_raw(offset))
        #TODO - For consistency, this function should not accept a string:
        return SeqIO.UniprotIO.UniprotIterator(data).next()
00909 
00910 
class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to a IntelliGenetics file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        #Each record starts with one or more comment lines beginning ";"
        self._marker_re = re.compile(_as_bytes("^;"))

    def __iter__(self):
        """Returns (id, offset, length) tuples for each record.

        The key is the first word of the first line after the ";" comment
        block.  NOTE(review): length only sums the ";" comment lines seen
        before the id line, not the whole record - confirm callers only
        use it that way.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        while True:
            offset = handle.tell()
            line = handle.readline()
            length = len(line)
            if marker_re.match(line):
                #Now look for the first line which doesn't start ";"
                while True:
                    line = handle.readline()
                    if line[0:1] != semi_char and line.strip():
                        key = line.split()[0]
                        yield _bytes_to_string(key), offset, length
                        break
                    if not line:
                        raise ValueError("Premature end of file?")
                    length += len(line)
            elif not line:
                #End of file
                break

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string.

        Returns the ";" comment lines followed by the data lines up to
        (but excluding) the next record's ";" block or end of file.
        """
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        semi_char = _as_bytes(";")
        #First collect the ";" comment block...
        while line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        #...then the data lines until the next ";" block or EOF
        while line and not line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        return _as_bytes("").join(lines)
00955 
class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""
    def __iter__(self):
        """Returns (id, offset, length) tuples for each line of the file.

        The key is everything before the first tab on the line; blank
        lines are ignored.
        """
        handle = self._handle
        handle.seek(0)
        tab_char = _as_bytes("\t")
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break #End of file
            if not line.strip():
                #Ignore blank lines.  The old try/except ValueError around
                #the split could never fire (bytes split does not raise
                #here), so blank lines used to be yielded with junk keys.
                continue
            key = line.split(tab_char)[0]
            yield _bytes_to_string(key), start_offset, len(line)

    def get_raw(self, offset):
        """Like the get method, but returns the record as a raw string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()
00982 
00983 
00984 ##########################
00985 # Now the FASTQ indexers #
00986 ##########################
00987          
class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """
    def __iter__(self):
        """Returns (id, offset, length) tuples for each record.

        A record ends once the quality string is as long as the sequence
        string; the following line (if any) must start a new "@" record.
        """
        handle = self._handle
        handle.seek(0)
        record_id = None  #renamed from "id" to avoid shadowing the builtin
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            #Empty file!
            return
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        while line:
            #This record seems OK (so far)
            record_id = line[1:].rstrip().split(None, 1)[0]
            #Find the seq line(s)
            seq_len = 0
            length = len(line)
            while line:
                line = handle.readline()
                length += len(line)
                if line.startswith(plus_char):
                    break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            #Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    #Should be end of record...
                    end_offset = handle.tell()
                    line = handle.readline()
                    #The raise keyword was missing here, so the ValueError
                    #was built and discarded - malformed files passed silently
                    if line and line[0:1] != at_char:
                        raise ValueError("Problem with line %s" % repr(line))
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
                    length += len(line)
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            yield _bytes_to_string(record_id), start_offset, length
            start_offset = end_offset

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        #TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
        #Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == plus_char
        #Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                #Should be end of record...
                line = handle.readline()
                #The raise keyword was missing here too (see __iter__)
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data
01081 
01082 
01083 ###############################################################################
01084 
#Lookup table used by Bio.SeqIO.index: maps each supported format name to
#the class providing random access for that format (alphabetical order).
_FormatToRandomAccess = {
    "ace": SequentialSeqFileRandomAccess,
    "embl": EmblRandomAccess,
    "fasta": SequentialSeqFileRandomAccess,
    "fastq": FastqRandomAccess, #Class handles all three variants
    "fastq-illumina": FastqRandomAccess,
    "fastq-sanger": FastqRandomAccess, #alias of "fastq"
    "fastq-solexa": FastqRandomAccess,
    "gb": GenBankRandomAccess, #alias of "genbank"
    "genbank": GenBankRandomAccess,
    "ig": IntelliGeneticsRandomAccess,
    "imgt": EmblRandomAccess,
    "phd": SequentialSeqFileRandomAccess,
    "pir": SequentialSeqFileRandomAccess,
    "qual": SequentialSeqFileRandomAccess,
    "sff": SffRandomAccess,
    "sff-trim": SffTrimedRandomAccess,
    "swiss": SwissRandomAccess,
    "tab": TabRandomAccess,
    "uniprot-xml": UniprotRandomAccess,
    }
01105