Back to index

python-biopython  1.60
Index.py
Go to the documentation of this file.
00001 # Copyright 1999 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Index.py
00007 
00008 This module provides a way to create indexes to text files.
00009 
00010 Classes:
00011 Index     Dictionary-like class used to store index information.
00012 
00013 _ShelveIndex    An Index class based on the shelve module.
00014 _InMemoryIndex  An in-memory Index class.
00015 
00016 """
00017 import os
00018 import array
00019 import cPickle
00020 import shelve
00021 
class _ShelveIndex(dict):
    """An index file wrapped around shelve.

    The shelf is opened by the constructor and stored in self.data; it
    is closed when the object is garbage collected.  A format version is
    stored in the shelf under a reserved key, and an existing shelf whose
    version is missing or different is rejected.

    """
    # Without a good dbm module installed, this is pretty slow and
    # generates large files.  When generating an index on a FASTA-
    # formatted file with 82000 sequences (37Mb), the
    # index 'dat' file is 42Mb and 'dir' file is 8Mb.

    # NOTE(review): the class derives from dict but the shelf lives in
    # self.data, and nothing in this class forwards item access on the
    # instance to the shelf -- confirm how callers are expected to use it.

    __version = 2
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Open the shelf called indexname, creating it if necessary.

        Arguments:
        indexname   Name handed to shelve.open.
        truncate    If true, discard any existing index and start a
                    new, empty one.

        Raises IOError if an existing shelf carries no version entry, or
        a version different from the one this class writes.

        """
        dict.__init__(self)

        shelf = None
        if truncate:
            # In python 1.52 and before, dumbdbm (under shelve)
            # doesn't clear the old database, so remove its files by hand.
            for suffix in ('.dir', '.dat', '.bak'):
                try:
                    os.unlink(indexname + suffix)
                except OSError:
                    # Missing or undeletable file: opening with flag='n'
                    # below creates a new, empty database regardless.
                    pass
        else:
            try:
                shelf = shelve.open(indexname, flag='r')
            except Exception:
                # The error type depends on the dbm backend in use.
                # Anything that stops the shelf being read is treated
                # as "no database exists".
                shelf = None

        if shelf is None:
            # No database exists (or it was truncated): start a new one
            # and stamp it with the current version.
            self.data = shelve.open(indexname, flag='n')
            self.data[self.__version_key] = self.__version
            return

        # Assign before validating so that __del__ closes the shelf even
        # when the version check below raises.
        self.data = shelf

        # Check to make sure the database is the correct version.
        version = self.data.get(self.__version_key, None)
        if version is None:
            raise IOError("Unrecognized index format")
        if version != self.__version:
            raise IOError("Version %s doesn't match my version %s"
                          % (version, self.__version))

    def __del__(self):
        """Close the shelf, if the constructor got as far as opening it."""
        if 'data' in self.__dict__:
            self.data.close()
00065 
class _InMemoryIndex(dict):
    """This creates an in-memory index file.

    The whole index is held in memory as a dictionary.  It is loaded from
    the index file when the object is created and, if it has been
    modified through update, item assignment, item deletion or clear, it
    is written back when the object is garbage collected.

    """
    # File Format:
    # version
    # key value
    # [...]
    #
    # Every field is an object encoded by _tostr, so it contains no
    # whitespace.
    #
    # NOTE: loading an index unpickles its contents; only open index
    # files that come from a trusted source.
    #
    # NOTE(review): other dict mutators (pop, popitem, setdefault) are
    # not overridden here, so changes made through them do not mark the
    # index as changed.

    __version = 3
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Load the index stored in the file indexname, if it exists.

        Arguments:
        indexname   Path of the index file.
        truncate    If true, delete any existing index file and start
                    with an empty index.

        Raises IOError if the file was written with a different version.

        """
        self._indexname = indexname
        dict.__init__(self)
        self.__changed = 0     # the index hasn't changed

        # Remove the database if truncate is true.
        if truncate and os.path.exists(indexname):
            os.unlink(indexname)
            self.__changed = 1

        # Load the database if it exists
        if os.path.exists(indexname):
            handle = open(indexname)
            try:
                version = self._toobj(handle.readline().rstrip())
                if version != self.__version:
                    raise IOError("Version %s doesn't match my version %s"
                                  % (version, self.__version))
                for line in handle:
                    key, value = line.split()
                    # Bypass the overridden __setitem__: loading must not
                    # mark the index as changed, otherwise a failure
                    # part-way through would make __del__ overwrite the
                    # file with a partial index.
                    dict.__setitem__(self, self._toobj(key),
                                     self._toobj(value))
            finally:
                handle.close()
            self.__changed = 0

    def update(self, dict):
        """Merge the entries of dict into the index and mark it changed."""
        self.__changed = 1
        # The parameter shadows the builtin dict type, so go through
        # super() to reach the base class implementation.
        super(_InMemoryIndex, self).update(dict)

    def __setitem__(self, key, value):
        self.__changed = 1
        dict.__setitem__(self, key, value)

    def __delitem__(self, key):
        self.__changed = 1
        dict.__delitem__(self, key)

    def clear(self):
        self.__changed = 1
        dict.clear(self)

    def __del__(self):
        """Write the index back to its file if it has been changed."""
        if self.__changed:
            handle = open(self._indexname, 'w')
            try:
                handle.write("%s\n" % self._tostr(self.__version))
                for key, value in self.items():
                    handle.write("%s %s\n" %
                                 (self._tostr(key), self._tostr(value)))
            finally:
                handle.close()
            # Everything is on disk now; nothing left to save.
            self.__changed = 0

    def _tostr(self, obj):
        """Encode obj as a whitespace-free string of comma separated ints."""
        # I need a representation of the object that's saveable to
        # a file that uses whitespace as delimiters.  Thus, I'm
        # going to pickle the object, and then convert each byte of
        # the pickle to its (signed) integer value.  Then, I'm going to
        # convert the integers into strings and join them together with
        # commas.  It's not the most efficient way of storing things,
        # but it's relatively fast.
        pickled = cPickle.dumps(obj)
        return ','.join(map(str, array.array('b', pickled)))

    def _toobj(self, str):
        """Decode a string produced by _tostr back into the object."""
        codes = array.array('b', [int(field) for field in str.split(',')])
        # Let the array rebuild the raw byte string rather than calling
        # chr() on each value: the values are signed, and chr() rejects
        # the negative ones that stand for bytes >= 0x80.
        if hasattr(codes, 'tobytes'):
            raw = codes.tobytes()
        else:
            raw = codes.tostring()
        return cPickle.loads(raw)


Index = _InMemoryIndex