Back to index

obnam  1.1
clientmetadatatree.py
Go to the documentation of this file.
00001 # Copyright 2010  Lars Wirzenius
00002 # 
00003 # This program is free software: you can redistribute it and/or modify
00004 # it under the terms of the GNU General Public License as published by
00005 # the Free Software Foundation, either version 3 of the License, or
00006 # (at your option) any later version.
00007 # 
00008 # This program is distributed in the hope that it will be useful,
00009 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00010 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011 # GNU General Public License for more details.
00012 # 
00013 # You should have received a copy of the GNU General Public License
00014 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
00015 
00016 
00017 import hashlib
00018 import logging
00019 import os
00020 import random
00021 import struct
00022 import tracing
00023 
00024 import obnamlib
00025 
00026 
00027 class ClientMetadataTree(obnamlib.RepositoryTree):
00028 
00029     '''Store per-client metadata about files.
00030     
00031     Actual file contents is stored elsewhere, this stores just the 
00032     metadata about files: names, inode info, and what chunks of
00033     data they use.
00034     
00035     See http://braawi.org/obnam/ondisk/ for a description of how
00036     this works.
00037     
00038     '''
00039     
00040     # Filesystem metadata.
00041     PREFIX_FS_META = 0      # prefix
00042     FILE_NAME = 0           # subkey type for storing pathnames
00043     FILE_CHUNKS = 1         # subkey type for list of chunks
00044     FILE_METADATA = 3       # subkey type for inode fields, etc
00045     DIR_CONTENTS = 4        # subkey type for list of directory contents
00046     FILE_DATA = 5           # subkey type for file data (instead of chunk)
00047     
00048     FILE_METADATA_ENCODED = 0 # subkey value for encoded obnamlib.Metadata().
00049     
00050     # References to chunks in this generation.
00051     # Main key is the chunk id, subkey type is always 0, subkey is file id
00052     # for file that uses the chunk.
00053     PREFIX_CHUNK_REF = 1
00054     
00055     # Metadata about the generation. The main key is always the hash of
00056     # 'generation', subkey type field is always 0.
00057     PREFIX_GEN_META = 2     # prefix
00058     GEN_ID = 0              # subkey type for generation id
00059     GEN_STARTED = 1         # subkey type for when generation was started
00060     GEN_ENDED = 2           # subkey type for when generation was ended
00061     GEN_IS_CHECKPOINT = 3   # subkey type for whether generation is checkpoint
00062     GEN_FILE_COUNT = 4      # subkey type for count of files+dirs in generation
00063     GEN_TOTAL_DATA = 5      # subkey type for sum of all file sizes in gen
00064     
00065     # Maximum values for the subkey type field, and the subkey field.
00066     # Both have a minimum value of 0.
00067 
00068     TYPE_MAX = 255
00069     SUBKEY_MAX = struct.pack('!Q', obnamlib.MAX_ID)
00070     
00071     def __init__(self, fs, client_dir, node_size, upload_queue_size, lru_size,
00072                  repo):
00073         tracing.trace('new ClientMetadataTree, client_dir=%s' % client_dir)
00074         self.current_time = repo.current_time
00075         key_bytes = len(self.hashkey(0, self.default_file_id(''), 0, 0))
00076         obnamlib.RepositoryTree.__init__(self, fs, client_dir, key_bytes, 
00077                                          node_size, upload_queue_size, 
00078                                          lru_size, repo)
00079         self.genhash = self.default_file_id('generation')
00080         self.chunkids_per_key = max(1,
00081                                     int(node_size / 4 / struct.calcsize('Q')))
00082         self.init_caches()
00083 
00084     def init_caches(self):
00085         self.known_generations = {}
00086         self.file_ids = {}
00087 
00088     def default_file_id(self, filename):
00089         '''Return hash of filename suitable for use as main key.'''
00090         tracing.trace(repr(filename))
00091         def hash(s):
00092             return hashlib.md5(s).digest()[:4]
00093         dirname = os.path.dirname(filename)
00094         basename = os.path.basename(filename)
00095         return hash(dirname) + hash(basename)
00096 
00097     def _bad_default_file_id(self, filename):
00098         '''For use by unit tests.'''
00099         return struct.pack('!Q', 0)
00100 
00101     def hashkey(self, prefix, mainhash, subtype, subkey):
00102         '''Compute a full key.
00103 
00104         The full key consists of three parts:
00105 
00106         * prefix (0 for filesystem metadata, 1 for chunk refs)
00107         * a hash of mainkey (64 bits)
00108         * the subkey type (8 bits)
00109         * type subkey (64 bits)
00110 
00111         These are catenated.
00112 
00113         mainhash must be a string of 8 bytes.
00114 
00115         subtype must be an integer in the range 0.255, inclusive.
00116 
00117         subkey must be either a string or an integer. If it is a string,
00118         it will be padded with NUL bytes at the end, if it is less than
00119         8 bytes, and truncated, if longer. If it is an integer, it will
00120         be converted as a string, and the value must fit into 64 bits.
00121 
00122         '''
00123         
00124         if type(subkey) == str:
00125             subkey = (subkey + '\0' * 8)[:8]
00126             fmt = '!B8sB8s'
00127         else:
00128             assert type(subkey) in [int, long]
00129             fmt = '!B8sBQ'
00130 
00131         return struct.pack(fmt, prefix, mainhash, subtype, subkey)
00132 
00133     def fskey(self, mainhash, subtype, subkey):
00134         ''''Generate key for filesystem metadata.'''
00135         return self.hashkey(self.PREFIX_FS_META, mainhash, subtype, subkey)
00136         
00137     def fs_unkey(self, key):
00138         '''Inverse of fskey.'''
00139         parts = struct.unpack('!B8sB8s', key)
00140         return parts[1], parts[3]
00141 
00142     def genkey(self, subkey):
00143         '''Generate key for generation metadata.'''
00144         return self.hashkey(self.PREFIX_GEN_META, self.genhash, 0, subkey)
00145 
00146     def int2bin(self, integer):
00147         '''Convert an integer to a binary string representation.'''
00148         return struct.pack('!Q', integer)
00149 
00150     def chunk_key(self, chunk_id, file_id):
00151         '''Generate a key for a chunk reference.'''
00152         return self.hashkey(self.PREFIX_CHUNK_REF, self.int2bin(chunk_id),
00153                             0, file_id)
00154 
00155     def chunk_unkey(self, key):
00156         '''Return the chunk and file ids in a chunk key.'''
00157         parts = struct.unpack('!BQBQ', key)
00158         return parts[1], parts[3]
00159 
00160     def get_file_id(self, tree, pathname):
00161         '''Return id for file in a given generation.'''
00162         
00163         if tree in self.file_ids:
00164             if pathname in self.file_ids[tree]:
00165                 return self.file_ids[tree][pathname]
00166         else:
00167             self.file_ids[tree] = {}
00168         
00169         default_file_id = self.default_file_id(pathname)
00170         minkey = self.fskey(default_file_id, self.FILE_NAME, 0)
00171         maxkey = self.fskey(default_file_id, self.FILE_NAME, obnamlib.MAX_ID)
00172         for key, value in tree.lookup_range(minkey, maxkey):
00173             def_id, file_id = self.fs_unkey(key)
00174             assert def_id == default_file_id, \
00175                 'def=%s other=%s' % (repr(def_id), repr(default_file_id))
00176             self.file_ids[tree][value] = file_id
00177             if value == pathname:
00178                 return file_id
00179 
00180         raise KeyError('%s does not yet have a file-id' % pathname)
00181 
00182     def set_file_id(self, pathname):
00183         '''Set and return the file-id for a file in current generation.'''
00184 
00185         default_file_id = self.default_file_id(pathname)
00186         minkey = self.fskey(default_file_id, self.FILE_NAME, 0)
00187         maxkey = self.fskey(default_file_id, self.FILE_NAME, obnamlib.MAX_ID)
00188         file_ids = set()
00189         for key, value in self.tree.lookup_range(minkey, maxkey):
00190             def_id, file_id = self.fs_unkey(key)
00191             assert def_id == default_file_id
00192             if value == pathname:
00193                 return file_id
00194             file_ids.add(file_id)
00195         
00196         while True:
00197             n = random.randint(0, obnamlib.MAX_ID)
00198             file_id = struct.pack('!Q', n)
00199             if file_id not in file_ids:
00200                 break
00201         
00202         key = self.fskey(default_file_id, self.FILE_NAME, file_id)
00203         self.tree.insert(key, pathname)
00204         return file_id
00205 
00206     def _lookup_int(self, tree, key):
00207         return struct.unpack('!Q', tree.lookup(key))[0]
00208 
00209     def _insert_int(self, tree, key, value):
00210         return tree.insert(key, struct.pack('!Q', value))
00211 
00212     def commit(self):
00213         tracing.trace('committing ClientMetadataTree')
00214         if self.tree:
00215             now = int(self.current_time())
00216             self._insert_int(self.tree, self.genkey(self.GEN_ENDED), now)
00217             genid = self._get_generation_id_or_None(self.tree)
00218             if genid is not None:
00219                 t = [(self.GEN_FILE_COUNT, 'file_count'),
00220                      (self.GEN_TOTAL_DATA, 'total_data')]
00221                 for subkey, attr in t:
00222                     if hasattr(self, attr):
00223                         self._insert_count(genid, subkey, getattr(self, attr))
00224         obnamlib.RepositoryTree.commit(self)
00225 
00226     def init_forest(self, *args, **kwargs):
00227         self.init_caches()
00228         return obnamlib.RepositoryTree.init_forest(self, *args, **kwargs)
00229 
00230     def start_changes(self, *args, **kwargs):
00231         self.init_caches()
00232         return obnamlib.RepositoryTree.start_changes(self, *args, **kwargs)
00233 
00234     def find_generation(self, genid):
00235     
00236         def fill_cache():
00237             key = self.genkey(self.GEN_ID)
00238             for t in self.forest.trees:
00239                 t_genid = self._lookup_int(t, key)
00240                 if t_genid == genid:
00241                     self.known_generations[genid] = t
00242                     return t
00243     
00244         if self.forest:
00245             if genid in self.known_generations:
00246                 return self.known_generations[genid]
00247             t = fill_cache()
00248             if t is not None:
00249                 return t
00250         raise KeyError('Unknown generation %s' % genid)
00251 
00252     def list_generations(self):
00253         if self.forest:
00254             genids = []
00255             for t in self.forest.trees:
00256                 genid = self._get_generation_id_or_None(t)
00257                 if genid is not None:
00258                     genids.append(genid)
00259             return genids
00260         else:
00261             return []
00262 
00263     def start_generation(self):
00264         tracing.trace('start new generation')
00265         self.start_changes()
00266         gen_id = self.forest.new_id()
00267         now = int(self.current_time())
00268         self._insert_int(self.tree, self.genkey(self.GEN_ID), gen_id)
00269         self._insert_int(self.tree, self.genkey(self.GEN_STARTED), now)
00270         self.file_count = self.get_generation_file_count(gen_id) or 0
00271         self.total_data = self.get_generation_data(gen_id) or 0
00272 
00273     def set_current_generation_is_checkpoint(self, is_checkpoint):
00274         tracing.trace('is_checkpoint=%s', is_checkpoint)
00275         value = 1 if is_checkpoint else 0
00276         key = self.genkey(self.GEN_IS_CHECKPOINT)
00277         self._insert_int(self.tree, key, value)
00278 
00279     def get_is_checkpoint(self, genid):
00280         tree = self.find_generation(genid)
00281         key = self.genkey(self.GEN_IS_CHECKPOINT)
00282         try:
00283             return self._lookup_int(tree, key)
00284         except KeyError:
00285             return 0
00286 
00287     def remove_generation(self, genid):
00288         tracing.trace('genid=%s', genid)
00289         tree = self.find_generation(genid)
00290         if tree == self.tree:
00291             self.tree = None
00292         self.forest.remove_tree(tree)
00293 
00294     def get_generation_id(self, tree):
00295         return self._lookup_int(tree, self.genkey(self.GEN_ID))
00296 
00297     def _get_generation_id_or_None(self, tree):
00298         try:
00299             return self.get_generation_id(tree)
00300         except KeyError: # pragma: no cover
00301             return None
00302 
00303     def _lookup_time(self, tree, what):
00304         try:
00305             return self._lookup_int(tree, self.genkey(what))
00306         except KeyError:
00307             return None
00308 
00309     def get_generation_times(self, genid):
00310         tree = self.find_generation(genid)
00311         return (self._lookup_time(tree, self.GEN_STARTED),
00312                 self._lookup_time(tree, self.GEN_ENDED))
00313 
00314     def get_generation_data(self, genid):
00315         return self._lookup_count(genid, self.GEN_TOTAL_DATA)
00316 
00317     def _lookup_count(self, genid, count_type):
00318         tree = self.find_generation(genid)
00319         key = self.genkey(count_type)
00320         try:
00321             return self._lookup_int(tree, key)
00322         except KeyError:
00323             return None
00324 
00325     def _insert_count(self, genid, count_type, count):
00326         tree = self.find_generation(genid)
00327         key = self.genkey(count_type)
00328         return self._insert_int(tree, key, count)
00329 
00330     def get_generation_file_count(self, genid):
00331         return self._lookup_count(genid, self.GEN_FILE_COUNT)
00332 
00333     def create(self, filename, encoded_metadata):
00334         tracing.trace('filename=%s', filename)
00335         file_id = self.set_file_id(filename)
00336         gen_id = self.get_generation_id(self.tree)
00337         try:
00338             old_metadata = self.get_metadata(gen_id, filename)
00339         except KeyError:
00340             old_metadata = None
00341             self.file_count += 1
00342         else:
00343             old = obnamlib.decode_metadata(old_metadata)
00344             if old.isfile():
00345                 self.total_data -= old.st_size or 0
00346 
00347         metadata = obnamlib.decode_metadata(encoded_metadata)
00348         if metadata.isfile():
00349             self.total_data += metadata.st_size or 0
00350 
00351         if encoded_metadata != old_metadata:
00352             tracing.trace('new or changed metadata')
00353             self.set_metadata(filename, encoded_metadata)
00354 
00355         # Add to parent's contents, unless already there.
00356         parent = os.path.dirname(filename)
00357         tracing.trace('parent=%s', parent)
00358         if parent != filename: # root dir is its own parent
00359             basename = os.path.basename(filename)
00360             parent_id = self.set_file_id(parent)
00361             key = self.fskey(parent_id, self.DIR_CONTENTS, file_id)
00362             # We could just insert, but that would cause unnecessary
00363             # churn in the tree if nothing changes.
00364             try:
00365                 self.tree.lookup(key)
00366                 tracing.trace('was already in parent') # pragma: no cover
00367             except KeyError:
00368                 self.tree.insert(key, basename)
00369                 tracing.trace('added to parent')
00370 
00371     def get_metadata(self, genid, filename):
00372         tree = self.find_generation(genid)
00373         file_id = self.get_file_id(tree, filename)
00374         key = self.fskey(file_id, self.FILE_METADATA, 
00375                          self.FILE_METADATA_ENCODED)
00376         return tree.lookup(key)
00377 
00378     def set_metadata(self, filename, encoded_metadata):
00379         tracing.trace('filename=%s', filename)
00380 
00381         file_id = self.set_file_id(filename)
00382         key1 = self.fskey(file_id, self.FILE_NAME, file_id)
00383         self.tree.insert(key1, filename)
00384         
00385         key2 = self.fskey(file_id, self.FILE_METADATA, 
00386                           self.FILE_METADATA_ENCODED)
00387         self.tree.insert(key2, encoded_metadata)
00388 
00389     def remove(self, filename):
00390         tracing.trace('filename=%s', filename)
00391 
00392         file_id = self.get_file_id(self.tree, filename)
00393         genid = self.get_generation_id(self.tree)
00394         self.file_count -= 1
00395         
00396         try:
00397             encoded_metadata = self.get_metadata(genid, filename)
00398         except KeyError:
00399             pass
00400         else:
00401             metadata = obnamlib.decode_metadata(encoded_metadata)
00402             if metadata.isfile():
00403                 self.total_data -= metadata.st_size or 0
00404 
00405         # Remove any children.
00406         minkey = self.fskey(file_id, self.DIR_CONTENTS, 0)
00407         maxkey = self.fskey(file_id, self.DIR_CONTENTS, obnamlib.MAX_ID)
00408         for key, basename in self.tree.lookup_range(minkey, maxkey):
00409             self.remove(os.path.join(filename, basename))
00410 
00411         # Remove chunk refs.
00412         for chunkid in self.get_file_chunks(genid, filename):
00413             key = self.chunk_key(chunkid, file_id)
00414             self.tree.remove_range(key, key)
00415             
00416         # Remove this file's metadata.
00417         minkey = self.fskey(file_id, 0, 0)
00418         maxkey = self.fskey(file_id, self.TYPE_MAX, self.SUBKEY_MAX)
00419         self.tree.remove_range(minkey, maxkey)
00420         
00421         # Remove filename.
00422         default_file_id = self.default_file_id(filename)
00423         key = self.fskey(default_file_id, self.FILE_NAME, file_id)
00424         self.tree.remove_range(key, key)
00425         
00426         # Also remove from parent's contents.
00427         parent = os.path.dirname(filename)
00428         if parent != filename: # root dir is its own parent
00429             parent_id = self.set_file_id(parent)
00430             key = self.fskey(parent_id, self.DIR_CONTENTS, file_id)
00431             # The range removal will work even if the key does not exist.
00432             self.tree.remove_range(key, key)
00433             
00434     def listdir(self, genid, dirname):
00435         tree = self.find_generation(genid)
00436         try:
00437             dir_id = self.get_file_id(tree, dirname)
00438         except KeyError:
00439             return []
00440         minkey = self.fskey(dir_id, self.DIR_CONTENTS, 0)
00441         maxkey = self.fskey(dir_id, self.DIR_CONTENTS, self.SUBKEY_MAX)
00442         basenames = []
00443         for key, value in tree.lookup_range(minkey, maxkey):
00444             basenames.append(value)
00445         return basenames
00446 
00447     def get_file_chunks(self, genid, filename):
00448         tree = self.find_generation(genid)
00449         try:
00450             file_id = self.get_file_id(tree, filename)
00451         except KeyError:
00452             return []
00453         minkey = self.fskey(file_id, self.FILE_CHUNKS, 0)
00454         maxkey = self.fskey(file_id, self.FILE_CHUNKS, self.SUBKEY_MAX)
00455         pairs = tree.lookup_range(minkey, maxkey)
00456         chunkids = []
00457         for key, value in pairs:
00458             chunkids.extend(self._decode_chunks(value))
00459         return chunkids
00460 
00461     def _encode_chunks(self, chunkids):
00462         fmt = '!' + ('Q' * len(chunkids))
00463         return struct.pack(fmt, *chunkids)
00464 
00465     def _decode_chunks(self, encoded):
00466         size = struct.calcsize('Q')
00467         count = len(encoded) / size
00468         fmt = '!' + ('Q' * count)
00469         return struct.unpack(fmt, encoded)
00470 
00471     def _insert_chunks(self, tree, file_id, i, chunkids):
00472         key = self.fskey(file_id, self.FILE_CHUNKS, i)
00473         encoded = self._encode_chunks(chunkids)
00474         tree.insert(key, encoded)
00475 
00476     def set_file_chunks(self, filename, chunkids):
00477         tracing.trace('filename=%s', filename)
00478         tracing.trace('chunkids=%s', repr(chunkids))
00479     
00480         file_id = self.set_file_id(filename)
00481         minkey = self.fskey(file_id, self.FILE_CHUNKS, 0)
00482         maxkey = self.fskey(file_id, self.FILE_CHUNKS, self.SUBKEY_MAX)
00483 
00484         for key, value in self.tree.lookup_range(minkey, maxkey):
00485             for chunkid in self._decode_chunks(value):
00486                 k = self.chunk_key(chunkid, file_id)
00487                 self.tree.remove_range(k, k)
00488 
00489         self.tree.remove_range(minkey, maxkey)
00490 
00491         self.append_file_chunks(filename, chunkids)
00492 
00493     def append_file_chunks(self, filename, chunkids):
00494         tracing.trace('filename=%s', filename)
00495         tracing.trace('chunkids=%s', repr(chunkids))
00496 
00497         file_id = self.set_file_id(filename)
00498 
00499         minkey = self.fskey(file_id, self.FILE_CHUNKS, 0)
00500         maxkey = self.fskey(file_id, self.FILE_CHUNKS, self.SUBKEY_MAX)
00501         i = self.tree.count_range(minkey, maxkey)
00502 
00503         while chunkids:
00504             some = chunkids[:self.chunkids_per_key]
00505             self._insert_chunks(self.tree, file_id, i, some)
00506             for chunkid in some:
00507                 self.tree.insert(self.chunk_key(chunkid, file_id), '')
00508             i += 1
00509             chunkids = chunkids[self.chunkids_per_key:]
00510 
00511     def chunk_in_use(self, gen_id, chunk_id):
00512         '''Is a chunk used by a generation?'''
00513         
00514         minkey = self.chunk_key(chunk_id, 0)
00515         maxkey = self.chunk_key(chunk_id, obnamlib.MAX_ID)
00516         t = self.find_generation(gen_id)
00517         return not t.range_is_empty(minkey, maxkey)
00518 
00519     def list_chunks_in_generation(self, gen_id):
00520         '''Return list of chunk ids used in a given generation.'''
00521         
00522         minkey = self.chunk_key(0, 0)
00523         maxkey = self.chunk_key(obnamlib.MAX_ID, obnamlib.MAX_ID)
00524         t = self.find_generation(gen_id)
00525         return list(set(self.chunk_unkey(key)[0]
00526                         for key, value in t.lookup_range(minkey, maxkey)))
00527 
00528     def set_file_data(self, filename, contents): # pragma: no cover
00529         '''Store contents of file, if small, in B-tree instead of chunk.
00530         
00531         The length of the contents should be small enough to fit in a
00532         B-tree leaf.
00533         
00534         '''
00535         tracing.trace('filename=%s' % filename)
00536         tracing.trace('contents=%s' % repr(contents))
00537         
00538         file_id = self.set_file_id(filename)
00539         key = self.fskey(file_id, self.FILE_DATA, 0)
00540         self.tree.insert(key, contents)
00541 
00542     def get_file_data(self, gen_id, filename): # pragma: no cover
00543         '''Return contents of file, if set, or None.'''
00544         tree = self.find_generation(gen_id)
00545         file_id = self.get_file_id(tree, filename)
00546         key = self.fskey(file_id, self.FILE_DATA, 0)
00547         try:
00548             return tree.lookup(key)
00549         except KeyError:
00550             return None
00551