Back to index

enigmail  1.4.3
BloomFilter.h
Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
00002 /* This Source Code Form is subject to the terms of the Mozilla Public
00003  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
00004  * You can obtain one at http://mozilla.org/MPL/2.0/. */
00005 
00006 /*
00007  * A counting Bloom filter implementation.  This allows consumers to
00008  * do fast probabilistic "is item X in set Y?" testing which will
00009  * never answer "no" when the correct answer is "yes" (but might
00010  * incorrectly answer "yes" when the correct answer is "no").
00011  */
00012 
00013 #ifndef mozilla_BloomFilter_h_
00014 #define mozilla_BloomFilter_h_
00015 
00016 #include "mozilla/Likely.h"
00017 #include "mozilla/StandardInteger.h"
00018 #include "mozilla/Util.h"
00019 
00020 #include <string.h>
00021 
00022 namespace mozilla {
00023 
00024 /*
00025  * This class implements a counting Bloom filter as described at
00026  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
00027  * 8-bit counters.  This allows quick probabilistic answers to the
00028  * question "is object X in set Y?" where the contents of Y might not
00029  * be time-invariant.  The probabilistic nature of the test means that
00030  * sometimes the answer will be "yes" when it should be "no".  If the
00031  * answer is "no", then X is guaranteed not to be in Y.
00032  *
00033  * The filter is parametrized on KeySize, which is the size of the key
00034  * generated by each of hash functions used by the filter, in bits,
00035  * and the type of object T being added and removed.  T must implement
00036  * a |uint32_t hash() const| method which returns a uint32_t hash key
00037  * that will be used to generate the two separate hash functions for
00038  * the Bloom filter.  This hash key MUST be well-distributed for good
00039  * results!  KeySize is not allowed to be larger than 16.
00040  *
00041  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
00042  * will refer to the memory used by the filter as M.
00043  *
00044  * The expected rate of incorrect "yes" answers depends on M and on
00045  * the number N of objects in set Y.  As long as N is small compared
00046  * to M, the rate of such answers is expected to be approximately
00047  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
00048  * elements then using a KeySize of 12 gives a reasonably low
00049  * incorrect answer rate.  A KeySize of 12 has the additional benefit
00050  * of using exactly one page for the filter in typical hardware
00051  * configurations.
00052  */
00053 
00054 template<unsigned KeySize, class T>
00055 class BloomFilter {
00056     /*
00057      * A counting Bloom filter with 8-bit counters.  For now we assume
00058      * that having two hash functions is enough, but we may revisit that
00059      * decision later.
00060      *
00061      * The filter uses an array with 2**KeySize entries.
00062      *
00063      * Assuming a well-distributed hash function, a Bloom filter with
00064      * array size M containing N elements and
00065      * using k hash function has expected false positive rate exactly
00066      *
00067      * $  (1 - (1 - 1/M)^{kN})^k  $
00068      *
00069      * because each array slot has a
00070      *
00071      * $  (1 - 1/M)^{kN}  $
00072      *
00073      * chance of being 0, and the expected false positive rate is the
00074      * probability that all of the k hash functions will hit a nonzero
00075      * slot.
00076      *
00077      * For reasonable assumptions (M large, kN large, which should both
00078      * hold if we're worried about false positives) about M and kN this
00079      * becomes approximately
00080      *
00081      * $$  (1 - \exp(-kN/M))^k   $$
00082      *
00083      * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
00084      * or in other words
00085      *
00086      * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
00087      *
00088      * where r is the false positive rate.  This can be used to compute
00089      * the desired KeySize for a given load N and false positive rate r.
00090      *
00091      * If N/M is assumed small, then the false positive rate can
00092      * further be approximated as 4*N^2/M^2.  So increasing KeySize by
00093      * 1, which doubles M, reduces the false positive rate by about a
00094      * factor of 4, and a false positive rate of 1% corresponds to
00095      * about M/N == 20.
00096      *
00097      * What this means in practice is that for a few hundred keys using a
00098      * KeySize of 12 gives false positive rates on the order of 0.25-4%.
00099      *
00100      * Similarly, using a KeySize of 10 would lead to a 4% false
00101      * positive rate for N == 100 and to quite bad false positive
00102      * rates for larger N.
00103      */
00104 public:
00105     BloomFilter() {
00106         MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
00107 
00108         // Should we have a custom operator new using calloc instead and
00109         // require that we're allocated via the operator?
00110         clear();
00111     }
00112 
00113     /*
00114      * Clear the filter.  This should be done before reusing it, because
00115      * just removing all items doesn't clear counters that hit the upper
00116      * bound.
00117      */
00118     void clear();
00119 
00120     /*
00121      * Add an item to the filter.
00122      */
00123     void add(const T* t);
00124 
00125     /*
00126      * Remove an item from the filter.
00127      */
00128     void remove(const T* t);
00129 
00130     /*
00131      * Check whether the filter might contain an item.  This can
00132      * sometimes return true even if the item is not in the filter,
00133      * but will never return false for items that are actually in the
00134      * filter.
00135      */
00136     bool mightContain(const T* t) const;
00137 
00138     /*
00139      * Methods for add/remove/contain when we already have a hash computed
00140      */
00141     void add(uint32_t hash);
00142     void remove(uint32_t hash);
00143     bool mightContain(uint32_t hash) const;
00144 
00145 private:
00146     static const size_t arraySize = (1 << KeySize);
00147     static const uint32_t keyMask = (1 << KeySize) - 1;
00148     static const uint32_t keyShift = 16;
00149 
00150     static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
00151     static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
00152 
00153     uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
00154     uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
00155     const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
00156     const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
00157 
00158     static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
00159 
00160     uint8_t counters[arraySize];
00161 };
00162 
00163 template<unsigned KeySize, class T>
00164 inline void
00165 BloomFilter<KeySize, T>::clear()
00166 {
00167     memset(counters, 0, arraySize);
00168 }
00169 
00170 template<unsigned KeySize, class T>
00171 inline void
00172 BloomFilter<KeySize, T>::add(uint32_t hash)
00173 {
00174     uint8_t& slot1 = firstSlot(hash);
00175     if (MOZ_LIKELY(!full(slot1)))
00176         ++slot1;
00177 
00178     uint8_t& slot2 = secondSlot(hash);
00179     if (MOZ_LIKELY(!full(slot2)))
00180         ++slot2;
00181 }
00182 
00183 template<unsigned KeySize, class T>
00184 MOZ_ALWAYS_INLINE void
00185 BloomFilter<KeySize, T>::add(const T* t)
00186 {
00187     uint32_t hash = t->hash();
00188     return add(hash);
00189 }
00190 
00191 template<unsigned KeySize, class T>
00192 inline void
00193 BloomFilter<KeySize, T>::remove(uint32_t hash)
00194 {
00195     // If the slots are full, we don't know whether we bumped them to be
00196     // there when we added or not, so just leave them full.
00197     uint8_t& slot1 = firstSlot(hash);
00198     if (MOZ_LIKELY(!full(slot1)))
00199         --slot1;
00200 
00201     uint8_t& slot2 = secondSlot(hash);
00202     if (MOZ_LIKELY(!full(slot2)))
00203         --slot2;
00204 }
00205 
00206 template<unsigned KeySize, class T>
00207 MOZ_ALWAYS_INLINE void
00208 BloomFilter<KeySize, T>::remove(const T* t)
00209 {
00210     uint32_t hash = t->hash();
00211     remove(hash);
00212 }
00213 
00214 template<unsigned KeySize, class T>
00215 MOZ_ALWAYS_INLINE bool
00216 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
00217 {
00218     // Check that all the slots for this hash contain something
00219     return firstSlot(hash) && secondSlot(hash);
00220 }
00221 
00222 template<unsigned KeySize, class T>
00223 MOZ_ALWAYS_INLINE bool
00224 BloomFilter<KeySize, T>::mightContain(const T* t) const
00225 {
00226     uint32_t hash = t->hash();
00227     return mightContain(hash);
00228 }
00229 
00230 } // namespace mozilla
00231 
00232 #endif /* mozilla_BloomFilter_h_ */