LCOV - code coverage report
Current view: directory - objdir/dist/include/mozilla - BloomFilter.h (source / functions) Found Hit Coverage
Test: app.info Lines: 41 41 100.0 %
Date: 2012-06-02 Functions: 27 15 55.6 %

       1                 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2                 : /* This Source Code Form is subject to the terms of the Mozilla Public
       3                 :  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
       4                 :  * You can obtain one at http://mozilla.org/MPL/2.0/. */
       5                 : 
       6                 : /*
       7                 :  * A counting Bloom filter implementation.  This allows consumers to
       8                 :  * do fast probabilistic "is item X in set Y?" testing which will
       9                 :  * never answer "no" when the correct answer is "yes" (but might
      10                 :  * incorrectly answer "yes" when the correct answer is "no").
      11                 :  */
      12                 : 
      13                 : #ifndef mozilla_BloomFilter_h_
      14                 : #define mozilla_BloomFilter_h_
      15                 : 
      16                 : #include "mozilla/Likely.h"
      17                 : #include "mozilla/StandardInteger.h"
      18                 : #include "mozilla/Util.h"
      19                 : 
      20                 : #include <string.h>
      21                 : 
      22                 : namespace mozilla {
      23                 : 
      24                 : /*
      25                 :  * This class implements a counting Bloom filter as described at
      26                 :  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
      27                 :  * 8-bit counters.  This allows quick probabilistic answers to the
      28                 :  * question "is object X in set Y?" where the contents of Y might not
      29                 :  * be time-invariant.  The probabilistic nature of the test means that
      30                 :  * sometimes the answer will be "yes" when it should be "no".  If the
      31                 :  * answer is "no", then X is guaranteed not to be in Y.
      32                 :  *
      33                 :  * The filter is parametrized on KeySize, which is the size of the key
      34                 :  * generated by each of the hash functions used by the filter, in bits,
      35                 :  * and the type of object T being added and removed.  T must implement
      36                 :  * a |uint32_t hash() const| method which returns a uint32_t hash key
      37                 :  * that will be used to generate the two separate hash functions for
      38                 :  * the Bloom filter.  This hash key MUST be well-distributed for good
      39                 :  * results!  KeySize is not allowed to be larger than 16.
      40                 :  *
      41                 :  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
      42                 :  * will refer to the memory used by the filter as M.
      43                 :  *
      44                 :  * The expected rate of incorrect "yes" answers depends on M and on
      45                 :  * the number N of objects in set Y.  As long as N is small compared
      46                 :  * to M, the rate of such answers is expected to be approximately
      47                 :  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
      48                 :  * elements then using a KeySize of 12 gives a reasonably low
      49                 :  * incorrect answer rate.  A KeySize of 12 has the additional benefit
      50                 :  * of using exactly one page for the filter in typical hardware
      51                 :  * configurations.
      52                 :  */
      53                 : 
      54                 : template<unsigned KeySize, class T>
      55                 : class BloomFilter {
      56                 :     /*
      57                 :      * A counting Bloom filter with 8-bit counters.  For now we assume
      58                 :      * that having two hash functions is enough, but we may revisit that
      59                 :      * decision later.
      60                 :      *
      61                 :      * The filter uses an array with 2**KeySize entries.
      62                 :      *
      63                 :      * Assuming a well-distributed hash function, a Bloom filter with
      64                 :      * array size M containing N elements and
      65                 :      * using k hash functions has an expected false positive rate of exactly
      66                 :      *
      67                 :      * $  (1 - (1 - 1/M)^{kN})^k  $
      68                 :      *
      69                 :      * because each array slot has a
      70                 :      *
      71                 :      * $  (1 - 1/M)^{kN}  $
      72                 :      *
      73                 :      * chance of being 0, and the expected false positive rate is the
      74                 :      * probability that all of the k hash functions will hit a nonzero
      75                 :      * slot.
      76                 :      *
      77                 :      * For reasonable assumptions (M large, kN large, which should both
      78                 :      * hold if we're worried about false positives) about M and kN this
      79                 :      * becomes approximately
      80                 :      *
      81                 :      * $$  (1 - \exp(-kN/M))^k   $$
      82                 :      *
      83                 :      * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
      84                 :      * or in other words
      85                 :      *
      86                 :      * $$    N/M = -0.5 * \ln(1 - \sqrt{r})   $$
      87                 :      *
      88                 :      * where r is the false positive rate.  This can be used to compute
      89                 :      * the desired KeySize for a given load N and false positive rate r.
      90                 :      *
      91                 :      * If N/M is assumed small, then the false positive rate can
      92                 :      * further be approximated as 4*N^2/M^2.  So increasing KeySize by
      93                 :      * 1, which doubles M, reduces the false positive rate by about a
      94                 :      * factor of 4, and a false positive rate of 1% corresponds to
      95                 :      * about M/N == 20.
      96                 :      *
      97                 :      * What this means in practice is that for a few hundred keys using a
      98                 :      * KeySize of 12 gives false positive rates on the order of 0.25-4%.
      99                 :      *
     100                 :      * Similarly, using a KeySize of 10 would lead to a 4% false
     101                 :      * positive rate for N == 100 and to quite bad false positive
     102                 :      * rates for larger N.
     103                 :      */
     104                 : public:
     105               1 :     BloomFilter() {
     106                 :         MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big"); // keeps the bit ranges read by hash1() and hash2() disjoint
     107                 : 
     108                 :         // Should we have a custom operator new using calloc instead and
     109                 :         // require that we're allocated via the operator?
     110               1 :         clear();
     111               1 :     }
     112                 : 
     113                 :     /*
     114                 :      * Clear the filter.  This should be done before reusing it, because
     115                 :      * just removing all items doesn't clear counters that hit the upper
     116                 :      * bound.
     117                 :      */
     118                 :     void clear();
     119                 : 
     120                 :     /*
     121                 :      * Add an item to the filter; counters already at UINT8_MAX are left as they are.
     122                 :      */
     123                 :     void add(const T* t);
     124                 : 
     125                 :     /*
     126                 :      * Remove an item from the filter; counters at UINT8_MAX are never decremented.
     127                 :      */
     128                 :     void remove(const T* t);
     129                 : 
     130                 :     /*
     131                 :      * Check whether the filter might contain an item.  This can
     132                 :      * sometimes return true even if the item is not in the filter,
     133                 :      * but will never return false for items that are actually in the
     134                 :      * filter.
     135                 :      */
     136                 :     bool mightContain(const T* t) const;
     137                 : 
     138                 :     /*
     139                 :      * Overloads of add/remove/mightContain for an already-computed hash.
     140                 :      */
     141                 :     void add(uint32_t hash);
     142                 :     void remove(uint32_t hash);
     143                 :     bool mightContain(uint32_t hash) const;
     144                 : 
     145                 : private:
     146                 :     static const size_t arraySize = (1 << KeySize); // number of counters: 2**KeySize
     147                 :     static const uint32_t keyMask = (1 << KeySize) - 1; // low KeySize bits set
     148                 :     static const uint32_t keyShift = 16; // bit offset at which hash2() starts reading the hash
     149                 :     // The two derived keys: KeySize bits of the hash taken at bit 0 and at bit keyShift.
     150            1036 :     static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
     151            1034 :     static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
     152                 :     // Counters selected by each derived key (non-const and const overloads).
     153            1024 :     uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
     154            1024 :     uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
     155              12 :     const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
     156              10 :     const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
     157                 :     // A counter that has reached UINT8_MAX is saturated; add() and remove() leave it alone.
     158            2048 :     static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
     159                 :     // One 8-bit counter per possible key value.
     160                 :     uint8_t counters[arraySize];
     161                 : };
     162                 : 
     163                 : template<unsigned KeySize, class T>
     164                 : inline void
     165               2 : BloomFilter<KeySize, T>::clear()
     166                 : { // Reset every counter in the filter to zero.
     167               2 :     memset(counters, 0, arraySize); // counters are one byte each, so arraySize is also the byte count
     168               2 : }
     169                 : 
     170                 : template<unsigned KeySize, class T>
     171                 : inline void
     172             512 : BloomFilter<KeySize, T>::add(uint32_t hash)
     173                 : { // Record one occurrence of |hash| by bumping both counters it maps to.
     174             512 :     uint8_t& slot1 = firstSlot(hash);
     175             512 :     if (MOZ_LIKELY(!full(slot1))) // a saturated counter stays at UINT8_MAX instead of wrapping
     176             256 :         ++slot1;
     177                 :     // Same for the second hash function; if both map to one counter it is bumped twice.
     178             512 :     uint8_t& slot2 = secondSlot(hash);
     179             512 :     if (MOZ_LIKELY(!full(slot2)))
     180             511 :         ++slot2;
     181             512 : }
     182                 : 
     183                 : template<unsigned KeySize, class T>
     184                 : MOZ_ALWAYS_INLINE void
     185             512 : BloomFilter<KeySize, T>::add(const T* t)
     186                 : { // Hash |t| and add the result; |t| is dereferenced without a null check.
     187             512 :     uint32_t hash = t->hash();
     188             512 :     return add(hash); // add(uint32_t) returns void, so this is just a call followed by return
     189                 : }
     190                 : 
     191                 : template<unsigned KeySize, class T>
     192                 : inline void
     193             512 : BloomFilter<KeySize, T>::remove(uint32_t hash)
     194                 : { // Undo one add(hash) by decrementing both counters that |hash| maps to.
     195                 :     // If the slots are full, we don't know whether we bumped them to be
     196                 :     // there when we added or not, so just leave them full.
     197             512 :     uint8_t& slot1 = firstSlot(hash);
     198             512 :     if (MOZ_LIKELY(!full(slot1)))
     199               2 :         --slot1;
     200                 :     // NOTE(review): nothing guards a counter that is already 0; decrementing it would wrap to UINT8_MAX.
     201             512 :     uint8_t& slot2 = secondSlot(hash);
     202             512 :     if (MOZ_LIKELY(!full(slot2)))
     203             255 :         --slot2;
     204             512 : }
     205                 : 
     206                 : template<unsigned KeySize, class T>
     207                 : MOZ_ALWAYS_INLINE void
     208             512 : BloomFilter<KeySize, T>::remove(const T* t)
     209                 : { // Hash |t| and remove the result; |t| is dereferenced without a null check.
     210             512 :     uint32_t hash = t->hash();
     211             512 :     remove(hash);
     212             512 : }
     213                 : 
     214                 : template<unsigned KeySize, class T>
     215                 : MOZ_ALWAYS_INLINE bool
     216              12 : BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
     217                 : {
     218                 :     // True only when both counters for this hash are nonzero; the second is not read if the first is 0.
     219              12 :     return firstSlot(hash) && secondSlot(hash);
     220                 : }
     221                 : 
     222                 : template<unsigned KeySize, class T>
     223                 : MOZ_ALWAYS_INLINE bool
     224              12 : BloomFilter<KeySize, T>::mightContain(const T* t) const
     225                 : { // Hash |t| and test the result; |t| is dereferenced without a null check.
     226              12 :     uint32_t hash = t->hash();
     227              12 :     return mightContain(hash);
     228                 : }
     229                 : 
     230                 : } // namespace mozilla
     231                 : 
     232                 : #endif /* mozilla_BloomFilter_h_ */

Generated by: LCOV version 1.7