1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this file,
4 : * You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : /*
7 : * A counting Bloom filter implementation. This allows consumers to
8 : * do fast probabilistic "is item X in set Y?" testing which will
9 : * never answer "no" when the correct answer is "yes" (but might
10 : * incorrectly answer "yes" when the correct answer is "no").
11 : */
12 :
13 : #ifndef mozilla_BloomFilter_h_
14 : #define mozilla_BloomFilter_h_
15 :
16 : #include "mozilla/Likely.h"
17 : #include "mozilla/StandardInteger.h"
18 : #include "mozilla/Util.h"
19 :
20 : #include <string.h>
21 :
22 : namespace mozilla {
23 :
24 : /*
25 : * This class implements a counting Bloom filter as described at
26 : * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
27 : * 8-bit counters. This allows quick probabilistic answers to the
28 : * question "is object X in set Y?" where the contents of Y might not
29 : * be time-invariant. The probabilistic nature of the test means that
30 : * sometimes the answer will be "yes" when it should be "no". If the
31 : * answer is "no", then X is guaranteed not to be in Y.
32 : *
33 : * The filter is parametrized on KeySize, which is the size of the key
34 : * generated by each of hash functions used by the filter, in bits,
35 : * and the type of object T being added and removed. T must implement
36 : * a |uint32_t hash() const| method which returns a uint32_t hash key
37 : * that will be used to generate the two separate hash functions for
38 : * the Bloom filter. This hash key MUST be well-distributed for good
39 : * results! KeySize is not allowed to be larger than 16.
40 : *
41 : * The filter uses exactly 2**KeySize bytes of memory. From now on we
42 : * will refer to the memory used by the filter as M.
43 : *
44 : * The expected rate of incorrect "yes" answers depends on M and on
45 : * the number N of objects in set Y. As long as N is small compared
46 : * to M, the rate of such answers is expected to be approximately
47 : * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
48 : * elements then using a KeySize of 12 gives a reasonably low
49 : * incorrect answer rate. A KeySize of 12 has the additional benefit
50 : * of using exactly one page for the filter in typical hardware
51 : * configurations.
52 : */
53 :
54 : template<unsigned KeySize, class T>
55 : class BloomFilter {
56 : /*
57 : * A counting Bloom filter with 8-bit counters. For now we assume
58 : * that having two hash functions is enough, but we may revisit that
59 : * decision later.
60 : *
61 : * The filter uses an array with 2**KeySize entries.
62 : *
63 : * Assuming a well-distributed hash function, a Bloom filter with
64 : * array size M containing N elements and
65 : * using k hash function has expected false positive rate exactly
66 : *
67 : * $ (1 - (1 - 1/M)^{kN})^k $
68 : *
69 : * because each array slot has a
70 : *
71 : * $ (1 - 1/M)^{kN} $
72 : *
73 : * chance of being 0, and the expected false positive rate is the
74 : * probability that all of the k hash functions will hit a nonzero
75 : * slot.
76 : *
77 : * For reasonable assumptions (M large, kN large, which should both
78 : * hold if we're worried about false positives) about M and kN this
79 : * becomes approximately
80 : *
81 : * $$ (1 - \exp(-kN/M))^k $$
82 : *
83 : * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
84 : * or in other words
85 : *
86 : * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
87 : *
88 : * where r is the false positive rate. This can be used to compute
89 : * the desired KeySize for a given load N and false positive rate r.
90 : *
91 : * If N/M is assumed small, then the false positive rate can
92 : * further be approximated as 4*N^2/M^2. So increasing KeySize by
93 : * 1, which doubles M, reduces the false positive rate by about a
94 : * factor of 4, and a false positive rate of 1% corresponds to
95 : * about M/N == 20.
96 : *
97 : * What this means in practice is that for a few hundred keys using a
98 : * KeySize of 12 gives false positive rates on the order of 0.25-4%.
99 : *
100 : * Similarly, using a KeySize of 10 would lead to a 4% false
101 : * positive rate for N == 100 and to quite bad false positive
102 : * rates for larger N.
103 : */
104 : public:
105 1 : BloomFilter() {
106 : MOZ_STATIC_ASSERT(KeySize <= keyShift, "KeySize too big");
107 :
108 : // Should we have a custom operator new using calloc instead and
109 : // require that we're allocated via the operator?
110 1 : clear();
111 1 : }
112 :
113 : /*
114 : * Clear the filter. This should be done before reusing it, because
115 : * just removing all items doesn't clear counters that hit the upper
116 : * bound.
117 : */
118 : void clear();
119 :
120 : /*
121 : * Add an item to the filter.
122 : */
123 : void add(const T* t);
124 :
125 : /*
126 : * Remove an item from the filter.
127 : */
128 : void remove(const T* t);
129 :
130 : /*
131 : * Check whether the filter might contain an item. This can
132 : * sometimes return true even if the item is not in the filter,
133 : * but will never return false for items that are actually in the
134 : * filter.
135 : */
136 : bool mightContain(const T* t) const;
137 :
138 : /*
139 : * Methods for add/remove/contain when we already have a hash computed
140 : */
141 : void add(uint32_t hash);
142 : void remove(uint32_t hash);
143 : bool mightContain(uint32_t hash) const;
144 :
145 : private:
146 : static const size_t arraySize = (1 << KeySize);
147 : static const uint32_t keyMask = (1 << KeySize) - 1;
148 : static const uint32_t keyShift = 16;
149 :
150 1036 : static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
151 1034 : static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
152 :
153 1024 : uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
154 1024 : uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
155 12 : const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
156 10 : const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
157 :
158 2048 : static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
159 :
160 : uint8_t counters[arraySize];
161 : };
162 :
163 : template<unsigned KeySize, class T>
164 : inline void
165 2 : BloomFilter<KeySize, T>::clear()
166 : {
167 2 : memset(counters, 0, arraySize);
168 2 : }
169 :
170 : template<unsigned KeySize, class T>
171 : inline void
172 512 : BloomFilter<KeySize, T>::add(uint32_t hash)
173 : {
174 512 : uint8_t& slot1 = firstSlot(hash);
175 512 : if (MOZ_LIKELY(!full(slot1)))
176 256 : ++slot1;
177 :
178 512 : uint8_t& slot2 = secondSlot(hash);
179 512 : if (MOZ_LIKELY(!full(slot2)))
180 511 : ++slot2;
181 512 : }
182 :
183 : template<unsigned KeySize, class T>
184 : MOZ_ALWAYS_INLINE void
185 512 : BloomFilter<KeySize, T>::add(const T* t)
186 : {
187 512 : uint32_t hash = t->hash();
188 512 : return add(hash);
189 : }
190 :
191 : template<unsigned KeySize, class T>
192 : inline void
193 512 : BloomFilter<KeySize, T>::remove(uint32_t hash)
194 : {
195 : // If the slots are full, we don't know whether we bumped them to be
196 : // there when we added or not, so just leave them full.
197 512 : uint8_t& slot1 = firstSlot(hash);
198 512 : if (MOZ_LIKELY(!full(slot1)))
199 2 : --slot1;
200 :
201 512 : uint8_t& slot2 = secondSlot(hash);
202 512 : if (MOZ_LIKELY(!full(slot2)))
203 255 : --slot2;
204 512 : }
205 :
206 : template<unsigned KeySize, class T>
207 : MOZ_ALWAYS_INLINE void
208 512 : BloomFilter<KeySize, T>::remove(const T* t)
209 : {
210 512 : uint32_t hash = t->hash();
211 512 : remove(hash);
212 512 : }
213 :
214 : template<unsigned KeySize, class T>
215 : MOZ_ALWAYS_INLINE bool
216 12 : BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
217 : {
218 : // Check that all the slots for this hash contain something
219 12 : return firstSlot(hash) && secondSlot(hash);
220 : }
221 :
222 : template<unsigned KeySize, class T>
223 : MOZ_ALWAYS_INLINE bool
224 12 : BloomFilter<KeySize, T>::mightContain(const T* t) const
225 : {
226 12 : uint32_t hash = t->hash();
227 12 : return mightContain(hash);
228 : }
229 :
230 : } // namespace mozilla
231 :
232 : #endif /* mozilla_BloomFilter_h_ */
|