001 /*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015 package com.google.common.hash;
016
017 import static com.google.common.base.Preconditions.checkArgument;
018 import static com.google.common.base.Preconditions.checkNotNull;
019 import static com.google.common.math.IntMath.log2;
020 import static java.lang.Math.max;
021 import static java.math.RoundingMode.CEILING;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.VisibleForTesting;
025 import com.google.common.base.Preconditions;
026 import com.google.common.hash.HashCodes.HashCodeSlicer;
027
028 import java.io.Serializable;
029
030 /**
031 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
032 * with one-sided error: if it claims that an element is contained in it, this might be in error,
033 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
034 *
035 * <p>If you are unfamiliar with Bloom filters, this nice
036 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
037 * how they work.
038 *
039 * @param <T> the type of instances that the {@code BloomFilter} accepts
040 * @author Kevin Bourrillion
041 * @author Dimitris Andreou
042 * @since 11.0
043 */
044 @Beta
045 public final class BloomFilter<T> implements Serializable {
046 /** A power of two sized bit set */
047 private final BitArray bits;
048
049 /** Number of bits required to index a bit out of {@code bits} */
050 private final int hashBitsPerSlice;
051
052 /** Number of hashes per element */
053 private final int numHashFunctions;
054
055 /** The funnel to translate T's to bytes */
056 private final Funnel<T> funnel;
057
058 /** The HashFunction that generates as many bits as this BloomFilter needs */
059 private final HashFunction hashFunction;
060
061 /**
062 * Creates a BloomFilter.
063 */
064 private BloomFilter(BitArray bits, int numHashFunctions, Funnel<T> funnel,
065 HashFunction hashFunction) {
066 Preconditions.checkArgument(numHashFunctions > 0, "numHashFunctions zero or negative");
067 this.bits = checkNotNull(bits);
068 this.numHashFunctions = numHashFunctions;
069 this.funnel = checkNotNull(funnel);
070 this.hashFunction = checkNotNull(hashFunction);
071 this.hashBitsPerSlice = log2(max(bits.size(), Long.SIZE /* minimum capacity */ ), CEILING);
072 }
073
074 /**
075 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
076 * {@code false} if this is <i>definitely</i> not the case.
077 */
078 public boolean mightContain(T instance) {
079 HashCodeSlicer slicer = HashCodes.slice(
080 hashFunction.newHasher().putObject(instance, funnel).hash(), hashBitsPerSlice);
081 for (int i = 0; i < numHashFunctions; i++) {
082 if (!bits.get(slicer.nextSlice())) {
083 return false;
084 }
085 }
086 return true;
087 }
088
089 /**
090 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
091 * {@link #mightContain(Object)} with the same element will always return {@code true}.
092 */
093 public void put(T instance) {
094 HashCodeSlicer slicer = HashCodes.slice(
095 hashFunction.newHasher().putObject(instance, funnel).hash(), hashBitsPerSlice);
096 for (int i = 0; i < numHashFunctions; i++) {
097 int nextSlice = slicer.nextSlice();
098 bits.set(nextSlice);
099 }
100 }
101
102 @VisibleForTesting int getHashCount() {
103 return numHashFunctions;
104 }
105
106 @VisibleForTesting double computeExpectedFalsePositiveRate(int insertions) {
107 return Math.pow(
108 1 - Math.exp(-numHashFunctions * ((double) insertions / (bits.size()))),
109 numHashFunctions);
110 }
111
112 // This little gem is kindly offered by kevinb
113 private static class BitArray {
114 final long[] data;
115
116 BitArray(int bits) {
117 this(new long[bits >> 6]);
118 }
119
120 // Used by serialization
121 BitArray(long[] data) {
122 checkArgument(data.length > 0, "data length is zero!");
123 this.data = data;
124 }
125
126 void set(int index) {
127 data[index >> 6] |= (1L << index);
128 }
129
130 boolean get(int index) {
131 return (data[index >> 6] & (1L << index)) != 0;
132 }
133
134 /** Number of bits */
135 int size() {
136 return data.length * Long.SIZE;
137 }
138 }
139
140 /*
141 * Cheat sheet for the factories:
142 *
143 * m: total bits
144 * n: expected insertions
145 * b: m/n, bits per insertion
146
147 * p: expected false positive probability
148 *
149 * 1) Optimal k = b * ln2
150 * 2) p = (1 - e ^ (-kn/m))^k
151 * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
152 * 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
153 *
154 * I expect the user to provide "n", and then one of {m,b,p} is needed.
155 * Providing both (n, m) and (n, b) would be confusing, so I go for the 2nd.
156 */
157
158 /**
159 * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number
160 * of insertions and expected false positive probability.
161 *
162 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
163 * than specified, will result in its saturation, and a sharp deterioration of its
164 * false positive probability.
165 *
166 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
167 * {@code Funnel<T>} is.
168 *
169 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
170 * @param expectedInsertions the number of expected insertions to the constructed
171 * {@code BloomFilter<T>}; must be positive
172 * @param falsePositiveProbability the desired false positive probability (must be positive and
173 * less than 1.0)
174 * @return a {@code Builder}
175 */
176 public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */,
177 double falsePositiveProbability) {
178 checkNotNull(funnel);
179 checkArgument(expectedInsertions > 0, "Expected insertions must be positive");
180 checkArgument(falsePositiveProbability > 0.0 & falsePositiveProbability < 1.0,
181 "False positive probability in (0.0, 1.0)");
182 /*
183 * andreou: I wanted to put a warning in the javadoc about tiny fpp values,
184 * since the resulting size is proportional to -log(p), but there is not
185 * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
186 * which is less that 10kb. Who cares!
187 */
188 int m = optimalM(expectedInsertions, falsePositiveProbability);
189 int k = optimalK(expectedInsertions, m);
190
191 BitArray bits = new BitArray(1 << log2(max(m, Long.SIZE /* minimum capacity */ ), CEILING));
192 HashFunction hashFunction = BloomFilterStrategies.From128ToN.withBits(
193 bits.size() * k, Hashing.murmur3_128());
194 return new BloomFilter<T>(bits, k, funnel, hashFunction);
195 }
196
197 /**
198 * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number
199 * of insertions, and a default expected false positive probability of 3%.
200 *
201 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
202 * than specified, will result in its saturation, and a sharp deterioration of its
203 * false positive probability.
204 *
205 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
206 * {@code Funnel<T>} is.
207 *
208 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
209 * @param expectedInsertions the number of expected insertions to the constructed
210 * {@code BloomFilter<T>}; must be positive
211 * @return a {@code Builder}
212 */
213 public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */) {
214 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
215 }
216
217
218 private static final double LN2 = Math.log(2);
219 private static final double LN2_SQUARED = LN2 * LN2;
220
221 /**
222 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
223 * expected insertions and total number of bits in the Bloom filter.
224 *
225 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
226 *
227 * @param n expected insertions (must be positive)
228 * @param m total number of bits in Bloom filter (must be positive)
229 */
230 @VisibleForTesting static int optimalK(int n, int m) {
231 return Math.max(1, (int) Math.round(m / n * LN2));
232 }
233
234 /**
235 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
236 * expected insertions, the required false positive probability.
237 *
238 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
239 *
240 * @param n expected insertions (must be positive)
241 * @param p false positive rate (must be 0 < p < 1)
242 */
243 @VisibleForTesting static int optimalM(int n, double p) {
244 return (int) (-n * Math.log(p) / LN2_SQUARED);
245 }
246
247 private Object writeReplace() {
248 return new SerialForm<T>(this);
249 }
250
251 private static class SerialForm<T> implements Serializable {
252 final long[] data;
253 final int numHashFunctions;
254 final Funnel<T> funnel;
255 final HashFunction hashFunction;
256
257 SerialForm(BloomFilter<T> bf) {
258 this.data = bf.bits.data;
259 this.numHashFunctions = bf.numHashFunctions;
260 this.funnel = bf.funnel;
261 this.hashFunction = bf.hashFunction;
262 }
263 Object readResolve() {
264 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, hashFunction);
265 }
266 private static final long serialVersionUID = 0;
267 }
268 }