Skip to content

Commit 41a6430

Browse files
arnav-aggarwaltrekhleb
authored andcommitted
Add bloom filter (trekhleb#84)
1 parent b33f1d5 commit 41a6430

File tree

5 files changed

+358
-0
lines changed

5 files changed

+358
-0
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ the data.
3838
* `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
3939
* `A` [Graph](src/data-structures/graph) (both directed and undirected)
4040
* `A` [Disjoint Set](src/data-structures/disjoint-set)
41+
* `A` [Bloom Filter](src/data-structures/bloom-filter)
4142

4243
## Algorithms
4344

@@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
231232
| **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
232233
| **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
233234
| **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
235+
| **Bloom Filter** | | 1 | 1 | | |
234236

235237
### Array Sorting Algorithms Complexity
236238

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
export default class BloomFilter {
2+
/**
3+
* @param {number} size
4+
*/
5+
constructor(size = 100) {
6+
// Bloom filter size directly affects the likelihood of false positives.
7+
// The bigger the size the lower the likelihood of false positives.
8+
this.size = size;
9+
this.storage = this.createStore(size);
10+
}
11+
12+
/**
13+
* @param {string} item
14+
*/
15+
insert(item) {
16+
const hashValues = this.getHashValues(item);
17+
18+
// Set each hashValue index to true
19+
hashValues.forEach(val => this.storage.setValue(val));
20+
}
21+
22+
/**
23+
* @param {string} item
24+
* @return {boolean}
25+
*/
26+
mayContain(item) {
27+
const hashValues = this.getHashValues(item);
28+
29+
for (let i = 0; i < hashValues.length; i += 1) {
30+
if (!this.storage.getValue(hashValues[i])) {
31+
// We know that the item was definitely not inserted.
32+
return false;
33+
}
34+
}
35+
36+
// The item may or may not have been inserted.
37+
return true;
38+
}
39+
40+
/**
41+
* Creates the data store for our filter.
42+
* We use this method to generate the store in order to
43+
* encapsulate the data itself and only provide access
44+
* to the necessary methods.
45+
*
46+
* @param {number} size
47+
* @return {Object}
48+
*/
49+
createStore(size) {
50+
const storage = [];
51+
52+
// Initialize all indexes to false
53+
for (let i = 0; i < size; i += 1) {
54+
storage.push(false);
55+
}
56+
57+
const storageInterface = {
58+
getValue(index) {
59+
return storage[index];
60+
},
61+
setValue(index) {
62+
storage[index] = true;
63+
},
64+
};
65+
66+
return storageInterface;
67+
}
68+
69+
/**
70+
* @param {string} str
71+
* @return {number}
72+
*/
73+
hash1(str) {
74+
let hash = 0;
75+
76+
for (let i = 0; i < str.length; i += 1) {
77+
const char = str.charCodeAt(i);
78+
hash = (hash << 5) + hash + char;
79+
hash &= hash; // Convert to 32bit integer
80+
hash = Math.abs(hash);
81+
}
82+
83+
return hash % this.size;
84+
}
85+
86+
/**
87+
* @param {string} str
88+
* @return {number}
89+
*/
90+
hash2(str) {
91+
let hash = 5381;
92+
93+
for (let i = 0; i < str.length; i += 1) {
94+
const char = str.charCodeAt(i);
95+
hash = (hash << 5) + hash + char; /* hash * 33 + c */
96+
}
97+
98+
return hash % this.size;
99+
}
100+
101+
/**
102+
* @param {string} str
103+
* @return {number}
104+
*/
105+
hash3(str) {
106+
let hash = 0;
107+
108+
for (let i = 0; i < str.length; i += 1) {
109+
const char = str.charCodeAt(i);
110+
hash = (hash << 5) - hash;
111+
hash += char;
112+
hash &= hash; // Convert to 32bit integer
113+
}
114+
115+
return hash % this.size;
116+
}
117+
118+
/**
119+
* Runs all 3 hash functions on the input and returns an array of results
120+
*
121+
* @param {string} str
122+
* @return {number[]}
123+
*/
124+
getHashValues(item) {
125+
return [this.hash1(item), Math.abs(this.hash2(item)), Math.abs(this.hash3(item))];
126+
}
127+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Bloom Filter
2+
3+
A bloom filter is a data structure designed to
4+
test whether an element is present in a set. It
5+
is designed to be blazingly fast and use minimal
6+
memory at the cost of potential false positives.
7+
8+
![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
9+
10+
## Operations
11+
12+
There are two main operations a bloom filter can
13+
perform: insertion and search. Search may result in
14+
false positives. Deletion is not possible.
15+
16+
In other words, the filter can take in items. When
17+
we go to check if an item has previously been
18+
inserted, it can tell us either "no" or "maybe".
19+
20+
Both insertion and search are O(1) operations.
21+
22+
## Making the filter
23+
24+
A bloom filter is created by allotting a certain size.
25+
In our example, we use 100 as a default length. All
26+
locations are initialized to `false`.
27+
28+
### Insertion
29+
30+
During insertion, a number of hash functions,
31+
in our case 3 hash functions, are used to create
32+
hashes of the input. These hash functions output
33+
indexes. At every index received, we simply change
34+
the value in our bloom filter to `true`.
35+
36+
### Search
37+
38+
During a search, the same hash functions are called
39+
and used to hash the input. We then check if the
40+
indexes received _all_ have a value of `true` inside
41+
our bloom filter. If they _all_ have a value of
42+
`true`, we know that the bloom filter may have had
43+
the value previously inserted.
44+
45+
However, it's not certain, because it's possible
46+
that other values previously inserted flipped the
47+
values to `true`. The values aren't necessarily
48+
`true` due to the item currently being searched for.
49+
Absolute certainty is impossible unless only a single
50+
item has previously been inserted.
51+
52+
While checking the bloom filter for the indexes
53+
returned by our hash functions, if even one of them
54+
has a value of `false`, we definitively know that the
55+
item was not previously inserted.
56+
57+
## False Positives
58+
59+
The probability of false positives is determined by
60+
three factors: the size of the bloom filter, the
61+
number of hash functions we use, and the number
62+
of items that have been inserted into the filter.
63+
64+
The formula to calculate probablity of a false positive is:
65+
66+
( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
67+
68+
k = # hash functions
69+
70+
m = size
71+
72+
n = # items inserted
73+
74+
These variables, k, m, and n, should be picked based
75+
on how acceptable false positives are. If the values
76+
are picked and the resulting probability is too high,
77+
the values should be tweaked and the probability
78+
re-calculated.
79+
80+
## Applications
81+
82+
A bloom filter can be used on a blogging website. If
83+
the goal is to show readers only articles that they
84+
have never seen before, a bloom filter is perfect.
85+
It can store hashed values based on the articles. After
86+
a user reads a few articles, they can be inserted into
87+
the filter. The next time the user visits the site,
88+
those articles can be filtered out of the results.
89+
90+
Some articles will inevitably be filtered out by mistake,
91+
but the cost is acceptable. It's ok if a user never sees
92+
a few articles as long as they have other, brand new ones
93+
to see every time they visit the site.
94+
95+
The popular blog site Medium does a version of this.
96+
Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
97+
98+
## References
99+
100+
- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
101+
- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
102+
- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
103+
- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
104+
- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import BloomFilter from '../BloomFilter';
2+
3+
describe('Bloom Filter', () => {
4+
let bloomFilter;
5+
const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
6+
7+
beforeEach(() => {
8+
bloomFilter = new BloomFilter();
9+
});
10+
11+
it('Should have methods named "insert" and "mayContain"', () => {
12+
expect(typeof bloomFilter.insert).toBe('function');
13+
expect(typeof bloomFilter.mayContain).toBe('function');
14+
});
15+
16+
it('Should create a new filter store with the appropriate methods', () => {
17+
const store = bloomFilter.createStore(18);
18+
expect(typeof store.getValue).toBe('function');
19+
expect(typeof store.setValue).toBe('function');
20+
});
21+
22+
it('Should hash deterministically with all 3 hash functions', () => {
23+
const str = 'abc';
24+
expect(bloomFilter.hash1(str)).toEqual(bloomFilter.hash1(str));
25+
expect(bloomFilter.hash2(str)).toEqual(bloomFilter.hash2(str));
26+
expect(bloomFilter.hash3(str)).toEqual(bloomFilter.hash3(str));
27+
});
28+
29+
it('Should create an array with 3 hash values', () => {
30+
expect(bloomFilter.getHashValues('abc').length).toEqual(3);
31+
});
32+
33+
it('Should insert strings correctly and return true when checking for inserted values', () => {
34+
people.forEach(person => bloomFilter.insert(person));
35+
expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
36+
expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
37+
expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
38+
});
39+
});
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import BloomFilter from '../BloomFilter';
2+
3+
// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
4+
function makeID() {
5+
const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
6+
let id = '';
7+
8+
for (let i = 0; i < 10; i += 1) {
9+
const randomLength = Math.random() * possible.length;
10+
const randomIndex = Math.floor(randomLength);
11+
id += possible.charAt(randomIndex);
12+
}
13+
14+
return id;
15+
}
16+
17+
function run10kTrials(numRandomTests = 1000) {
18+
const bloomFilter = new BloomFilter();
19+
const mockPeopleIDs = [];
20+
21+
for (let i = 0; i < 10; i += 1) {
22+
mockPeopleIDs.push(makeID());
23+
}
24+
25+
mockPeopleIDs.forEach(id => bloomFilter.insert(id));
26+
let numFalsePositives = 0;
27+
28+
for (let index = 0; index < numRandomTests; index += 1) {
29+
const randomID = makeID();
30+
if (bloomFilter.mayContain(randomID)) {
31+
numFalsePositives += 1;
32+
}
33+
}
34+
35+
return numFalsePositives;
36+
}
37+
38+
function testFilter(numTrials = 100) {
39+
const results = [];
40+
41+
for (let i = 0; i < numTrials; i += 1) {
42+
results.push(run10kTrials());
43+
}
44+
45+
const sum = results.reduce((cumulative, next) => cumulative + next, 0);
46+
return sum / numTrials;
47+
}
48+
49+
describe('Bloom filter false positives', () => {
50+
const falsePositiveProbability = 0.0174;
51+
const expectedFalsePositives = falsePositiveProbability * 1000;
52+
const avgFalsePositives = testFilter();
53+
54+
it(`Should keep false positives close to an expected value:
55+
56+
# trials = 1000
57+
k = 3 (hash functions)
58+
m = 100 (size)
59+
n = 10 (items inserted)
60+
61+
Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=3&p=&m=18&k=3
62+
Chance of false positive = 0.017
63+
64+
Expected false positives = # trials * chance of false positive
65+
Expected false positives => 1000 * ${falsePositiveProbability}
66+
Expected false positives => ${expectedFalsePositives}
67+
68+
**************************
69+
EXPECTED = ${expectedFalsePositives}
70+
ACTUAL AVG = ${avgFalsePositives}
71+
**************************
72+
73+
If the expected and actual numbers are far off, something is wrong.
74+
Inspect manually.`, () => {
75+
// We give it a large range to avoid unnecessary failures.
76+
// If it's working correctly, the value should definitely
77+
// fall within this range.
78+
79+
// In over 1,000 test runs, none of them ever come close
80+
// to falling outside of this range.
81+
const upperLimit = expectedFalsePositives + 5;
82+
const lowerLimit = expectedFalsePositives - 5;
83+
expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
84+
expect(avgFalsePositives).toBeLessThan(upperLimit);
85+
});
86+
});

0 commit comments

Comments
 (0)