-
Notifications
You must be signed in to change notification settings - Fork 568
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
703b13c
commit 41dcaf9
Showing
3 changed files
with
121 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import mmh3 | ||
from bitarray import bitarray | ||
|
||
|
||
class BloomFilter:
    """
    Simple bloom filter implementation capable of roughly 200K lookups/s.

    BBOT uses bloom filters in scenarios like dns brute-forcing, where it's useful to keep track
    of which mutations have been tried so far.

    A 100-megabyte bloom filter (800M bits) can store 10M entries with a .01% false-positive rate.
    A python hash is 36 bytes. So if you wanted to store these in a set, this would take up
    36 * 10M * 2 (key+value) == 720 megabytes. So we save roughly 7 times the space.

    Membership answers are probabilistic: an item that was added is always reported as
    present, but an item that was never added may occasionally be reported as present too.
    """

    # 32-bit FNV-1a parameters.
    _FNV_OFFSET_BASIS = 0x811C9DC5  # 2166136261
    _FNV_PRIME = 0x01000193  # 16777619
    _UINT32_MASK = 0xFFFFFFFF

    def __init__(self, size=2**16):
        """
        Create an empty filter.

        Args:
            size: Number of bits in the filter. Must be positive.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        # Fail early: a zero-sized filter would otherwise only blow up later with a
        # ZeroDivisionError on the first add()/check().
        if size <= 0:
            raise ValueError(f"Bloom filter size must be a positive number of bits, got {size!r}")
        self.size = size
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)  # Initialize all bits to 0

    def _hashes(self, item):
        """
        Return the three bit positions (each in ``range(self.size)``) for ``item``.

        The first position comes from Python's built-in ``hash()`` of the item itself, so
        the item must be hashable. The other two (MurmurHash3 and FNV-1a) are computed over
        the UTF-8 encoding of ``str(item)``.

        NOTE: built-in ``hash()`` of ``str``/``bytes`` is salted per interpreter process
        unless PYTHONHASHSEED is fixed, so positions are only stable within one process.
        """
        item_bytes = str(item).encode("utf-8")
        return [
            abs(hash(item)) % self.size,
            abs(mmh3.hash(item_bytes)) % self.size,
            abs(self._fnv1a_hash(item_bytes)) % self.size,
        ]

    def _fnv1a_hash(self, data):
        """
        Compute the 32-bit FNV-1a hash of ``data``.

        Args:
            data: A bytes-like object (iterating it must yield ints).

        Returns:
            The hash as an unsigned 32-bit integer.
        """
        digest = self._FNV_OFFSET_BASIS
        for byte in data:
            digest ^= byte
            # Keep the running value within 32 bits after each multiply.
            digest = (digest * self._FNV_PRIME) & self._UINT32_MASK
        return digest

    def add(self, item):
        """Record ``item`` in the filter by setting each of its bit positions."""
        for position in self._hashes(item):
            self.bit_array[position] = 1

    def check(self, item):
        """Return True if every bit position for ``item`` is set (item was *probably* added)."""
        return all(self.bit_array[position] for position in self._hashes(item))

    def __contains__(self, item):
        """Support ``item in bloom_filter``; equivalent to ``check(item)``."""
        return self.check(item)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import sys | ||
import time | ||
import string | ||
import random | ||
|
||
|
||
def test_bloom_filter():
    """
    Exercise the scan helper's bloom filter end to end.

    Checks, in order: the bit array's memory footprint, insertion throughput,
    that every added item is reported as present, lookup throughput, and that
    the number of false positives on fresh random strings stays within bounds.
    """

    def generate_random_strings(n, length=10):
        """Generate a list of n random strings."""
        alphabet = string.ascii_letters + string.digits
        return ["".join(random.choices(alphabet, k=length)) for _ in range(n)]

    from bbot.scanner import Scanner

    scan = Scanner()

    n_items_to_add = 100000
    n_items_to_test = 100000
    bloom_filter_size = 8000000

    # Initialize the simple bloom filter and the set
    bloom_filter = scan.helpers.bloom_filter(size=bloom_filter_size)
    test_set = set()

    mem_size = sys.getsizeof(bloom_filter.bit_array)
    print(f"Size of bit array: {mem_size}")

    # size should be roughly 1MB
    assert 900000 < mem_size < 1100000

    # Generate random strings to add
    print(f"Generating {n_items_to_add:,} items to add")
    items_to_add = set(generate_random_strings(n_items_to_add))

    # Generate random strings to test
    print(f"Generating {n_items_to_test:,} items to test")
    items_to_test = generate_random_strings(n_items_to_test)

    print("Adding items")
    # perf_counter() is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    for item in items_to_add:
        bloom_filter.add(item)
        test_set.add(hash(item))
    elapsed = time.perf_counter() - start
    # Report throughput for the items actually processed in this loop.
    print(f"elapsed: {elapsed:.2f} ({int(len(items_to_add)/elapsed)}/s)")
    # this shouldn't take longer than 5 seconds
    assert elapsed < 5

    # make sure we have 100% accuracy
    start = time.perf_counter()
    for item in items_to_add:
        assert item in bloom_filter
    elapsed = time.perf_counter() - start
    print(f"elapsed: {elapsed:.2f} ({int(len(items_to_add)/elapsed)}/s)")
    # this shouldn't take longer than 5 seconds
    assert elapsed < 5

    print("Measuring false positives")
    # Check for false positives: the filter says "present" but the item's hash
    # was never recorded during the add phase.
    false_positives = 0
    for item in items_to_test:
        if bloom_filter.check(item) and hash(item) not in test_set:
            false_positives += 1
    false_positive_rate = false_positives / len(items_to_test)

    print(f"False positive rate: {false_positive_rate * 100:.2f}% ({false_positives}/{len(items_to_test)})")

    # ensure false positives are less than .01 percent
    # NOTE(review): both bounds are statistical. With 3 hashes, 8M bits and 100K
    # items the usual independent-hash approximation predicts roughly 5 false
    # positives per 100K lookups, so an occasional run can land outside (0, 10).
    assert 0 < false_positives < 10