Skip to content

Commit

Permalink
add bloom filter
Browse files Browse the repository at this point in the history
  • Loading branch information
TheTechromancer committed Apr 24, 2024
1 parent 703b13c commit 41dcaf9
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 0 deletions.
45 changes: 45 additions & 0 deletions bbot/core/helpers/bloom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import mmh3
from bitarray import bitarray


class BloomFilter:
    """
    Simple bloom filter implementation capable of roughly 200K lookups/s.

    BBOT uses bloom filters in scenarios like dns brute-forcing, where it's useful to keep track
    of which mutations have been tried so far.

    A 100-megabyte bloom filter (800M bits) can store 10M entries with a .01% false-positive rate.
    A python hash is 36 bytes. So if you wanted to store these in a set, this would take up
    36 * 10M * 2 (key+value) == 720 megabytes. So we save roughly 7 times the space.

    Membership answers are probabilistic in one direction only: an item that has been
    added is always reported as present, while an item that was never added may
    occasionally be reported as present too (a false positive). Bits are never cleared,
    so items cannot be removed.

    NOTE: one of the three hash functions is the builtin ``hash()``, which Python salts
    per interpreter process for str/bytes (unless PYTHONHASHSEED is fixed). The bit
    positions are therefore only meaningful inside the process that filled the filter.
    """

    def __init__(self, size=2**16):
        """
        Create an empty filter.

        Args:
            size: Number of bits in the underlying bit array. Must be positive.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        # Every hash is reduced modulo ``size``; reject sizes that would only fail
        # later (ZeroDivisionError on the first add/check) with a clear error now.
        if size <= 0:
            raise ValueError(f"Bloom filter size must be a positive number of bits, got {size!r}")
        self.size = size
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)  # Initialize all bits to 0

    def _hashes(self, item):
        """
        Return the three bit positions (each in ``range(self.size)``) for ``item``.

        The positions come from the builtin ``hash()`` of the item itself, and from
        MurmurHash3 and FNV-1a computed over the UTF-8 encoding of ``str(item)``.
        """
        item_bytes = str(item).encode("utf-8")
        return [
            # hash() and mmh3.hash() can be negative, hence abs()
            abs(hash(item)) % self.size,
            abs(mmh3.hash(item_bytes)) % self.size,
            # FNV-1a is already reduced to [0, 2**32), so no abs() is needed
            self._fnv1a_hash(item_bytes) % self.size,
        ]

    def _fnv1a_hash(self, data):
        """
        Compute the 32-bit FNV-1a hash of ``data``.

        Args:
            data: An iterable of byte values (e.g. a ``bytes`` object).

        Returns:
            The hash as a non-negative int below 2**32.
        """
        digest = 0x811C9DC5  # FNV offset basis (2166136261)
        for byte in data:
            digest ^= byte
            digest = (digest * 0x01000193) % 2**32  # FNV prime (16777619), kept to 32 bits
        return digest

    def add(self, item):
        """Record ``item`` in the filter by setting each of its bit positions."""
        for position in self._hashes(item):
            self.bit_array[position] = 1

    def check(self, item):
        """
        Return True if ``item`` may have been added, False if it definitely was not.
        """
        return all(self.bit_array[position] for position in self._hashes(item))

    def __contains__(self, item):
        """Support ``item in bloom_filter``; equivalent to ``check(item)``."""
        return self.check(item)
5 changes: 5 additions & 0 deletions bbot/core/helpers/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def __init__(self, preset):
self.word_cloud = WordCloud(self)
self.dummy_modules = {}

def bloom_filter(self, size):
    """
    Build and return a new, empty BloomFilter.

    Args:
        size: Passed through unchanged as the filter's ``size`` argument.

    Returns:
        A freshly constructed ``BloomFilter`` instance.
    """
    # The import happens at call time rather than at module import time.
    from .bloom import BloomFilter as _BloomFilter

    new_filter = _BloomFilter(size)
    return new_filter

def interactsh(self, *args, **kwargs):
    """
    Construct an ``Interactsh`` object tied to this helper.

    This helper instance is passed as the first argument; all other positional
    and keyword arguments are forwarded unchanged.
    """
    client = Interactsh(self, *args, **kwargs)
    return client

Expand Down
71 changes: 71 additions & 0 deletions bbot/test/test_step_1/test_bloom_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import sys
import time
import string
import random


def test_bloom_filter():
    """
    Exercise the bloom filter helper end to end.

    Checks the memory footprint of the bit array, that adding and looking up
    100K items each finish within 5 seconds, that every added item is found,
    and that the false-positive rate on unseen items stays below .01 percent.
    """

    def generate_random_strings(n, length=10):
        """Generate a list of n random strings."""
        alphabet = string.ascii_letters + string.digits
        return ["".join(random.choices(alphabet, k=length)) for _ in range(n)]

    from bbot.scanner import Scanner

    scan = Scanner()

    n_items_to_add = 100000
    n_items_to_test = 100000
    bloom_filter_size = 8000000

    # Initialize the simple bloom filter
    bloom_filter = scan.helpers.bloom_filter(size=bloom_filter_size)

    mem_size = sys.getsizeof(bloom_filter.bit_array)
    print(f"Size of bit array: {mem_size}")

    # size should be roughly 1MB
    assert 900000 < mem_size < 1100000

    # Generate random strings to add
    print(f"Generating {n_items_to_add:,} items to add")
    items_to_add = set(generate_random_strings(n_items_to_add))
    # Deduplication may leave fewer than n_items_to_add entries, so rates below
    # are computed from the number of items actually processed.
    n_added = len(items_to_add)

    # Generate random strings to test
    print(f"Generating {n_items_to_test:,} items to test")
    items_to_test = generate_random_strings(n_items_to_test)

    print("Adding items")
    # perf_counter is monotonic, so the measurement cannot be skewed by clock adjustments
    start = time.perf_counter()
    for item in items_to_add:
        bloom_filter.add(item)
    elapsed = time.perf_counter() - start
    print(f"elapsed: {elapsed:.2f} ({int(n_added/elapsed)}/s)")
    # this shouldn't take longer than 5 seconds
    assert elapsed < 5

    # make sure we have 100% accuracy
    start = time.perf_counter()
    for item in items_to_add:
        assert item in bloom_filter
    elapsed = time.perf_counter() - start
    print(f"elapsed: {elapsed:.2f} ({int(n_added/elapsed)}/s)")
    # this shouldn't take longer than 5 seconds
    assert elapsed < 5

    print("Measuring false positives")
    # A false positive is an item the filter reports as present that was never added.
    # items_to_add is the exact ground truth, so compare against it directly.
    false_positives = 0
    for item in items_to_test:
        if bloom_filter.check(item) and item not in items_to_add:
            false_positives += 1
    false_positive_rate = false_positives / len(items_to_test)

    print(f"False positive rate: {false_positive_rate * 100:.2f}% ({false_positives}/{len(items_to_test)})")

    # ensure false positives are less than .01 percent
    # NOTE(review): with 3 hash functions, 8M bits and 100K items the textbook estimate
    # is only ~5 false positives per 100K lookups, so zero is a legitimate (if uncommon)
    # outcome; no lower bound is asserted to avoid spurious failures.
    assert false_positives < 10

0 comments on commit 41dcaf9

Please sign in to comment.