Skip to content

Commit

Permalink
feat: Add bloom filter
Browse files Browse the repository at this point in the history
  • Loading branch information
idsulik committed Nov 3, 2024
1 parent ac22478 commit 262f4c5
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 1 deletion.
76 changes: 75 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- [Binary Search Tree](#binary-search-tree)
- [Skip List](#skip-list)
- [Graph](#graph)
- [Bloom Filter](#bloom-filter)
4. [License](#license)

## [Installation](#installation)
Expand Down Expand Up @@ -362,7 +363,80 @@ Represents networks of nodes and edges, suitable for various algorithms like sea
- `Edges() [][2]T`: Returns a slice of all edges in the graph.

---
### [Bloom Filter](#bloom-filter)

## [License](#license)
A Bloom Filter is a space-efficient probabilistic data structure used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not. Elements can be added to the set, but not removed.

#### Type `BloomFilter[T any]`

- **Constructor:**

```go
func NewBloomFilter[T any](expectedItems uint, falsePositiveProb float64) *BloomFilter[T]
```

- `expectedItems`: Expected number of items to be added to the filter (a value of 0 is treated as 1)
- `falsePositiveProb`: Desired false positive probability, strictly between 0 and 1 (values of 0 or below fall back to 0.01)

- **Methods:**

- `Add(item T)`: Adds an item to the Bloom Filter.
- `Contains(item T) bool`: Tests whether an item might be in the set.
- `EstimatedFalsePositiveRate() float64`: Returns the current estimated false positive rate.
- `Clear()`: Removes all items from the Bloom Filter.
- `Len() int`: Returns the number of items added to the Bloom Filter.
- `IsEmpty() bool`: Returns true if no items have been added.
- `BitSize() uint`: Returns the size of the underlying bit array.
- `NumberOfHashes() uint`: Returns the number of hash functions being used.

#### Example Usage:

```go
// Create a new Bloom Filter expecting 1000 items with 1% false positive rate
bf := bloomfilter.NewBloomFilter[string](1000, 0.01)

// Add some items
bf.Add("apple")
bf.Add("banana")
bf.Add("cherry")

// Check for membership
if bf.Contains("apple") {
fmt.Println("'apple' is probably in the set")
}

// Get current false positive rate
fmt.Printf("False positive rate: %f\n", bf.EstimatedFalsePositiveRate())

// Clear the filter
bf.Clear()
```

#### Performance Characteristics:

- Space Complexity: O(m), where m is the size of the bit array
- Time Complexity:
  - Add: O(k), where k is the number of hash functions
  - Contains: O(k), where k is the number of hash functions
- False Positive Probability: `(1 - e^(-kn/m))^k`, where
  - k: number of hash functions
  - n: number of inserted elements
  - m: size of bit array

#### Use Cases:

- Duplicate detection
- Cache filtering
- URL shorteners
- Spell checkers
- Network routing
- Database query optimization

#### Notes:

- The Bloom Filter automatically optimizes the number of hash functions and bit array size based on the expected number of items and desired false positive rate.
- The actual false positive rate may vary slightly from the target rate due to the probabilistic nature of the data structure.
- The filter accepts any type. Items are hashed by their default `fmt` (`%v`) string representation, so two values that format to the same string are treated as the same item.

## [License](#license)
This project is licensed under the [MIT License](LICENSE) - see the [LICENSE](LICENSE) file for details.
119 changes: 119 additions & 0 deletions bloomfilter/bloomfilter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package bloomfilter

import (
"crypto/sha256"
"encoding/binary"
"fmt"
"hash"
"hash/fnv"
"math"
)

// BloomFilter is a probabilistic set membership structure. Contains may
// report an item that was never added (a false positive), but it never
// reports false for an item that was added.
//
// Items of type T are located by hashing their fmt "%v" representation
// (see getLocations).
type BloomFilter[T any] struct {
	bits    []bool    // one entry per bit position; true means the bit is set
	numBits uint      // length of bits
	numHash uint      // number of positions probed per item
	count   uint      // number of Add calls since construction or the last Clear
	hasher  hash.Hash // FNV-1a hasher created by NewBloomFilter; getLocations hashes with SHA-256 and does not read it
}

// NewBloomFilter returns an empty filter sized for expectedItems insertions
// at roughly the requested false positive probability.
//
// The bit array size m and the hash count k are derived from the standard
// formulas
//
//	m = ceil(-n * ln(p) / ln(2)^2)
//	k = ceil(m / n * ln(2))
//
// where n is expectedItems and p is falsePositiveProb.
//
// An expectedItems of 0 is treated as 1. A falsePositiveProb outside the open
// interval (0, 1), including NaN, falls back to 0.01: p = 1 would produce a
// zero-bit filter that matches everything, and p > 1 would produce a negative
// size whose conversion to uint is implementation-defined.
func NewBloomFilter[T any](expectedItems uint, falsePositiveProb float64) *BloomFilter[T] {
	const defaultFalsePositiveProb = 0.01

	if expectedItems == 0 {
		expectedItems = 1
	}
	// Written as a negated range check so that NaN is rejected as well.
	if !(falsePositiveProb > 0 && falsePositiveProb < 1) {
		falsePositiveProb = defaultFalsePositiveProb
	}

	numBits := uint(math.Ceil(-float64(expectedItems) * math.Log(falsePositiveProb) / math.Pow(math.Log(2), 2)))
	numHash := uint(math.Ceil(float64(numBits) / float64(expectedItems) * math.Log(2)))

	return &BloomFilter[T]{
		bits:    make([]bool, numBits),
		numBits: numBits,
		numHash: numHash,
		hasher:  fnv.New64a(),
	}
}

// hashToUint interprets the leading 8 bytes of sum as a big-endian unsigned
// 64-bit integer and returns it converted to uint. Bytes beyond the eighth
// are ignored. It panics if sum holds fewer than 8 bytes.
func hashToUint(sum []byte) uint {
	const width = 8
	if len(sum) >= width {
		prefix := sum[:width]
		return uint(binary.BigEndian.Uint64(prefix))
	}
	panic("Hash sum too short")
}

// getLocations returns the numHash bit positions associated with item.
//
// The item is rendered with fmt's "%v" verb and hashed once with SHA-256.
// Bytes 0-7 and 8-15 of the digest give two base values, and the i-th
// position is (base + i*step) mod numBits, so a single digest stands in for
// numHash separate hash functions.
func (bf *BloomFilter[T]) getLocations(item T) []uint {
	digest := sha256.Sum256([]byte(fmt.Sprintf("%v", item)))
	base := hashToUint(digest[:8])
	step := hashToUint(digest[8:16])

	positions := make([]uint, 0, bf.numHash)
	for k := uint(0); k < bf.numHash; k++ {
		positions = append(positions, (base+k*step)%bf.numBits)
	}
	return positions
}

// Add records item in the filter by setting every bit position that
// getLocations yields for it, then increments the insertion counter. The
// counter grows on every call, including repeated calls with the same item.
func (bf *BloomFilter[T]) Add(item T) {
	for _, idx := range bf.getLocations(item) {
		bf.bits[idx] = true
	}
	bf.count += 1
}

// Contains reports whether item may have been added. It returns false as
// soon as one of the item's bit positions is unset, and true only when all
// of them are set, which can also happen for an item that was never added.
func (bf *BloomFilter[T]) Contains(item T) bool {
	positions := bf.getLocations(item)
	for i := 0; i < len(positions); i++ {
		if bf.bits[positions[i]] {
			continue
		}
		return false
	}
	return true
}

// EstimatedFalsePositiveRate returns (1 - e^(-k*n/m))^k, where k is the
// number of hash positions per item, n the number of Add calls so far and m
// the bit array size. It returns 0 when nothing has been added.
func (bf *BloomFilter[T]) EstimatedFalsePositiveRate() float64 {
	if bf.count == 0 {
		return 0.0
	}
	k := float64(bf.numHash)
	n := float64(bf.count)
	m := float64(bf.numBits)

	setFraction := 1 - math.Exp(-k*n/m)
	return math.Pow(setFraction, k)
}

// Clear empties the filter: it swaps in a fresh all-false bit array of the
// same size and resets the insertion counter to zero.
func (bf *BloomFilter[T]) Clear() {
	bf.bits, bf.count = make([]bool, bf.numBits), 0
}

// Len reports how many times Add has been called since the filter was
// created or last cleared.
func (bf *BloomFilter[T]) Len() int {
	added := bf.count
	return int(added)
}

// IsEmpty reports whether the insertion counter is zero, i.e. no Add call
// has happened since construction or the last Clear.
func (bf *BloomFilter[T]) IsEmpty() bool {
	return bf.count < 1
}

// BitSize reports the number of bit positions in the filter's bit array.
func (bf *BloomFilter[T]) BitSize() uint {
	size := bf.numBits
	return size
}

// NumberOfHashes reports how many bit positions are probed for each item.
func (bf *BloomFilter[T]) NumberOfHashes() uint {
	hashes := bf.numHash
	return hashes
}
Loading

0 comments on commit 262f4c5

Please sign in to comment.