diff --git a/README.md b/README.md index bea0dfd..d8a2d78 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ - [Binary Search Tree](#binary-search-tree) - [Skip List](#skip-list) - [Graph](#graph) + - [Bloom Filter](#bloom-filter) 4. [License](#license) ## [Installation](#installation) @@ -362,7 +363,80 @@ Represents networks of nodes and edges, suitable for various algorithms like sea - `Edges() [][2]T`: Returns a slice of all edges in the graph. --- +### [Bloom Filter](#bloom-filter) -## [License](#license) +A Bloom Filter is a space-efficient probabilistic data structure used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not. Elements can be added to the set, but not removed. + +#### Type `BloomFilter[T any]` + +- **Constructor:** + + ```go + func NewBloomFilter[T any](expectedItems uint, falsePositiveProb float64) *BloomFilter[T] + ``` + + - `expectedItems`: Expected number of items to be added to the filter + - `falsePositiveProb`: Desired false positive probability (strictly between 0 and 1) + +- **Methods:** + + - `Add(item T)`: Adds an item to the Bloom Filter. + - `Contains(item T) bool`: Tests whether an item might be in the set. + - `EstimatedFalsePositiveRate() float64`: Returns the current estimated false positive rate. + - `Clear()`: Removes all items from the Bloom Filter. + - `Len() int`: Returns the number of `Add` calls made since creation or the last `Clear()` (an item added twice is counted twice). + - `IsEmpty() bool`: Returns true if no items have been added. + - `BitSize() uint`: Returns the size of the underlying bit array. + - `NumberOfHashes() uint`: Returns the number of hash functions being used. 
+ +#### Example Usage: + +```go +// Create a new Bloom Filter expecting 1000 items with 1% false positive rate +bf := bloomfilter.NewBloomFilter[string](1000, 0.01) +// Add some items +bf.Add("apple") +bf.Add("banana") +bf.Add("cherry") + +// Check for membership +if bf.Contains("apple") { + fmt.Println("'apple' is probably in the set") +} + +// Get the current estimated false positive rate +fmt.Printf("False positive rate: %f\n", bf.EstimatedFalsePositiveRate()) + +// Clear the filter +bf.Clear() +``` + +#### Performance Characteristics: + +- Space Complexity: O(m), where m is the size of the bit array +- Time Complexity: + - Add: O(k), where k is the number of hash functions + - Contains: O(k), where k is the number of hash functions +- False Positive Probability: (1 - e^(-kn/m))^k + - k: number of hash functions + - n: number of inserted elements + - m: size of bit array + +#### Use Cases: + +- Duplicate detection +- Cache filtering +- URL shorteners +- Spell checkers +- Network routing +- Database query optimization + +#### Notes: + +- The Bloom Filter automatically optimizes the number of hash functions and bit array size based on the expected number of items and desired false positive rate. +- The actual false positive rate may vary slightly from the target rate due to the probabilistic nature of the data structure. +- The filter accepts any type: each item is hashed via its `fmt` `%v` string representation, so two values that format to the same string are treated as the same item. + +## [License](#license) This project is licensed under the [MIT License](LICENSE) - see the [LICENSE](LICENSE) file for details. 
// BloomFilter is a probabilistic set of items of type T.
//
// Contains may report true for an item that was never added (a false
// positive) but never reports false for an item that was added. Items are
// identified by their fmt "%v" representation, so two values that format to
// the same string are indistinguishable to the filter.
type BloomFilter[T any] struct {
	bits    []bool // one flag per bit position; true means the bit is set
	numBits uint   // length of bits, used as the modulus for probe locations
	numHash uint   // number of probe locations derived per item
	count   uint   // number of Add calls since construction or the last Clear

	// hasher is initialised with an FNV-1a 64-bit hasher by NewBloomFilter.
	// getLocations does not consult it; probe locations come from SHA-256.
	hasher hash.Hash
}

// NewBloomFilter returns an empty filter sized for expectedItems insertions
// at a target false positive probability of falsePositiveProb.
//
// An expectedItems of 0 is treated as 1. A falsePositiveProb that is NaN or
// lies outside the open interval (0, 1) is replaced by 0.01: a probability
// >= 1 would otherwise size the filter to zero bits, and a NaN or > 1 value
// would feed a negative or NaN float into an unsigned conversion.
func NewBloomFilter[T any](expectedItems uint, falsePositiveProb float64) *BloomFilter[T] {
	if expectedItems == 0 {
		expectedItems = 1
	}
	if math.IsNaN(falsePositiveProb) || falsePositiveProb <= 0 || falsePositiveProb >= 1 {
		falsePositiveProb = 0.01
	}

	n := float64(expectedItems)

	// Standard sizing: m = ceil(-n*ln(p) / ln(2)^2), k = ceil(m/n * ln(2)).
	numBits := uint(math.Ceil(-n * math.Log(falsePositiveProb) / math.Pow(math.Log(2), 2)))
	if numBits == 0 {
		// Guard the modulus used in getLocations.
		numBits = 1
	}
	numHash := uint(math.Ceil(float64(numBits) / n * math.Log(2)))
	if numHash == 0 {
		// With zero probes Contains would be vacuously true for every item.
		numHash = 1
	}

	return &BloomFilter[T]{
		bits:    make([]bool, numBits),
		numBits: numBits,
		numHash: numHash,
		hasher:  fnv.New64a(),
	}
}

// hashToUint interprets the first 8 bytes of sum as a big-endian uint64 and
// converts the result to uint. It panics if sum holds fewer than 8 bytes.
func hashToUint(sum []byte) uint {
	if len(sum) < 8 {
		panic("Hash sum too short")
	}
	return uint(binary.BigEndian.Uint64(sum[:8]))
}

// getLocations returns the numHash bit positions probed for item.
//
// The item is formatted with "%v" and hashed once with SHA-256; bytes 0-7
// and 8-15 of the digest give h1 and h2, and position i is
// (h1 + i*h2) mod numBits. The arithmetic is unsigned and wraps on overflow.
func (bf *BloomFilter[T]) getLocations(item T) []uint {
	locations := make([]uint, bf.numHash)

	sum := sha256.Sum256([]byte(fmt.Sprintf("%v", item)))
	h1 := hashToUint(sum[:8])
	h2 := hashToUint(sum[8:16])

	for i := uint(0); i < bf.numHash; i++ {
		locations[i] = (h1 + i*h2) % bf.numBits
	}
	return locations
}

// Add inserts an item into the Bloom Filter. Every call increments the count
// reported by Len, including calls that repeat an earlier item.
func (bf *BloomFilter[T]) Add(item T) {
	for _, loc := range bf.getLocations(item) {
		bf.bits[loc] = true
	}
	bf.count++
}

// Contains tests whether an item might be in the set. A false result means
// at least one of the item's bit positions is unset; a true result means all
// of them are set.
func (bf *BloomFilter[T]) Contains(item T) bool {
	for _, loc := range bf.getLocations(item) {
		if !bf.bits[loc] {
			return false
		}
	}
	return true
}

// EstimatedFalsePositiveRate returns (1 - e^(-k*n/m))^k, where k is the
// number of hashes, n the number of Add calls and m the number of bits.
// It returns 0 when nothing has been added.
func (bf *BloomFilter[T]) EstimatedFalsePositiveRate() float64 {
	if bf.count == 0 {
		return 0.0
	}
	exponent := -float64(bf.numHash) * float64(bf.count) / float64(bf.numBits)
	return math.Pow(1-math.Exp(exponent), float64(bf.numHash))
}

// Clear removes all items from the Bloom Filter by unsetting every bit and
// resetting the item count. The existing bit storage is reused.
func (bf *BloomFilter[T]) Clear() {
	for i := range bf.bits {
		bf.bits[i] = false
	}
	bf.count = 0
}

// Len returns the number of Add calls made since construction or the last
// Clear.
func (bf *BloomFilter[T]) Len() int {
	return int(bf.count)
}

// IsEmpty returns true if no items have been added to the Bloom Filter.
func (bf *BloomFilter[T]) IsEmpty() bool {
	return bf.count == 0
}

// BitSize returns the size of the bit array.
func (bf *BloomFilter[T]) BitSize() uint {
	return bf.numBits
}

// NumberOfHashes returns the number of hash functions.
+func (bf *BloomFilter[T]) NumberOfHashes() uint { + return bf.numHash +} diff --git a/bloomfilter/bloomfilter_test.go b/bloomfilter/bloomfilter_test.go new file mode 100644 index 0000000..d42036a --- /dev/null +++ b/bloomfilter/bloomfilter_test.go @@ -0,0 +1,235 @@ +package bloomfilter + +import ( + "fmt" + "testing" +) + +func TestBloomFilter_Basic(t *testing.T) { + tests := []struct { + name string + expectedItems uint + falsePositive float64 + itemsToAdd []string + itemsToCheck []string + shouldContain []bool + expectedMinBits uint + }{ + { + name: "Basic operation", + expectedItems: 100, + falsePositive: 0.01, + itemsToAdd: []string{"apple", "banana", "cherry"}, + itemsToCheck: []string{"apple", "banana", "cherry", "date"}, + shouldContain: []bool{true, true, true, false}, + }, + { + name: "Empty filter", + expectedItems: 100, + falsePositive: 0.01, + itemsToAdd: []string{}, + itemsToCheck: []string{"apple"}, + shouldContain: []bool{false}, + }, + { + name: "Single item", + expectedItems: 100, + falsePositive: 0.01, + itemsToAdd: []string{"apple"}, + itemsToCheck: []string{"apple", "banana"}, + shouldContain: []bool{true, false}, + }, + } + + for _, tt := range tests { + t.Run( + tt.name, func(t *testing.T) { + bf := NewBloomFilter[string](tt.expectedItems, tt.falsePositive) + + // Add items + for _, item := range tt.itemsToAdd { + bf.Add(item) + } + + // Check size matches expected items + if bf.Len() != len(tt.itemsToAdd) { + t.Errorf("Expected length %d, got %d", len(tt.itemsToAdd), bf.Len()) + } + + // Check contains + for i, item := range tt.itemsToCheck { + if bf.Contains(item) != tt.shouldContain[i] { + t.Errorf( + "Contains(%s) = %v, want %v", + item, bf.Contains(item), tt.shouldContain[i], + ) + } + } + }, + ) + } +} + +func TestBloomFilter_DifferentTypes(t *testing.T) { + t.Run( + "Integer type", func(t *testing.T) { + bf := NewBloomFilter[int](100, 0.01) + numbers := []int{1, 2, 3, 4, 5} + + for _, n := range numbers { + bf.Add(n) + } + + for _, n 
:= range numbers { + if !bf.Contains(n) { + t.Errorf("Should contain %d", n) + } + } + + if bf.Contains(6) { + t.Error("Should not contain 6") + } + }, + ) + + t.Run( + "Custom struct type", func(t *testing.T) { + type Person struct { + Name string + Age int + } + + bf := NewBloomFilter[Person](100, 0.01) + p1 := Person{"Alice", 30} + p2 := Person{"Bob", 25} + + bf.Add(p1) + bf.Add(p2) + + if !bf.Contains(p1) { + t.Error("Should contain person 1") + } + if !bf.Contains(p2) { + t.Error("Should contain person 2") + } + if bf.Contains(Person{"Charlie", 35}) { + t.Error("Should not contain person 3") + } + }, + ) +} + +func TestBloomFilter_EdgeCases(t *testing.T) { + t.Run( + "Zero expected items", func(t *testing.T) { + bf := NewBloomFilter[string](0, 0.01) + if bf == nil { + t.Error("Should create filter even with zero expected items") + } + bf.Add("test") + if !bf.Contains("test") { + t.Error("Should still function with zero expected items") + } + }, + ) + + t.Run( + "Zero false positive rate", func(t *testing.T) { + bf := NewBloomFilter[string](100, 0) + if bf == nil { + t.Error("Should create filter even with zero false positive rate") + } + bf.Add("test") + if !bf.Contains("test") { + t.Error("Should still function with zero false positive rate") + } + }, + ) +} + +func TestBloomFilter_Operations(t *testing.T) { + t.Run( + "Clear operation", func(t *testing.T) { + bf := NewBloomFilter[string](100, 0.01) + bf.Add("test") + + if !bf.Contains("test") { + t.Error("Should contain 'test' before clear") + } + + bf.Clear() + + if bf.Contains("test") { + t.Error("Should not contain 'test' after clear") + } + + if !bf.IsEmpty() { + t.Error("Should be empty after clear") + } + + if bf.Len() != 0 { + t.Error("Length should be 0 after clear") + } + }, + ) +} + +func TestBloomFilter_FalsePositiveRate(t *testing.T) { + expectedItems := uint(1000) + targetFPR := 0.01 + bf := NewBloomFilter[int](expectedItems, targetFPR) + + // Add expectedItems number of items + for i := 0; i < 
int(expectedItems); i++ { + bf.Add(i) + } + + // Test false positive rate + falsePositives := 0 + trials := 10000 + for i := int(expectedItems); i < int(expectedItems)+trials; i++ { + if bf.Contains(i) { + falsePositives++ + } + } + + actualFPR := float64(falsePositives) / float64(trials) + estimatedFPR := bf.EstimatedFalsePositiveRate() + + // Allow for some variance in the actual false positive rate + maxAcceptableFPR := targetFPR * 2 + if actualFPR > maxAcceptableFPR { + t.Errorf( + "False positive rate too high: got %f, want <= %f", + actualFPR, maxAcceptableFPR, + ) + } + + // Check if estimated FPR is reasonably close to actual FPR + if estimatedFPR < actualFPR/2 || estimatedFPR > actualFPR*2 { + t.Errorf( + "Estimated FPR %f significantly different from actual FPR %f", + estimatedFPR, actualFPR, + ) + } +} + +func BenchmarkBloomFilter(b *testing.B) { + bf := NewBloomFilter[string](1000, 0.01) + + b.Run( + "Add", func(b *testing.B) { + for i := 0; i < b.N; i++ { + bf.Add(fmt.Sprintf("item%d", i)) + } + }, + ) + + b.Run( + "Contains", func(b *testing.B) { + for i := 0; i < b.N; i++ { + bf.Contains(fmt.Sprintf("item%d", i)) + } + }, + ) +}