Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saturated bloomfilter size #666

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ package com.twitter.algebird

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

import org.scalacheck.{Arbitrary, Gen, Properties}
import org.scalatest.{Matchers, WordSpec}
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}
import org.scalatest.{Matchers, WordSpec}

object BloomFilterTestUtils {
/**
 * Converts any Bloom filter representation into a [[BFSparse]].
 *
 *  - `BFZero` becomes a sparse filter over an empty bit set;
 *  - `BFItem` becomes a sparse filter whose bit set holds the hash indices of the item;
 *  - `BFSparse` is returned unchanged;
 *  - `BFInstance` has its dense bit set converted through `RichCBitSet.fromBitSet`.
 *
 * The hash functions and width of the input are carried over as-is.
 *
 * @param bf the filter to convert
 * @return a sparse filter built from the same hashes and width
 */
def toSparse[A](bf: BF[A]): BFSparse[A] = bf match {
  case BFZero(hashes, width) => BFSparse(hashes, RichCBitSet(), width)
  case BFItem(item, hashes, width) =>
    BFSparse(hashes, RichCBitSet.fromArray(hashes(item)), width)
  // Already sparse: nothing to convert. (Listed once; a second identical arm would be unreachable.)
  case bfs @ BFSparse(_, _, _) => bfs
  case BFInstance(hashes, bitset, width) =>
    BFSparse(hashes, RichCBitSet.fromBitSet(bitset), width)
}
Expand All @@ -22,15 +22,15 @@ object BloomFilterTestUtils {
val bs = LongBitSet.empty(width)
bs += hashes(item)
BFInstance(hashes, bs.toBitSetNoCopy, width)
case bfs @ BFSparse(hashes, bitset, width) => bfs.dense
case bfi @ BFInstance(hashes, bitset, width) => bfi
case bfs@BFSparse(hashes, bitset, width) => bfs.dense
case bfi@BFInstance(hashes, bitset, width) => bfi
}
}

class BloomFilterLaws extends CheckProperties {

import com.twitter.algebird.BaseProperties._
import BloomFilterTestUtils._
import com.twitter.algebird.BaseProperties._

val NUM_HASHES = 6
val WIDTH = 32
Expand Down Expand Up @@ -117,7 +117,7 @@ class BloomFilterLaws extends CheckProperties {
val next1 = a + b

Equiv[BF[String]].equiv(next, next1) &&
(check == a.contains(b))
(check == a.contains(b))
}
}

Expand Down Expand Up @@ -155,8 +155,8 @@ class BFHashIndices extends CheckProperties {
}

/**
 * This is the version of BFHash from before the "negative values fix".
 */
case class NegativeBFHash(numHashes: Int, width: Int) {
val size = numHashes

Expand Down Expand Up @@ -203,7 +203,7 @@ class BFHashIndices extends CheckProperties {
}
}

class BloomFilterFalsePositives[T: Gen: Hash128](falsePositiveRate: Double) extends ApproximateProperty {
class BloomFilterFalsePositives[T: Gen : Hash128](falsePositiveRate: Double) extends ApproximateProperty {

type Exact = Set[T]
type Approx = BF[T]
Expand Down Expand Up @@ -237,7 +237,7 @@ class BloomFilterFalsePositives[T: Gen: Hash128](falsePositiveRate: Double) exte
/** Approximate answer to the membership query: asks the Bloom filter whether it contains `t`. */
def approximateResult(bf: BF[T], t: T) = {
  bf.contains(t)
}
}

class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty {
class BloomFilterCardinality[T: Gen : Hash128] extends ApproximateProperty {

type Exact = Set[T]
type Approx = BF[T]
Expand All @@ -264,10 +264,12 @@ class BloomFilterCardinality[T: Gen: Hash128] extends ApproximateProperty {
/** This property needs no per-query input, so the generator always yields the unit value. */
def inputGenerator(set: Set[T]) = {
  val noInput: Unit = ()
  Gen.const(noInput)
}

/** Exact answer: the number of elements in the reference set (the unit input is ignored). */
def exactResult(s: Set[T], u: Unit) = {
  s.size
}

/** Approximate answer: whatever the Bloom filter reports as its `size` (the unit input is ignored). */
def approximateResult(bf: BF[T], u: Unit) = {
  bf.size
}
}

class BloomFilterProperties extends ApproximateProperties("BloomFilter") {

import ApproximateProperty.toProp

for (falsePositiveRate <- List(0.1, 0.01, 0.001)) {
Expand Down Expand Up @@ -304,54 +306,53 @@ class BloomFilterTest extends WordSpec with Matchers {
}

// Runs 101 rounds; each round builds a filter from a randomly parameterised monoid,
// inserts five random strings, and checks that every inserted string is reported present.
"identify all true positives" in {
  (0 to 100).foreach { _ =>
    val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32)
    val numEntries = 5
    val entries = (0 until numEntries).map(_ => RAND.nextInt.toString)
    val bf = bfMonoid.create(entries: _*)

    // Entries are already strings, so they are looked up directly.
    entries.foreach { entry =>
      assert(bf.contains(entry).isTrue)
    }
  }
}

// For each target false-positive probability, runs `iter` trials in parallel. Each trial
// draws numEntries + 1 distinct values, inserts all but the first into a filter created from
// (numEntries, fpProb), and records 1.0 if the held-out first value is still reported present.
// The mean of those outcomes is the observed false-positive rate.
"have small false positive rate" in {
  val iter = 10000

  Seq(0.1, 0.01, 0.001).foreach { fpProb =>
    val fps = (0 until iter).par.map { _ =>
      val numEntries = RAND.nextInt(10) + 1

      val bfMonoid = BloomFilter[String](numEntries, fpProb)

      // Shuffling a range guarantees the held-out value differs from every inserted one.
      val entries = RAND
        .shuffle((0 until 1000).toList)
        .take(numEntries + 1)
        .map(_.toString)
      val bf = bfMonoid.create(entries.drop(1): _*)

      if (bf.contains(entries(0)).isTrue) 1.0 else 0.0
    }

    val observedFpProb = fps.sum / fps.size

    // the 2.5 is a fudge factor to make the probability of it low
    // in tests
    assert(observedFpProb <= 2.5 * fpProb)
  }
}

"approximate cardinality" in {
val bfMonoid = BloomFilterMonoid[String](10, 100000)
Seq(10, 100, 1000, 10000).foreach { exactCardinality =>
val items = (1 until exactCardinality).map { _.toString }
val items = (1 until exactCardinality).map {
_.toString
}
val bf = bfMonoid.create(items: _*)
val size = bf.size

Expand All @@ -362,18 +363,17 @@ class BloomFilterTest extends WordSpec with Matchers {
}

// Runs 11 rounds; each round aggregates five random strings with a randomly parameterised
// BloomFilterAggregator and checks that every aggregated string is reported present.
"work as an Aggregator" in {
  (0 to 10).foreach { _ =>
    val aggregator = BloomFilterAggregator[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32)
    val numEntries = 5
    val entries = (0 until numEntries).map(_ => RAND.nextInt.toString)
    val bf = aggregator(entries)

    // Entries are already strings, so they are looked up directly.
    entries.foreach { entry =>
      assert(bf.contains(entry).isTrue)
    }
  }
}

"not serialize @transient dense BFInstance" in {
Expand All @@ -397,8 +397,8 @@ class BloomFilterTest extends WordSpec with Matchers {
}

/**
 * This test failed before the fix for https://github.com/twitter/algebird/issues/229
 */
"not have negative hash values" in {
val NUM_HASHES = 2
val WIDTH = 4752800
Expand All @@ -408,33 +408,48 @@ class BloomFilterTest extends WordSpec with Matchers {

assert(index >= 0)
}
// Inserts ten distinct strings into a filter built with parameters (5, 13) -- presumably
// (numHashes, width), TODO confirm -- which the test name describes as saturated, and checks
// that the lower bound of the size estimate is still positive.
"return its size even if it's saturated" in {
  val bfMonoid = BloomFilterMonoid[String](5, 13)
  val strings = (1 to 10).map(_.toString)
  val bf = bfMonoid.create(strings: _*)

  assert(bf.size.min > 0)
}

// Inserts ten strings (with repeats) into a filter built with parameters (5, 13) and checks
// that the upper bound of the size estimate stays below a very generous ceiling.
"return a max approximate size if it's saturated" in {
  val monoid = BloomFilterMonoid[String](5, 13)
  val items = List(8, 9, 8, 10, 1, 8, 11, 7, 7, 1).map(_.toString)
  val filter = monoid.create(items: _*)
  // The ceiling looks very generous, yet the original author noted that this
  // assertion currently fails.
  val ceiling = items.length * 100
  assert(filter.size.max < ceiling)
}

}

"BloomFilter method `checkAndAdd`" should {

  // Runs 101 rounds. In each round two filters start from the monoid's zero and receive the
  // same entries one by one: `left` through `contains` followed by `+`, `leftAlt` through
  // `checkAndAdd`. At every step the membership answer from `contains` must equal the one
  // returned by `checkAndAdd`.
  "be identical to method `+`" in {
    (0 to 100).foreach { _ =>
      val bfMonoid = new BloomFilterMonoid[String](RAND.nextInt(5) + 1, RAND.nextInt(64) + 32)
      val numEntries = 5
      val entries = (0 until numEntries).map(_ => RAND.nextInt.toString)
      val bf = bfMonoid.create(entries: _*)

      // Only the per-step assertion matters; the final pair of filters is not inspected.
      entries.foldLeft((bfMonoid.zero, bfMonoid.zero)) {
        case ((left, leftAlt), entry) =>
          val (newLeftAlt, contained) = leftAlt.checkAndAdd(entry)
          left.contains(entry) shouldBe contained
          (left + entry, newLeftAlt)
      }

      // Sanity check on the filter built in one go: every entry must be present.
      entries.foreach { entry =>
        assert(bf.contains(entry).isTrue)
      }
    }
  }
}

Expand Down