Skip to content

Commit

Permalink
Merge pull request #129 from mirosval/add-hyperloglog-serde
Browse files Browse the repository at this point in the history
Add Serde support to HyperLogLog
  • Loading branch information
crepererum authored Feb 29, 2024
2 parents 2f74d71 + 951e2a9 commit 6927ec1
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 31 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ fixedbitset = { version = "0.4", optional = true }
num-traits = { version = "^0.2.4", optional = true }
rand = { version = "0.8", optional = true }
succinct = { version = "^0.5", optional = true }
serde = { version = "1.0", optional = true }

[dev-dependencies]
criterion = "0.5"
rand_chacha = "0.3"
rand_distr = "0.4"
serde_json = "1.0"

[features]
default = [
Expand All @@ -35,6 +37,7 @@ default = [
"num-traits",
"rand",
"succinct",
"serde",
]

[[bench]]
Expand Down
6 changes: 2 additions & 4 deletions src/filters/quotientfilter.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! QuotientFilter implementation.
use std::collections::hash_map::DefaultHasher;
use std::collections::VecDeque;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::hash::{BuildHasher, BuildHasherDefault, Hash};
use std::marker::PhantomData;

use fixedbitset::FixedBitSet;
Expand Down Expand Up @@ -346,9 +346,7 @@ where

fn calc_quotient_remainder(&self, obj: &T) -> (usize, usize) {
let bits_remainder = self.bits_remainder();
let mut hasher = self.buildhasher.build_hasher();
obj.hash(&mut hasher);
let fingerprint = hasher.finish();
let fingerprint = self.buildhasher.hash_one(obj);
let bits_trash = 64 - bits_remainder - self.bits_quotient;
let trash = if bits_trash > 0 {
(fingerprint >> (64 - bits_trash)) << (64 - bits_trash)
Expand Down
15 changes: 4 additions & 11 deletions src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ impl Hash for AnyHash {
mod tests {
use super::{BuildHasherSeeded, HashIterBuilder};
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::hash::{BuildHasher, BuildHasherDefault};

#[test]
fn hash_iter_builder_getter() {
Expand Down Expand Up @@ -347,18 +347,11 @@ mod tests {
let bh2 = BuildHasherSeeded::new(0);
let bh3 = BuildHasherSeeded::new(1);

let mut hasher1 = bh1.build_hasher();
let mut hasher2 = bh2.build_hasher();
let mut hasher3 = bh3.build_hasher();

let obj = "foo bar";
obj.hash(&mut hasher1);
obj.hash(&mut hasher2);
obj.hash(&mut hasher3);

let result1 = hasher1.finish();
let result2 = hasher2.finish();
let result3 = hasher3.finish();
let result1 = bh1.hash_one(obj);
let result2 = bh2.hash_one(obj);
let result3 = bh3.hash_one(obj);

assert_eq!(result1, result2);
assert_ne!(result1, result3);
Expand Down
File renamed without changes.
77 changes: 65 additions & 12 deletions src/hyperloglog.rs → src/hyperloglog/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@
use std::cmp;
use std::collections::hash_map::DefaultHasher;
use std::fmt;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::hash::{BuildHasher, BuildHasherDefault, Hash};
use std::marker::PhantomData;

use crate::hyperloglog_data::{
use crate::hyperloglog::data::{
BIAS_DATA_OFFSET, BIAS_DATA_VEC, POW2MINX, RAW_ESTIMATE_DATA_OFFSET, RAW_ESTIMATE_DATA_VEC,
THRESHOLD_DATA_OFFSET, THRESHOLD_DATA_VEC,
};

mod data;

/// Serde support for `pdatastructs::hyperloglog::HyperLogLog`
#[cfg(feature = "serde")]
pub mod serde;

/// A HyperLogLog is a data structure to count unique elements on a data stream.
///
/// # Examples
Expand Down Expand Up @@ -79,7 +85,7 @@ use crate::hyperloglog_data::{
/// - ["Appendix to HyperLogLog in Practice: Algorithmic Engineering of a State of the Art
/// Cardinality Estimation Algorithm", Stefan Heule, Marc Nunkesser, Alexander Hall, 2016](https://goo.gl/iU8Ig)
/// - [Wikipedia: HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog)
#[derive(Clone)]
#[derive(Clone, Eq, PartialEq)]
pub struct HyperLogLog<T, B = BuildHasherDefault<DefaultHasher>>
where
T: Hash + ?Sized,
Expand Down Expand Up @@ -114,14 +120,27 @@ where
{
/// Same as `new` but with a specific `BuildHasher`.
///
/// All `2^b` registers start out as zero.
///
/// # Panics
///
/// Panics if `b` is not in the range `4..=18`.
pub fn with_hash(b: usize, buildhasher: B) -> Self {
    // Validate `b` before it is used as a shift amount and as an allocation size. Without this
    // check an out-of-range value would overflow the shift or allocate an oversized register
    // vector before `with_registers_and_hash` gets the chance to reject it.
    assert!(
        (4..=18).contains(&b),
        "b ({}) must be larger or equal than 4 and smaller or equal than 18",
        b
    );

    let m = 1_usize << b;
    let registers = vec![0; m];
    Self::with_registers_and_hash(b, registers, buildhasher)
}

/// Same as `new` but with pre-initialized registers and a specific `BuildHasher`.
pub fn with_registers_and_hash(b: usize, registers: Vec<u8>, buildhasher: B) -> Self {
assert!(
(4..=18).contains(&b),
"b ({}) must be larger or equal than 4 and smaller or equal than 18",
b
);

let m = 1_usize << b;
let registers = vec![0; m];
let len = registers.len();
assert!(
m == len,
"registers must have length of {}, but had {}",
m,
len
);
Self {
registers,
b,
Expand All @@ -140,6 +159,13 @@ where
self.registers.len()
}

/// Borrow the raw register values.
///
/// Handy when the structure has to be persisted or serialized by some means other than Serde.
pub fn registers(&self) -> &[u8] {
    self.registers.as_slice()
}

/// Get `BuildHasher`.
pub fn buildhasher(&self) -> &B {
&self.buildhasher
Expand All @@ -152,15 +178,18 @@ where

/// Adds an element to the HyperLogLog.
pub fn add(&mut self, obj: &T) {
let mut hasher = self.buildhasher.build_hasher();
obj.hash(&mut hasher);
let h: u64 = hasher.finish();
self.add_hashed(self.buildhasher.hash_one(obj));
}

// split h into:
/// Adds an already-hashed element to the HyperLogLog
///
/// Note: Make sure to use the same hasher as the rest of the HyperLogLog when hashing values on your own
pub fn add_hashed(&mut self, hashed_value: u64) {
// split hashed_value into:
// - w = 64 - b upper bits
// - j = b lower bits
let w = h >> self.b;
let j = h - (w << self.b); // no 1 as in the paper since register indices are 0-based
let w = hashed_value >> self.b;
let j = hashed_value - (w << self.b); // no 1 as in the paper since register indices are 0-based

// p = leftmost bit (1-based count)
let p = w.leading_zeros() + 1 - (self.b as u32);
Expand Down Expand Up @@ -328,9 +357,10 @@ where
}
}

impl<T> fmt::Debug for HyperLogLog<T>
impl<T, B> fmt::Debug for HyperLogLog<T, B>
where
T: Hash + ?Sized,
B: BuildHasher + Clone + Eq,
{
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "HyperLogLog {{ b: {} }}", self.b)
Expand Down Expand Up @@ -363,7 +393,7 @@ where
mod tests {
use super::HyperLogLog;
use crate::hash_utils::BuildHasherSeeded;
use crate::hyperloglog_data::{RAW_ESTIMATE_DATA_OFFSET, RAW_ESTIMATE_DATA_VEC};
use crate::hyperloglog::data::{RAW_ESTIMATE_DATA_OFFSET, RAW_ESTIMATE_DATA_VEC};
use crate::test_util::{assert_send, NotSend};

#[test]
Expand Down Expand Up @@ -720,4 +750,27 @@ mod tests {
let hll: HyperLogLog<NotSend> = HyperLogLog::new(4);
assert_send(&hll);
}

#[test]
fn reconstruct() {
    // Round-trip: a sketch rebuilt from the exported registers with the same `b` and the same
    // hasher must compare equal to the sketch it was exported from.
    let bits = 4;
    let build_hasher = BuildHasherSeeded::new(0);

    let mut original = HyperLogLog::with_hash(bits, build_hasher);
    original.add("abc");

    let snapshot = original.registers().to_vec();
    let rebuilt = HyperLogLog::with_registers_and_hash(bits, snapshot, build_hasher);
    assert_eq!(original, rebuilt);
}

#[test]
#[should_panic(expected = "registers must have length of 16, but had 0")]
fn reconstruct_panics() {
    let h = BuildHasherSeeded::new(0);
    let b = 4;

    // An empty register vector does not match the 2^4 = 16 registers required for b = 4, so the
    // constructor must panic. The element type is spelled out explicitly because nothing else in
    // this test would let the compiler infer it.
    let _ = HyperLogLog::<str, _>::with_registers_and_hash(b, vec![], h);
}
}
Loading

0 comments on commit 6927ec1

Please sign in to comment.