Skip to content

Commit

Permalink
Compute stats for RoaringBoolArray (#874)
Browse files Browse the repository at this point in the history
  • Loading branch information
robert3005 authored Sep 19, 2024
1 parent f14f45a commit 129aa2b
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 14 deletions.
9 changes: 1 addition & 8 deletions encodings/roaring/src/boolean/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,7 @@ use crate::RoaringBoolArray;

pub fn roaring_bool_encode(bool_array: BoolArray) -> VortexResult<RoaringBoolArray> {
let mut bitmap = Bitmap::new();
bitmap.extend(
bool_array
.boolean_buffer()
.iter()
.enumerate()
.filter(|(_, b)| *b)
.map(|(i, _)| i as u32),
);
bitmap.extend(bool_array.boolean_buffer().set_indices().map(|i| i as u32));
bitmap.run_optimize();
bitmap.shrink_to_fit();

Expand Down
28 changes: 22 additions & 6 deletions encodings/roaring/src/boolean/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use std::collections::HashMap;
use std::fmt::Debug;

use arrow_buffer::{BooleanBuffer, Buffer as ArrowBuffer};
pub use compress::*;
use croaring::Native;
pub use croaring::{Bitmap, Portable};
use serde::{Deserialize, Serialize};
use vortex::array::BoolArray;
use vortex::stats::{ArrayStatisticsCompute, StatsSet};
use vortex::stats::{Stat, StatsSet};
use vortex::validity::{ArrayValidity, LogicalValidity, Validity};
use vortex::variants::{ArrayVariants, BoolArrayTrait};
use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
Expand All @@ -19,6 +21,7 @@ use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult};

mod compress;
mod compute;
mod stats;

impl_encoding!("vortex.roaring_bool", 17u16, RoaringBool);

Expand All @@ -32,22 +35,37 @@ impl RoaringBoolArray {
if length < bitmap.cardinality() as usize {
vortex_bail!("RoaringBoolArray length is less than bitmap cardinality")
} else {
let roaring_stats = bitmap.statistics();
let stats = StatsSet::from(HashMap::from([
(
Stat::Min,
(roaring_stats.cardinality == length as u64).into(),
),
(Stat::Max, (roaring_stats.cardinality > 0).into()),
(
Stat::IsConstant,
(roaring_stats.cardinality == length as u64 || roaring_stats.cardinality == 0)
.into(),
),
(Stat::TrueCount, roaring_stats.cardinality.into()),
]));

Ok(Self {
typed: TypedArray::try_from_parts(
DType::Bool(NonNullable),
length,
RoaringBoolMetadata { length },
Some(Buffer::from(bitmap.serialize::<Portable>())),
Some(Buffer::from(bitmap.serialize::<Native>())),
vec![].into(),
StatsSet::new(),
stats,
)?,
})
}
}

/// Returns the roaring [`Bitmap`] stored in this array's buffer.
///
/// The bitmap is deserialized from the buffer on every call; nothing is cached (see the
/// TODO below).
pub fn bitmap(&self) -> Bitmap {
//TODO(@jdcasale): figure out a way to avoid this deserialization per-call
// NOTE(review): this diff rendering shows both a `Portable` and a `Native` deserialize
// line; presumably only one of them is present in the actual file - confirm against the
// repository. Whichever format is used here has to match the one used when serializing.
Bitmap::deserialize::<Portable>(self.buffer().as_ref())
Bitmap::deserialize::<Native>(self.buffer().as_ref())
}

pub fn encode(array: Array) -> VortexResult<Array> {
Expand Down Expand Up @@ -93,8 +111,6 @@ impl AcceptArrayVisitor for RoaringBoolArray {
}
}

impl ArrayStatisticsCompute for RoaringBoolArray {}

impl ArrayValidity for RoaringBoolArray {
fn is_valid(&self, _index: usize) -> bool {
true
Expand Down
162 changes: 162 additions & 0 deletions encodings/roaring/src/boolean/stats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
use std::collections::HashMap;

use croaring::Bitset;
use vortex::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::{vortex_err, VortexResult};

use crate::RoaringBoolArray;

impl ArrayStatisticsCompute for RoaringBoolArray {
    /// Computes the order-dependent statistics of the array.
    ///
    /// An empty array yields an empty [`StatsSet`]. Otherwise the bitmap is converted to a
    /// bitset and the work is delegated to [`BitmapStats`].
    ///
    /// # Errors
    ///
    /// Returns an error when the bitmap cannot be converted into a bitset.
    fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
        if self.is_empty() {
            return Ok(StatsSet::new());
        }

        // Only IsSorted, IsStrictSorted and RunCount are left to compute here; every other
        // stat has already been populated on construction.
        let roaring = self.bitmap();
        let bitset = roaring
            .to_bitset()
            .ok_or_else(|| vortex_err!("Bitmap to Bitset conversion run out of memory"))?;
        let bit_len = self.len();
        let true_count = roaring.statistics().cardinality;

        BitmapStats(bitset, bit_len, true_count).compute_statistics(stat)
    }
}

/// Input bundle for computing statistics over a bitset.
///
/// Tuple fields, in order:
/// 0. the underlying bitset,
/// 1. the length in bits,
/// 2. the cardinality (true count) of the bitset.
struct BitmapStats(Bitset, usize, u64);

impl ArrayStatisticsCompute for BitmapStats {
    /// Computes `IsSorted`, `IsStrictSorted` and `RunCount` by scanning the bitset one
    /// 64-bit word at a time.
    ///
    /// The requested stat is ignored; all three statistics are always returned together.
    ///
    /// The bitset may hold fewer words than the logical length (`self.1`) requires - in the
    /// extreme case an all-false bitset has no storage at all. Words that are not present
    /// are read as zero, i.e. as runs of `false`, instead of being indexed directly, which
    /// would panic.
    fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
        let words = self.0.as_slice();
        // This is a weird case where the bitset is full of false values
        // sometimes it will not allocate any underlying storage
        // TODO(robert): This likely can be simplified after https://github.com/RoaringBitmap/CRoaring/issues/660
        if words.is_empty() {
            return Ok(StatsSet::from(HashMap::from([
                (Stat::IsSorted, true.into()),
                (Stat::IsStrictSorted, (self.1 == 1).into()),
                (Stat::RunCount, 1.into()),
            ])));
        }

        let whole_chunks = self.1 / 64;
        let last_chunk_len = self.1 % 64;
        // Bounds-safe word access: anything past the end of the bitset is all `false`.
        let word_at = |index: usize| words.get(index).copied().unwrap_or(0);

        // `words` is non-empty here, so reading the first word cannot panic.
        let first_bool = words[0] & 1 == 1;
        let mut stats = RoaringBoolStatsAccumulator::new(first_bool);
        for index in 0..whole_chunks {
            stats.next(word_at(index));
        }
        // When the length is an exact multiple of 64 there is no partial trailing word, and
        // the bitset is not guaranteed to contain a word at `whole_chunks`.
        if last_chunk_len > 0 {
            stats.next_up_to_length(word_at(whole_chunks), last_chunk_len);
        }
        Ok(stats.finish(self.2))
    }
}

/// Running state for a single pass over the bits of a boolean array.
struct RoaringBoolStatsAccumulator {
/// Value of the most recently seen bit; seeded with the first value of the array.
prev: bool,
/// Stays `true` until a `true` -> `false` transition is observed.
is_sorted: bool,
/// Number of runs seen so far; starts at 1 and grows on every value change.
run_count: usize,
/// Total number of bits consumed so far.
len: usize,
}

impl RoaringBoolStatsAccumulator {
    /// Creates an accumulator seeded with the first value of the array, so that the first
    /// bit consumed never counts as the start of a new run.
    fn new(first_value: bool) -> Self {
        Self {
            prev: first_value,
            is_sorted: true,
            run_count: 1,
            len: 0,
        }
    }

    /// Consumes the lowest `len` bits of `next`, least significant bit first.
    ///
    /// # Panics
    ///
    /// Panics if `len` is greater than 64.
    pub fn next_up_to_length(&mut self, next: u64, len: usize) {
        assert!(len <= 64);
        self.len += len;
        for shift in 0..len {
            let bit = (next >> shift) & 1 == 1;
            if bit == self.prev {
                continue;
            }
            // The value changed. Booleans order as false < true, so a change away from
            // `true` (i.e. to `false`) means the array is not sorted.
            if self.prev {
                self.is_sorted = false;
            }
            self.run_count += 1;
            self.prev = bit;
        }
    }

    /// Consumes all 64 bits of `next`.
    pub fn next(&mut self, next: u64) {
        self.next_up_to_length(next, 64)
    }

    /// Consumes the accumulator and returns the `IsSorted`, `IsStrictSorted` and `RunCount`
    /// statistics. `cardinality` is the number of `true` values in the array.
    pub fn finish(self, cardinality: u64) -> StatsSet {
        // A boolean array can only be strictly sorted when it has fewer than two values or
        // when it has exactly two values of which exactly one is `true`.
        let short_enough = self.len < 2 || (self.len == 2 && cardinality == 1);
        let strict_sorted = self.is_sorted && short_enough;

        StatsSet::from(HashMap::from([
            (Stat::IsSorted, self.is_sorted.into()),
            (Stat::IsStrictSorted, strict_sorted.into()),
            (Stat::RunCount, self.run_count.into()),
        ]))
    }
}

#[cfg(test)]
mod test {
    use vortex::array::BoolArray;
    use vortex::stats::ArrayStatistics;
    use vortex::IntoArray;

    use crate::RoaringBoolArray;

    /// Checks every statistic of a small array that mixes both values.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn bool_stats() {
        let values = vec![false, false, true, true, false, true, true, false];
        let encoded = RoaringBoolArray::encode(BoolArray::from(values).into_array()).unwrap();

        assert!(!encoded.statistics().compute_is_strict_sorted().unwrap());
        assert!(!encoded.statistics().compute_is_sorted().unwrap());
        assert!(!encoded.statistics().compute_is_constant().unwrap());
        assert!(!encoded.statistics().compute_min::<bool>().unwrap());
        assert!(encoded.statistics().compute_max::<bool>().unwrap());
        assert_eq!(encoded.statistics().compute_run_count().unwrap(), 5);
        assert_eq!(encoded.statistics().compute_true_count().unwrap(), 4);
    }

    /// Checks the sortedness statistics on arrays of one to three values.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn strict_sorted() {
        let encode = |values: Vec<bool>| {
            RoaringBoolArray::encode(BoolArray::from(values).into_array()).unwrap()
        };

        let false_true = encode(vec![false, true]);
        assert!(false_true.statistics().compute_is_strict_sorted().unwrap());
        assert!(false_true.statistics().compute_is_sorted().unwrap());

        // TODO(robert): Reenable after https://github.com/RoaringBitmap/CRoaring/issues/660 is resolved
        // let single_true = encode(vec![true]);
        // assert!(single_true.statistics().compute_is_strict_sorted().unwrap());
        // assert!(single_true.statistics().compute_is_sorted().unwrap());

        let single_false = encode(vec![false]);
        assert!(single_false.statistics().compute_is_strict_sorted().unwrap());
        assert!(single_false.statistics().compute_is_sorted().unwrap());

        // TODO(robert): Reenable after https://github.com/RoaringBitmap/CRoaring/issues/660 is resolved
        // let true_false = encode(vec![true, false]);
        // assert!(!true_false.statistics().compute_is_strict_sorted().unwrap());
        // assert!(!true_false.statistics().compute_is_sorted().unwrap());

        let false_true_true = encode(vec![false, true, true]);
        assert!(!false_true_true.statistics().compute_is_strict_sorted().unwrap());
        assert!(false_true_true.statistics().compute_is_sorted().unwrap());
    }
}

0 comments on commit 129aa2b

Please sign in to comment.