diff --git a/encodings/roaring/src/boolean/compress.rs b/encodings/roaring/src/boolean/compress.rs
index b2a6b4d75e..6a93f21f85 100644
--- a/encodings/roaring/src/boolean/compress.rs
+++ b/encodings/roaring/src/boolean/compress.rs
@@ -6,14 +6,7 @@ use crate::RoaringBoolArray;
 
 pub fn roaring_bool_encode(bool_array: BoolArray) -> VortexResult {
     let mut bitmap = Bitmap::new();
-    bitmap.extend(
-        bool_array
-            .boolean_buffer()
-            .iter()
-            .enumerate()
-            .filter(|(_, b)| *b)
-            .map(|(i, _)| i as u32),
-    );
+    bitmap.extend(bool_array.boolean_buffer().set_indices().map(|i| i as u32));
     bitmap.run_optimize();
     bitmap.shrink_to_fit();
 
diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs
index 2069d05bae..b557a509e1 100644
--- a/encodings/roaring/src/boolean/mod.rs
+++ b/encodings/roaring/src/boolean/mod.rs
@@ -1,11 +1,13 @@
+use std::collections::HashMap;
 use std::fmt::Debug;
 
 use arrow_buffer::{BooleanBuffer, Buffer as ArrowBuffer};
 pub use compress::*;
+use croaring::Native;
 pub use croaring::{Bitmap, Portable};
 use serde::{Deserialize, Serialize};
 use vortex::array::BoolArray;
-use vortex::stats::{ArrayStatisticsCompute, StatsSet};
+use vortex::stats::{Stat, StatsSet};
 use vortex::validity::{ArrayValidity, LogicalValidity, Validity};
 use vortex::variants::{ArrayVariants, BoolArrayTrait};
 use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
@@ -19,6 +21,7 @@ use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult};
 
 mod compress;
 mod compute;
+mod stats;
 
 impl_encoding!("vortex.roaring_bool", 17u16, RoaringBool);
 
@@ -32,14 +35,29 @@ impl RoaringBoolArray {
         if length < bitmap.cardinality() as usize {
             vortex_bail!("RoaringBoolArray length is less than bitmap cardinality")
         } else {
+            let roaring_stats = bitmap.statistics();
+            let stats = StatsSet::from(HashMap::from([
+                (
+                    Stat::Min,
+                    (roaring_stats.cardinality == length as u64).into(),
+                ),
+                (Stat::Max, (roaring_stats.cardinality > 0).into()),
+                (
+                    Stat::IsConstant,
+                    (roaring_stats.cardinality == length as u64 || roaring_stats.cardinality == 0)
+                        .into(),
+                ),
+                (Stat::TrueCount, roaring_stats.cardinality.into()),
+            ]));
+
             Ok(Self {
                 typed: TypedArray::try_from_parts(
                     DType::Bool(NonNullable),
                     length,
                     RoaringBoolMetadata { length },
-                    Some(Buffer::from(bitmap.serialize::<Portable>())),
+                    Some(Buffer::from(bitmap.serialize::<Native>())),
                     vec![].into(),
-                    StatsSet::new(),
+                    stats,
                 )?,
             })
         }
@@ -47,7 +65,7 @@ impl RoaringBoolArray {
 
     pub fn bitmap(&self) -> Bitmap {
         //TODO(@jdcasale): figure out a way to avoid this deserialization per-call
-        Bitmap::deserialize::<Portable>(self.buffer().as_ref())
+        Bitmap::deserialize::<Native>(self.buffer().as_ref())
     }
 
     pub fn encode(array: Array) -> VortexResult {
@@ -93,8 +111,6 @@ impl AcceptArrayVisitor for RoaringBoolArray {
     }
 }
 
-impl ArrayStatisticsCompute for RoaringBoolArray {}
-
 impl ArrayValidity for RoaringBoolArray {
     fn is_valid(&self, _index: usize) -> bool {
         true
diff --git a/encodings/roaring/src/boolean/stats.rs b/encodings/roaring/src/boolean/stats.rs
new file mode 100644
index 0000000000..62c1552329
--- /dev/null
+++ b/encodings/roaring/src/boolean/stats.rs
@@ -0,0 +1,203 @@
+use std::collections::HashMap;
+
+use croaring::Bitset;
+use vortex::stats::{ArrayStatisticsCompute, Stat, StatsSet};
+use vortex_error::{vortex_err, VortexResult};
+
+use crate::RoaringBoolArray;
+
+impl ArrayStatisticsCompute for RoaringBoolArray {
+    /// Computes the order-dependent statistics (`IsSorted`, `IsStrictSorted`, `RunCount`).
+    ///
+    /// Returns an empty [`StatsSet`] for an empty array, and an error if the bitmap cannot
+    /// be converted to a bitset.
+    fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
+        if self.is_empty() {
+            return Ok(StatsSet::new());
+        }
+
+        // Only IsSorted, IsStrictSorted and RunCount need computing here; all other stats
+        // are populated on construction.
+        let bitmap = self.bitmap();
+        BitmapStats(
+            bitmap
+                .to_bitset()
+                .ok_or_else(|| vortex_err!("Bitmap to Bitset conversion run out of memory"))?,
+            self.len(),
+            bitmap.statistics().cardinality,
+        )
+        .compute_statistics(stat)
+    }
+}
+
+// Underlying bitset, length in bits, cardinality (true count) of the bitset
+struct BitmapStats(Bitset, usize, u64);
+
+impl ArrayStatisticsCompute for BitmapStats {
+    /// Scans the first `self.1` bits and reports `IsSorted`, `IsStrictSorted` and `RunCount`.
+    fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
+        let bitset_slice = self.0.as_slice();
+        // This is a weird case where the bitset is full of false values
+        // sometimes it will not allocate any underlying storage
+        // TODO(robert): This likely can be simplified after https://github.com/RoaringBitmap/CRoaring/issues/660
+        if bitset_slice.is_empty() {
+            return Ok(StatsSet::from(HashMap::from([
+                (Stat::IsSorted, true.into()),
+                (Stat::IsStrictSorted, (self.1 == 1).into()),
+                (Stat::RunCount, 1.into()),
+            ])));
+        }
+
+        let whole_chunks = self.1 / 64;
+        let last_chunk_len = self.1 % 64;
+        // The bitset is not guaranteed to span the full array length (it appears to be sized
+        // from the highest set bit), so words past its end are read as all-false instead of
+        // being indexed directly, which would panic.
+        let word_at = |idx: usize| bitset_slice.get(idx).copied().unwrap_or(0);
+        let first_bool = bitset_slice[0] & 1 == 1;
+        let mut stats = RoaringBoolStatsAccumulator::new(first_bool);
+        for idx in 0..whole_chunks {
+            stats.next(word_at(idx));
+        }
+        // A length that is a multiple of 64 has no partial tail word to scan.
+        if last_chunk_len > 0 {
+            stats.next_up_to_length(word_at(whole_chunks), last_chunk_len);
+        }
+        Ok(stats.finish(self.2))
+    }
+}
+
+/// Running sortedness and run-count state over a sequence of booleans.
+struct RoaringBoolStatsAccumulator {
+    prev: bool,
+    is_sorted: bool,
+    run_count: usize,
+    len: usize,
+}
+
+impl RoaringBoolStatsAccumulator {
+    /// Starts an accumulator whose first run holds `first_value`.
+    fn new(first_value: bool) -> Self {
+        Self {
+            prev: first_value,
+            is_sorted: true,
+            run_count: 1,
+            len: 0,
+        }
+    }
+
+    /// Consumes the lowest `len` bits of `next`, least significant bit first.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `len` is greater than 64.
+    pub fn next_up_to_length(&mut self, next: u64, len: usize) {
+        assert!(len <= 64);
+        self.len += len;
+        for i in 0..len {
+            let current = ((next >> i) & 1) == 1;
+            // Booleans are sorted true > false so we aren't sorted if we switched from true to false value
+            if !current && self.prev {
+                self.is_sorted = false;
+            }
+            if current != self.prev {
+                self.run_count += 1;
+                self.prev = current;
+            }
+        }
+    }
+
+    /// Consumes all 64 bits of `next`.
+    pub fn next(&mut self, next: u64) {
+        self.next_up_to_length(next, 64)
+    }
+
+    /// Builds the final stats; `cardinality` is the true count of the scanned bits.
+    pub fn finish(self, cardinality: u64) -> StatsSet {
+        StatsSet::from(HashMap::from([
+            (Stat::IsSorted, self.is_sorted.into()),
+            (
+                Stat::IsStrictSorted,
+                (self.is_sorted && (self.len < 2 || (self.len == 2 && cardinality == 1))).into(),
+            ),
+            (Stat::RunCount, self.run_count.into()),
+        ]))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use vortex::array::BoolArray;
+    use vortex::stats::ArrayStatistics;
+    use vortex::IntoArray;
+
+    use crate::RoaringBoolArray;
+
+    #[test]
+    #[cfg_attr(miri, ignore)]
+    fn bool_stats() {
+        let bool_arr = RoaringBoolArray::encode(
+            BoolArray::from(vec![false, false, true, true, false, true, true, false]).into_array(),
+        )
+        .unwrap();
+        assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
+        assert!(!bool_arr.statistics().compute_is_sorted().unwrap());
+        assert!(!bool_arr.statistics().compute_is_constant().unwrap());
+        assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
+        assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
+        assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 5);
+        assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 4);
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore)]
+    fn strict_sorted() {
+        let bool_arr_1 =
+            RoaringBoolArray::encode(BoolArray::from(vec![false, true]).into_array()).unwrap();
+        assert!(bool_arr_1.statistics().compute_is_strict_sorted().unwrap());
+        assert!(bool_arr_1.statistics().compute_is_sorted().unwrap());
+
+        // TODO(robert): Reenable after https://github.com/RoaringBitmap/CRoaring/issues/660 is resolved
+        // let bool_arr_2 =
+        //     RoaringBoolArray::encode(BoolArray::from(vec![true]).into_array()).unwrap();
+        // assert!(bool_arr_2.statistics().compute_is_strict_sorted().unwrap());
+        // assert!(bool_arr_2.statistics().compute_is_sorted().unwrap());
+
+        let bool_arr_3 =
+            RoaringBoolArray::encode(BoolArray::from(vec![false]).into_array()).unwrap();
+        assert!(bool_arr_3.statistics().compute_is_strict_sorted().unwrap());
+        assert!(bool_arr_3.statistics().compute_is_sorted().unwrap());
+
+        // TODO(robert): Reenable after https://github.com/RoaringBitmap/CRoaring/issues/660 is resolved
+        // let bool_arr_4 =
+        //     RoaringBoolArray::encode(BoolArray::from(vec![true, false]).into_array()).unwrap();
+        // assert!(!bool_arr_4.statistics().compute_is_strict_sorted().unwrap());
+        // assert!(!bool_arr_4.statistics().compute_is_sorted().unwrap());
+
+        let bool_arr_5 =
+            RoaringBoolArray::encode(BoolArray::from(vec![false, true, true]).into_array())
+                .unwrap();
+        assert!(!bool_arr_5.statistics().compute_is_strict_sorted().unwrap());
+        assert!(bool_arr_5.statistics().compute_is_sorted().unwrap());
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore)]
+    fn length_exceeds_bitset_storage() {
+        // 64 values: the length is a whole number of words, so there is no partial tail word.
+        let mut aligned = vec![false; 64];
+        aligned[63] = true;
+        let aligned_arr =
+            RoaringBoolArray::encode(BoolArray::from(aligned).into_array()).unwrap();
+        assert!(aligned_arr.statistics().compute_is_sorted().unwrap());
+        assert_eq!(aligned_arr.statistics().compute_run_count().unwrap(), 2);
+
+        // 200 values with only index 1 set: the trailing falses may not be backed by words.
+        let mut trailing = vec![false; 200];
+        trailing[1] = true;
+        let trailing_arr =
+            RoaringBoolArray::encode(BoolArray::from(trailing).into_array()).unwrap();
+        assert!(!trailing_arr.statistics().compute_is_sorted().unwrap());
+        assert_eq!(trailing_arr.statistics().compute_run_count().unwrap(), 3);
+    }
+}