Skip to content

Commit

Permalink
fix: RoaringInt can_compress was erroneously always None (#1004)
Browse files Browse the repository at this point in the history
(unless a user produced a RoaringIntArray manually, in which case it was
Some... but a no-op)
  • Loading branch information
lwwmanning authored Oct 9, 2024
1 parent f15b162 commit 4aa30c0
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 48 deletions.
69 changes: 67 additions & 2 deletions encodings/roaring/src/integer/compute.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use croaring::Bitmap;
use vortex::compute::unary::ScalarAtFn;
use vortex::compute::ArrayCompute;
use vortex::compute::{ArrayCompute, SliceFn};
use vortex::{Array, IntoArray};
use vortex_dtype::PType;
use vortex_error::{vortex_err, VortexResult, VortexUnwrap as _};
use vortex_scalar::Scalar;
Expand All @@ -10,12 +12,16 @@ impl ArrayCompute for RoaringIntArray {
fn scalar_at(&self) -> Option<&dyn ScalarAtFn> {
Some(self)
}

fn slice(&self) -> Option<&dyn SliceFn> {
Some(self)
}
}

impl ScalarAtFn for RoaringIntArray {
fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
let bitmap_value = self
.bitmap()
.owned_bitmap()
.select(index as u32)
.ok_or_else(|| vortex_err!(OutOfBounds: index, 0, self.len()))?;
let scalar: Scalar = match self.metadata().ptype {
Expand All @@ -32,3 +38,62 @@ impl ScalarAtFn for RoaringIntArray {
<Self as ScalarAtFn>::scalar_at(self, index).vortex_unwrap()
}
}

impl SliceFn for RoaringIntArray {
fn slice(&self, start: usize, stop: usize) -> VortexResult<Array> {
let mut bitmap = self.owned_bitmap();
let start = bitmap
.select(start as u32)
.ok_or_else(|| vortex_err!(OutOfBounds: start, 0, self.len()))?;
let stop_inclusive = if stop == self.len() {
bitmap.maximum().unwrap_or(0)
} else {
bitmap
.select(stop.saturating_sub(1) as u32)
.ok_or_else(|| vortex_err!(OutOfBounds: stop, 0, self.len()))?
};

bitmap.and_inplace(&Bitmap::from_range(start..=stop_inclusive));
Self::try_new(bitmap, self.ptype()).map(IntoArray::into_array)
}
}

#[cfg(test)]
mod tests {
use vortex::array::PrimitiveArray;
use vortex::compute::slice;
use vortex::compute::unary::scalar_at;

use super::*;

#[test]
#[cfg_attr(miri, ignore)]
pub fn test_scalar_at() {
let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array();
let array = RoaringIntArray::encode(ints).unwrap();

assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into());
assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into());
}

#[test]
#[cfg_attr(miri, ignore)]
fn test_slice() {
let array = RoaringIntArray::try_new(Bitmap::from_range(10..20), PType::U32).unwrap();

let sliced = slice(&array, 0, 5).unwrap();
assert_eq!(sliced.len(), 5);
assert_eq!(scalar_at(&sliced, 0).unwrap(), 10u32.into());
assert_eq!(scalar_at(&sliced, 4).unwrap(), 14u32.into());

let sliced = slice(&array, 5, 10).unwrap();
assert_eq!(sliced.len(), 5);
assert_eq!(scalar_at(&sliced, 0).unwrap(), 15u32.into());
assert_eq!(scalar_at(&sliced, 4).unwrap(), 19u32.into());

let sliced = slice(&sliced, 3, 5).unwrap();
assert_eq!(sliced.len(), 2);
assert_eq!(scalar_at(&sliced, 0).unwrap(), 18u32.into());
assert_eq!(scalar_at(&sliced, 1).unwrap(), 19u32.into());
}
}
76 changes: 48 additions & 28 deletions encodings/roaring/src/integer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@ pub use compress::*;
use croaring::{Bitmap, Portable};
use serde::{Deserialize, Serialize};
use vortex::array::PrimitiveArray;
use vortex::compute::unary::try_cast;
use vortex::encoding::ids;
use vortex::stats::{ArrayStatisticsCompute, StatsSet};
use vortex::validity::{ArrayValidity, LogicalValidity};
use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
use vortex::validity::{ArrayValidity, LogicalValidity, Validity};
use vortex::variants::{ArrayVariants, PrimitiveArrayTrait};
use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::{impl_encoding, Array, ArrayTrait, Canonical, IntoArray, IntoCanonical, TypedArray};
use vortex::{
impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoCanonical,
TypedArray,
};
use vortex_buffer::Buffer;
use vortex_dtype::Nullability::NonNullable;
use vortex_dtype::{DType, PType};
Expand All @@ -34,9 +38,25 @@ impl Display for RoaringIntMetadata {
impl RoaringIntArray {
pub fn try_new(bitmap: Bitmap, ptype: PType) -> VortexResult<Self> {
if !ptype.is_unsigned_int() {
vortex_bail!("RoaringInt expected unsigned int");
vortex_bail!(MismatchedTypes: "unsigned int", ptype);
}

let length = bitmap.statistics().cardinality as usize;
let max = bitmap.maximum();
if max.map(|mv| mv as u64 > ptype.max_value()).unwrap_or(false) {
vortex_bail!(
"RoaringInt maximum value is greater than the maximum value for the primitive type"
);
}

let mut stats = StatsSet::new();
stats.set(Stat::NullCount, 0.into());
stats.set(Stat::Max, max.into());
stats.set(Stat::Min, bitmap.minimum().into());
stats.set(Stat::IsConstant, (length <= 1).into());
stats.set(Stat::IsSorted, true.into());
stats.set(Stat::IsStrictSorted, true.into());

Ok(Self {
typed: TypedArray::try_from_parts(
DType::Primitive(ptype, NonNullable),
Expand All @@ -49,8 +69,7 @@ impl RoaringIntArray {
})
}

pub fn bitmap(&self) -> Bitmap {
//TODO(@jdcasale): figure out a way to avoid this deserialization per-call
pub fn owned_bitmap(&self) -> Bitmap {
Bitmap::deserialize::<Portable>(
self.as_ref()
.buffer()
Expand Down Expand Up @@ -88,39 +107,40 @@ impl ArrayValidity for RoaringIntArray {
}

fn logical_validity(&self) -> LogicalValidity {
LogicalValidity::AllValid(self.bitmap().iter().count())
LogicalValidity::AllValid(self.len())
}
}

impl IntoCanonical for RoaringIntArray {
fn into_canonical(self) -> VortexResult<Canonical> {
todo!()
try_cast(
PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable),
self.dtype(),
)
.and_then(Array::into_canonical)
}
}

impl AcceptArrayVisitor for RoaringIntArray {
fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
todo!()
fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
visitor.visit_buffer(
self.as_ref()
.buffer()
.vortex_expect("Missing buffer in RoaringIntArray"),
)
}
}

impl ArrayStatisticsCompute for RoaringIntArray {}

#[cfg(test)]
mod test {
use vortex::array::PrimitiveArray;
use vortex::compute::unary::scalar_at;
use vortex::IntoArray;

use crate::RoaringIntArray;

#[test]
#[cfg_attr(miri, ignore)]
pub fn test_scalar_at() {
let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array();
let array = RoaringIntArray::encode(ints).unwrap();

assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into());
assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into());
impl ArrayStatisticsCompute for RoaringIntArray {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
// possibly faster to write an accumulator over the iterator, though not necessarily
if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount {
let primitive =
PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable);
primitive.statistics().compute(stat);
Ok(primitive.statistics().to_set())
} else {
Ok(StatsSet::new())
}
}
}
23 changes: 12 additions & 11 deletions vortex-array/src/array/chunked/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,18 @@ fn take_strict_sorted(chunked: &ChunkedArray, indices: &Array) -> VortexResult<A
// Adjust the indices so they're relative to the chunk
// Note. Indices might not have a dtype big enough to fit chunk_begin after cast,
// if it does cast the scalar otherwise upcast the indices.
let chunk_indices = if chunk_begin < PType::try_from(chunk_indices.dtype())?.max_value() {
subtract_scalar(
&chunk_indices,
&Scalar::from(chunk_begin).cast(chunk_indices.dtype())?,
)?
} else {
// Note. this try_cast (memory copy) is unnecessary, could instead upcast in the subtract fn.
// and avoid an extra
let u64_chunk_indices = try_cast(&chunk_indices, PType::U64.into())?;
subtract_scalar(&u64_chunk_indices, &chunk_begin.into())?
};
let chunk_indices =
if chunk_begin < PType::try_from(chunk_indices.dtype())?.max_value() as usize {
subtract_scalar(
&chunk_indices,
&Scalar::from(chunk_begin).cast(chunk_indices.dtype())?,
)?
} else {
// Note. this try_cast (memory copy) is unnecessary, could instead upcast in the subtract fn.
// and avoid an extra
let u64_chunk_indices = try_cast(&chunk_indices, PType::U64.into())?;
subtract_scalar(&u64_chunk_indices, &chunk_begin.into())?
};

indices_by_chunk[chunk_idx] = Some(chunk_indices);

Expand Down
4 changes: 2 additions & 2 deletions vortex-dtype/src/ptype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ impl PType {
self.byte_width() * 8
}

pub const fn max_value(&self) -> usize {
match_each_integer_ptype!(self, |$T| $T::MAX as usize)
pub const fn max_value(&self) -> u64 {
match_each_integer_ptype!(self, |$T| $T::MAX as u64)
}

pub fn to_signed(self) -> Self {
Expand Down
33 changes: 28 additions & 5 deletions vortex-sampling-compressor/src/compressors/roaring_int.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@ impl EncodingCompressor for RoaringIntCompressor {
}

fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
// Only support primitive enc arrays
if array.encoding().id() != RoaringInt::ID {
return None;
}

// Only support non-nullable uint arrays
if !array.dtype().is_unsigned_int() || array.dtype().is_nullable() {
return None;
Expand Down Expand Up @@ -60,3 +55,31 @@ impl EncodingCompressor for RoaringIntCompressor {
HashSet::from([&RoaringIntEncoding as EncodingRef])
}
}

#[cfg(test)]
mod tests {
use vortex::array::PrimitiveArray;
use vortex::validity::Validity;
use vortex::IntoArray;
use vortex_roaring::RoaringIntArray;

use crate::compressors::roaring_int::RoaringIntCompressor;
use crate::compressors::EncodingCompressor as _;
use crate::SamplingCompressor;

#[test]
#[cfg_attr(miri, ignore)]
fn test_roaring_int_compressor() {
let array =
PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array();
assert!(RoaringIntCompressor.can_compress(&array).is_some());
let compressed = RoaringIntCompressor
.compress(&array, None, SamplingCompressor::default())
.unwrap();
assert_eq!(compressed.array.len(), 5);
assert!(compressed.path.is_some());

let roaring = RoaringIntArray::try_from(compressed.array).unwrap();
assert!(roaring.owned_bitmap().contains_range(1..=5));
}
}

0 comments on commit 4aa30c0

Please sign in to comment.