From 389e6a40f210c2423b8226b207a7b8b85b3950db Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 2 Oct 2024 12:07:21 -0400 Subject: [PATCH] feat: implement ALP-RD compression (#947) Fixes #10: Add ALP-RD compression. Currently our only floating point compression algorithm is standard ALP, which targets floats/doubles that are originally decimal, and thus have some natural integer they can round to when you undo the exponent. For science/math datasets, there are a lot of "real doubles", i.e. floating point numbers that use most/all of their available precision. These do not compress with standard ALP. The ALP paper authors had a solution for this called "ALP for 'Real' Doubles" / ALP-RD, which is implemented in this PR. ## Basics The key insight of ALP-RD is that even for dense floating point numbers, within a column they often share the front bits (exponent + first few bits of mantissa). We try and find the best cut-point within the leftmost 16-bits. There are generally a small number of unique values for the leftmost bits, so you can create a dictionary of fixed size (here we use the choice of 8 from the C++ implementation) which naturally bit-packs down to 3 bits. If you compress perfectly without exceptions, you can store 53 bits/value, ~17% compression. In practice the amount varies. In the comments below you can see a test with the POI dataset referenced in the ALP paper, and we replicate their results of 55 and 56 bits/value respectively. ## List of changes * Reorganized the `vortex-alp` crate. I created two top-level modules, `alp` and `alp_rd`, and moved the previous implementation into the `alp` module * Added new `ALPRDArray` in the `alp_rd` module. 
It supports both f32 and f64, and all major compute functions are implemented (save for `MaybeCompareFn` and the Accessors I will file an issue to implement these in a FLUP if alright, this PR is already quite large) * Added corresponding `ALPRDCompressor` and wired the CompressorRef everywhere I could find ALPCompressor * New benchmark for RD compression in the existing ALP benchmarks suite --- Cargo.lock | 2 + bench-vortex/src/lib.rs | 2 + bench-vortex/src/reader.rs | 2 +- encodings/alp/Cargo.toml | 2 + encodings/alp/benches/alp_compress.rs | 13 +- encodings/alp/src/{ => alp}/array.rs | 3 +- encodings/alp/src/{ => alp}/compress.rs | 3 +- encodings/alp/src/{ => alp}/compute.rs | 0 encodings/alp/src/{alp.rs => alp/mod.rs} | 16 +- encodings/alp/src/alp_rd/array.rs | 270 ++++++++++++ encodings/alp/src/alp_rd/compute/filter.rs | 52 +++ encodings/alp/src/alp_rd/compute/mod.rs | 27 ++ encodings/alp/src/alp_rd/compute/scalar_at.rs | 68 +++ encodings/alp/src/alp_rd/compute/slice.rs | 51 +++ encodings/alp/src/alp_rd/compute/take.rs | 51 +++ encodings/alp/src/alp_rd/mod.rs | 404 ++++++++++++++++++ encodings/alp/src/alp_rd/variants.rs | 15 + encodings/alp/src/lib.rs | 24 +- .../fastlanes/src/bitpacking/compress.rs | 24 ++ vortex-array/src/encoding.rs | 1 + .../src/compressors/alp_rd.rs | 78 ++++ .../src/compressors/mod.rs | 1 + vortex-sampling-compressor/src/lib.rs | 4 +- vortex-sampling-compressor/tests/smoketest.rs | 2 + 24 files changed, 1101 insertions(+), 14 deletions(-) rename encodings/alp/src/{ => alp}/array.rs (99%) rename encodings/alp/src/{ => alp}/compress.rs (99%) rename encodings/alp/src/{ => alp}/compute.rs (100%) rename encodings/alp/src/{alp.rs => alp/mod.rs} (97%) create mode 100644 encodings/alp/src/alp_rd/array.rs create mode 100644 encodings/alp/src/alp_rd/compute/filter.rs create mode 100644 encodings/alp/src/alp_rd/compute/mod.rs create mode 100644 encodings/alp/src/alp_rd/compute/scalar_at.rs create mode 100644 
encodings/alp/src/alp_rd/compute/slice.rs create mode 100644 encodings/alp/src/alp_rd/compute/take.rs create mode 100644 encodings/alp/src/alp_rd/mod.rs create mode 100644 encodings/alp/src/alp_rd/variants.rs create mode 100644 vortex-sampling-compressor/src/compressors/alp_rd.rs diff --git a/Cargo.lock b/Cargo.lock index 4ea23ee2a3..3cc43cab3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4280,10 +4280,12 @@ dependencies = [ "divan", "itertools 0.13.0", "num-traits", + "rstest", "serde", "vortex-array", "vortex-dtype", "vortex-error", + "vortex-fastlanes", "vortex-scalar", ] diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index 04f8b364db..2dc8d871e6 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -20,6 +20,7 @@ use vortex::{Array, Context, IntoArray}; use vortex_dtype::DType; use vortex_fastlanes::DeltaEncoding; use vortex_sampling_compressor::compressors::alp::ALPCompressor; +use vortex_sampling_compressor::compressors::alp_rd::ALPRDCompressor; use vortex_sampling_compressor::compressors::bitpacked::BitPackedCompressor; use vortex_sampling_compressor::compressors::date_time_parts::DateTimePartsCompressor; use vortex_sampling_compressor::compressors::dict::DictCompressor; @@ -54,6 +55,7 @@ lazy_static! { lazy_static! { pub static ref COMPRESSORS: HashSet> = [ &ALPCompressor as CompressorRef<'static>, + &ALPRDCompressor, &DictCompressor, &BitPackedCompressor, &FoRCompressor, diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs index 77e6dcca08..23eb7c2bfd 100644 --- a/bench-vortex/src/reader.rs +++ b/bench-vortex/src/reader.rs @@ -89,7 +89,7 @@ pub async fn rewrite_parquet_as_vortex( Ok(()) } -pub fn read_parquet_to_vortex(parquet_path: &Path) -> VortexResult { +pub fn read_parquet_to_vortex>(parquet_path: P) -> VortexResult { let taxi_pq = File::open(parquet_path)?; let builder = ParquetRecordBatchReaderBuilder::try_new(taxi_pq)?; // FIXME(ngates): #157 the compressor should handle batch size. 
diff --git a/encodings/alp/Cargo.toml b/encodings/alp/Cargo.toml index 8ea9a66e2b..1e1f502e51 100644 --- a/encodings/alp/Cargo.toml +++ b/encodings/alp/Cargo.toml @@ -17,6 +17,7 @@ readme = { workspace = true } workspace = true [dependencies] +vortex-fastlanes = { workspace = true } itertools = { workspace = true } num-traits = { workspace = true } serde = { workspace = true, features = ["derive"] } @@ -28,6 +29,7 @@ vortex-scalar = { workspace = true } [dev-dependencies] arrow = { workspace = true } divan = { workspace = true } +rstest = { workspace = true } [[bench]] name = "alp_compress" diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs index d88728557f..c571de4018 100644 --- a/encodings/alp/benches/alp_compress.rs +++ b/encodings/alp/benches/alp_compress.rs @@ -7,7 +7,7 @@ use vortex::array::PrimitiveArray; use vortex::validity::Validity; use vortex::variants::PrimitiveArrayTrait; use vortex::IntoCanonical; -use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, Exponents}; +use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Exponents, RDEncoder}; use vortex_dtype::NativePType; fn main() { @@ -15,11 +15,20 @@ fn main() { } #[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])] -fn alp_compress(n: usize) -> (Exponents, Vec, Vec, Vec) { +fn compress_alp(n: usize) -> (Exponents, Vec, Vec, Vec) { let values: Vec = vec![T::from(1.234).unwrap(); n]; T::encode(values.as_slice(), None) } +#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])] +fn compress_rd(bencher: Bencher, n: usize) { + let values: Vec = vec![T::from(1.23).unwrap(); n]; + let primitive = PrimitiveArray::from(values); + let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]); + + bencher.bench_local(|| encoder.encode(&primitive)); +} + #[divan::bench(types = [f32, f64], args = [100_000, 1_000_000, 10_000_000])] fn alp_iter(bencher: Bencher, n: usize) where diff --git a/encodings/alp/src/array.rs 
b/encodings/alp/src/alp/array.rs similarity index 99% rename from encodings/alp/src/array.rs rename to encodings/alp/src/alp/array.rs index d14fc41407..4717431557 100644 --- a/encodings/alp/src/array.rs +++ b/encodings/alp/src/alp/array.rs @@ -15,8 +15,7 @@ use vortex::{ use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; -use crate::alp::Exponents; -use crate::compress::{alp_encode, decompress}; +use crate::alp::{alp_encode, decompress, Exponents}; use crate::ALPFloat; impl_encoding!("vortex.alp", ids::ALP, ALP); diff --git a/encodings/alp/src/compress.rs b/encodings/alp/src/alp/compress.rs similarity index 99% rename from encodings/alp/src/compress.rs rename to encodings/alp/src/alp/compress.rs index 12e8388fa9..d0c0aaad96 100644 --- a/encodings/alp/src/compress.rs +++ b/encodings/alp/src/alp/compress.rs @@ -5,8 +5,7 @@ use vortex_dtype::{NativePType, PType}; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use vortex_scalar::ScalarValue; -use crate::alp::ALPFloat; -use crate::array::ALPArray; +use crate::alp::{ALPArray, ALPFloat}; use crate::Exponents; #[macro_export] diff --git a/encodings/alp/src/compute.rs b/encodings/alp/src/alp/compute.rs similarity index 100% rename from encodings/alp/src/compute.rs rename to encodings/alp/src/alp/compute.rs diff --git a/encodings/alp/src/alp.rs b/encodings/alp/src/alp/mod.rs similarity index 97% rename from encodings/alp/src/alp.rs rename to encodings/alp/src/alp/mod.rs index 710b8b25b3..7297fc0497 100644 --- a/encodings/alp/src/alp.rs +++ b/encodings/alp/src/alp/mod.rs @@ -5,6 +5,13 @@ use itertools::Itertools; use num_traits::{CheckedSub, Float, PrimInt, ToPrimitive}; use serde::{Deserialize, Serialize}; +mod array; +mod compress; +mod compute; + +pub use array::*; +pub use compress::*; + const SAMPLE_SIZE: usize = 32; #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -19,7 +26,14 @@ impl Display for Exponents { } } -pub 
trait ALPFloat: Float + Display + 'static { +mod private { + pub trait Sealed {} + + impl Sealed for f32 {} + impl Sealed for f64 {} +} + +pub trait ALPFloat: private::Sealed + Float + Display + 'static { type ALPInt: PrimInt + Display + ToPrimitive; const FRACTIONAL_BITS: u8; diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs new file mode 100644 index 0000000000..d942a104ee --- /dev/null +++ b/encodings/alp/src/alp_rd/array.rs @@ -0,0 +1,270 @@ +use serde::{Deserialize, Serialize}; +use vortex::array::{PrimitiveArray, SparseArray}; +use vortex::encoding::ids; +use vortex::stats::{ArrayStatisticsCompute, StatsSet}; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, Array, ArrayDType, ArrayDef, ArrayTrait, Canonical, IntoCanonical}; +use vortex_dtype::{DType, PType}; +use vortex_error::{vortex_bail, VortexExpect, VortexResult}; + +use crate::alp_rd::alp_rd_decode; + +impl_encoding!("vortex.alprd", ids::ALP_RD, ALPRD); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ALPRDMetadata { + right_bit_width: u8, + dict_len: u8, + dict: [u16; 8], + left_parts_dtype: DType, + has_exceptions: bool, +} + +impl ALPRDArray { + pub fn try_new( + dtype: DType, + left_parts: Array, + left_parts_dict: impl AsRef<[u16]>, + right_parts: Array, + right_bit_width: u8, + left_parts_exceptions: Option, + ) -> VortexResult { + if !dtype.is_float() { + vortex_bail!("ALPRDArray given invalid DType ({dtype})"); + } + + if left_parts.len() != right_parts.len() { + vortex_bail!("left_parts and right_parts must be of same length"); + } + + let len = left_parts.len(); + + if !left_parts.dtype().is_unsigned_int() { + vortex_bail!("left_parts dtype must be uint"); + } + + let left_parts_dtype = left_parts.dtype().clone(); + + if !right_parts.dtype().is_unsigned_int() { + vortex_bail!("right_parts dtype must be uint"); + } + + let mut children = vec![left_parts, 
right_parts]; + let has_exceptions = left_parts_exceptions.is_some(); + + if let Some(exceptions) = left_parts_exceptions { + // Enforce that the exceptions are SparseArray so that we have access to indices and values. + if exceptions.encoding().id().code() != ids::SPARSE { + vortex_bail!("left_parts_exceptions must be SparseArray encoded"); + } + children.push(exceptions); + } + + let mut dict = [0u16; 8]; + for (idx, v) in left_parts_dict.as_ref().iter().enumerate() { + dict[idx] = *v; + } + + Self::try_from_parts( + dtype, + len, + ALPRDMetadata { + right_bit_width, + dict_len: left_parts_dict.as_ref().len() as u8, + dict, + left_parts_dtype, + has_exceptions, + }, + children.into(), + StatsSet::new(), + ) + } + + /// Returns true if logical type of the array values is f32. + /// + /// Returns false if the logical type of the array values is f64. + #[inline] + pub fn is_f32(&self) -> bool { + PType::try_from(self.dtype()).vortex_expect("ALPRDArray must have primitive type") + == PType::F32 + } + + /// The leftmost (most significant) bits of the floating point values stored in the array. + /// + /// These are bit-packed and dictionary encoded, and cannot directly be interpreted without + /// the metadata of this array. + pub fn left_parts(&self) -> Array { + self.as_ref() + .child(0, &self.metadata().left_parts_dtype, self.len()) + .vortex_expect("ALPRDArray: left_parts child") + } + + /// The rightmost (least significant) bits of the floating point values stored in the array. + pub fn right_parts(&self) -> Array { + let uint_ptype = if self.is_f32() { + PType::U32 + } else { + PType::U64 + }; + + self.as_ref() + .child( + 1, + &DType::Primitive(uint_ptype, self.metadata().left_parts_dtype.nullability()), + self.len(), + ) + .vortex_expect("ALPRDArray: right_parts child") + } + + /// Patches of left-most bits. 
+ pub fn left_parts_exceptions(&self) -> Option { + self.metadata().has_exceptions.then(|| { + self.as_ref() + .child( + 2, + &self.metadata().left_parts_dtype.as_nullable(), + self.len(), + ) + .vortex_expect("ALPRDArray: left_parts_exceptions child") + }) + } + + /// The dictionary that maps the codes in `left_parts` into bit patterns. + #[inline] + pub fn left_parts_dict(&self) -> &[u16] { + &self.metadata().dict[0..self.metadata().dict_len as usize] + } + + #[inline] + pub(crate) fn right_bit_width(&self) -> u8 { + self.metadata().right_bit_width + } +} + +impl IntoCanonical for ALPRDArray { + fn into_canonical(self) -> VortexResult { + let left_parts = self.left_parts().into_canonical()?.into_primitive()?; + let right_parts = self.right_parts().into_canonical()?.into_primitive()?; + + // Decode the left_parts using our builtin dictionary. + let left_parts_dict = &self.metadata().dict[0..self.metadata().dict_len as usize]; + + let exc_pos: Vec; + let exc_u16: PrimitiveArray; + + if let Some(left_parts_exceptions) = self.left_parts_exceptions() { + let left_parts_exceptions = SparseArray::try_from(left_parts_exceptions) + .vortex_expect("ALPRDArray: exceptions must be SparseArray encoded"); + exc_pos = left_parts_exceptions + .resolved_indices() + .into_iter() + .map(|v| v as _) + .collect(); + exc_u16 = left_parts_exceptions + .values() + .into_canonical()? 
+ .into_primitive()?; + } else { + exc_pos = Vec::new(); + exc_u16 = PrimitiveArray::from(Vec::::new()); + } + + let decoded_array = if self.is_f32() { + PrimitiveArray::from_vec( + alp_rd_decode::( + left_parts.maybe_null_slice::(), + left_parts_dict, + self.metadata().right_bit_width, + right_parts.maybe_null_slice::(), + &exc_pos, + exc_u16.maybe_null_slice::(), + ), + self.logical_validity().into_validity(), + ) + } else { + PrimitiveArray::from_vec( + alp_rd_decode::( + left_parts.maybe_null_slice::(), + left_parts_dict, + self.metadata().right_bit_width, + right_parts.maybe_null_slice::(), + &exc_pos, + exc_u16.maybe_null_slice::(), + ), + self.logical_validity().into_validity(), + ) + }; + + Ok(Canonical::Primitive(decoded_array)) + } +} + +impl ArrayValidity for ALPRDArray { + fn is_valid(&self, index: usize) -> bool { + // Use validity from left_parts + self.left_parts().with_dyn(|a| a.is_valid(index)) + } + + fn logical_validity(&self) -> LogicalValidity { + self.left_parts().with_dyn(|a| a.logical_validity()) + } +} + +impl AcceptArrayVisitor for ALPRDArray { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_child("left_parts", &self.left_parts())?; + visitor.visit_child("right_parts", &self.right_parts())?; + if let Some(left_parts_exceptions) = self.left_parts_exceptions() { + visitor.visit_child("left_parts_exceptions", &left_parts_exceptions) + } else { + Ok(()) + } + } +} + +impl ArrayStatisticsCompute for ALPRDArray {} + +impl ArrayTrait for ALPRDArray {} + +#[cfg(test)] +mod test { + use rstest::rstest; + use vortex::array::PrimitiveArray; + use vortex::{IntoArray, IntoCanonical}; + + use crate::{alp_rd, ALPRDFloat}; + + #[rstest] + #[case(vec![0.1f32.next_up(); 1024], 1.123_848_f32)] + #[case(vec![0.1f64.next_up(); 1024], 1.123_848_591_110_992_f64)] + fn test_array_encode_with_nulls_and_exceptions( + #[case] reals: Vec, + #[case] seed: T, + ) { + assert_eq!(reals.len(), 1024, "test expects 1024-length 
fixture"); + // Null out some of the values. + let mut reals: Vec> = reals.into_iter().map(Some).collect(); + reals[1] = None; + reals[5] = None; + reals[900] = None; + + // Create a new array from this. + let real_array = PrimitiveArray::from_nullable_vec(reals.clone()); + + // Pick a seed that we know will trigger lots of exceptions. + let encoder: alp_rd::RDEncoder = alp_rd::RDEncoder::new(&[seed.powi(-2)]); + + let rd_array = encoder.encode(&real_array); + + let decoded = rd_array + .into_array() + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(); + + let maybe_null_reals: Vec = reals.into_iter().map(|v| v.unwrap_or_default()).collect(); + assert_eq!(decoded.maybe_null_slice::(), &maybe_null_reals); + } +} diff --git a/encodings/alp/src/alp_rd/compute/filter.rs b/encodings/alp/src/alp_rd/compute/filter.rs new file mode 100644 index 0000000000..d35c796867 --- /dev/null +++ b/encodings/alp/src/alp_rd/compute/filter.rs @@ -0,0 +1,52 @@ +use vortex::compute::{filter, FilterFn}; +use vortex::{Array, ArrayDType, IntoArray}; +use vortex_error::VortexResult; + +use crate::ALPRDArray; + +impl FilterFn for ALPRDArray { + fn filter(&self, predicate: &Array) -> VortexResult { + let left_parts_exceptions = self + .left_parts_exceptions() + .map(|array| filter(&array, predicate)) + .transpose()?; + + Ok(ALPRDArray::try_new( + self.dtype().clone(), + filter(self.left_parts(), predicate)?, + self.left_parts_dict(), + filter(self.right_parts(), predicate)?, + self.right_bit_width(), + left_parts_exceptions, + )? 
+ .into_array()) + } +} + +#[cfg(test)] +mod test { + use rstest::rstest; + use vortex::array::{BoolArray, PrimitiveArray}; + use vortex::compute::filter; + use vortex::IntoArrayVariant; + + use crate::{ALPRDFloat, RDEncoder}; + + #[rstest] + #[case(0.1f32, 0.2f32, 3e25f32)] + #[case(0.1f64, 0.2f64, 3e100f64)] + fn test_filter(#[case] a: T, #[case] b: T, #[case] outlier: T) { + let array = PrimitiveArray::from(vec![a, b, outlier]); + let encoded = RDEncoder::new(&[a, b]).encode(&array); + + // Make sure that we're testing the exception pathway. + assert!(encoded.left_parts_exceptions().is_some()); + + // The first two values need no patching + let filtered = filter(encoded.as_ref(), BoolArray::from(vec![true, false, true])) + .unwrap() + .into_primitive() + .unwrap(); + assert_eq!(filtered.maybe_null_slice::(), &[a, outlier]); + } +} diff --git a/encodings/alp/src/alp_rd/compute/mod.rs b/encodings/alp/src/alp_rd/compute/mod.rs new file mode 100644 index 0000000000..5420b362fd --- /dev/null +++ b/encodings/alp/src/alp_rd/compute/mod.rs @@ -0,0 +1,27 @@ +use vortex::compute::unary::ScalarAtFn; +use vortex::compute::{ArrayCompute, FilterFn, SliceFn, TakeFn}; + +use crate::ALPRDArray; + +mod filter; +mod scalar_at; +mod slice; +mod take; + +impl ArrayCompute for ALPRDArray { + fn filter(&self) -> Option<&dyn FilterFn> { + Some(self) + } + + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } + + fn slice(&self) -> Option<&dyn SliceFn> { + Some(self) + } + + fn take(&self) -> Option<&dyn TakeFn> { + Some(self) + } +} diff --git a/encodings/alp/src/alp_rd/compute/scalar_at.rs b/encodings/alp/src/alp_rd/compute/scalar_at.rs new file mode 100644 index 0000000000..65a4bc9433 --- /dev/null +++ b/encodings/alp/src/alp_rd/compute/scalar_at.rs @@ -0,0 +1,68 @@ +use vortex::compute::unary::{scalar_at, ScalarAtFn}; +use vortex_error::{VortexResult, VortexUnwrap}; +use vortex_scalar::Scalar; + +use crate::alp_rd::array::ALPRDArray; + +impl ScalarAtFn for ALPRDArray { 
+ fn scalar_at(&self, index: usize) -> VortexResult { + // The left value can either be a direct value, or an exception. + // The exceptions array represents exception positions with non-null values. + let left: u16 = match self.left_parts_exceptions() { + Some(exceptions) if exceptions.with_dyn(|a| a.is_valid(index)) => { + scalar_at(&exceptions, index)?.try_into()? + } + _ => { + let left_code: u16 = scalar_at(&self.left_parts(), index)?.try_into()?; + self.left_parts_dict()[left_code as usize] + } + }; + + // combine left and right values + if self.is_f32() { + let right: u32 = scalar_at(&self.right_parts(), index)?.try_into()?; + let packed = f32::from_bits((left as u32) << self.right_bit_width() | right); + Ok(packed.into()) + } else { + let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?; + let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right); + Ok(packed.into()) + } + } + + fn scalar_at_unchecked(&self, index: usize) -> Scalar { + self.scalar_at(index).vortex_unwrap() + } +} + +#[cfg(test)] +mod test { + use rstest::rstest; + use vortex::array::PrimitiveArray; + use vortex::compute::unary::scalar_at; + use vortex_scalar::Scalar; + + use crate::{ALPRDFloat, RDEncoder}; + + #[rstest] + #[case(0.1f32, 0.2f32, 3e25f32)] + #[case(0.1f64, 0.2f64, 3e100f64)] + fn test_scalar_at>( + #[case] a: T, + #[case] b: T, + #[case] outlier: T, + ) { + let array = PrimitiveArray::from(vec![a, b, outlier]); + let encoded = RDEncoder::new(&[a, b]).encode(&array); + + // Make sure that we're testing the exception pathway. 
+ assert!(encoded.left_parts_exceptions().is_some()); + + // The first two values need no patching + assert_eq!(scalar_at(encoded.as_ref(), 0).unwrap(), a.into()); + assert_eq!(scalar_at(encoded.as_ref(), 1).unwrap(), b.into()); + + // The right value hits the left_part_exceptions + assert_eq!(scalar_at(encoded.as_ref(), 2).unwrap(), outlier.into()); + } +} diff --git a/encodings/alp/src/alp_rd/compute/slice.rs b/encodings/alp/src/alp_rd/compute/slice.rs new file mode 100644 index 0000000000..827e30528e --- /dev/null +++ b/encodings/alp/src/alp_rd/compute/slice.rs @@ -0,0 +1,51 @@ +use vortex::compute::{slice, SliceFn}; +use vortex::{Array, ArrayDType, IntoArray}; +use vortex_error::VortexResult; + +use crate::ALPRDArray; + +impl SliceFn for ALPRDArray { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + let left_parts_exceptions = self + .left_parts_exceptions() + .map(|array| slice(&array, start, stop)) + .transpose()?; + + Ok(ALPRDArray::try_new( + self.dtype().clone(), + slice(self.left_parts(), start, stop)?, + self.left_parts_dict(), + slice(self.right_parts(), start, stop)?, + self.right_bit_width(), + left_parts_exceptions, + )? 
+ .into_array()) + } +} + +#[cfg(test)] +mod test { + use rstest::rstest; + use vortex::array::PrimitiveArray; + use vortex::compute::slice; + use vortex::IntoArrayVariant; + + use crate::{ALPRDFloat, RDEncoder}; + + #[rstest] + #[case(0.1f32, 0.2f32, 3e25f32)] + #[case(0.1f64, 0.2f64, 3e100f64)] + fn test_slice(#[case] a: T, #[case] b: T, #[case] outlier: T) { + let array = PrimitiveArray::from(vec![a, b, outlier]); + let encoded = RDEncoder::new(&[a, b]).encode(&array); + + assert!(encoded.left_parts_exceptions().is_some()); + + let decoded = slice(encoded.as_ref(), 1, 3) + .unwrap() + .into_primitive() + .unwrap(); + + assert_eq!(decoded.maybe_null_slice::(), &[b, outlier]); + } +} diff --git a/encodings/alp/src/alp_rd/compute/take.rs b/encodings/alp/src/alp_rd/compute/take.rs new file mode 100644 index 0000000000..28ce8a6441 --- /dev/null +++ b/encodings/alp/src/alp_rd/compute/take.rs @@ -0,0 +1,51 @@ +use vortex::compute::{take, TakeFn}; +use vortex::{Array, ArrayDType, IntoArray}; +use vortex_error::VortexResult; + +use crate::ALPRDArray; + +impl TakeFn for ALPRDArray { + fn take(&self, indices: &Array) -> VortexResult { + let left_parts_exceptions = self + .left_parts_exceptions() + .map(|array| take(&array, indices)) + .transpose()?; + + Ok(ALPRDArray::try_new( + self.dtype().clone(), + take(self.left_parts(), indices)?, + self.left_parts_dict(), + take(self.right_parts(), indices)?, + self.right_bit_width(), + left_parts_exceptions, + )? 
+ .into_array()) + } +} + +#[cfg(test)] +mod test { + use rstest::rstest; + use vortex::array::PrimitiveArray; + use vortex::compute::take; + use vortex::IntoArrayVariant; + + use crate::{ALPRDFloat, RDEncoder}; + + #[rstest] + #[case(0.1f32, 0.2f32, 3e25f32)] + #[case(0.1f64, 0.2f64, 3e100f64)] + fn test_take(#[case] a: T, #[case] b: T, #[case] outlier: T) { + let array = PrimitiveArray::from(vec![a, b, outlier]); + let encoded = RDEncoder::new(&[a, b]).encode(&array); + + assert!(encoded.left_parts_exceptions().is_some()); + + let taken = take(encoded.as_ref(), PrimitiveArray::from(vec![0, 2]).as_ref()) + .unwrap() + .into_primitive() + .unwrap(); + + assert_eq!(taken.maybe_null_slice::(), &[a, outlier]); + } +} diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs new file mode 100644 index 0000000000..2c22c00929 --- /dev/null +++ b/encodings/alp/src/alp_rd/mod.rs @@ -0,0 +1,404 @@ +pub use array::*; + +mod array; +mod compute; +mod variants; + +use std::collections::HashMap; +use std::ops::{Shl, Shr}; + +use itertools::Itertools; +use num_traits::{Float, One, PrimInt}; +use vortex::array::{PrimitiveArray, SparseArray}; +use vortex::{ArrayDType, IntoArray}; +use vortex_dtype::{DType, NativePType}; +use vortex_error::{VortexExpect, VortexUnwrap}; +use vortex_fastlanes::bitpack_encode_unchecked; +use vortex_scalar::ScalarValue; + +use crate::match_each_alp_float_ptype; + +macro_rules! bit_width { + ($value:expr) => { + if $value == 0 { + 1 + } else { + $value.ilog2().wrapping_add(1) as usize + } + }; +} + +/// Max number of bits to cut from the MSB section of each float. +const CUT_LIMIT: usize = 16; + +const MAX_DICT_SIZE: u8 = 8; + +mod private { + pub trait Sealed {} + + impl Sealed for f32 {} + impl Sealed for f64 {} +} + +/// Main trait for ALP-RD encodable floating point numbers. +/// +/// Like the paper, we limit this to the IEEE7 754 single-precision (`f32`) and double-precision +/// (`f64`) floating point types. 
+pub trait ALPRDFloat: private::Sealed + Float + Copy + NativePType { + /// The unsigned integer type with the same bit-width as the floating-point type. + type UINT: NativePType + PrimInt + One + Copy; + + /// Number of bits the value occupies in registers. + const BITS: usize = size_of::() * 8; + + /// Bit-wise transmute from the unsigned integer type to the floating-point type. + fn from_bits(bits: Self::UINT) -> Self; + + /// Bit-wise transmute into the unsigned integer type. + fn to_bits(value: Self) -> Self::UINT; + + /// Truncating conversion from the unsigned integer type to `u16`. + fn to_u16(bits: Self::UINT) -> u16; + + /// Type-widening conversion from `u16` to the unsigned integer type. + fn from_u16(value: u16) -> Self::UINT; +} + +impl ALPRDFloat for f64 { + type UINT = u64; + + fn from_bits(bits: Self::UINT) -> Self { + f64::from_bits(bits) + } + + fn to_bits(value: Self) -> Self::UINT { + value.to_bits() + } + + fn to_u16(bits: Self::UINT) -> u16 { + bits as u16 + } + + fn from_u16(value: u16) -> Self::UINT { + value as u64 + } +} + +impl ALPRDFloat for f32 { + type UINT = u32; + + fn from_bits(bits: Self::UINT) -> Self { + f32::from_bits(bits) + } + + fn to_bits(value: Self) -> Self::UINT { + value.to_bits() + } + + fn to_u16(bits: Self::UINT) -> u16 { + bits as u16 + } + + fn from_u16(value: u16) -> Self::UINT { + value as u32 + } +} + +/// Encoder for ALP-RD ("real doubles") values. +/// +/// The encoder calculates its parameters from a single sample of floating-point values, +/// and then can be applied to many vectors. +/// +/// ALP-RD uses the algorithm outlined in Section 3.4 of the paper. The crux of it is that the front +/// (most significant) bits of many double vectors tend to be the same, i.e. most doubles in a +/// vector often use the same exponent and front bits. Compression proceeds by finding the best +/// prefix of up to 16 bits that can be collapsed into a dictionary of +/// up to 8 elements. 
Each double can then be broken into the front/left `L` bits, which neatly +/// bit-packs down to 1-3 bits per element (depending on the actual dictionary size). +/// The remaining `R` bits naturally bit-pack. +/// +/// In the ideal case, this scheme allows us to store a sequence of doubles in 49 bits-per-value. +/// +/// Our implementation draws on the MIT-licensed [C++ implementation] provided by the original authors. +/// +/// [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp +pub struct RDEncoder { + right_bit_width: u8, + codes: Vec, +} + +impl RDEncoder { + /// Build a new encoder from a sample of doubles. + pub fn new(sample: &[T]) -> Self + where + T: ALPRDFloat + NativePType, + T::UINT: NativePType, + { + let dictionary = find_best_dictionary::(sample); + + let mut codes = vec![0; dictionary.dictionary.len()]; + dictionary.dictionary.into_iter().for_each(|(bits, code)| { + // write the reverse mapping into the codes vector. + codes[code as usize] = bits + }); + + Self { + right_bit_width: dictionary.right_bit_width, + codes, + } + } + + /// Encode a set of floating point values with ALP-RD. + /// + /// Each value will be split into a left and right component, which are compressed individually. 
+ pub fn encode(&self, array: &PrimitiveArray) -> ALPRDArray { + match_each_alp_float_ptype!(array.ptype(), |$P| { + self.encode_generic::<$P>(array) + }) + } + + fn encode_generic(&self, array: &PrimitiveArray) -> ALPRDArray + where + T: ALPRDFloat + NativePType, + T::UINT: NativePType, + { + assert!( + !self.codes.is_empty(), + "codes lookup table must be populated before RD encoding" + ); + + let doubles = array.maybe_null_slice::(); + + let mut left_parts: Vec = Vec::with_capacity(doubles.len()); + let mut right_parts: Vec = Vec::with_capacity(doubles.len()); + let mut exceptions_pos: Vec = Vec::with_capacity(doubles.len() / 4); + let mut exceptions: Vec = Vec::with_capacity(doubles.len() / 4); + + // mask for right-parts + let right_mask = T::UINT::one().shl(self.right_bit_width as _) - T::UINT::one(); + let max_code = self.codes.len() - 1; + let left_bit_width = bit_width!(max_code); + + for v in doubles.iter().copied() { + right_parts.push(T::to_bits(v) & right_mask); + left_parts.push(::to_u16( + T::to_bits(v).shr(self.right_bit_width as _), + )); + } + + // dict-encode the left-parts, keeping track of exceptions + for (idx, left) in left_parts.iter_mut().enumerate() { + // TODO: revisit if we need to change the branch order for perf. + if let Some(code) = self.codes.iter().position(|v| *v == *left) { + *left = code as u16; + } else { + exceptions.push(*left); + exceptions_pos.push(idx as _); + + *left = 0u16; + } + } + + // Bit-pack down the encoded left-parts array that have been dictionary encoded. + let primitive_left = PrimitiveArray::from_vec(left_parts, array.validity()); + // SAFETY: by construction, all values in left_parts can be packed to left_bit_width. 
+ let packed_left = unsafe { + bitpack_encode_unchecked(primitive_left, left_bit_width as _) + .vortex_unwrap() + .into_array() + }; + + let primitive_right = PrimitiveArray::from_vec(right_parts, array.validity()); + // SAFETY: by construction, all values in right_parts are right_bit_width + leading zeros. + let packed_right = unsafe { + bitpack_encode_unchecked(primitive_right, self.right_bit_width as _) + .vortex_unwrap() + .into_array() + }; + + // Bit-pack the dict-encoded left-parts + // Bit-pack the right-parts + // SparseArray for exceptions. + let exceptions = (!exceptions_pos.is_empty()).then(|| { + let max_exc_pos = exceptions_pos.last().copied().unwrap_or_default(); + let bw = bit_width!(max_exc_pos); + + let exc_pos_array = PrimitiveArray::from(exceptions_pos); + // SAFETY: We calculate bw such that it is wide enough to hold the largest position index. + let packed_pos = unsafe { + bitpack_encode_unchecked(exc_pos_array, bw) + .vortex_unwrap() + .into_array() + }; + + let exc_array = + PrimitiveArray::from_nullable_vec(exceptions.into_iter().map(Some).collect()) + .into_array(); + SparseArray::try_new(packed_pos, exc_array, doubles.len(), ScalarValue::Null) + .vortex_expect("ALP-RD: construction of exceptions SparseArray") + .into_array() + }); + + ALPRDArray::try_new( + DType::Primitive(T::PTYPE, packed_left.dtype().nullability()), + packed_left, + &self.codes, + packed_right, + self.right_bit_width, + exceptions, + ) + .vortex_expect("ALPRDArray construction in encode") + } +} + +/// Decode a vector of ALP-RD encoded values back into their original floating point format. +/// +/// # Panics +/// +/// The function panics if the provided `left_parts` and `right_parts` differ in length. +/// +/// The function panics if the provided `exc_pos` and `exceptions` differ in length. 
+pub fn alp_rd_decode( + left_parts: &[u16], + left_parts_dict: &[u16], + right_bit_width: u8, + right_parts: &[T::UINT], + exc_pos: &[u64], + exceptions: &[u16], +) -> Vec { + assert_eq!( + left_parts.len(), + right_parts.len(), + "alp_rd_decode: left_parts.len != right_parts.len" + ); + + assert_eq!( + exc_pos.len(), + exceptions.len(), + "alp_rd_decode: exc_pos.len != exceptions.len" + ); + + let mut dict = Vec::with_capacity(left_parts_dict.len()); + dict.extend_from_slice(left_parts_dict); + + let mut left_parts_decoded: Vec = Vec::with_capacity(left_parts.len()); + + // Decode with bit-packing and dict unpacking. + for code in left_parts { + left_parts_decoded.push(::from_u16(dict[*code as usize])); + } + + // Apply the exception patches to left_parts + for (pos, val) in exc_pos.iter().zip(exceptions.iter()) { + left_parts_decoded[*pos as usize] = ::from_u16(*val); + } + + // recombine the left-and-right parts, adjusting by the right_bit_width. + left_parts_decoded + .into_iter() + .zip(right_parts.iter().copied()) + .map(|(left, right)| T::from_bits((left << (right_bit_width as usize)) | right)) + .collect() +} + +/// Find the best "cut point" for a set of floating point values such that we can +/// split each value into a dictionary-encodable left part and a bit-packed right part. +fn find_best_dictionary(samples: &[T]) -> ALPRDDictionary { + let mut best_est_size = f64::MAX; + let mut best_dict = ALPRDDictionary::default(); + + for p in 1..=16 { + let candidate_right_bw = (T::BITS - p) as u8; + let (dictionary, exception_count) = + build_left_parts_dictionary::(samples, candidate_right_bw, MAX_DICT_SIZE); + let estimated_size = estimate_compression_size( + dictionary.right_bit_width, + dictionary.left_bit_width, + exception_count, + samples.len(), + ); + if estimated_size < best_est_size { + best_est_size = estimated_size; + best_dict = dictionary; + } + } + + best_dict +} + +/// Build dictionary of the leftmost bits.
+fn build_left_parts_dictionary( + samples: &[T], + right_bw: u8, + max_dict_size: u8, +) -> (ALPRDDictionary, usize) { + assert!( + right_bw >= (T::BITS - CUT_LIMIT) as _, + "left-parts must be <= 16 bits" + ); + + // Count the number of occurrences of each left bit pattern + let counts = samples + .iter() + .copied() + .map(|v| ::to_u16(T::to_bits(v).shr(right_bw as _))) + .counts(); + + // Sorted counts: sort by negative count so that heavy hitters sort first. + let mut sorted_bit_counts: Vec<(u16, usize)> = counts.into_iter().collect_vec(); + sorted_bit_counts.sort_by_key(|(_, count)| count.wrapping_neg()); + + // Assign the most-frequently occurring left-bits as dictionary codes, up to `max_dict_size`... + let mut dictionary = HashMap::with_capacity(max_dict_size as _); + let mut code = 0u16; + while code < (max_dict_size as _) && (code as usize) < sorted_bit_counts.len() { + let (bits, _) = sorted_bit_counts[code as usize]; + dictionary.insert(bits, code); + code += 1; + } + + // ...and the rest are exceptions. + let exception_count: usize = sorted_bit_counts + .iter() + .skip(code as _) + .map(|(_, count)| *count) + .sum(); + + // Left bit-width is determined based on the actual dictionary size. + let max_code = dictionary.len() - 1; + let left_bw = bit_width!(max_code) as u8; + + ( + ALPRDDictionary { + dictionary, + right_bit_width: right_bw, + left_bit_width: left_bw, + }, + exception_count, + ) +} + +/// Estimate the bits-per-value when using these compression settings. +fn estimate_compression_size( + right_bw: u8, + left_bw: u8, + exception_count: usize, + sample_n: usize, +) -> f64 { + const EXC_POSITION_SIZE: usize = 16; // two bytes for exception position. + const EXC_SIZE: usize = 16; // two bytes for each exception (up to 16 front bits).
+ + let exceptions_size = exception_count * (EXC_POSITION_SIZE + EXC_SIZE); + (right_bw as f64) + (left_bw as f64) + ((exceptions_size as f64) / (sample_n as f64)) +} + +/// The ALP-RD dictionary, encoding the "left parts" and their dictionary encoding. +#[derive(Debug, Default)] +struct ALPRDDictionary { + /// Items in the dictionary are bit patterns, along with their 16-bit encoding. + dictionary: HashMap, + /// The (compressed) left bit width. This is after bit-packing the dictionary codes. + left_bit_width: u8, + /// The right bit width. This is the bit-packed width of each of the "real double" values. + right_bit_width: u8, +} diff --git a/encodings/alp/src/alp_rd/variants.rs b/encodings/alp/src/alp_rd/variants.rs new file mode 100644 index 0000000000..127eb8ded4 --- /dev/null +++ b/encodings/alp/src/alp_rd/variants.rs @@ -0,0 +1,15 @@ +use vortex::variants::{ArrayVariants, PrimitiveArrayTrait}; + +use crate::ALPRDArray; + +impl ArrayVariants for ALPRDArray { + fn as_primitive_array(&self) -> Option<&dyn PrimitiveArrayTrait> { + Some(self) + } + + fn as_primitive_array_unchecked(&self) -> &dyn PrimitiveArrayTrait { + self + } +} + +impl PrimitiveArrayTrait for ALPRDArray {} diff --git a/encodings/alp/src/lib.rs b/encodings/alp/src/lib.rs index fde87a65f2..d47da5bdea 100644 --- a/encodings/alp/src/lib.rs +++ b/encodings/alp/src/lib.rs @@ -1,8 +1,22 @@ +#![feature(float_next_up_down)] + +//! This crate contains an implementation of the floating point compression algorithm from the +//! paper ["ALP: Adaptive Lossless floating-Point Compression"][paper] by Afroozeh et al. +//! +//! The compressor has two variants, classic ALP which is well-suited for data that does not use +//! the full precision, and "real doubles", values that do. +//! +//! Classic ALP will return small integers, and it is meant to be cascaded with other integer +//! compression techniques such as bit-packing and frame-of-reference encoding. Combined this allows +//! 
for significant compression on the order of what you can get for integer values. +//! +//! ALP-RD is generally terminal, and in the ideal case it can represent an f64 in just 49 bits, +//! though generally it is closer to 54 bits per value or ~12.5% compression. +//! +//! [paper]: https://ir.cwi.nl/pub/33334/33334.pdf + pub use alp::*; -pub use array::*; -pub use compress::*; +pub use alp_rd::*; mod alp; -mod array; -mod compress; -mod compute; +mod alp_rd; diff --git a/encodings/fastlanes/src/bitpacking/compress.rs b/encodings/fastlanes/src/bitpacking/compress.rs index 3d10335bce..783fe52b71 100644 --- a/encodings/fastlanes/src/bitpacking/compress.rs +++ b/encodings/fastlanes/src/bitpacking/compress.rs @@ -40,6 +40,30 @@ pub fn bitpack_encode(array: PrimitiveArray, bit_width: usize) -> VortexResult VortexResult { + let packed = bitpack(&array, bit_width)?; + + BitPackedArray::try_new( + packed, + array.ptype(), + array.validity(), + None, + bit_width, + array.len(), + ) +} + /// Bitpack a [PrimitiveArray] to the given width. /// /// On success, returns a [Buffer] containing the packed data.
diff --git a/vortex-array/src/encoding.rs b/vortex-array/src/encoding.rs index 77137b4da2..a0991d085f 100644 --- a/vortex-array/src/encoding.rs +++ b/vortex-array/src/encoding.rs @@ -138,6 +138,7 @@ pub mod ids { pub const RUN_END: u16 = 27; pub const RUN_END_BOOL: u16 = 28; pub const ZIGZAG: u16 = 29; + pub const ALP_RD: u16 = 30; } #[cfg(test)] diff --git a/vortex-sampling-compressor/src/compressors/alp_rd.rs b/vortex-sampling-compressor/src/compressors/alp_rd.rs new file mode 100644 index 0000000000..e877d4067a --- /dev/null +++ b/vortex-sampling-compressor/src/compressors/alp_rd.rs @@ -0,0 +1,78 @@ +use std::any::Any; +use std::collections::HashSet; +use std::sync::Arc; + +use vortex::array::PrimitiveArray; +use vortex::encoding::EncodingRef; +use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant}; +use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, RDEncoder as ALPRDEncoder, ALPRD}; +use vortex_dtype::PType; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_fastlanes::BitPackedEncoding; + +use crate::compressors::{CompressedArray, CompressionTree, EncoderMetadata, EncodingCompressor}; +use crate::SamplingCompressor; + +#[derive(Debug)] +pub struct ALPRDCompressor; + +impl EncoderMetadata for ALPRDEncoder { + fn as_any(&self) -> &dyn Any { + self + } +} + +impl EncodingCompressor for ALPRDCompressor { + fn id(&self) -> &str { + ALPRD::ID.as_ref() + } + + fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> { + // Only support primitive arrays + let parray = PrimitiveArray::try_from(array).ok()?; + + // Only supports f32 and f64 + if !matches!(parray.ptype(), PType::F32 | PType::F64) { + return None; + } + + Some(self) + } + + fn compress<'a>( + &'a self, + array: &Array, + like: Option>, + _ctx: SamplingCompressor<'a>, + ) -> VortexResult> { + let primitive = array.clone().into_primitive()?; + + // Train a new compressor or reuse an existing compressor. 
+ let encoder = like + .clone() + .and_then(|mut tree| tree.metadata()) + .map(VortexResult::Ok) + .unwrap_or_else(|| Ok(Arc::new(alp_rd_new_encoder(&primitive))))?; + + let Some(alp_rd_encoder) = encoder.as_any().downcast_ref::() else { + vortex_bail!("Could not downcast metadata as ALPRDEncoder"); + }; + + let encoded = alp_rd_encoder.encode(&primitive).into_array(); + Ok(CompressedArray::new( + encoded, + Some(CompressionTree::new_with_metadata(self, vec![], encoder)), + )) + } + + fn used_encodings(&self) -> HashSet { + HashSet::from([&ALPRDEncoding as EncodingRef, &BitPackedEncoding]) + } +} + +/// Create a new `ALPRDEncoder` from the given array of samples. +fn alp_rd_new_encoder(array: &PrimitiveArray) -> ALPRDEncoder { + match_each_alp_float_ptype!(array.ptype(), |$P| { + ALPRDEncoder::new(array.maybe_null_slice::<$P>()) + }) +} diff --git a/vortex-sampling-compressor/src/compressors/mod.rs b/vortex-sampling-compressor/src/compressors/mod.rs index f8cac1063a..05d6288721 100644 --- a/vortex-sampling-compressor/src/compressors/mod.rs +++ b/vortex-sampling-compressor/src/compressors/mod.rs @@ -11,6 +11,7 @@ use vortex_error::VortexResult; use crate::SamplingCompressor; pub mod alp; +pub mod alp_rd; pub mod bitpacked; pub mod constant; pub mod date_time_parts; diff --git a/vortex-sampling-compressor/src/lib.rs b/vortex-sampling-compressor/src/lib.rs index f6a7790eb7..bef4e74849 100644 --- a/vortex-sampling-compressor/src/lib.rs +++ b/vortex-sampling-compressor/src/lib.rs @@ -16,6 +16,7 @@ use vortex::{Array, ArrayDType, ArrayDef, IntoArray, IntoCanonical}; use vortex_error::VortexResult; use crate::compressors::alp::ALPCompressor; +use crate::compressors::alp_rd::ALPRDCompressor; use crate::compressors::bitpacked::BitPackedCompressor; use crate::compressors::constant::ConstantCompressor; use crate::compressors::date_time_parts::DateTimePartsCompressor; @@ -35,8 +36,9 @@ pub mod compressors; mod sampling; lazy_static! 
{ - pub static ref ALL_COMPRESSORS: [CompressorRef<'static>; 11] = [ + pub static ref ALL_COMPRESSORS: [CompressorRef<'static>; 12] = [ &ALPCompressor as CompressorRef, + &ALPRDCompressor, &BitPackedCompressor, &DateTimePartsCompressor, &DEFAULT_RUN_END_COMPRESSOR, diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs index 6477810446..f3c112148b 100644 --- a/vortex-sampling-compressor/tests/smoketest.rs +++ b/vortex-sampling-compressor/tests/smoketest.rs @@ -29,6 +29,7 @@ mod tests { use vortex_datetime_parts::DateTimeParts; use vortex_dict::Dict; use vortex_fastlanes::FoR; + use vortex_sampling_compressor::compressors::alp_rd::ALPRDCompressor; use vortex_sampling_compressor::compressors::fsst::FSSTCompressor; use super::*; @@ -39,6 +40,7 @@ mod tests { let compressor = SamplingCompressor::new_with_options( HashSet::from([ &ALPCompressor as CompressorRef, + &ALPRDCompressor as CompressorRef, &BitPackedCompressor, // TODO(robert): Implement minimal compute for DeltaArrays - scalar_at and slice // &DeltaCompressor,