From daccef78b52639bdb3bada3560d1e0d843bdcea7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 2 Oct 2024 11:57:20 -0400 Subject: [PATCH] docs and some other cleanup --- encodings/alp/benches/alp_compress.rs | 4 +- encodings/alp/src/alp_rd/array.rs | 5 +- encodings/alp/src/alp_rd/compute/filter.rs | 4 +- encodings/alp/src/alp_rd/compute/scalar_at.rs | 16 ++--- encodings/alp/src/alp_rd/compute/slice.rs | 4 +- encodings/alp/src/alp_rd/compute/take.rs | 4 +- encodings/alp/src/alp_rd/mod.rs | 64 +++++++++++-------- .../src/compressors/alp_rd.rs | 2 +- 8 files changed, 57 insertions(+), 46 deletions(-) diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs index 0010bde34d..c571de4018 100644 --- a/encodings/alp/benches/alp_compress.rs +++ b/encodings/alp/benches/alp_compress.rs @@ -7,7 +7,7 @@ use vortex::array::PrimitiveArray; use vortex::validity::Validity; use vortex::variants::PrimitiveArrayTrait; use vortex::IntoCanonical; -use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Encoder, Exponents}; +use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Exponents, RDEncoder}; use vortex_dtype::NativePType; fn main() { @@ -24,7 +24,7 @@ fn compress_alp(n: usize) -> (Exponents, Vec, Vec, fn compress_rd(bencher: Bencher, n: usize) { let values: Vec = vec![T::from(1.23).unwrap(); n]; let primitive = PrimitiveArray::from(values); - let encoder = Encoder::new(&[T::from(1.23).unwrap()]); + let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]); bencher.bench_local(|| encoder.encode(&primitive)); } diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs index 3e263b861d..d942a104ee 100644 --- a/encodings/alp/src/alp_rd/array.rs +++ b/encodings/alp/src/alp_rd/array.rs @@ -15,7 +15,6 @@ impl_encoding!("vortex.alprd", ids::ALP_RD, ALPRD); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ALPRDMetadata { right_bit_width: u8, - // left_bit_width is implicit from the 
dict_len. dict_len: u8, dict: [u16; 8], left_parts_dtype: DType, @@ -103,7 +102,7 @@ impl ALPRDArray { /// The rightmost (least significant) bits of the floating point values stored in the array. pub fn right_parts(&self) -> Array { - let uint_ptype = if self.metadata().is_f32 { + let uint_ptype = if self.is_f32() { PType::U32 } else { PType::U64 @@ -254,7 +253,7 @@ mod test { let real_array = PrimitiveArray::from_nullable_vec(reals.clone()); // Pick a seed that we know will trigger lots of exceptions. - let encoder: alp_rd::Encoder = alp_rd::Encoder::new(&[seed.powi(-2)]); + let encoder: alp_rd::RDEncoder = alp_rd::RDEncoder::new(&[seed.powi(-2)]); let rd_array = encoder.encode(&real_array); diff --git a/encodings/alp/src/alp_rd/compute/filter.rs b/encodings/alp/src/alp_rd/compute/filter.rs index a20972aead..d35c796867 100644 --- a/encodings/alp/src/alp_rd/compute/filter.rs +++ b/encodings/alp/src/alp_rd/compute/filter.rs @@ -30,14 +30,14 @@ mod test { use vortex::compute::filter; use vortex::IntoArrayVariant; - use crate::{ALPRDFloat, Encoder}; + use crate::{ALPRDFloat, RDEncoder}; #[rstest] #[case(0.1f32, 0.2f32, 3e25f32)] #[case(0.1f64, 0.2f64, 3e100f64)] fn test_filter(#[case] a: T, #[case] b: T, #[case] outlier: T) { let array = PrimitiveArray::from(vec![a, b, outlier]); - let encoded = Encoder::new(&[a, b]).encode(&array); + let encoded = RDEncoder::new(&[a, b]).encode(&array); // Make sure that we're testing the exception pathway. 
assert!(encoded.left_parts_exceptions().is_some()); diff --git a/encodings/alp/src/alp_rd/compute/scalar_at.rs b/encodings/alp/src/alp_rd/compute/scalar_at.rs index 3af49310df..65a4bc9433 100644 --- a/encodings/alp/src/alp_rd/compute/scalar_at.rs +++ b/encodings/alp/src/alp_rd/compute/scalar_at.rs @@ -1,6 +1,4 @@ use vortex::compute::unary::{scalar_at, ScalarAtFn}; -use vortex::ArrayDType; -use vortex_dtype::PType; use vortex_error::{VortexResult, VortexUnwrap}; use vortex_scalar::Scalar; @@ -21,14 +19,14 @@ impl ScalarAtFn for ALPRDArray { }; // combine left and right values - if self.ptype() == Some(PType::F64) { - let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?; - let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right); - Ok(packed.into()) - } else { + if self.is_f32() { let right: u32 = scalar_at(&self.right_parts(), index)?.try_into()?; let packed = f32::from_bits((left as u32) << self.right_bit_width() | right); Ok(packed.into()) + } else { + let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?; + let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right); + Ok(packed.into()) } } @@ -44,7 +42,7 @@ mod test { use vortex::compute::unary::scalar_at; use vortex_scalar::Scalar; - use crate::{ALPRDFloat, Encoder}; + use crate::{ALPRDFloat, RDEncoder}; #[rstest] #[case(0.1f32, 0.2f32, 3e25f32)] @@ -55,7 +53,7 @@ mod test { #[case] outlier: T, ) { let array = PrimitiveArray::from(vec![a, b, outlier]); - let encoded = Encoder::new(&[a, b]).encode(&array); + let encoded = RDEncoder::new(&[a, b]).encode(&array); // Make sure that we're testing the exception pathway. 
assert!(encoded.left_parts_exceptions().is_some()); diff --git a/encodings/alp/src/alp_rd/compute/slice.rs b/encodings/alp/src/alp_rd/compute/slice.rs index 339fdd9e6c..827e30528e 100644 --- a/encodings/alp/src/alp_rd/compute/slice.rs +++ b/encodings/alp/src/alp_rd/compute/slice.rs @@ -30,14 +30,14 @@ mod test { use vortex::compute::slice; use vortex::IntoArrayVariant; - use crate::{ALPRDFloat, Encoder}; + use crate::{ALPRDFloat, RDEncoder}; #[rstest] #[case(0.1f32, 0.2f32, 3e25f32)] #[case(0.1f64, 0.2f64, 3e100f64)] fn test_slice(#[case] a: T, #[case] b: T, #[case] outlier: T) { let array = PrimitiveArray::from(vec![a, b, outlier]); - let encoded = Encoder::new(&[a, b]).encode(&array); + let encoded = RDEncoder::new(&[a, b]).encode(&array); assert!(encoded.left_parts_exceptions().is_some()); diff --git a/encodings/alp/src/alp_rd/compute/take.rs b/encodings/alp/src/alp_rd/compute/take.rs index 54b2754dc1..28ce8a6441 100644 --- a/encodings/alp/src/alp_rd/compute/take.rs +++ b/encodings/alp/src/alp_rd/compute/take.rs @@ -30,14 +30,14 @@ mod test { use vortex::compute::take; use vortex::IntoArrayVariant; - use crate::{ALPRDFloat, Encoder}; + use crate::{ALPRDFloat, RDEncoder}; #[rstest] #[case(0.1f32, 0.2f32, 3e25f32)] #[case(0.1f64, 0.2f64, 3e100f64)] fn test_take(#[case] a: T, #[case] b: T, #[case] outlier: T) { let array = PrimitiveArray::from(vec![a, b, outlier]); - let encoded = Encoder::new(&[a, b]).encode(&array); + let encoded = RDEncoder::new(&[a, b]).encode(&array); assert!(encoded.left_parts_exceptions().is_some()); diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs index d1bf4dbff8..2c22c00929 100644 --- a/encodings/alp/src/alp_rd/mod.rs +++ b/encodings/alp/src/alp_rd/mod.rs @@ -1,21 +1,3 @@ -//! Encoding for "real doubles", i.e. doubles that don't compress easily via the typical ALP -//! algorithm. -//! -//! ALP-RD uses the algorithm outlined in Section 3.4 of the paper, as well as relevant MIT-licensed -//! C++ code from CWI. 
-//! -//! The crux of it is that the front (most significant) bits of many double vectors tend to be -//! the same, i.e. most doubles in a vector often use the same exponent and front bits. Compression -//! proceeds by finding the best prefix of up to 16 bits that can be collapsed into a dictionary of -//! up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly -//! bit-packs down to 3 bits per element. The remaining `R` bits are bit-packed as well. -//! -//! In the ideal case, this gets about ~24% compression. -//! -//! The code in this module draws on the MIT-licensed [C++ implementation]. -//! -//! [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp - pub use array::*; mod array; @@ -58,15 +40,27 @@ mod private { impl Sealed for f64 {} } +/// Main trait for ALP-RD encodable floating point numbers. +/// +/// Like the paper, we limit this to the IEEE 754 single-precision (`f32`) and double-precision +/// (`f64`) floating point types. pub trait ALPRDFloat: private::Sealed + Float + Copy + NativePType { + /// The unsigned integer type with the same bit-width as the floating-point type. type UINT: NativePType + PrimInt + One + Copy; + /// Number of bits the value occupies in registers. const BITS: usize = size_of::() * 8; + /// Bit-wise transmute from the unsigned integer type to the floating-point type. fn from_bits(bits: Self::UINT) -> Self; + + /// Bit-wise transmute into the unsigned integer type. fn to_bits(value: Self) -> Self::UINT; + /// Truncating conversion from the unsigned integer type to `u16`. fn to_u16(bits: Self::UINT) -> u16; + + /// Type-widening conversion from `u16` to the unsigned integer type. fn from_u16(value: u16) -> Self::UINT; } @@ -110,15 +104,30 @@ impl ALPRDFloat for f32 { } } -/// Encoder for ALP-RD (real doubles) values. 
+/// +/// The encoder calculates its parameters from a single sample of floating-point values, +/// and then can be applied to many vectors. +/// +/// ALP-RD uses the algorithm outlined in Section 3.4 of the paper. The crux of it is that the front +/// (most significant) bits of many double vectors tend to be the same, i.e. most doubles in a +/// vector often use the same exponent and front bits. Compression proceeds by finding the best +/// prefix of up to 16 bits that can be collapsed into a dictionary of +/// up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly +/// bit-packs down to 1-3 bits per element (depending on the actual dictionary size). +/// The remaining `R` bits naturally bit-pack. /// -/// The encoder builds a sample of values from there. -pub struct Encoder { +/// In the ideal case, this scheme allows us to store a sequence of doubles in 49 bits-per-value. +/// +/// Our implementation draws on the MIT-licensed [C++ implementation] provided by the original authors. +/// +/// [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp +pub struct RDEncoder { right_bit_width: u8, codes: Vec, } -impl Encoder { +impl RDEncoder { /// Build a new encoder from a sample of doubles. pub fn new(sample: &[T]) -> Self where @@ -215,8 +224,7 @@ impl Encoder { let bw = bit_width!(max_exc_pos); let exc_pos_array = PrimitiveArray::from(exceptions_pos); - // SAFETY: the positions array is sorted, we calculate bw such that it is wide enough - // to hold the largest position index. + // SAFETY: We calculate bw such that it is wide enough to hold the largest position index. let packed_pos = unsafe { bitpack_encode_unchecked(exc_pos_array, bw) .vortex_unwrap() @@ -243,7 +251,13 @@ impl Encoder { } } -// Only applies for F64. +/// Decode a vector of ALP-RD encoded values back into their original floating point format. 
+/// +/// # Panics +/// +/// The function panics if the provided `left_parts` and `right_parts` differ in length. +/// +/// The function panics if the provided `exc_pos` and `exceptions` differ in length. pub fn alp_rd_decode( left_parts: &[u16], left_parts_dict: &[u16], diff --git a/vortex-sampling-compressor/src/compressors/alp_rd.rs b/vortex-sampling-compressor/src/compressors/alp_rd.rs index 542ea1acdc..e877d4067a 100644 --- a/vortex-sampling-compressor/src/compressors/alp_rd.rs +++ b/vortex-sampling-compressor/src/compressors/alp_rd.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant}; -use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, Encoder as ALPRDEncoder, ALPRD}; +use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, RDEncoder as ALPRDEncoder, ALPRD}; use vortex_dtype::PType; use vortex_error::{vortex_bail, VortexResult}; use vortex_fastlanes::BitPackedEncoding;