docs and some other cleanup

spiraldb · Oct 2, 2024 · daccef7 · daccef7
1 parent a732275
commit daccef7
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 46 deletions.
diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
@@ -7,7 +7,7 @@ use vortex::array::PrimitiveArray;
 use vortex::validity::Validity;
 use vortex::variants::PrimitiveArrayTrait;
 use vortex::IntoCanonical;
-use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Encoder, Exponents};
+use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Exponents, RDEncoder};
 use vortex_dtype::NativePType;
 
 fn main() {
@@ -24,7 +24,7 @@ fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Vec<T::ALPInt>, Vec<u64>,
 fn compress_rd<T: ALPRDFloat>(bencher: Bencher, n: usize) {
     let values: Vec<T> = vec![T::from(1.23).unwrap(); n];
     let primitive = PrimitiveArray::from(values);
-    let encoder = Encoder::new(&[T::from(1.23).unwrap()]);
+    let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]);
 
     bencher.bench_local(|| encoder.encode(&primitive));
 }

diff --git a/encodings/alp/src/alp_rd/array.rs b/encodings/alp/src/alp_rd/array.rs
@@ -15,7 +15,6 @@ impl_encoding!("vortex.alprd", ids::ALP_RD, ALPRD);
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ALPRDMetadata {
     right_bit_width: u8,
-    // left_bit_width is implicit from the dict_len.
     dict_len: u8,
     dict: [u16; 8],
     left_parts_dtype: DType,
@@ -103,7 +102,7 @@ impl ALPRDArray {
 
     /// The rightmost (least significant) bits of the floating point values stored in the array.
     pub fn right_parts(&self) -> Array {
-        let uint_ptype = if self.metadata().is_f32 {
+        let uint_ptype = if self.is_f32() {
             PType::U32
         } else {
             PType::U64
@@ -254,7 +253,7 @@ mod test {
         let real_array = PrimitiveArray::from_nullable_vec(reals.clone());
 
         // Pick a seed that we know will trigger lots of exceptions.
-        let encoder: alp_rd::Encoder = alp_rd::Encoder::new(&[seed.powi(-2)]);
+        let encoder: alp_rd::RDEncoder = alp_rd::RDEncoder::new(&[seed.powi(-2)]);
 
         let rd_array = encoder.encode(&real_array);
 

diff --git a/encodings/alp/src/alp_rd/compute/filter.rs b/encodings/alp/src/alp_rd/compute/filter.rs
@@ -30,14 +30,14 @@ mod test {
     use vortex::compute::filter;
     use vortex::IntoArrayVariant;
 
-    use crate::{ALPRDFloat, Encoder};
+    use crate::{ALPRDFloat, RDEncoder};
 
     #[rstest]
     #[case(0.1f32, 0.2f32, 3e25f32)]
     #[case(0.1f64, 0.2f64, 3e100f64)]
     fn test_filter<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
         let array = PrimitiveArray::from(vec![a, b, outlier]);
-        let encoded = Encoder::new(&[a, b]).encode(&array);
+        let encoded = RDEncoder::new(&[a, b]).encode(&array);
 
         // Make sure that we're testing the exception pathway.
         assert!(encoded.left_parts_exceptions().is_some());

diff --git a/encodings/alp/src/alp_rd/compute/scalar_at.rs b/encodings/alp/src/alp_rd/compute/scalar_at.rs
@@ -1,6 +1,4 @@
 use vortex::compute::unary::{scalar_at, ScalarAtFn};
-use vortex::ArrayDType;
-use vortex_dtype::PType;
 use vortex_error::{VortexResult, VortexUnwrap};
 use vortex_scalar::Scalar;
 
@@ -21,14 +19,14 @@ impl ScalarAtFn for ALPRDArray {
         };
 
         // combine left and right values
-        if self.ptype() == Some(PType::F64) {
-            let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?;
-            let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right);
-            Ok(packed.into())
-        } else {
+        if self.is_f32() {
             let right: u32 = scalar_at(&self.right_parts(), index)?.try_into()?;
             let packed = f32::from_bits((left as u32) << self.right_bit_width() | right);
             Ok(packed.into())
+        } else {
+            let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?;
+            let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right);
+            Ok(packed.into())
         }
     }
 
@@ -44,7 +42,7 @@ mod test {
     use vortex::compute::unary::scalar_at;
     use vortex_scalar::Scalar;
 
-    use crate::{ALPRDFloat, Encoder};
+    use crate::{ALPRDFloat, RDEncoder};
 
     #[rstest]
     #[case(0.1f32, 0.2f32, 3e25f32)]
@@ -55,7 +53,7 @@ mod test {
         #[case] outlier: T,
     ) {
         let array = PrimitiveArray::from(vec![a, b, outlier]);
-        let encoded = Encoder::new(&[a, b]).encode(&array);
+        let encoded = RDEncoder::new(&[a, b]).encode(&array);
 
         // Make sure that we're testing the exception pathway.
         assert!(encoded.left_parts_exceptions().is_some());

diff --git a/encodings/alp/src/alp_rd/compute/slice.rs b/encodings/alp/src/alp_rd/compute/slice.rs
@@ -30,14 +30,14 @@ mod test {
     use vortex::compute::slice;
     use vortex::IntoArrayVariant;
 
-    use crate::{ALPRDFloat, Encoder};
+    use crate::{ALPRDFloat, RDEncoder};
 
     #[rstest]
     #[case(0.1f32, 0.2f32, 3e25f32)]
     #[case(0.1f64, 0.2f64, 3e100f64)]
     fn test_slice<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
         let array = PrimitiveArray::from(vec![a, b, outlier]);
-        let encoded = Encoder::new(&[a, b]).encode(&array);
+        let encoded = RDEncoder::new(&[a, b]).encode(&array);
 
         assert!(encoded.left_parts_exceptions().is_some());
 

diff --git a/encodings/alp/src/alp_rd/compute/take.rs b/encodings/alp/src/alp_rd/compute/take.rs
@@ -30,14 +30,14 @@ mod test {
     use vortex::compute::take;
     use vortex::IntoArrayVariant;
 
-    use crate::{ALPRDFloat, Encoder};
+    use crate::{ALPRDFloat, RDEncoder};
 
     #[rstest]
     #[case(0.1f32, 0.2f32, 3e25f32)]
     #[case(0.1f64, 0.2f64, 3e100f64)]
     fn test_take<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
         let array = PrimitiveArray::from(vec![a, b, outlier]);
-        let encoded = Encoder::new(&[a, b]).encode(&array);
+        let encoded = RDEncoder::new(&[a, b]).encode(&array);
 
         assert!(encoded.left_parts_exceptions().is_some());
 

diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs
@@ -1,21 +1,3 @@
-//! Encoding for "real doubles", i.e. doubles that don't compress easily via the typical ALP
-//! algorithm.
-//!
-//! ALP-RD uses the algorithm outlined in Section 3.4 of the paper, as well as relevant MIT-licensed
-//! C++ code from CWI.
-//!
-//! The crux of it is that the front (most significant) bits of many double vectors tend to be
-//! the same, i.e. most doubles in a vector often use the same exponent and front bits. Compression
-//! proceeds by finding the best prefix of up to 16 bits that can be collapsed into a dictionary of
-//! up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly
-//! bit-packs down to 3 bits per element. The remaining `R` bits are bit-packed as well.
-//!
-//! In the ideal case, this gets about ~24% compression.
-//!
-//! The code in this module draws on the MIT-licensed [C++ implementation].
-//!
-//! [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp
-
 pub use array::*;
 
 mod array;
@@ -58,15 +40,27 @@ mod private {
     impl Sealed for f64 {}
 }
 
+/// Main trait for ALP-RD encodable floating point numbers.
+///
+/// Like the paper, we limit this to the IEEE 754 single-precision (`f32`) and double-precision
+/// (`f64`) floating point types.
 pub trait ALPRDFloat: private::Sealed + Float + Copy + NativePType {
+    /// The unsigned integer type with the same bit-width as the floating-point type.
     type UINT: NativePType + PrimInt + One + Copy;
 
+    /// Number of bits the value occupies in registers.
     const BITS: usize = size_of::<Self>() * 8;
 
+    /// Bit-wise transmute from the unsigned integer type to the floating-point type.
     fn from_bits(bits: Self::UINT) -> Self;
+
+    /// Bit-wise transmute into the unsigned integer type.
     fn to_bits(value: Self) -> Self::UINT;
 
+    /// Truncating conversion from the unsigned integer type to `u16`.
     fn to_u16(bits: Self::UINT) -> u16;
+
+    /// Type-widening conversion from `u16` to the unsigned integer type.
     fn from_u16(value: u16) -> Self::UINT;
 }
 
@@ -110,15 +104,30 @@ impl ALPRDFloat for f32 {
     }
 }
 
-/// Encoder for ALP-RD (real doubles) values.
+/// Encoder for ALP-RD ("real doubles") values.
+///
+/// The encoder calculates its parameters from a single sample of floating-point values,
+/// and then can be applied to many vectors.
+///
+/// ALP-RD uses the algorithm outlined in Section 3.4 of the paper. The crux of it is that the front
+/// (most significant) bits of many double vectors tend to be the same, i.e. most doubles in a
+/// vector often use the same exponent and front bits. Compression proceeds by finding the best
+/// prefix of up to 16 bits that can be collapsed into a dictionary of
+/// up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly
+/// bit-packs down to 1-3 bits per element (depending on the actual dictionary size).
+/// The remaining `R` bits naturally bit-pack.
 ///
-/// The encoder builds a sample of values from there.
-pub struct Encoder {
+/// In the ideal case, this scheme allows us to store a sequence of doubles in 49 bits-per-value.
+///
+/// Our implementation draws on the MIT-licensed [C++ implementation] provided by the original authors.
+///
+/// [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp
+pub struct RDEncoder {
     right_bit_width: u8,
     codes: Vec<u16>,
 }
 
-impl Encoder {
+impl RDEncoder {
     /// Build a new encoder from a sample of doubles.
     pub fn new<T>(sample: &[T]) -> Self
     where
@@ -215,8 +224,7 @@ impl Encoder {
             let bw = bit_width!(max_exc_pos);
 
             let exc_pos_array = PrimitiveArray::from(exceptions_pos);
-            // SAFETY: the positions array is sorted, we calculate bw such that it is wide enough
-            //  to hold the largest position index.
+            // SAFETY: We calculate bw such that it is wide enough to hold the largest position index.
             let packed_pos = unsafe {
                 bitpack_encode_unchecked(exc_pos_array, bw)
                     .vortex_unwrap()
@@ -243,7 +251,13 @@ impl Encoder {
     }
 }
 
-// Only applies for F64.
+/// Decode a vector of ALP-RD encoded values back into their original floating point format.
+///
+/// # Panics
+///
+/// The function panics if the provided `left_parts` and `right_parts` differ in length.
+///
+/// The function panics if the provided `exc_pos` and `exceptions` differ in length.
 pub fn alp_rd_decode<T: ALPRDFloat>(
     left_parts: &[u16],
     left_parts_dict: &[u16],

diff --git a/vortex-sampling-compressor/src/compressors/alp_rd.rs b/vortex-sampling-compressor/src/compressors/alp_rd.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
 use vortex::array::PrimitiveArray;
 use vortex::encoding::EncodingRef;
 use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant};
-use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, Encoder as ALPRDEncoder, ALPRD};
+use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, RDEncoder as ALPRDEncoder, ALPRD};
 use vortex_dtype::PType;
 use vortex_error::{vortex_bail, VortexResult};
 use vortex_fastlanes::BitPackedEncoding;