Skip to content

Commit

Permalink
docs and some other cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Oct 2, 2024
1 parent a732275 commit daccef7
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 46 deletions.
4 changes: 2 additions & 2 deletions encodings/alp/benches/alp_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use vortex::array::PrimitiveArray;
use vortex::validity::Validity;
use vortex::variants::PrimitiveArrayTrait;
use vortex::IntoCanonical;
use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Encoder, Exponents};
use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Exponents, RDEncoder};
use vortex_dtype::NativePType;

fn main() {
Expand All @@ -24,7 +24,7 @@ fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Vec<T::ALPInt>, Vec<u64>,
fn compress_rd<T: ALPRDFloat>(bencher: Bencher, n: usize) {
let values: Vec<T> = vec![T::from(1.23).unwrap(); n];
let primitive = PrimitiveArray::from(values);
let encoder = Encoder::new(&[T::from(1.23).unwrap()]);
let encoder = RDEncoder::new(&[T::from(1.23).unwrap()]);

bencher.bench_local(|| encoder.encode(&primitive));
}
Expand Down
5 changes: 2 additions & 3 deletions encodings/alp/src/alp_rd/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ impl_encoding!("vortex.alprd", ids::ALP_RD, ALPRD);
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ALPRDMetadata {
right_bit_width: u8,
// left_bit_width is implicit from the dict_len.
dict_len: u8,
dict: [u16; 8],
left_parts_dtype: DType,
Expand Down Expand Up @@ -103,7 +102,7 @@ impl ALPRDArray {

/// The rightmost (least significant) bits of the floating point values stored in the array.
pub fn right_parts(&self) -> Array {
let uint_ptype = if self.metadata().is_f32 {
let uint_ptype = if self.is_f32() {
PType::U32
} else {
PType::U64
Expand Down Expand Up @@ -254,7 +253,7 @@ mod test {
let real_array = PrimitiveArray::from_nullable_vec(reals.clone());

// Pick a seed that we know will trigger lots of exceptions.
let encoder: alp_rd::Encoder = alp_rd::Encoder::new(&[seed.powi(-2)]);
let encoder: alp_rd::RDEncoder = alp_rd::RDEncoder::new(&[seed.powi(-2)]);

let rd_array = encoder.encode(&real_array);

Expand Down
4 changes: 2 additions & 2 deletions encodings/alp/src/alp_rd/compute/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ mod test {
use vortex::compute::filter;
use vortex::IntoArrayVariant;

use crate::{ALPRDFloat, Encoder};
use crate::{ALPRDFloat, RDEncoder};

#[rstest]
#[case(0.1f32, 0.2f32, 3e25f32)]
#[case(0.1f64, 0.2f64, 3e100f64)]
fn test_filter<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
let array = PrimitiveArray::from(vec![a, b, outlier]);
let encoded = Encoder::new(&[a, b]).encode(&array);
let encoded = RDEncoder::new(&[a, b]).encode(&array);

// Make sure that we're testing the exception pathway.
assert!(encoded.left_parts_exceptions().is_some());
Expand Down
16 changes: 7 additions & 9 deletions encodings/alp/src/alp_rd/compute/scalar_at.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use vortex::compute::unary::{scalar_at, ScalarAtFn};
use vortex::ArrayDType;
use vortex_dtype::PType;
use vortex_error::{VortexResult, VortexUnwrap};
use vortex_scalar::Scalar;

Expand All @@ -21,14 +19,14 @@ impl ScalarAtFn for ALPRDArray {
};

// combine left and right values
if self.ptype() == Some(PType::F64) {
let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?;
let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right);
Ok(packed.into())
} else {
if self.is_f32() {
let right: u32 = scalar_at(&self.right_parts(), index)?.try_into()?;
let packed = f32::from_bits((left as u32) << self.right_bit_width() | right);
Ok(packed.into())
} else {
let right: u64 = scalar_at(&self.right_parts(), index)?.try_into()?;
let packed = f64::from_bits(((left as u64) << self.right_bit_width()) | right);
Ok(packed.into())
}
}

Expand All @@ -44,7 +42,7 @@ mod test {
use vortex::compute::unary::scalar_at;
use vortex_scalar::Scalar;

use crate::{ALPRDFloat, Encoder};
use crate::{ALPRDFloat, RDEncoder};

#[rstest]
#[case(0.1f32, 0.2f32, 3e25f32)]
Expand All @@ -55,7 +53,7 @@ mod test {
#[case] outlier: T,
) {
let array = PrimitiveArray::from(vec![a, b, outlier]);
let encoded = Encoder::new(&[a, b]).encode(&array);
let encoded = RDEncoder::new(&[a, b]).encode(&array);

// Make sure that we're testing the exception pathway.
assert!(encoded.left_parts_exceptions().is_some());
Expand Down
4 changes: 2 additions & 2 deletions encodings/alp/src/alp_rd/compute/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ mod test {
use vortex::compute::slice;
use vortex::IntoArrayVariant;

use crate::{ALPRDFloat, Encoder};
use crate::{ALPRDFloat, RDEncoder};

#[rstest]
#[case(0.1f32, 0.2f32, 3e25f32)]
#[case(0.1f64, 0.2f64, 3e100f64)]
fn test_slice<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
let array = PrimitiveArray::from(vec![a, b, outlier]);
let encoded = Encoder::new(&[a, b]).encode(&array);
let encoded = RDEncoder::new(&[a, b]).encode(&array);

assert!(encoded.left_parts_exceptions().is_some());

Expand Down
4 changes: 2 additions & 2 deletions encodings/alp/src/alp_rd/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ mod test {
use vortex::compute::take;
use vortex::IntoArrayVariant;

use crate::{ALPRDFloat, Encoder};
use crate::{ALPRDFloat, RDEncoder};

#[rstest]
#[case(0.1f32, 0.2f32, 3e25f32)]
#[case(0.1f64, 0.2f64, 3e100f64)]
fn test_take<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
let array = PrimitiveArray::from(vec![a, b, outlier]);
let encoded = Encoder::new(&[a, b]).encode(&array);
let encoded = RDEncoder::new(&[a, b]).encode(&array);

assert!(encoded.left_parts_exceptions().is_some());

Expand Down
64 changes: 39 additions & 25 deletions encodings/alp/src/alp_rd/mod.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,3 @@
//! Encoding for "real doubles", i.e. doubles that don't compress easily via the typical ALP
//! algorithm.
//!
//! ALP-RD uses the algorithm outlined in Section 3.4 of the paper, as well as relevant MIT-licensed
//! C++ code from CWI.
//!
//! The crux of it is that the front (most significant) bits of many double vectors tend to be
//! the same, i.e. most doubles in a vector often use the same exponent and front bits. Compression
//! proceeds by finding the best prefix of up to 16 bits that can be collapsed into a dictionary of
//! up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly
//! bit-packs down to 3 bits per element. The remaining `R` bits are bit-packed as well.
//!
//! In the ideal case, this gets about ~24% compression.
//!
//! The code in this module draws on the MIT-licensed [C++ implementation].
//!
//! [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp
pub use array::*;

mod array;
Expand Down Expand Up @@ -58,15 +40,27 @@ mod private {
impl Sealed for f64 {}
}

/// Main trait for ALP-RD encodable floating point numbers.
///
/// Like the paper, we limit this to the IEEE 754 single-precision (`f32`) and double-precision
/// (`f64`) floating point types. The `private::Sealed` supertrait bound means only types that
/// implement `Sealed` can implement this trait.
pub trait ALPRDFloat: private::Sealed + Float + Copy + NativePType {
/// The unsigned integer type with the same bit-width as the floating-point type.
type UINT: NativePType + PrimInt + One + Copy;

/// Width of the floating-point type in bits, computed as `size_of::<Self>() * 8`.
const BITS: usize = size_of::<Self>() * 8;

/// Bit-wise transmute from the unsigned integer type to the floating-point type.
fn from_bits(bits: Self::UINT) -> Self;

/// Bit-wise transmute from the floating-point type into the unsigned integer type.
fn to_bits(value: Self) -> Self::UINT;

/// Truncating conversion from the unsigned integer type to `u16`.
fn to_u16(bits: Self::UINT) -> u16;

/// Type-widening conversion from `u16` to the unsigned integer type.
fn from_u16(value: u16) -> Self::UINT;
}

Expand Down Expand Up @@ -110,15 +104,30 @@ impl ALPRDFloat for f32 {
}
}

/// Encoder for ALP-RD (real doubles) values.
/// Encoder for ALP-RD ("real doubles") values.
///
/// The encoder calculates its parameters from a single sample of floating-point values,
/// and then can be applied to many vectors.
///
/// ALP-RD uses the algorithm outlined in Section 3.4 of the paper. The crux of it is that the front
/// (most significant) bits of many double vectors tend to be the same, i.e. most doubles in a
/// vector often use the same exponent and front bits. Compression proceeds by finding the best
/// prefix of up to 16 bits that can be collapsed into a dictionary of
/// up to 8 elements. Each double can then be broken into the front/left `L` bits, which neatly
/// bit-packs down to 1-3 bits per element (depending on the actual dictionary size).
/// The remaining `R` bits naturally bit-pack.
///
/// The encoder builds a sample of values from there.
pub struct Encoder {
/// In the ideal case, this scheme allows us to store a sequence of doubles in 49 bits-per-value.
///
/// Our implementation draws on the MIT-licensed [C++ implementation] provided by the original authors.
///
/// [C++ implementation]: https://github.com/cwida/ALP/blob/main/include/alp/rd.hpp
pub struct RDEncoder {
right_bit_width: u8,
codes: Vec<u16>,
}

impl Encoder {
impl RDEncoder {
/// Build a new encoder from a sample of doubles.
pub fn new<T>(sample: &[T]) -> Self
where
Expand Down Expand Up @@ -215,8 +224,7 @@ impl Encoder {
let bw = bit_width!(max_exc_pos);

let exc_pos_array = PrimitiveArray::from(exceptions_pos);
// SAFETY: the positions array is sorted, we calculate bw such that it is wide enough
// to hold the largest position index.
// SAFETY: We calculate bw such that it is wide enough to hold the largest position index.
let packed_pos = unsafe {
bitpack_encode_unchecked(exc_pos_array, bw)
.vortex_unwrap()
Expand All @@ -243,7 +251,13 @@ impl Encoder {
}
}

// Only applies for F64.
/// Decode a vector of ALP-RD encoded values back into their original floating point format.
///
/// # Panics
///
/// The function panics if the provided `left_parts` and `right_parts` differ in length.
///
/// The function panics if the provided `exc_pos` and `exceptions` differ in length.
pub fn alp_rd_decode<T: ALPRDFloat>(
left_parts: &[u16],
left_parts_dict: &[u16],
Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/alp_rd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::sync::Arc;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant};
use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, Encoder as ALPRDEncoder, ALPRD};
use vortex_alp::{match_each_alp_float_ptype, ALPRDEncoding, RDEncoder as ALPRDEncoder, ALPRD};
use vortex_dtype::PType;
use vortex_error::{vortex_bail, VortexResult};
use vortex_fastlanes::BitPackedEncoding;
Expand Down

0 comments on commit daccef7

Please sign in to comment.