From 2c0a14849e7d68adafbf783bf1a4fc8043988d3b Mon Sep 17 00:00:00 2001 From: Will Manning Date: Tue, 24 Sep 2024 17:41:28 +0100 Subject: [PATCH] improve ALP size estimation --- encodings/alp/src/alp.rs | 49 ++++++++++++++++++++++++++--------- encodings/alp/src/compress.rs | 5 ++-- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/encodings/alp/src/alp.rs b/encodings/alp/src/alp.rs index 27e24e0e4b..5d73bbc8e3 100644 --- a/encodings/alp/src/alp.rs +++ b/encodings/alp/src/alp.rs @@ -2,7 +2,7 @@ use std::fmt::{Display, Formatter}; use std::mem::size_of; use itertools::Itertools; -use num_traits::{Float, NumCast, PrimInt, Zero}; +use num_traits::{CheckedSub, Float, NumCast, PrimInt, Saturating, ToPrimitive, Zero}; use serde::{Deserialize, Serialize}; use vortex_error::vortex_panic; @@ -21,7 +21,7 @@ impl Display for Exponents { } pub trait ALPFloat: Float + Display + 'static { - type ALPInt: PrimInt + Display; + type ALPInt: PrimInt + Display + ToPrimitive; const FRACTIONAL_BITS: u8; const MAX_EXPONENT: u8; @@ -30,10 +30,12 @@ pub trait ALPFloat: Float + Display + 'static { const IF10: &'static [Self]; /// Round to the nearest floating integer by shifting in and out of the low precision range. 
+ #[inline] fn fast_round(self) -> Self { (self + Self::SWEET) - Self::SWEET } + #[inline] fn as_int(self) -> Option<Self::ALPInt> { <Self::ALPInt as NumCast>::from(self) } @@ -50,16 +52,14 @@ pub trait ALPFloat: Float + Display + 'static { .collect_vec() }); - // TODO(wmanning): idea, start with highest e, then find the best f - // after that, try e's in descending order, with a gap no larger than the original e - f - for e in 0..Self::MAX_EXPONENT { + for e in (0..Self::MAX_EXPONENT).rev() { for f in 0..e { - let (_, encoded, exc_pos, exc_patches) = Self::encode( + let (_, encoded, _, exc_patches) = Self::encode( sample.as_deref().unwrap_or(values), Some(Exponents { e, f }), ); - let size = - (encoded.len() + exc_patches.len()) * size_of::<Self::ALPInt>() + (exc_pos.len() * 4); + + let size = Self::estimate_encoded_size(&encoded, &exc_patches); if size < best_nbytes { best_nbytes = size; best_exp = Exponents { e, f }; @@ -72,6 +72,31 @@ pub trait ALPFloat: Float + Display + 'static { best_exp } + #[inline(always)] + fn estimate_encoded_size(encoded: &[Self::ALPInt], patches: &[Self]) -> usize { + let bits_per_encoded = encoded + .iter() + .minmax() + .into_option() + // estimating bits per encoded value assuming frame-of-reference + bitpacking-without-patches + .and_then(|(min, max)| max.checked_sub(min)) + .and_then(|range_size: <Self as ALPFloat>::ALPInt| range_size.to_u64()) + .and_then(|range_size| { + range_size + .checked_ilog2() + .map(|bits| (bits + 1) as usize) + .or(Some(0)) + }) + .unwrap_or(size_of::<Self::ALPInt>() * 8); + + let encoded_bytes = (encoded.len() * bits_per_encoded + 7) / 8; + // each patch is a value + a position + // in practice, patch positions are in [0, u16::MAX] because of how we chunk + let patch_bytes = patches.len() * (size_of::<Self>() + size_of::<u16>()); + + encoded_bytes + patch_bytes + } + fn encode( values: &[Self], exponents: Option<Exponents>, @@ -149,7 +174,7 @@ impl ALPFloat for f32 { 10000000.0, 100000000.0, 1000000000.0, - 10000000000.0, + 10000000000.0, // 10^10 ]; const IF10: &'static [Self] = &[ 1.0, @@ -162,7 +187,7 
@@ impl ALPFloat for f32 { 0.0000001, 0.00000001, 0.000000001, - 0.0000000001, + 0.0000000001, // 10^-10 ]; } @@ -196,7 +221,7 @@ impl ALPFloat for f64 { 100000000000000000000.0, 1000000000000000000000.0, 10000000000000000000000.0, - 100000000000000000000000.0, + 100000000000000000000000.0, // 10^23 ]; const IF10: &'static [Self] = &[ @@ -223,6 +248,6 @@ impl ALPFloat for f64 { 0.00000000000000000001, 0.000000000000000000001, 0.0000000000000000000001, - 0.00000000000000000000001, + 0.00000000000000000000001, // 10^-23 ]; } diff --git a/encodings/alp/src/compress.rs b/encodings/alp/src/compress.rs index 33041bba68..40707e534e 100644 --- a/encodings/alp/src/compress.rs +++ b/encodings/alp/src/compress.rs @@ -2,7 +2,7 @@ use vortex::array::{PrimitiveArray, Sparse, SparseArray}; use vortex::validity::Validity; use vortex::{Array, ArrayDType, ArrayDef, IntoArray, IntoArrayVariant}; use vortex_dtype::{NativePType, PType}; -use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; +use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use vortex_scalar::Scalar; use crate::alp::ALPFloat; @@ -14,11 +14,12 @@ macro_rules! match_each_alp_float_ptype { ($self:expr, | $_:tt $enc:ident | $($body:tt)*) => ({ macro_rules! __with__ {( $_ $enc:ident ) => ( $($body)* )} use vortex_dtype::PType; + use vortex_error::vortex_panic; let ptype = $self; match ptype { PType::F32 => __with__! { f32 }, PType::F64 => __with__! { f64 }, - _ => panic!("ALP can only encode f32 and f64"), + _ => vortex_panic!("ALP can only encode f32 and f64, got {}", ptype), } }) }