Skip to content

Commit

Permalink
feat: implement ALP-RD compression (#947)
Browse files Browse the repository at this point in the history
Fixes #10: Add ALP-RD compression.

Currently our only floating point compression algorithm is standard ALP,
which targets floats/doubles that are originally decimal, and thus have
some natural integer they can round to when you undo the exponent.

For science/math datasets, there are a lot of "real doubles", i.e.
floating point numbers that use most/all of their available precision.
These do not compress with standard ALP. The ALP paper authors had a
solution for this called "ALP for 'Real' Doubles" / ALP-RD, which is
implemented in this PR.

## Basics

The key insight of ALP-RD is that even for dense floating point numbers,
within a column they often share the front bits (exponent + first few
bits of mantissa). We try to find the best cut-point within the
leftmost 16 bits.

There are generally a small number of unique values for the leftmost
bits, so you can create a dictionary of fixed size (here we use the
choice of 8 from the C++ implementation) which naturally bit-packs down
to 3 bits. If you compress perfectly without exceptions, you can store
53 bits/value, i.e. ~17% compression for 64-bit doubles. In practice the amount varies. In the
comments below you can see a test with the POI dataset referenced in the
ALP paper, and we replicate their results of 55 and 56 bits/value
respectively.

## List of changes

* Reorganized the `vortex-alp` crate. I created two top-level modules,
`alp` and `alp_rd`, and moved the previous implementation into the `alp`
module
* Added new `ALPRDArray` in the `alp_rd` module. It supports both f32
and f64, and all major compute functions are implemented (save for
`MaybeCompareFn` and the Accessors; I will file an issue to implement
these in a follow-up if that is alright, since this PR is already quite large)
* Added corresponding `ALPRDCompressor` and wired the CompressorRef
everywhere I could find ALPCompressor
* New benchmark for RD compression in the existing ALP benchmarks suite
  • Loading branch information
a10y authored Oct 2, 2024
1 parent 251d3ed commit 389e6a4
Show file tree
Hide file tree
Showing 24 changed files with 1,101 additions and 14 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use vortex::{Array, Context, IntoArray};
use vortex_dtype::DType;
use vortex_fastlanes::DeltaEncoding;
use vortex_sampling_compressor::compressors::alp::ALPCompressor;
use vortex_sampling_compressor::compressors::alp_rd::ALPRDCompressor;
use vortex_sampling_compressor::compressors::bitpacked::BitPackedCompressor;
use vortex_sampling_compressor::compressors::date_time_parts::DateTimePartsCompressor;
use vortex_sampling_compressor::compressors::dict::DictCompressor;
Expand Down Expand Up @@ -54,6 +55,7 @@ lazy_static! {
lazy_static! {
pub static ref COMPRESSORS: HashSet<CompressorRef<'static>> = [
&ALPCompressor as CompressorRef<'static>,
&ALPRDCompressor,
&DictCompressor,
&BitPackedCompressor,
&FoRCompressor,
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ pub async fn rewrite_parquet_as_vortex<W: VortexWrite>(
Ok(())
}

pub fn read_parquet_to_vortex(parquet_path: &Path) -> VortexResult<ChunkedArray> {
pub fn read_parquet_to_vortex<P: AsRef<Path>>(parquet_path: P) -> VortexResult<ChunkedArray> {
let taxi_pq = File::open(parquet_path)?;
let builder = ParquetRecordBatchReaderBuilder::try_new(taxi_pq)?;
// FIXME(ngates): #157 the compressor should handle batch size.
Expand Down
2 changes: 2 additions & 0 deletions encodings/alp/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ readme = { workspace = true }
workspace = true

[dependencies]
vortex-fastlanes = { workspace = true }
itertools = { workspace = true }
num-traits = { workspace = true }
serde = { workspace = true, features = ["derive"] }
Expand All @@ -28,6 +29,7 @@ vortex-scalar = { workspace = true }
[dev-dependencies]
arrow = { workspace = true }
divan = { workspace = true }
rstest = { workspace = true }

[[bench]]
name = "alp_compress"
Expand Down
13 changes: 11 additions & 2 deletions encodings/alp/benches/alp_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,28 @@ use vortex::array::PrimitiveArray;
use vortex::validity::Validity;
use vortex::variants::PrimitiveArrayTrait;
use vortex::IntoCanonical;
use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, Exponents};
use vortex_alp::{alp_encode_components, ALPArray, ALPFloat, ALPRDFloat, Exponents, RDEncoder};
use vortex_dtype::NativePType;

// Benchmark binary entry point: delegates to `divan::main`, which is expected
// to run the `#[divan::bench]` functions declared in this file.
fn main() {
divan::main();
}

#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
fn alp_compress<T: ALPFloat>(n: usize) -> (Exponents, Vec<T::ALPInt>, Vec<u64>, Vec<T>) {
fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Vec<T::ALPInt>, Vec<u64>, Vec<T>) {
let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
T::encode(values.as_slice(), None)
}

/// Benchmarks ALP-RD encoding of `n` copies of the value `1.23`.
///
/// Runs for both `f32` and `f64` at each of the configured sizes. The encoder
/// is built from a one-element sample and the input array is materialized up
/// front, so only the `encode` call sits inside the benchmarked closure.
#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
fn compress_rd<T: ALPRDFloat>(bencher: Bencher, n: usize) {
    // Setup (outside the benchmarked closure): encoder from a single sample value.
    let rd_encoder = RDEncoder::new(&[T::from(1.23).unwrap()]);

    // Setup (outside the benchmarked closure): the constant-valued input column.
    let input = PrimitiveArray::from(vec![T::from(1.23).unwrap(); n]);

    bencher.bench_local(|| rd_encoder.encode(&input));
}

#[divan::bench(types = [f32, f64], args = [100_000, 1_000_000, 10_000_000])]
fn alp_iter<T>(bencher: Bencher, n: usize)
where
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ use vortex::{
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};

use crate::alp::Exponents;
use crate::compress::{alp_encode, decompress};
use crate::alp::{alp_encode, decompress, Exponents};
use crate::ALPFloat;

impl_encoding!("vortex.alp", ids::ALP, ALP);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ use vortex_dtype::{NativePType, PType};
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
use vortex_scalar::ScalarValue;

use crate::alp::ALPFloat;
use crate::array::ALPArray;
use crate::alp::{ALPArray, ALPFloat};
use crate::Exponents;

#[macro_export]
Expand Down
File renamed without changes.
16 changes: 15 additions & 1 deletion encodings/alp/src/alp.rs → encodings/alp/src/alp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ use itertools::Itertools;
use num_traits::{CheckedSub, Float, PrimInt, ToPrimitive};
use serde::{Deserialize, Serialize};

mod array;
mod compress;
mod compute;

pub use array::*;
pub use compress::*;

const SAMPLE_SIZE: usize = 32;

#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
Expand All @@ -19,7 +26,14 @@ impl Display for Exponents {
}
}

pub trait ALPFloat: Float + Display + 'static {
/// Private module holding the sealing marker trait.
///
/// The module itself is not `pub`, so code outside the parent module cannot
/// name `Sealed` and therefore cannot implement any trait that lists it as a
/// supertrait.
mod private {
    /// Marker trait implemented only for the float types listed below.
    pub trait Sealed {}

    // The complete set of sealed implementors: the two native float widths.
    impl Sealed for f32 {}
    impl Sealed for f64 {}
}

pub trait ALPFloat: private::Sealed + Float + Display + 'static {
type ALPInt: PrimInt + Display + ToPrimitive;

const FRACTIONAL_BITS: u8;
Expand Down
Loading

0 comments on commit 389e6a4

Please sign in to comment.