From 3d6dd50dc7bee646c1ad32c2f30822305310fb16 Mon Sep 17 00:00:00 2001
From: Dan King
Date: Thu, 10 Oct 2024 16:44:09 -0400
Subject: [PATCH] feat: add ChunkedCompressor which compresses chunk n+1 like chunk n (#996)

The primary idea is that chunk n+1 more often than not has a distribution of
values similar to chunk n's, so we ought to reuse chunk n's compression scheme,
as long as its ratio is still "good", before attempting a full sampling pass.
This has the potential both to increase throughput and to let us invest in a
more extensive search on the first chunk. (A simplified sketch of the reuse
rule appears after the benchmarks below.)

This PR introduces `ChunkedCompressor` and `StructCompressor`. Their existence
means that compression trees now fully represent an array. For example, given a
`Chunked(Struct(foo=Chunked(U64), ...))`, the `ChunkedCompressor` will attempt
to compress all the U64 chunks similarly and then pass the ratio and encoding
tree of the last chunk up to the `StructCompressor`. Eventually the outer
`ChunkedCompressor` can attempt to reuse, on the second outer chunk, all the
encodings from all the fields of the first outer chunk.

This PR reads best with whitespace changes ignored. The `CompressionTree`
(particularly the metadata) is not especially ergonomic, but I focused on the
throughput improvement rather than a refactor.

### benchmarks

Any ratio outside the interval (0.8, 1.2) is bolded.

|Benchmark suite|Current: 68faec3|Previous: 4aa30c0|Unit|Ratio|
|-|-|-|-|-|
|taxi: compress time|1.4951|2.9452|s|**0.51**|
|taxi: compress throughput|300.31|152.45|MiB/s|**1.97**|
|taxi: vortex:parquetzstd size|0.94933|0.95669||0.99|
|taxi: compress ratio|0.10633|0.10605||1.00|
|taxi: compressed size|47.744|47.615|MiB|1.00|
|AirlineSentiment: compress time|0.00038491|0.00036394|s|1.06|
|AirlineSentiment: compress throughput|5.0049|5.2933|MiB/s|0.95|
|AirlineSentiment: vortex:parquetzstd size|6.3744|6.3744||1.00|
|AirlineSentiment: compress ratio|0.62079|0.62079||1.00|
|AirlineSentiment: compressed size|0.0011959|0.0011959|MiB|1.00|
|Arade: compress time|2.294|3.9502|s|**0.58**|
|Arade: compress throughput|327.19|190.01|MiB/s|**1.72**|
|Arade: vortex:parquetzstd size|0.47662|0.47901||1.00|
|Arade: compress ratio|0.17756|0.17816||1.00|
|Arade: compressed size|133.27|133.72|MiB|1.00|
|Bimbo: compress time|12.753|25.983|s|**0.49**|
|Bimbo: compress throughput|532.55|261.38|MiB/s|**2.04**|
|Bimbo: vortex:parquetzstd size|1.2573|1.1858||1.06|
|Bimbo: compress ratio|0.061503|0.057562||1.07|
|Bimbo: compressed size|417.69|390.93|MiB|1.07|
|CMSprovider: compress time|11.892|16.619|s|**0.72**|
|CMSprovider: compress throughput|412.91|295.48|MiB/s|**1.40**|
|CMSprovider: vortex:parquetzstd size|1.0742|1.0992||0.98|
|CMSprovider: compress ratio|0.15301|0.1575||0.97|
|CMSprovider: compressed size|751.38|773.42|MiB|0.97|
|Euro2016: compress time|1.7194|2.0275|s|0.85|
|Euro2016: compress throughput|218.12|184.97|MiB/s|1.18|
|Euro2016: vortex:parquetzstd size|1.3998|1.3737||1.02|
|Euro2016: compress ratio|0.4182|0.41015||1.02|
|Euro2016: compressed size|156.84|153.82|MiB|1.02|
|Food: compress time|1.0851|1.3049|s|0.83|
|Food: compress throughput|292.41|243.16|MiB/s|1.20|
|Food: vortex:parquetzstd size|1.2213|1.2548||0.97|
|Food: compress ratio|0.12602|0.13044||0.97|
|Food: compressed size|39.986|41.39|MiB|0.97|
|HashTags: compress time|2.3817|3.1473|s|**0.76**|
|HashTags: compress throughput|322.14|243.77|MiB/s|**1.32**|
|HashTags: vortex:parquetzstd size|1.5056|1.5142||0.99|
|HashTags: compress ratio|0.24665|0.2483||0.99|
|HashTags: compressed size|189.23|190.51|MiB|0.99|
|TPC-H l_comment: compress time|0.8073|1.2042|s|**0.67**|
|TPC-H l_comment: compress throughput|216.19|144.93|MiB/s|**1.49**|
|TPC-H l_comment: vortex:parquetzstd size|1.1701|1.1648||1.00|
|TPC-H l_comment: compress ratio|0.36161|0.35995||1.00|
|TPC-H l_comment: compressed size|63.113|62.822|MiB|1.00|
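
To make the heuristic concrete, here is a minimal, self-contained sketch of the per-chunk decision rule. It is not the vortex API: the names `Previous`, `needs_full_search`, and `RELATIVELY_GOOD_RATIO` are invented for illustration, but the condition mirrors the one in `ChunkedCompressor::compress_chunked` in this patch.

```rust
/// Compression outcome of the previous chunk (illustrative only).
struct Previous {
    /// compressed_bytes / uncompressed_bytes achieved on the previous chunk.
    ratio: f32,
}

/// Mirrors `relatively_good_ratio = 1.2` in the patch.
const RELATIVELY_GOOD_RATIO: f32 = 1.2;

/// True if chunk n+1 should fall back to a full sampling search instead of
/// reusing chunk n's compression tree.
fn needs_full_search(chunk_ratio: f32, previous: Option<&Previous>) -> bool {
    let exceeded_target = previous
        .map(|p| chunk_ratio > p.ratio * RELATIVELY_GOOD_RATIO)
        .unwrap_or(false);
    // A ratio above 1.0 means the reused tree actually grew the chunk.
    chunk_ratio > 1.0 || exceeded_target
}

fn main() {
    let prev = Previous { ratio: 0.25 };
    // 0.28 <= 0.25 * 1.2, so keep reusing the previous chunk's tree.
    assert!(!needs_full_search(0.28, Some(&prev)));
    // 0.35 > 0.25 * 1.2, so re-run the sampling search for this chunk.
    assert!(needs_full_search(0.35, Some(&prev)));
    println!("reuse heuristic behaves as expected");
}
```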
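
Likewise, here is a toy model (invented types, not the real `CompressionTree`) of what "compression trees now fully represent an array" means for the `Chunked(Struct(foo=Chunked(U64), ...))` example above; in the real code the chunked node's ratio lives in `ChunkedCompressorMetadata`.

```rust
// Every level of the array gets a node, and chunked nodes carry the last
// chunk's ratio so the next chunk can be compressed "like" it.
#[derive(Debug)]
enum ToyTree {
    Chunked { last_chunk: Box<ToyTree>, last_ratio: f32 },
    Struct { fields: Vec<ToyTree> },
    Leaf { encoding: &'static str },
}

fn main() {
    let tree = ToyTree::Chunked {
        last_ratio: 0.21,
        last_chunk: Box::new(ToyTree::Struct {
            fields: vec![ToyTree::Chunked {
                last_ratio: 0.18,
                last_chunk: Box::new(ToyTree::Leaf {
                    encoding: "fastlanes.bitpacked",
                }),
            }],
        }),
    };
    println!("{tree:#?}");
}
```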
---
 Cargo.lock                            |   1 +
 vortex-sampling-compressor/Cargo.toml |   1 +
 .../src/compressors/chunked.rs        | 156 ++++++++++++
 .../src/compressors/fsst.rs           |   2 +-
 .../src/compressors/mod.rs            |  14 ++
 .../src/compressors/struct_.rs        |  66 +++++
 vortex-sampling-compressor/src/lib.rs | 229 ++++++++----------
 7 files changed, 334 insertions(+), 135 deletions(-)
 create mode 100644 vortex-sampling-compressor/src/compressors/chunked.rs
 create mode 100644 vortex-sampling-compressor/src/compressors/struct_.rs

diff --git a/Cargo.lock b/Cargo.lock
index 88af749c68..137c33334f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4597,6 +4597,7 @@ dependencies = [
  "arbitrary",
  "chrono",
  "fsst-rs",
+ "itertools 0.13.0",
  "lazy_static",
  "log",
  "rand",
diff --git a/vortex-sampling-compressor/Cargo.toml b/vortex-sampling-compressor/Cargo.toml
index 6e097f1cd6..e616de8b58 100644
--- a/vortex-sampling-compressor/Cargo.toml
+++ b/vortex-sampling-compressor/Cargo.toml
@@ -16,6 +16,7 @@ readme = { workspace = true }
 [dependencies]
 arbitrary = { workspace = true, optional = true }
 fsst-rs = { workspace = true }
+itertools = { workspace = true }
 lazy_static = { workspace = true }
 log = { workspace = true }
 rand = { workspace = true }
diff --git a/vortex-sampling-compressor/src/compressors/chunked.rs b/vortex-sampling-compressor/src/compressors/chunked.rs
new file mode 100644
index 0000000000..23ff730d17
--- /dev/null
+++ b/vortex-sampling-compressor/src/compressors/chunked.rs
@@ -0,0 +1,156 @@
+use std::any::Any;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use log::warn;
+use vortex::array::{Chunked, ChunkedArray};
+use vortex::encoding::EncodingRef;
+use vortex::{Array, ArrayDType, ArrayDef, IntoArray};
+use vortex_error::{vortex_bail, VortexResult};
+
+use super::EncoderMetadata;
+use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
+use crate::SamplingCompressor;
+
+#[derive(Debug)]
+pub struct ChunkedCompressor {
+    relatively_good_ratio: f32,
+}
+
+pub const DEFAULT_CHUNKED_COMPRESSOR: ChunkedCompressor = ChunkedCompressor {
+    relatively_good_ratio: 1.2,
+};
+
+pub struct ChunkedCompressorMetadata(Option<f32>);
+
+impl EncoderMetadata for ChunkedCompressorMetadata {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+impl EncodingCompressor for ChunkedCompressor {
+    fn id(&self) -> &str {
+        Chunked::ID.as_ref()
+    }
+
+    fn cost(&self) -> u8 {
+        0
+    }
+
+    fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
+        array.is_encoding(Chunked::ID).then_some(self)
+    }
+
+    fn compress<'a>(
+        &'a self,
+        array: &Array,
+        like: Option<CompressionTree<'a>>,
+        ctx: SamplingCompressor<'a>,
+    ) -> VortexResult<CompressedArray<'a>> {
+        let chunked_array = ChunkedArray::try_from(array)?;
+        let like_and_ratio = like_into_parts(like)?;
+        self.compress_chunked(&chunked_array, like_and_ratio, ctx)
+    }
+
+    fn used_encodings(&self) -> HashSet<EncodingRef> {
+        HashSet::from([])
+    }
+}
+
+fn like_into_parts(
+    tree: Option<CompressionTree<'_>>,
+) -> VortexResult<Option<(CompressionTree<'_>, f32)>> {
+    let (_, mut children, metadata) = match tree {
+        None => return Ok(None),
+        Some(tree) => tree.into_parts(),
+    };
+
+    let Some(target_ratio) = metadata else {
+        vortex_bail!("chunked array compression tree must have metadata")
+    };
+
+    let Some(ChunkedCompressorMetadata(target_ratio)) =
+        target_ratio.as_ref().as_any().downcast_ref()
+    else {
+        vortex_bail!("chunked array compression tree must be ChunkedCompressorMetadata")
+    };
+
+    if children.len() != 1 {
+        vortex_bail!("chunked array compression tree must have one child")
+    }
+
+    let child = children.remove(0);
+
+    match (child, target_ratio) {
+        (None, None) => Ok(None),
+        (Some(child), Some(ratio)) => Ok(Some((child, *ratio))),
+        (..) => vortex_bail!("chunked array compression tree must have a child iff it has a ratio"),
+    }
+}
+
+impl ChunkedCompressor {
+    /// How far the compression ratio is allowed to grow from one chunk to another chunk.
+    ///
+    /// As long as a compressor compresses subsequent chunks "reasonably well" we should continue to
+    /// use it, which saves us the cost of searching for a good compressor. This constant quantifies
+    /// "reasonably well" as
+    ///
+    /// ```text
+    /// new_ratio <= old_ratio * self.relatively_good_ratio
+    /// ```
+    fn relatively_good_ratio(&self) -> f32 {
+        self.relatively_good_ratio
+    }
+
+    fn compress_chunked<'a>(
+        &'a self,
+        array: &ChunkedArray,
+        mut previous: Option<(CompressionTree<'a>, f32)>,
+        ctx: SamplingCompressor<'a>,
+    ) -> VortexResult<CompressedArray<'a>> {
+        let less_chunked = array.rechunk(
+            ctx.options().target_block_bytesize,
+            ctx.options().target_block_size,
+        )?;
+        let mut compressed_chunks = Vec::with_capacity(less_chunked.nchunks());
+        for (index, chunk) in less_chunked.chunks().enumerate() {
+            let like = previous.as_ref().map(|(like, _)| like);
+            let (compressed_chunk, tree) = ctx
+                .named(&format!("chunk-{}", index))
+                .compress(&chunk, like)?
+                .into_parts();
+
+            let ratio = (compressed_chunk.nbytes() as f32) / (chunk.nbytes() as f32);
+            let exceeded_target_ratio = previous
+                .as_ref()
+                .map(|(_, target_ratio)| ratio > target_ratio * self.relatively_good_ratio())
+                .unwrap_or(false);
+
+            if ratio > 1.0 || exceeded_target_ratio {
+                warn!("unsatisfactory ratio {} {:?}", ratio, previous);
+                let (compressed_chunk, tree) = ctx.compress_array(&chunk)?.into_parts();
+                let new_ratio = (compressed_chunk.nbytes() as f32) / (chunk.nbytes() as f32);
+                previous = tree.map(|tree| (tree, new_ratio));
+                compressed_chunks.push(compressed_chunk);
+            } else {
+                previous = previous.or_else(|| tree.map(|tree| (tree, ratio)));
+                compressed_chunks.push(compressed_chunk);
+            }
+        }
+
+        let (child, ratio) = match previous {
+            Some((child, ratio)) => (Some(child), Some(ratio)),
+            None => (None, None),
+        };
+
+        Ok(CompressedArray::new(
+            ChunkedArray::try_new(compressed_chunks, array.dtype().clone())?.into_array(),
+            Some(CompressionTree::new_with_metadata(
+                self,
+                vec![child],
+                Arc::new(ChunkedCompressorMetadata(ratio)),
+            )),
+        ))
+    }
+}
diff --git a/vortex-sampling-compressor/src/compressors/fsst.rs b/vortex-sampling-compressor/src/compressors/fsst.rs
index 285332672c..69e27ac3e8 100644
--- a/vortex-sampling-compressor/src/compressors/fsst.rs
+++ b/vortex-sampling-compressor/src/compressors/fsst.rs
@@ -63,7 +63,7 @@ impl EncodingCompressor for FSSTCompressor {
         // between 2-3x depending on the text quality.
         //
         // It's not worth running a full compression step unless the array is large enough.
-        if array.nbytes() < 10 * FSST_SYMTAB_MAX_SIZE {
+        if array.nbytes() < 5 * FSST_SYMTAB_MAX_SIZE {
             return Ok(CompressedArray::uncompressed(array.clone()));
         }
 
diff --git a/vortex-sampling-compressor/src/compressors/mod.rs b/vortex-sampling-compressor/src/compressors/mod.rs
index 3b087df2dd..132f0cbbac 100644
--- a/vortex-sampling-compressor/src/compressors/mod.rs
+++ b/vortex-sampling-compressor/src/compressors/mod.rs
@@ -13,6 +13,7 @@ use crate::SamplingCompressor;
 pub mod alp;
 pub mod alp_rd;
 pub mod bitpacked;
+pub mod chunked;
 pub mod constant;
 pub mod date_time_parts;
 pub mod delta;
@@ -23,6 +24,7 @@ pub mod roaring_bool;
 pub mod roaring_int;
 pub mod runend;
 pub mod sparse;
+pub mod struct_;
 pub mod zigzag;
 
 pub trait EncodingCompressor: Sync + Send + Debug {
@@ -162,6 +164,17 @@ impl<'a> CompressionTree<'a> {
             .filter_map(|child| child.as_ref().map(|c| c.num_descendants() + 1))
             .sum::<usize>()
     }
+
+    #[allow(clippy::type_complexity)]
+    pub fn into_parts(
+        self,
+    ) -> (
+        &'a dyn EncodingCompressor,
+        Vec<Option<CompressionTree<'a>>>,
+        Option<Arc<dyn EncoderMetadata>>,
+    ) {
+        (self.compressor, self.children, self.metadata)
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -199,6 +212,7 @@ impl<'a> CompressedArray<'a> {
         self.path
     }
 
+    #[inline]
     pub fn into_parts(self) -> (Array, Option<CompressionTree<'a>>) {
         (self.array, self.path)
     }
diff --git a/vortex-sampling-compressor/src/compressors/struct_.rs b/vortex-sampling-compressor/src/compressors/struct_.rs
new file mode 100644
index 0000000000..9225f68540
--- /dev/null
+++ b/vortex-sampling-compressor/src/compressors/struct_.rs
@@ -0,0 +1,66 @@
+use std::collections::HashSet;
+
+use itertools::Itertools;
+use vortex::array::{Struct, StructArray};
+use vortex::encoding::EncodingRef;
+use vortex::variants::StructArrayTrait;
+use vortex::{Array, ArrayDef, IntoArray};
+use vortex_error::VortexResult;
+
+use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
+use crate::SamplingCompressor;
+
+#[derive(Debug)]
+pub struct StructCompressor;
+
+impl EncodingCompressor for StructCompressor {
+    fn id(&self) -> &str {
+        Struct::ID.as_ref()
+    }
+
+    fn cost(&self) -> u8 {
+        0
+    }
+
+    fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
+        StructArray::try_from(array)
+            .ok()
+            .map(|_| self as &dyn EncodingCompressor)
+    }
+
+    fn compress<'a>(
+        &'a self,
+        array: &Array,
+        like: Option<CompressionTree<'a>>,
+        ctx: SamplingCompressor<'a>,
+    ) -> VortexResult<CompressedArray<'a>> {
+        let array = StructArray::try_from(array)?;
+        let compressed_validity = ctx.compress_validity(array.validity())?;
+
+        let children_trees = match like {
+            Some(tree) => tree.children,
+            None => vec![None; array.nfields()],
+        };
+
+        let (arrays, trees) = array
+            .children()
+            .zip_eq(children_trees)
+            .map(|(array, like)| ctx.compress(&array, like.as_ref()))
+            .process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
+
+        Ok(CompressedArray::new(
+            StructArray::try_new(
+                array.names().clone(),
+                arrays,
+                array.len(),
+                compressed_validity,
+            )?
+            .into_array(),
+            Some(CompressionTree::new(self, trees)),
+        ))
+    }
+
+    fn used_encodings(&self) -> HashSet<EncodingRef> {
+        HashSet::from([])
+    }
+}
diff --git a/vortex-sampling-compressor/src/lib.rs b/vortex-sampling-compressor/src/lib.rs
index 1ad5083ef5..9714689e49 100644
--- a/vortex-sampling-compressor/src/lib.rs
+++ b/vortex-sampling-compressor/src/lib.rs
@@ -2,18 +2,19 @@ use std::collections::HashSet;
 use std::fmt::{Debug, Display, Formatter};
 
 use compressors::bitpacked::BITPACK_WITH_PATCHES;
+use compressors::chunked::DEFAULT_CHUNKED_COMPRESSOR;
 use compressors::fsst::FSSTCompressor;
+use compressors::struct_::StructCompressor;
 use lazy_static::lazy_static;
 use log::{debug, info, warn};
 use rand::rngs::StdRng;
 use rand::SeedableRng;
-use vortex::array::{Chunked, ChunkedArray, Constant, Struct, StructArray};
+use vortex::array::{ChunkedArray, Constant};
 use vortex::compress::{check_dtype_unchanged, check_validity_unchanged, CompressionStrategy};
 use vortex::compute::slice;
 use vortex::encoding::EncodingRef;
 use vortex::validity::Validity;
-use vortex::variants::StructArrayTrait;
-use vortex::{Array, ArrayDType, ArrayDef, IntoArray, IntoCanonical};
+use vortex::{Array, ArrayDType, ArrayDef, IntoCanonical};
 use vortex_error::{VortexExpect as _, VortexResult};
 
 use crate::compressors::alp::ALPCompressor;
@@ -241,144 +242,104 @@ impl<'a> SamplingCompressor<'a> {
         }
     }
 
-    fn compress_array(&self, arr: &Array) -> VortexResult<CompressedArray<'a>> {
-        match arr.encoding().id() {
-            Chunked::ID => {
-                let chunked = ChunkedArray::try_from(arr)?;
-                let less_chunked = chunked.rechunk(
-                    self.options().target_block_bytesize,
-                    self.options().target_block_size,
-                )?;
-                let compressed_chunks = less_chunked
-                    .chunks()
-                    .map(|chunk| {
-                        self.compress_array(&chunk)
-                            .map(compressors::CompressedArray::into_array)
-                    })
-                    .collect::<VortexResult<Vec<_>>>()?;
-                Ok(CompressedArray::uncompressed(
-                    ChunkedArray::try_new(compressed_chunks, chunked.dtype().clone())?.into_array(),
-                ))
-            }
-            Constant::ID => {
-                // Not much better we can do than constant!
-                Ok(CompressedArray::uncompressed(arr.clone()))
-            }
-            Struct::ID => {
-                // For struct arrays, we compress each field individually
-                let strct = StructArray::try_from(arr)?;
-                let compressed_fields = strct
-                    .children()
-                    .map(|field| {
-                        self.compress_array(&field)
-                            .map(compressors::CompressedArray::into_array)
-                    })
-                    .collect::<VortexResult<Vec<_>>>()?;
-                let validity = self.compress_validity(strct.validity())?;
-                Ok(CompressedArray::uncompressed(
-                    StructArray::try_new(
-                        strct.names().clone(),
-                        compressed_fields,
-                        strct.len(),
-                        validity,
-                    )?
-                    .into_array(),
-                ))
-            }
-            _ => {
-                // Otherwise, we run sampled compression over pluggable encodings
-                let mut rng = StdRng::seed_from_u64(self.options.rng_seed);
-                let sampled = sampled_compression(arr, self, &mut rng)?;
-                Ok(sampled.unwrap_or_else(|| CompressedArray::uncompressed(arr.clone())))
-            }
+    fn compress_array(&self, array: &Array) -> VortexResult<CompressedArray<'a>> {
+        let mut rng = StdRng::seed_from_u64(self.options.rng_seed);
+
+        if array.encoding().id() == Constant::ID {
+            // Not much better we can do than constant!
+            return Ok(CompressedArray::uncompressed(array.clone()));
         }
-    }
-}
 
-fn sampled_compression<'a>(
-    array: &Array,
-    compressor: &SamplingCompressor<'a>,
-    rng: &mut StdRng,
-) -> VortexResult<Option<CompressedArray<'a>>> {
-    // First, we try constant compression and shortcut any sampling.
-    if let Some(cc) = ConstantCompressor.can_compress(array) {
-        return cc.compress(array, None, compressor.clone()).map(Some);
-    }
+        if let Some(cc) = DEFAULT_CHUNKED_COMPRESSOR.can_compress(array) {
+            return cc.compress(array, None, self.clone());
+        }
 
-    let mut candidates: Vec<&dyn EncodingCompressor> = compressor
-        .compressors
-        .iter()
-        .filter(|&encoding| !compressor.disabled_compressors.contains(encoding))
-        .filter(|compression| {
-            if compression.can_compress(array).is_some() {
-                if compressor.depth + compression.cost() > compressor.options.max_cost {
-                    debug!(
-                        "{} skipping encoding {} due to depth",
-                        compressor,
-                        compression.id()
-                    );
-                    return false;
-                }
-                true
-            } else {
-                false
-            }
-        })
-        .copied()
-        .collect();
-    debug!("{} candidates for {}: {:?}", compressor, array, candidates);
+        if let Some(cc) = StructCompressor.can_compress(array) {
+            return cc.compress(array, None, self.clone());
+        }
 
-    if candidates.is_empty() {
-        debug!(
-            "{} no compressors for array with dtype: {} and encoding: {}",
-            compressor,
-            array.dtype(),
-            array.encoding().id(),
-        );
-        return Ok(None);
-    }
+        if let Some(cc) = ConstantCompressor.can_compress(array) {
+            return cc.compress(array, None, self.clone());
+        }
 
-    // We prefer all other candidates to the array's own encoding.
-    // This is because we assume that the array's own encoding is the least efficient, but useful
-    // to destructure an array in the final stages of compression. e.g. VarBin would be DictEncoded
-    // but then the dictionary itself remains a VarBin array. DictEncoding excludes itself from the
-    // dictionary, but we still have a large offsets array that should be compressed.
-    // TODO(ngates): we actually probably want some way to prefer dict encoding over other varbin
-    // encodings, e.g. FSST.
-    if candidates.len() > 1 {
-        candidates.retain(|&compression| compression.id() != array.encoding().id().as_ref());
-    }
+        let (mut candidates, too_deep) = self
+            .compressors
+            .iter()
+            .filter(|&encoding| !self.disabled_compressors.contains(encoding))
+            .filter(|&encoding| encoding.can_compress(array).is_some())
+            .partition::<Vec<&dyn EncodingCompressor>, _>(|&encoding| {
+                self.depth + encoding.cost() <= self.options.max_cost
+            });
+
+        if !too_deep.is_empty() {
+            debug!(
+                "{} skipping encodings due to depth/cost: {}",
+                self,
+                too_deep
+                    .iter()
+                    .map(|x| x.id())
+                    .collect::<Vec<_>>()
+                    .join(", ")
+            );
+        }
 
-    if array.len()
-        <= (compressor.options.sample_size as usize * compressor.options.sample_count as usize)
-    {
-        // We're either already within a sample, or we're operating over a sufficiently small array.
-        return find_best_compression(candidates, array, compressor).map(Some);
-    }
+        info!("{} candidates for {}: {:?}", self, array, candidates);
+
+        if candidates.is_empty() {
+            debug!(
+                "{} no compressors for array with dtype: {} and encoding: {}",
+                self,
+                array.dtype(),
+                array.encoding().id(),
+            );
+            return Ok(CompressedArray::uncompressed(array.clone()));
+        }
 
-    // Take a sample of the array, then ask codecs for their best compression estimate.
-    let sample = ChunkedArray::try_new(
-        stratified_slices(
-            array.len(),
-            compressor.options.sample_size,
-            compressor.options.sample_count,
-            rng,
-        )
-        .into_iter()
-        .map(|(start, stop)| slice(array, start, stop))
-        .collect::<VortexResult<Vec<_>>>()?,
-        array.dtype().clone(),
-    )?
-    .into_canonical()?
-    .into();
-
-    find_best_compression(candidates, &sample, compressor)?
-        .into_path()
-        .map(|best_compressor| {
-            info!("Compressing array {} with {}", array, best_compressor);
-            best_compressor.compress_unchecked(array, compressor)
-        })
-        .transpose()
+        // We prefer all other candidates to the array's own encoding.
+        // This is because we assume that the array's own encoding is the least efficient, but useful
+        // to destructure an array in the final stages of compression. e.g. VarBin would be DictEncoded
+        // but then the dictionary itself remains a VarBin array. DictEncoding excludes itself from the
+        // dictionary, but we still have a large offsets array that should be compressed.
+        // TODO(ngates): we actually probably want some way to prefer dict encoding over other varbin
+        // encodings, e.g. FSST.
+        if candidates.len() > 1 {
+            candidates.retain(|&compression| compression.id() != array.encoding().id().as_ref());
+        }
+
+        if array.len() <= (self.options.sample_size as usize * self.options.sample_count as usize) {
+            // We're either already within a sample, or we're operating over a sufficiently small array.
+            return find_best_compression(candidates, array, self);
+        }
+
+        // Take a sample of the array, then ask codecs for their best compression estimate.
+        let sample = ChunkedArray::try_new(
+            stratified_slices(
+                array.len(),
+                self.options.sample_size,
+                self.options.sample_count,
+                &mut rng,
+            )
+            .into_iter()
+            .map(|(start, stop)| slice(array, start, stop))
+            .collect::<VortexResult<Vec<_>>>()?,
+            array.dtype().clone(),
+        )?
+        .into_canonical()?
+        .into();
+
+        let best = find_best_compression(candidates, &sample, self)?
+            .into_path()
+            .map(|best_compressor| {
+                info!(
+                    "{} Compressing array {} with {}",
+                    self, array, best_compressor
+                );
+                best_compressor.compress_unchecked(array, self)
+            })
+            .transpose()?;
+
+        Ok(best.unwrap_or_else(|| CompressedArray::uncompressed(array.clone())))
+    }
 }
 
 fn find_best_compression<'a>(