From 79197e9a9e66d5b61ee22a0183c8a5f04e407189 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Sep 2024 17:15:48 -0400 Subject: [PATCH] be slightly safer --- .../src/compressors/fsst.rs | 23 ++++--- .../src/compressors/mod.rs | 67 +++++++------------ 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/vortex-sampling-compressor/src/compressors/fsst.rs b/vortex-sampling-compressor/src/compressors/fsst.rs index 624fb23648..7a4521cf11 100644 --- a/vortex-sampling-compressor/src/compressors/fsst.rs +++ b/vortex-sampling-compressor/src/compressors/fsst.rs @@ -1,5 +1,7 @@ +use std::any::Any; use std::collections::HashSet; use std::fmt::Debug; +use std::sync::Arc; use fsst::Compressor; use vortex::array::{VarBin, VarBinView}; @@ -9,7 +11,7 @@ use vortex_dtype::DType; use vortex_error::{vortex_bail, VortexResult}; use vortex_fsst::{fsst_compress, fsst_train_compressor, FSSTEncoding, FSST}; -use super::{CompressedArray, CompressionTree, EncodingCompressor}; +use super::{CompressedArray, CompressionTree, EncoderMetadata, EncodingCompressor}; use crate::SamplingCompressor; #[derive(Debug)] @@ -18,10 +20,11 @@ pub struct FSSTCompressor; /// Maximum size in bytes of the FSST symbol table const FSST_SYMTAB_MAX_SIZE: usize = 8 * 255 + 255; -/// We use a 16KB sample of text from the input. -/// -/// This value is derived from the FSST paper section 4.4 -// const DEFAULT_SAMPLE_BYTES: usize = 1 << 14; +impl EncoderMetadata for Compressor { + fn as_any(&self) -> &dyn Any { + self + } +} impl EncodingCompressor for FSSTCompressor { fn id(&self) -> &str { @@ -60,13 +63,17 @@ impl EncodingCompressor for FSSTCompressor { } let compressor = like - .and_then(|mut c| unsafe { c.metadata::() }) - .unwrap_or_else(|| Box::new(fsst_train_compressor(array))); + .and_then(|mut tree| tree.metadata()) + .unwrap_or_else(|| Arc::new(fsst_train_compressor(array))); + + let Some(fsst_compressor) = compressor.as_any().downcast_ref::() else { + vortex_bail!("Could not downcast metadata as FSST Compressor") + }; let result_array = if array.encoding().id() == VarBin::ID || array.encoding().id() == VarBinView::ID { // For a VarBinArray or VarBinViewArray, compress directly. - fsst_compress(array, compressor.as_ref()).into_array() + fsst_compress(array, fsst_compressor).into_array() } else { vortex_bail!( InvalidArgument: "unsupported encoding for FSSTCompressor {:?}", diff --git a/vortex-sampling-compressor/src/compressors/mod.rs b/vortex-sampling-compressor/src/compressors/mod.rs index 556c08c6e4..39ee274905 100644 --- a/vortex-sampling-compressor/src/compressors/mod.rs +++ b/vortex-sampling-compressor/src/compressors/mod.rs @@ -1,6 +1,8 @@ +use std::any::Any; use std::collections::HashSet; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; +use std::sync::Arc; use vortex::encoding::EncodingRef; use vortex::Array; @@ -55,11 +57,25 @@ impl Hash for dyn EncodingCompressor + '_ { } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct CompressionTree<'a> { compressor: &'a dyn EncodingCompressor, children: Vec>>, - metadata: Option<*const ()>, + metadata: Option>, +} + +impl Debug for CompressionTree<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{self}") + } +} + +/// Metadata that can optionally be attached to a compression tree. +/// +/// This enables codecs to cache trained parameters from the sampling runs to reuse for +/// the large run. +pub trait EncoderMetadata { + fn as_any(&self) -> &dyn Any; } impl Display for CompressionTree<'_> { @@ -88,17 +104,15 @@ impl<'a> CompressionTree<'a> { /// /// This can be specific encoder parameters that were discovered at sample time /// that should be reused when compressing the full array. - pub(crate) fn new_with_metadata( + pub(crate) fn new_with_metadata( compressor: &'a dyn EncodingCompressor, children: Vec>>, - metadata: Box, + metadata: Arc, ) -> Self { - // SAFETY: the memory pointed to will get cleaned up in Drop impl. - let ptr = Box::into_raw(metadata) as *const (); Self { compressor, children, - metadata: Some(ptr), + metadata: Some(metadata), } } @@ -129,45 +143,14 @@ impl<'a> CompressionTree<'a> { .map(|c| c.compress(array, Some(self.clone()), ctx.for_compressor(c))) } - // /// Access the saved opaque metadata by reference. - // /// - // /// # Safety - // /// - // /// It is up to the caller to ensure that the type `T` is the correct type for the stored - // /// metadata. - // /// - // /// The value of `T` will almost always be `EncodingCompressor`-specific. - // pub(crate) unsafe fn metadata_ref(&self) -> Option<&T> { - // unsafe { self.metadata.map(|m| &*(m as *const T)) } - // } - /// Access the saved opaque metadata. /// - /// This will consume the struct's metadata pointer, giving the caller ownership of - /// the memory by returning a `Box`. - /// - /// # Safety - /// - /// It is up to the caller to ensure that the type `T` is the correct type for the stored - /// metadata. + /// This will consume the owned metadata, giving the caller ownership of + /// the Box. /// /// The value of `T` will almost always be `EncodingCompressor`-specific. - pub unsafe fn metadata(&mut self) -> Option> { - let metadata = std::mem::take(&mut self.metadata); - - metadata.map(|m| { - let ptr = m as *mut T; - unsafe { Box::from_raw(ptr) } - }) - } -} - -impl Drop for CompressionTree<'_> { - fn drop(&mut self) { - if let Some(ptr) = self.metadata { - // Recnostruct the box from the pointer to do a manual drop. - let _ = unsafe { Box::from_raw(ptr as *mut ()) }; - } + pub fn metadata(&mut self) -> Option> { + std::mem::take(&mut self.metadata) } }