diff --git a/Cargo.lock b/Cargo.lock index 12521e9c75..28842da607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4529,6 +4529,7 @@ name = "vortex-fsst" version = "0.12.0" dependencies = [ "arrow-array", + "arrow-buffer", "fsst-rs", "serde", "vortex-array", diff --git a/bench-vortex/src/bin/notimplemented.rs b/bench-vortex/src/bin/notimplemented.rs index 1c6686f3b0..b77d2427ce 100644 --- a/bench-vortex/src/bin/notimplemented.rs +++ b/bench-vortex/src/bin/notimplemented.rs @@ -45,7 +45,7 @@ fn fsst_array() -> Array { fn varbin_array() -> Array { let mut input_array = VarBinBuilder::::with_capacity(3); - input_array.push_value(b"The Greeks never said that the limit could not he overstepped"); + input_array.push_value(b"The Greeks never said that the limit could not be overstepped"); input_array.push_value( b"They said it existed and that whoever dared to exceed it was mercilessly struck down", ); @@ -57,7 +57,7 @@ fn varbin_array() -> Array { fn varbinview_array() -> Array { VarBinViewArray::from_iter_str(vec![ - "The Greeks never said that the limit could not he overstepped", + "The Greeks never said that the limit could not be overstepped", "They said it existed and that whoever dared to exceed it was mercilessly struck down", "Nothing in present history can contradict them", ]) diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 173017771c..42d0eab04e 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [dependencies] arrow-array = { workspace = true } +arrow-buffer = { workspace = true } fsst-rs = { workspace = true } serde = { workspace = true } diff --git a/encodings/fsst/src/canonical.rs b/encodings/fsst/src/canonical.rs index 6dd9138e6a..879a689358 100644 --- a/encodings/fsst/src/canonical.rs +++ b/encodings/fsst/src/canonical.rs @@ -1,5 +1,7 @@ -use vortex::array::{PrimitiveArray, VarBinArray}; -use vortex::{ArrayDType, Canonical, IntoArray, IntoArrayVariant, IntoCanonical}; +use arrow_array::builder::make_view; +use arrow_buffer::Buffer; +use vortex::array::{PrimitiveArray, VarBinArray, VarBinViewArray}; +use vortex::{Array, ArrayDType, Canonical, IntoArray, IntoCanonical}; use vortex_error::VortexResult; use crate::FSSTArray; @@ -25,34 +27,39 @@ impl IntoCanonical for FSSTArray { let uncompressed_bytes = decompressor.decompress(compressed_bytes.maybe_null_slice::()); - // Convert the uncompressed_lengths into offsets for building a new VarBinArray. - let mut offsets: Vec = Vec::with_capacity(self.len() + 1); - let mut offset = 0; - offsets.push(offset); - let uncompressed_lens_array = self .uncompressed_lengths() .into_canonical()? .into_primitive()?; let uncompressed_lens_slice = uncompressed_lens_array.maybe_null_slice::(); - for len in uncompressed_lens_slice.iter() { - offset += len; - offsets.push(offset); - } + // Directly create the binary views. + let views: Vec = uncompressed_lens_slice + .iter() + .scan(0, |offset, len| { + let str_start = *offset; + let str_end = *offset + len; + + *offset += len; + + Some(make_view( + &uncompressed_bytes[(str_start as usize)..(str_end as usize)], + 0u32, + str_start as u32, + )) + }) + .collect(); - let offsets_array = PrimitiveArray::from(offsets).into_array(); + let views_array: Array = Buffer::from(views).into(); let uncompressed_bytes_array = PrimitiveArray::from(uncompressed_bytes).into_array(); - Ok(Canonical::VarBinView( - VarBinArray::try_new( - offsets_array, - uncompressed_bytes_array, - self.dtype().clone(), - self.validity(), - )? - .into_varbinview()?, - )) + VarBinViewArray::try_new( + views_array, + vec![uncompressed_bytes_array], + self.dtype().clone(), + self.validity(), + ) + .map(Canonical::VarBinView) }) } } diff --git a/encodings/fsst/tests/fsst_tests.rs b/encodings/fsst/tests/fsst_tests.rs index 1ca6cdf141..73f758d977 100644 --- a/encodings/fsst/tests/fsst_tests.rs +++ b/encodings/fsst/tests/fsst_tests.rs @@ -18,7 +18,7 @@ macro_rules! assert_nth_scalar { // this function is VERY slow on miri, so we only want to run it once fn build_fsst_array() -> Array { let mut input_array = VarBinBuilder::::with_capacity(3); - input_array.push_value(b"The Greeks never said that the limit could not he overstepped"); + input_array.push_value(b"The Greeks never said that the limit could not be overstepped"); input_array.push_value( b"They said it existed and that whoever dared to exceed it was mercilessly struck down", ); @@ -41,7 +41,7 @@ fn test_fsst_array_ops() { assert_nth_scalar!( fsst_array, 0, - "The Greeks never said that the limit could not he overstepped" + "The Greeks never said that the limit could not be overstepped" ); assert_nth_scalar!( fsst_array, @@ -76,7 +76,7 @@ fn test_fsst_array_ops() { assert_nth_scalar!( fsst_taken, 0, - "The Greeks never said that the limit could not he overstepped" + "The Greeks never said that the limit could not be overstepped" ); assert_nth_scalar!( fsst_taken, diff --git a/vortex-buffer/src/string.rs b/vortex-buffer/src/string.rs index 4abbac9852..4e6ae2210b 100644 --- a/vortex-buffer/src/string.rs +++ b/vortex-buffer/src/string.rs @@ -1,10 +1,11 @@ +use std::fmt::{Debug, Formatter}; use std::ops::Deref; use std::str::Utf8Error; use crate::Buffer; /// A wrapper around a [`Buffer`] that guarantees that the buffer contains valid UTF-8. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd)] +#[derive(Clone, PartialEq, Eq, PartialOrd)] pub struct BufferString(Buffer); impl BufferString { @@ -23,6 +24,14 @@ impl BufferString { } } +impl Debug for BufferString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BufferString") + .field("string", &self.as_str()) + .finish() + } +} + impl From for Buffer { fn from(value: BufferString) -> Self { value.0