From 0e3d4256180d9b94a986d4973c516ca96234f470 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 29 Oct 2024 14:14:56 -0400 Subject: [PATCH] feat: FSSTArray::into_canonical directly build VarBinView --- Cargo.lock | 1 + bench-vortex/src/bin/notimplemented.rs | 4 +- encodings/fsst/Cargo.toml | 1 + encodings/fsst/src/canonical.rs | 53 ++++++++++++++++---------- encodings/fsst/tests/fsst_tests.rs | 6 +-- vortex-buffer/src/string.rs | 11 +++++- 6 files changed, 50 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8032c2e42d..40345dd2a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4528,6 +4528,7 @@ name = "vortex-fsst" version = "0.12.0" dependencies = [ "arrow-array", + "arrow-buffer", "fsst-rs", "serde", "vortex-array", diff --git a/bench-vortex/src/bin/notimplemented.rs b/bench-vortex/src/bin/notimplemented.rs index 1c6686f3b0..b77d2427ce 100644 --- a/bench-vortex/src/bin/notimplemented.rs +++ b/bench-vortex/src/bin/notimplemented.rs @@ -45,7 +45,7 @@ fn fsst_array() -> Array { fn varbin_array() -> Array { let mut input_array = VarBinBuilder::::with_capacity(3); - input_array.push_value(b"The Greeks never said that the limit could not he overstepped"); + input_array.push_value(b"The Greeks never said that the limit could not be overstepped"); input_array.push_value( b"They said it existed and that whoever dared to exceed it was mercilessly struck down", ); @@ -57,7 +57,7 @@ fn varbin_array() -> Array { fn varbinview_array() -> Array { VarBinViewArray::from_iter_str(vec![ - "The Greeks never said that the limit could not he overstepped", + "The Greeks never said that the limit could not be overstepped", "They said it existed and that whoever dared to exceed it was mercilessly struck down", "Nothing in present history can contradict them", ]) diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 173017771c..42d0eab04e 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [dependencies] arrow-array = { workspace = true } +arrow-buffer = { workspace = true } fsst-rs = { workspace = true } serde = { workspace = true } diff --git a/encodings/fsst/src/canonical.rs b/encodings/fsst/src/canonical.rs index 6dd9138e6a..a2c9059b5c 100644 --- a/encodings/fsst/src/canonical.rs +++ b/encodings/fsst/src/canonical.rs @@ -1,5 +1,7 @@ -use vortex::array::{PrimitiveArray, VarBinArray}; -use vortex::{ArrayDType, Canonical, IntoArray, IntoArrayVariant, IntoCanonical}; +use arrow_array::builder::make_view; +use arrow_buffer::Buffer; +use vortex::array::{PrimitiveArray, VarBinArray, VarBinViewArray}; +use vortex::{Array, ArrayDType, Canonical, IntoArray, IntoCanonical}; use vortex_error::VortexResult; use crate::FSSTArray; @@ -25,34 +27,45 @@ impl IntoCanonical for FSSTArray { let uncompressed_bytes = decompressor.decompress(compressed_bytes.maybe_null_slice::()); - // Convert the uncompressed_lengths into offsets for building a new VarBinArray. - let mut offsets: Vec = Vec::with_capacity(self.len() + 1); - let mut offset = 0; - offsets.push(offset); - let uncompressed_lens_array = self .uncompressed_lengths() .into_canonical()? .into_primitive()?; let uncompressed_lens_slice = uncompressed_lens_array.maybe_null_slice::(); - for len in uncompressed_lens_slice.iter() { - offset += len; - offsets.push(offset); + // Directly create the binary views. + let views: Vec = uncompressed_lens_slice + .iter() + .scan(0, |offset, len| { + let str_start = *offset; + let str_end = *offset + len; + + *offset += len; + + Some(make_view( + &uncompressed_bytes[(str_start as usize)..(str_end as usize)], + 0u32, + str_start as u32, + )) + }) + .collect(); + + for view in &views { + println!("len: {}", *view as u32); } - let offsets_array = PrimitiveArray::from(offsets).into_array(); + // there are 3 views...so wtf?? + + let views_array: Array = Buffer::from(views).into(); let uncompressed_bytes_array = PrimitiveArray::from(uncompressed_bytes).into_array(); - Ok(Canonical::VarBinView( - VarBinArray::try_new( - offsets_array, - uncompressed_bytes_array, - self.dtype().clone(), - self.validity(), - )? - .into_varbinview()?, - )) + VarBinViewArray::try_new( + views_array, + vec![uncompressed_bytes_array], + self.dtype().clone(), + self.validity(), + ) + .map(Canonical::VarBinView) }) } } diff --git a/encodings/fsst/tests/fsst_tests.rs b/encodings/fsst/tests/fsst_tests.rs index 1ca6cdf141..73f758d977 100644 --- a/encodings/fsst/tests/fsst_tests.rs +++ b/encodings/fsst/tests/fsst_tests.rs @@ -18,7 +18,7 @@ macro_rules! assert_nth_scalar { // this function is VERY slow on miri, so we only want to run it once fn build_fsst_array() -> Array { let mut input_array = VarBinBuilder::::with_capacity(3); - input_array.push_value(b"The Greeks never said that the limit could not he overstepped"); + input_array.push_value(b"The Greeks never said that the limit could not be overstepped"); input_array.push_value( b"They said it existed and that whoever dared to exceed it was mercilessly struck down", ); @@ -41,7 +41,7 @@ fn test_fsst_array_ops() { assert_nth_scalar!( fsst_array, 0, - "The Greeks never said that the limit could not he overstepped" + "The Greeks never said that the limit could not be overstepped" ); assert_nth_scalar!( fsst_array, @@ -76,7 +76,7 @@ fn test_fsst_array_ops() { assert_nth_scalar!( fsst_taken, 0, - "The Greeks never said that the limit could not he overstepped" + "The Greeks never said that the limit could not be overstepped" ); assert_nth_scalar!( fsst_taken, diff --git a/vortex-buffer/src/string.rs b/vortex-buffer/src/string.rs index 4abbac9852..4e6ae2210b 100644 --- a/vortex-buffer/src/string.rs +++ b/vortex-buffer/src/string.rs @@ -1,10 +1,11 @@ +use std::fmt::{Debug, Formatter}; use std::ops::Deref; use std::str::Utf8Error; use crate::Buffer; /// A wrapper around a [`Buffer`] that guarantees that the buffer contains valid UTF-8. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd)] +#[derive(Clone, PartialEq, Eq, PartialOrd)] pub struct BufferString(Buffer); impl BufferString { @@ -23,6 +24,14 @@ impl BufferString { } } +impl Debug for BufferString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BufferString") + .field("string", &self.as_str()) + .finish() + } +} + impl From for Buffer { fn from(value: BufferString) -> Self { value.0