Skip to content

Commit

Permalink
feat: FSSTArray::into_canonical directly build VarBinView
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Oct 29, 2024
1 parent d884d9f commit 0e3d425
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 26 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions bench-vortex/src/bin/notimplemented.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ fn fsst_array() -> Array {

fn varbin_array() -> Array {
let mut input_array = VarBinBuilder::<i32>::with_capacity(3);
input_array.push_value(b"The Greeks never said that the limit could not he overstepped");
input_array.push_value(b"The Greeks never said that the limit could not be overstepped");
input_array.push_value(
b"They said it existed and that whoever dared to exceed it was mercilessly struck down",
);
Expand All @@ -57,7 +57,7 @@ fn varbin_array() -> Array {

fn varbinview_array() -> Array {
VarBinViewArray::from_iter_str(vec![
"The Greeks never said that the limit could not he overstepped",
"The Greeks never said that the limit could not be overstepped",
"They said it existed and that whoever dared to exceed it was mercilessly struck down",
"Nothing in present history can contradict them",
])
Expand Down
1 change: 1 addition & 0 deletions encodings/fsst/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ workspace = true

[dependencies]
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
fsst-rs = { workspace = true }
serde = { workspace = true }

Expand Down
53 changes: 33 additions & 20 deletions encodings/fsst/src/canonical.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use vortex::array::{PrimitiveArray, VarBinArray};
use vortex::{ArrayDType, Canonical, IntoArray, IntoArrayVariant, IntoCanonical};
use arrow_array::builder::make_view;
use arrow_buffer::Buffer;
use vortex::array::{PrimitiveArray, VarBinArray, VarBinViewArray};
use vortex::{Array, ArrayDType, Canonical, IntoArray, IntoCanonical};
use vortex_error::VortexResult;

use crate::FSSTArray;
Expand All @@ -25,34 +27,45 @@ impl IntoCanonical for FSSTArray {
let uncompressed_bytes =
decompressor.decompress(compressed_bytes.maybe_null_slice::<u8>());

// Convert the uncompressed_lengths into offsets for building a new VarBinArray.
let mut offsets: Vec<i32> = Vec::with_capacity(self.len() + 1);
let mut offset = 0;
offsets.push(offset);

let uncompressed_lens_array = self
.uncompressed_lengths()
.into_canonical()?
.into_primitive()?;
let uncompressed_lens_slice = uncompressed_lens_array.maybe_null_slice::<i32>();

for len in uncompressed_lens_slice.iter() {
offset += len;
offsets.push(offset);
// Directly create the binary views.
let views: Vec<u128> = uncompressed_lens_slice
.iter()
.scan(0, |offset, len| {
let str_start = *offset;
let str_end = *offset + len;

*offset += len;

Some(make_view(
&uncompressed_bytes[(str_start as usize)..(str_end as usize)],
0u32,
str_start as u32,
))
})
.collect();

for view in &views {
println!("len: {}", *view as u32);
}

let offsets_array = PrimitiveArray::from(offsets).into_array();
// FIXME: there are 3 views (see the debug output above), so the cause of the unexpected behavior is still unclear.

let views_array: Array = Buffer::from(views).into();
let uncompressed_bytes_array = PrimitiveArray::from(uncompressed_bytes).into_array();

Ok(Canonical::VarBinView(
VarBinArray::try_new(
offsets_array,
uncompressed_bytes_array,
self.dtype().clone(),
self.validity(),
)?
.into_varbinview()?,
))
VarBinViewArray::try_new(
views_array,
vec![uncompressed_bytes_array],
self.dtype().clone(),
self.validity(),
)
.map(Canonical::VarBinView)
})
}
}
6 changes: 3 additions & 3 deletions encodings/fsst/tests/fsst_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ macro_rules! assert_nth_scalar {
// this function is VERY slow on miri, so we only want to run it once
fn build_fsst_array() -> Array {
let mut input_array = VarBinBuilder::<i32>::with_capacity(3);
input_array.push_value(b"The Greeks never said that the limit could not he overstepped");
input_array.push_value(b"The Greeks never said that the limit could not be overstepped");
input_array.push_value(
b"They said it existed and that whoever dared to exceed it was mercilessly struck down",
);
Expand All @@ -41,7 +41,7 @@ fn test_fsst_array_ops() {
assert_nth_scalar!(
fsst_array,
0,
"The Greeks never said that the limit could not he overstepped"
"The Greeks never said that the limit could not be overstepped"
);
assert_nth_scalar!(
fsst_array,
Expand Down Expand Up @@ -76,7 +76,7 @@ fn test_fsst_array_ops() {
assert_nth_scalar!(
fsst_taken,
0,
"The Greeks never said that the limit could not he overstepped"
"The Greeks never said that the limit could not be overstepped"
);
assert_nth_scalar!(
fsst_taken,
Expand Down
11 changes: 10 additions & 1 deletion vortex-buffer/src/string.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use std::fmt::{Debug, Formatter};
use std::ops::Deref;
use std::str::Utf8Error;

use crate::Buffer;

/// A wrapper around a [`Buffer`] that guarantees that the buffer contains valid UTF-8.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd)]
#[derive(Clone, PartialEq, Eq, PartialOrd)]
pub struct BufferString(Buffer);

impl BufferString {
Expand All @@ -23,6 +24,14 @@ impl BufferString {
}
}

/// Debug formatting for [`BufferString`].
///
/// Renders as a struct named `BufferString` with a single `string` field
/// holding the value returned by `as_str()`.
impl Debug for BufferString {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let contents = self.as_str();
        let mut repr = f.debug_struct("BufferString");
        repr.field("string", &contents);
        repr.finish()
    }
}

impl From<BufferString> for Buffer {
fn from(value: BufferString) -> Self {
value.0
Expand Down

0 comments on commit 0e3d425

Please sign in to comment.