Skip to content

Commit

Permalink
Faster RunEndBool decompression, plus metadata cleanup (#981)
Browse files Browse the repository at this point in the history
Decompression is now 15-35% faster

Specifically, it is ~15% faster for constant input (sel = 0 or sel = 1):
![Screenshot 2024-10-04 at 17 52
45](https://github.com/user-attachments/assets/6843318f-5a69-4aa5-ab5d-8ab4df424ed1)

and ~35% faster if each bool is produced uniformly at random (i.e., sel = 0.5,
which gives the maximum number of runs in expectation):
![Screenshot 2024-10-04 at 17 52
38](https://github.com/user-attachments/assets/f453c4be-07b4-4b67-aa1d-363e24165c15)
  • Loading branch information
lwwmanning authored Oct 5, 2024
1 parent f12c846 commit 40a58fa
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 39 deletions.
8 changes: 6 additions & 2 deletions encodings/runend-bool/benches/ree_bool_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use itertools::Itertools;
use rand::distributions::Open01;
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
use vortex_runend_bool::compress::runend_bool_encode_slice;
use vortex_runend_bool::compress::{runend_bool_decode_slice, runend_bool_encode_slice};

fn compress_compare(c: &mut Criterion) {
compress_compare_param(c, 0.);
Expand All @@ -33,10 +33,14 @@ fn compress_compare_param(c: &mut Criterion, sel_fac: f32) {

let mut group = c.benchmark_group(format!("sel: {sel_fac}"));

group.bench_function("ree bool", |b| {
group.bench_function("ree bool compress", |b| {
b.iter(|| black_box(runend_bool_encode_slice(&boolbuf)));
});

let (ends, start) = runend_bool_encode_slice(&boolbuf);
group.bench_function("ree bool decompress", |b| {
b.iter(|| black_box(runend_bool_decode_slice(&ends, start, 0, ends.len())));
});
group.finish()
}

Expand Down
61 changes: 35 additions & 26 deletions encodings/runend-bool/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::{
impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArrayVariant, IntoCanonical,
};
use vortex_dtype::{DType, Nullability};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};

use crate::compress::runend_bool_decode;
Expand All @@ -22,10 +22,9 @@ impl_encoding!("vortex.runendbool", ids::RUN_END_BOOL, RunEndBool);
pub struct RunEndBoolMetadata {
start: bool,
validity: ValidityMetadata,
ends_dtype: DType,
ends_ptype: PType,
num_runs: usize,
offset: usize,
length: usize,
}

impl Display for RunEndBoolMetadata {
Expand All @@ -50,13 +49,25 @@ impl RunEndBoolArray {
if !ends.statistics().compute_is_strict_sorted().unwrap_or(true) {
vortex_bail!("Ends array must be strictly sorted",);
}
if !ends.dtype().is_unsigned_int() || ends.dtype().is_nullable() {
vortex_bail!(
"Ends array must be an unsigned integer type, got {}",
ends.dtype()
);
}
if ends.is_empty() {
vortex_bail!("Ends array must have at least one element");
}

let dtype = DType::Bool(validity.nullability());

let ends_ptype = ends.dtype().try_into()?;
let metadata = RunEndBoolMetadata {
start,
validity: validity.to_metadata(length)?,
ends_dtype: ends.dtype().clone(),
ends_ptype,
num_runs: ends.len(),
offset,
length,
};

let mut children = Vec::with_capacity(2);
Expand All @@ -65,44 +76,42 @@ impl RunEndBoolArray {
children.push(a)
}

Self::try_from_parts(
DType::Bool(Nullability::NonNullable),
length,
metadata,
children.into(),
StatsSet::new(),
)
Self::try_from_parts(dtype, length, metadata, children.into(), StatsSet::new())
}

pub fn find_physical_index(&self, index: usize) -> VortexResult<usize> {
pub(crate) fn find_physical_index(&self, index: usize) -> VortexResult<usize> {
search_sorted(&self.ends(), index + self.offset(), SearchSortedSide::Right)
.map(|s| s.to_ends_index(self.ends().len()))
}

pub fn validity(&self) -> Validity {
self.metadata().validity.to_validity(|| {
self.as_ref()
.child(2, &Validity::DTYPE, self.len())
.vortex_expect("RunEndBoolArray: validity child")
})
}

#[inline]
pub fn offset(&self) -> usize {
pub(crate) fn offset(&self) -> usize {
self.metadata().offset
}

#[inline]
pub fn start(&self) -> bool {
pub(crate) fn start(&self) -> bool {
self.metadata().start
}

#[inline]
pub fn ends(&self) -> Array {
pub(crate) fn ends(&self) -> Array {
self.as_ref()
.child(0, &self.metadata().ends_dtype, self.metadata().num_runs)
.child(
0,
&self.metadata().ends_ptype.into(),
self.metadata().num_runs,
)
.vortex_expect("RunEndBoolArray is missing its run ends")
}

pub fn validity(&self) -> Validity {
self.metadata().validity.to_validity(|| {
self.as_ref()
.child(1, &Validity::DTYPE, self.len())
.vortex_expect("RunEndBoolArray: validity child")
})
}
}

impl BoolArrayTrait for RunEndBoolArray {
Expand Down Expand Up @@ -186,7 +195,7 @@ mod test {
let arr = slice(
// [t, t, f, f, f, t, f, t, t, t]
RunEndBoolArray::try_new(
vec![2i32, 5, 6, 7, 10].into_array(),
vec![2u32, 5, 6, 7, 10].into_array(),
true,
Validity::NonNullable,
)
Expand Down
22 changes: 11 additions & 11 deletions encodings/runend-bool/src/compress.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::cmp::min;

use arrow_buffer::buffer::BooleanBuffer;
use arrow_buffer::BooleanBufferBuilder;
use num_traits::{AsPrimitive, FromPrimitive};
use vortex::array::{BoolArray, PrimitiveArray};
use vortex::validity::Validity;
Expand All @@ -14,11 +15,11 @@ pub fn runend_bool_encode(elements: &BoolArray) -> (PrimitiveArray, bool) {

pub fn runend_bool_encode_slice(elements: &BooleanBuffer) -> (Vec<u64>, bool) {
let mut iter = elements.set_slices();
let mut ends = Vec::new();
let Some((start, end)) = iter.next() else {
return (vec![elements.len() as u64], false);
};

let mut ends = Vec::new();
let first_bool = start == 0;
if !first_bool {
ends.push(start as u64)
Expand Down Expand Up @@ -48,7 +49,7 @@ pub fn runend_bool_decode(
) -> VortexResult<BoolArray> {
match_each_integer_ptype!(run_ends.ptype(), |$E| {
let bools = runend_bool_decode_slice::<$E>(run_ends.maybe_null_slice(), start, offset, length);
BoolArray::try_new(BooleanBuffer::from(bools), validity)
BoolArray::try_new(bools, validity)
})
}

Expand All @@ -57,7 +58,7 @@ pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimit
start: bool,
offset: usize,
length: usize,
) -> Vec<bool> {
) -> BooleanBuffer {
let offset_e = E::from_usize(offset).unwrap_or_else(|| {
vortex_panic!(
"offset {} cannot be converted to {}",
Expand All @@ -77,12 +78,11 @@ pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimit
.map(|v| *v - offset_e)
.map(|v| min(v, length_e));

let mut decoded = Vec::with_capacity(length);
let mut decoded = BooleanBufferBuilder::new(length);
for (idx, end) in trimmed_ends.enumerate() {
decoded
.extend(std::iter::repeat(value_at_index(idx, start)).take(end.as_() - decoded.len()));
decoded.append_n(end.as_() - decoded.len(), value_at_index(idx, start));
}
decoded
BooleanBuffer::from(decoded)
}

pub fn value_at_index(idx: usize, start: bool) -> bool {
Expand Down Expand Up @@ -136,7 +136,7 @@ mod test {
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));

let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
assert_eq!(decoded, input)
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
}

#[test]
Expand All @@ -145,7 +145,7 @@ mod test {
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));

let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
assert_eq!(decoded, input)
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
}

#[test]
Expand All @@ -155,7 +155,7 @@ mod test {
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));

let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
assert_eq!(decoded, input)
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
}

#[test]
Expand All @@ -165,7 +165,7 @@ mod test {
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));

let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
assert_eq!(decoded, input)
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
}

#[test]
Expand Down

0 comments on commit 40a58fa

Please sign in to comment.