From edb014b90cca059ced5f2576fd46c4c31f546270 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 10 Sep 2024 10:43:32 +0100 Subject: [PATCH] Fuzzer performs multiple operations on the underlying array instead of just one (#766) Includes fixes for - ZigZagArray scalar_at returns wrong nullability - RunEndArray slice wrongly slicing validity - Scalar comparison is nullability invariant --- Cargo.lock | 1 + encodings/runend/src/compute.rs | 137 +++++++++++++++- encodings/runend/src/runend.rs | 105 +------------ encodings/zigzag/src/compress.rs | 36 ++--- encodings/zigzag/src/compute.rs | 105 ++++++++----- encodings/zigzag/src/zigzag.rs | 14 +- fuzz/Cargo.toml | 1 + fuzz/fuzz_targets/fuzz_target_1.rs | 128 ++++++--------- fuzz/src/lib.rs | 148 +++++++++++++----- fuzz/src/search_sorted.rs | 98 ++++++++++++ fuzz/src/slice.rs | 57 +++++++ fuzz/src/sort.rs | 99 ++++++++++++ fuzz/src/take.rs | 60 +++++++ vortex-array/src/array/constant/canonical.rs | 22 ++- .../primitive/compute/subtract_scalar.rs | 7 +- vortex-array/src/compute/search_sorted.rs | 11 +- vortex-sampling-compressor/src/arbitrary.rs | 4 + .../src/compressors/zigzag.rs | 4 +- vortex-scalar/src/lib.rs | 2 +- 19 files changed, 737 insertions(+), 302 deletions(-) create mode 100644 fuzz/src/search_sorted.rs create mode 100644 fuzz/src/slice.rs create mode 100644 fuzz/src/sort.rs create mode 100644 fuzz/src/take.rs diff --git a/Cargo.lock b/Cargo.lock index 6bc10e4ae5..29d2e4b604 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4509,6 +4509,7 @@ version = "0.0.0" dependencies = [ "libfuzzer-sys", "vortex-array", + "vortex-buffer", "vortex-dtype", "vortex-error", "vortex-sampling-compressor", diff --git a/encodings/runend/src/compute.rs b/encodings/runend/src/compute.rs index f8671710e1..826175fada 100644 --- a/encodings/runend/src/compute.rs +++ b/encodings/runend/src/compute.rs @@ -96,7 +96,7 @@ impl SliceFn for RunEndArray { Ok(Self::with_offset_and_size( slice(&self.ends(), slice_begin, slice_end + 1)?, slice(&self.values(), slice_begin, slice_end + 1)?, - self.validity().slice(slice_begin, slice_end + 1)?, + self.validity().slice(start, stop)?, stop - start, start, )? @@ -106,11 +106,12 @@ impl SliceFn for RunEndArray { #[cfg(test)] mod test { - use vortex::array::PrimitiveArray; - use vortex::compute::take; + use vortex::array::{BoolArray, PrimitiveArray}; use vortex::compute::unary::{scalar_at, try_cast}; - use vortex::validity::Validity; - use vortex::{ArrayDType, IntoArrayVariant, ToArray}; + use vortex::compute::{slice, take}; + use vortex::validity::{ArrayValidity, Validity}; + use vortex::{ArrayDType, IntoArray, IntoArrayVariant, ToArray}; + use vortex_dtype::{DType, Nullability, PType}; use vortex_scalar::Scalar; use crate::RunEndArray; @@ -168,4 +169,130 @@ mod test { let scalar = scalar_at(null_ree.array(), 11).unwrap(); assert_eq!(scalar, Scalar::null(null_ree.dtype().clone())); } + + #[test] + fn slice_with_nulls() { + let array = RunEndArray::try_new( + PrimitiveArray::from(vec![3, 6, 8, 12]).into_array(), + PrimitiveArray::from_vec(vec![1, 4, 2, 5], Validity::AllValid).into_array(), + Validity::from(vec![ + false, false, false, false, true, true, false, false, false, false, true, true, + ]), + ) + .unwrap(); + let sliced = slice(array.array(), 4, 10).unwrap(); + let sliced_primitive = sliced.into_primitive().unwrap(); + assert_eq!( + sliced_primitive.maybe_null_slice::(), + vec![4, 4, 2, 2, 5, 5] + ); + assert_eq!( + sliced_primitive + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter() + .collect::>(), + vec![true, true, false, false, false, false] + ) + } + + #[test] + fn slice_array() { + let arr = slice( + RunEndArray::try_new( + vec![2u32, 5, 10].into_array(), + vec![1i32, 2, 3].into_array(), + Validity::NonNullable, + ) + .unwrap() + .array(), + 3, + 8, + ) + .unwrap(); + assert_eq!( + arr.dtype(), + &DType::Primitive(PType::I32, Nullability::NonNullable) + ); + assert_eq!(arr.len(), 5); + + assert_eq!( + arr.into_primitive().unwrap().maybe_null_slice::(), + vec![2, 2, 3, 3, 3] + ); + } + + #[test] + fn slice_end_inclusive() { + let arr = slice( + RunEndArray::try_new( + vec![2u32, 5, 10].into_array(), + vec![1i32, 2, 3].into_array(), + Validity::NonNullable, + ) + .unwrap() + .array(), + 4, + 10, + ) + .unwrap(); + assert_eq!( + arr.dtype(), + &DType::Primitive(PType::I32, Nullability::NonNullable) + ); + assert_eq!(arr.len(), 6); + + assert_eq!( + arr.into_primitive().unwrap().maybe_null_slice::(), + vec![2, 3, 3, 3, 3, 3] + ); + } + + #[test] + fn decompress() { + let arr = RunEndArray::try_new( + vec![2u32, 5, 10].into_array(), + vec![1i32, 2, 3].into_array(), + Validity::NonNullable, + ) + .unwrap(); + + assert_eq!( + arr.into_primitive().unwrap().maybe_null_slice::(), + vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3] + ); + } + + #[test] + fn take_with_nulls() { + let uncompressed = PrimitiveArray::from_vec(vec![1i32, 0, 3], Validity::AllValid); + let validity = BoolArray::from_vec( + vec![ + true, true, false, false, false, true, true, true, true, true, + ], + Validity::NonNullable, + ); + let arr = RunEndArray::try_new( + vec![2u32, 5, 10].into_array(), + uncompressed.into(), + Validity::Array(validity.into()), + ) + .unwrap(); + + let test_indices = PrimitiveArray::from_vec(vec![0, 2, 4, 6], Validity::NonNullable); + let taken = take(arr.array(), test_indices.array()).unwrap(); + + assert_eq!(taken.len(), test_indices.len()); + + let parray = taken.into_primitive().unwrap(); + assert_eq!( + (0..4) + .map(|idx| parray.is_valid(idx).then(|| parray.get_as_cast::(idx))) + .collect::>>(), + vec![Some(1), None, None, Some(3),] + ); + } } diff --git a/encodings/runend/src/runend.rs b/encodings/runend/src/runend.rs index 7e989e78d9..b7d517db70 100644 --- a/encodings/runend/src/runend.rs +++ b/encodings/runend/src/runend.rs @@ -164,12 +164,10 @@ impl AcceptArrayVisitor for RunEndArray { impl ArrayStatisticsCompute for RunEndArray {} #[cfg(test)] -mod test { - use vortex::array::{BoolArray, PrimitiveArray}; +mod tests { use vortex::compute::unary::scalar_at; - use vortex::compute::{slice, take}; - use vortex::validity::{ArrayValidity, Validity}; - use vortex::{ArrayDType, IntoArray, IntoArrayVariant}; + use vortex::validity::Validity; + use vortex::{ArrayDType, IntoArray}; use vortex_dtype::{DType, Nullability, PType}; use crate::RunEndArray; @@ -196,101 +194,4 @@ mod test { assert_eq!(scalar_at(arr.array(), 5).unwrap(), 3.into()); assert_eq!(scalar_at(arr.array(), 9).unwrap(), 3.into()); } - - #[test] - fn slice_array() { - let arr = slice( - RunEndArray::try_new( - vec![2u32, 5, 10].into_array(), - vec![1i32, 2, 3].into_array(), - Validity::NonNullable, - ) - .unwrap() - .array(), - 3, - 8, - ) - .unwrap(); - assert_eq!( - arr.dtype(), - &DType::Primitive(PType::I32, Nullability::NonNullable) - ); - assert_eq!(arr.len(), 5); - - assert_eq!( - arr.into_primitive().unwrap().maybe_null_slice::(), - vec![2, 2, 3, 3, 3] - ); - } - - #[test] - fn slice_end_inclusive() { - let arr = slice( - RunEndArray::try_new( - vec![2u32, 5, 10].into_array(), - vec![1i32, 2, 3].into_array(), - Validity::NonNullable, - ) - .unwrap() - .array(), - 4, - 10, - ) - .unwrap(); - assert_eq!( - arr.dtype(), - &DType::Primitive(PType::I32, Nullability::NonNullable) - ); - assert_eq!(arr.len(), 6); - - assert_eq!( - arr.into_primitive().unwrap().maybe_null_slice::(), - vec![2, 3, 3, 3, 3, 3] - ); - } - - #[test] - fn flatten() { - let arr = RunEndArray::try_new( - vec![2u32, 5, 10].into_array(), - vec![1i32, 2, 3].into_array(), - Validity::NonNullable, - ) - .unwrap(); - - assert_eq!( - arr.into_primitive().unwrap().maybe_null_slice::(), - vec![1, 1, 2, 2, 2, 3, 3, 3, 3, 3] - ); - } - - #[test] - fn with_nulls() { - let uncompressed = PrimitiveArray::from_vec(vec![1i32, 0, 3], Validity::AllValid); - let validity = BoolArray::from_vec( - vec![ - true, true, false, false, false, true, true, true, true, true, - ], - Validity::NonNullable, - ); - let arr = RunEndArray::try_new( - vec![2u32, 5, 10].into_array(), - uncompressed.into(), - Validity::Array(validity.into()), - ) - .unwrap(); - - let test_indices = PrimitiveArray::from_vec(vec![0, 2, 4, 6], Validity::NonNullable); - let taken = take(arr.array(), test_indices.array()).unwrap(); - - assert_eq!(taken.len(), test_indices.len()); - - let parray = taken.into_primitive().unwrap(); - assert_eq!( - (0..4) - .map(|idx| parray.is_valid(idx).then(|| parray.get_as_cast::(idx))) - .collect::>>(), - vec![Some(1), None, None, Some(3),] - ); - } } diff --git a/encodings/zigzag/src/compress.rs b/encodings/zigzag/src/compress.rs index 65489ebc0e..4586f3533b 100644 --- a/encodings/zigzag/src/compress.rs +++ b/encodings/zigzag/src/compress.rs @@ -7,49 +7,47 @@ use zigzag::ZigZag as ExternalZigZag; use crate::ZigZagArray; -pub fn zigzag_encode(parray: &PrimitiveArray) -> VortexResult { +pub fn zigzag_encode(parray: PrimitiveArray) -> VortexResult { + let validity = parray.validity(); let encoded = match parray.ptype() { - PType::I8 => zigzag_encode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::I16 => zigzag_encode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::I32 => zigzag_encode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::I64 => zigzag_encode_primitive::(parray.maybe_null_slice(), parray.validity()), + PType::I8 => zigzag_encode_primitive::(parray.into_maybe_null_slice(), validity), + PType::I16 => zigzag_encode_primitive::(parray.into_maybe_null_slice(), validity), + PType::I32 => zigzag_encode_primitive::(parray.into_maybe_null_slice(), validity), + PType::I64 => zigzag_encode_primitive::(parray.into_maybe_null_slice(), validity), _ => vortex_bail!("Unsupported ptype {}", parray.ptype()), }; ZigZagArray::try_new(encoded.into_array()) } fn zigzag_encode_primitive( - values: &[T], + values: Vec, validity: Validity, ) -> PrimitiveArray where ::UInt: NativePType, { - let mut encoded = Vec::with_capacity(values.len()); - encoded.extend(values.iter().map(|v| T::encode(*v))); - PrimitiveArray::from_vec(encoded.to_vec(), validity) + PrimitiveArray::from_vec(values.into_iter().map(|v| T::encode(v)).collect(), validity) } -pub fn zigzag_decode(parray: &PrimitiveArray) -> PrimitiveArray { +pub fn zigzag_decode(parray: PrimitiveArray) -> PrimitiveArray { + let validity = parray.validity(); match parray.ptype() { - PType::U8 => zigzag_decode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::U16 => zigzag_decode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::U32 => zigzag_decode_primitive::(parray.maybe_null_slice(), parray.validity()), - PType::U64 => zigzag_decode_primitive::(parray.maybe_null_slice(), parray.validity()), + PType::U8 => zigzag_decode_primitive::(parray.into_maybe_null_slice(), validity), + PType::U16 => zigzag_decode_primitive::(parray.into_maybe_null_slice(), validity), + PType::U32 => zigzag_decode_primitive::(parray.into_maybe_null_slice(), validity), + PType::U64 => zigzag_decode_primitive::(parray.into_maybe_null_slice(), validity), _ => panic!("Unsupported ptype {}", parray.ptype()), } } fn zigzag_decode_primitive( - values: &[T::UInt], + values: Vec, validity: Validity, ) -> PrimitiveArray where ::UInt: NativePType, { - let mut encoded = Vec::with_capacity(values.len()); - encoded.extend(values.iter().map(|v| T::decode(*v))); - PrimitiveArray::from_vec(encoded, validity) + PrimitiveArray::from_vec(values.into_iter().map(|v| T::decode(v)).collect(), validity) } #[cfg(test)] @@ -61,7 +59,7 @@ mod test { #[test] fn test_compress() { - let compressed = zigzag_encode(&PrimitiveArray::from(Vec::from_iter( + let compressed = zigzag_encode(PrimitiveArray::from(Vec::from_iter( (-10_000..10_000).map(|i| i as i64), ))) .unwrap(); diff --git a/encodings/zigzag/src/compute.rs b/encodings/zigzag/src/compute.rs index 2347463a35..028966a080 100644 --- a/encodings/zigzag/src/compute.rs +++ b/encodings/zigzag/src/compute.rs @@ -1,10 +1,10 @@ use vortex::compute::unary::{scalar_at_unchecked, ScalarAtFn}; use vortex::compute::{slice, ArrayCompute, SliceFn}; -use vortex::{Array, IntoArray}; -use vortex_dtype::PType; +use vortex::{Array, ArrayDType, IntoArray}; +use vortex_dtype::match_each_unsigned_integer_ptype; use vortex_error::{vortex_err, VortexResult}; use vortex_scalar::{PrimitiveScalar, Scalar}; -use zigzag::ZigZag as ExternalZigZag; +use zigzag::{ZigZag as ExternalZigZag, ZigZag}; use crate::ZigZagArray; @@ -22,41 +22,21 @@ impl ScalarAtFn for ZigZagArray { fn scalar_at(&self, index: usize) -> VortexResult { let scalar = scalar_at_unchecked(&self.encoded(), index); if scalar.is_null() { - return Ok(scalar); + return Ok(scalar.reinterpret_cast(self.ptype())); } let pscalar = PrimitiveScalar::try_from(&scalar)?; - match pscalar.ptype() { - PType::U8 => Ok(i8::decode(pscalar.typed_value::().ok_or_else(|| { - vortex_err!( - "Cannot decode provided scalar: expected u8, got ptype {}", - pscalar.ptype() - ) - })?) - .into()), - PType::U16 => Ok(i16::decode(pscalar.typed_value::().ok_or_else(|| { - vortex_err!( - "Cannot decode provided scalar: expected u16, got ptype {}", - pscalar.ptype() - ) - })?) - .into()), - PType::U32 => Ok(i32::decode(pscalar.typed_value::().ok_or_else(|| { - vortex_err!( - "Cannot decode provided scalar: expected u32, got ptype {}", - pscalar.ptype() - ) - })?) - .into()), - PType::U64 => Ok(i64::decode(pscalar.typed_value::().ok_or_else(|| { - vortex_err!( - "Cannot decode provided scalar: expected u64, got ptype {}", - pscalar.ptype() - ) - })?) - .into()), - _ => unreachable!(), - } + match_each_unsigned_integer_ptype!(pscalar.ptype(), |$P| { + Ok(Scalar::primitive( + <<$P as ZigZagEncoded>::Int>::decode(pscalar.typed_value::<$P>().ok_or_else(|| { + vortex_err!( + "Cannot decode provided scalar: expected u8, got ptype {}", + pscalar.ptype() + ) + })?), + self.dtype().nullability(), + )) + }) } fn scalar_at_unchecked(&self, index: usize) -> Scalar { @@ -64,8 +44,63 @@ impl ScalarAtFn for ZigZagArray { } } +trait ZigZagEncoded { + type Int: ZigZag; +} + +impl ZigZagEncoded for u8 { + type Int = i8; +} + +impl ZigZagEncoded for u16 { + type Int = i16; +} + +impl ZigZagEncoded for u32 { + type Int = i32; +} + +impl ZigZagEncoded for u64 { + type Int = i64; +} + impl SliceFn for ZigZagArray { fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new(slice(&self.encoded(), start, stop)?)?.into_array()) } } + +#[cfg(test)] +mod tests { + use vortex::array::PrimitiveArray; + use vortex::compute::unary::scalar_at; + use vortex::compute::{search_sorted, SearchResult, SearchSortedSide}; + use vortex::validity::Validity; + use vortex::IntoArray; + use vortex_dtype::Nullability; + use vortex_scalar::Scalar; + + use crate::ZigZagArray; + + #[test] + pub fn search_sorted_uncompressed() { + let zigzag = + ZigZagArray::encode(&PrimitiveArray::from(vec![-189, -160, 1]).into_array()).unwrap(); + assert_eq!( + search_sorted(&zigzag, -169, SearchSortedSide::Right).unwrap(), + SearchResult::NotFound(1) + ); + } + + #[test] + pub fn nullable_scalar_at() { + let zigzag = ZigZagArray::encode( + &PrimitiveArray::from_vec(vec![-189, -160, 1], Validity::AllValid).into_array(), + ) + .unwrap(); + assert_eq!( + scalar_at(&zigzag, 1).unwrap(), + Scalar::primitive(-160, Nullability::Nullable) + ); + } +} diff --git a/encodings/zigzag/src/zigzag.rs b/encodings/zigzag/src/zigzag.rs index 2fc4d7b8df..3a06dadb0d 100644 --- a/encodings/zigzag/src/zigzag.rs +++ b/encodings/zigzag/src/zigzag.rs @@ -20,10 +20,6 @@ impl_encoding!("vortex.zigzag", 21u16, ZigZag); pub struct ZigZagMetadata; impl ZigZagArray { - pub fn new(encoded: Array) -> Self { - Self::try_new(encoded).unwrap() - } - pub fn try_new(encoded: Array) -> VortexResult { let encoded_dtype = encoded.dtype().clone(); if !encoded_dtype.is_unsigned_int() { @@ -42,8 +38,8 @@ impl ZigZagArray { pub fn encode(array: &Array) -> VortexResult { PrimitiveArray::try_from(array) .map_err(|_| vortex_err!("ZigZag can only encoding primitive arrays")) - .map(|parray| zigzag_encode(&parray))? - .map(|encoded| encoded.into_array()) + .and_then(zigzag_encode) + .map(|a| a.into_array()) } pub fn encoded(&self) -> Array { @@ -53,6 +49,10 @@ impl ZigZagArray { .child(0, &encoded, self.len()) .expect("Missing encoded array") } + + pub fn ptype(&self) -> PType { + PType::try_from(self.dtype()).expect("must be a ptype") + } } impl ArrayTrait for ZigZagArray {} @@ -86,7 +86,7 @@ impl ArrayStatisticsCompute for ZigZagArray {} impl IntoCanonical for ZigZagArray { fn into_canonical(self) -> VortexResult { Ok(Canonical::Primitive(zigzag_decode( - &self.encoded().into_primitive()?, + self.encoded().into_primitive()?, ))) } } diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 71e7e9fd05..5f937ef907 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -11,6 +11,7 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = { workspace = true } vortex-array = { workspace = true, features = ["arbitrary"] } +vortex-buffer = { workspace = true } vortex-dtype = { workspace = true } vortex-error = { workspace = true } vortex-sampling-compressor = { workspace = true, features = ["arbitrary"] } diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index ddd723cfcf..64904471a2 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -1,48 +1,61 @@ #![no_main] +use std::collections::HashSet; + use libfuzzer_sys::{fuzz_target, Corpus}; +use vortex::array::{BoolEncoding, PrimitiveEncoding, VarBinEncoding, VarBinViewEncoding}; use vortex::compute::unary::scalar_at; -use vortex::compute::{search_sorted, slice, take, SearchResult, SearchSorted, SearchSortedSide}; -use vortex::encoding::EncodingId; -use vortex::Array; -use vortex_error::VortexResult; -use vortex_fuzz::{Action, FuzzArrayAction}; +use vortex::compute::{search_sorted, slice, take, SearchResult, SearchSortedSide}; +use vortex::encoding::{EncodingId, EncodingRef}; +use vortex::{Array, IntoCanonical}; +use vortex_fuzz::{sort_canonical_array, Action, FuzzArrayAction}; use vortex_sampling_compressor::SamplingCompressor; use vortex_scalar::{PValue, Scalar, ScalarValue}; fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus { let FuzzArrayAction { array, actions } = fuzz_action; - match &actions[0] { - Action::Compress(c) => match fuzz_compress(&array, c) { - Some(compressed_array) => { - assert_array_eq(&array, &compressed_array); - Corpus::Keep + let mut current_array = array.clone(); + for (action, expected) in actions { + match action { + Action::Compress(c) => { + match fuzz_compress(¤t_array.into_canonical().unwrap().into(), &c) { + Some(compressed_array) => { + assert_array_eq(&expected.array(), &compressed_array); + current_array = compressed_array; + } + None => return Corpus::Reject, + } } - None => Corpus::Reject, - }, - Action::Slice(range) => { - let slice = slice(&array, range.start, range.end).unwrap(); - assert_slice(&array, &slice, range.start); - Corpus::Keep - } - Action::SearchSorted(s, side) => { - if !array_is_sorted(&array).unwrap() || s.is_null() { - return Corpus::Reject; + Action::Slice(range) => { + current_array = slice(¤t_array, range.start, range.end).unwrap(); + assert_array_eq(&expected.array(), ¤t_array); } - - let search_result = search_sorted(&array, s.clone(), *side).unwrap(); - assert_search_sorted(&array, s, *side, search_result); - Corpus::Keep - } - Action::Take(indices) => { - if indices.is_empty() { - return Corpus::Reject; + Action::Take(indices) => { + if indices.is_empty() { + return Corpus::Reject; + } + current_array = take(¤t_array, &indices).unwrap(); + assert_array_eq(&expected.array(), ¤t_array); + } + Action::SearchSorted(s, side) => { + // TODO(robert): Ideally we'd preserve the encoding perfectly but this is close enough + let mut sorted = sort_canonical_array(¤t_array); + if !HashSet::from([ + &PrimitiveEncoding as EncodingRef, + &VarBinEncoding, + &VarBinViewEncoding, + &BoolEncoding, + ]) + .contains(¤t_array.encoding()) + { + sorted = + fuzz_compress(&sorted, &SamplingCompressor::default()).unwrap_or(sorted); + } + assert_search_sorted(sorted, s, side, expected.search()) } - let taken = take(&array, indices).unwrap(); - assert_take(&array, &taken, indices); - Corpus::Keep } } + Corpus::Keep }); fn fuzz_compress(array: &Array, compressor: &SamplingCompressor) -> Option { @@ -54,39 +67,14 @@ fn fuzz_compress(array: &Array, compressor: &SamplingCompressor) -> Option VortexResult { - if array.is_empty() { - return Ok(true); - } - - let mut last_value = scalar_at(array, 0)?; - for i in 1..array.len() { - let next_value = scalar_at(array, i)?; - if next_value < last_value { - return Ok(false); - } - last_value = next_value; - } - Ok(true) -} diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index 596153fc0d..3c3fc73cbf 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -1,21 +1,53 @@ +mod search_sorted; +mod slice; +mod sort; +mod take; + use std::fmt::Debug; use std::iter; use std::ops::Range; use libfuzzer_sys::arbitrary::Error::EmptyChoose; use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; +pub use sort::sort_canonical_array; use vortex::array::PrimitiveArray; use vortex::compute::unary::scalar_at; -use vortex::compute::SearchSortedSide; +use vortex::compute::{SearchResult, SearchSortedSide}; use vortex::{Array, ArrayDType}; use vortex_sampling_compressor::SamplingCompressor; use vortex_scalar::arbitrary::random_scalar; use vortex_scalar::Scalar; +use crate::search_sorted::search_sorted_canonical_array; +use crate::slice::slice_canonical_array; +use crate::take::take_canonical_array; + +#[derive(Debug)] +pub enum ExpectedValue { + Array(Array), + Search(SearchResult), +} + +impl ExpectedValue { + pub fn array(self) -> Array { + match self { + ExpectedValue::Array(array) => array, + _ => panic!("expected array"), + } + } + + pub fn search(self) -> SearchResult { + match self { + ExpectedValue::Search(s) => s, + _ => panic!("expected search"), + } + } +} + #[derive(Debug)] pub struct FuzzArrayAction { pub array: Array, - pub actions: Vec, + pub actions: Vec<(Action, ExpectedValue)>, } #[derive(Debug)] @@ -29,52 +61,88 @@ pub enum Action { impl<'a> Arbitrary<'a> for FuzzArrayAction { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let array = Array::arbitrary(u)?; - let len = array.len(); - let action = match u.int_in_range(0..=3)? { - 0 => Action::Compress(u.arbitrary()?), - 1 => { - let start = u.choose_index(len)?; - let stop = u.int_in_range(start..=len)?; - Action::Slice(start..stop) - } - 2 => { - if len == 0 { - return Err(EmptyChoose); + let mut current_array = array.clone(); + let mut actions = Vec::new(); + for _ in 0..u.int_in_range(1..=4)? { + actions.push(match u.int_in_range(0..=3)? { + 0 => { + if actions + .last() + .map(|(l, _)| matches!(l, Action::Compress(_))) + .unwrap_or(false) + { + continue; + } + ( + Action::Compress(u.arbitrary()?), + ExpectedValue::Array(current_array.clone()), + ) + } + 1 => { + let start = u.choose_index(current_array.len())?; + let stop = u.int_in_range(start..=current_array.len())?; + current_array = slice_canonical_array(¤t_array, start, stop); + + ( + Action::Slice(start..stop), + ExpectedValue::Array(current_array.clone()), + ) + } + 2 => { + if current_array.is_empty() { + return Err(EmptyChoose); + } + + let indices = random_vec_in_range(u, 0, current_array.len() - 1)?; + current_array = take_canonical_array(¤t_array, &indices); + let indices_array = + PrimitiveArray::from(indices.iter().map(|i| *i as u64).collect::>()) + .into(); + let compressed = SamplingCompressor::default() + .compress(&indices_array, None) + .unwrap(); + ( + Action::Take(compressed.into_array()), + ExpectedValue::Array(current_array.clone()), + ) } + 3 => { + let scalar = if u.arbitrary()? { + scalar_at(¤t_array, u.choose_index(current_array.len())?).unwrap() + } else { + random_scalar(u, current_array.dtype())? + }; + + if scalar.is_null() { + return Err(EmptyChoose); + } + + let sorted = sort_canonical_array(¤t_array); - let indices = PrimitiveArray::from(random_vec_in_range(u, 0, len - 1)?).into(); - let compressed = SamplingCompressor::default() - .compress(&indices, None) - .unwrap(); - Action::Take(compressed.into_array()) - } - 3 => { - let side = if u.arbitrary()? { - SearchSortedSide::Left - } else { - SearchSortedSide::Right - }; - if u.arbitrary()? { - let random_value_in_array = scalar_at(&array, u.choose_index(len)?).unwrap(); - Action::SearchSorted(random_value_in_array, side) - } else { - Action::SearchSorted(random_scalar(u, array.dtype())?, side) + let side = if u.arbitrary()? { + SearchSortedSide::Left + } else { + SearchSortedSide::Right + }; + ( + Action::SearchSorted(scalar.clone(), side), + ExpectedValue::Search(search_sorted_canonical_array( + &sorted, &scalar, side, + )), + ) } - } - _ => unreachable!(), - }; - - Ok(Self { - array, - actions: vec![action], - }) + _ => unreachable!(), + }) + } + + Ok(Self { array, actions }) } } -fn random_vec_in_range(u: &mut Unstructured<'_>, min: usize, max: usize) -> Result> { +fn random_vec_in_range(u: &mut Unstructured<'_>, min: usize, max: usize) -> Result> { iter::from_fn(|| { if u.arbitrary().unwrap_or(false) { - Some(u.int_in_range(min..=max).map(|i| i as u64)) + Some(u.int_in_range(min..=max)) } else { None } diff --git a/fuzz/src/search_sorted.rs b/fuzz/src/search_sorted.rs new file mode 100644 index 0000000000..78f39a1bf5 --- /dev/null +++ b/fuzz/src/search_sorted.rs @@ -0,0 +1,98 @@ +use std::cmp::Ordering; + +use vortex::accessor::ArrayAccessor; +use vortex::compute::{IndexOrd, Len, SearchResult, SearchSorted, SearchSortedSide}; +use vortex::validity::ArrayValidity; +use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; +use vortex_buffer::{Buffer, BufferString}; +use vortex_dtype::{match_each_native_ptype, DType}; +use vortex_scalar::Scalar; + +struct SearchNullableSlice(Vec>); + +impl IndexOrd> for SearchNullableSlice { + fn index_cmp(&self, idx: usize, elem: &Option) -> Option { + match elem { + None => unreachable!("Can't search for None"), + Some(v) => + // SAFETY: Used in search_sorted_by same as the standard library. The search_sorted ensures idx is in bounds + { + match unsafe { self.0.get_unchecked(idx) } { + None => Some(Ordering::Greater), + Some(i) => i.partial_cmp(v), + } + } + } + } +} + +impl Len for SearchNullableSlice { + fn len(&self) -> usize { + self.0.len() + } +} + +pub fn search_sorted_canonical_array( + array: &Array, + scalar: &Scalar, + side: SearchSortedSide, +) -> SearchResult { + match array.dtype() { + DType::Bool(_) => { + let bool_array = array.clone().into_bool().unwrap(); + let opt_values = bool_array + .boolean_buffer() + .iter() + .zip( + bool_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter(), + ) + .map(|(b, v)| v.then_some(b)) + .collect::>(); + let to_find = scalar.try_into().unwrap(); + SearchNullableSlice(opt_values).search_sorted(&Some(to_find), side) + } + DType::Primitive(p, _) => match_each_native_ptype!(p, |$P| { + let primitive_array = array.clone().into_primitive().unwrap(); + let opt_values = primitive_array + .maybe_null_slice::<$P>() + .iter() + .copied() + .zip( + primitive_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter(), + ) + .map(|(b, v)| v.then_some(b)) + .collect::>(); + let to_find: $P = scalar.try_into().unwrap(); + SearchNullableSlice(opt_values).search_sorted(&Some(to_find), side) + }), + DType::Utf8(_) | DType::Binary(_) => { + let utf8 = array.clone().into_varbin().unwrap(); + let opt_values = utf8 + .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>()) + .unwrap(); + let to_find = if matches!(array.dtype(), DType::Utf8(_)) { + BufferString::try_from(scalar) + .unwrap() + .as_str() + .as_bytes() + .to_vec() + } else { + Buffer::try_from(scalar).unwrap().to_vec() + }; + SearchNullableSlice(opt_values).search_sorted(&Some(to_find), side) + } + _ => unreachable!("Not a canonical array"), + } +} diff --git a/fuzz/src/slice.rs b/fuzz/src/slice.rs new file mode 100644 index 0000000000..ce6fce05b8 --- /dev/null +++ b/fuzz/src/slice.rs @@ -0,0 +1,57 @@ +use vortex::accessor::ArrayAccessor; +use vortex::array::{BoolArray, PrimitiveArray, VarBinArray}; +use vortex::validity::{ArrayValidity, Validity}; +use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; +use vortex_dtype::{match_each_native_ptype, DType}; + +pub fn slice_canonical_array(array: &Array, start: usize, stop: usize) -> Array { + match array.dtype() { + DType::Bool(_) => { + let bool_array = array.clone().into_bool().unwrap(); + let vec_values = bool_array.boolean_buffer().iter().collect::>(); + let vec_validity = bool_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter() + .collect::>(); + BoolArray::from_vec( + Vec::from(&vec_values[start..stop]), + Validity::from(Vec::from(&vec_validity[start..stop])), + ) + .into_array() + } + DType::Primitive(p, _) => match_each_native_ptype!(p, |$P| { + let primitive_array = array.clone().into_primitive().unwrap(); + let vec_values = primitive_array + .maybe_null_slice::<$P>() + .iter() + .copied() + .collect::>(); + let vec_validity = primitive_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter() + .collect::>(); + PrimitiveArray::from_vec( + Vec::from(&vec_values[start..stop]), + Validity::from(Vec::from(&vec_validity[start..stop])), + ) + .into_array() + }), + DType::Utf8(_) | DType::Binary(_) => { + let utf8 = array.clone().into_varbin().unwrap(); + let values = utf8 + .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>()) + .unwrap(); + VarBinArray::from_iter(Vec::from(&values[start..stop]), array.dtype().clone()) + .into_array() + } + _ => unreachable!("Array::arbitrary will not generate other dtypes"), + } +} diff --git a/fuzz/src/sort.rs b/fuzz/src/sort.rs new file mode 100644 index 0000000000..994940f9cb --- /dev/null +++ b/fuzz/src/sort.rs @@ -0,0 +1,99 @@ +use std::cmp::Ordering; + +use vortex::accessor::ArrayAccessor; +use vortex::array::{BoolArray, PrimitiveArray, VarBinArray}; +use vortex::validity::ArrayValidity; +use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; +use vortex_dtype::{match_each_float_ptype, match_each_integer_ptype, DType}; + +pub fn sort_canonical_array(array: &Array) -> Array { + match array.dtype() { + DType::Bool(_) => { + let bool_array = array.clone().into_bool().unwrap(); + let mut opt_values = bool_array + .boolean_buffer() + .iter() + .zip( + bool_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter(), + ) + .map(|(b, v)| v.then_some(b)) + .collect::>(); + sort_opt_slice(&mut opt_values); + BoolArray::from_iter(opt_values).into_array() + } + DType::Primitive(p, _) => { + let primitive_array = array.clone().into_primitive().unwrap(); + if p.is_int() { + match_each_integer_ptype!(p, |$P| { + let mut opt_values = primitive_array + .maybe_null_slice::<$P>() + .iter() + .copied() + .zip( + primitive_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter(), + ) + .map(|(p, v)| v.then_some(p)) + .collect::>(); + sort_opt_slice(&mut opt_values); + PrimitiveArray::from_nullable_vec(opt_values).into_array() + }) + } else { + match_each_float_ptype!(p, |$F| { + let mut opt_values = primitive_array + .maybe_null_slice::<$F>() + .iter() + .copied() + .zip( + primitive_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter(), + ) + .map(|(p, v)| v.then_some(p)) + .collect::>(); + opt_values.sort_by(|a, b| match (a, b) { + (Some(v), Some(w)) => v.to_bits().cmp(&w.to_bits()), + (None, None) => Ordering::Equal, + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + }); + PrimitiveArray::from_nullable_vec(opt_values).into_array() + }) + } + } + DType::Utf8(_) | DType::Binary(_) => { + let utf8 = array.clone().into_varbin().unwrap(); + let mut opt_values = utf8 + .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>()) + .unwrap(); + sort_opt_slice(&mut opt_values); + VarBinArray::from_iter(opt_values, array.dtype().clone()).into_array() + } + _ => unreachable!("Not a canonical array"), + } +} + +/// Reverse sorting of Option such that None is last (Greatest) +fn sort_opt_slice(s: &mut [Option]) { + s.sort_by(|a, b| match (a, b) { + (Some(v), Some(w)) => v.cmp(w), + (None, None) => Ordering::Equal, + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + }); +} diff --git a/fuzz/src/take.rs b/fuzz/src/take.rs new file mode 100644 index 0000000000..3ed5605b17 --- /dev/null +++ b/fuzz/src/take.rs @@ -0,0 +1,60 @@ +use vortex::accessor::ArrayAccessor; +use vortex::array::{BoolArray, PrimitiveArray, VarBinArray}; +use vortex::validity::{ArrayValidity, Validity}; +use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; +use vortex_dtype::{match_each_native_ptype, DType}; + +pub fn take_canonical_array(array: &Array, indices: &[usize]) -> Array { + match array.dtype() { + DType::Bool(_) => { + let bool_array = array.clone().into_bool().unwrap(); + let vec_values = bool_array.boolean_buffer().iter().collect::>(); + let vec_validity = bool_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter() + .collect::>(); + BoolArray::from_vec( + indices.iter().map(|i| vec_values[*i]).collect(), + Validity::from(indices.iter().map(|i| vec_validity[*i]).collect::>()), + ) + .into_array() + } + DType::Primitive(p, _) => match_each_native_ptype!(p, |$P| { + let primitive_array = array.clone().into_primitive().unwrap(); + let vec_values = primitive_array + .maybe_null_slice::<$P>() + .iter() + .copied() + .collect::>(); + let vec_validity = primitive_array + .logical_validity() + .into_array() + .into_bool() + .unwrap() + .boolean_buffer() + .iter() + .collect::>(); + PrimitiveArray::from_vec( + indices.iter().map(|i| vec_values[*i]).collect(), + Validity::from(indices.iter().map(|i| vec_validity[*i]).collect::>()) + ) + .into_array() + }), + DType::Utf8(_) | DType::Binary(_) => { + let utf8 = array.clone().into_varbin().unwrap(); + let values = utf8 + .with_iterator(|iter| iter.map(|v| v.map(|u| u.to_vec())).collect::>()) + .unwrap(); + VarBinArray::from_iter( + indices.iter().map(|i| values[*i].clone()), + array.dtype().clone(), + ) + .into_array() + } + _ => unreachable!("Array::arbitrary will not generate other dtypes"), + } +} diff --git a/vortex-array/src/array/constant/canonical.rs b/vortex-array/src/array/constant/canonical.rs index 3ed24df1bc..9d537a9d45 100644 --- a/vortex-array/src/array/constant/canonical.rs +++ b/vortex-array/src/array/constant/canonical.rs @@ -1,8 +1,8 @@ use std::iter; use vortex_dtype::{match_each_native_ptype, DType, Nullability, PType}; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_scalar::{BoolScalar, Utf8Scalar}; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_scalar::{BinaryScalar, BoolScalar, Utf8Scalar}; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; @@ -29,17 +29,25 @@ impl IntoCanonical for ConstantArray { } if let Ok(s) = Utf8Scalar::try_from(self.scalar()) { - let const_value = s - .value() - .ok_or_else(|| vortex_err!("Constant UTF-8 array has null value"))?; - let bytes = const_value.as_bytes(); + let value = s.value(); + let const_value = value.as_ref().map(|v| v.as_bytes()); return Ok(Canonical::VarBin(VarBinArray::from_iter( - iter::repeat(Some(bytes)).take(self.len()), + iter::repeat(const_value).take(self.len()), DType::Utf8(validity.nullability()), ))); } + if let Ok(b) = BinaryScalar::try_from(self.scalar()) { + let value = b.value(); + let const_value = value.as_ref().map(|v| v.as_slice()); + + return Ok(Canonical::VarBin(VarBinArray::from_iter( + iter::repeat(const_value).take(self.len()), + DType::Binary(validity.nullability()), + ))); + } + if let Ok(ptype) = PType::try_from(self.scalar().dtype()) { return match_each_native_ptype!(ptype, |$P| { Ok(Canonical::Primitive(PrimitiveArray::from_vec::<$P>( diff --git a/vortex-array/src/array/primitive/compute/subtract_scalar.rs b/vortex-array/src/array/primitive/compute/subtract_scalar.rs index 23bed1f599..25240890d6 100644 --- a/vortex-array/src/array/primitive/compute/subtract_scalar.rs +++ b/vortex-array/src/array/primitive/compute/subtract_scalar.rs @@ -1,7 +1,7 @@ use itertools::Itertools; use num_traits::WrappingSub; use vortex_dtype::{match_each_float_ptype, match_each_integer_ptype, NativePType}; -use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult}; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_scalar::{PrimitiveScalar, Scalar}; use crate::array::constant::ConstantArray; @@ -45,10 +45,7 @@ impl SubtractScalarFn for PrimitiveArray { } } -fn subtract_scalar_integer< - 'a, - T: NativePType + WrappingSub + for<'b> TryFrom<&'b Scalar, Error = VortexError>, ->( +fn subtract_scalar_integer( subtract_from: &PrimitiveArray, to_subtract: T, ) -> VortexResult { diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 5e33eda478..cd0327d296 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; use std::cmp::Ordering::{Equal, Greater, Less}; -use std::fmt::{Display, Formatter}; +use std::fmt::{Debug, Display, Formatter}; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; @@ -45,6 +45,15 @@ impl SearchResult { } } +impl Display for SearchResult { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SearchResult::Found(i) => write!(f, "Found({i})"), + SearchResult::NotFound(i) => write!(f, "NotFound({i})"), + } + } +} + /// Searches for value assuming the array is sorted. /// /// For nullable arrays we assume that the nulls are sorted last, i.e. they're the greatest value diff --git a/vortex-sampling-compressor/src/arbitrary.rs b/vortex-sampling-compressor/src/arbitrary.rs index d84a1f0917..9e0dddf683 100644 --- a/vortex-sampling-compressor/src/arbitrary.rs +++ b/vortex-sampling-compressor/src/arbitrary.rs @@ -1,5 +1,6 @@ use std::collections::HashSet; +use arbitrary::Error::EmptyChoose; use arbitrary::{Arbitrary, Result, Unstructured}; use crate::compressors::{CompressorRef, EncodingCompressor}; @@ -8,6 +9,9 @@ use crate::{SamplingCompressor, ALL_COMPRESSORS}; impl<'a, 'b: 'a> Arbitrary<'a> for SamplingCompressor<'b> { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let compressors: HashSet = u.arbitrary()?; + if compressors.is_empty() { + return Err(EmptyChoose); + } Ok(Self::new(compressors)) } } diff --git a/vortex-sampling-compressor/src/compressors/zigzag.rs b/vortex-sampling-compressor/src/compressors/zigzag.rs index b2ff107f89..d72253a25c 100644 --- a/vortex-sampling-compressor/src/compressors/zigzag.rs +++ b/vortex-sampling-compressor/src/compressors/zigzag.rs @@ -46,11 +46,11 @@ impl EncodingCompressor for ZigZagCompressor { like: Option>, ctx: SamplingCompressor<'a>, ) -> VortexResult> { - let encoded = zigzag_encode(&array.as_primitive())?; + let encoded = zigzag_encode(PrimitiveArray::try_from(array)?)?; let compressed = ctx.compress(&encoded.encoded(), like.as_ref().and_then(|l| l.child(0)))?; Ok(CompressedArray::new( - ZigZagArray::new(compressed.array).into_array(), + ZigZagArray::try_new(compressed.array)?.into_array(), Some(CompressionTree::new(self, vec![compressed.path])), )) } diff --git a/vortex-scalar/src/lib.rs b/vortex-scalar/src/lib.rs index c67e297914..f10fc707a0 100644 --- a/vortex-scalar/src/lib.rs +++ b/vortex-scalar/src/lib.rs @@ -103,7 +103,7 @@ impl PartialEq for Scalar { impl PartialOrd for Scalar { fn partial_cmp(&self, other: &Self) -> Option { - if self.dtype() == other.dtype() { + if self.dtype().eq_ignore_nullability(other.dtype()) { self.value.partial_cmp(&other.value) } else { None