Skip to content

Commit

Permalink
fix: trim array metadatas + fix validity handling (part 1) (#966)
Browse files Browse the repository at this point in the history
this PR touches ALP, ALP-RD, DateTimeParts (which had major
Validity-handling issues), and Dict
  • Loading branch information
lwwmanning authored Oct 3, 2024
1 parent 8d4f5f2 commit de81eb1
Show file tree
Hide file tree
Showing 31 changed files with 251 additions and 153 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 32 additions & 22 deletions encodings/alp/src/alp/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ use vortex::stats::ArrayStatisticsCompute;
use vortex::validity::{ArrayValidity, LogicalValidity, Validity};
use vortex::variants::{ArrayVariants, PrimitiveArrayTrait};
use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::{
impl_encoding, Array, ArrayDType, ArrayDef, ArrayTrait, Canonical, IntoArray, IntoCanonical,
};
use vortex::{impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoCanonical};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};

Expand All @@ -23,8 +21,6 @@ impl_encoding!("vortex.alp", ids::ALP, ALP);
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ALPMetadata {
exponents: Exponents,
encoded_dtype: DType,
patches_dtype: Option<DType>,
}

impl ALPArray {
Expand All @@ -33,7 +29,6 @@ impl ALPArray {
exponents: Exponents,
patches: Option<Array>,
) -> VortexResult<Self> {
let encoded_dtype = encoded.dtype().clone();
let dtype = match encoded.dtype() {
DType::Primitive(PType::I32, nullability) => DType::Primitive(PType::F32, *nullability),
DType::Primitive(PType::I64, nullability) => DType::Primitive(PType::F64, *nullability),
Expand All @@ -53,21 +48,23 @@ impl ALPArray {
}
}

let patches_dtype = patches.as_ref().map(|a| a.dtype().as_nullable());
let mut children = Vec::with_capacity(2);
children.push(encoded);
if let Some(patch) = patches {
if !patch.dtype().eq_ignore_nullability(&dtype) || !patch.dtype().is_nullable() {
vortex_bail!(
"ALP patches dtype, {}, must be nullable version of array dtype, {}",
patch.dtype(),
dtype,
);
}
children.push(patch);
}

Self::try_from_parts(
dtype,
length,
ALPMetadata {
exponents,
encoded_dtype,
patches_dtype,
},
ALPMetadata { exponents },
children.into(),
Default::default(),
)
Expand All @@ -83,7 +80,7 @@ impl ALPArray {

pub fn encoded(&self) -> Array {
self.as_ref()
.child(0, &self.metadata().encoded_dtype, self.len())
.child(0, &self.encoded_dtype(), self.len())
.vortex_expect("Missing encoded child in ALPArray")
}

Expand All @@ -93,15 +90,10 @@ impl ALPArray {
}

pub fn patches(&self) -> Option<Array> {
self.metadata().patches_dtype.as_ref().map(|dt| {
self.as_ref().child(1, dt, self.len()).unwrap_or_else(|e| {
vortex_panic!(
e,
"ALPArray: patches child missing: dtype: {}, len: {}",
dt,
self.len()
)
})
(self.as_ref().nchildren() > 1).then(|| {
self.as_ref()
.child(1, &self.patches_dtype(), self.len())
.vortex_expect("Missing patches child in ALPArray")
})
}

Expand All @@ -111,6 +103,24 @@ impl ALPArray {
.try_into()
.vortex_expect("Failed to convert DType to PType")
}

#[inline]
fn encoded_dtype(&self) -> DType {
match self.dtype() {
DType::Primitive(PType::F32, _) => {
DType::Primitive(PType::I32, self.dtype().nullability())
}
DType::Primitive(PType::F64, _) => {
DType::Primitive(PType::I64, self.dtype().nullability())
}
d => vortex_panic!(MismatchedTypes: "f32 or f64", d),
}
}

#[inline]
fn patches_dtype(&self) -> DType {
self.dtype().as_nullable()
}
}

impl ArrayTrait for ALPArray {}
Expand Down
84 changes: 55 additions & 29 deletions encodings/alp/src/alp_rd/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use vortex::encoding::ids;
use vortex::stats::{ArrayStatisticsCompute, StatsSet};
use vortex::validity::{ArrayValidity, LogicalValidity};
use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::{impl_encoding, Array, ArrayDType, ArrayDef, ArrayTrait, Canonical, IntoCanonical};
use vortex_dtype::{DType, PType};
use vortex::{impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoCanonical};
use vortex_dtype::{DType, Nullability, PType};
use vortex_error::{vortex_bail, VortexExpect, VortexResult};

use crate::alp_rd::alp_rd_decode;
Expand All @@ -17,7 +17,7 @@ pub struct ALPRDMetadata {
right_bit_width: u8,
dict_len: u8,
dict: [u16; 8],
left_parts_dtype: DType,
left_parts_ptype: PType,
has_exceptions: bool,
}

Expand All @@ -34,20 +34,35 @@ impl ALPRDArray {
vortex_bail!("ALPRDArray given invalid DType ({dtype})");
}

if left_parts.len() != right_parts.len() {
vortex_bail!("left_parts and right_parts must be of same length");
}

let len = left_parts.len();
if right_parts.len() != len {
vortex_bail!(
"left_parts (len {}) and right_parts (len {}) must be of same length",
len,
right_parts.len()
);
}

if !left_parts.dtype().is_unsigned_int() {
vortex_bail!("left_parts dtype must be uint");
}
// we delegate array validity to the left_parts child
if dtype.is_nullable() != left_parts.dtype().is_nullable() {
vortex_bail!(
"ALPRDArray dtype nullability ({}) must match left_parts dtype nullability ({})",
dtype,
left_parts.dtype()
);
}
let left_parts_ptype =
PType::try_from(left_parts.dtype()).vortex_expect("left_parts dtype must be uint");

let left_parts_dtype = left_parts.dtype().clone();

if !right_parts.dtype().is_unsigned_int() {
vortex_bail!("right_parts dtype must be uint");
// we enforce right_parts to be non-nullable uint
if right_parts.dtype().is_nullable() {
vortex_bail!("right_parts dtype must be non-nullable");
}
if !right_parts.dtype().is_unsigned_int() || right_parts.dtype().is_nullable() {
vortex_bail!(MismatchedTypes: "non-nullable uint", right_parts.dtype());
}

let mut children = vec![left_parts, right_parts];
Expand All @@ -73,7 +88,7 @@ impl ALPRDArray {
right_bit_width,
dict_len: left_parts_dict.as_ref().len() as u8,
dict,
left_parts_dtype,
left_parts_ptype,
has_exceptions,
},
children.into(),
Expand All @@ -90,42 +105,53 @@ impl ALPRDArray {
== PType::F32
}

/// The dtype of the left parts of the array.
#[inline]
fn left_parts_dtype(&self) -> DType {
DType::Primitive(self.metadata().left_parts_ptype, self.dtype().nullability())
}

/// The dtype of the right parts of the array.
#[inline]
fn right_parts_dtype(&self) -> DType {
DType::Primitive(
if self.is_f32() {
PType::U32
} else {
PType::U64
},
Nullability::NonNullable,
)
}

/// The dtype of the exceptions of the left parts of the array.
#[inline]
fn left_parts_exceptions_dtype(&self) -> DType {
DType::Primitive(self.metadata().left_parts_ptype, Nullability::Nullable)
}

/// The leftmost (most significant) bits of the floating point values stored in the array.
///
/// These are bit-packed and dictionary encoded, and cannot directly be interpreted without
/// the metadata of this array.
pub fn left_parts(&self) -> Array {
self.as_ref()
.child(0, &self.metadata().left_parts_dtype, self.len())
.child(0, &self.left_parts_dtype(), self.len())
.vortex_expect("ALPRDArray: left_parts child")
}

/// The rightmost (least significant) bits of the floating point values stored in the array.
pub fn right_parts(&self) -> Array {
let uint_ptype = if self.is_f32() {
PType::U32
} else {
PType::U64
};

self.as_ref()
.child(
1,
&DType::Primitive(uint_ptype, self.metadata().left_parts_dtype.nullability()),
self.len(),
)
.child(1, &self.right_parts_dtype(), self.len())
.vortex_expect("ALPRDArray: right_parts child")
}

/// Patches of left-most bits.
pub fn left_parts_exceptions(&self) -> Option<Array> {
self.metadata().has_exceptions.then(|| {
self.as_ref()
.child(
2,
&self.metadata().left_parts_dtype.as_nullable(),
self.len(),
)
.child(2, &self.left_parts_exceptions_dtype(), self.len())
.vortex_expect("ALPRDArray: left_parts_exceptions child")
})
}
Expand Down
7 changes: 3 additions & 4 deletions encodings/alp/src/alp_rd/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub use array::*;
use vortex::validity::Validity;

mod array;
mod compute;
Expand Down Expand Up @@ -208,7 +209,7 @@ impl RDEncoder {
.into_array()
};

let primitive_right = PrimitiveArray::from_vec(right_parts, array.validity());
let primitive_right = PrimitiveArray::from(right_parts);
// SAFETY: by construction, all values in right_parts are right_bit_width + leading zeros.
let packed_right = unsafe {
bitpack_encode_unchecked(primitive_right, self.right_bit_width as _)
Expand All @@ -231,9 +232,7 @@ impl RDEncoder {
.into_array()
};

let exc_array =
PrimitiveArray::from_nullable_vec(exceptions.into_iter().map(Some).collect())
.into_array();
let exc_array = PrimitiveArray::from_vec(exceptions, Validity::AllValid).into_array();
SparseArray::try_new(packed_pos, exc_array, doubles.len(), ScalarValue::Null)
.vortex_expect("ALP-RD: construction of exceptions SparseArray")
.into_array()
Expand Down
2 changes: 1 addition & 1 deletion encodings/bytebool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use vortex::stats::StatsSet;
use vortex::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata};
use vortex::variants::{ArrayVariants, BoolArrayTrait};
use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::{impl_encoding, ArrayDef, ArrayTrait, Canonical, IntoCanonical, TypedArray};
use vortex::{impl_encoding, ArrayTrait, Canonical, IntoCanonical, TypedArray};
use vortex_buffer::Buffer;
use vortex_dtype::DType;
use vortex_error::{VortexExpect as _, VortexResult};
Expand Down
1 change: 1 addition & 0 deletions encodings/datetime-parts/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ readme = { workspace = true }
workspace = true

[dependencies]
itertools = { workspace = true }
log = { workspace = true }
serde = { workspace = true, features = ["derive"] }
vortex-array = { workspace = true }
Expand Down
Loading

0 comments on commit de81eb1

Please sign in to comment.