Skip to content

Commit

Permalink
feat: store min, max, null count, and true count in column metadata (#…
Browse files Browse the repository at this point in the history
…1164)

Co-authored-by: Will Manning <[email protected]>
  • Loading branch information
danking and lwwmanning authored Nov 1, 2024
1 parent 453fa61 commit f83a093
Show file tree
Hide file tree
Showing 17 changed files with 700 additions and 100 deletions.
2 changes: 1 addition & 1 deletion docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ similar to or smaller than Parquet.

>>> from os.path import getsize
>>> getsize("example.vortex") / getsize("_static/example.parquet")
2.1...
2...

Read
^^^^
Expand Down
2 changes: 2 additions & 0 deletions vortex-array/src/array/bool/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ impl BoolStatsAccumulator {
StatsSet::from(HashMap::from([
(Stat::Min, (self.true_count == self.len).into()),
(Stat::Max, (self.true_count > 0).into()),
(Stat::NullCount, self.null_count.into()),
(
Stat::IsConstant,
(self.null_count == 0 && (self.true_count == self.len || self.true_count == 0)
Expand Down Expand Up @@ -155,6 +156,7 @@ mod test {
assert!(!bool_arr.statistics().compute_is_constant().unwrap());
assert!(!bool_arr.statistics().compute_min::<bool>().unwrap());
assert!(bool_arr.statistics().compute_max::<bool>().unwrap());
assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 0);
assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 5);
assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 4);
}
Expand Down
15 changes: 14 additions & 1 deletion vortex-array/src/array/constant/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,27 @@ use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};

// Statistics for a `ConstantArray`, derived from its scalar value and its length.
impl ArrayStatisticsCompute for ConstantArray {
// The requested `_stat` is unused: the same set of statistics is built for every request.
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
// NOTE(review): this binding is immediately shadowed by the next `let`; in this diff view it
// appears to be the pre-change line that the following statement replaces — confirm.
let mut stats_map = HashMap::from([(Stat::IsConstant, true.into())]);
let mut stats_map = HashMap::from([
(Stat::IsConstant, true.into()),
(Stat::IsSorted, true.into()),
// Strict sortedness is only reported when the array has at most one element.
(Stat::IsStrictSorted, (self.len() <= 1).into()),
]);

// For a boolean scalar, the true count is the full length for `true` and zero for `false`.
if let ScalarValue::Bool(b) = self.scalar_value() {
let true_count = if *b { self.len() as u64 } else { 0 };

stats_map.insert(Stat::TrueCount, true_count.into());
}

// Null count is the full length when the scalar is null, otherwise the `u64` default (0).
stats_map.insert(
Stat::NullCount,
self.scalar_value()
.is_null()
.then_some(self.len() as u64)
.unwrap_or_default()
.into(),
);

Ok(StatsSet::from(stats_map))
}
}
55 changes: 55 additions & 0 deletions vortex-array/src/array/from.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use vortex_buffer::{Buffer, BufferString};
use vortex_dtype::half::f16;
use vortex_dtype::{DType, Nullability};

use super::{BoolArray, PrimitiveArray, VarBinViewArray};
use crate::validity::Validity;
use crate::{Array, IntoArray as _};

// `From<Vec<Option<!>>> for Array` requires the experimental uninhabited (never) type: `!`.

/// Converts a vector of optional booleans into an [`Array`] by way of [`BoolArray`].
impl From<Vec<Option<bool>>> for Array {
    fn from(value: Vec<Option<bool>>) -> Self {
        let bools = BoolArray::from_iter(value);
        bools.into_array()
    }
}

macro_rules! impl_from_primitive_for_array {
($P:ty) => {
impl From<Vec<$P>> for Array {
fn from(value: Vec<$P>) -> Self {
PrimitiveArray::from_vec(value, Validity::NonNullable).into_array()
}
}

impl From<Vec<Option<$P>>> for Array {
fn from(value: Vec<Option<$P>>) -> Self {
PrimitiveArray::from_nullable_vec(value).into_array()
}
}
};
}

impl_from_primitive_for_array!(u8);
impl_from_primitive_for_array!(u16);
impl_from_primitive_for_array!(u32);
impl_from_primitive_for_array!(u64);
impl_from_primitive_for_array!(i8);
impl_from_primitive_for_array!(i16);
impl_from_primitive_for_array!(i32);
impl_from_primitive_for_array!(i64);
impl_from_primitive_for_array!(f16);
impl_from_primitive_for_array!(f32);
impl_from_primitive_for_array!(f64);

/// Converts a vector of optional strings into an [`Array`] backed by a [`VarBinViewArray`]
/// with a nullable UTF-8 dtype.
impl From<Vec<Option<BufferString>>> for Array {
    fn from(value: Vec<Option<BufferString>>) -> Self {
        let dtype = DType::Utf8(Nullability::Nullable);
        let strings = VarBinViewArray::from_iter(value, dtype);
        strings.into_array()
    }
}

/// Converts a vector of optional byte buffers into an [`Array`] backed by a
/// [`VarBinViewArray`] with a nullable binary dtype.
impl From<Vec<Option<Buffer>>> for Array {
    fn from(value: Vec<Option<Buffer>>) -> Self {
        let dtype = DType::Binary(Nullability::Nullable);
        let binaries = VarBinViewArray::from_iter(value, dtype);
        binaries.into_array()
    }
}
1 change: 1 addition & 0 deletions vortex-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mod struct_;
mod varbin;
mod varbinview;

pub mod from;
pub mod visitor;

#[cfg(feature = "arbitrary")]
Expand Down
6 changes: 6 additions & 0 deletions vortex-buffer/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,9 @@ impl AsRef<str> for BufferString {
self.as_str()
}
}

/// Exposes the bytes of the string slice returned by `as_str`.
impl AsRef<[u8]> for BufferString {
    fn as_ref(&self) -> &[u8] {
        let text: &str = self.as_str();
        text.as_bytes()
    }
}
45 changes: 42 additions & 3 deletions vortex-scalar/src/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,48 @@ impl<'a> TryFrom<&'a Scalar> for BinaryScalar<'a> {
impl<'a> TryFrom<&'a Scalar> for Buffer {
type Error = VortexError;

fn try_from(value: &'a Scalar) -> VortexResult<Self> {
BinaryScalar::try_from(value)?
.value()
fn try_from(scalar: &'a Scalar) -> VortexResult<Self> {
Buffer::try_from(scalar.value())
}
}

impl TryFrom<Scalar> for Buffer {
type Error = VortexError;

fn try_from(scalar: Scalar) -> VortexResult<Self> {
Buffer::try_from(&scalar)
}
}

/// Extracts a present `Buffer` from a scalar value.
///
/// Goes through the `Option<Buffer>` conversion and reports an error when that yields `None`.
impl TryFrom<&ScalarValue> for Buffer {
    type Error = VortexError;

    fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
        match Option::<Buffer>::try_from(value)? {
            Some(buffer) => Ok(buffer),
            None => Err(vortex_err!("Can't extract present value from null scalar")),
        }
    }
}

impl TryFrom<ScalarValue> for Buffer {
type Error = VortexError;

fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
Buffer::try_from(&value)
}
}

impl TryFrom<&ScalarValue> for Option<Buffer> {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
value.as_buffer()
}
}

/// Owned-value convenience: delegates to the `&ScalarValue` conversion.
impl TryFrom<ScalarValue> for Option<Buffer> {
    type Error = VortexError;

    fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
        let borrowed = &value;
        Self::try_from(borrowed)
    }
}
41 changes: 36 additions & 5 deletions vortex-scalar/src/bool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ impl TryFrom<&Scalar> for bool {
}
}

/// Owned-scalar convenience: delegates to the `&Scalar` conversion.
impl TryFrom<Scalar> for bool {
    type Error = VortexError;

    fn try_from(value: Scalar) -> VortexResult<Self> {
        let borrowed = &value;
        Self::try_from(borrowed)
    }
}

impl From<bool> for Scalar {
fn from(value: bool) -> Self {
Self {
Expand All @@ -73,19 +81,42 @@ impl From<bool> for Scalar {
}
}

impl From<bool> for ScalarValue {
fn from(value: bool) -> Self {
ScalarValue::Bool(value)
}
}

impl TryFrom<&ScalarValue> for Option<bool> {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> VortexResult<Self> {
value.as_bool()
}
}

/// Owned-value convenience: delegates to the `&ScalarValue` conversion.
impl TryFrom<ScalarValue> for Option<bool> {
    type Error = VortexError;

    fn try_from(value: ScalarValue) -> VortexResult<Self> {
        let borrowed = &value;
        Self::try_from(borrowed)
    }
}

impl TryFrom<&ScalarValue> for bool {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> VortexResult<Self> {
value
.as_bool()?
Option::<bool>::try_from(value)?
.ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
}
}

impl From<bool> for ScalarValue {
fn from(value: bool) -> Self {
ScalarValue::Bool(value)
impl TryFrom<ScalarValue> for bool {
type Error = VortexError;

fn try_from(value: ScalarValue) -> VortexResult<Self> {
bool::try_from(&value)
}
}

Expand Down
1 change: 1 addition & 0 deletions vortex-scalar/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ mod datafusion;
mod display;
mod extension;
mod list;
mod null;
mod primitive;
mod pvalue;
mod scalar_type;
Expand Down
35 changes: 35 additions & 0 deletions vortex-scalar/src/null.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use vortex_error::VortexError;

use crate::{Scalar, ScalarValue};

/// Converts a scalar to `()` by calling `as_null` on its underlying value.
impl TryFrom<&Scalar> for () {
    type Error = VortexError;

    fn try_from(scalar: &Scalar) -> Result<Self, Self::Error> {
        let inner = scalar.value();
        inner.as_null()
    }
}

/// Owned-scalar convenience: delegates to the `&Scalar` conversion.
impl TryFrom<Scalar> for () {
    type Error = VortexError;

    fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
        let borrowed = &scalar;
        Self::try_from(borrowed)
    }
}

impl TryFrom<&ScalarValue> for () {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
value.as_null()
}
}

/// Owned-value convenience: delegates to the `&ScalarValue` conversion.
impl TryFrom<ScalarValue> for () {
    type Error = VortexError;

    fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
        let borrowed = &value;
        Self::try_from(borrowed)
    }
}
24 changes: 21 additions & 3 deletions vortex-scalar/src/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,19 +154,37 @@ macro_rules! primitive_scalar {
impl TryFrom<&ScalarValue> for $T {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
Option::<$T>::try_from(value)?
.ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
}
}

impl TryFrom<ScalarValue> for $T {
type Error = VortexError;

fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
<$T>::try_from(&value)
}
}

impl TryFrom<&ScalarValue> for Option<$T> {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
match value {
ScalarValue::Primitive(pvalue) => <$T>::try_from(*pvalue),
ScalarValue::Null => Ok(None),
ScalarValue::Primitive(pvalue) => Ok(Some(<$T>::try_from(*pvalue)?)),
_ => vortex_bail!("expected primitive"),
}
}
}

impl TryFrom<ScalarValue> for $T {
impl TryFrom<ScalarValue> for Option<$T> {
type Error = VortexError;

fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
<$T>::try_from(&value)
Option::<$T>::try_from(&value)
}
}
};
Expand Down
59 changes: 49 additions & 10 deletions vortex-scalar/src/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,6 @@ impl<'a> TryFrom<&'a Scalar> for Utf8Scalar<'a> {
}
}

// Converts a scalar into a `BufferString` via `Utf8Scalar`, returning an error when the
// scalar's value is absent.
// NOTE(review): an impl with the same header appears later in this diff; this one looks like
// the removed (pre-change) side — confirm before relying on it.
impl<'a> TryFrom<&'a Scalar> for BufferString {
type Error = VortexError;

fn try_from(value: &'a Scalar) -> VortexResult<Self> {
// `?` propagates a failed `Utf8Scalar` conversion; a missing value becomes an error below.
Utf8Scalar::try_from(value)?
.value()
.ok_or_else(|| vortex_err!("Can't extract present value from null scalar"))
}
}

impl<'a> TryFrom<&'a Scalar> for String {
type Error = VortexError;

Expand All @@ -88,3 +78,52 @@ impl From<&str> for Scalar {
}
}
}

impl<'a> TryFrom<&'a Scalar> for BufferString {
type Error = VortexError;

fn try_from(scalar: &'a Scalar) -> VortexResult<Self> {
BufferString::try_from(scalar.value())
}
}

impl TryFrom<Scalar> for BufferString {
type Error = VortexError;

fn try_from(scalar: Scalar) -> Result<Self, Self::Error> {
BufferString::try_from(&scalar)
}
}

/// Extracts a present `BufferString` from a scalar value.
///
/// Goes through the `Option<BufferString>` conversion and reports an error when that yields
/// `None`.
impl TryFrom<&ScalarValue> for BufferString {
    type Error = VortexError;

    fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
        match Option::<BufferString>::try_from(value)? {
            Some(string) => Ok(string),
            None => Err(vortex_err!("Can't extract present value from null scalar")),
        }
    }
}

impl TryFrom<ScalarValue> for BufferString {
type Error = VortexError;

fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
BufferString::try_from(&value)
}
}

impl TryFrom<&ScalarValue> for Option<BufferString> {
type Error = VortexError;

fn try_from(value: &ScalarValue) -> Result<Self, Self::Error> {
value.as_buffer_string()
}
}

/// Owned-value convenience: delegates to the `&ScalarValue` conversion.
impl TryFrom<ScalarValue> for Option<BufferString> {
    type Error = VortexError;

    fn try_from(value: ScalarValue) -> Result<Self, Self::Error> {
        let borrowed = &value;
        Self::try_from(borrowed)
    }
}
Loading

0 comments on commit f83a093

Please sign in to comment.