From 88fc6f5e45f39965f77579fff5e165adb88d5574 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 8 Apr 2024 17:28:18 -0400 Subject: [PATCH] Stats --- vortex-array/src/stats.rs | 2 +- vortex-array2/src/array/bool/mod.rs | 13 ++++++---- vortex-array2/src/array/primitive/mod.rs | 9 +++++++ vortex-array2/src/array/ree/mod.rs | 9 +++++++ vortex-array2/src/array/struct/mod.rs | 3 +++ vortex-array2/src/data.rs | 30 +++++++++++++++++++++++- vortex-array2/src/lib.rs | 7 +++++- vortex-array2/src/view.rs | 6 +++++ 8 files changed, 72 insertions(+), 7 deletions(-) diff --git a/vortex-array/src/stats.rs b/vortex-array/src/stats.rs index a8e8d0b2d5..b5403f0da8 100644 --- a/vortex-array/src/stats.rs +++ b/vortex-array/src/stats.rs @@ -10,7 +10,7 @@ use vortex_schema::DType; use crate::ptype::NativePType; use crate::scalar::{ListScalarVec, Scalar}; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Stat { BitWidthFreq, TrailingZeroFreq, diff --git a/vortex-array2/src/array/bool/mod.rs b/vortex-array2/src/array/bool/mod.rs index f145c41e9f..f7bb943ee7 100644 --- a/vortex-array2/src/array/bool/mod.rs +++ b/vortex-array2/src/array/bool/mod.rs @@ -6,6 +6,7 @@ use vortex_error::VortexResult; use vortex_schema::DType; use crate::impl_encoding; +use crate::stats::{ArrayStatistics, Statistics}; use crate::validity::Validity; use crate::validity::{ArrayValidity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; @@ -26,10 +27,7 @@ pub struct BoolArray<'a> { buffer: &'a Buffer, validity: Validity<'a>, length: usize, - // TODO(ngates): we support statistics by reference to a dyn trait. - // This trait is implemented for ArrayView and ArrayData and is passed into here as part - // of ArrayParts. - // e.g. stats: &dyn Statistics, + statistics: &'a dyn Statistics, } impl BoolArray<'_> { @@ -58,6 +56,7 @@ impl<'v> TryFromArrayParts<'v, BoolMetadata> for BoolArray<'v> { .validity .to_validity(parts.child(0, &Validity::DTYPE)), length: metadata.length, + statistics: parts.statistics(), }) } } @@ -110,6 +109,12 @@ impl AcceptArrayVisitor for BoolArray<'_> { } } +impl ArrayStatistics for BoolArray<'_> { + fn statistics(&self) -> &dyn Statistics { + self.statistics + } +} + #[cfg(test)] mod tests { use crate::array::bool::BoolData; diff --git a/vortex-array2/src/array/primitive/mod.rs b/vortex-array2/src/array/primitive/mod.rs index 1dc5cffb3c..4e141acf77 100644 --- a/vortex-array2/src/array/primitive/mod.rs +++ b/vortex-array2/src/array/primitive/mod.rs @@ -7,6 +7,7 @@ use vortex_error::VortexResult; use vortex_schema::DType; use crate::impl_encoding; +use crate::stats::{ArrayStatistics, Statistics}; use crate::validity::{ArrayValidity, Validity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::ArrayMetadata; @@ -25,6 +26,7 @@ pub struct PrimitiveArray<'a> { dtype: &'a DType, buffer: &'a Buffer, validity: Validity<'a>, + statistics: &'a dyn Statistics, } impl PrimitiveArray<'_> { @@ -53,6 +55,7 @@ impl<'a> TryFromArrayParts<'a, PrimitiveMetadata> for PrimitiveArray<'a> { dtype: parts.dtype(), buffer, validity: metadata.validity.to_validity(parts.child(0, parts.dtype())), + statistics: parts.statistics(), }) } } @@ -111,3 +114,9 @@ impl AcceptArrayVisitor for PrimitiveArray<'_> { visitor.visit_validity(self.validity()) } } + +impl ArrayStatistics for PrimitiveArray<'_> { + fn statistics(&self) -> &dyn Statistics { + self.statistics + } +} diff --git a/vortex-array2/src/array/ree/mod.rs b/vortex-array2/src/array/ree/mod.rs index feaab0aa57..8097e41105 100644 --- a/vortex-array2/src/array/ree/mod.rs +++ b/vortex-array2/src/array/ree/mod.rs @@ -5,6 +5,7 @@ use vortex_error::VortexResult; use vortex_schema::DType; use crate::impl_encoding; +use crate::stats::{ArrayStatistics, Statistics}; use crate::validity::ArrayValidity; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::{Array, ArrayMetadata}; @@ -24,6 +25,7 @@ pub struct REEArray<'a> { values: Array<'a>, run_ends: Array<'a>, length: usize, + statistics: &'a dyn Statistics, } impl REEArray<'_> { @@ -61,6 +63,7 @@ impl<'v> TryFromArrayParts<'v, REEMetadata> for REEArray<'v> { .child(1, &metadata.ends_dtype) .ok_or_else(|| vortex_err!("REEArray missing run_ends"))?, length: metadata.length, + statistics: parts.statistics(), }) } } @@ -93,3 +96,9 @@ impl AcceptArrayVisitor for REEArray<'_> { visitor.visit_child("run_ends", self.run_ends()) } } + +impl ArrayStatistics for REEArray<'_> { + fn statistics(&self) -> &dyn Statistics { + self.statistics + } +} diff --git a/vortex-array2/src/array/struct/mod.rs b/vortex-array2/src/array/struct/mod.rs index 3a95e52317..03b0e8cde7 100644 --- a/vortex-array2/src/array/struct/mod.rs +++ b/vortex-array2/src/array/struct/mod.rs @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize}; use vortex_error::{vortex_bail, VortexResult}; use vortex_schema::{DType, FieldNames}; +use crate::stats::ArrayStatistics; use crate::validity::ArrayValidity; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::{impl_encoding, ToArray, WithArray}; @@ -146,3 +147,5 @@ impl AcceptArrayVisitor for StructArray<'_> { Ok(()) } } + +impl ArrayStatistics for StructArray<'_> {} diff --git a/vortex-array2/src/data.rs b/vortex-array2/src/data.rs index ebeb2245e4..d1365722f1 100644 --- a/vortex-array2/src/data.rs +++ b/vortex-array2/src/data.rs @@ -1,11 +1,15 @@ +use std::collections::HashMap; use std::marker::PhantomData; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use arrow_buffer::Buffer; +use vortex::scalar::Scalar; +use vortex::stats::Stat; use vortex_error::{vortex_bail, VortexError, VortexResult}; use vortex_schema::DType; use crate::encoding::EncodingRef; +use crate::stats::Statistics; use crate::{Array, ArrayDef, ArrayMetadata, ArrayParts, IntoArray, ToArray}; #[allow(dead_code)] @@ -16,6 +20,7 @@ pub struct ArrayData { metadata: Arc, buffers: Arc<[Buffer]>, // Should this just be an Option, not an Arc? children: Arc<[Option]>, + stats_set: Arc>>, } impl ArrayData { @@ -32,6 +37,7 @@ impl ArrayData { metadata, buffers, children, + stats_set: Arc::new(RwLock::new(HashMap::new())), }; // Validate here that the metadata correctly parses, so that an encoding can infallibly @@ -215,4 +221,26 @@ impl ArrayParts for ArrayData { fn nchildren(&self) -> usize { self.children.len() } + + fn statistics(&self) -> &dyn Statistics { + self + } +} + +impl Statistics for ArrayData { + fn compute(&self, stat: &Stat) -> VortexResult> { + let mut locked = self.stats_set.write().unwrap(); + let stats = self + .encoding() + .with_data(self, |a| a.compute_statistics(stat))?; + for (k, v) in &stats { + locked.insert(*k, v.clone()); + } + Ok(stats.get(stat).cloned()) + } + + fn get(&self, stat: &Stat) -> Option { + let locked = self.stats_set.read().unwrap(); + locked.get(stat).cloned() + } } diff --git a/vortex-array2/src/lib.rs b/vortex-array2/src/lib.rs index 5745222146..84eb002956 100644 --- a/vortex-array2/src/lib.rs +++ b/vortex-array2/src/lib.rs @@ -10,6 +10,7 @@ mod data; pub mod encoding; mod implementation; mod metadata; +mod stats; mod tree; mod validity; mod view; @@ -30,6 +31,7 @@ use vortex_schema::DType; use crate::compute::ArrayCompute; use crate::encoding::EncodingRef; +use crate::stats::{ArrayStatistics, Statistics}; use crate::validity::ArrayValidity; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; @@ -87,6 +89,7 @@ pub trait ArrayParts { fn buffer(&self, idx: usize) -> Option<&Buffer>; fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option; fn nchildren(&self) -> usize; + fn statistics(&self) -> &dyn Statistics; } pub trait TryFromArrayParts<'v, M: ArrayMetadata>: Sized + 'v { @@ -94,7 +97,9 @@ pub trait TryFromArrayParts<'v, M: ArrayMetadata>: Sized + 'v { } /// Collects together the behaviour of an array. -pub trait ArrayTrait: ArrayCompute + ArrayValidity + AcceptArrayVisitor + ToArrayData { +pub trait ArrayTrait: + ArrayCompute + ArrayValidity + AcceptArrayVisitor + ArrayStatistics + ToArrayData +{ fn dtype(&self) -> &DType; fn len(&self) -> usize; diff --git a/vortex-array2/src/view.rs b/vortex-array2/src/view.rs index 3e4a9392bf..8ffaa0851e 100644 --- a/vortex-array2/src/view.rs +++ b/vortex-array2/src/view.rs @@ -6,6 +6,7 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_schema::DType; use crate::encoding::EncodingRef; +use crate::stats::{EmptyStatistics, Statistics}; use crate::{Array, IntoArray, ToArray}; use crate::{ArrayParts, SerdeContext}; @@ -169,4 +170,9 @@ impl ArrayParts for ArrayView<'_> { fn nchildren(&self) -> usize { self.array.children().map(|c| c.len()).unwrap_or_default() } + + fn statistics(&self) -> &dyn Statistics { + // TODO(ngates): serialize statistics into fb::Array + &EmptyStatistics + } }