Skip to content

Commit

Permalink
Stats
Browse files Browse the repository at this point in the history
  • Loading branch information
gatesn committed Apr 8, 2024
1 parent 86b090e commit 768d28c
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 7 deletions.
2 changes: 1 addition & 1 deletion vortex-array/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use vortex_schema::DType;
use crate::ptype::NativePType;
use crate::scalar::{ListScalarVec, Scalar};

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
BitWidthFreq,
TrailingZeroFreq,
Expand Down
13 changes: 9 additions & 4 deletions vortex-array2/src/array/bool/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use vortex_error::VortexResult;
use vortex_schema::DType;

use crate::impl_encoding;
use crate::stats::{ArrayStatistics, Statistics};
use crate::validity::Validity;
use crate::validity::{ArrayValidity, ValidityMetadata};
use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
Expand All @@ -26,10 +27,7 @@ pub struct BoolArray<'a> {
buffer: &'a Buffer,
validity: Validity<'a>,
length: usize,
// TODO(ngates): we support statistics by reference to a dyn trait.
// This trait is implemented for ArrayView and ArrayData and is passed into here as part
// of ArrayParts.
// e.g. stats: &dyn Statistics,
statistics: &'a dyn Statistics,
}

impl BoolArray<'_> {
Expand Down Expand Up @@ -58,6 +56,7 @@ impl<'v> TryFromArrayParts<'v, BoolMetadata> for BoolArray<'v> {
.validity
.to_validity(parts.child(0, &Validity::DTYPE)),
length: metadata.length,
statistics: parts.statistics(),
})
}
}
Expand Down Expand Up @@ -110,6 +109,12 @@ impl AcceptArrayVisitor for BoolArray<'_> {
}
}

/// Statistics access for `BoolArray`.
impl ArrayStatistics for BoolArray<'_> {
/// Returns the statistics handle held in this array's `statistics` field.
fn statistics(&self) -> &dyn Statistics {
self.statistics
}
}

#[cfg(test)]
mod tests {
use crate::array::bool::BoolData;
Expand Down
9 changes: 9 additions & 0 deletions vortex-array2/src/array/primitive/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use vortex_error::VortexResult;
use vortex_schema::DType;

use crate::impl_encoding;
use crate::stats::{ArrayStatistics, Statistics};
use crate::validity::{ArrayValidity, Validity, ValidityMetadata};
use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::ArrayMetadata;
Expand All @@ -25,6 +26,7 @@ pub struct PrimitiveArray<'a> {
dtype: &'a DType,
buffer: &'a Buffer,
validity: Validity<'a>,
statistics: &'a dyn Statistics,
}

impl PrimitiveArray<'_> {
Expand Down Expand Up @@ -53,6 +55,7 @@ impl<'a> TryFromArrayParts<'a, PrimitiveMetadata> for PrimitiveArray<'a> {
dtype: parts.dtype(),
buffer,
validity: metadata.validity.to_validity(parts.child(0, parts.dtype())),
statistics: parts.statistics(),
})
}
}
Expand Down Expand Up @@ -111,3 +114,9 @@ impl AcceptArrayVisitor for PrimitiveArray<'_> {
visitor.visit_validity(self.validity())
}
}

/// Statistics access for `PrimitiveArray`.
impl ArrayStatistics for PrimitiveArray<'_> {
/// Returns the statistics handle held in this array's `statistics` field.
fn statistics(&self) -> &dyn Statistics {
self.statistics
}
}
9 changes: 9 additions & 0 deletions vortex-array2/src/array/ree/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use vortex_error::VortexResult;
use vortex_schema::DType;

use crate::impl_encoding;
use crate::stats::{ArrayStatistics, Statistics};
use crate::validity::ArrayValidity;
use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::{Array, ArrayMetadata};
Expand All @@ -24,6 +25,7 @@ pub struct REEArray<'a> {
values: Array<'a>,
run_ends: Array<'a>,
length: usize,
statistics: &'a dyn Statistics,
}

impl REEArray<'_> {
Expand Down Expand Up @@ -61,6 +63,7 @@ impl<'v> TryFromArrayParts<'v, REEMetadata> for REEArray<'v> {
.child(1, &metadata.ends_dtype)
.ok_or_else(|| vortex_err!("REEArray missing run_ends"))?,
length: metadata.length,
statistics: parts.statistics(),
})
}
}
Expand Down Expand Up @@ -93,3 +96,9 @@ impl AcceptArrayVisitor for REEArray<'_> {
visitor.visit_child("run_ends", self.run_ends())
}
}

/// Statistics access for `REEArray`.
impl ArrayStatistics for REEArray<'_> {
/// Returns the statistics handle held in this array's `statistics` field.
fn statistics(&self) -> &dyn Statistics {
self.statistics
}
}
3 changes: 3 additions & 0 deletions vortex-array2/src/array/struct/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize};
use vortex_error::{vortex_bail, VortexResult};
use vortex_schema::{DType, FieldNames};

use crate::stats::ArrayStatistics;
use crate::validity::ArrayValidity;
use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::{impl_encoding, ToArray, WithArray};
Expand Down Expand Up @@ -146,3 +147,5 @@ impl AcceptArrayVisitor for StructArray<'_> {
Ok(())
}
}

// `StructArray` overrides nothing here, so it relies entirely on the default
// method bodies that the `ArrayStatistics` trait provides.
impl ArrayStatistics for StructArray<'_> {}
30 changes: 29 additions & 1 deletion vortex-array2/src/data.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
use std::collections::HashMap;
use std::marker::PhantomData;
use std::sync::Arc;
use std::sync::{Arc, RwLock};

use arrow_buffer::Buffer;
use vortex::scalar::Scalar;
use vortex::stats::Stat;
use vortex_error::{vortex_bail, VortexError, VortexResult};
use vortex_schema::DType;

use crate::encoding::EncodingRef;
use crate::stats::Statistics;
use crate::{Array, ArrayDef, ArrayMetadata, ArrayParts, IntoArray, ToArray};

#[allow(dead_code)]
Expand All @@ -16,6 +20,7 @@ pub struct ArrayData {
metadata: Arc<dyn ArrayMetadata>,
buffers: Arc<[Buffer]>, // Should this just be an Option, not an Arc?
children: Arc<[Option<ArrayData>]>,
stats_set: Arc<RwLock<HashMap<Stat, Scalar>>>,
}

impl ArrayData {
Expand All @@ -32,6 +37,7 @@ impl ArrayData {
metadata,
buffers,
children,
stats_set: Arc::new(RwLock::new(HashMap::new())),
};

// Validate here that the metadata correctly parses, so that an encoding can infallibly
Expand Down Expand Up @@ -215,4 +221,26 @@ impl ArrayParts for ArrayData {
fn nchildren(&self) -> usize {
self.children.len()
}

/// Returns this `ArrayData` itself as the statistics handle (it is coerced to
/// `&dyn Statistics`).
fn statistics(&self) -> &dyn Statistics {
self
}
}

impl Statistics for ArrayData {
    /// Computes `stat` via this array's encoding and caches every statistic the
    /// computation produced in `stats_set`.
    ///
    /// Returns the value for `stat` if the computation yielded one, `Ok(None)`
    /// otherwise.
    ///
    /// # Errors
    /// Propagates any error returned while running the encoding's
    /// `compute_statistics`. Nothing is cached in that case.
    ///
    /// # Panics
    /// Panics if the `stats_set` lock has been poisoned by a panic in another
    /// thread while it held the lock.
    fn compute(&self, stat: &Stat) -> VortexResult<Option<Scalar>> {
        // Run the (potentially expensive) computation *before* taking the write
        // lock. Holding the lock across this call would block every reader for
        // the whole computation, and would deadlock or panic if the encoding
        // consulted this array's statistics re-entrantly, because std's
        // `RwLock` is not re-entrant.
        let computed = self
            .encoding()
            .with_data(self, |a| a.compute_statistics(stat))?;

        // Pull out the requested value first so the map can be moved into the
        // cache without cloning every entry.
        let requested = computed.get(stat).cloned();

        let mut cache = self
            .stats_set
            .write()
            .expect("stats_set lock poisoned");
        for (key, value) in computed {
            cache.insert(key, value);
        }

        Ok(requested)
    }

    /// Returns the cached value for `stat`, if any, without computing anything.
    ///
    /// # Panics
    /// Panics if the `stats_set` lock has been poisoned.
    fn get(&self, stat: &Stat) -> Option<Scalar> {
        self.stats_set
            .read()
            .expect("stats_set lock poisoned")
            .get(stat)
            .cloned()
    }
}
7 changes: 6 additions & 1 deletion vortex-array2/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod data;
pub mod encoding;
mod implementation;
mod metadata;
mod stats;
mod tree;
mod validity;
mod view;
Expand All @@ -30,6 +31,7 @@ use vortex_schema::DType;

use crate::compute::ArrayCompute;
use crate::encoding::EncodingRef;
use crate::stats::{ArrayStatistics, Statistics};
use crate::validity::ArrayValidity;
use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};

Expand Down Expand Up @@ -87,14 +89,17 @@ pub trait ArrayParts {
fn buffer(&self, idx: usize) -> Option<&Buffer>;
fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option<Array>;
fn nchildren(&self) -> usize;
fn statistics(&self) -> &dyn Statistics;
}

/// Fallible construction of an array type from borrowed `ArrayParts` plus its
/// metadata `M`. The constructed value may borrow from both for `'v`.
pub trait TryFromArrayParts<'v, M: ArrayMetadata>: Sized + 'v {
/// Builds `Self` from `parts` and `metadata`, returning an error if
/// construction fails.
fn try_from_parts(parts: &'v dyn ArrayParts, metadata: &'v M) -> VortexResult<Self>;
}

/// Collects together the behaviour of an array.
pub trait ArrayTrait: ArrayCompute + ArrayValidity + AcceptArrayVisitor + ToArrayData {
pub trait ArrayTrait:
ArrayCompute + ArrayValidity + AcceptArrayVisitor + ArrayStatistics + ToArrayData
{
fn dtype(&self) -> &DType;

fn len(&self) -> usize;
Expand Down
43 changes: 43 additions & 0 deletions vortex-array2/src/stats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
use std::collections::HashMap;

use vortex::scalar::Scalar;
use vortex::stats::Stat;
use vortex_error::VortexResult;

/// Statistics hooks for an array.
///
/// Both methods carry default bodies, so an implementor may override either,
/// both, or neither.
pub trait ArrayStatistics {
    /// Computes the requested statistic; the returned map may also carry
    /// additional statistics.
    ///
    /// The default implementation yields an empty map.
    fn compute_statistics(&self, _stat: &Stat) -> VortexResult<HashMap<Stat, Scalar>> {
        let nothing_computed = HashMap::new();
        Ok(nothing_computed)
    }

    /// Handle through which this array's statistics are accessed.
    ///
    /// The default implementation returns `EmptyStatistics`.
    fn statistics(&self) -> &dyn Statistics {
        &EmptyStatistics
    }
}

/// Object-safe access to an array's statistics.
pub trait Statistics {
/// Computes `stat`, returning `Ok(None)` when no value is produced for it and
/// `Err` when the computation fails.
fn compute(&self, stat: &Stat) -> VortexResult<Option<Scalar>>;
/// Looks up `stat`, returning `None` when no value is available.
/// NOTE(review): the implementations visible alongside this trait never compute
/// anything from `get`; confirm whether that is a contract for all implementors.
fn get(&self, stat: &Stat) -> Option<Scalar>;
}

/// Typed convenience helpers on `dyn Statistics` trait objects.
///
/// The impl is generic over the trait object's lifetime bound. A bare
/// `impl dyn Statistics` means `impl dyn Statistics + 'static`, which would make
/// these helpers unavailable on borrowed, non-`'static` handles such as
/// `&'a dyn Statistics`.
impl<'a> dyn Statistics + 'a {
    /// Computes `stat` and converts the resulting scalar into `T`.
    ///
    /// Not implemented yet: calling this currently panics via `todo!()`.
    fn compute_as<T: TryFrom<Scalar>>(&self, _stat: &Stat) -> VortexResult<Option<T>> {
        // TODO(ngates): should we panic if conversion fails?
        todo!()
    }

    /// Looks up `stat` and converts the resulting scalar into `T`.
    ///
    /// Not implemented yet: calling this currently panics via `todo!()`.
    fn get_as<T: TryFrom<Scalar>>(&self, _stat: &Stat) -> Option<T> {
        todo!()
    }
}

/// A `Statistics` implementation that holds nothing and computes nothing.
///
/// Derives the common traits so this public, zero-sized type can be printed in
/// diagnostics, copied freely and constructed via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct EmptyStatistics;

impl Statistics for EmptyStatistics {
    /// Always succeeds with `None`; no statistic is ever computed.
    fn compute(&self, _stat: &Stat) -> VortexResult<Option<Scalar>> {
        Ok(None)
    }

    /// Always returns `None`; no statistic is ever stored.
    fn get(&self, _stat: &Stat) -> Option<Scalar> {
        None
    }
}
6 changes: 6 additions & 0 deletions vortex-array2/src/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use vortex_error::{vortex_bail, vortex_err, VortexResult};
use vortex_schema::DType;

use crate::encoding::EncodingRef;
use crate::stats::{EmptyStatistics, Statistics};
use crate::{Array, IntoArray, ToArray};
use crate::{ArrayParts, SerdeContext};

Expand Down Expand Up @@ -169,4 +170,9 @@ impl ArrayParts for ArrayView<'_> {
fn nchildren(&self) -> usize {
self.array.children().map(|c| c.len()).unwrap_or_default()
}

/// Returns the statistics handle for this view. For now this is always
/// `EmptyStatistics`, pending the serialization work in the TODO below.
fn statistics(&self) -> &dyn Statistics {
// TODO(ngates): serialize statistics into fb::Array
&EmptyStatistics
}
}

0 comments on commit 768d28c

Please sign in to comment.