Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into dk/pyvortex-filter
Browse files Browse the repository at this point in the history
  • Loading branch information
danking committed Oct 21, 2024
2 parents e92ed90 + 008b3a7 commit ef0cac2
Show file tree
Hide file tree
Showing 31 changed files with 1,018 additions and 1,032 deletions.
194 changes: 97 additions & 97 deletions Cargo.lock

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion encodings/dict/src/compute.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use vortex::compute::unary::{scalar_at, scalar_at_unchecked, ScalarAtFn};
use vortex::compute::{slice, take, ArrayCompute, SliceFn, TakeFn};
use vortex::compute::{filter, slice, take, ArrayCompute, FilterFn, SliceFn, TakeFn};
use vortex::{Array, IntoArray};
use vortex_error::{VortexExpect, VortexResult};
use vortex_scalar::Scalar;
Expand All @@ -18,6 +18,10 @@ impl ArrayCompute for DictArray {
/// Returns this array as its own [`TakeFn`] implementation.
fn take(&self) -> Option<&dyn TakeFn> {
// `DictArray` implements `TakeFn` directly, so `self` is handed back as the trait object.
Some(self)
}

/// Returns this array as its own [`FilterFn`] implementation.
fn filter(&self) -> Option<&dyn FilterFn> {
// `DictArray` implements `FilterFn` directly, so `self` is handed back as the trait object.
Some(self)
}
}

impl ScalarAtFn for DictArray {
Expand Down Expand Up @@ -46,6 +50,13 @@ impl TakeFn for DictArray {
}
}

impl FilterFn for DictArray {
    /// Filters the dictionary codes by `predicate` and wraps the result in a new
    /// `DictArray` that keeps the current values as they are.
    ///
    /// # Errors
    ///
    /// Propagates any error from filtering the codes or from constructing the
    /// new `DictArray`.
    fn filter(&self, predicate: &Array) -> VortexResult<Array> {
        // Only the codes are filtered; the values are passed through untouched.
        let filtered_codes = filter(self.codes(), predicate)?;
        let dict = Self::try_new(filtered_codes, self.values())?;
        Ok(dict.into_array())
    }
}

impl SliceFn for DictArray {
// TODO(robert): Add function to trim the dictionary
fn slice(&self, start: usize, stop: usize) -> VortexResult<Array> {
Expand Down
45 changes: 45 additions & 0 deletions pyvortex/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::{IntoPyDict, PyList};
use vortex::array::ChunkedArray;
use vortex::compute::unary::fill_forward;
use vortex::compute::{slice, take};
use vortex::{Array, ArrayDType, IntoCanonical};

Expand Down Expand Up @@ -171,6 +172,50 @@ impl PyArray {
.map(|arr| PyArray { inner: arr })
}

/// Fill forward non-null values over runs of nulls.
///
/// Leading nulls are replaced with the "zero" for that type. For integral and floating-point
/// types, this is zero. For the Boolean type, this is :obj:`False`.
///
/// Fill forward sensor values over intermediate missing values. Note that leading nulls are
/// replaced with 0.0:
///
/// >>> a = vortex.encoding.array([
/// ... None, None, 30.29, 30.30, 30.30, None, None, 30.27, 30.25,
/// ... 30.22, None, None, None, None, 30.12, 30.11, 30.11, 30.11,
/// ... 30.10, 30.08, None, 30.21, 30.03, 30.03, 30.05, 30.07, 30.07,
/// ... ])
/// >>> a.fill_forward().to_arrow_array()
/// <pyarrow.lib.DoubleArray object at ...>
/// [
/// 0,
/// 0,
/// 30.29,
/// 30.3,
/// 30.3,
/// 30.3,
/// 30.3,
/// 30.27,
/// 30.25,
/// 30.22,
/// ...
/// 30.11,
/// 30.1,
/// 30.08,
/// 30.08,
/// 30.21,
/// 30.03,
/// 30.03,
/// 30.05,
/// 30.07,
/// 30.07
/// ]
fn fill_forward(&self) -> PyResult<PyArray> {
    // Run the compute kernel on the wrapped array, translating a Vortex error
    // into the Python-facing error type before propagating it.
    let filled = fill_forward(&self.inner).map_err(PyVortexError::map_err)?;
    Ok(PyArray { inner: filled })
}

/// Filter, permute, and/or repeat elements by their index.
///
/// Parameters
Expand Down
2 changes: 2 additions & 0 deletions vortex-array/src/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#[cfg(test)]
mod assertions;

mod bool;
mod chunked;
mod constant;
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/arrow/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ pub fn infer_schema(dtype: &DType) -> VortexResult<Schema> {
Ok(builder.finish())
}

/// Try to convert a Vortex [`DType`] into an Arrow [`DataType`].
pub fn infer_data_type(dtype: &DType) -> VortexResult<DataType> {
Ok(match dtype {
DType::Null => DataType::Null,
Expand Down
2 changes: 2 additions & 0 deletions vortex-array/src/arrow/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Utilities to work with `Arrow` data and types
use vortex_error::VortexResult;

pub use crate::arrow::dtype::{infer_data_type, infer_schema};
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::array::{
};
use crate::encoding::EncodingRef;

/// A mapping from an encoding's ID to an [`EncodingRef`], used to have a shared view of all available encoding schemes.
#[derive(Debug, Clone)]
pub struct Context {
encodings: HashMap<u16, EncodingRef>,
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::encoding::EncodingRef;
use crate::stats::{Stat, Statistics, StatsSet};
use crate::{Array, ArrayDType, ArrayMetadata, ToArray};

/// Owned [`Array`] with serialized metadata, backed by heap-allocated memory.
#[derive(Clone, Debug)]
pub struct ArrayData {
encoding: EncodingRef,
Expand Down
3 changes: 3 additions & 0 deletions vortex-array/src/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Traits and types to define shared unique encoding identifiers
use std::fmt::{Debug, Display, Formatter};
use std::hash::{Hash, Hasher};

Expand All @@ -7,6 +9,7 @@ use crate::canonical::{Canonical, IntoCanonical};
use crate::{Array, ArrayDef, ArrayTrait};

// TODO(robert): Outline how you create a well known encoding id

/// EncodingId is a unique name and numerical code of the array
///
/// 0x0000 - reserved marker encoding
Expand Down
4 changes: 4 additions & 0 deletions vortex-array/src/implementation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ pub trait ArrayDef {
type Encoding: ArrayEncoding + ArrayEncodingExt<D = Self>;
}

/// Macro to generate all the necessary code for a new type of array encoding, including:
/// 1. New Array type that implements `AsRef<Array>`, `GetArrayMetadata`, `ToArray`, `IntoArray`, and multiple useful `From`/`TryFrom` implementations.
/// 1. New Encoding type that implements `ArrayEncoding`.
/// 1. New metadata type that implements `ArrayMetadata`.
#[macro_export]
macro_rules! impl_encoding {
($id:literal, $code:expr, $Name:ident) => {
Expand Down
13 changes: 11 additions & 2 deletions vortex-array/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
use std::fmt::{Debug, Display, Formatter};
use std::future::ready;

pub use ::paste;
pub use canonical::*;
pub use context::*;
pub use data::*;
Expand Down Expand Up @@ -62,9 +61,14 @@ pub mod flatbuffers {
pub use vortex_flatbuffers::array::*;
}

/// A central type for all Vortex arrays, which are known length sequences of compressed data.
///
/// This is the main entrypoint for working with in-memory Vortex data, and dispatches work over the underlying encoding or memory representations.
#[derive(Debug, Clone)]
pub enum Array {
/// Owned [`Array`] with serialized metadata, backed by heap-allocated memory.
Data(ArrayData),
/// Zero-copy view over flatbuffer-encoded [`Array`] data, created without eager serialization.
View(ArrayView),
}

Expand All @@ -76,6 +80,7 @@ impl Array {
}
}

/// Returns the number of logical elements in the array.
#[allow(clippy::same_name_method)]
pub fn len(&self) -> usize {
match self {
Expand All @@ -91,6 +96,7 @@ impl Array {
}
}

/// Total size of the array in bytes, including all children and buffers.
pub fn nbytes(&self) -> usize {
self.with_dyn(|a| a.nbytes())
}
Expand All @@ -102,13 +108,15 @@ impl Array {
}
}

/// Returns a `Vec` containing all of this array's child arrays.
pub fn children(&self) -> Vec<Array> {
match self {
// Owned data: clone each child out of the borrowed collection.
Array::Data(d) => d.children().iter().cloned().collect_vec(),
// View: its `children()` already yields an owned `Vec<Array>`.
Array::View(v) => v.children(),
}
}

/// Returns the number of child arrays
pub fn nchildren(&self) -> usize {
match self {
Self::Data(d) => d.nchildren(),
Expand Down Expand Up @@ -174,7 +182,7 @@ impl Array {
)
}

/// Checks whether array is of given encoding
/// Checks whether array is of a given encoding.
pub fn is_encoding(&self, id: EncodingId) -> bool {
self.encoding().id() == id
}
Expand Down Expand Up @@ -270,6 +278,7 @@ pub trait ArrayTrait:
+ ArrayStatisticsCompute
+ ToArrayData
{
/// Total size of the array in bytes, including all children and buffers.
fn nbytes(&self) -> usize {
let mut visitor = NBytesVisitor(0);
self.accept(&mut visitor)
Expand Down
23 changes: 16 additions & 7 deletions vortex-array/src/validity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,23 @@ impl ValidityMetadata {
}
}

/// Validity (null-ness) information for an array.
#[derive(Clone, Debug)]
pub enum Validity {
/// Items *can't* be null
NonNullable,
/// All items are valid
AllValid,
/// All items are null
AllInvalid,
/// Validity is specified per item by the wrapped array.
/// NOTE(review): presumably a boolean array in which `true` marks a valid item — confirm.
Array(Array),
}

impl Validity {
/// The [`DType`] of the underlying validity array (if it exists).
pub const DTYPE: DType = DType::Bool(Nullability::NonNullable);

/// Consumes the validity, returning the wrapped array for [`Validity::Array`] and `None` for every other variant.
pub fn into_array(self) -> Option<Array> {
match self {
Self::Array(a) => Some(a),
_ => None,
}
}

pub fn to_metadata(&self, length: usize) -> VortexResult<ValidityMetadata> {
match self {
Self::NonNullable => Ok(ValidityMetadata::NonNullable),
Expand All @@ -85,6 +84,15 @@ impl Validity {
}
}

/// Consumes the validity and yields the wrapped array when this is
/// [`Validity::Array`]; every other variant yields `None`.
pub fn into_array(self) -> Option<Array> {
    if let Self::Array(validity) = self {
        Some(validity)
    } else {
        None
    }
}

/// If Validity is [`Validity::Array`], returns a reference to the array, otherwise returns `None`.
pub fn as_array(&self) -> Option<&Array> {
match self {
Self::Array(a) => Some(a),
Expand All @@ -99,6 +107,7 @@ impl Validity {
}
}

/// Returns whether the `index` item is valid.
#[inline]
pub fn is_valid(&self, index: usize) -> bool {
match self {
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::stats::{Stat, Statistics, StatsSet};
use crate::visitor::ArrayVisitor;
use crate::{flatbuffers as fb, Array, Context, IntoArray, ToArray};

/// Zero-copy view over flatbuffer-encoded array data, created without eager serialization.
#[derive(Clone)]
pub struct ArrayView {
encoding: EncodingRef,
Expand Down
1 change: 1 addition & 0 deletions vortex-dtype/src/nullability.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::fmt::{Display, Formatter};

/// Whether an item can contain a null value or not
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub enum Nullability {
#[default]
Expand Down
3 changes: 3 additions & 0 deletions vortex-dtype/src/ptype.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Physical type definitions and behavior.
use std::cmp::Ordering;
use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;
Expand All @@ -11,6 +13,7 @@ use crate::nullability::Nullability::NonNullable;
use crate::DType;
use crate::DType::*;

/// Physical type enum. Describes the in-memory physical layout, which may be used to represent a different logical type.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "lowercase"))]
Expand Down
26 changes: 10 additions & 16 deletions vortex-flatbuffers/flatbuffers/vortex-serde/footer.fbs
Original file line number Diff line number Diff line change
@@ -1,24 +1,18 @@
table FlatLayout {
begin: uint64;
end: uint64;
}

table NestedLayout {
table Layout {
children: [Layout];
encoding: uint16;
}

union LayoutVariant {
FlatLayout,
NestedLayout,
}

table Layout {
layout: LayoutVariant;
metadata: [ubyte];
}

table Footer {
layout: Layout;
row_count: uint64;
}

table Postscript {
schema_offset: uint64;
footer_offset: uint64;
}

root_type Footer;
root_type Postscript;
root_type Footer;
Loading

0 comments on commit ef0cac2

Please sign in to comment.