From 0cf515810d45182d81e961c5437aaf789bab1350 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 8 Apr 2024 12:49:17 +0100 Subject: [PATCH] Struct Array --- Cargo.lock | 1 + vortex-array2/src/array/bool/mod.rs | 5 +- vortex-array2/src/array/mod.rs | 1 + vortex-array2/src/array/primitive/mod.rs | 2 +- vortex-array2/src/array/ree/mod.rs | 9 +- vortex-array2/src/array/struct/compute.rs | 17 ++++ vortex-array2/src/array/struct/mod.rs | 115 ++++++++++++++++++++++ vortex-array2/src/batch.rs | 34 +++++++ vortex-array2/src/context.rs | 2 + vortex-array2/src/data.rs | 16 +-- vortex-array2/src/lib.rs | 18 ++-- vortex-array2/src/tree.rs | 8 +- vortex-array2/src/view.rs | 22 +++-- vortex-array2/src/visitor.rs | 11 ++- vortex-ipc/Cargo.toml | 1 + 15 files changed, 224 insertions(+), 38 deletions(-) create mode 100644 vortex-array2/src/array/struct/compute.rs create mode 100644 vortex-array2/src/array/struct/mod.rs create mode 100644 vortex-array2/src/batch.rs diff --git a/Cargo.lock b/Cargo.lock index c7cd06bf66..553133de9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5514,6 +5514,7 @@ dependencies = [ "nougat", "streaming-iterator", "vortex-array", + "vortex-array2", "vortex-error", "vortex-flatbuffers", "vortex-schema", diff --git a/vortex-array2/src/array/bool/mod.rs b/vortex-array2/src/array/bool/mod.rs index 1da7a66379..fa7663e35b 100644 --- a/vortex-array2/src/array/bool/mod.rs +++ b/vortex-array2/src/array/bool/mod.rs @@ -52,10 +52,7 @@ impl BoolArray<'_> { } impl<'v> TryFromArrayParts<'v, BoolMetadata> for BoolArray<'v> { - fn try_from_parts( - parts: &'v dyn ArrayParts<'v>, - metadata: &'v BoolMetadata, - ) -> VortexResult { + fn try_from_parts(parts: &'v dyn ArrayParts, metadata: &'v BoolMetadata) -> VortexResult { Ok(BoolArray { dtype: parts.dtype(), buffer: parts diff --git a/vortex-array2/src/array/mod.rs b/vortex-array2/src/array/mod.rs index b125b0cc74..8bb4a1d37a 100644 --- a/vortex-array2/src/array/mod.rs +++ b/vortex-array2/src/array/mod.rs @@ -1,3 +1,4 @@ pub mod bool; pub mod primitive; pub mod ree; +pub mod r#struct; diff --git a/vortex-array2/src/array/primitive/mod.rs b/vortex-array2/src/array/primitive/mod.rs index ee73ef3624..df103fb913 100644 --- a/vortex-array2/src/array/primitive/mod.rs +++ b/vortex-array2/src/array/primitive/mod.rs @@ -44,7 +44,7 @@ impl PrimitiveArray<'_> { impl<'a> TryFromArrayParts<'a, PrimitiveMetadata> for PrimitiveArray<'a> { fn try_from_parts( - parts: &'a dyn ArrayParts<'a>, + parts: &'a dyn ArrayParts, metadata: &'a PrimitiveMetadata, ) -> VortexResult { let buffer = parts.buffer(0).unwrap(); diff --git a/vortex-array2/src/array/ree/mod.rs b/vortex-array2/src/array/ree/mod.rs index 1a2e5a3b9b..30c50087fe 100644 --- a/vortex-array2/src/array/ree/mod.rs +++ b/vortex-array2/src/array/ree/mod.rs @@ -55,10 +55,7 @@ impl REEData { } impl<'v> TryFromArrayParts<'v, REEMetadata> for REEArray<'v> { - fn try_from_parts( - parts: &'v dyn ArrayParts<'v>, - metadata: &'v REEMetadata, - ) -> VortexResult { + fn try_from_parts(parts: &'v dyn ArrayParts, metadata: &'v REEMetadata) -> VortexResult { Ok(REEArray { dtype: parts.dtype(), values: parts @@ -96,7 +93,7 @@ impl ToArrayData for REEArray<'_> { impl AcceptArrayVisitor for REEArray<'_> { fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_array("values", self.values())?; - visitor.visit_array("run_ends", self.run_ends()) + visitor.visit_child("values", self.values())?; + visitor.visit_child("run_ends", self.run_ends()) } } diff --git a/vortex-array2/src/array/struct/compute.rs b/vortex-array2/src/array/struct/compute.rs new file mode 100644 index 0000000000..83ff492373 --- /dev/null +++ b/vortex-array2/src/array/struct/compute.rs @@ -0,0 +1,17 @@ +use vortex::scalar::Scalar; +use vortex_error::VortexResult; + +use crate::array::r#struct::StructArray; +use crate::compute::{ArrayCompute, ScalarAtFn}; + +impl ArrayCompute for StructArray<'_> { + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } +} + +impl ScalarAtFn for StructArray<'_> { + fn scalar_at(&self, _index: usize) -> VortexResult { + todo!() + } +} diff --git a/vortex-array2/src/array/struct/mod.rs b/vortex-array2/src/array/struct/mod.rs new file mode 100644 index 0000000000..064bea8b77 --- /dev/null +++ b/vortex-array2/src/array/struct/mod.rs @@ -0,0 +1,115 @@ +mod compute; + +use serde::{Deserialize, Serialize}; +use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::{DType, FieldNames}; + +use crate::impl_encoding; +use crate::validity::ArrayValidity; +use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use crate::{Array, ArrayMetadata}; +use crate::{ArrayData, TypedArrayData}; +use crate::{ArrayView, ToArrayData}; + +impl_encoding!("vortex.struct", Struct); + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StructMetadata { + length: usize, +} + +#[derive(Clone)] +pub struct StructArray<'a> { + dtype: &'a DType, + // Note(ngates): for arrays with variable-length children, we don't want to + // allocate a Vec, so instead we defer child access by storing a reference to the parts. + parts: &'a dyn ArrayParts, + length: usize, +} + +impl<'a> StructArray<'a> { + pub fn child(&'a self, idx: usize) -> Option> { + let DType::Struct(_, fields) = self.dtype() else { + unreachable!() + }; + let dtype = fields.get(idx)?; + self.parts.child(idx, dtype) + } + + pub fn names(&self) -> &FieldNames { + let DType::Struct(names, _fields) = self.dtype() else { + unreachable!() + }; + names + } + + pub fn fields(&self) -> &[DType] { + let DType::Struct(_names, fields) = self.dtype() else { + unreachable!() + }; + fields.as_slice() + } + + pub fn ncolumns(&self) -> usize { + self.fields().len() + } + + pub fn len(&self) -> usize { + self.length + } +} + +impl<'v> TryFromArrayParts<'v, StructMetadata> for StructArray<'v> { + fn try_from_parts( + parts: &'v dyn ArrayParts, + metadata: &'v StructMetadata, + ) -> VortexResult { + let DType::Struct(_names, dtypes) = parts.dtype() else { + unreachable!() + }; + if parts.nchildren() != dtypes.len() { + vortex_bail!( + "Expected {} children, found {}", + dtypes.len(), + parts.nchildren() + ); + } + Ok(StructArray { + dtype: parts.dtype(), + parts, + length: metadata.length, + }) + } +} + +impl ArrayTrait for StructArray<'_> { + fn dtype(&self) -> &DType { + self.dtype + } + + fn len(&self) -> usize { + self.length + } +} + +impl ArrayValidity for StructArray<'_> { + fn is_valid(&self, _index: usize) -> bool { + todo!() + } +} + +impl ToArrayData for StructArray<'_> { + fn to_array_data(&self) -> ArrayData { + todo!() + } +} + +impl AcceptArrayVisitor for StructArray<'_> { + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + for (idx, name) in self.names().iter().enumerate() { + let child = self.child(idx).unwrap(); + visitor.visit_column(name, &child)?; + } + Ok(()) + } +} diff --git a/vortex-array2/src/batch.rs b/vortex-array2/src/batch.rs new file mode 100644 index 0000000000..35a4dc59e8 --- /dev/null +++ b/vortex-array2/src/batch.rs @@ -0,0 +1,34 @@ +use arrow_buffer::Buffer; +use vortex::serde::data::ArrayData; +use vortex_error::VortexResult; +use vortex_schema::DType; + +use crate::visitor::ArrayVisitor; +use crate::Array; + +/// A column batch contains the flattened list of columns. +pub struct ColumnBatch { + dtype: DType, + columns: Vec>, +} + +pub struct ColumnBatchBuilder { + columns: Vec, +} + +impl ArrayVisitor for ColumnBatchBuilder { + fn visit_column(&mut self, _name: &str, _array: &Array) -> VortexResult<()> { + todo!() + } + + fn visit_child(&mut self, _name: &str, _array: &Array) -> VortexResult<()> { + // If the array is a struct, then pull out each column. + // But we can't do this in case some non-column child is a struct. + // Can we ask an array for column(idx)? Seems like a lot of work. + todo!() + } + + fn visit_buffer(&mut self, _buffer: &Buffer) -> VortexResult<()> { + todo!() + } +} diff --git a/vortex-array2/src/context.rs b/vortex-array2/src/context.rs index ea76216480..99ecf59091 100644 --- a/vortex-array2/src/context.rs +++ b/vortex-array2/src/context.rs @@ -4,6 +4,8 @@ use vortex::encoding::EncodingId; use crate::encoding::EncodingRef; +/// TODO(ngates): I'm not too sure about this construct. Where it should live, or what scope it +/// should have. #[derive(Debug)] pub struct SerdeContext { encodings: Arc<[EncodingRef]>, diff --git a/vortex-array2/src/data.rs b/vortex-array2/src/data.rs index 49cab7cbe6..aa4538dfb3 100644 --- a/vortex-array2/src/data.rs +++ b/vortex-array2/src/data.rs @@ -152,8 +152,8 @@ impl TryFrom for TypedArrayData { } } -impl ArrayParts<'_> for ArrayData { - fn dtype(&'_ self) -> &'_ DType { +impl ArrayParts for ArrayData { + fn dtype(&self) -> &DType { &self.dtype } @@ -162,11 +162,11 @@ impl ArrayParts<'_> for ArrayData { } fn child(&self, idx: usize, _dtype: &DType) -> Option { - self.child(idx).map(|a| { - let array = a.to_array(); - // FIXME(ngates): can we ask an array its dtype? - // assert_eq!(array.dtype(), dtype); - array - }) + // TODO(ngates): validate the DType + self.child(idx).map(move |a| a.to_array()) + } + + fn nchildren(&self) -> usize { + self.children.len() } } diff --git a/vortex-array2/src/lib.rs b/vortex-array2/src/lib.rs index f46d663e34..a24f3eef69 100644 --- a/vortex-array2/src/lib.rs +++ b/vortex-array2/src/lib.rs @@ -1,6 +1,7 @@ #![allow(dead_code)] pub mod array; +mod batch; pub mod compute; mod context; mod data; @@ -73,14 +74,15 @@ pub trait WithArray { fn with_array R>(&self, f: F) -> R; } -pub trait ArrayParts<'a> { - fn dtype(&'a self) -> &'a DType; - fn buffer(&'a self, idx: usize) -> Option<&'a Buffer>; - fn child(&'a self, idx: usize, dtype: &'a DType) -> Option>; +pub trait ArrayParts { + fn dtype(&self) -> &DType; + fn buffer(&self, idx: usize) -> Option<&Buffer>; + fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option; + fn nchildren(&self) -> usize; } pub trait TryFromArrayParts<'v, M: ArrayMetadata>: Sized + 'v { - fn try_from_parts(parts: &'v dyn ArrayParts<'v>, metadata: &'v M) -> VortexResult; + fn try_from_parts(parts: &'v dyn ArrayParts, metadata: &'v M) -> VortexResult; } /// Collects together the behaviour of an array. @@ -103,7 +105,11 @@ pub trait ArrayTrait: ArrayCompute + ArrayValidity + AcceptArrayVisitor + ToArra struct NBytesVisitor(usize); impl ArrayVisitor for NBytesVisitor { - fn visit_array(&mut self, _name: &str, array: &Array) -> VortexResult<()> { + fn visit_column(&mut self, name: &str, array: &Array) -> VortexResult<()> { + self.visit_child(name, array) + } + + fn visit_child(&mut self, _name: &str, array: &Array) -> VortexResult<()> { self.0 += array.with_array(|a| a.nbytes()); Ok(()) } diff --git a/vortex-array2/src/tree.rs b/vortex-array2/src/tree.rs index 02f2b3070c..764453d24c 100644 --- a/vortex-array2/src/tree.rs +++ b/vortex-array2/src/tree.rs @@ -27,7 +27,7 @@ impl<'a, 'fmt: 'a> fmt::Display for TreeDisplayWrapper<'a> { let nbytes = array.with_array(|a| a.nbytes()); let mut array_fmt = TreeFormatter::new(f, "".to_string(), nbytes); array_fmt - .visit_array("root", array) + .visit_child("root", array) .map_err(fmt::Error::custom) } } @@ -41,7 +41,11 @@ pub struct TreeFormatter<'a, 'b: 'a> { /// TODO(ngates): I think we want to go back to the old explicit style. It gives arrays more /// control over how their metadata etc is displayed. impl<'a, 'b: 'a> ArrayVisitor for TreeFormatter<'a, 'b> { - fn visit_array(&mut self, name: &str, array: &Array) -> VortexResult<()> { + fn visit_column(&mut self, name: &str, array: &Array) -> VortexResult<()> { + self.visit_child(name, array) + } + + fn visit_child(&mut self, name: &str, array: &Array) -> VortexResult<()> { array.with_array(|a| { let nbytes = a.nbytes(); writeln!( diff --git a/vortex-array2/src/view.rs b/vortex-array2/src/view.rs index ba8d3f1436..7c5e35228f 100644 --- a/vortex-array2/src/view.rs +++ b/vortex-array2/src/view.rs @@ -22,7 +22,7 @@ impl<'a> Debug for ArrayView<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("ArrayView") .field("encoding", &self.encoding) - .field("dtype", &self.dtype) + .field("dtype", self.dtype) // .field("array", &self.array) .field("buffers", &self.buffers) .field("ctx", &self.ctx) @@ -75,10 +75,10 @@ impl<'v> ArrayView<'v> { pub fn metadata(&self) -> Option<&'v [u8]> { self.array.metadata().map(|m| m.bytes()) } - - pub fn nchildren(&self) -> usize { - self.array.children().map(|c| c.len()).unwrap_or_default() - } + // + // pub fn nchildren(&self) -> usize { + // self.array.children().map(|c| c.len()).unwrap_or_default() + // } pub fn child(&self, idx: usize, dtype: &'v DType) -> Option> { let child = self.array_child(idx)?; @@ -146,16 +146,20 @@ impl<'v> IntoArray<'v> for ArrayView<'v> { } } -impl<'v> ArrayParts<'v> for ArrayView<'v> { - fn dtype(&'v self) -> &'v DType { +impl ArrayParts for ArrayView<'_> { + fn dtype(&self) -> &DType { self.dtype } - fn buffer(&'v self, idx: usize) -> Option<&'v Buffer> { + fn buffer(&self, idx: usize) -> Option<&Buffer> { self.buffers().get(idx) } - fn child(&'v self, idx: usize, dtype: &'v DType) -> Option> { + fn child<'a>(&'a self, idx: usize, dtype: &'a DType) -> Option { self.child(idx, dtype).map(|a| a.into_array()) } + + fn nchildren(&self) -> usize { + self.array.children().map(|c| c.len()).unwrap_or_default() + } } diff --git a/vortex-array2/src/visitor.rs b/vortex-array2/src/visitor.rs index ebe99ce9ed..75e24f1af4 100644 --- a/vortex-array2/src/visitor.rs +++ b/vortex-array2/src/visitor.rs @@ -10,13 +10,20 @@ pub trait AcceptArrayVisitor { // TODO(ngates): maybe we make this more like the inverse of TryFromParts? pub trait ArrayVisitor { - fn visit_array(&mut self, name: &str, array: &Array) -> VortexResult<()>; + /// Visit a child column of this array. + fn visit_column(&mut self, name: &str, array: &Array) -> VortexResult<()>; + + /// Visit a child of this array. + fn visit_child(&mut self, name: &str, array: &Array) -> VortexResult<()>; + + /// Utility for visiting Array validity. fn visit_validity(&mut self, validity: &Validity) -> VortexResult<()> { if let Some(v) = validity.array() { - self.visit_array("validity", v) + self.visit_child("validity", v) } else { Ok(()) } } + fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()>; } diff --git a/vortex-ipc/Cargo.toml b/vortex-ipc/Cargo.toml index f480a3377d..9d2bba8ebe 100644 --- a/vortex-ipc/Cargo.toml +++ b/vortex-ipc/Cargo.toml @@ -19,6 +19,7 @@ lending-iterator = "0.1.7" nougat = "0.2.4" streaming-iterator = "0.1.9" vortex-array = { path = "../vortex-array" } +vortex-array2 = { path = "../vortex-array2" } vortex-error = { path = "../vortex-error" } vortex-flatbuffers = { path = "../vortex-flatbuffers" } vortex-schema = { path = "../vortex-schema" }