From 8282118f7b2f24aea030a639d75a7ba690d41cae Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 5 Nov 2024 08:54:33 -0500 Subject: [PATCH] refactor: move Array enum out of public interface (#1212) Makes `Array` an opaque struct, containing a (private) inner enum with our two current variants. In the future we can make changes to the inner enum without a breaking API change. The other place this would be useful is for `Buffer`. If folks like this change can make that one as a follow up --- vortex-array/src/data.rs | 21 ---- vortex-array/src/implementation.rs | 45 +------- vortex-array/src/lib.rs | 164 +++++++++++++++++++++++------ vortex-array/src/typed.rs | 15 +-- vortex-array/src/view.rs | 15 +-- vortex-serde/src/messages.rs | 20 +--- 6 files changed, 151 insertions(+), 129 deletions(-) diff --git a/vortex-array/src/data.rs b/vortex-array/src/data.rs index 9deb5979cb..041d2f2540 100644 --- a/vortex-array/src/data.rs +++ b/vortex-array/src/data.rs @@ -116,27 +116,6 @@ impl ArrayData { } } -impl ToArray for ArrayData { - fn to_array(&self) -> Array { - Array::Data(self.clone()) - } -} - -impl From for ArrayData { - fn from(value: Array) -> ArrayData { - match value { - Array::Data(d) => d, - Array::View(_) => value.with_dyn(|v| v.to_array_data()), - } - } -} - -impl From for Array { - fn from(value: ArrayData) -> Array { - Array::Data(value) - } -} - impl Statistics for ArrayData { fn get(&self, stat: Stat) -> Option { self.stats_map diff --git a/vortex-array/src/implementation.rs b/vortex-array/src/implementation.rs index b845762b9f..dbb21b7072 100644 --- a/vortex-array/src/implementation.rs +++ b/vortex-array/src/implementation.rs @@ -1,14 +1,13 @@ //! The core Vortex macro to create new encodings and array types. use vortex_buffer::Buffer; -use vortex_dtype::DType; use vortex_error::{vortex_bail, VortexError, VortexExpect as _, VortexResult}; use crate::array::visitor::ArrayVisitor; use crate::encoding::{ArrayEncoding, ArrayEncodingExt, ArrayEncodingRef, EncodingId, EncodingRef}; -use crate::stats::{ArrayStatistics, Statistics}; +use crate::stats::ArrayStatistics; use crate::{ - Array, ArrayDType, ArrayData, ArrayLen, ArrayMetadata, ArrayTrait, GetArrayMetadata, IntoArray, + Array, ArrayDType, ArrayData, ArrayMetadata, ArrayTrait, GetArrayMetadata, Inner, IntoArray, ToArrayData, TryDeserializeArrayMetadata, }; @@ -169,49 +168,15 @@ impl> ArrayEncodingRef for T { } } -impl> ArrayDType for T { - fn dtype(&self) -> &DType { - match self.as_ref() { - Array::Data(d) => d.dtype(), - Array::View(v) => v.dtype(), - } - } -} - -impl> ArrayLen for T { - fn len(&self) -> usize { - match self.as_ref() { - Array::Data(d) => d.len(), - Array::View(v) => v.len(), - } - } - - fn is_empty(&self) -> bool { - match self.as_ref() { - Array::Data(d) => d.is_empty(), - Array::View(v) => v.is_empty(), - } - } -} - -impl> ArrayStatistics for T { - fn statistics(&self) -> &(dyn Statistics + '_) { - match self.as_ref() { - Array::Data(d) => d.statistics(), - Array::View(v) => v.statistics(), - } - } -} - impl ToArrayData for D where D: IntoArray + ArrayEncodingRef + ArrayStatistics + GetArrayMetadata + Clone, { fn to_array_data(&self) -> ArrayData { let array = self.clone().into_array(); - match array { - Array::Data(d) => d, - Array::View(ref view) => { + match array.0 { + Inner::Data(d) => d, + Inner::View(ref view) => { struct Visitor { buffer: Option, children: Vec, diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 2d554b99cc..45054e5ddf 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -9,6 +9,7 @@ //! arrays can be [canonicalized](Canonical) into for ease of access in compute functions. //! +use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter}; use std::future::ready; @@ -19,11 +20,12 @@ pub use implementation::*; use itertools::Itertools; pub use metadata::*; pub use paste; +use stats::Statistics; pub use typed::*; pub use view::*; use vortex_buffer::Buffer; use vortex_dtype::DType; -use vortex_error::{vortex_panic, VortexExpect, VortexResult}; +use vortex_error::{vortex_err, vortex_panic, VortexExpect, VortexResult}; use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::compute::ArrayCompute; @@ -65,7 +67,10 @@ pub mod flatbuffers { /// /// This is the main entrypoint for working with in-memory Vortex data, and dispatches work over the underlying encoding or memory representations. #[derive(Debug, Clone)] -pub enum Array { +pub struct Array(pub(crate) Inner); + +#[derive(Debug, Clone)] +pub(crate) enum Inner { /// Owned [`Array`] with serialized metadata, backed by heap-allocated memory. Data(ArrayData), /// Zero-copy view over flatbuffer-encoded [`Array`] data, created without eager serialization. @@ -74,25 +79,25 @@ pub enum Array { impl Array { pub fn encoding(&self) -> EncodingRef { - match self { - Self::Data(d) => d.encoding(), - Self::View(v) => v.encoding(), + match &self.0 { + Inner::Data(d) => d.encoding(), + Inner::View(v) => v.encoding(), } } /// Returns the number of logical elements in the array. #[allow(clippy::same_name_method)] pub fn len(&self) -> usize { - match self { - Self::Data(d) => d.len(), - Self::View(v) => v.len(), + match &self.0 { + Inner::Data(d) => d.len(), + Inner::View(v) => v.len(), } } pub fn is_empty(&self) -> bool { - match self { - Self::Data(d) => d.is_empty(), - Self::View(v) => v.is_empty(), + match &self.0 { + Inner::Data(d) => d.is_empty(), + Inner::View(v) => v.is_empty(), } } @@ -102,25 +107,27 @@ impl Array { } pub fn child<'a>(&'a self, idx: usize, dtype: &'a DType, len: usize) -> VortexResult { - match self { - Self::Data(d) => d.child(idx, dtype, len).cloned(), - Self::View(v) => v.child(idx, dtype, len).map(Array::View), + match &self.0 { + Inner::Data(d) => d.child(idx, dtype, len).cloned(), + Inner::View(v) => v + .child(idx, dtype, len) + .map(|view| Array(Inner::View(view))), } } /// Returns a Vec of Arrays with all of the array's child arrays. pub fn children(&self) -> Vec { - match self { - Array::Data(d) => d.children().iter().cloned().collect_vec(), - Array::View(v) => v.children(), + match &self.0 { + Inner::Data(d) => d.children().iter().cloned().collect_vec(), + Inner::View(v) => v.children(), } } /// Returns the number of child arrays pub fn nchildren(&self) -> usize { - match self { - Self::Data(d) => d.nchildren(), - Self::View(v) => v.nchildren(), + match &self.0 { + Inner::Data(d) => d.nchildren(), + Inner::View(v) => v.nchildren(), } } @@ -157,17 +164,43 @@ impl Array { offsets } + /// Get back the (possibly owned) metadata for the array. + /// + /// View arrays will return a reference to their bytes, while heap-backed arrays + /// must first serialize their metadata, returning an owned byte array to the caller. + pub fn metadata(&self) -> VortexResult> { + match &self.0 { + Inner::Data(array_data) => { + // Heap-backed arrays must first try and serialize the metadata. + let owned_meta: Vec = array_data + .metadata() + .try_serialize_metadata()? + .as_ref() + .to_owned(); + + Ok(Cow::Owned(owned_meta)) + } + Inner::View(array_view) => { + // View arrays have direct access to metadata bytes. + array_view + .metadata() + .ok_or_else(|| vortex_err!("things")) + .map(Cow::Borrowed) + } + } + } + pub fn buffer(&self) -> Option<&Buffer> { - match self { - Self::Data(d) => d.buffer(), - Self::View(v) => v.buffer(), + match &self.0 { + Inner::Data(d) => d.buffer(), + Inner::View(v) => v.buffer(), } } pub fn into_buffer(self) -> Option { - match self { - Self::Data(d) => d.into_buffer(), - Self::View(v) => v.buffer().cloned(), + match self.0 { + Inner::Data(d) => d.into_buffer(), + Inner::View(v) => v.buffer().cloned(), } } @@ -295,12 +328,37 @@ pub trait ArrayDType { fn dtype(&self) -> &DType; } +impl> ArrayDType for T { + fn dtype(&self) -> &DType { + match &self.as_ref().0 { + Inner::Data(array_data) => array_data.dtype(), + Inner::View(array_view) => array_view.dtype(), + } + } +} + pub trait ArrayLen { fn len(&self) -> usize; fn is_empty(&self) -> bool; } +impl> ArrayLen for T { + fn len(&self) -> usize { + match &self.as_ref().0 { + Inner::Data(d) => d.len(), + Inner::View(v) => v.len(), + } + } + + fn is_empty(&self) -> bool { + match &self.as_ref().0 { + Inner::Data(d) => d.is_empty(), + Inner::View(v) => v.is_empty(), + } + } +} + struct NBytesVisitor(usize); impl ArrayVisitor for NBytesVisitor { @@ -317,9 +375,9 @@ impl ArrayVisitor for NBytesVisitor { impl Display for Array { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let prefix = match self { - Self::Data(_) => "", - Self::View(_) => "$", + let prefix = match &self.0 { + Inner::Data(_) => "", + Inner::View(_) => "$", }; write!( f, @@ -334,9 +392,51 @@ impl Display for Array { impl ToArrayData for Array { fn to_array_data(&self) -> ArrayData { - match self { - Self::Data(d) => d.clone(), - Self::View(_) => self.with_dyn(|a| a.to_array_data()), + match &self.0 { + Inner::Data(d) => d.clone(), + Inner::View(_) => self.with_dyn(|a| a.to_array_data()), + } + } +} + +impl ToArray for ArrayData { + fn to_array(&self) -> Array { + Array(Inner::Data(self.clone())) + } +} + +impl ToArray for ArrayView { + fn to_array(&self) -> Array { + Array(Inner::View(self.clone())) + } +} + +impl IntoArray for ArrayView { + fn into_array(self) -> Array { + Array(Inner::View(self)) + } +} + +impl From for ArrayData { + fn from(value: Array) -> ArrayData { + match value.0 { + Inner::Data(d) => d, + Inner::View(_) => value.with_dyn(|v| v.to_array_data()), + } + } +} + +impl From for Array { + fn from(value: ArrayData) -> Array { + Array(Inner::Data(value)) + } +} + +impl> ArrayStatistics for T { + fn statistics(&self) -> &(dyn Statistics + '_) { + match &self.as_ref().0 { + Inner::Data(d) => d.statistics(), + Inner::View(v) => v.statistics(), } } } diff --git a/vortex-array/src/typed.rs b/vortex-array/src/typed.rs index 4b9594f8b0..c152203b95 100644 --- a/vortex-array/src/typed.rs +++ b/vortex-array/src/typed.rs @@ -5,7 +5,7 @@ use vortex_dtype::DType; use vortex_error::{vortex_bail, vortex_panic, VortexError, VortexResult}; use crate::stats::StatsSet; -use crate::{Array, ArrayData, ArrayDef, IntoArray, ToArray, TryDeserializeArrayMetadata}; +use crate::{Array, ArrayData, ArrayDef, Inner, IntoArray, ToArray, TryDeserializeArrayMetadata}; /// Container for an array with all the associated implementation type information (encoding reference and ID, actual array type, metadata type). #[derive(Debug, Clone)] @@ -23,7 +23,7 @@ impl TypedArray { children: Arc<[Array]>, stats: StatsSet, ) -> VortexResult { - let array = Array::Data(ArrayData::try_new( + let array = ArrayData::try_new( D::ENCODING, dtype, len, @@ -31,7 +31,8 @@ impl TypedArray { buffer, children, stats, - )?); + )? + .into(); Ok(Self { array, lazy_metadata: OnceLock::new(), @@ -39,8 +40,8 @@ impl TypedArray { } pub fn metadata(&self) -> &D::Metadata { - match &self.array { - Array::Data(d) => d + match &self.array.0 { + Inner::Data(d) => d .metadata() .as_any() .downcast_ref::() @@ -52,12 +53,12 @@ impl TypedArray { D::ENCODING.id().as_ref(), ) }), - Array::View(v) => self + Inner::View(v) => self .lazy_metadata .get_or_init(|| { D::Metadata::try_deserialize_metadata(v.metadata()).unwrap_or_else(|err| { vortex_panic!( - "Failed to deserialize ArrayView metadata for typed array with ID {} and encoding {}: {}", + "Failed to deserialize ArrayView metadata for typed array with ID {} and encoding {}: {}", D::ID.as_ref(), D::ENCODING.id().as_ref(), err diff --git a/vortex-array/src/view.rs b/vortex-array/src/view.rs index 055502e87d..d668bd2c53 100644 --- a/vortex-array/src/view.rs +++ b/vortex-array/src/view.rs @@ -149,7 +149,8 @@ impl ArrayView { pub fn children(&self) -> Vec { let mut collector = ChildrenCollector::default(); - Array::View(self.clone()) + self.clone() + .into_array() .with_dyn(|a| a.accept(&mut collector)) .vortex_expect("Failed to get children"); collector.children @@ -262,15 +263,3 @@ impl Statistics for ArrayView { .cloned() } } - -impl ToArray for ArrayView { - fn to_array(&self) -> Array { - Array::View(self.clone()) - } -} - -impl IntoArray for ArrayView { - fn into_array(self) -> Array { - Array::View(self) - } -} diff --git a/vortex-serde/src/messages.rs b/vortex-serde/src/messages.rs index 80cb7b82db..6e368506fa 100644 --- a/vortex-serde/src/messages.rs +++ b/vortex-serde/src/messages.rs @@ -111,22 +111,10 @@ impl WriteFlatBuffer for IPCArray<'_> { let column_data = self.0; let encoding = column_data.encoding().id().code(); - let metadata = match column_data { - Array::Data(d) => { - let metadata = d - .metadata() - .try_serialize_metadata() - // TODO(ngates): should we serialize externally to here? - .vortex_expect("ArrayData is missing metadata during serialization"); - Some(fbb.create_vector(metadata.as_ref())) - } - Array::View(v) => Some( - fbb.create_vector( - v.metadata() - .vortex_expect("ArrayView is missing metadata during serialization"), - ), - ), - }; + let metadata = column_data + .metadata() + .vortex_expect("IPCArray is missing metadata during serialization"); + let metadata = Some(fbb.create_vector(metadata.as_ref())); // Assign buffer indices for all child arrays. // The second tuple element holds the buffer_index for this Array subtree. If this array