diff --git a/Cargo.lock b/Cargo.lock
index 1592053d31..c7cd06bf66 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5427,6 +5427,7 @@ dependencies = [
  "flatbuffers",
  "flexbuffers",
  "half",
+ "humansize",
  "paste",
  "serde",
  "vortex-array",
diff --git a/vortex-array2/Cargo.toml b/vortex-array2/Cargo.toml
index 88031b6e05..30be8b8127 100644
--- a/vortex-array2/Cargo.toml
+++ b/vortex-array2/Cargo.toml
@@ -15,6 +15,7 @@ arrow-buffer = { workspace = true }
 flatbuffers = { workspace = true }
 flexbuffers = { workspace = true }
 half = { workspace = true }
+humansize = { workspace = true }
 paste = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
 vortex-array = { path = "../vortex-array", features = ["serde"] }
diff --git a/vortex-array2/src/array/bool/mod.rs b/vortex-array2/src/array/bool/mod.rs
index 8569c32865..1da7a66379 100644
--- a/vortex-array2/src/array/bool/mod.rs
+++ b/vortex-array2/src/array/bool/mod.rs
@@ -8,6 +8,7 @@ use vortex_schema::DType;
 use crate::impl_encoding;
 use crate::validity::Validity;
 use crate::validity::{ArrayValidity, ValidityMetadata};
+use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
 use crate::ArrayMetadata;
 use crate::{ArrayData, TypedArrayData};
 use crate::{ArrayView, ToArrayData};
@@ -109,6 +110,13 @@ impl ToArrayData for BoolArray<'_> {
     }
 }
 
+impl AcceptArrayVisitor for BoolArray<'_> {
+    fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
+        visitor.visit_buffer(self.buffer())?;
+        visitor.visit_validity(self.validity())
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::array::bool::BoolData;
diff --git a/vortex-array2/src/array/primitive/mod.rs b/vortex-array2/src/array/primitive/mod.rs
index 688426b143..ee73ef3624 100644
--- a/vortex-array2/src/array/primitive/mod.rs
+++ b/vortex-array2/src/array/primitive/mod.rs
@@ -1,6 +1,6 @@
 mod compute;
 
-use arrow_buffer::{Buffer, ScalarBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer, ScalarBuffer};
 use serde::{Deserialize, Serialize};
 use vortex::ptype::{NativePType, PType};
 use vortex_error::VortexResult;
@@ -8,6 +8,7 @@ use vortex_schema::DType;
 
 use crate::impl_encoding;
 use crate::validity::{ArrayValidity, Validity, ValidityMetadata};
+use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
 use crate::ArrayMetadata;
 use crate::{ArrayData, TypedArrayData};
 use crate::{ArrayView, ToArrayData};
@@ -71,6 +72,10 @@ impl PrimitiveData {
             .unwrap()
             .try_into()
     }
+
+    pub fn from_vec<T: NativePType + ArrowNativeType>(values: Vec<T>) -> Self {
+        Self::try_new(ScalarBuffer::from(values), Validity::NonNullable).unwrap()
+    }
 }
 
 impl ArrayTrait for PrimitiveArray<'_> {
@@ -94,3 +99,10 @@ impl ToArrayData for PrimitiveArray<'_> {
         todo!()
     }
 }
+
+impl AcceptArrayVisitor for PrimitiveArray<'_> {
+    fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
+        visitor.visit_buffer(self.buffer())?;
+        visitor.visit_validity(self.validity())
+    }
+}
diff --git a/vortex-array2/src/array/ree/mod.rs b/vortex-array2/src/array/ree/mod.rs
index 15f0dac472..1a2e5a3b9b 100644
--- a/vortex-array2/src/array/ree/mod.rs
+++ b/vortex-array2/src/array/ree/mod.rs
@@ -6,6 +6,7 @@ use vortex_schema::DType;
 
 use crate::impl_encoding;
 use crate::validity::ArrayValidity;
+use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
 use crate::{Array, ArrayMetadata};
 use crate::{ArrayData, TypedArrayData};
 use crate::{ArrayView, ToArrayData};
@@ -22,6 +23,17 @@ pub struct REEArray<'a> {
     dtype: &'a DType,
     values: Array<'a>,
     run_ends: Array<'a>,
+    length: usize,
+}
+
+impl REEArray<'_> {
+    pub fn values(&self) -> &Array {
+        &self.values
+    }
+
+    pub fn run_ends(&self) -> &Array {
+        &self.run_ends
+    }
 }
 
 impl REEData {
@@ -55,6 +67,7 @@ impl<'v> TryFromArrayParts<'v, REEMetadata> for REEArray<'v> {
             run_ends: parts
                 .child(1, &metadata.ends_dtype)
                 .ok_or_else(|| vortex_err!("REEArray missing run_ends"))?,
+            length: metadata.length,
         })
     }
 }
@@ -65,7 +78,7 @@ impl ArrayTrait for REEArray<'_> {
     }
 
     fn len(&self) -> usize {
-        todo!()
+        self.length
     }
 }
 
@@ -80,3 +93,10 @@ impl ToArrayData for REEArray<'_> {
         todo!()
     }
 }
+
+impl AcceptArrayVisitor for REEArray<'_> {
+    fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
+        visitor.visit_array("values", self.values())?;
+        visitor.visit_array("run_ends", self.run_ends())
+    }
+}
diff --git a/vortex-array2/src/encoding.rs b/vortex-array2/src/encoding.rs
index 57e4ced4f1..c1e77b0887 100644
--- a/vortex-array2/src/encoding.rs
+++ b/vortex-array2/src/encoding.rs
@@ -33,10 +33,10 @@ impl Debug for dyn ArrayEncoding + '_ {
 }
 
 impl dyn ArrayEncoding {
-    pub(crate) fn with_view<'v, R, F: Fn(&dyn ArrayTrait) -> R>(
+    pub(crate) fn with_view<'v, R, F: FnMut(&dyn ArrayTrait) -> R>(
         &self,
         view: &'v ArrayView<'v>,
-        f: F,
+        mut f: F,
     ) -> R {
         let mut result = None;
 
@@ -52,7 +52,11 @@ impl dyn ArrayEncoding {
         result.unwrap()
     }
 
-    pub(crate) fn with_data<R, F: Fn(&dyn ArrayTrait) -> R>(&self, data: &ArrayData, f: F) -> R {
+    pub(crate) fn with_data<R, F: FnMut(&dyn ArrayTrait) -> R>(
+        &self,
+        data: &ArrayData,
+        mut f: F,
+    ) -> R {
         let mut result = None;
 
         // Unwrap the result. This is safe since we validate that encoding against the
diff --git a/vortex-array2/src/lib.rs b/vortex-array2/src/lib.rs
index 1597e89015..f46d663e34 100644
--- a/vortex-array2/src/lib.rs
+++ b/vortex-array2/src/lib.rs
@@ -7,10 +7,12 @@ mod data;
 pub mod encoding;
 mod implementation;
 mod metadata;
+mod tree;
 mod validity;
 mod view;
+mod visitor;
 
-use std::fmt::Debug;
+use std::fmt::{Debug, Display, Formatter};
 
 use arrow_buffer::Buffer;
 pub use context::*;
@@ -22,7 +24,9 @@ use vortex_error::VortexResult;
 use vortex_schema::DType;
 
 use crate::compute::ArrayCompute;
+use crate::encoding::EncodingRef;
 use crate::validity::ArrayValidity;
+use crate::visitor::{AcceptArrayVisitor, ArrayVisitor};
 
 #[derive(Debug, Clone)]
 pub enum Array<'v> {
@@ -32,6 +36,14 @@ pub enum Array<'v> {
 }
 
 impl Array<'_> {
+    pub fn encoding(&self) -> EncodingRef {
+        match self {
+            Array::Data(d) => d.encoding(),
+            Array::DataRef(d) => d.encoding(),
+            Array::View(v) => v.encoding(),
+        }
+    }
+
     pub fn dtype(&self) -> &DType {
         match self {
             Array::Data(d) => d.dtype(),
@@ -39,6 +51,10 @@ impl Array<'_> {
             Array::View(v) => v.dtype(),
         }
     }
+
+    pub fn len(&self) -> usize {
+        self.with_array(|a| a.len())
+    }
 }
 
 pub trait ToArray {
@@ -54,7 +70,7 @@ pub trait ToArrayData {
 }
 
 pub trait WithArray {
-    fn with_array<R, F: Fn(&dyn ArrayTrait) -> R>(&self, f: F) -> R;
+    fn with_array<R, F: FnMut(&dyn ArrayTrait) -> R>(&self, f: F) -> R;
 }
 
 pub trait ArrayParts<'a> {
@@ -68,7 +84,7 @@ pub trait TryFromArrayParts<'v, M: ArrayMetadata>: Sized + 'v {
 }
 
 /// Collects together the behaviour of an array.
-pub trait ArrayTrait: ArrayCompute + ArrayValidity + ToArrayData {
+pub trait ArrayTrait: ArrayCompute + ArrayValidity + AcceptArrayVisitor + ToArrayData {
     fn dtype(&self) -> &DType;
 
     fn len(&self) -> usize;
@@ -77,6 +93,25 @@ pub trait ArrayTrait: ArrayCompute + ArrayValidity + ToArrayData {
         // TODO(ngates): remove this default impl to encourage explicit implementation
         self.len() == 0
     }
+
+    fn nbytes(&self) -> usize {
+        let mut visitor = NBytesVisitor(0);
+        self.accept(&mut visitor).unwrap();
+        visitor.0
+    }
+}
+
+struct NBytesVisitor(usize);
+impl ArrayVisitor for NBytesVisitor {
+    fn visit_array(&mut self, _name: &str, array: &Array) -> VortexResult<()> {
+        self.0 += array.with_array(|a| a.nbytes());
+        Ok(())
+    }
+
+    fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> {
+        self.0 += buffer.len();
+        Ok(())
+    }
 }
 
 impl ToArrayData for Array<'_> {
@@ -90,7 +125,7 @@ impl ToArrayData for Array<'_> {
 }
 
 impl WithArray for Array<'_> {
-    fn with_array<R, F: Fn(&dyn ArrayTrait) -> R>(&self, f: F) -> R {
+    fn with_array<R, F: FnMut(&dyn ArrayTrait) -> R>(&self, f: F) -> R {
         match self {
             Array::Data(d) => d.encoding().with_data(d, f),
             Array::DataRef(d) => d.encoding().with_data(d, f),
@@ -98,3 +133,21 @@ impl WithArray for Array<'_> {
         }
     }
 }
+
+impl Display for Array<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let prefix = match self {
+            Array::Data(_) => "",
+            Array::DataRef(_) => "&",
+            Array::View(_) => "$",
+        };
+        write!(
+            f,
+            "{}{}({}, len={})",
+            prefix,
+            self.encoding().id(),
+            self.dtype(),
+            self.len()
+        )
+    }
+}
diff --git a/vortex-array2/src/tree.rs b/vortex-array2/src/tree.rs
new file mode 100644
index 0000000000..02f2b3070c
--- /dev/null
+++ b/vortex-array2/src/tree.rs
@@ -0,0 +1,129 @@
+use std::fmt;
+
+use arrow_buffer::Buffer;
+use humansize::{format_size, DECIMAL};
+use serde::ser::Error;
+use vortex_error::{VortexError, VortexResult};
+
+use crate::visitor::ArrayVisitor;
+use crate::{Array, WithArray};
+
+impl Array<'_> {
+    pub fn tree_display(&self) -> TreeDisplayWrapper {
+        TreeDisplayWrapper(self)
+    }
+}
+
+pub struct TreeDisplayWrapper<'a>(&'a Array<'a>);
+impl<'a> TreeDisplayWrapper<'a> {
+    pub fn new(array: &'a Array<'a>) -> Self {
+        Self(array)
+    }
+}
+
+impl fmt::Display for TreeDisplayWrapper<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let array = self.0;
+        let nbytes = array.with_array(|a| a.nbytes());
+        let mut array_fmt = TreeFormatter::new(f, "".to_string(), nbytes);
+        array_fmt
+            .visit_array("root", array)
+            .map_err(fmt::Error::custom)
+    }
+}
+
+pub struct TreeFormatter<'a, 'b: 'a> {
+    fmt: &'a mut fmt::Formatter<'b>,
+    indent: String,
+    total_size: usize,
+}
+
+/// TODO(ngates): I think we want to go back to the old explicit style. It gives arrays more
+/// control over how their metadata etc is displayed.
+impl<'a, 'b: 'a> ArrayVisitor for TreeFormatter<'a, 'b> {
+    fn visit_array(&mut self, name: &str, array: &Array) -> VortexResult<()> {
+        array.with_array(|a| {
+            let nbytes = a.nbytes();
+            writeln!(
+                self.fmt,
+                "{}{}: {} nbytes={} ({:.2}%)",
+                self.indent,
+                name,
+                array,
+                format_size(nbytes, DECIMAL),
+                100f64 * nbytes as f64 / self.total_size.max(1) as f64 // avoid NaN on 0/0
+            )?;
+            self.indent(|i| a.accept(i).map_err(fmt::Error::custom))
+                .map_err(VortexError::from)
+        })
+    }
+
+    fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()> {
+        Ok(writeln!(
+            self.fmt,
+            "{}buffer: {}",
+            self.indent,
+            format_size(buffer.len(), DECIMAL)
+        )?)
+    }
+}
+
+impl<'a, 'b: 'a> TreeFormatter<'a, 'b> {
+    fn new(
+        fmt: &'a mut fmt::Formatter<'b>,
+        indent: String,
+        total_size: usize,
+    ) -> TreeFormatter<'a, 'b> {
+        TreeFormatter {
+            fmt,
+            indent,
+            total_size,
+        }
+    }
+
+    fn indent<F>(&mut self, indented: F) -> fmt::Result
+    where
+        F: FnOnce(&mut TreeFormatter) -> fmt::Result,
+    {
+        let original_indent = self.indent.clone();
+        self.indent += " ";
+        let res = indented(self);
+        self.indent = original_indent;
+        res
+    }
+
+    pub fn new_total_size<F>(&mut self, total: usize, new_total: F) -> fmt::Result
+    where
+        F: FnOnce(&mut TreeFormatter) -> fmt::Result,
+    {
+        let original_total = self.total_size;
+        self.total_size = total;
+        let res = new_total(self);
+        self.total_size = original_total;
+        res
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fmt::Write;
+
+    use vortex_error::VortexResult;
+
+    use crate::array::primitive::PrimitiveData;
+    use crate::array::ree::REEData;
+    use crate::IntoArray;
+
+    #[test]
+    fn tree() -> VortexResult<()> {
+        let primitive = PrimitiveData::from_vec(vec![2i32, 3, 4, 5]);
+        let ree = REEData::try_new(primitive.data().clone(), primitive.data().clone(), 4)?;
+        let arr = ree.into_array();
+
+        let mut str = String::new();
+        write!(str, "{}", arr.tree_display())?;
+        println!("{}", str);
+        // assert_eq!(str.as_str(), "hello");
+        Ok(())
+    }
+}
diff --git a/vortex-array2/src/visitor.rs b/vortex-array2/src/visitor.rs
new file mode 100644
index 0000000000..ebe99ce9ed
--- /dev/null
+++ b/vortex-array2/src/visitor.rs
@@ -0,0 +1,22 @@
+use arrow_buffer::Buffer;
+use vortex_error::VortexResult;
+
+use crate::validity::Validity;
+use crate::Array;
+
+pub trait AcceptArrayVisitor {
+    fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()>;
+}
+
+// TODO(ngates): maybe we make this more like the inverse of TryFromParts?
+pub trait ArrayVisitor {
+    fn visit_array(&mut self, name: &str, array: &Array) -> VortexResult<()>;
+    fn visit_validity(&mut self, validity: &Validity) -> VortexResult<()> {
+        if let Some(v) = validity.array() {
+            self.visit_array("validity", v)
+        } else {
+            Ok(())
+        }
+    }
+    fn visit_buffer(&mut self, buffer: &Buffer) -> VortexResult<()>;
+}
diff --git a/vortex-error/src/lib.rs b/vortex-error/src/lib.rs
index 6473b6c43e..fe9004bdc1 100644
--- a/vortex-error/src/lib.rs
+++ b/vortex-error/src/lib.rs
@@ -90,6 +90,12 @@ pub enum VortexError {
         flexbuffers::SerializationError,
     ),
     #[error(transparent)]
+    FmtError(
+        #[from]
+        #[backtrace]
+        std::fmt::Error,
+    ),
+    #[error(transparent)]
     IOError(
         #[from]
         #[backtrace]