From 3606dc2b70ecccef109820c38da5621686cb790a Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Thu, 23 Jan 2025 16:14:04 -0800 Subject: [PATCH 01/13] create_one --- kernel/src/engine/arrow_expression.rs | 284 +++++++++++++++++++++++++- kernel/src/lib.rs | 9 + kernel/src/schema.rs | 5 + 3 files changed, 297 insertions(+), 1 deletion(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 8ee54ebd0..601971ea8 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -526,6 +526,87 @@ impl ExpressionHandler for ArrowExpressionHandler { output_type, }) } + + fn create_one(&self, schema: SchemaRef, expr: &Expression) -> DeltaResult> { + if let Expression::Struct(child_exprs) = expr { + if schema.len() != child_exprs.len() { + return Err(Error::Generic(format!( + "Schema has {} top-level fields, but struct expr has {} children", + schema.len(), + child_exprs.len() + ))); + } + + let arrays: Vec = schema + .fields() + .zip(child_exprs.iter()) + .map(|(field, child_expr)| create_single_row_array(child_expr, field)) + .collect::, Error>>()?; + + let record_batch = RecordBatch::try_new(Arc::new(schema.as_ref().try_into()?), arrays)?; + Ok(Box::new(ArrowEngineData::new(record_batch))) + } else { + Err(Error::generic( + "ArrowExpressionHandler::create_one() requires a top-level struct expression", + )) + } + } +} + +fn create_single_row_array(expr: &Expression, field: &StructField) -> DeltaResult { + match expr { + // simple case: for literals, just create a single-row array and ensure the data types match + Expression::Literal(scalar) => { + let array = scalar.to_array(1)?; + // TODO(zach): we could do better and cast here instead of always failing + ensure_data_types(field.data_type(), array.data_type(), true)?; + Ok(array) + } + // recursive case: for struct expressions, build a struct array by recursing into each child + Expression::Struct(child_exprs) => { + // co-traverse the expression and schema: we expect the data type to be struct, error + // otherwise. + match field.data_type() { + DataType::Struct(struct_type) => { + if struct_type.len() != child_exprs.len() { + return Err(Error::Generic(format!( + "Schema struct field has {} children, but expression has {} children", + struct_type.len(), + child_exprs.len() + ))); + } + + let child_arrays: Vec = struct_type + .fields() + .zip(child_exprs.iter()) + .map(|(subfield, subexpr)| create_single_row_array(subexpr, subfield)) + .collect::, Error>>()?; + + let arrow_fields = struct_type + .fields() + .map(|f| ArrowField::try_from(f)) + .collect::, ArrowError>>()?; + + let struct_array = StructArray::new( + arrow_fields.into(), + child_arrays, + None, // FIXME: null bitmap + ); + + Ok(Arc::new(struct_array)) + } + other_type => Err(Error::Generic(format!( + "Expected struct type in schema, but got {:?}", + other_type + ))), + } + } + // fail for any non-literal, non-struct expressions + non_literal_non_struct => Err(Error::Unsupported(format!( + "build_array_from_expr: unhandled expr variant: {:?}", + non_literal_non_struct + ))), + } } #[derive(Debug)] @@ -568,7 +649,7 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator { mod tests { use std::ops::{Add, Div, Mul, Sub}; - use arrow_array::{GenericStringArray, Int32Array}; + use arrow_array::{create_array, record_batch, GenericStringArray, Int32Array}; use arrow_buffer::ScalarBuffer; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -867,4 +948,205 @@ mod tests { let expected = Arc::new(BooleanArray::from(vec![true, false])); assert_eq!(results.as_ref(), expected.as_ref()); } + + #[test] + fn test_create_one() { + let expr = Expression::struct_from([ + Expression::literal(1), + Expression::literal(2), + Expression::literal(3), + ]); + let schema = Arc::new(crate::schema::StructType::new([ + StructField::new("a", DeltaDataTypes::INTEGER, true), + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, true), + ])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let expected = + record_batch!(("a", Int32, [1]), ("b", Int32, [2]), ("c", Int32, [3])).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_string() { + let expr = Expression::struct_from([Expression::literal("a")]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "col_1", + DeltaDataTypes::STRING, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_null() { + let expr = Expression::struct_from([Expression::null_literal(DeltaDataTypes::INTEGER)]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "col_1", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let expected = record_batch!(("col_1", Int32, [None])).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_non_null() { + let expr = Expression::struct_from([Expression::literal(1)]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::INTEGER, + false, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "a", + DataType::Int32, + false, + )])); + let expected = + RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_disallow_column_ref() { + let expr = Expression::struct_from([column_expr!("a")]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr); + assert!(actual.is_err()); + } + + #[test] + fn test_create_one_disallow_operator() { + let expr = Expression::struct_from([Expression::binary( + BinaryOperator::Plus, + Expression::literal(1), + Expression::literal(2), + )]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr); + assert!(actual.is_err()); + } + + #[test] + fn test_create_one_nested() { + let expr = Expression::struct_from([Expression::struct_from([ + Expression::literal(1), + Expression::literal(2), + ])]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::struct_type([ + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, false), + ]), + false, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "a", + DataType::Struct( + vec![ + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Int32, false), + ] + .into(), + ), + false, + )])); + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Int32, true)), + create_array!(Int32, [1]) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + create_array!(Int32, [2]) as ArrayRef, + ), + ]))], + ) + .unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + + // make the same but with literal struct instead of struct of literal + let struct_data = StructData::try_new( + vec![ + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, false), + ], + vec![Scalar::Integer(1), Scalar::Integer(2)], + ) + .unwrap(); + let expr = Expression::struct_from([Expression::literal(Scalar::Struct(struct_data))]); + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::struct_type([ + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, false), + ]), + false, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, &expr).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 49dceea75..ece22662e 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -337,6 +337,15 @@ pub trait ExpressionHandler: AsAny { expression: Expression, output_type: DataType, ) -> Arc; + + /// Create a single-row [`EngineData`] by evaluating an [`Expression`] with no column + /// references. + /// + /// The schema of the output is the schema parameter which must match the output of the + /// expression. + // Note: we will stick with a Schema instead of DataType (more constrained can expand in + // future) + fn create_one(&self, schema: SchemaRef, expr: &Expression) -> DeltaResult>; } /// Provides file system related functionalities to Delta Kernel. diff --git a/kernel/src/schema.rs b/kernel/src/schema.rs index d2ff65193..8db62ab44 100644 --- a/kernel/src/schema.rs +++ b/kernel/src/schema.rs @@ -274,6 +274,11 @@ impl StructType { self.fields.values() } + pub fn len(&self) -> usize { + // O(1) for indexmap + self.fields.len() + } + /// Extracts the name and type of all leaf columns, in schema order. Caller should pass Some /// `own_name` if this schema is embedded in a larger struct (e.g. `add.*`) and None if the /// schema is a top-level result (e.g. `*`). From b6130f21bac9587aeb5a05e7f677464152528562 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Fri, 24 Jan 2025 14:03:00 -0800 Subject: [PATCH 02/13] do schema transform instead --- kernel/src/engine/arrow_expression.rs | 538 ++++++++++++++------------ kernel/src/lib.rs | 11 +- 2 files changed, 295 insertions(+), 254 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 601971ea8..53d1fd14a 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -527,85 +527,131 @@ impl ExpressionHandler for ArrowExpressionHandler { }) } - fn create_one(&self, schema: SchemaRef, expr: &Expression) -> DeltaResult> { - if let Expression::Struct(child_exprs) = expr { - if schema.len() != child_exprs.len() { - return Err(Error::Generic(format!( - "Schema has {} top-level fields, but struct expr has {} children", - schema.len(), - child_exprs.len() - ))); - } + fn create_one(&self, schema: SchemaRef, values: &[Scalar]) -> DeltaResult> { + let mut array_transform = SingleRowArrayTransform::new(values); + let datatype = schema.into(); + array_transform + .transform(&datatype) // FIXME + .unwrap(); // FIXME + let array = array_transform.finish()?; + let struct_array = array.as_any().downcast_ref::().unwrap(); // FIXME + + let record_batch: RecordBatch = struct_array.into(); // FIXME will panic for top-level null + Ok(Box::new(ArrowEngineData::new(record_batch))) + } +} - let arrays: Vec = schema - .fields() - .zip(child_exprs.iter()) - .map(|(field, child_expr)| create_single_row_array(child_expr, field)) - .collect::, Error>>()?; +/// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as +/// Scalar slice) into an arrow Array with a single row of each literal. +struct SingleRowArrayTransform<'a> { + /// Leaf-node values to insert in schema order. + scalars: &'a [Scalar], + /// Index into `scalars` for the next leaf we encounter. + next_scalar_idx: usize, + /// A stack of built arrays. After visiting children, we pop them off to + /// build the parent container, then push the parent back on. + stack: Vec, +} - let record_batch = RecordBatch::try_new(Arc::new(schema.as_ref().try_into()?), arrays)?; - Ok(Box::new(ArrowEngineData::new(record_batch))) - } else { - Err(Error::generic( - "ArrowExpressionHandler::create_one() requires a top-level struct expression", - )) +impl<'a> SingleRowArrayTransform<'a> { + fn new(scalars: &'a [Scalar]) -> Self { + Self { + scalars, + next_scalar_idx: 0, + stack: Vec::new(), + } + } + + /// top of `stack` should be our single-row struct array + fn finish(mut self) -> DeltaResult { + match self.stack.pop() { + Some(array) => Ok(array), + None => Err(Error::generic("didn't build array")), } } } -fn create_single_row_array(expr: &Expression, field: &StructField) -> DeltaResult { - match expr { - // simple case: for literals, just create a single-row array and ensure the data types match - Expression::Literal(scalar) => { - let array = scalar.to_array(1)?; - // TODO(zach): we could do better and cast here instead of always failing - ensure_data_types(field.data_type(), array.data_type(), true)?; - Ok(array) +use crate::schema::SchemaTransform; +use crate::schema::StructType; +use std::borrow::Cow; +impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { + fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { + // check bounds + if self.next_scalar_idx >= self.scalars.len() { + return None; // TODO } - // recursive case: for struct expressions, build a struct array by recursing into each child - Expression::Struct(child_exprs) => { - // co-traverse the expression and schema: we expect the data type to be struct, error - // otherwise. - match field.data_type() { - DataType::Struct(struct_type) => { - if struct_type.len() != child_exprs.len() { - return Err(Error::Generic(format!( - "Schema struct field has {} children, but expression has {} children", - struct_type.len(), - child_exprs.len() - ))); - } + let scalar = &self.scalars[self.next_scalar_idx]; + self.next_scalar_idx += 1; - let child_arrays: Vec = struct_type - .fields() - .zip(child_exprs.iter()) - .map(|(subfield, subexpr)| create_single_row_array(subexpr, subfield)) - .collect::, Error>>()?; + let arr = match scalar.to_array(1) { + Ok(a) => a, + Err(_e) => return None, // or handle error + }; + self.stack.push(arr); - let arrow_fields = struct_type - .fields() - .map(|f| ArrowField::try_from(f)) - .collect::, ArrowError>>()?; + Some(Cow::Borrowed(ptype)) + } - let struct_array = StructArray::new( - arrow_fields.into(), - child_arrays, - None, // FIXME: null bitmap - ); + fn transform_struct(&mut self, stype: &'a StructType) -> Option> { + self.recurse_into_struct(stype)?; - Ok(Arc::new(struct_array)) - } - other_type => Err(Error::Generic(format!( - "Expected struct type in schema, but got {:?}", - other_type - ))), - } + let n_fields = stype.fields.len(); + if self.stack.len() < n_fields { + return None; // FIXME } - // fail for any non-literal, non-struct expressions - non_literal_non_struct => Err(Error::Unsupported(format!( - "build_array_from_expr: unhandled expr variant: {:?}", - non_literal_non_struct - ))), + + // pop in reverse + let mut child_arrays = Vec::with_capacity(n_fields); + for _ in 0..n_fields { + child_arrays.push(self.stack.pop().unwrap()); + } + child_arrays.reverse(); + + let field_refs: Vec = stype + .fields() + .map(|f| StructField::new(f.name.clone(), f.data_type.clone(), f.nullable)) + .collect(); // FIXME + + // let struct_dt = DataType::Struct(Box::new(StructType::new(field_refs.clone()))); + + let arrow_fields = field_refs + .iter() + .map(|f| ArrowField::try_from(f)) + .try_collect() + .unwrap(); // FIXME + let struct_arr = Arc::new(StructArray::new(arrow_fields, child_arrays, None)) as ArrayRef; + + self.stack.push(struct_arr); + Some(Cow::Borrowed(stype)) + } + + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + self.recurse_into_struct_field(field) + } + + // For arrays, do something similar: + fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { + // Recurse into child + self.recurse_into_array(atype)?; + + // Once child is done, pop the child's single-row array from the stack + let child_array = self.stack.pop().unwrap(); + + unimplemented!(); + + Some(Cow::Borrowed(atype)) + } + + fn transform_map(&mut self, mtype: &'a MapType) -> Option> { + self.recurse_into_map(mtype)?; + + let value_arr = self.stack.pop().unwrap(); + let key_arr = self.stack.pop().unwrap(); + + unimplemented!(); + + // build map, push + Some(Cow::Borrowed(mtype)) } } @@ -951,11 +997,7 @@ mod tests { #[test] fn test_create_one() { - let expr = Expression::struct_from([ - Expression::literal(1), - Expression::literal(2), - Expression::literal(3), - ]); + let values: &[Scalar] = &[1.into(), 2.into(), 3.into()]; let schema = Arc::new(crate::schema::StructType::new([ StructField::new("a", DeltaDataTypes::INTEGER, true), StructField::new("b", DeltaDataTypes::INTEGER, true), @@ -963,7 +1005,7 @@ mod tests { ])); let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); + let actual = handler.create_one(schema, values).unwrap(); let expected = record_batch!(("a", Int32, [1]), ("b", Int32, [2]), ("c", Int32, [3])).unwrap(); let actual_rb: RecordBatch = actual @@ -974,179 +1016,179 @@ mod tests { assert_eq!(actual_rb, expected); } - #[test] - fn test_create_one_string() { - let expr = Expression::struct_from([Expression::literal("a")]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "col_1", - DeltaDataTypes::STRING, - true, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); - let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - } - - #[test] - fn test_create_one_null() { - let expr = Expression::struct_from([Expression::null_literal(DeltaDataTypes::INTEGER)]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "col_1", - DeltaDataTypes::INTEGER, - true, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); - let expected = record_batch!(("col_1", Int32, [None])).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - } - - #[test] - fn test_create_one_non_null() { - let expr = Expression::struct_from([Expression::literal(1)]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "a", - DeltaDataTypes::INTEGER, - false, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); - let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "a", - DataType::Int32, - false, - )])); - let expected = - RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - } - - #[test] - fn test_create_one_disallow_column_ref() { - let expr = Expression::struct_from([column_expr!("a")]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "a", - DeltaDataTypes::INTEGER, - true, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr); - assert!(actual.is_err()); - } - - #[test] - fn test_create_one_disallow_operator() { - let expr = Expression::struct_from([Expression::binary( - BinaryOperator::Plus, - Expression::literal(1), - Expression::literal(2), - )]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "a", - DeltaDataTypes::INTEGER, - true, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr); - assert!(actual.is_err()); - } - - #[test] - fn test_create_one_nested() { - let expr = Expression::struct_from([Expression::struct_from([ - Expression::literal(1), - Expression::literal(2), - ])]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "a", - DeltaDataTypes::struct_type([ - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, false), - ]), - false, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); - let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "a", - DataType::Struct( - vec![ - Field::new("b", DataType::Int32, true), - Field::new("c", DataType::Int32, false), - ] - .into(), - ), - false, - )])); - let expected = RecordBatch::try_new( - expected_schema, - vec![Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("b", DataType::Int32, true)), - create_array!(Int32, [1]) as ArrayRef, - ), - ( - Arc::new(Field::new("c", DataType::Int32, false)), - create_array!(Int32, [2]) as ArrayRef, - ), - ]))], - ) - .unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - - // make the same but with literal struct instead of struct of literal - let struct_data = StructData::try_new( - vec![ - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, false), - ], - vec![Scalar::Integer(1), Scalar::Integer(2)], - ) - .unwrap(); - let expr = Expression::struct_from([Expression::literal(Scalar::Struct(struct_data))]); - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "a", - DeltaDataTypes::struct_type([ - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, false), - ]), - false, - )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, &expr).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - } + //#[test] + //fn test_create_one_string() { + // let expr = Expression::struct_from([Expression::literal("a")]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "col_1", + // DeltaDataTypes::STRING, + // true, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr).unwrap(); + // let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); + // let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + // assert_eq!(actual_rb, expected); + //} + // + //#[test] + //fn test_create_one_null() { + // let expr = Expression::struct_from([Expression::null_literal(DeltaDataTypes::INTEGER)]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "col_1", + // DeltaDataTypes::INTEGER, + // true, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr).unwrap(); + // let expected = record_batch!(("col_1", Int32, [None])).unwrap(); + // let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + // assert_eq!(actual_rb, expected); + //} + // + //#[test] + //fn test_create_one_non_null() { + // let expr = Expression::struct_from([Expression::literal(1)]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "a", + // DeltaDataTypes::INTEGER, + // false, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr).unwrap(); + // let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + // "a", + // DataType::Int32, + // false, + // )])); + // let expected = + // RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); + // let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + // assert_eq!(actual_rb, expected); + //} + // + //#[test] + //fn test_create_one_disallow_column_ref() { + // let expr = Expression::struct_from([column_expr!("a")]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "a", + // DeltaDataTypes::INTEGER, + // true, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr); + // assert!(actual.is_err()); + //} + // + //#[test] + //fn test_create_one_disallow_operator() { + // let expr = Expression::struct_from([Expression::binary( + // BinaryOperator::Plus, + // Expression::literal(1), + // Expression::literal(2), + // )]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "a", + // DeltaDataTypes::INTEGER, + // true, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr); + // assert!(actual.is_err()); + //} + // + //#[test] + //fn test_create_one_nested() { + // let expr = Expression::struct_from([Expression::struct_from([ + // Expression::literal(1), + // Expression::literal(2), + // ])]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "a", + // DeltaDataTypes::struct_type([ + // StructField::new("b", DeltaDataTypes::INTEGER, true), + // StructField::new("c", DeltaDataTypes::INTEGER, false), + // ]), + // false, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr).unwrap(); + // let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + // "a", + // DataType::Struct( + // vec![ + // Field::new("b", DataType::Int32, true), + // Field::new("c", DataType::Int32, false), + // ] + // .into(), + // ), + // false, + // )])); + // let expected = RecordBatch::try_new( + // expected_schema, + // vec![Arc::new(StructArray::from(vec![ + // ( + // Arc::new(Field::new("b", DataType::Int32, true)), + // create_array!(Int32, [1]) as ArrayRef, + // ), + // ( + // Arc::new(Field::new("c", DataType::Int32, false)), + // create_array!(Int32, [2]) as ArrayRef, + // ), + // ]))], + // ) + // .unwrap(); + // let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + // assert_eq!(actual_rb, expected); + // + // // make the same but with literal struct instead of struct of literal + // let struct_data = StructData::try_new( + // vec![ + // StructField::new("b", DeltaDataTypes::INTEGER, true), + // StructField::new("c", DeltaDataTypes::INTEGER, false), + // ], + // vec![Scalar::Integer(1), Scalar::Integer(2)], + // ) + // .unwrap(); + // let expr = Expression::struct_from([Expression::literal(Scalar::Struct(struct_data))]); + // let schema = Arc::new(crate::schema::StructType::new([StructField::new( + // "a", + // DeltaDataTypes::struct_type([ + // StructField::new("b", DeltaDataTypes::INTEGER, true), + // StructField::new("c", DeltaDataTypes::INTEGER, false), + // ]), + // false, + // )])); + // + // let handler = ArrowExpressionHandler; + // let actual = handler.create_one(schema, &expr).unwrap(); + // let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + // assert_eq!(actual_rb, expected); + //} } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index ece22662e..0766c35ba 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -105,6 +105,8 @@ pub use error::{DeltaResult, Error}; pub use expressions::{Expression, ExpressionRef}; pub use table::Table; +use expressions::Scalar; + #[cfg(any( feature = "default-engine", feature = "sync-engine", @@ -338,14 +340,11 @@ pub trait ExpressionHandler: AsAny { output_type: DataType, ) -> Arc; - /// Create a single-row [`EngineData`] by evaluating an [`Expression`] with no column - /// references. - /// - /// The schema of the output is the schema parameter which must match the output of the - /// expression. + /// Create a single-row [`EngineData`] by applying the given schema to the leaf-values given in + /// `values`. // Note: we will stick with a Schema instead of DataType (more constrained can expand in // future) - fn create_one(&self, schema: SchemaRef, expr: &Expression) -> DeltaResult>; + fn create_one(&self, schema: SchemaRef, values: &[Scalar]) -> DeltaResult>; } /// Provides file system related functionalities to Delta Kernel. From 6fb4f7fb80f552ddfe2f1bd869d12bebf03a0888 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Tue, 28 Jan 2025 15:00:03 -0800 Subject: [PATCH 03/13] datatypes, tests, cleanup - still need to do nullability --- kernel/src/engine/arrow_expression.rs | 368 +++++++++++++------------- 1 file changed, 182 insertions(+), 186 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 53d1fd14a..abae356c3 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -1,5 +1,5 @@ //! Expression handling based on arrow-rs compute kernels. -use std::borrow::Borrow; +use std::borrow::{Borrow, Cow}; use std::collections::HashMap; use std::sync::Arc; @@ -32,7 +32,10 @@ use crate::expressions::{ BinaryExpression, BinaryOperator, Expression, Scalar, UnaryExpression, UnaryOperator, VariadicExpression, VariadicOperator, }; -use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField}; +use crate::schema::{ + ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, + StructType, +}; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; // TODO leverage scalars / Datum @@ -527,13 +530,14 @@ impl ExpressionHandler for ArrowExpressionHandler { }) } + /// TODO fn create_one(&self, schema: SchemaRef, values: &[Scalar]) -> DeltaResult> { let mut array_transform = SingleRowArrayTransform::new(values); let datatype = schema.into(); - array_transform - .transform(&datatype) // FIXME - .unwrap(); // FIXME - let array = array_transform.finish()?; + // we build up the array within the `SignatureArrayTransform` - we don't actually care + // about the 'transformed' type + let _transformed = array_transform.transform(&datatype); + let array = array_transform.into_array()?; let struct_array = array.as_any().downcast_ref::().unwrap(); // FIXME let record_batch: RecordBatch = struct_array.into(); // FIXME will panic for top-level null @@ -551,6 +555,8 @@ struct SingleRowArrayTransform<'a> { /// A stack of built arrays. After visiting children, we pop them off to /// build the parent container, then push the parent back on. stack: Vec, + /// If an error occurs, it will be stored here. + error: Option, } impl<'a> SingleRowArrayTransform<'a> { @@ -559,11 +565,16 @@ impl<'a> SingleRowArrayTransform<'a> { scalars, next_scalar_idx: 0, stack: Vec::new(), + error: None, } } - /// top of `stack` should be our single-row struct array - fn finish(mut self) -> DeltaResult { + /// return the single-row array (or propagate Error). the top of `stack` should be our + /// single-row struct array + fn into_array(mut self) -> DeltaResult { + if let Some(e) = self.error { + return Err(e); + } match self.stack.pop() { Some(array) => Ok(array), None => Err(Error::generic("didn't build array")), @@ -571,11 +582,13 @@ impl<'a> SingleRowArrayTransform<'a> { } } -use crate::schema::SchemaTransform; -use crate::schema::StructType; -use std::borrow::Cow; impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + // check bounds if self.next_scalar_idx >= self.scalars.len() { return None; // TODO @@ -583,6 +596,15 @@ impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { let scalar = &self.scalars[self.next_scalar_idx]; self.next_scalar_idx += 1; + let DataType::Primitive(scalar_type) = scalar.data_type() else { + return None; // FIXME + }; + if scalar_type != *ptype { + self.error = Some(Error::generic( + "Mismatched scalar type when creating single-row array: expected {ptype}, got {scalar_type}")); + return None; // FIXME + } + let arr = match scalar.to_array(1) { Ok(a) => a, Err(_e) => return None, // or handle error @@ -593,6 +615,11 @@ impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { } fn transform_struct(&mut self, stype: &'a StructType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.recurse_into_struct(stype)?; let n_fields = stype.fields.len(); @@ -626,11 +653,24 @@ impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { } fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + + // FIXME + // struct field (or containing array/map) decides whether the child type is nullable (any + // nullable parent makes the field nullable) self.recurse_into_struct_field(field) } // For arrays, do something similar: fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + // Recurse into child self.recurse_into_array(atype)?; @@ -643,6 +683,11 @@ impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { } fn transform_map(&mut self, mtype: &'a MapType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.recurse_into_map(mtype)?; let value_arr = self.stack.pop().unwrap(); @@ -1016,179 +1061,130 @@ mod tests { assert_eq!(actual_rb, expected); } - //#[test] - //fn test_create_one_string() { - // let expr = Expression::struct_from([Expression::literal("a")]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "col_1", - // DeltaDataTypes::STRING, - // true, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr).unwrap(); - // let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); - // let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - // assert_eq!(actual_rb, expected); - //} - // - //#[test] - //fn test_create_one_null() { - // let expr = Expression::struct_from([Expression::null_literal(DeltaDataTypes::INTEGER)]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "col_1", - // DeltaDataTypes::INTEGER, - // true, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr).unwrap(); - // let expected = record_batch!(("col_1", Int32, [None])).unwrap(); - // let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - // assert_eq!(actual_rb, expected); - //} - // - //#[test] - //fn test_create_one_non_null() { - // let expr = Expression::struct_from([Expression::literal(1)]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "a", - // DeltaDataTypes::INTEGER, - // false, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr).unwrap(); - // let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - // "a", - // DataType::Int32, - // false, - // )])); - // let expected = - // RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); - // let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - // assert_eq!(actual_rb, expected); - //} - // - //#[test] - //fn test_create_one_disallow_column_ref() { - // let expr = Expression::struct_from([column_expr!("a")]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "a", - // DeltaDataTypes::INTEGER, - // true, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr); - // assert!(actual.is_err()); - //} - // - //#[test] - //fn test_create_one_disallow_operator() { - // let expr = Expression::struct_from([Expression::binary( - // BinaryOperator::Plus, - // Expression::literal(1), - // Expression::literal(2), - // )]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "a", - // DeltaDataTypes::INTEGER, - // true, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr); - // assert!(actual.is_err()); - //} - // - //#[test] - //fn test_create_one_nested() { - // let expr = Expression::struct_from([Expression::struct_from([ - // Expression::literal(1), - // Expression::literal(2), - // ])]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "a", - // DeltaDataTypes::struct_type([ - // StructField::new("b", DeltaDataTypes::INTEGER, true), - // StructField::new("c", DeltaDataTypes::INTEGER, false), - // ]), - // false, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr).unwrap(); - // let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - // "a", - // DataType::Struct( - // vec![ - // Field::new("b", DataType::Int32, true), - // Field::new("c", DataType::Int32, false), - // ] - // .into(), - // ), - // false, - // )])); - // let expected = RecordBatch::try_new( - // expected_schema, - // vec![Arc::new(StructArray::from(vec![ - // ( - // Arc::new(Field::new("b", DataType::Int32, true)), - // create_array!(Int32, [1]) as ArrayRef, - // ), - // ( - // Arc::new(Field::new("c", DataType::Int32, false)), - // create_array!(Int32, [2]) as ArrayRef, - // ), - // ]))], - // ) - // .unwrap(); - // let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - // assert_eq!(actual_rb, expected); - // - // // make the same but with literal struct instead of struct of literal - // let struct_data = StructData::try_new( - // vec![ - // StructField::new("b", DeltaDataTypes::INTEGER, true), - // StructField::new("c", DeltaDataTypes::INTEGER, false), - // ], - // vec![Scalar::Integer(1), Scalar::Integer(2)], - // ) - // .unwrap(); - // let expr = Expression::struct_from([Expression::literal(Scalar::Struct(struct_data))]); - // let schema = Arc::new(crate::schema::StructType::new([StructField::new( - // "a", - // DeltaDataTypes::struct_type([ - // StructField::new("b", DeltaDataTypes::INTEGER, true), - // StructField::new("c", DeltaDataTypes::INTEGER, false), - // ]), - // false, - // )])); - // - // let handler = ArrowExpressionHandler; - // let actual = handler.create_one(schema, &expr).unwrap(); - // let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - // assert_eq!(actual_rb, expected); - //} + #[test] + fn test_create_one_string() { + let values = &["a".into()]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "col_1", + DeltaDataTypes::STRING, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_incorrect_schema() { + let values = &["a".into()]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "col_1", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + // FIXME check err + assert!(handler.create_one(schema, values).is_err()) + } + + #[test] + fn test_create_one_null() { + let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "col_1", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let expected = record_batch!(("col_1", Int32, [None])).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_non_null() { + let values: &[Scalar] = &[1.into()]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::INTEGER, + false, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "a", + DataType::Int32, + false, + )])); + let expected = + RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + + #[test] + fn test_create_one_nested() { + let values: &[Scalar] = &[1.into(), 2.into()]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::struct_type([ + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, false), + ]), + false, + )])); + + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "a", + DataType::Struct( + vec![ + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Int32, false), + ] + .into(), + ), + false, + )])); + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Int32, true)), + create_array!(Int32, [1]) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + create_array!(Int32, [2]) as ArrayRef, + ), + ]))], + ) + .unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } } From 975dd01cf97a5c7ffde90f8042fea70f90c2880d Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Thu, 30 Jan 2025 16:12:02 -0800 Subject: [PATCH 04/13] cleanup --- kernel/src/engine/arrow_expression.rs | 436 +++++++++++++++----------- 1 file changed, 247 insertions(+), 189 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index abae356c3..8d6462f47 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -1,5 +1,5 @@ //! Expression handling based on arrow-rs compute kernels. -use std::borrow::{Borrow, Cow}; +use std::borrow::Borrow; use std::collections::HashMap; use std::sync::Arc; @@ -34,9 +34,9 @@ use crate::expressions::{ }; use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, - StructType, }; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; +use single_row_array_transform::SingleRowArrayTransform; // TODO leverage scalars / Datum @@ -545,158 +545,192 @@ impl ExpressionHandler for ArrowExpressionHandler { } } -/// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as -/// Scalar slice) into an arrow Array with a single row of each literal. -struct SingleRowArrayTransform<'a> { - /// Leaf-node values to insert in schema order. - scalars: &'a [Scalar], - /// Index into `scalars` for the next leaf we encounter. - next_scalar_idx: usize, - /// A stack of built arrays. After visiting children, we pop them off to - /// build the parent container, then push the parent back on. - stack: Vec, - /// If an error occurs, it will be stored here. - error: Option, -} +mod single_row_array_transform { + use std::borrow::Cow; + use std::sync::Arc; -impl<'a> SingleRowArrayTransform<'a> { - fn new(scalars: &'a [Scalar]) -> Self { - Self { - scalars, - next_scalar_idx: 0, - stack: Vec::new(), - error: None, - } - } + use arrow_array::{ArrayRef, StructArray}; + use arrow_schema::Field as ArrowField; + use itertools::Itertools; + use tracing::debug; - /// return the single-row array (or propagate Error). the top of `stack` should be our - /// single-row struct array - fn into_array(mut self) -> DeltaResult { - if let Some(e) = self.error { - return Err(e); - } - match self.stack.pop() { - Some(array) => Ok(array), - None => Err(Error::generic("didn't build array")), - } + use crate::error::{DeltaResult, Error}; + use crate::expressions::Scalar; + use crate::schema::{ + ArrayType, DataType, MapType, PrimitiveType, SchemaTransform, StructField, StructType, + }; + + /// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as + /// Scalar slice) into an arrow Array with a single row of each literal. + #[derive(Debug, Default)] + pub(crate) struct SingleRowArrayTransform<'a> { + /// Leaf-node values to insert in schema order. + scalars: &'a [Scalar], + /// Index into `scalars` for the next leaf we encounter. + next_scalar_idx: usize, + /// A stack of built arrays. After visiting children, we pop them off to + /// build the parent container, then push the parent back on. + stack: Vec, + /// If an error occurs, it will be stored here. + error: Option, } -} -impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { - fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; + impl<'a> SingleRowArrayTransform<'a> { + pub(crate) fn new(scalars: &'a [Scalar]) -> Self { + Self { + scalars, + ..Default::default() + } } - // check bounds - if self.next_scalar_idx >= self.scalars.len() { - return None; // TODO + /// return the single-row array (or propagate Error). the top of `stack` should be our + /// single-row struct array + pub(crate) fn into_array(mut self) -> DeltaResult { + if let Some(e) = self.error { + return Err(e); + } + match self.stack.pop() { + Some(array) => Ok(array), + None => Err(Error::generic("didn't build array")), + } } - let scalar = &self.scalars[self.next_scalar_idx]; - self.next_scalar_idx += 1; - let DataType::Primitive(scalar_type) = scalar.data_type() else { - return None; // FIXME - }; - if scalar_type != *ptype { - self.error = Some(Error::generic( - "Mismatched scalar type when creating single-row array: expected {ptype}, got {scalar_type}")); - return None; // FIXME + fn set_error(&mut self, e: Error) { + if let Some(err) = &self.error { + debug!("Overwriting error that was already set: {err}"); + } + self.error = Some(e); } - - let arr = match scalar.to_array(1) { - Ok(a) => a, - Err(_e) => return None, // or handle error - }; - self.stack.push(arr); - - Some(Cow::Borrowed(ptype)) } - fn transform_struct(&mut self, stype: &'a StructType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } + impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { + fn transform_primitive( + &mut self, + prim_type: &'a PrimitiveType, + ) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } - self.recurse_into_struct(stype)?; + if self.next_scalar_idx >= self.scalars.len() { + self.set_error(Error::generic( + "Not enough scalars to create a single-row array", + )); + return None; + } - let n_fields = stype.fields.len(); - if self.stack.len() < n_fields { - return None; // FIXME - } + let scalar = &self.scalars[self.next_scalar_idx]; + self.next_scalar_idx += 1; + + match scalar.data_type() { + DataType::Primitive(scalar_type) => { + if scalar_type != *prim_type { + self.set_error(Error::Generic(format!( + "Mismatched scalar type creating a single-row array: expected {}, got {}", + prim_type, scalar_type + ))); + return None; + } + } + _ => { + self.set_error(Error::generic( + "Non-primitive scalar type {datatype} provided", + )); + return None; + } + } - // pop in reverse - let mut child_arrays = Vec::with_capacity(n_fields); - for _ in 0..n_fields { - child_arrays.push(self.stack.pop().unwrap()); - } - child_arrays.reverse(); + let arr = match scalar.to_array(1) { + Ok(a) => a, + Err(e) => { + self.set_error(e); + return None; + } + }; + self.stack.push(arr); - let field_refs: Vec = stype - .fields() - .map(|f| StructField::new(f.name.clone(), f.data_type.clone(), f.nullable)) - .collect(); // FIXME + Some(Cow::Borrowed(prim_type)) + } - // let struct_dt = DataType::Struct(Box::new(StructType::new(field_refs.clone()))); + fn transform_struct(&mut self, struct_type: &'a StructType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } - let arrow_fields = field_refs - .iter() - .map(|f| ArrowField::try_from(f)) - .try_collect() - .unwrap(); // FIXME - let struct_arr = Arc::new(StructArray::new(arrow_fields, child_arrays, None)) as ArrayRef; + self.recurse_into_struct(struct_type)?; - self.stack.push(struct_arr); - Some(Cow::Borrowed(stype)) - } + let num_fields = struct_type.fields.len(); + let stack_len = self.stack.len(); + if stack_len < num_fields { + self.set_error(Error::Generic(format!( + "Not enough child arrays to create a single-row struct: expected {}, got {}", + num_fields, stack_len + ))); + return None; + } - fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; + let mut child_arrays = Vec::with_capacity(num_fields); + for _ in 0..num_fields { + child_arrays.push(self.stack.pop().unwrap()); + } + // reverse child_arrays since we popped them off the stack + child_arrays.reverse(); + + let fields = match struct_type.fields().map(ArrowField::try_from).try_collect() { + Ok(f) => f, + Err(e) => { + self.set_error(Error::Arrow(e)); + return None; + } + }; + let array = match StructArray::try_new(fields, child_arrays, None) { + Ok(arr) => arr, + Err(e) => { + self.set_error(Error::Arrow(e)); + return None; + } + }; + let array = Arc::new(array) as ArrayRef; + self.stack.push(array); + Some(Cow::Borrowed(struct_type)) } - // FIXME - // struct field (or containing array/map) decides whether the child type is nullable (any - // nullable parent makes the field nullable) - self.recurse_into_struct_field(field) - } + fn transform_struct_field( + &mut self, + field: &'a StructField, + ) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.recurse_into_struct_field(field) + } - // For arrays, do something similar: - fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { + // arrays unsupported for now + fn transform_array(&mut self, _array_type: &'a ArrayType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.set_error(Error::unsupported( + "ArrayType not yet supported for creating single-row array", + )); return None; } - // Recurse into child - self.recurse_into_array(atype)?; - - // Once child is done, pop the child's single-row array from the stack - let child_array = self.stack.pop().unwrap(); - - unimplemented!(); - - Some(Cow::Borrowed(atype)) - } - - fn transform_map(&mut self, mtype: &'a MapType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { + // maps unsupported for now + fn transform_map(&mut self, _map_type: &'a MapType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.set_error(Error::unsupported( + "MapType not yet supported for creating single-row array", + )); return None; } - - self.recurse_into_map(mtype)?; - - let value_arr = self.stack.pop().unwrap(); - let key_arr = self.stack.pop().unwrap(); - - unimplemented!(); - - // build map, push - Some(Cow::Borrowed(mtype)) } } @@ -746,7 +780,7 @@ mod tests { use super::*; use crate::expressions::*; - use crate::schema::ArrayType; + use crate::schema::{ArrayType, StructType}; use crate::DataType as DeltaDataTypes; #[test] @@ -1040,92 +1074,65 @@ mod tests { assert_eq!(results.as_ref(), expected.as_ref()); } + // helper to take values/schema to pass to `create_one` and assert the result = expected + fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: RecordBatch) { + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); + } + #[test] fn test_create_one() { let values: &[Scalar] = &[1.into(), 2.into(), 3.into()]; - let schema = Arc::new(crate::schema::StructType::new([ + let schema = Arc::new(StructType::new([ StructField::new("a", DeltaDataTypes::INTEGER, true), StructField::new("b", DeltaDataTypes::INTEGER, true), StructField::new("c", DeltaDataTypes::INTEGER, true), ])); - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); let expected = record_batch!(("a", Int32, [1]), ("b", Int32, [2]), ("c", Int32, [3])).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); + assert_create_one(values, schema, expected); } #[test] fn test_create_one_string() { let values = &["a".into()]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::new( "col_1", DeltaDataTypes::STRING, true, )])); - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); - } - - #[test] - fn test_create_one_incorrect_schema() { - let values = &["a".into()]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( - "col_1", - DeltaDataTypes::INTEGER, - true, - )])); - - let handler = ArrowExpressionHandler; - // FIXME check err - assert!(handler.create_one(schema, values).is_err()) + assert_create_one(values, schema, expected); } #[test] fn test_create_one_null() { let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::new( "col_1", DeltaDataTypes::INTEGER, true, )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); let expected = record_batch!(("col_1", Int32, [None])).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); + assert_create_one(values, schema, expected); } #[test] fn test_create_one_non_null() { let values: &[Scalar] = &[1.into()]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::new( "a", DeltaDataTypes::INTEGER, false, )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( "a", DataType::Int32, @@ -1133,12 +1140,7 @@ mod tests { )])); let expected = RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); + assert_create_one(values, schema, expected); } #[test] @@ -1152,9 +1154,6 @@ mod tests { ]), false, )])); - - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( "a", DataType::Struct( @@ -1180,11 +1179,70 @@ mod tests { ]))], ) .unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); + assert_create_one(values, schema, expected); + } + + #[test] + fn test_create_one_nested_null() { + let values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; + let schema = Arc::new(crate::schema::StructType::new([StructField::new( + "a", + DeltaDataTypes::struct_type([ + StructField::new("b", DeltaDataTypes::INTEGER, true), + StructField::new("c", DeltaDataTypes::INTEGER, false), + ]), + false, + )])); + let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "a", + DataType::Struct( + vec![ + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Int32, false), + ] + .into(), + ), + false, + )])); + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Int32, true)), + create_array!(Int32, [None]) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, false)), + create_array!(Int32, [1]) as ArrayRef, + ), + ]))], + ) + .unwrap(); + assert_create_one(values, schema, expected); + } + + #[test] + fn test_create_one_incorrect_schema() { + let values = &["a".into()]; + let schema = Arc::new(StructType::new([StructField::new( + "col_1", + DeltaDataTypes::INTEGER, + true, + )])); + + let handler = ArrowExpressionHandler; + matches!(handler.create_one(schema, values), Err(Error::Generic(_))); + } + + #[test] + fn test_create_one_missing_values() { + let values = &["a".into()]; + let schema = Arc::new(StructType::new([ + StructField::new("col_1", DeltaDataTypes::INTEGER, true), + StructField::new("col_2", DeltaDataTypes::INTEGER, true), + ])); + + let handler = ArrowExpressionHandler; + matches!(handler.create_one(schema, values), Err(Error::Generic(_))); } } From ad643d726070c213c5872ea4176cb9f3ff9faa09 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Thu, 30 Jan 2025 16:25:42 -0800 Subject: [PATCH 05/13] more cleanup --- kernel/src/engine/arrow_expression.rs | 54 +++++++++++++++------------ kernel/src/schema/mod.rs | 4 ++ 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 8d6462f47..7100e4138 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -717,7 +717,7 @@ mod single_row_array_transform { self.set_error(Error::unsupported( "ArrayType not yet supported for creating single-row array", )); - return None; + None } // maps unsupported for now @@ -729,7 +729,7 @@ mod single_row_array_transform { self.set_error(Error::unsupported( "MapType not yet supported for creating single-row array", )); - return None; + None } } } @@ -1090,9 +1090,9 @@ mod tests { fn test_create_one() { let values: &[Scalar] = &[1.into(), 2.into(), 3.into()]; let schema = Arc::new(StructType::new([ - StructField::new("a", DeltaDataTypes::INTEGER, true), - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, true), + StructField::nullable("a", DeltaDataTypes::INTEGER), + StructField::nullable("b", DeltaDataTypes::INTEGER), + StructField::nullable("c", DeltaDataTypes::INTEGER), ])); let expected = @@ -1103,10 +1103,9 @@ mod tests { #[test] fn test_create_one_string() { let values = &["a".into()]; - let schema = Arc::new(StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::nullable( "col_1", DeltaDataTypes::STRING, - true, )])); let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); @@ -1116,10 +1115,9 @@ mod tests { #[test] fn test_create_one_null() { let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; - let schema = Arc::new(StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::nullable( "col_1", DeltaDataTypes::INTEGER, - true, )])); let expected = record_batch!(("col_1", Int32, [None])).unwrap(); assert_create_one(values, schema, expected); @@ -1128,10 +1126,9 @@ mod tests { #[test] fn test_create_one_non_null() { let values: &[Scalar] = &[1.into()]; - let schema = Arc::new(StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::not_null( "a", DeltaDataTypes::INTEGER, - false, )])); let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( "a", @@ -1146,15 +1143,14 @@ mod tests { #[test] fn test_create_one_nested() { let values: &[Scalar] = &[1.into(), 2.into()]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::not_null( "a", DeltaDataTypes::struct_type([ - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, false), + StructField::nullable("b", DeltaDataTypes::INTEGER), + StructField::not_null("c", DeltaDataTypes::INTEGER), ]), - false, )])); - let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + let expected_schema = Arc::new(Schema::new(vec![Field::new( "a", DataType::Struct( vec![ @@ -1185,13 +1181,12 @@ mod tests { #[test] fn test_create_one_nested_null() { let values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; - let schema = Arc::new(crate::schema::StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::not_null( "a", DeltaDataTypes::struct_type([ - StructField::new("b", DeltaDataTypes::INTEGER, true), - StructField::new("c", DeltaDataTypes::INTEGER, false), + StructField::nullable("b", DeltaDataTypes::INTEGER), + StructField::not_null("c", DeltaDataTypes::INTEGER), ]), - false, )])); let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( "a", @@ -1224,22 +1219,33 @@ mod tests { #[test] fn test_create_one_incorrect_schema() { let values = &["a".into()]; - let schema = Arc::new(StructType::new([StructField::new( + let schema = Arc::new(StructType::new([StructField::nullable( "col_1", DeltaDataTypes::INTEGER, - true, )])); let handler = ArrowExpressionHandler; matches!(handler.create_one(schema, values), Err(Error::Generic(_))); } + #[test] + fn test_create_one_incorrect_null() { + let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; + let schema = Arc::new(StructType::new([StructField::not_null( + "col_1", + DeltaDataTypes::INTEGER, + )])); + + let handler = ArrowExpressionHandler; + matches!(handler.create_one(schema, values), Err(Error::Arrow(_))); + } + #[test] fn test_create_one_missing_values() { let values = &["a".into()]; let schema = Arc::new(StructType::new([ - StructField::new("col_1", DeltaDataTypes::INTEGER, true), - StructField::new("col_2", DeltaDataTypes::INTEGER, true), + StructField::nullable("col_1", DeltaDataTypes::INTEGER), + StructField::nullable("col_2", DeltaDataTypes::INTEGER), ])); let handler = ArrowExpressionHandler; diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs index 7d3ee35fe..c54352d9f 100644 --- a/kernel/src/schema/mod.rs +++ b/kernel/src/schema/mod.rs @@ -291,6 +291,10 @@ impl StructType { self.fields.len() } + pub fn is_empty(&self) -> bool { + self.fields.is_empty() + } + /// Extracts the name and type of all leaf columns, in schema order. Caller should pass Some /// `own_name` if this schema is embedded in a larger struct (e.g. `add.*`) and None if the /// schema is a top-level result (e.g. `*`). From e032799595320bf26b7270f234389accc48b81e2 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Fri, 31 Jan 2025 12:04:02 -0800 Subject: [PATCH 06/13] cleanup added test --- kernel/src/engine/arrow_expression.rs | 84 ++++++++++++++++----------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 7100e4138..83d47c0d6 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -534,10 +534,10 @@ impl ExpressionHandler for ArrowExpressionHandler { fn create_one(&self, schema: SchemaRef, values: &[Scalar]) -> DeltaResult> { let mut array_transform = SingleRowArrayTransform::new(values); let datatype = schema.into(); - // we build up the array within the `SignatureArrayTransform` - we don't actually care + // we build up the array within the `SingleRowArrayTransform` - we don't actually care // about the 'transformed' type let _transformed = array_transform.transform(&datatype); - let array = array_transform.into_array()?; + let array = array_transform.into_struct_array()?; let struct_array = array.as_any().downcast_ref::().unwrap(); // FIXME let record_batch: RecordBatch = struct_array.into(); // FIXME will panic for top-level null @@ -547,9 +547,11 @@ impl ExpressionHandler for ArrowExpressionHandler { mod single_row_array_transform { use std::borrow::Cow; + use std::mem; use std::sync::Arc; use arrow_array::{ArrayRef, StructArray}; + use arrow_schema::DataType as ArrowDataType; use arrow_schema::Field as ArrowField; use itertools::Itertools; use tracing::debug; @@ -562,12 +564,10 @@ mod single_row_array_transform { /// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as /// Scalar slice) into an arrow Array with a single row of each literal. - #[derive(Debug, Default)] - pub(crate) struct SingleRowArrayTransform<'a> { + #[derive(Debug)] + pub(crate) struct SingleRowArrayTransform<'a, T: Iterator> { /// Leaf-node values to insert in schema order. - scalars: &'a [Scalar], - /// Index into `scalars` for the next leaf we encounter. - next_scalar_idx: usize, + scalars: T, /// A stack of built arrays. After visiting children, we pop them off to /// build the parent container, then push the parent back on. stack: Vec, @@ -575,23 +575,30 @@ mod single_row_array_transform { error: Option, } - impl<'a> SingleRowArrayTransform<'a> { - pub(crate) fn new(scalars: &'a [Scalar]) -> Self { + impl<'a, T: Iterator> SingleRowArrayTransform<'a, T> { + pub(crate) fn new(scalars: impl IntoIterator) -> Self { Self { - scalars, - ..Default::default() + scalars: scalars.into_iter(), + stack: Vec::new(), + error: None, } } /// return the single-row array (or propagate Error). the top of `stack` should be our /// single-row struct array - pub(crate) fn into_array(mut self) -> DeltaResult { + pub(crate) fn into_struct_array(mut self) -> DeltaResult { if let Some(e) = self.error { return Err(e); } match self.stack.pop() { - Some(array) => Ok(array), - None => Err(Error::generic("didn't build array")), + Some(array) if self.stack.is_empty() => match array.data_type() { + ArrowDataType::Struct(_) => Ok(array), + _ => Err(Error::generic("Expected struct array")), + }, + Some(_) => Err(Error::generic( + "additional arrays left in the stack (possibly too many scalars)", + )), + None => Err(Error::generic("no array present in the stack")), } } @@ -603,7 +610,7 @@ mod single_row_array_transform { } } - impl<'a> SchemaTransform<'a> for SingleRowArrayTransform<'a> { + impl<'a, T: Iterator> SchemaTransform<'a> for SingleRowArrayTransform<'a, T> { fn transform_primitive( &mut self, prim_type: &'a PrimitiveType, @@ -613,15 +620,15 @@ mod single_row_array_transform { return None; } - if self.next_scalar_idx >= self.scalars.len() { - self.set_error(Error::generic( - "Not enough scalars to create a single-row array", - )); - return None; - } - - let scalar = &self.scalars[self.next_scalar_idx]; - self.next_scalar_idx += 1; + let scalar = match self.scalars.next() { + Some(s) => s, + None => { + self.set_error(Error::generic( + "Not enough scalars to create a single-row array", + )); + return None; + } + }; match scalar.data_type() { DataType::Primitive(scalar_type) => { @@ -663,20 +670,15 @@ mod single_row_array_transform { let num_fields = struct_type.fields.len(); let stack_len = self.stack.len(); - if stack_len < num_fields { + if stack_len != num_fields { self.set_error(Error::Generic(format!( - "Not enough child arrays to create a single-row struct: expected {}, got {}", + "Incorrect number of child arrays to create a single-row struct: expected {}, got {}", num_fields, stack_len ))); return None; } - let mut child_arrays = Vec::with_capacity(num_fields); - for _ in 0..num_fields { - child_arrays.push(self.stack.pop().unwrap()); - } - // reverse child_arrays since we popped them off the stack - child_arrays.reverse(); + let child_arrays = mem::take(&mut self.stack); let fields = match struct_type.fields().map(ArrowField::try_from).try_collect() { Ok(f) => f, @@ -685,6 +687,9 @@ mod single_row_array_transform { return None; } }; + + // TODO: null buffer is always none since we are purposely creating a single-row array, + // but if the literals are all null we should use the NullBuffer? let array = match StructArray::try_new(fields, child_arrays, None) { Ok(arr) => arr, Err(e) => { @@ -692,8 +697,7 @@ mod single_row_array_transform { return None; } }; - let array = Arc::new(array) as ArrayRef; - self.stack.push(array); + self.stack.push(Arc::new(array)); Some(Cow::Borrowed(struct_type)) } @@ -1251,4 +1255,16 @@ mod tests { let handler = ArrowExpressionHandler; matches!(handler.create_one(schema, values), Err(Error::Generic(_))); } + + #[test] + fn test_create_one_extra_values() { + let values = &["a".into(), "b".into(), "c".into()]; + let schema = Arc::new(StructType::new([ + StructField::nullable("col_1", DeltaDataTypes::INTEGER), + StructField::nullable("col_2", DeltaDataTypes::INTEGER), + ])); + + let handler = ArrowExpressionHandler; + matches!(handler.create_one(schema, values), Err(Error::Generic(_))); + } } From 09548b12878de990a40e803fa88dfafb20e1320d Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Fri, 31 Jan 2025 16:53:39 -0800 Subject: [PATCH 07/13] do nullability craziness --- kernel/src/engine/arrow_expression.rs | 195 ++++++++++++++++++++++++-- kernel/src/schema/mod.rs | 26 +++- 2 files changed, 201 insertions(+), 20 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 83d47c0d6..5e0a22eaf 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -538,9 +538,9 @@ impl ExpressionHandler for ArrowExpressionHandler { // about the 'transformed' type let _transformed = array_transform.transform(&datatype); let array = array_transform.into_struct_array()?; - let struct_array = array.as_any().downcast_ref::().unwrap(); // FIXME + let struct_array = array.as_struct_opt().unwrap(); // FIXME - let record_batch: RecordBatch = struct_array.into(); // FIXME will panic for top-level null + let record_batch: RecordBatch = struct_array.into(); // FIXME Ok(Box::new(ArrowEngineData::new(record_batch))) } } @@ -573,14 +573,40 @@ mod single_row_array_transform { stack: Vec, /// If an error occurs, it will be stored here. error: Option, + nullability_stack: Vec, } + ///// Any error for the single-row array transform. + //#[derive(thiserror::Error, Debug)] + //pub enum Error { + // /// An error performing operations on arrow data + // #[error(transparent)] + // Arrow(arrow_schema::ArrowError), + // + // /// A generic error with a message + // #[error("Schema error: {0}")] + // Schema(String), + // + // /// TODO + // #[error("stack error: {0}")] + // Stack(String), + //} + // + //impl Error { + // /// Create a generic error with a message + // pub fn generic(msg: impl Into) -> Self { + // Self::Schema(msg.into()) + // } + //} + // + // TODO ERRORS impl<'a, T: Iterator> SingleRowArrayTransform<'a, T> { pub(crate) fn new(scalars: impl IntoIterator) -> Self { Self { scalars: scalars.into_iter(), stack: Vec::new(), error: None, + nullability_stack: vec![false], } } @@ -590,6 +616,13 @@ mod single_row_array_transform { if let Some(e) = self.error { return Err(e); } + + if self.scalars.next().is_some() { + return Err(Error::generic( + "Excess scalars given to create a single-row array", + )); + } + match self.stack.pop() { Some(array) if self.stack.is_empty() => match array.data_type() { ArrowDataType::Struct(_) => Ok(array), @@ -688,8 +721,38 @@ mod single_row_array_transform { } }; - // TODO: null buffer is always none since we are purposely creating a single-row array, - // but if the literals are all null we should use the NullBuffer? + // TODO track if ancestor/me is nullable + // if yes, are any of my children _directly_ not nullable? if yes, check if values are + // null + // + // do i have direct children which are non nullable with value null. + // -> verify all other children are all null + // -> if not, error + // -> if yes, create a null for the struct + + println!("transforming: {:?}", struct_type); + + // check if any child array is null and has a non-nullable field + for (child, field) in child_arrays.iter().zip(struct_type.fields()) { + if !field.is_nullable() && child.is_null(0) { + // if we have a null child array for a not-nullable field, either all other + // children must be null (and we make a null struct) or error + if child_arrays.iter().all(|c| c.is_null(0)) + && self.nullability_stack.iter().any(|n| *n) + { + self.stack.push(Arc::new(StructArray::new_null(fields, 1))); + return Some(Cow::Borrowed(struct_type)); + } else { + self.set_error(Error::Generic(format!( + "Non-nullable field {} is null in single-row struct", + field.name() + ))); + return None; + } + } + } + + // null buffer is none since we catch the null struct case above let array = match StructArray::try_new(fields, child_arrays, None) { Ok(arr) => arr, Err(e) => { @@ -709,7 +772,13 @@ mod single_row_array_transform { if self.error.is_some() { return None; } - self.recurse_into_struct_field(field) + + println!("transforming field: {:?}", field); + println!("nullable: {:?}", field.is_nullable()); + self.nullability_stack.push(field.is_nullable()); + self.recurse_into_struct_field(field); + self.nullability_stack.pop(); + Some(Cow::Borrowed(field)) } // arrays unsupported for now @@ -1134,11 +1203,7 @@ mod tests { "a", DeltaDataTypes::INTEGER, )])); - let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "a", - DataType::Int32, - false, - )])); + let expected_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); let expected = RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); assert_create_one(values, schema, expected); @@ -1192,7 +1257,7 @@ mod tests { StructField::not_null("c", DeltaDataTypes::INTEGER), ]), )])); - let expected_schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + let expected_schema = Arc::new(Schema::new(vec![Field::new( "a", DataType::Struct( vec![ @@ -1220,6 +1285,106 @@ mod tests { assert_create_one(values, schema, expected); } + // critical case: struct(x) [nullable] with (a [non-null], b [nullable]) fields. + // for scalars (null, null) the _only_ solution here is struct(x) is null struct (not two null + // leaves) + // + // vs. if a and b are both nullable, we just leave them as null and the struct is non-null + // + // thus, we trigger the 'nullable struct' case only if it is required. + #[test] + fn test_create_one_null_struct() { + let values: &[Scalar] = &[ + Scalar::Null(DeltaDataTypes::INTEGER), + Scalar::Null(DeltaDataTypes::INTEGER), + ]; + let schema = Arc::new(StructType::new([StructField::nullable( + "a", + DeltaDataTypes::struct_type([ + StructField::not_null("b", DeltaDataTypes::INTEGER), + StructField::nullable("c", DeltaDataTypes::INTEGER), + ]), + )])); + let struct_fields = vec![ + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, true), + ]; + let expected_schema = Arc::new(Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone().into()), + true, + )])); + // an array with one null struct + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::new_null(struct_fields.into(), 1))], + ) + .unwrap(); + assert_create_one(values, schema.clone(), expected); + + let err_values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; + let handler = ArrowExpressionHandler; + assert!(handler.create_one(schema, err_values).is_err()); // FIXME + } + + #[test] + fn test_create_one_null_fields() { + // vs if both fields are nullable, we just leave them as null + let values: &[Scalar] = &[ + Scalar::Null(DeltaDataTypes::INTEGER), + Scalar::Null(DeltaDataTypes::INTEGER), + ]; + let schema = Arc::new(StructType::new([StructField::nullable( + "a", + DeltaDataTypes::struct_type([ + StructField::nullable("b", DeltaDataTypes::INTEGER), + StructField::nullable("c", DeltaDataTypes::INTEGER), + ]), + )])); + let struct_fields = vec![ + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Int32, true), + ]; + let expected_schema = Arc::new(Schema::new(vec![Field::new( + "a", + DataType::Struct(struct_fields.clone().into()), + true, + )])); + // an array with one struct and two null fields + let expected = RecordBatch::try_new( + expected_schema, + vec![Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Int32, true)), + create_array!(Int32, [None]) as ArrayRef, + ), + ( + Arc::new(Field::new("c", DataType::Int32, true)), + create_array!(Int32, [None]) as ArrayRef, + ), + ]))], + ) + .unwrap(); + assert_create_one(values, schema, expected); + } + + #[test] + fn test_create_one_not_null_struct() { + let values: &[Scalar] = &[ + Scalar::Null(DeltaDataTypes::INTEGER), + Scalar::Null(DeltaDataTypes::INTEGER), + ]; + let schema = Arc::new(StructType::new([StructField::not_null( + "a", + DeltaDataTypes::struct_type([ + StructField::not_null("b", DeltaDataTypes::INTEGER), + StructField::nullable("c", DeltaDataTypes::INTEGER), + ]), + )])); + let handler = ArrowExpressionHandler; + assert!(handler.create_one(schema, values).is_err()); // FIXME + } + #[test] fn test_create_one_incorrect_schema() { let values = &["a".into()]; @@ -1233,7 +1398,7 @@ mod tests { } #[test] - fn test_create_one_incorrect_null() { + fn test_create_one_top_level_null() { let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; let schema = Arc::new(StructType::new([StructField::not_null( "col_1", @@ -1246,7 +1411,7 @@ mod tests { #[test] fn test_create_one_missing_values() { - let values = &["a".into()]; + let values = &[1.into()]; let schema = Arc::new(StructType::new([ StructField::nullable("col_1", DeltaDataTypes::INTEGER), StructField::nullable("col_2", DeltaDataTypes::INTEGER), @@ -1258,13 +1423,13 @@ mod tests { #[test] fn test_create_one_extra_values() { - let values = &["a".into(), "b".into(), "c".into()]; + let values = &[1.into(), 2.into(), 3.into()]; let schema = Arc::new(StructType::new([ StructField::nullable("col_1", DeltaDataTypes::INTEGER), StructField::nullable("col_2", DeltaDataTypes::INTEGER), ])); let handler = ArrowExpressionHandler; - matches!(handler.create_one(schema, values), Err(Error::Generic(_))); + assert!(handler.create_one(schema, values).is_err()); } } diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs index c54352d9f..0305824ce 100644 --- a/kernel/src/schema/mod.rs +++ b/kernel/src/schema/mod.rs @@ -286,15 +286,11 @@ impl StructType { self.fields.values() } - pub fn len(&self) -> usize { + pub fn fields_len(&self) -> usize { // O(1) for indexmap self.fields.len() } - pub fn is_empty(&self) -> bool { - self.fields.is_empty() - } - /// Extracts the name and type of all leaf columns, in schema order. Caller should pass Some /// `own_name` if this schema is embedded in a larger struct (e.g. `add.*`) and None if the /// schema is a top-level result (e.g. `*`). @@ -1212,4 +1208,24 @@ mod tests { "[\"an\",\"array\"]" ); } + + #[test] + fn test_fields_len() { + let schema = StructType::new([]); + assert!(schema.fields_len() == 0); + let schema = StructType::new([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + StructField::nullable("c", DataType::LONG), + StructField::nullable("d", DataType::LONG), + ]); + assert_eq!(schema.fields_len(), 4); + let schema = StructType::new([ + StructField::nullable("b", DataType::LONG), + StructField::not_null("b", DataType::LONG), + StructField::nullable("c", DataType::LONG), + StructField::nullable("c", DataType::LONG), + ]); + assert_eq!(schema.fields_len(), 2); + } } From 2aa3a52d365036f9e6a0dea8e55ad2cb3c5d8ad4 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Tue, 4 Feb 2025 11:11:23 -0800 Subject: [PATCH 08/13] checkpoint - all working --- kernel/src/engine/arrow_expression.rs | 410 ++++++++++++++++++++++---- 1 file changed, 358 insertions(+), 52 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 5e0a22eaf..97d165963 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -540,6 +540,11 @@ impl ExpressionHandler for ArrowExpressionHandler { let array = array_transform.into_struct_array()?; let struct_array = array.as_struct_opt().unwrap(); // FIXME + // detect top-level null + if struct_array.is_null(0) { + return Err(Error::generic("Top-level null in single-row array")); + } + let record_batch: RecordBatch = struct_array.into(); // FIXME Ok(Box::new(ArrowEngineData::new(record_batch))) } @@ -547,7 +552,6 @@ impl ExpressionHandler for ArrowExpressionHandler { mod single_row_array_transform { use std::borrow::Cow; - use std::mem; use std::sync::Arc; use arrow_array::{ArrayRef, StructArray}; @@ -573,7 +577,6 @@ mod single_row_array_transform { stack: Vec, /// If an error occurs, it will be stored here. error: Option, - nullability_stack: Vec, } ///// Any error for the single-row array transform. @@ -600,13 +603,12 @@ mod single_row_array_transform { //} // // TODO ERRORS - impl<'a, T: Iterator> SingleRowArrayTransform<'a, T> { - pub(crate) fn new(scalars: impl IntoIterator) -> Self { + impl<'a, I: Iterator> SingleRowArrayTransform<'a, I> { + pub(crate) fn new(scalars: impl IntoIterator) -> Self { Self { scalars: scalars.into_iter(), stack: Vec::new(), error: None, - nullability_stack: vec![false], } } @@ -636,10 +638,20 @@ mod single_row_array_transform { } fn set_error(&mut self, e: Error) { - if let Some(err) = &self.error { + if let Some(err) = &self.error.replace(e) { debug!("Overwriting error that was already set: {err}"); } - self.error = Some(e); + } + + // TODO use check_error everywhere + fn check_error>(&mut self, result: Result) -> Option { + match result { + Ok(val) => Some(val), + Err(err) => { + self.set_error(err.into()); + None + } + } } } @@ -653,41 +665,26 @@ mod single_row_array_transform { return None; } - let scalar = match self.scalars.next() { - Some(s) => s, - None => { - self.set_error(Error::generic( - "Not enough scalars to create a single-row array", - )); - return None; - } - }; + let next = self.scalars.next(); + let scalar = self.check_error(next.ok_or(Error::generic( + "Not enough scalars to create a single-row array", + )))?; - match scalar.data_type() { - DataType::Primitive(scalar_type) => { - if scalar_type != *prim_type { - self.set_error(Error::Generic(format!( - "Mismatched scalar type creating a single-row array: expected {}, got {}", - prim_type, scalar_type - ))); - return None; - } - } - _ => { - self.set_error(Error::generic( - "Non-primitive scalar type {datatype} provided", - )); - return None; - } + let DataType::Primitive(scalar_type) = scalar.data_type() else { + self.set_error(Error::generic( + "Non-primitive scalar type {datatype} provided", + )); + return None; + }; + if scalar_type != *prim_type { + self.set_error(Error::Generic(format!( + "Mismatched scalar type creating a single-row array: expected {}, got {}", + prim_type, scalar_type + ))); + return None; } - let arr = match scalar.to_array(1) { - Ok(a) => a, - Err(e) => { - self.set_error(e); - return None; - } - }; + let arr = self.check_error(scalar.to_array(1))?; self.stack.push(arr); Some(Cow::Borrowed(prim_type)) @@ -699,19 +696,67 @@ mod single_row_array_transform { return None; } - self.recurse_into_struct(struct_type)?; + // Only consume newly-added entries (if any). There could be fewer than expected if + // the recursion encountered an error. + let mark = self.stack.len(); + let _ = self.recurse_into_struct(struct_type); // change + //self.recurse_into_struct(struct_type)?; + let field_arrays = self.stack.split_off(mark); + if self.error.is_some() { + return None; + } + + if field_arrays.len() != struct_type.fields_len() { + self.set_error(Error::generic( + "Not enough child arrays to create a single-row struct", + )); + return None; + } + + let mut found_non_nullable_null = false; + let mut all_null = true; + for (f, v) in struct_type.fields().zip(&field_arrays) { + if v.is_valid(0) { + all_null = false; + } else if !f.is_nullable() { + found_non_nullable_null = true; + } + } + + let null_buffer = found_non_nullable_null.then(|| { + // The struct had a non-nullable NULL. This is only legal if all fields were NULL, which we + // interpret as the struct itself being NULL. + // require!(all_null, ...); + if !all_null { + self.set_error(Error::Generic( + "Non-nullable field is null in single-row struct".to_string(), + )); + // return None; TODO + } + + // We already have the all-null columns we need, just need a null buffer + use arrow_buffer::NullBuffer; + NullBuffer::new_null(1) + }); + + /* let num_fields = struct_type.fields.len(); let stack_len = self.stack.len(); - if stack_len != num_fields { + if stack_len < num_fields { self.set_error(Error::Generic(format!( - "Incorrect number of child arrays to create a single-row struct: expected {}, got {}", + "Not enough child arrays to create a single-row struct: expected {}, got {}", num_fields, stack_len ))); return None; } - let child_arrays = mem::take(&mut self.stack); + // pop in reverse + let mut child_arrays = Vec::with_capacity(num_fields); + for _ in 0..num_fields { + child_arrays.push(self.stack.pop().unwrap()); + } + child_arrays.reverse(); let fields = match struct_type.fields().map(ArrowField::try_from).try_collect() { Ok(f) => f, @@ -752,14 +797,13 @@ mod single_row_array_transform { } } + */ + let fields = + self.check_error(struct_type.fields().map(ArrowField::try_from).try_collect())?; // null buffer is none since we catch the null struct case above - let array = match StructArray::try_new(fields, child_arrays, None) { - Ok(arr) => arr, - Err(e) => { - self.set_error(Error::Arrow(e)); - return None; - } - }; + //let array = self.check_error(StructArray::try_new(fields, child_arrays, None))?; + let array = + self.check_error(StructArray::try_new(fields, field_arrays, null_buffer))?; self.stack.push(Arc::new(array)); Some(Cow::Borrowed(struct_type)) } @@ -775,9 +819,7 @@ mod single_row_array_transform { println!("transforming field: {:?}", field); println!("nullable: {:?}", field.is_nullable()); - self.nullability_stack.push(field.is_nullable()); self.recurse_into_struct_field(field); - self.nullability_stack.pop(); Some(Cow::Borrowed(field)) } @@ -1327,6 +1369,66 @@ mod tests { assert!(handler.create_one(schema, err_values).is_err()); // FIXME } + #[test] + fn test_create_one_many_structs() { + let values: &[Scalar] = &[1.into(), 2.into(), 3.into(), 4.into()]; + let schema = Arc::new(StructType::new([ + StructField::nullable( + "x", + DeltaDataTypes::struct_type([ + StructField::not_null("a", DeltaDataTypes::INTEGER), + StructField::nullable("b", DeltaDataTypes::INTEGER), + ]), + ), + StructField::nullable( + "y", + DeltaDataTypes::struct_type([ + StructField::not_null("c", DeltaDataTypes::INTEGER), + StructField::nullable("d", DeltaDataTypes::INTEGER), + ]), + ), + ])); + let struct_fields_1 = vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, true), + ]; + let struct_fields_2 = vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, true), + ]; + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("x", DataType::Struct(struct_fields_1.clone().into()), true), + Field::new("y", DataType::Struct(struct_fields_2.clone().into()), true), + ])); + let expected = RecordBatch::try_new( + expected_schema, + vec![ + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("a", DataType::Int32, false)), + create_array!(Int32, [1]) as ArrayRef, + ), + ( + Arc::new(Field::new("b", DataType::Int32, true)), + create_array!(Int32, [2]) as ArrayRef, + ), + ])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("c", DataType::Int32, false)), + create_array!(Int32, [3]) as ArrayRef, + ), + ( + Arc::new(Field::new("d", DataType::Int32, true)), + create_array!(Int32, [4]) as ArrayRef, + ), + ])), + ], + ) + .unwrap(); + assert_create_one(values, schema.clone(), expected); + } + #[test] fn test_create_one_null_fields() { // vs if both fields are nullable, we just leave them as null @@ -1432,4 +1534,208 @@ mod tests { let handler = ArrowExpressionHandler; assert!(handler.create_one(schema, values).is_err()); } + + #[derive(Clone, Copy)] + struct TestSchema { + x_nullable: bool, + a_nullable: bool, + b_nullable: bool, + } + struct TestGroup { + schema: TestSchema, + expected: [Expected; 4], + } + enum Expected { + Noop, + NullStruct, + Error, + } + + impl TestGroup { + fn run(self) { + let value_groups = [ + (Some(1), Some(2)), + (None, Some(2)), + (Some(1), None), + (None, None), + ]; + for (i, (a_val, b_val)) in value_groups.into_iter().enumerate() { + let a = match a_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let b = match b_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let values: &[Scalar] = &[a, b]; + let schema = Arc::new(StructType::new([StructField::new( + "x", + DeltaDataTypes::struct_type([ + StructField::new("a", DeltaDataTypes::INTEGER, self.schema.a_nullable), + StructField::new("b", DeltaDataTypes::INTEGER, self.schema.b_nullable), + ]), + self.schema.x_nullable, + )])); + + let struct_fields = vec![ + Field::new("a", DataType::Int32, self.schema.a_nullable), + Field::new("b", DataType::Int32, self.schema.b_nullable), + ]; + let expected_schema = Arc::new(Schema::new(vec![Field::new( + "x", + DataType::Struct(struct_fields.clone().into()), + self.schema.x_nullable, + )])); + + let expected_struct_array = match self.expected[i] { + Expected::Noop => StructArray::from(vec![ + ( + Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)), + create_array!(Int32, [a_val]) as ArrayRef, + ), + ( + Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)), + create_array!(Int32, [b_val]) as ArrayRef, + ), + ]), + Expected::NullStruct => StructArray::new_null(struct_fields.into(), 1), + Expected::Error => { + let handler = ArrowExpressionHandler; + assert!(handler.create_one(schema, values).is_err()); + return; + } + }; + + let expected = + RecordBatch::try_new(expected_schema, vec![Arc::new(expected_struct_array)]) + .unwrap(); + assert_create_one(values, schema, expected); + } + } + } + + #[test] + fn test_create_one_nullability_combinations() { + // n { n, n } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + + // + // n { n, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL + // + // n { !, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL + // + // ! { n, n } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + // + // ! { n, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> Err + // + // ! { !, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> Err + let tests = [ + TestGroup { + schema: TestSchema { + x_nullable: true, + a_nullable: true, + b_nullable: true, + }, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Noop, + Expected::Noop, + ], + }, + TestGroup { + schema: TestSchema { + x_nullable: true, + a_nullable: true, + b_nullable: false, + }, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Error, + Expected::NullStruct, + ], + }, + TestGroup { + schema: TestSchema { + x_nullable: true, + a_nullable: false, + b_nullable: false, + }, + expected: [ + Expected::Noop, + Expected::Error, + Expected::Error, + Expected::NullStruct, + ], + }, + TestGroup { + schema: TestSchema { + x_nullable: false, + a_nullable: true, + b_nullable: true, + }, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Noop, + Expected::Noop, + ], + }, + TestGroup { + schema: TestSchema { + x_nullable: false, + a_nullable: true, + b_nullable: false, + }, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Error, + Expected::Error, + ], + }, + TestGroup { + schema: TestSchema { + x_nullable: false, + a_nullable: false, + b_nullable: false, + }, + expected: [ + Expected::Noop, + Expected::Error, + Expected::Error, + Expected::Error, + ], + }, + ]; + + for test_group in tests { + test_group.run() + } + } } From a4237b2d03d0fe9ffe095a808c6d85cb1e7dc1ee Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Tue, 4 Feb 2025 11:56:25 -0800 Subject: [PATCH 09/13] all null children = null struct --- kernel/src/engine/arrow_expression.rs | 315 +++++++++----------------- 1 file changed, 103 insertions(+), 212 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 97d165963..32bd63bbd 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -555,6 +555,7 @@ mod single_row_array_transform { use std::sync::Arc; use arrow_array::{ArrayRef, StructArray}; + use arrow_buffer::NullBuffer; use arrow_schema::DataType as ArrowDataType; use arrow_schema::Field as ArrowField; use itertools::Itertools; @@ -713,97 +714,17 @@ mod single_row_array_transform { return None; } - let mut found_non_nullable_null = false; - let mut all_null = true; - for (f, v) in struct_type.fields().zip(&field_arrays) { - if v.is_valid(0) { - all_null = false; - } else if !f.is_nullable() { - found_non_nullable_null = true; - } - } - - let null_buffer = found_non_nullable_null.then(|| { - // The struct had a non-nullable NULL. This is only legal if all fields were NULL, which we - // interpret as the struct itself being NULL. - // require!(all_null, ...); - if !all_null { - self.set_error(Error::Generic( - "Non-nullable field is null in single-row struct".to_string(), - )); - // return None; TODO - } - - // We already have the all-null columns we need, just need a null buffer - use arrow_buffer::NullBuffer; - NullBuffer::new_null(1) - }); - - /* - - let num_fields = struct_type.fields.len(); - let stack_len = self.stack.len(); - if stack_len < num_fields { - self.set_error(Error::Generic(format!( - "Not enough child arrays to create a single-row struct: expected {}, got {}", - num_fields, stack_len - ))); - return None; - } - - // pop in reverse - let mut child_arrays = Vec::with_capacity(num_fields); - for _ in 0..num_fields { - child_arrays.push(self.stack.pop().unwrap()); - } - child_arrays.reverse(); - - let fields = match struct_type.fields().map(ArrowField::try_from).try_collect() { - Ok(f) => f, - Err(e) => { - self.set_error(Error::Arrow(e)); - return None; - } - }; - - // TODO track if ancestor/me is nullable - // if yes, are any of my children _directly_ not nullable? if yes, check if values are - // null - // - // do i have direct children which are non nullable with value null. - // -> verify all other children are all null - // -> if not, error - // -> if yes, create a null for the struct - - println!("transforming: {:?}", struct_type); - - // check if any child array is null and has a non-nullable field - for (child, field) in child_arrays.iter().zip(struct_type.fields()) { - if !field.is_nullable() && child.is_null(0) { - // if we have a null child array for a not-nullable field, either all other - // children must be null (and we make a null struct) or error - if child_arrays.iter().all(|c| c.is_null(0)) - && self.nullability_stack.iter().any(|n| *n) - { - self.stack.push(Arc::new(StructArray::new_null(fields, 1))); - return Some(Cow::Borrowed(struct_type)); - } else { - self.set_error(Error::Generic(format!( - "Non-nullable field {} is null in single-row struct", - field.name() - ))); - return None; - } - } - } - - */ let fields = self.check_error(struct_type.fields().map(ArrowField::try_from).try_collect())?; - // null buffer is none since we catch the null struct case above - //let array = self.check_error(StructArray::try_new(fields, child_arrays, None))?; + // If all children are NULL, interpret the struct itself as being NULL. This is always legal, + // and is required for correctness if any field is non-nullable. + let null_buffer = field_arrays + .iter() + .all(|v| v.is_null(0)) + .then(|| NullBuffer::new_null(1)); let array = self.check_error(StructArray::try_new(fields, field_arrays, null_buffer))?; + self.stack.push(Arc::new(array)); Some(Cow::Borrowed(struct_type)) } @@ -817,8 +738,6 @@ mod single_row_array_transform { return None; } - println!("transforming field: {:?}", field); - println!("nullable: {:?}", field.is_nullable()); self.recurse_into_struct_field(field); Some(Cow::Borrowed(field)) } @@ -1190,52 +1109,52 @@ mod tests { } // helper to take values/schema to pass to `create_one` and assert the result = expected - fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: RecordBatch) { - let handler = ArrowExpressionHandler; - let actual = handler.create_one(schema, values).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); - assert_eq!(actual_rb, expected); + fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: StructArray) { + //let handler = ArrowExpressionHandler; + //let actual = handler.create_one(schema, values).unwrap(); + //let actual_rb: RecordBatch = actual + // .into_any() + // .downcast::() + // .unwrap() + // .into(); + //assert_eq!(actual_rb, expected); + + ////// FIXME + let mut array_transform = SingleRowArrayTransform::new(values); + let datatype = schema.into(); + // we build up the array within the `SingleRowArrayTransform` - we don't actually care + // about the 'transformed' type + let _transformed = array_transform.transform(&datatype); + let array = array_transform.into_struct_array().unwrap(); + let struct_array = array.as_struct_opt().unwrap(); // FIXME + ////// + + assert_eq!(struct_array, &expected); } #[test] fn test_create_one() { - let values: &[Scalar] = &[1.into(), 2.into(), 3.into()]; + let values: &[Scalar] = &[ + 1.into(), + "B".into(), + 3.into(), + Scalar::Null(DeltaDataTypes::INTEGER), + ]; let schema = Arc::new(StructType::new([ StructField::nullable("a", DeltaDataTypes::INTEGER), - StructField::nullable("b", DeltaDataTypes::INTEGER), + StructField::nullable("b", DeltaDataTypes::STRING), StructField::nullable("c", DeltaDataTypes::INTEGER), + StructField::nullable("d", DeltaDataTypes::INTEGER), ])); - let expected = - record_batch!(("a", Int32, [1]), ("b", Int32, [2]), ("c", Int32, [3])).unwrap(); - assert_create_one(values, schema, expected); - } - - #[test] - fn test_create_one_string() { - let values = &["a".into()]; - let schema = Arc::new(StructType::new([StructField::nullable( - "col_1", - DeltaDataTypes::STRING, - )])); - - let expected = record_batch!(("col_1", Utf8, ["a"])).unwrap(); - assert_create_one(values, schema, expected); - } - - #[test] - fn test_create_one_null() { - let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; - let schema = Arc::new(StructType::new([StructField::nullable( - "col_1", - DeltaDataTypes::INTEGER, - )])); - let expected = record_batch!(("col_1", Int32, [None])).unwrap(); - assert_create_one(values, schema, expected); + let expected = record_batch!( + ("a", Int32, [1]), + ("b", Utf8, ["B"]), + ("c", Int32, [3]), + ("d", Int32, [None]) + ) + .unwrap(); + assert_create_one(values, schema, expected.into()); } #[test] @@ -1248,7 +1167,7 @@ mod tests { let expected_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); let expected = RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); - assert_create_one(values, schema, expected); + assert_create_one(values, schema, expected.into()); } #[test] @@ -1286,7 +1205,7 @@ mod tests { ]))], ) .unwrap(); - assert_create_one(values, schema, expected); + assert_create_one(values, schema, expected.into()); } #[test] @@ -1324,7 +1243,7 @@ mod tests { ]))], ) .unwrap(); - assert_create_one(values, schema, expected); + assert_create_one(values, schema, expected.into()); } // critical case: struct(x) [nullable] with (a [non-null], b [nullable]) fields. @@ -1337,32 +1256,37 @@ mod tests { #[test] fn test_create_one_null_struct() { let values: &[Scalar] = &[ + 1.into(), // need an int here so everything doesn't collapse into only one null struct Scalar::Null(DeltaDataTypes::INTEGER), Scalar::Null(DeltaDataTypes::INTEGER), ]; - let schema = Arc::new(StructType::new([StructField::nullable( - "a", - DeltaDataTypes::struct_type([ - StructField::not_null("b", DeltaDataTypes::INTEGER), - StructField::nullable("c", DeltaDataTypes::INTEGER), - ]), - )])); + let schema = Arc::new(StructType::new([ + StructField::nullable("a", DeltaDataTypes::INTEGER), + StructField::nullable( + "x", + DeltaDataTypes::struct_type([ + StructField::not_null("b", DeltaDataTypes::INTEGER), + StructField::nullable("c", DeltaDataTypes::INTEGER), + ]), + ), + ])); let struct_fields = vec![ Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, true), ]; - let expected_schema = Arc::new(Schema::new(vec![Field::new( - "a", - DataType::Struct(struct_fields.clone().into()), - true, - )])); - // an array with one null struct + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("x", DataType::Struct(struct_fields.clone().into()), true), + ])); let expected = RecordBatch::try_new( expected_schema, - vec![Arc::new(StructArray::new_null(struct_fields.into(), 1))], + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StructArray::new_null(struct_fields.into(), 1)), + ], ) .unwrap(); - assert_create_one(values, schema.clone(), expected); + assert_create_one(values, schema.clone(), expected.into()); let err_values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; let handler = ArrowExpressionHandler; @@ -1426,48 +1350,7 @@ mod tests { ], ) .unwrap(); - assert_create_one(values, schema.clone(), expected); - } - - #[test] - fn test_create_one_null_fields() { - // vs if both fields are nullable, we just leave them as null - let values: &[Scalar] = &[ - Scalar::Null(DeltaDataTypes::INTEGER), - Scalar::Null(DeltaDataTypes::INTEGER), - ]; - let schema = Arc::new(StructType::new([StructField::nullable( - "a", - DeltaDataTypes::struct_type([ - StructField::nullable("b", DeltaDataTypes::INTEGER), - StructField::nullable("c", DeltaDataTypes::INTEGER), - ]), - )])); - let struct_fields = vec![ - Field::new("b", DataType::Int32, true), - Field::new("c", DataType::Int32, true), - ]; - let expected_schema = Arc::new(Schema::new(vec![Field::new( - "a", - DataType::Struct(struct_fields.clone().into()), - true, - )])); - // an array with one struct and two null fields - let expected = RecordBatch::try_new( - expected_schema, - vec![Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("b", DataType::Int32, true)), - create_array!(Int32, [None]) as ArrayRef, - ), - ( - Arc::new(Field::new("c", DataType::Int32, true)), - create_array!(Int32, [None]) as ArrayRef, - ), - ]))], - ) - .unwrap(); - assert_create_one(values, schema, expected); + assert_create_one(values, schema.clone(), expected.into()); } #[test] @@ -1502,12 +1385,18 @@ mod tests { #[test] fn test_create_one_top_level_null() { let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; + let handler = ArrowExpressionHandler; + let schema = Arc::new(StructType::new([StructField::not_null( "col_1", DeltaDataTypes::INTEGER, )])); + matches!(handler.create_one(schema, values), Err(Error::Arrow(_))); - let handler = ArrowExpressionHandler; + let schema = Arc::new(StructType::new([StructField::nullable( + "col_1", + DeltaDataTypes::INTEGER, + )])); matches!(handler.create_one(schema, values), Err(Error::Arrow(_))); } @@ -1582,35 +1471,37 @@ mod tests { Field::new("a", DataType::Int32, self.schema.a_nullable), Field::new("b", DataType::Int32, self.schema.b_nullable), ]; - let expected_schema = Arc::new(Schema::new(vec![Field::new( - "x", - DataType::Struct(struct_fields.clone().into()), - self.schema.x_nullable, - )])); + let field_a = Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)); + let field_b = Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)); let expected_struct_array = match self.expected[i] { - Expected::Noop => StructArray::from(vec![ - ( - Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)), - create_array!(Int32, [a_val]) as ArrayRef, - ), - ( - Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)), - create_array!(Int32, [b_val]) as ArrayRef, - ), - ]), - Expected::NullStruct => StructArray::new_null(struct_fields.into(), 1), + Expected::Noop => StructArray::from(vec![( + Arc::new(Field::new( + "x", + DataType::Struct(struct_fields.into()), + self.schema.x_nullable, + )), + Arc::new(StructArray::from(vec![ + (field_a, create_array!(Int32, [a_val]) as ArrayRef), + (field_b, create_array!(Int32, [b_val]) as ArrayRef), + ])) as ArrayRef, + )]), + Expected::NullStruct => StructArray::new_null( + vec![Field::new( + "x", + DataType::Struct(struct_fields.into()), + self.schema.x_nullable, + )] + .into(), + 1, + ), Expected::Error => { let handler = ArrowExpressionHandler; assert!(handler.create_one(schema, values).is_err()); return; } }; - - let expected = - RecordBatch::try_new(expected_schema, vec![Arc::new(expected_struct_array)]) - .unwrap(); - assert_create_one(values, schema, expected); + assert_create_one(values, schema, expected_struct_array); } } } @@ -1621,7 +1512,7 @@ mod tests { // 1. (a, b) -> x (a, b) // 2. (N, b) -> x (N, b) // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x (N, N) + // 4. (N, N) -> x NULL // // n { n, ! } @@ -1640,7 +1531,7 @@ mod tests { // 1. (a, b) -> x (a, b) // 2. (N, b) -> x (N, b) // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x (N, N) + // 4. (N, N) -> x NULL // // ! { n, ! } // 1. (a, b) -> x (a, b) @@ -1664,7 +1555,7 @@ mod tests { Expected::Noop, Expected::Noop, Expected::Noop, - Expected::Noop, + Expected::NullStruct, ], }, TestGroup { @@ -1703,7 +1594,7 @@ mod tests { Expected::Noop, Expected::Noop, Expected::Noop, - Expected::Noop, + Expected::NullStruct, ], }, TestGroup { From 1bf432e6e47daf932b47e0d20da1031445e7f79c Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 5 Feb 2025 11:49:12 -0800 Subject: [PATCH 10/13] new mod --- ffi/src/error.rs | 2 + kernel/src/engine/arrow_expression.rs | 460 +--------------- .../arrow_expression/single_row_transform.rs | 506 ++++++++++++++++++ kernel/src/error.rs | 5 + 4 files changed, 526 insertions(+), 447 deletions(-) create mode 100644 kernel/src/engine/arrow_expression/single_row_transform.rs diff --git a/ffi/src/error.rs b/ffi/src/error.rs index a615d0330..508c1465b 100644 --- a/ffi/src/error.rs +++ b/ffi/src/error.rs @@ -52,6 +52,7 @@ pub enum KernelError { ChangeDataFeedUnsupported, ChangeDataFeedIncompatibleSchema, InvalidCheckpoint, + SingleRowTransformError, } impl From for KernelError { @@ -110,6 +111,7 @@ impl From for KernelError { KernelError::ChangeDataFeedIncompatibleSchema } Error::InvalidCheckpoint(_) => KernelError::InvalidCheckpoint, + Error::SingleRowTransformError(_) => KernelError::SingleRowTransformError, } } } diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 32bd63bbd..58755c3c8 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -36,7 +36,9 @@ use crate::schema::{ ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, }; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; -use single_row_array_transform::SingleRowArrayTransform; + +pub(crate) mod single_row_transform; +use single_row_transform::SingleRowTransform; // TODO leverage scalars / Datum @@ -532,7 +534,7 @@ impl ExpressionHandler for ArrowExpressionHandler { /// TODO fn create_one(&self, schema: SchemaRef, values: &[Scalar]) -> DeltaResult> { - let mut array_transform = SingleRowArrayTransform::new(values); + let mut array_transform = SingleRowTransform::new(values); let datatype = schema.into(); // we build up the array within the `SingleRowArrayTransform` - we don't actually care // about the 'transformed' type @@ -550,224 +552,6 @@ impl ExpressionHandler for ArrowExpressionHandler { } } -mod single_row_array_transform { - use std::borrow::Cow; - use std::sync::Arc; - - use arrow_array::{ArrayRef, StructArray}; - use arrow_buffer::NullBuffer; - use arrow_schema::DataType as ArrowDataType; - use arrow_schema::Field as ArrowField; - use itertools::Itertools; - use tracing::debug; - - use crate::error::{DeltaResult, Error}; - use crate::expressions::Scalar; - use crate::schema::{ - ArrayType, DataType, MapType, PrimitiveType, SchemaTransform, StructField, StructType, - }; - - /// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as - /// Scalar slice) into an arrow Array with a single row of each literal. - #[derive(Debug)] - pub(crate) struct SingleRowArrayTransform<'a, T: Iterator> { - /// Leaf-node values to insert in schema order. - scalars: T, - /// A stack of built arrays. After visiting children, we pop them off to - /// build the parent container, then push the parent back on. - stack: Vec, - /// If an error occurs, it will be stored here. - error: Option, - } - - ///// Any error for the single-row array transform. - //#[derive(thiserror::Error, Debug)] - //pub enum Error { - // /// An error performing operations on arrow data - // #[error(transparent)] - // Arrow(arrow_schema::ArrowError), - // - // /// A generic error with a message - // #[error("Schema error: {0}")] - // Schema(String), - // - // /// TODO - // #[error("stack error: {0}")] - // Stack(String), - //} - // - //impl Error { - // /// Create a generic error with a message - // pub fn generic(msg: impl Into) -> Self { - // Self::Schema(msg.into()) - // } - //} - // - // TODO ERRORS - impl<'a, I: Iterator> SingleRowArrayTransform<'a, I> { - pub(crate) fn new(scalars: impl IntoIterator) -> Self { - Self { - scalars: scalars.into_iter(), - stack: Vec::new(), - error: None, - } - } - - /// return the single-row array (or propagate Error). the top of `stack` should be our - /// single-row struct array - pub(crate) fn into_struct_array(mut self) -> DeltaResult { - if let Some(e) = self.error { - return Err(e); - } - - if self.scalars.next().is_some() { - return Err(Error::generic( - "Excess scalars given to create a single-row array", - )); - } - - match self.stack.pop() { - Some(array) if self.stack.is_empty() => match array.data_type() { - ArrowDataType::Struct(_) => Ok(array), - _ => Err(Error::generic("Expected struct array")), - }, - Some(_) => Err(Error::generic( - "additional arrays left in the stack (possibly too many scalars)", - )), - None => Err(Error::generic("no array present in the stack")), - } - } - - fn set_error(&mut self, e: Error) { - if let Some(err) = &self.error.replace(e) { - debug!("Overwriting error that was already set: {err}"); - } - } - - // TODO use check_error everywhere - fn check_error>(&mut self, result: Result) -> Option { - match result { - Ok(val) => Some(val), - Err(err) => { - self.set_error(err.into()); - None - } - } - } - } - - impl<'a, T: Iterator> SchemaTransform<'a> for SingleRowArrayTransform<'a, T> { - fn transform_primitive( - &mut self, - prim_type: &'a PrimitiveType, - ) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } - - let next = self.scalars.next(); - let scalar = self.check_error(next.ok_or(Error::generic( - "Not enough scalars to create a single-row array", - )))?; - - let DataType::Primitive(scalar_type) = scalar.data_type() else { - self.set_error(Error::generic( - "Non-primitive scalar type {datatype} provided", - )); - return None; - }; - if scalar_type != *prim_type { - self.set_error(Error::Generic(format!( - "Mismatched scalar type creating a single-row array: expected {}, got {}", - prim_type, scalar_type - ))); - return None; - } - - let arr = self.check_error(scalar.to_array(1))?; - self.stack.push(arr); - - Some(Cow::Borrowed(prim_type)) - } - - fn transform_struct(&mut self, struct_type: &'a StructType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } - - // Only consume newly-added entries (if any). There could be fewer than expected if - // the recursion encountered an error. - let mark = self.stack.len(); - let _ = self.recurse_into_struct(struct_type); // change - //self.recurse_into_struct(struct_type)?; - let field_arrays = self.stack.split_off(mark); - if self.error.is_some() { - return None; - } - - if field_arrays.len() != struct_type.fields_len() { - self.set_error(Error::generic( - "Not enough child arrays to create a single-row struct", - )); - return None; - } - - let fields = - self.check_error(struct_type.fields().map(ArrowField::try_from).try_collect())?; - // If all children are NULL, interpret the struct itself as being NULL. This is always legal, - // and is required for correctness if any field is non-nullable. - let null_buffer = field_arrays - .iter() - .all(|v| v.is_null(0)) - .then(|| NullBuffer::new_null(1)); - let array = - self.check_error(StructArray::try_new(fields, field_arrays, null_buffer))?; - - self.stack.push(Arc::new(array)); - Some(Cow::Borrowed(struct_type)) - } - - fn transform_struct_field( - &mut self, - field: &'a StructField, - ) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } - - self.recurse_into_struct_field(field); - Some(Cow::Borrowed(field)) - } - - // arrays unsupported for now - fn transform_array(&mut self, _array_type: &'a ArrayType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } - self.set_error(Error::unsupported( - "ArrayType not yet supported for creating single-row array", - )); - None - } - - // maps unsupported for now - fn transform_map(&mut self, _map_type: &'a MapType) -> Option> { - // first always check error to terminate early if possible - if self.error.is_some() { - return None; - } - self.set_error(Error::unsupported( - "MapType not yet supported for creating single-row array", - )); - None - } - } -} - #[derive(Debug)] pub struct DefaultExpressionEvaluator { input_schema: SchemaRef, @@ -1109,27 +893,15 @@ mod tests { } // helper to take values/schema to pass to `create_one` and assert the result = expected - fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: StructArray) { - //let handler = ArrowExpressionHandler; - //let actual = handler.create_one(schema, values).unwrap(); - //let actual_rb: RecordBatch = actual - // .into_any() - // .downcast::() - // .unwrap() - // .into(); - //assert_eq!(actual_rb, expected); - - ////// FIXME - let mut array_transform = SingleRowArrayTransform::new(values); - let datatype = schema.into(); - // we build up the array within the `SingleRowArrayTransform` - we don't actually care - // about the 'transformed' type - let _transformed = array_transform.transform(&datatype); - let array = array_transform.into_struct_array().unwrap(); - let struct_array = array.as_struct_opt().unwrap(); // FIXME - ////// - - assert_eq!(struct_array, &expected); + fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: RecordBatch) { + let handler = ArrowExpressionHandler; + let actual = handler.create_one(schema, values).unwrap(); + let actual_rb: RecordBatch = actual + .into_any() + .downcast::() + .unwrap() + .into(); + assert_eq!(actual_rb, expected); } #[test] @@ -1423,210 +1195,4 @@ mod tests { let handler = ArrowExpressionHandler; assert!(handler.create_one(schema, values).is_err()); } - - #[derive(Clone, Copy)] - struct TestSchema { - x_nullable: bool, - a_nullable: bool, - b_nullable: bool, - } - struct TestGroup { - schema: TestSchema, - expected: [Expected; 4], - } - enum Expected { - Noop, - NullStruct, - Error, - } - - impl TestGroup { - fn run(self) { - let value_groups = [ - (Some(1), Some(2)), - (None, Some(2)), - (Some(1), None), - (None, None), - ]; - for (i, (a_val, b_val)) in value_groups.into_iter().enumerate() { - let a = match a_val { - Some(v) => Scalar::Integer(v), - None => Scalar::Null(DeltaDataTypes::INTEGER), - }; - let b = match b_val { - Some(v) => Scalar::Integer(v), - None => Scalar::Null(DeltaDataTypes::INTEGER), - }; - let values: &[Scalar] = &[a, b]; - let schema = Arc::new(StructType::new([StructField::new( - "x", - DeltaDataTypes::struct_type([ - StructField::new("a", DeltaDataTypes::INTEGER, self.schema.a_nullable), - StructField::new("b", DeltaDataTypes::INTEGER, self.schema.b_nullable), - ]), - self.schema.x_nullable, - )])); - - let struct_fields = vec![ - Field::new("a", DataType::Int32, self.schema.a_nullable), - Field::new("b", DataType::Int32, self.schema.b_nullable), - ]; - - let field_a = Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)); - let field_b = Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)); - let expected_struct_array = match self.expected[i] { - Expected::Noop => StructArray::from(vec![( - Arc::new(Field::new( - "x", - DataType::Struct(struct_fields.into()), - self.schema.x_nullable, - )), - Arc::new(StructArray::from(vec![ - (field_a, create_array!(Int32, [a_val]) as ArrayRef), - (field_b, create_array!(Int32, [b_val]) as ArrayRef), - ])) as ArrayRef, - )]), - Expected::NullStruct => StructArray::new_null( - vec![Field::new( - "x", - DataType::Struct(struct_fields.into()), - self.schema.x_nullable, - )] - .into(), - 1, - ), - Expected::Error => { - let handler = ArrowExpressionHandler; - assert!(handler.create_one(schema, values).is_err()); - return; - } - }; - assert_create_one(values, schema, expected_struct_array); - } - } - } - - #[test] - fn test_create_one_nullability_combinations() { - // n { n, n } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x NULL - - // - // n { n, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> Err - // 4. (N, N) -> x NULL - // - // n { !, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> Err - // 3. (a, N) -> Err - // 4. (N, N) -> x NULL - // - // ! { n, n } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x NULL - // - // ! { n, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> Err - // 4. (N, N) -> Err - // - // ! { !, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> Err - // 3. (a, N) -> Err - // 4. (N, N) -> Err - let tests = [ - TestGroup { - schema: TestSchema { - x_nullable: true, - a_nullable: true, - b_nullable: true, - }, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Noop, - Expected::NullStruct, - ], - }, - TestGroup { - schema: TestSchema { - x_nullable: true, - a_nullable: true, - b_nullable: false, - }, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Error, - Expected::NullStruct, - ], - }, - TestGroup { - schema: TestSchema { - x_nullable: true, - a_nullable: false, - b_nullable: false, - }, - expected: [ - Expected::Noop, - Expected::Error, - Expected::Error, - Expected::NullStruct, - ], - }, - TestGroup { - schema: TestSchema { - x_nullable: false, - a_nullable: true, - b_nullable: true, - }, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Noop, - Expected::NullStruct, - ], - }, - TestGroup { - schema: TestSchema { - x_nullable: false, - a_nullable: true, - b_nullable: false, - }, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Error, - Expected::Error, - ], - }, - TestGroup { - schema: TestSchema { - x_nullable: false, - a_nullable: false, - b_nullable: false, - }, - expected: [ - Expected::Noop, - Expected::Error, - Expected::Error, - Expected::Error, - ], - }, - ]; - - for test_group in tests { - test_group.run() - } - } } diff --git a/kernel/src/engine/arrow_expression/single_row_transform.rs b/kernel/src/engine/arrow_expression/single_row_transform.rs new file mode 100644 index 000000000..b2dabac8c --- /dev/null +++ b/kernel/src/engine/arrow_expression/single_row_transform.rs @@ -0,0 +1,506 @@ +use std::borrow::Cow; +use std::sync::Arc; + +use arrow_array::{ArrayRef, StructArray}; +use arrow_buffer::NullBuffer; +use arrow_schema::DataType as ArrowDataType; +use arrow_schema::Field as ArrowField; +use itertools::Itertools; +use tracing::debug; + +use crate::expressions::Scalar; +use crate::schema::{ + ArrayType, DataType, MapType, PrimitiveType, SchemaTransform, StructField, StructType, +}; + +/// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values (as +/// Scalar slice) into an arrow Array with a single row of each literal. +#[derive(Debug)] +pub(crate) struct SingleRowTransform<'a, T: Iterator> { + /// Leaf-node values to insert in schema order. + scalars: T, + /// A stack of built arrays. After visiting children, we pop them off to + /// build the parent container, then push the parent back on. + stack: Vec, + /// If an error occurs, it will be stored here. + error: Option, +} + +/// Any error for the single-row array transform. +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// An error performing operations on arrow data + #[error(transparent)] + Arrow(#[from] arrow_schema::ArrowError), + + /// Schema mismatch error + #[error("Schema error: {0}")] + Schema(String), + + /// Insufficient number of scalars (too many) to create a single-row array + #[error("Too many scalars given create a single-row array")] + ExcessScalars, + + /// Insufficient number of scalars (too few) to create a single-row array + #[error("Too few scalars given create a single-row array")] + InsufficientScalars, + + /// Empty array after performing the transform + #[error("No arrays were created after performing the transform")] + EmptyStack, + + /// Unsupported operation + #[error("Unsupported operation: {0}")] + Unsupported(String), +} + +impl<'a, I: Iterator> SingleRowTransform<'a, I> { + pub(crate) fn new(scalars: impl IntoIterator) -> Self { + Self { + scalars: scalars.into_iter(), + stack: Vec::new(), + error: None, + } + } + + /// return the single-row array (or propagate Error). the top of `stack` should be our + /// single-row struct array + pub(crate) fn into_struct_array(mut self) -> Result { + if let Some(e) = self.error { + return Err(e); + } + + if self.scalars.next().is_some() { + return Err(Error::ExcessScalars); + } + + match self.stack.pop() { + Some(array) if self.stack.is_empty() => match array.data_type() { + ArrowDataType::Struct(_) => Ok(array), + _ => Err(Error::Schema("Expected struct array".to_string())), + }, + Some(_) => Err(Error::ExcessScalars), + None => Err(Error::EmptyStack), + } + } + + fn set_error(&mut self, e: Error) { + if let Some(err) = &self.error.replace(e) { + debug!("Overwriting error that was already set: {err}"); + } + } + + // TODO use check_error everywhere + fn check_error>(&mut self, result: Result) -> Option { + match result { + Ok(val) => Some(val), + Err(err) => { + self.set_error(err.into()); + None + } + } + } +} + +impl<'a, T: Iterator> SchemaTransform<'a> for SingleRowTransform<'a, T> { + fn transform_primitive( + &mut self, + prim_type: &'a PrimitiveType, + ) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + + let next = self.scalars.next(); + let scalar = self.check_error(next.ok_or(Error::InsufficientScalars))?; + + let DataType::Primitive(scalar_type) = scalar.data_type() else { + self.set_error(Error::Schema( + "Non-primitive scalar type {datatype} provided".to_string(), + )); + return None; + }; + if scalar_type != *prim_type { + self.set_error(Error::Schema(format!( + "Mismatched scalar type creating a single-row array: expected {}, got {}", + prim_type, scalar_type + ))); + return None; + } + + let arr = self.check_error( + scalar + .to_array(1) + .map_err(|delta_error| Error::Schema(delta_error.to_string())), + )?; + self.stack.push(arr); + + Some(Cow::Borrowed(prim_type)) + } + + fn transform_struct(&mut self, struct_type: &'a StructType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + + // Only consume newly-added entries (if any). There could be fewer than expected if + // the recursion encountered an error. + let mark = self.stack.len(); + self.recurse_into_struct(struct_type)?; + let field_arrays = self.stack.split_off(mark); + + if field_arrays.len() != struct_type.fields_len() { + self.set_error(Error::InsufficientScalars); + return None; + } + + let fields = + self.check_error(struct_type.fields().map(ArrowField::try_from).try_collect())?; + + let mut found_non_nullable_null = false; + let mut all_null = true; + for (f, v) in struct_type.fields().zip(&field_arrays) { + if v.is_valid(0) { + all_null = false; + } else if !f.is_nullable() { + found_non_nullable_null = true; + } + } + + // If all children are NULL and at least one is ostensibly non-nullable, we must interpret + // the struct itself as being NULL or `StructArray::try_new` will fail null checks. + let null_buffer = (all_null && found_non_nullable_null).then(|| NullBuffer::new_null(1)); + let array = self.check_error(StructArray::try_new(fields, field_arrays, null_buffer))?; + + self.stack.push(Arc::new(array)); + Some(Cow::Borrowed(struct_type)) + } + + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + + self.recurse_into_struct_field(field); + Some(Cow::Borrowed(field)) + } + + // arrays unsupported for now + fn transform_array(&mut self, _array_type: &'a ArrayType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.set_error(Error::Unsupported( + "ArrayType not yet supported for creating single-row array".to_string(), + )); + None + } + + // maps unsupported for now + fn transform_map(&mut self, _map_type: &'a MapType) -> Option> { + // first always check error to terminate early if possible + if self.error.is_some() { + return None; + } + self.set_error(Error::Unsupported( + "MapType not yet supported for creating single-row array".to_string(), + )); + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::schema::SchemaRef; + use crate::schema::StructType; + use crate::DataType as DeltaDataTypes; + + use arrow_array::cast::AsArray; + use arrow_array::create_array; + use arrow_schema::{DataType, Field}; + + // helper to take values/schema to pass to `create_one` and assert the result = expected + fn assert_single_row_transform( + values: &[Scalar], + schema: SchemaRef, + expected: Result, + ) { + let mut array_transform = SingleRowTransform::new(values); + let datatype = schema.into(); + let _transformed = array_transform.transform(&datatype); + match expected { + Ok(expected_struct) => { + let array = array_transform.into_struct_array().unwrap(); + let struct_array = array.as_struct_opt().unwrap(); + assert_eq!(struct_array, &expected_struct); + } + Err(()) => { + assert!(array_transform.into_struct_array().is_err()); + } + } + } + + #[derive(Clone, Copy)] + struct TestSchema { + x_nullable: bool, + a_nullable: bool, + b_nullable: bool, + } + struct TestGroup { + schema: TestSchema, + expected: [Expected; 4], + } + enum Expected { + Noop, + NullStruct, + Null, + Error, // TODO we could check the actual error + } + + impl TestGroup { + fn run(self) { + let value_groups = [ + (Some(1), Some(2)), + (None, Some(2)), + (Some(1), None), + (None, None), + ]; + for (i, (a_val, b_val)) in value_groups.into_iter().enumerate() { + let a = match a_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let b = match b_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let values: &[Scalar] = &[a, b]; + let schema = Arc::new(StructType::new([StructField::new( + "x", + DeltaDataTypes::struct_type([ + StructField::new("a", DeltaDataTypes::INTEGER, self.schema.a_nullable), + StructField::new("b", DeltaDataTypes::INTEGER, self.schema.b_nullable), + ]), + self.schema.x_nullable, + )])); + + let struct_fields = vec![ + Field::new("a", DataType::Int32, self.schema.a_nullable), + Field::new("b", DataType::Int32, self.schema.b_nullable), + ]; + + let field_a = Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)); + let field_b = Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)); + let expected = match self.expected[i] { + Expected::Noop => Ok(StructArray::from(vec![( + Arc::new(Field::new( + "x", + DataType::Struct(struct_fields.into()), + self.schema.x_nullable, + )), + Arc::new(StructArray::from(vec![ + (field_a, create_array!(Int32, [a_val]) as ArrayRef), + (field_b, create_array!(Int32, [b_val]) as ArrayRef), + ])) as ArrayRef, + )])), + Expected::Null => Ok(StructArray::new_null( + vec![Field::new( + "x", + DataType::Struct(struct_fields.into()), + self.schema.x_nullable, + )] + .into(), + 1, + )), + Expected::NullStruct => Ok(StructArray::from(vec![( + Arc::new(Field::new( + "x", + DataType::Struct(struct_fields.into()), + self.schema.x_nullable, + )), + Arc::new(StructArray::new_null( + vec![field_a.clone(), field_b.clone()].into(), + 1, + )) as ArrayRef, + )])), + Expected::Error => Err(()), + }; + assert_single_row_transform(values, schema, expected); + } + } + } + + // test cases: x, a, b are nullable (n) or not-null (!). we have 6 interesting nullability + // combinations: + // 1. n { n, n } + // 2. n { n, ! } + // 3. n { !, ! } + // 4. ! { n, n } + // 5. ! { n, ! } + // 6. ! { !, ! } + // + // and for each we want to test the four combinations of values ("a" and "b" just chosen as + // abitrary scalars): + // 1. (a, b) + // 2. (N, b) + // 3. (a, N) + // 4. (N, N) + // + // here's the full list of test cases with expected output: + // + // n { n, n } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + // + // n { n, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL + // + // n { !, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL + // + // ! { n, n } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + // + // ! { n, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> NULL + // + // ! { !, ! } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> NULL + + // A helper macro to map the nullability specification to booleans + macro_rules! bool_from_nullable { + (nullable) => { + true + }; + (not_null) => { + false + }; + } + + // The main macro: for each test case we list a test name, the values for x, a, b, and the expected outcomes. + macro_rules! test_nullability_combinations { + ( + $( + $test_name:ident: { + x: $x:ident, + a: $a:ident, + b: $b:ident, + expected: [$($expected:expr),+ $(,)?] + } + ),+ $(,)? + ) => { + $( + #[test] + fn $test_name() { + let test_group = TestGroup { + schema: TestSchema { + x_nullable: bool_from_nullable!($x), + a_nullable: bool_from_nullable!($a), + b_nullable: bool_from_nullable!($b), + }, + expected: [$($expected),+], + }; + test_group.run(); + } + )+ + }; + } + + test_nullability_combinations! { + // n { n, n } case: all nullable + test_all_nullable: { + x: nullable, + a: nullable, + b: nullable, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Noop, + Expected::Noop, + ] + }, + // n { n, ! } case: x and a are nullable, b is not-null + test_nullable_nullable_not_null: { + x: nullable, + a: nullable, + b: not_null, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Error, + Expected::NullStruct, + ] + }, + // n { !, ! } case: x is nullable; a and b are not-null + test_nullable_not_null_not_null: { + x: nullable, + a: not_null, + b: not_null, + expected: [ + Expected::Noop, + Expected::Error, + Expected::Error, + Expected::NullStruct, + ] + }, + // ! { n, n } case: x is not-null; a and b are nullable + test_not_null_nullable_nullable: { + x: not_null, + a: nullable, + b: nullable, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Noop, + Expected::Noop, + ] + }, + // ! { n, ! } case: x is not-null; a is nullable, b is not-null + test_not_null_nullable_not_null: { + x: not_null, + a: nullable, + b: not_null, + expected: [ + Expected::Noop, + Expected::Noop, + Expected::Error, + Expected::Null, + ] + }, + // ! { !, ! } case: all not-null (for a and b) + test_not_null_not_null_not_null: { + x: not_null, + a: not_null, + b: not_null, + expected: [ + Expected::Noop, + Expected::Error, + Expected::Error, + Expected::Null, + ] + }, + } +} diff --git a/kernel/src/error.rs b/kernel/src/error.rs index 815ef3e51..0f95e188d 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -195,6 +195,11 @@ pub enum Error { /// Invalid checkpoint files #[error("Invalid Checkpoint: {0}")] InvalidCheckpoint(String), + + /// Error while creating a new single-row array + #[cfg(any(feature = "default-engine-base", feature = "sync-engine"))] + #[error(transparent)] + SingleRowTransformError(#[from] crate::engine::arrow_expression::single_row_transform::Error), } // Convenience constructors for Error types that take a String argument From f15f95516908d6a752b9260c67fea0ca91275f1f Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 5 Feb 2025 12:41:45 -0800 Subject: [PATCH 11/13] test cleanup --- .../arrow_expression/single_row_transform.rs | 434 +++++++++--------- 1 file changed, 210 insertions(+), 224 deletions(-) diff --git a/kernel/src/engine/arrow_expression/single_row_transform.rs b/kernel/src/engine/arrow_expression/single_row_transform.rs index b2dabac8c..9065c920e 100644 --- a/kernel/src/engine/arrow_expression/single_row_transform.rs +++ b/kernel/src/engine/arrow_expression/single_row_transform.rs @@ -90,7 +90,6 @@ impl<'a, I: Iterator> SingleRowTransform<'a, I> { } } - // TODO use check_error everywhere fn check_error>(&mut self, result: Result) -> Option { match result { Ok(val) => Some(val), @@ -224,6 +223,7 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::create_array; use arrow_schema::{DataType, Field}; + use paste::paste; // helper to take values/schema to pass to `create_one` and assert the result = expected fn assert_single_row_transform( @@ -252,145 +252,74 @@ mod tests { a_nullable: bool, b_nullable: bool, } - struct TestGroup { - schema: TestSchema, - expected: [Expected; 4], - } + enum Expected { Noop, NullStruct, Null, - Error, // TODO we could check the actual error + Error, // TODO: we could check the actual error } - impl TestGroup { - fn run(self) { - let value_groups = [ - (Some(1), Some(2)), - (None, Some(2)), - (Some(1), None), - (None, None), - ]; - for (i, (a_val, b_val)) in value_groups.into_iter().enumerate() { - let a = match a_val { - Some(v) => Scalar::Integer(v), - None => Scalar::Null(DeltaDataTypes::INTEGER), - }; - let b = match b_val { - Some(v) => Scalar::Integer(v), - None => Scalar::Null(DeltaDataTypes::INTEGER), - }; - let values: &[Scalar] = &[a, b]; - let schema = Arc::new(StructType::new([StructField::new( - "x", - DeltaDataTypes::struct_type([ - StructField::new("a", DeltaDataTypes::INTEGER, self.schema.a_nullable), - StructField::new("b", DeltaDataTypes::INTEGER, self.schema.b_nullable), - ]), - self.schema.x_nullable, - )])); - - let struct_fields = vec![ - Field::new("a", DataType::Int32, self.schema.a_nullable), - Field::new("b", DataType::Int32, self.schema.b_nullable), - ]; - - let field_a = Arc::new(Field::new("a", DataType::Int32, self.schema.a_nullable)); - let field_b = Arc::new(Field::new("b", DataType::Int32, self.schema.b_nullable)); - let expected = match self.expected[i] { - Expected::Noop => Ok(StructArray::from(vec![( - Arc::new(Field::new( - "x", - DataType::Struct(struct_fields.into()), - self.schema.x_nullable, - )), - Arc::new(StructArray::from(vec![ - (field_a, create_array!(Int32, [a_val]) as ArrayRef), - (field_b, create_array!(Int32, [b_val]) as ArrayRef), - ])) as ArrayRef, - )])), - Expected::Null => Ok(StructArray::new_null( - vec![Field::new( - "x", - DataType::Struct(struct_fields.into()), - self.schema.x_nullable, - )] - .into(), - 1, - )), - Expected::NullStruct => Ok(StructArray::from(vec![( - Arc::new(Field::new( - "x", - DataType::Struct(struct_fields.into()), - self.schema.x_nullable, - )), - Arc::new(StructArray::new_null( - vec![field_a.clone(), field_b.clone()].into(), - 1, - )) as ArrayRef, - )])), - Expected::Error => Err(()), - }; - assert_single_row_transform(values, schema, expected); + fn run_test(schema: TestSchema, values: (Option, Option), expected: Expected) { + let (a_val, b_val) = values; + let a = match a_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let b = match b_val { + Some(v) => Scalar::Integer(v), + None => Scalar::Null(DeltaDataTypes::INTEGER), + }; + let values: &[Scalar] = &[a, b]; + + let x_children_fields = vec![ + Field::new("a", DataType::Int32, schema.a_nullable), + Field::new("b", DataType::Int32, schema.b_nullable), + ]; + let x_field = Arc::new(Field::new( + "x", + DataType::Struct(x_children_fields.clone().into()), + schema.x_nullable, + )); + + let arrow_schema = Arc::new(StructType::new([StructField::new( + "x", + DeltaDataTypes::struct_type([ + StructField::new("a", DeltaDataTypes::INTEGER, schema.a_nullable), + StructField::new("b", DeltaDataTypes::INTEGER, schema.b_nullable), + ]), + schema.x_nullable, + )])); + + let field_a = Arc::new(x_children_fields[0].clone()); + let field_b = Arc::new(x_children_fields[1].clone()); + + let expected_result = match expected { + Expected::Noop => { + let nested_struct = Arc::new(StructArray::from(vec![ + (field_a, create_array!(Int32, [a_val]) as ArrayRef), + (field_b, create_array!(Int32, [b_val]) as ArrayRef), + ])) as ArrayRef; + Ok(StructArray::from(vec![(x_field.clone(), nested_struct)])) } - } + Expected::Null => Ok(StructArray::new_null( + vec![x_field.as_ref().clone()].into(), + 1, + )), + Expected::NullStruct => { + let nested_null = Arc::new(StructArray::new_null( + vec![field_a.clone(), field_b.clone()].into(), + 1, + )) as ArrayRef; + Ok(StructArray::from(vec![(x_field, nested_null)])) + } + Expected::Error => Err(()), + }; + + assert_single_row_transform(values, arrow_schema, expected_result); } - // test cases: x, a, b are nullable (n) or not-null (!). we have 6 interesting nullability - // combinations: - // 1. n { n, n } - // 2. n { n, ! } - // 3. n { !, ! } - // 4. ! { n, n } - // 5. ! { n, ! } - // 6. ! { !, ! } - // - // and for each we want to test the four combinations of values ("a" and "b" just chosen as - // abitrary scalars): - // 1. (a, b) - // 2. (N, b) - // 3. (a, N) - // 4. (N, N) - // - // here's the full list of test cases with expected output: - // - // n { n, n } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x (N, N) - // - // n { n, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> Err - // 4. (N, N) -> x NULL - // - // n { !, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> Err - // 3. (a, N) -> Err - // 4. (N, N) -> x NULL - // - // ! { n, n } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> x (a, N) - // 4. (N, N) -> x (N, N) - // - // ! { n, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> x (N, b) - // 3. (a, N) -> Err - // 4. (N, N) -> NULL - // - // ! { !, ! } - // 1. (a, b) -> x (a, b) - // 2. (N, b) -> Err - // 3. (a, N) -> Err - // 4. (N, N) -> NULL - - // A helper macro to map the nullability specification to booleans + // helper to convert nullable/not_null to bool macro_rules! bool_from_nullable { (nullable) => { true @@ -400,107 +329,164 @@ mod tests { }; } - // The main macro: for each test case we list a test name, the values for x, a, b, and the expected outcomes. + // helper to convert a/b/N to Some/Some/None (1 and 2 just arbitrary non-null ints) + macro_rules! parse_value { + (a) => { + Some(1) + }; + (b) => { + Some(2) + }; + (N) => { + None + }; + } + macro_rules! test_nullability_combinations { ( - $( - $test_name:ident: { - x: $x:ident, - a: $a:ident, - b: $b:ident, - expected: [$($expected:expr),+ $(,)?] - } - ),+ $(,)? + name = $name:ident, + schema = { x: $x:ident, a: $a:ident, b: $b:ident }, + tests = { + ($ta1:tt, $tb1:tt) -> $expected1:ident, + ($ta2:tt, $tb2:tt) -> $expected2:ident, + ($ta3:tt, $tb3:tt) -> $expected3:ident, + ($ta4:tt, $tb4:tt) -> $expected4:ident $(,)? + } ) => { - $( + paste! { #[test] - fn $test_name() { - let test_group = TestGroup { - schema: TestSchema { - x_nullable: bool_from_nullable!($x), - a_nullable: bool_from_nullable!($a), - b_nullable: bool_from_nullable!($b), - }, - expected: [$($expected),+], + fn [<$name _ $ta1:lower _ $tb1:lower>]() { + let schema = TestSchema { + x_nullable: bool_from_nullable!($x), + a_nullable: bool_from_nullable!($a), + b_nullable: bool_from_nullable!($b), }; - test_group.run(); + run_test(schema, (parse_value!($ta1), parse_value!($tb1)), Expected::$expected1); } - )+ - }; + #[test] + fn [<$name _ $ta2:lower _ $tb2:lower>]() { + let schema = TestSchema { + x_nullable: bool_from_nullable!($x), + a_nullable: bool_from_nullable!($a), + b_nullable: bool_from_nullable!($b), + }; + run_test(schema, (parse_value!($ta2), parse_value!($tb2)), Expected::$expected2); + } + #[test] + fn [<$name _ $ta3:lower _ $tb3:lower>]() { + let schema = TestSchema { + x_nullable: bool_from_nullable!($x), + a_nullable: bool_from_nullable!($a), + b_nullable: bool_from_nullable!($b), + }; + run_test(schema, (parse_value!($ta3), parse_value!($tb3)), Expected::$expected3); + } + #[test] + fn [<$name _ $ta4:lower _ $tb4:lower>]() { + let schema = TestSchema { + x_nullable: bool_from_nullable!($x), + a_nullable: bool_from_nullable!($a), + b_nullable: bool_from_nullable!($b), + }; + run_test(schema, (parse_value!($ta4), parse_value!($tb4)), Expected::$expected4); + } + } + } + } + + // Group 1: nullable { nullable, nullable } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + test_nullability_combinations! { + name = test_all_nullable, + schema = { x: nullable, a: nullable, b: nullable }, + tests = { + (a, b) -> Noop, + (N, b) -> Noop, + (a, N) -> Noop, + (N, N) -> Noop, + } + } + + // Group 2: nullable { nullable, not_null } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL + test_nullability_combinations! { + name = test_nullable_nullable_not_null, + schema = { x: nullable, a: nullable, b: not_null }, + tests = { + (a, b) -> Noop, + (N, b) -> Noop, + (a, N) -> Error, + (N, N) -> NullStruct, + } } + // Group 3: nullable { not_null, not_null } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> x NULL test_nullability_combinations! { - // n { n, n } case: all nullable - test_all_nullable: { - x: nullable, - a: nullable, - b: nullable, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Noop, - Expected::Noop, - ] - }, - // n { n, ! } case: x and a are nullable, b is not-null - test_nullable_nullable_not_null: { - x: nullable, - a: nullable, - b: not_null, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Error, - Expected::NullStruct, - ] - }, - // n { !, ! } case: x is nullable; a and b are not-null - test_nullable_not_null_not_null: { - x: nullable, - a: not_null, - b: not_null, - expected: [ - Expected::Noop, - Expected::Error, - Expected::Error, - Expected::NullStruct, - ] - }, - // ! { n, n } case: x is not-null; a and b are nullable - test_not_null_nullable_nullable: { - x: not_null, - a: nullable, - b: nullable, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Noop, - Expected::Noop, - ] - }, - // ! { n, ! } case: x is not-null; a is nullable, b is not-null - test_not_null_nullable_not_null: { - x: not_null, - a: nullable, - b: not_null, - expected: [ - Expected::Noop, - Expected::Noop, - Expected::Error, - Expected::Null, - ] - }, - // ! { !, ! } case: all not-null (for a and b) - test_not_null_not_null_not_null: { - x: not_null, - a: not_null, - b: not_null, - expected: [ - Expected::Noop, - Expected::Error, - Expected::Error, - Expected::Null, - ] - }, + name = test_nullable_not_null_not_null, + schema = { x: nullable, a: not_null, b: not_null }, + tests = { + (a, b) -> Noop, + (N, b) -> Error, + (a, N) -> Error, + (N, N) -> NullStruct, + } + } + + // Group 4: not_null { nullable, nullable } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> x (a, N) + // 4. (N, N) -> x (N, N) + test_nullability_combinations! { + name = test_not_null_nullable_nullable, + schema = { x: not_null, a: nullable, b: nullable }, + tests = { + (a, b) -> Noop, + (N, b) -> Noop, + (a, N) -> Noop, + (N, N) -> Noop, + } + } + + // Group 5: not_null { nullable, not_null } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> x (N, b) + // 3. (a, N) -> Err + // 4. (N, N) -> NULL + test_nullability_combinations! { + name = test_not_null_nullable_not_null, + schema = { x: not_null, a: nullable, b: not_null }, + tests = { + (a, b) -> Noop, + (N, b) -> Noop, + (a, N) -> Error, + (N, N) -> Null, + } + } + + // Group 6: not_null { not_null, not_null } + // 1. (a, b) -> x (a, b) + // 2. (N, b) -> Err + // 3. (a, N) -> Err + // 4. (N, N) -> NULL + test_nullability_combinations! { + name = test_all_not_null, + schema = { x: not_null, a: not_null, b: not_null }, + tests = { + (a, b) -> Noop, + (N, b) -> Error, + (a, N) -> Error, + (N, N) -> Null, + } } } From 321f592ed7608303186baba62a062220f426cc11 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 5 Feb 2025 12:47:28 -0800 Subject: [PATCH 12/13] clippy --- kernel/src/engine/arrow_expression.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 58755c3c8..1abd7663b 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -939,7 +939,7 @@ mod tests { let expected_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); let expected = RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); - assert_create_one(values, schema, expected.into()); + assert_create_one(values, schema, expected); } #[test] @@ -977,7 +977,7 @@ mod tests { ]))], ) .unwrap(); - assert_create_one(values, schema, expected.into()); + assert_create_one(values, schema, expected); } #[test] @@ -1015,7 +1015,7 @@ mod tests { ]))], ) .unwrap(); - assert_create_one(values, schema, expected.into()); + assert_create_one(values, schema, expected); } // critical case: struct(x) [nullable] with (a [non-null], b [nullable]) fields. @@ -1058,7 +1058,7 @@ mod tests { ], ) .unwrap(); - assert_create_one(values, schema.clone(), expected.into()); + assert_create_one(values, schema.clone(), expected); let err_values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; let handler = ArrowExpressionHandler; @@ -1122,7 +1122,7 @@ mod tests { ], ) .unwrap(); - assert_create_one(values, schema.clone(), expected.into()); + assert_create_one(values, schema.clone(), expected); } #[test] From 54d739565ccbf388b40c76fa4354ac3dbdae6365 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 5 Feb 2025 18:35:20 -0800 Subject: [PATCH 13/13] test cleanup --- kernel/src/engine/arrow_expression.rs | 192 ++---------------- .../arrow_expression/single_row_transform.rs | 120 +++++++++++ 2 files changed, 141 insertions(+), 171 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 1abd7663b..9d8d460b6 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -592,7 +592,7 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator { mod tests { use std::ops::{Add, Div, Mul, Sub}; - use arrow_array::{create_array, record_batch, GenericStringArray, Int32Array}; + use arrow_array::{create_array, GenericStringArray, Int32Array}; use arrow_buffer::ScalarBuffer; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -915,30 +915,26 @@ mod tests { let schema = Arc::new(StructType::new([ StructField::nullable("a", DeltaDataTypes::INTEGER), StructField::nullable("b", DeltaDataTypes::STRING), - StructField::nullable("c", DeltaDataTypes::INTEGER), + StructField::not_null("c", DeltaDataTypes::INTEGER), StructField::nullable("d", DeltaDataTypes::INTEGER), ])); - let expected = record_batch!( - ("a", Int32, [1]), - ("b", Utf8, ["B"]), - ("c", Int32, [3]), - ("d", Int32, [None]) + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, true), + ])); + let expected = RecordBatch::try_new( + expected_schema, + vec![ + create_array!(Int32, [1]), + create_array!(Utf8, ["B"]), + create_array!(Int32, [3]), + create_array!(Int32, [None]), + ], ) .unwrap(); - assert_create_one(values, schema, expected.into()); - } - - #[test] - fn test_create_one_non_null() { - let values: &[Scalar] = &[1.into()]; - let schema = Arc::new(StructType::new([StructField::not_null( - "a", - DeltaDataTypes::INTEGER, - )])); - let expected_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); - let expected = - RecordBatch::try_new(expected_schema, vec![create_array!(Int32, [1])]).unwrap(); assert_create_one(values, schema, expected); } @@ -1018,113 +1014,6 @@ mod tests { assert_create_one(values, schema, expected); } - // critical case: struct(x) [nullable] with (a [non-null], b [nullable]) fields. - // for scalars (null, null) the _only_ solution here is struct(x) is null struct (not two null - // leaves) - // - // vs. if a and b are both nullable, we just leave them as null and the struct is non-null - // - // thus, we trigger the 'nullable struct' case only if it is required. - #[test] - fn test_create_one_null_struct() { - let values: &[Scalar] = &[ - 1.into(), // need an int here so everything doesn't collapse into only one null struct - Scalar::Null(DeltaDataTypes::INTEGER), - Scalar::Null(DeltaDataTypes::INTEGER), - ]; - let schema = Arc::new(StructType::new([ - StructField::nullable("a", DeltaDataTypes::INTEGER), - StructField::nullable( - "x", - DeltaDataTypes::struct_type([ - StructField::not_null("b", DeltaDataTypes::INTEGER), - StructField::nullable("c", DeltaDataTypes::INTEGER), - ]), - ), - ])); - let struct_fields = vec![ - Field::new("b", DataType::Int32, false), - Field::new("c", DataType::Int32, true), - ]; - let expected_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("x", DataType::Struct(struct_fields.clone().into()), true), - ])); - let expected = RecordBatch::try_new( - expected_schema, - vec![ - Arc::new(Int32Array::from(vec![1])), - Arc::new(StructArray::new_null(struct_fields.into(), 1)), - ], - ) - .unwrap(); - assert_create_one(values, schema.clone(), expected); - - let err_values: &[Scalar] = &[Scalar::Null(DeltaDataTypes::INTEGER), 1.into()]; - let handler = ArrowExpressionHandler; - assert!(handler.create_one(schema, err_values).is_err()); // FIXME - } - - #[test] - fn test_create_one_many_structs() { - let values: &[Scalar] = &[1.into(), 2.into(), 3.into(), 4.into()]; - let schema = Arc::new(StructType::new([ - StructField::nullable( - "x", - DeltaDataTypes::struct_type([ - StructField::not_null("a", DeltaDataTypes::INTEGER), - StructField::nullable("b", DeltaDataTypes::INTEGER), - ]), - ), - StructField::nullable( - "y", - DeltaDataTypes::struct_type([ - StructField::not_null("c", DeltaDataTypes::INTEGER), - StructField::nullable("d", DeltaDataTypes::INTEGER), - ]), - ), - ])); - let struct_fields_1 = vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, true), - ]; - let struct_fields_2 = vec![ - Field::new("c", DataType::Int32, false), - Field::new("d", DataType::Int32, true), - ]; - let expected_schema = Arc::new(Schema::new(vec![ - Field::new("x", DataType::Struct(struct_fields_1.clone().into()), true), - Field::new("y", DataType::Struct(struct_fields_2.clone().into()), true), - ])); - let expected = RecordBatch::try_new( - expected_schema, - vec![ - Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("a", DataType::Int32, false)), - create_array!(Int32, [1]) as ArrayRef, - ), - ( - Arc::new(Field::new("b", DataType::Int32, true)), - create_array!(Int32, [2]) as ArrayRef, - ), - ])), - Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("c", DataType::Int32, false)), - create_array!(Int32, [3]) as ArrayRef, - ), - ( - Arc::new(Field::new("d", DataType::Int32, true)), - create_array!(Int32, [4]) as ArrayRef, - ), - ])), - ], - ) - .unwrap(); - assert_create_one(values, schema.clone(), expected); - } - #[test] fn test_create_one_not_null_struct() { let values: &[Scalar] = &[ @@ -1139,19 +1028,7 @@ mod tests { ]), )])); let handler = ArrowExpressionHandler; - assert!(handler.create_one(schema, values).is_err()); // FIXME - } - - #[test] - fn test_create_one_incorrect_schema() { - let values = &["a".into()]; - let schema = Arc::new(StructType::new([StructField::nullable( - "col_1", - DeltaDataTypes::INTEGER, - )])); - - let handler = ArrowExpressionHandler; - matches!(handler.create_one(schema, values), Err(Error::Generic(_))); + assert!(handler.create_one(schema, values).is_err()); } #[test] @@ -1163,36 +1040,9 @@ mod tests { "col_1", DeltaDataTypes::INTEGER, )])); - matches!(handler.create_one(schema, values), Err(Error::Arrow(_))); - - let schema = Arc::new(StructType::new([StructField::nullable( - "col_1", - DeltaDataTypes::INTEGER, - )])); - matches!(handler.create_one(schema, values), Err(Error::Arrow(_))); - } - - #[test] - fn test_create_one_missing_values() { - let values = &[1.into()]; - let schema = Arc::new(StructType::new([ - StructField::nullable("col_1", DeltaDataTypes::INTEGER), - StructField::nullable("col_2", DeltaDataTypes::INTEGER), - ])); - - let handler = ArrowExpressionHandler; - matches!(handler.create_one(schema, values), Err(Error::Generic(_))); - } - - #[test] - fn test_create_one_extra_values() { - let values = &[1.into(), 2.into(), 3.into()]; - let schema = Arc::new(StructType::new([ - StructField::nullable("col_1", DeltaDataTypes::INTEGER), - StructField::nullable("col_2", DeltaDataTypes::INTEGER), - ])); - - let handler = ArrowExpressionHandler; - assert!(handler.create_one(schema, values).is_err()); + assert!(matches!( + handler.create_one(schema, values), + Err(Error::Generic(_)) + )); } } diff --git a/kernel/src/engine/arrow_expression/single_row_transform.rs b/kernel/src/engine/arrow_expression/single_row_transform.rs index 9065c920e..1f7ea6624 100644 --- a/kernel/src/engine/arrow_expression/single_row_transform.rs +++ b/kernel/src/engine/arrow_expression/single_row_transform.rs @@ -246,6 +246,126 @@ mod tests { } } + #[test] + fn test_create_one_top_level_null() { + let values = &[Scalar::Null(DeltaDataTypes::INTEGER)]; + + let schema = Arc::new(StructType::new([StructField::not_null( + "col_1", + DeltaDataTypes::INTEGER, + )])); + let expected = + StructArray::new_null(vec![Field::new("col_1", DataType::Int32, false)].into(), 1); + assert_single_row_transform(values, schema, Ok(expected)); + + let schema = Arc::new(StructType::new([StructField::nullable( + "col_1", + DeltaDataTypes::INTEGER, + )])); + let expected = StructArray::from(vec![( + Arc::new(Field::new("col_1", DataType::Int32, true)), + create_array!(Int32, [None]) as ArrayRef, + )]); + assert_single_row_transform(values, schema, Ok(expected)); + } + + #[test] + fn test_create_one_missing_values() { + let values = &[1.into()]; + let schema = Arc::new(StructType::new([ + StructField::nullable("col_1", DeltaDataTypes::INTEGER), + StructField::nullable("col_2", DeltaDataTypes::INTEGER), + ])); + assert_single_row_transform(values, schema, Err(())); + } + + #[test] + fn test_create_one_extra_values() { + let values = &[1.into(), 2.into(), 3.into()]; + let schema = Arc::new(StructType::new([ + StructField::nullable("col_1", DeltaDataTypes::INTEGER), + StructField::nullable("col_2", DeltaDataTypes::INTEGER), + ])); + assert_single_row_transform(values, schema, Err(())); + } + + #[test] + fn test_create_one_incorrect_schema() { + let values = &["a".into()]; + let schema = Arc::new(StructType::new([StructField::nullable( + "col_1", + DeltaDataTypes::INTEGER, + )])); + assert_single_row_transform(values, schema, Err(())); + } + + // useful test to make sure that we correctly process the stack + #[test] + fn test_many_structs() { + let values: &[Scalar] = &[1.into(), 2.into(), 3.into(), 4.into()]; + let schema = Arc::new(StructType::new([ + StructField::nullable( + "x", + DeltaDataTypes::struct_type([ + StructField::not_null("a", DeltaDataTypes::INTEGER), + StructField::nullable("b", DeltaDataTypes::INTEGER), + ]), + ), + StructField::nullable( + "y", + DeltaDataTypes::struct_type([ + StructField::not_null("c", DeltaDataTypes::INTEGER), + StructField::nullable("d", DeltaDataTypes::INTEGER), + ]), + ), + ])); + let struct_fields_1 = vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, true), + ]; + let struct_fields_2 = vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Int32, true), + ]; + let expected = StructArray::from(vec![ + ( + Arc::new(Field::new( + "x", + DataType::Struct(struct_fields_1.clone().into()), + true, + )), + Arc::new(StructArray::from(vec![ + ( + Arc::new(struct_fields_1[0].clone()), + create_array!(Int32, [1]) as ArrayRef, + ), + ( + Arc::new(struct_fields_1[1].clone()), + create_array!(Int32, [2]) as ArrayRef, + ), + ])) as ArrayRef, + ), + ( + Arc::new(Field::new( + "y", + DataType::Struct(struct_fields_2.clone().into()), + true, + )), + Arc::new(StructArray::from(vec![ + ( + Arc::new(struct_fields_2[0].clone()), + create_array!(Int32, [3]) as ArrayRef, + ), + ( + Arc::new(struct_fields_2[1].clone()), + create_array!(Int32, [4]) as ArrayRef, + ), + ])) as ArrayRef, + ), + ]); + assert_single_row_transform(values, schema, Ok(expected)); + } + #[derive(Clone, Copy)] struct TestSchema { x_nullable: bool,