From 8b4a8e6b157c007e7988f715cb4b693578438f8b Mon Sep 17 00:00:00 2001 From: YjyJeff <1731939194@qq.com> Date: Thu, 14 Mar 2024 14:22:42 +0800 Subject: [PATCH] support unnest as subexpression (#9592) * support unnest as subexpression and multiple unnests in projection * update alias unnest * remove unnest multiple columns * rename mulitple unnest columns to multiple unnest functions in selection * fix: move column name to if branch --------- Co-authored-by: jefffffyang --- datafusion/expr/src/expr_schema.rs | 20 +++++- datafusion/sql/src/select.rs | 52 ++++++++++---- datafusion/sqllogictest/test_files/unnest.slt | 67 ++++++++++++++++++- 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 70ffa5064a52..1d83fbe8c0e0 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,7 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, ExprSchema, Result, + internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFField, + ExprSchema, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -113,7 +114,22 @@ impl ExprSchemable for Expr { .iter() .map(|e| e.get_type(schema)) .collect::>>()?; - Ok(arg_data_types[0].clone()) + let arg_data_type = arg_data_types[0].clone(); + // Unnest's output type is the inner type of the list + match arg_data_type{ + DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) =>{ + Ok(field.data_type().clone()) + } + DataType::Struct(_) => { + not_impl_err!("unnest() does not support struct yet") + } + DataType::Null => { + not_impl_err!("unnest() does not support null yet") + } + _ => { + plan_err!("unnest() can only be applied to array, struct and null") + } + } } Expr::ScalarFunction(ScalarFunction { func_def, args }) => { let arg_data_types = args diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c5c80bd6acf0..1bfd60a8ce1a 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -24,6 +24,7 @@ use crate::utils::{ resolve_columns, resolve_positions_to_exprs, }; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; use datafusion_common::{Column, UnnestOptions}; use datafusion_expr::expr::{Alias, Unnest}; @@ -276,25 +277,51 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(plan) } - // Try converting Expr::Unnest to LogicalPlan::Unnest if possible, otherwise do the final projection + /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection pub(super) fn try_process_unnest( &self, input: LogicalPlan, select_exprs: Vec, ) -> Result { let mut unnest_columns = vec![]; - // Map unnest expressions to their argument - let projection_exprs = select_exprs + let mut inner_projection_exprs = vec![]; + + let outer_projection_exprs = select_exprs .into_iter() .map(|expr| { - if let Expr::Unnest(Unnest { ref exprs }) = expr { - let column_name = expr.display_name()?; - unnest_columns.push(column_name.clone()); - // Add alias for the argument expression, to avoid naming conflicts with other expressions - // in the select list. For example: `select unnest(col1), col1 from t`. - Ok(exprs[0].clone().alias(column_name)) + let Transformed { + data: transformed_expr, + transformed, + tnr: _, + } = expr.transform_up_mut(&mut |expr: Expr| { + if let Expr::Unnest(Unnest { ref exprs }) = expr { + let column_name = expr.display_name()?; + unnest_columns.push(column_name.clone()); + // Add alias for the argument expression, to avoid naming conflicts with other expressions + // in the select list. For example: `select unnest(col1), col1 from t`. + inner_projection_exprs + .push(exprs[0].clone().alias(column_name.clone())); + Ok(Transformed::yes(Expr::Column(Column::from_name( + column_name, + )))) + } else { + Ok(Transformed::no(expr)) + } + })?; + + if !transformed { + if matches!(&transformed_expr, Expr::Column(_)) { + inner_projection_exprs.push(transformed_expr.clone()); + Ok(transformed_expr) + } else { + // We need to evaluate the expr in the inner projection, + // outer projection just select its name + let column_name = transformed_expr.display_name()?; + inner_projection_exprs.push(transformed_expr); + Ok(Expr::Column(Column::from_name(column_name))) + } } else { - Ok(expr) + Ok(transformed_expr) } }) .collect::>>()?; @@ -302,7 +329,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Do the final projection if unnest_columns.is_empty() { LogicalPlanBuilder::from(input) - .project(projection_exprs)? + .project(inner_projection_exprs)? .build() } else { if unnest_columns.len() > 1 { @@ -312,8 +339,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL let unnest_options = UnnestOptions::new().with_preserve_nulls(false); LogicalPlanBuilder::from(input) - .project(projection_exprs)? + .project(inner_projection_exprs)? .unnest_column_with_options(unnest_column, unnest_options)? + .project(outer_projection_exprs)? .build() } } diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index f60f715242cf..5c178bb392b1 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -135,11 +135,10 @@ select array_remove(column1, 4), unnest(column2), column3 * 10 from unnest_table query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(column3) from unnest_table; -## Unnest multiple columns +## Multiple unnest functions in selection query error DataFusion error: This feature is not implemented: Only support single unnest expression for now select unnest(column1), unnest(column2) from unnest_table; - ## Unnest scalar in select list query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(1); @@ -254,5 +253,69 @@ select * from unnest([1,2,(select sum(column3) from unnest_table)]); 2 10 +## Unnest is the sub-expression of other expression +query II +select unnest(column1) as a, column3 from unnest_table; +---- +1 1 +2 1 +3 1 +4 2 +5 2 +6 3 +12 NULL + +query BI +select unnest(column1) is not null, column3 from unnest_table; +---- +true 1 +true 1 +true 1 +true 2 +true 2 +true 3 +true NULL + +query II +select -unnest(column1) as a, column3 from unnest_table; +---- +-1 1 +-2 1 +-3 1 +-4 2 +-5 2 +-6 3 +-12 NULL + +query II +select unnest(array_remove(column1, 3)) as a, column3 from unnest_table; +---- +1 1 +2 1 +4 2 +5 2 +6 3 +12 NULL + +query II +select unnest(array_remove(column1, 3)) as c1, column3 from unnest_table order by c1 desc, column3; +---- +12 NULL +6 3 +5 2 +4 2 +2 1 +1 1 + +query II +select unnest(array_remove(column1, 3)) - 1 as c1, column3 from unnest_table; +---- +0 1 +1 1 +3 2 +4 2 +5 3 +11 NULL + statement ok drop table unnest_table;