diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index fe3679099480..87d81abc4b15 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -19,10 +19,8 @@ use std::ops::Not; +use super::inlist_simplifier::{InListSimplifier, ShortenInListSimplifier}; use super::utils::*; -use super::{ - inlist_simplifier::InListSimplifier, or_in_list_simplifier::OrInListSimplifier, -}; use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; @@ -44,10 +42,7 @@ use datafusion_expr::{ and, lit, or, BinaryExpr, BuiltinScalarFunction, Case, ColumnarValue, Expr, Like, ScalarFunctionDefinition, Volatility, }; -use datafusion_expr::{ - expr::{InList, InSubquery, ScalarFunction}, - interval_arithmetic::NullableInterval, -}; +use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps}; /// This structure handles API for expression simplification @@ -137,7 +132,7 @@ impl ExprSimplifier { pub fn simplify(&self, expr: Expr) -> Result { let mut simplifier = Simplifier::new(&self.info); let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?; - let mut or_in_list_simplifier = OrInListSimplifier::new(); + let mut shorten_in_list_simplifier = ShortenInListSimplifier::new(); let mut inlist_simplifier = InListSimplifier::new(); let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees); @@ -153,8 +148,8 @@ impl ExprSimplifier { // https://github.com/apache/arrow-datafusion/issues/1160 expr.rewrite(&mut const_evaluator)? .rewrite(&mut simplifier)? - .rewrite(&mut or_in_list_simplifier)? .rewrite(&mut inlist_simplifier)? 
+ .rewrite(&mut shorten_in_list_simplifier)? .rewrite(&mut guarantee_rewriter)? // run both passes twice to try an minimize simplifications that we missed .rewrite(&mut const_evaluator)? @@ -609,91 +604,6 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { None => lit_bool_null(), } } - // expr IN () --> false - // expr NOT IN () --> true - Expr::InList(InList { - expr, - list, - negated, - }) if list.is_empty() && *expr != Expr::Literal(ScalarValue::Null) => { - lit(negated) - } - - // null in (x, y, z) --> null - // null not in (x, y, z) --> null - Expr::InList(InList { - expr, - list: _, - negated: _, - }) if is_null(&expr) => lit_bool_null(), - - // expr IN ((subquery)) -> expr IN (subquery), see ##5529 - Expr::InList(InList { - expr, - mut list, - negated, - }) if list.len() == 1 - && matches!(list.first(), Some(Expr::ScalarSubquery { .. })) => - { - let Expr::ScalarSubquery(subquery) = list.remove(0) else { - unreachable!() - }; - Expr::InSubquery(InSubquery::new(expr, subquery, negated)) - } - - // if expr is a single column reference: - // expr IN (A, B, ...) --> (expr = A) OR (expr = B) OR (expr = C) - Expr::InList(InList { - expr, - list, - negated, - }) if !list.is_empty() - && ( - // For lists with only 1 value we allow more complex expressions to be simplified - // e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1' - // for more than one we avoid repeating this potentially expensive - // expressions - list.len() == 1 - || list.len() <= THRESHOLD_INLINE_INLIST - && expr.try_into_col().is_ok() - ) => - { - let first_val = list[0].clone(); - if negated { - list.into_iter().skip(1).fold( - (*expr.clone()).not_eq(first_val), - |acc, y| { - // Note that `A and B and C and D` is a left-deep tree structure - // as such we want to maintain this structure as much as possible - // to avoid reordering the expression during each optimization - // pass. 
- // - // Left-deep tree structure for `A and B and C and D`: - // ``` - // & - // / \ - // & D - // / \ - // & C - // / \ - // A B - // ``` - // - // The code below maintain the left-deep tree structure. - acc.and((*expr.clone()).not_eq(y)) - }, - ) - } else { - list.into_iter().skip(1).fold( - (*expr.clone()).eq(first_val), - |acc, y| { - // Same reasoning as above - acc.or((*expr.clone()).eq(y)) - }, - ) - } - } - // // Rules for NotEq // diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index e9ce2734636c..710c24f66e33 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -17,29 +17,89 @@ //! This module implements a rule that simplifies the values for `InList`s +use std::borrow::Cow; use std::collections::HashSet; use datafusion_common::tree_node::TreeNodeRewriter; -use datafusion_common::Result; -use datafusion_expr::expr::InList; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::expr::{InList, InSubquery}; use datafusion_expr::{lit, BinaryExpr, Expr, Operator}; -/// Simplify expressions that is guaranteed to be true or false to a literal boolean expression -/// -/// Rules: -/// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists -/// Intersection: -/// 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false` -/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)` -/// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)` -/// Union: -/// 4. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` -/// # This rule is handled by `or_in_list_simplifier.rs` -/// 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` -/// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression -/// 6. 
`a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false` -/// 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5` -/// 8. `a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)` +use super::utils::{is_null, lit_bool_null}; +use super::THRESHOLD_INLINE_INLIST; + +pub(super) struct ShortenInListSimplifier {} + +impl ShortenInListSimplifier { + pub(super) fn new() -> Self { + Self {} + } +} + +impl TreeNodeRewriter for ShortenInListSimplifier { + type N = Expr; + + fn mutate(&mut self, expr: Expr) -> Result { + // if expr is a single column reference: + // expr IN (A, B, ...) --> (expr = A) OR (expr = B) OR (expr = C) + if let Expr::InList(InList { + expr, + list, + negated, + }) = expr.clone() + { + if !list.is_empty() + && ( + // For lists with only 1 value we allow more complex expressions to be simplified + // e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1' + // for more than one we avoid repeating this potentially expensive + // expressions + list.len() == 1 + || list.len() <= THRESHOLD_INLINE_INLIST + && expr.try_into_col().is_ok() + ) + { + let first_val = list[0].clone(); + if negated { + return Ok(list.into_iter().skip(1).fold( + (*expr.clone()).not_eq(first_val), + |acc, y| { + // Note that `A and B and C and D` is a left-deep tree structure + // as such we want to maintain this structure as much as possible + // to avoid reordering the expression during each optimization + // pass. + // + // Left-deep tree structure for `A and B and C and D`: + // ``` + // & + // / \ + // & D + // / \ + // & C + // / \ + // A B + // ``` + // + // The code below maintain the left-deep tree structure. 
+ acc.and((*expr.clone()).not_eq(y)) + }, + )); + } else { + return Ok(list.into_iter().skip(1).fold( + (*expr.clone()).eq(first_val), + |acc, y| { + // Same reasoning as above + acc.or((*expr.clone()).eq(y)) + }, + )); + } + } + } + + Ok(expr) + } +} + pub(super) struct InListSimplifier {} impl InListSimplifier { @@ -52,45 +112,142 @@ impl TreeNodeRewriter for InListSimplifier { type N = Expr; fn mutate(&mut self, expr: Expr) -> Result { - if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr { + if let Expr::InList(InList { + expr, + mut list, + negated, + }) = expr.clone() + { + // expr IN () --> false + // expr NOT IN () --> true + if list.is_empty() && *expr != Expr::Literal(ScalarValue::Null) { + return Ok(lit(negated)); + // null in (x, y, z) --> null + // null not in (x, y, z) --> null + } else if is_null(&expr) { + return Ok(lit_bool_null()); + // expr IN ((subquery)) -> expr IN (subquery), see ##5529 + } else if list.len() == 1 + && matches!(list.first(), Some(Expr::ScalarSubquery { .. })) + { + let Expr::ScalarSubquery(subquery) = list.remove(0) else { + unreachable!() + }; + return Ok(Expr::InSubquery(InSubquery::new(expr, subquery, negated))); + } + } + // Combine multiple OR expressions into a single IN list expression if possible + // + // i.e. 
`a = 1 OR a = 2 OR a = 3` -> `a IN (1, 2, 3)` + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { + if *op == Operator::Or { + let left = as_inlist(left); + let right = as_inlist(right); + if let (Some(lhs), Some(rhs)) = (left, right) { + if lhs.expr.try_into_col().is_ok() + && rhs.expr.try_into_col().is_ok() + && lhs.expr == rhs.expr + && !lhs.negated + && !rhs.negated + { + let lhs = lhs.into_owned(); + let rhs = rhs.into_owned(); + let mut seen: HashSet = HashSet::new(); + let list = lhs + .list + .into_iter() + .chain(rhs.list) + .filter(|e| seen.insert(e.to_owned())) + .collect::>(); + + let merged_inlist = InList { + expr: lhs.expr, + list, + negated: false, + }; + return Ok(Expr::InList(merged_inlist)); + } + } + } + } + // Simplify expressions that are guaranteed to be true or false to a literal boolean expression + // + // Rules: + // If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists + // Intersection: + // 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false` + // 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)` + // 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)` + // Union: + // 4. `a not in (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)` + // # This rule is handled by the `OR` merge into a single `IN` list just above + // 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)` + // If one of the expressions is `IN` and the other is `NOT IN`, then we apply set difference (except) to the `IN` list + // 6. `a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false` + // 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5` + // 8. 
`a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)` + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr.clone() { match (*left, op, *right) { (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && !l2.negated => { - inlist_intersection(l1, l2, false) + return inlist_intersection(l1, l2, false); } (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => { - inlist_union(l1, l2, true) + return inlist_union(l1, l2, true); } (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && l2.negated => { - inlist_except(l1, l2) + return inlist_except(l1, l2); } (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && !l2.negated => { - inlist_except(l2, l1) + return inlist_except(l2, l1); } (Expr::InList(l1), Operator::Or, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => { - inlist_intersection(l1, l2, true) + return inlist_intersection(l1, l2, true); } (left, op, right) => { // put the expression back together - Ok(Expr::BinaryExpr(BinaryExpr { + return Ok(Expr::BinaryExpr(BinaryExpr { left: Box::new(left), op, right: Box::new(right), - })) + })); } } - } else { - Ok(expr) } + + Ok(expr) + } +} + +/// Try to convert an expression to an in-list expression +fn as_inlist(expr: &Expr) -> Option> { + match expr { + Expr::InList(inlist) => Some(Cow::Borrowed(inlist)), + Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::Eq => { + match (left.as_ref(), right.as_ref()) { + (Expr::Column(_), Expr::Literal(_)) => Some(Cow::Owned(InList { + expr: left.clone(), + list: vec![*right.clone()], + negated: false, + })), + (Expr::Literal(_), Expr::Column(_)) => Some(Cow::Owned(InList { + expr: right.clone(), + list: vec![*left.clone()], + negated: false, + })), + _ => None, + } + } + _ => None, } } diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs 
b/datafusion/optimizer/src/simplify_expressions/mod.rs index 44ba5b3e3b84..a03dd767e911 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -19,7 +19,6 @@ pub mod context; pub mod expr_simplifier; mod guarantees; mod inlist_simplifier; -mod or_in_list_simplifier; mod regex; pub mod simplify_exprs; mod utils; diff --git a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs deleted file mode 100644 index fd5c9ecaf82c..000000000000 --- a/datafusion/optimizer/src/simplify_expressions/or_in_list_simplifier.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module implements a rule that simplifies OR expressions into IN list expressions - -use std::borrow::Cow; -use std::collections::HashSet; - -use datafusion_common::tree_node::TreeNodeRewriter; -use datafusion_common::Result; -use datafusion_expr::expr::InList; -use datafusion_expr::{BinaryExpr, Expr, Operator}; - -/// Combine multiple OR expressions into a single IN list expression if possible -/// -/// i.e. 
`a = 1 OR a = 2 OR a = 3` -> `a IN (1, 2, 3)` -pub(super) struct OrInListSimplifier {} - -impl OrInListSimplifier { - pub(super) fn new() -> Self { - Self {} - } -} - -impl TreeNodeRewriter for OrInListSimplifier { - type N = Expr; - - fn mutate(&mut self, expr: Expr) -> Result { - if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { - if *op == Operator::Or { - let left = as_inlist(left); - let right = as_inlist(right); - if let (Some(lhs), Some(rhs)) = (left, right) { - if lhs.expr.try_into_col().is_ok() - && rhs.expr.try_into_col().is_ok() - && lhs.expr == rhs.expr - && !lhs.negated - && !rhs.negated - { - let lhs = lhs.into_owned(); - let rhs = rhs.into_owned(); - let mut seen: HashSet = HashSet::new(); - let list = lhs - .list - .into_iter() - .chain(rhs.list) - .filter(|e| seen.insert(e.to_owned())) - .collect::>(); - - let merged_inlist = InList { - expr: lhs.expr, - list, - negated: false, - }; - return Ok(Expr::InList(merged_inlist)); - } - } - } - } - - Ok(expr) - } -} - -/// Try to convert an expression to an in-list expression -fn as_inlist(expr: &Expr) -> Option> { - match expr { - Expr::InList(inlist) => Some(Cow::Borrowed(inlist)), - Expr::BinaryExpr(BinaryExpr { left, op, right }) if *op == Operator::Eq => { - match (left.as_ref(), right.as_ref()) { - (Expr::Column(_), Expr::Literal(_)) => Some(Cow::Owned(InList { - expr: left.clone(), - list: vec![*right.clone()], - negated: false, - })), - (Expr::Literal(_), Expr::Column(_)) => Some(Cow::Owned(InList { - expr: right.clone(), - list: vec![*left.clone()], - negated: false, - })), - _ => None, - } - } - _ => None, - } -}