Skip to content

Commit

Permalink
move InList related simplify to one place (#9037)
Browse files Browse the repository at this point in the history
* move InList related simplify to one place

* remove dead code

* delete dead code

* reduce function call

* fix err

* code fmt
  • Loading branch information
guojidan authored Feb 4, 2024
1 parent f5302ef commit 5f18aa7
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 223 deletions.
98 changes: 4 additions & 94 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@
use std::ops::Not;

use super::inlist_simplifier::{InListSimplifier, ShortenInListSimplifier};
use super::utils::*;
use super::{
inlist_simplifier::InListSimplifier, or_in_list_simplifier::OrInListSimplifier,
};
use crate::analyzer::type_coercion::TypeCoercionRewriter;
use crate::simplify_expressions::guarantees::GuaranteeRewriter;
use crate::simplify_expressions::regex::simplify_regex_expr;
Expand All @@ -44,10 +42,7 @@ use datafusion_expr::{
and, lit, or, BinaryExpr, BuiltinScalarFunction, Case, ColumnarValue, Expr, Like,
ScalarFunctionDefinition, Volatility,
};
use datafusion_expr::{
expr::{InList, InSubquery, ScalarFunction},
interval_arithmetic::NullableInterval,
};
use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval};
use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps};

/// This structure handles API for expression simplification
Expand Down Expand Up @@ -137,7 +132,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
pub fn simplify(&self, expr: Expr) -> Result<Expr> {
let mut simplifier = Simplifier::new(&self.info);
let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?;
let mut or_in_list_simplifier = OrInListSimplifier::new();
let mut shorten_in_list_simplifier = ShortenInListSimplifier::new();
let mut inlist_simplifier = InListSimplifier::new();
let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees);

Expand All @@ -153,8 +148,8 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
// https://github.com/apache/arrow-datafusion/issues/1160
expr.rewrite(&mut const_evaluator)?
.rewrite(&mut simplifier)?
.rewrite(&mut or_in_list_simplifier)?
.rewrite(&mut inlist_simplifier)?
.rewrite(&mut shorten_in_list_simplifier)?
.rewrite(&mut guarantee_rewriter)?
// run both passes twice to try and minimize simplifications that we missed
.rewrite(&mut const_evaluator)?
Expand Down Expand Up @@ -609,91 +604,6 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
None => lit_bool_null(),
}
}
// expr IN () --> false
// expr NOT IN () --> true
Expr::InList(InList {
expr,
list,
negated,
}) if list.is_empty() && *expr != Expr::Literal(ScalarValue::Null) => {
lit(negated)
}

// null in (x, y, z) --> null
// null not in (x, y, z) --> null
Expr::InList(InList {
expr,
list: _,
negated: _,
}) if is_null(&expr) => lit_bool_null(),

// expr IN ((subquery)) -> expr IN (subquery), see ##5529
Expr::InList(InList {
expr,
mut list,
negated,
}) if list.len() == 1
&& matches!(list.first(), Some(Expr::ScalarSubquery { .. })) =>
{
let Expr::ScalarSubquery(subquery) = list.remove(0) else {
unreachable!()
};
Expr::InSubquery(InSubquery::new(expr, subquery, negated))
}

// if expr is a single column reference:
// expr IN (A, B, ...) --> (expr = A) OR (expr = B) OR (expr = C)
Expr::InList(InList {
expr,
list,
negated,
}) if !list.is_empty()
&& (
// For lists with only 1 value we allow more complex expressions to be simplified
// e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1'
// for more than one we avoid repeating this potentially expensive
// expressions
list.len() == 1
|| list.len() <= THRESHOLD_INLINE_INLIST
&& expr.try_into_col().is_ok()
) =>
{
let first_val = list[0].clone();
if negated {
list.into_iter().skip(1).fold(
(*expr.clone()).not_eq(first_val),
|acc, y| {
// Note that `A and B and C and D` is a left-deep tree structure
// as such we want to maintain this structure as much as possible
// to avoid reordering the expression during each optimization
// pass.
//
// Left-deep tree structure for `A and B and C and D`:
// ```
// &
// / \
// & D
// / \
// & C
// / \
// A B
// ```
//
// The code below maintain the left-deep tree structure.
acc.and((*expr.clone()).not_eq(y))
},
)
} else {
list.into_iter().skip(1).fold(
(*expr.clone()).eq(first_val),
|acc, y| {
// Same reasoning as above
acc.or((*expr.clone()).eq(y))
},
)
}
}
//
// Rules for NotEq
//

Expand Down
213 changes: 185 additions & 28 deletions datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,89 @@

//! This module implements a rule that simplifies the values for `InList`s
use std::borrow::Cow;
use std::collections::HashSet;

use datafusion_common::tree_node::TreeNodeRewriter;
use datafusion_common::Result;
use datafusion_expr::expr::InList;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::expr::{InList, InSubquery};
use datafusion_expr::{lit, BinaryExpr, Expr, Operator};

/// Simplify expressions that is guaranteed to be true or false to a literal boolean expression
///
/// Rules:
/// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists
/// Intersection:
/// 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false`
/// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)`
/// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)`
/// Union:
/// 4. `a not int (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)`
/// # This rule is handled by `or_in_list_simplifier.rs`
/// 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)`
/// If one of the expressions is `IN` and another one is `NOT IN`, then we apply exception on `In` expression
/// 6. `a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false`
/// 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5`
/// 8. `a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)`
use super::utils::{is_null, lit_bool_null};
use super::THRESHOLD_INLINE_INLIST;

/// Expression rewriter that expands a short `IN` / `NOT IN` list into a chain
/// of `=` / `!=` comparisons. The rewrite itself lives in this type's
/// `TreeNodeRewriter` implementation; the struct carries no state.
pub(super) struct ShortenInListSimplifier {}

impl ShortenInListSimplifier {
/// Creates a new rewriter. The struct has no fields, so there is nothing to configure.
pub(super) fn new() -> Self {
Self {}
}
}

impl TreeNodeRewriter for ShortenInListSimplifier {
    type N = Expr;

    /// Expands a short `IN` / `NOT IN` list into a chain of comparisons.
    ///
    /// * `expr IN (A, B, ...)` becomes `(expr = A) OR (expr = B) OR ...`
    /// * `expr NOT IN (A, B, ...)` becomes `(expr != A) AND (expr != B) AND ...`
    ///
    /// The rewrite applies only when the list is non-empty and either holds a
    /// single value, or holds at most `THRESHOLD_INLINE_INLIST` values while
    /// `expr.try_into_col()` succeeds. Every other expression is handed back
    /// unchanged.
    ///
    /// The incoming expression is matched by value, so nodes that do not
    /// qualify pass straight through without being cloned.
    fn mutate(&mut self, expr: Expr) -> Result<Expr> {
        match expr {
            Expr::InList(InList {
                expr: needle,
                list,
                negated,
            }) if !list.is_empty()
                && (
                    // For lists with only 1 value we allow more complex expressions to be
                    // simplified, e.g. SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1'.
                    // For more than one value we avoid repeating a potentially expensive
                    // expression.
                    list.len() == 1
                        || list.len() <= THRESHOLD_INLINE_INLIST
                            && needle.try_into_col().is_ok()
                ) =>
            {
                let needle = *needle;
                // Builds one comparison of the tested expression against a list value.
                let compare = |value: Expr| {
                    if negated {
                        needle.clone().not_eq(value)
                    } else {
                        needle.clone().eq(value)
                    }
                };

                let mut values = list.into_iter();
                let Some(first) = values.next() else {
                    unreachable!("the match guard rejects empty lists")
                };

                // `A and B and C and D` is a left-deep tree; folding with the
                // accumulator on the left keeps that shape, which avoids
                // reordering the expression during each optimization pass:
                //
                // ```
                //        &
                //       / \
                //      &   D
                //     / \
                //    &   C
                //   / \
                //  A   B
                // ```
                //
                // The same reasoning applies to the `or` chain.
                Ok(values.fold(compare(first), |acc, value| {
                    let comparison = compare(value);
                    if negated {
                        acc.and(comparison)
                    } else {
                        acc.or(comparison)
                    }
                }))
            }
            other => Ok(other),
        }
    }
}

pub(super) struct InListSimplifier {}

impl InListSimplifier {
Expand All @@ -52,45 +112,142 @@ impl TreeNodeRewriter for InListSimplifier {
type N = Expr;

fn mutate(&mut self, expr: Expr) -> Result<Expr> {
if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr {
if let Expr::InList(InList {
expr,
mut list,
negated,
}) = expr.clone()
{
// expr IN () --> false
// expr NOT IN () --> true
if list.is_empty() && *expr != Expr::Literal(ScalarValue::Null) {
return Ok(lit(negated));
// null in (x, y, z) --> null
// null not in (x, y, z) --> null
} else if is_null(&expr) {
return Ok(lit_bool_null());
// expr IN ((subquery)) -> expr IN (subquery), see ##5529
} else if list.len() == 1
&& matches!(list.first(), Some(Expr::ScalarSubquery { .. }))
{
let Expr::ScalarSubquery(subquery) = list.remove(0) else {
unreachable!()
};
return Ok(Expr::InSubquery(InSubquery::new(expr, subquery, negated)));
}
}
// Combine multiple OR expressions into a single IN list expression if possible
//
// i.e. `a = 1 OR a = 2 OR a = 3` -> `a IN (1, 2, 3)`
if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr {
if *op == Operator::Or {
let left = as_inlist(left);
let right = as_inlist(right);
if let (Some(lhs), Some(rhs)) = (left, right) {
if lhs.expr.try_into_col().is_ok()
&& rhs.expr.try_into_col().is_ok()
&& lhs.expr == rhs.expr
&& !lhs.negated
&& !rhs.negated
{
let lhs = lhs.into_owned();
let rhs = rhs.into_owned();
let mut seen: HashSet<Expr> = HashSet::new();
let list = lhs
.list
.into_iter()
.chain(rhs.list)
.filter(|e| seen.insert(e.to_owned()))
.collect::<Vec<_>>();

let merged_inlist = InList {
expr: lhs.expr,
list,
negated: false,
};
return Ok(Expr::InList(merged_inlist));
}
}
}
}
// Simplify expressions that are guaranteed to be true or false to a literal boolean expression
//
// Rules:
// If both expressions are `IN` or `NOT IN`, then we can apply intersection or union on both lists
// Intersection:
// 1. `a in (1,2,3) AND a in (4,5) -> a in (), which is false`
// 2. `a in (1,2,3) AND a in (2,3,4) -> a in (2,3)`
// 3. `a not in (1,2,3) OR a not in (3,4,5,6) -> a not in (3)`
// Union:
// 4. `a not in (1,2,3) AND a not in (4,5,6) -> a not in (1,2,3,4,5,6)`
// # This rule is handled by the `OR`-combining step earlier in this function
// 5. `a in (1,2,3) OR a in (4,5,6) -> a in (1,2,3,4,5,6)`
// If one of the expressions is `IN` and the other one is `NOT IN`, then we apply set difference (except) to the `IN` list
// 6. `a in (1,2,3,4) AND a not in (1,2,3,4,5) -> a in (), which is false`
// 7. `a not in (1,2,3,4) AND a in (1,2,3,4,5) -> a = 5`
// 8. `a in (1,2,3,4) AND a not in (5,6,7,8) -> a in (1,2,3,4)`
if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr.clone() {
match (*left, op, *right) {
(Expr::InList(l1), Operator::And, Expr::InList(l2))
if l1.expr == l2.expr && !l1.negated && !l2.negated =>
{
inlist_intersection(l1, l2, false)
return inlist_intersection(l1, l2, false);
}
(Expr::InList(l1), Operator::And, Expr::InList(l2))
if l1.expr == l2.expr && l1.negated && l2.negated =>
{
inlist_union(l1, l2, true)
return inlist_union(l1, l2, true);
}
(Expr::InList(l1), Operator::And, Expr::InList(l2))
if l1.expr == l2.expr && !l1.negated && l2.negated =>
{
inlist_except(l1, l2)
return inlist_except(l1, l2);
}
(Expr::InList(l1), Operator::And, Expr::InList(l2))
if l1.expr == l2.expr && l1.negated && !l2.negated =>
{
inlist_except(l2, l1)
return inlist_except(l2, l1);
}
(Expr::InList(l1), Operator::Or, Expr::InList(l2))
if l1.expr == l2.expr && l1.negated && l2.negated =>
{
inlist_intersection(l1, l2, true)
return inlist_intersection(l1, l2, true);
}
(left, op, right) => {
// put the expression back together
Ok(Expr::BinaryExpr(BinaryExpr {
return Ok(Expr::BinaryExpr(BinaryExpr {
left: Box::new(left),
op,
right: Box::new(right),
}))
}));
}
}
} else {
Ok(expr)
}

Ok(expr)
}
}

/// Try to convert an expression to an in-list expression
/// Views `expr` as an `InList`, if it has a compatible shape.
///
/// * An existing `Expr::InList` is returned borrowed.
/// * `column = literal` or `literal = column` is returned as a freshly built,
///   non-negated, single-element `InList` over the column.
/// * Anything else yields `None`.
fn as_inlist(expr: &Expr) -> Option<Cow<InList>> {
    if let Expr::InList(existing) = expr {
        return Some(Cow::Borrowed(existing));
    }

    let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr else {
        return None;
    };
    if *op != Operator::Eq {
        return None;
    }

    // Work out which side is the column and which is the literal; any other
    // combination is not convertible.
    let (column, literal) = match (left.as_ref(), right.as_ref()) {
        (Expr::Column(_), Expr::Literal(_)) => (left, right),
        (Expr::Literal(_), Expr::Column(_)) => (right, left),
        _ => return None,
    };

    Some(Cow::Owned(InList {
        expr: column.clone(),
        list: vec![(**literal).clone()],
        negated: false,
    }))
}

Expand Down
Loading

0 comments on commit 5f18aa7

Please sign in to comment.