From 30858355a6fbf73d667a9fc80b45878daebee31d Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 11:59:05 +0100 Subject: [PATCH] Fix incorrect `... LIKE '%'` simplification `expr LIKE '%'` was previously simplified to `true`, but the expression returns `NULL` when `expr` is null. The conversion was conditional on `!is_null(expr)` which means "is not always true, i.e. is not a null literal". This commit adds correct simplification logic. It additionally expands the rule coverage to include string view (Utf8View) and large string (LargeUtf8). This allows writing shared test cases even despite `utf8_view LIKE '%'` returning incorrect results at execution time (tracked by https://github.com/apache/datafusion/issues/12637). I.e. the simplification masks the bug for cases where pattern is statically known. --- .../simplify_expressions/expr_simplifier.rs | 63 ++++++++++++++----- .../sqllogictest/test_files/string/string.slt | 8 +++ .../test_files/string/string_query.slt.part | 52 +++++++++++++++ 3 files changed, 109 insertions(+), 14 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 40be1f85391d..fbab39fb3f3a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { negated, escape_char: _, case_insensitive: _, - }) if !is_null(&expr) - && matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) => + }) if matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" + ) => { - Transformed::yes(lit(!negated)) + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + Transformed::yes(Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(expr))), + when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], + else_expr: None, + })) } // a is not null/unknown --> true (if a is not nullable) @@ -2777,10 +2792,22 @@ mod tests { assert_no_change(regex_match(col("c1"), lit("f_o"))); // empty cases - assert_change(regex_match(col("c1"), lit("")), lit(true)); - assert_change(regex_not_match(col("c1"), lit("")), lit(false)); - assert_change(regex_imatch(col("c1"), lit("")), lit(true)); - assert_change(regex_not_imatch(col("c1"), lit("")), lit(false)); + assert_change( + regex_match(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_match(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); + assert_change( + regex_imatch(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_imatch(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); // single character assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); @@ -3608,16 +3635,16 @@ mod tests { fn test_like_and_ilke() { // test non-null values let expr = like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); let expr = ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); // test null values let null = lit(ScalarValue::Utf8(None)); @@ -4043,4 +4070,12 @@ mod tests { ); } } + + fn if_not_null(expr: Expr, then: bool) -> Expr { + Expr::Case(Case { + expr: Some(expr.is_not_null().into()), + when_then_expr: vec![(lit(true).into(), lit(then).into())], + else_expr: None, + }) + } } diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 9e97712b6871..68111ba34696 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,6 +34,14 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; +query BBB +SELECT + NULL LIKE '%', + '' LIKE '%', + 'a' LIKE '%' +---- +NULL true true + # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12637 # Test pattern with wildcard characters diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index c4975b5b8c8d..57fb09bca9e4 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -873,6 +873,58 @@ NULL NULL NULL NULL NULL #Raphael datafusionДатаФусион false false false false #NULL NULL NULL NULL NULL NULL +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 LIKE '%' AS ascii_1_like_percent, + unicode_1 LIKE '%' AS unicode_1_like_percent + -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 true true +Xiangpeng datafusion数据融合 true true +Raphael datafusionДатаФусион true true +under_score un iść core true true +percent pan Tadeusz ma iść w kąt true true +(empty) (empty) true true +NULL NULL NULL NULL +NULL NULL NULL NULL + +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent, + unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent + -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 false false +Xiangpeng datafusion数据融合 false false +Raphael datafusionДатаФусион false false +under_score un iść core false false +percent pan Tadeusz ma iść w kąt false false +(empty) (empty) false false +NULL NULL NULL NULL +NULL NULL NULL NULL + +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%' +---- +Andrew +Xiangpeng +Raphael +under_score +percent +(empty) + +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see detail: https://github.com/apache/datafusion/issues/12637 +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%' +---- + # Test pattern without wildcard characters query TTBBBB select ascii_1, unicode_1,