diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 40be1f85391d..fbab39fb3f3a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { negated, escape_char: _, case_insensitive: _, - }) if !is_null(&expr) - && matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) => + }) if matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" + ) => { - Transformed::yes(lit(!negated)) + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + Transformed::yes(Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(expr))), + when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], + else_expr: None, + })) } // a is not null/unknown --> true (if a is not nullable) @@ -2777,10 +2792,22 @@ mod tests { assert_no_change(regex_match(col("c1"), lit("f_o"))); // empty cases - assert_change(regex_match(col("c1"), lit("")), lit(true)); - assert_change(regex_not_match(col("c1"), lit("")), lit(false)); - assert_change(regex_imatch(col("c1"), lit("")), lit(true)); - assert_change(regex_not_imatch(col("c1"), lit("")), lit(false)); + assert_change( + regex_match(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_match(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); + assert_change( + regex_imatch(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_imatch(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); // single character assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); @@ -3608,16 +3635,16 @@ mod tests { fn test_like_and_ilke() { // test non-null values let expr = like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); let expr = ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); // test null values let null = lit(ScalarValue::Utf8(None)); @@ -4043,4 +4070,12 @@ mod tests { ); } } + + fn if_not_null(expr: Expr, then: bool) -> Expr { + Expr::Case(Case { + expr: Some(expr.is_not_null().into()), + when_then_expr: vec![(lit(true).into(), lit(then).into())], + else_expr: None, + }) + } } diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 9e97712b6871..68111ba34696 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,6 +34,14 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; +query BBB +SELECT + NULL LIKE '%', + '' LIKE '%', + 'a' LIKE '%' +---- +NULL true true + # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12637 # Test pattern with wildcard characters diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index c4975b5b8c8d..57fb09bca9e4 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -873,6 +873,58 @@ NULL NULL NULL NULL NULL #Raphael datafusionДатаФусион false false false false #NULL NULL NULL NULL NULL NULL +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 LIKE '%' AS ascii_1_like_percent, + unicode_1 LIKE '%' AS unicode_1_like_percent + -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 true true +Xiangpeng datafusion数据融合 true true +Raphael datafusionДатаФусион true true +under_score un iść core true true +percent pan Tadeusz ma iść w kąt true true +(empty) (empty) true true +NULL NULL NULL NULL +NULL NULL NULL NULL + +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent, + unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent + -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 false false +Xiangpeng datafusion数据融合 false false +Raphael datafusionДатаФусион false false +under_score un iść core false false +percent pan Tadeusz ma iść w kąt false false +(empty) (empty) false false +NULL NULL NULL NULL +NULL NULL NULL NULL + +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%' +---- +Andrew +Xiangpeng +Raphael +under_score +percent +(empty) + +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see detail: https://github.com/apache/datafusion/issues/12637 +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%' +---- + # Test pattern without wildcard characters query TTBBBB select ascii_1, unicode_1,