From 399b7b4d389148ec53588d95288e3ddd0e318c22 Mon Sep 17 00:00:00 2001 From: Lordworms Date: Fri, 21 Jun 2024 16:34:48 -0700 Subject: [PATCH 1/3] Improve LIKE performance for Dictionary arrays --- .../optimizer/src/analyzer/type_coercion.rs | 5 ++- .../physical-expr/src/expressions/like.rs | 10 +++++- datafusion/sqllogictest/test_files/regexp.slt | 32 +++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 0c8e4ae34a90..64572acecdae 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -265,7 +265,10 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { "There isn't a common type to coerce {left_type} and {right_type} in {op_name} expression" ) })?; - let expr = Box::new(expr.cast_to(&coerced_type, self.schema)?); + let expr = match left_type { + DataType::Dictionary(_, Utf8) => expr, + _ => Box::new(expr.cast_to(&coerced_type, self.schema)?), + }; let pattern = Box::new(pattern.cast_to(&coerced_type, self.schema)?); Ok(Transformed::yes(Expr::Like(Like::new( negated, diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index eec347db8ed8..d18651c641fd 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -148,6 +148,14 @@ impl PartialEq for LikeExpr { } } +/// used for optimize Dictionary like +fn can_like_type(from_type: &DataType) -> bool { + match from_type { + DataType::Dictionary(_, inner_type_from) => **inner_type_from == DataType::Utf8, + _ => false, + } +} + /// Create a like expression, erroring if the argument types are not compatible. pub fn like( negated: bool, @@ -158,7 +166,7 @@ pub fn like( ) -> Result> { let expr_type = &expr.data_type(input_schema)?; let pattern_type = &pattern.data_type(input_schema)?; - if !expr_type.eq(pattern_type) { + if !expr_type.eq(pattern_type) && !can_like_type(expr_type) { return internal_err!( "The type of {expr_type} AND {pattern_type} of like physical should be same" ); diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index a45ce3718bc4..604a7cd283ad 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -322,3 +322,35 @@ true statement ok drop table t; + +statement ok +create or replace table strings as values + ('FooBar'), + ('Foo'), + ('Foo'), + ('Bar'), + ('FooBar'), + ('Bar'), + ('Baz'); + +statement ok +create or replace table dict_table as +select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1 +from strings; + +query TT +explain select column1 from dict_table where column1 LIKE '%oo%'; +---- +logical_plan +01)Filter: dict_table.column1 LIKE Utf8("%oo%") +02)--TableScan: dict_table projection=[column1] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: column1@0 LIKE %oo% +03)----MemoryExec: partitions=1, partition_sizes=[1] + +statement ok +drop table strings + +statement ok +drop table dict_table From 2c1394d1dccf2a2407061dd5a38d28b1c9ded5cf Mon Sep 17 00:00:00 2001 From: Lordworms Date: Fri, 21 Jun 2024 19:08:06 -0700 Subject: [PATCH 2/3] fix clippy --- datafusion/optimizer/src/analyzer/type_coercion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 64572acecdae..a7ce7376977b 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -266,7 +266,7 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { ) })?; let expr = match left_type { - DataType::Dictionary(_, Utf8) => expr, + DataType::Dictionary(_, inner) if *inner == DataType::Utf8 => expr, _ => Box::new(expr.cast_to(&coerced_type, self.schema)?), }; let pattern = Box::new(pattern.cast_to(&coerced_type, self.schema)?); From c5668b646b764e156a183fc3894773350746ac6c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 22 Jun 2024 07:44:14 -0400 Subject: [PATCH 3/3] Add a few more tests --- datafusion/sqllogictest/test_files/regexp.slt | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 604a7cd283ad..fed7ac31712c 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -338,6 +338,38 @@ create or replace table dict_table as select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1 from strings; +query ? +select column1 from dict_table where column1 LIKE '%oo%'; +---- +FooBar +Foo +Foo +FooBar + +query ? +select column1 from dict_table where column1 NOT LIKE '%oo%'; +---- +Bar +Bar +Baz + +query ? +select column1 from dict_table where column1 ILIKE '%oO%'; +---- +FooBar +Foo +Foo +FooBar + +query ? +select column1 from dict_table where column1 NOT ILIKE '%oO%'; +---- +Bar +Bar +Baz + + +# plan should not cast the column, instead it should use the dictionary directly query TT explain select column1 from dict_table where column1 LIKE '%oo%'; ---- @@ -349,6 +381,23 @@ physical_plan 02)--FilterExec: column1@0 LIKE %oo% 03)----MemoryExec: partitions=1, partition_sizes=[1] +# Ensure casting / coercion works for all operators +# (there should be no casts to Utf8) +query TT +explain select + column1 LIKE '%oo%', + column1 NOT LIKE '%oo%', + column1 ILIKE '%oo%', + column1 NOT ILIKE '%oo%' +from dict_table; +---- +logical_plan +01)Projection: dict_table.column1 LIKE Utf8("%oo%"), dict_table.column1 NOT LIKE Utf8("%oo%"), dict_table.column1 ILIKE Utf8("%oo%"), dict_table.column1 NOT ILIKE Utf8("%oo%") +02)--TableScan: dict_table projection=[column1] +physical_plan +01)ProjectionExec: expr=[column1@0 LIKE %oo% as dict_table.column1 LIKE Utf8("%oo%"), column1@0 NOT LIKE %oo% as dict_table.column1 NOT LIKE Utf8("%oo%"), column1@0 ILIKE %oo% as dict_table.column1 ILIKE Utf8("%oo%"), column1@0 NOT ILIKE %oo% as dict_table.column1 NOT ILIKE Utf8("%oo%")] +02)--MemoryExec: partitions=1, partition_sizes=[1] + statement ok drop table strings