Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support StringView for binary operators #12212

Merged
merged 4 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,26 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
}
}

/// Picks the common string type for the two operands of a regular-expression
/// comparison.
///
/// Rules, in priority order (both sides must be `Utf8`, `LargeUtf8` or
/// `Utf8View`, otherwise `None` is returned):
/// 1. If either side is `Utf8View`, the result is `Utf8`.
/// 2. Otherwise, if either side is `LargeUtf8`, the result is `LargeUtf8`.
/// 3. Otherwise (both sides are `Utf8`), the result is `Utf8`.
///
/// This helper is expected to be deprecated once binary operators support
/// `Utf8View` natively; use `string_coercion` instead at that point.
fn regex_comparison_string_coercion(
    lhs_type: &DataType,
    rhs_type: &DataType,
) -> Option<DataType> {
    use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};

    // Anything that is not one of the three string encodings is not handled here.
    let is_string = |ty: &DataType| matches!(ty, Utf8 | LargeUtf8 | Utf8View);
    if !is_string(lhs_type) || !is_string(rhs_type) {
        return None;
    }

    let coerced = if matches!(lhs_type, Utf8View) || matches!(rhs_type, Utf8View) {
        // A view on either side is coerced to plain Utf8.
        Utf8
    } else if matches!(lhs_type, LargeUtf8) || matches!(rhs_type, LargeUtf8) {
        // No views involved: the wider offset type wins.
        LargeUtf8
    } else {
        // Only (Utf8, Utf8) remains.
        Utf8
    };
    Some(coerced)
}

fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Expand Down Expand Up @@ -1072,10 +1092,10 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
}
}

/// coercion rules for regular expression comparison operations.
/// Coercion rules for regular expression comparison operations.
/// This is a union of string coercion rules and dictionary coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
string_coercion(lhs_type, rhs_type)
regex_comparison_string_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
}
Expand Down
6 changes: 5 additions & 1 deletion datafusion/functions/src/regex/regexpmatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

//! Regx expressions
//! Regex expressions
use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
use arrow::compute::kernels::regexp;
use arrow::datatypes::DataType;
Expand Down Expand Up @@ -49,6 +49,10 @@ impl RegexpMatchFunc {
Self {
signature: Signature::one_of(
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`.
// If that fails, it proceeds to `(LargeUtf8, Utf8)`.
// TODO: Add native Utf8View support for regexp_match.
Exact(vec![Utf8, Utf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8]),
Expand Down
60 changes: 54 additions & 6 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,12 @@ macro_rules! boolean_op {
macro_rules! binary_string_array_flag_op {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
match $LEFT.data_type() {
DataType::Utf8 => {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
}
},
DataType::LargeUtf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
}
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op operation '{}' on string array",
other, stringify!($OP)
Expand Down Expand Up @@ -186,12 +186,12 @@ macro_rules! compute_utf8_flag_op {
macro_rules! binary_string_array_flag_op_scalar {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8 => {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
}
},
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
}
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
Expand Down Expand Up @@ -933,6 +933,54 @@ mod tests {
DataType::Boolean,
[true, false],
);
test_coercion!(
StringViewArray,
DataType::Utf8View,
vec!["abc"; 5],
StringArray,
DataType::Utf8,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexMatch,
BooleanArray,
DataType::Boolean,
[true, false, true, false, false],
);
test_coercion!(
StringViewArray,
DataType::Utf8View,
vec!["abc"; 5],
StringArray,
DataType::Utf8,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexIMatch,
BooleanArray,
DataType::Boolean,
[true, true, true, true, false],
);
test_coercion!(
StringArray,
DataType::Utf8,
vec!["abc"; 5],
StringViewArray,
DataType::Utf8View,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexNotMatch,
BooleanArray,
DataType::Boolean,
[false, true, false, true, true],
);
test_coercion!(
StringArray,
DataType::Utf8,
vec!["abc"; 5],
StringViewArray,
DataType::Utf8View,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexNotIMatch,
BooleanArray,
DataType::Boolean,
[false, false, false, false, true],
);
test_coercion!(
StringArray,
DataType::Utf8,
Expand Down
46 changes: 42 additions & 4 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1040,7 +1040,6 @@ Rap (empty) Raph
NULL NULL NULL

## Ensure no casts for RPAD
## TODO file ticket
query TT
EXPLAIN SELECT
RPAD(column1_utf8view, 1) as c1,
Expand Down Expand Up @@ -1070,7 +1069,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SPLIT_PART
## TODO file ticket
query TT
EXPLAIN SELECT
SPLIT_PART(column1_utf8view, 'f', 1) as c1,
Expand All @@ -1082,7 +1080,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for STRPOS
## TODO file ticket
query TT
EXPLAIN SELECT
STRPOS(column1_utf8view, 'f') as c,
Expand All @@ -1094,7 +1091,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SUBSTR
## TODO file ticket
query TT
EXPLAIN SELECT
SUBSTR(column1_utf8view, 1) as c,
Expand Down Expand Up @@ -1225,6 +1221,48 @@ XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng
RaphaelRaphael RaphaelRaphael RaphaelRaphael
NULL NULL NULL

## Ensure no casts for binary operators
## TODO: the plans below still cast Utf8View to Utf8; see https://github.com/apache/datafusion/issues/12180
# `~` operator (regex match)
query TT
EXPLAIN SELECT
tlm365 marked this conversation as resolved.
Show resolved Hide resolved
column1_utf8view ~ 'foo' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%foo%") AS c1
02)--TableScan: test projection=[column1_utf8view]

# `~*` operator (regex match case-insensitive)
query TT
EXPLAIN SELECT
column1_utf8view ~* 'foo' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) ILIKE Utf8("%foo%") AS c1
02)--TableScan: test projection=[column1_utf8view]

# `!~~` operator (not like match)
query TT
EXPLAIN SELECT
column1_utf8view !~~ 'an' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("an") AS c1
02)--TableScan: test projection=[column1_utf8view]

# `!~~*` operator (not like match case-insensitive)
query TT
EXPLAIN SELECT
column1_utf8view !~~* 'an' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("an") AS c1
02)--TableScan: test projection=[column1_utf8view]

statement ok
drop table test;

Expand Down