From af5c79452df6e4dc61f62db3686366d65ded42af Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Wed, 28 Feb 2024 20:35:42 -0600 Subject: [PATCH] port regexp_like function and port related tests --- datafusion/expr/src/built_in_function.rs | 22 -- datafusion/expr/src/expr_fn.rs | 7 - datafusion/functions/src/regex/mod.rs | 10 +- datafusion/functions/src/regex/regexplike.rs | 249 ++++++++++++++++++ datafusion/functions/src/regex/regexpmatch.rs | 69 +++++ datafusion/physical-expr/src/functions.rs | 155 +++++------ .../physical-expr/src/regex_expressions.rs | 130 --------- datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 3 - datafusion/proto/src/generated/prost.rs | 4 +- .../proto/src/logical_plan/from_proto.rs | 19 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - datafusion/sqllogictest/test_files/regexp.slt | 4 + 13 files changed, 404 insertions(+), 271 deletions(-) create mode 100644 datafusion/functions/src/regex/regexplike.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index b7f089846a110..9bd58bc4c8ef8 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -226,8 +226,6 @@ pub enum BuiltinScalarFunction { OctetLength, /// random Random, - /// regexp_like - RegexpLike, /// regexp_match /// regexp_replace RegexpReplace, @@ -441,7 +439,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::MD5 => Volatility::Immutable, BuiltinScalarFunction::OctetLength => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::RegexpLike => Volatility::Immutable, BuiltinScalarFunction::RegexpReplace => Volatility::Immutable, BuiltinScalarFunction::Repeat => Volatility::Immutable, BuiltinScalarFunction::Replace => Volatility::Immutable, @@ -794,15 +791,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Upper => { utf8_to_str_type(&input_expr_types[0], "upper") } - 
BuiltinScalarFunction::RegexpLike => Ok(match &input_expr_types[0] { - LargeUtf8 | Utf8 => Boolean, - Null => Null, - other => { - return plan_err!( - "The regexp_like function can only accept strings. Got {other}" - ); - } - }), BuiltinScalarFunction::Factorial | BuiltinScalarFunction::Gcd @@ -1224,15 +1212,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => { Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility()) } - BuiltinScalarFunction::RegexpLike => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), - ], - self.volatility(), - ), BuiltinScalarFunction::RegexpReplace => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Utf8]), @@ -1472,7 +1451,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::FindInSet => &["find_in_set"], // regex functions - BuiltinScalarFunction::RegexpLike => &["regexp_like"], BuiltinScalarFunction::RegexpReplace => &["regexp_replace"], // time/date functions diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 63f3af8868bbe..503f576412f65 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -842,11 +842,6 @@ nary_scalar_expr!( rpad, "fill up a string to the length by appending the characters" ); -nary_scalar_expr!( - RegexpLike, - regexp_like, - "matches a regular expression against a string and returns true or false if there was at least one match or not" -); nary_scalar_expr!( RegexpReplace, regexp_replace, @@ -1365,8 +1360,6 @@ mod test { test_scalar_expr!(Ltrim, ltrim, string); test_scalar_expr!(MD5, md5, string); test_scalar_expr!(OctetLength, octet_length, string); - test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern); - test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern, flags); test_nary_scalar_expr!( RegexpReplace, regexp_replace, diff --git 
a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs index 862e8b77a2d62..8d6449839e9e6 100644 --- a/datafusion/functions/src/regex/mod.rs +++ b/datafusion/functions/src/regex/mod.rs @@ -17,13 +17,17 @@ //! "regx" DataFusion functions +mod regexplike; mod regexpmatch; // create UDFs make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match); - +make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like); export_functions!(( regexp_match, - input_arg1 - input_arg2, + input_arg1 input_arg2, "returns a list of regular expression matches in a string. " +),( + regexp_like, + input_arg1 input_arg2, + "Returns true if a has at least one match in a string,false otherwise." )); diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs new file mode 100644 index 0000000000000..e6a6ff32a53cd --- /dev/null +++ b/datafusion/functions/src/regex/regexplike.rs @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Regex expressions +use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::compute::kernels::regexp; +use arrow::datatypes::DataType; +use datafusion_common::exec_err; +use datafusion_common::ScalarValue; +use datafusion_common::{arrow_datafusion_err, plan_err}; +use datafusion_common::{ + cast::as_generic_string_array, internal_err, DataFusionError, Result, +}; +use datafusion_expr::ColumnarValue; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use std::sync::Arc; + +#[derive(Debug)] +pub(super) struct RegexpLikeFunc { + signature: Signature, +} +impl RegexpLikeFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8]), + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8, Utf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RegexpLikeFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "regexp_like" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + + Ok(match &arg_types[0] { + LargeUtf8 | Utf8 => Boolean, + Null => Null, + other => { + return plan_err!( + "The regexp_like function can only accept strings. 
Got {other}" + ); + } + }) + } + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + let inferred_length = len.unwrap_or(1); + let args = args + .iter() + .map(|arg| arg.clone().into_array(inferred_length)) + .collect::>>()?; + + let result = regexp_like_func(&args); + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } + } +} +fn regexp_like_func(args: &[ArrayRef]) -> Result { + match args[0].data_type() { + DataType::Utf8 => regexp_like::(args), + DataType::LargeUtf8 => regexp_like::(args), + other => { + internal_err!("Unsupported data type {other:?} for function regexp_like") + } + } +} +/// Tests a string using a regular expression returning true if at +/// least one match, false otherwise. 
+/// +/// The full list of supported features and syntax can be found at +/// <https://docs.rs/regex/latest/regex/#syntax> +/// +/// Supported flags can be found at +/// <https://docs.rs/regex/latest/regex/#grouping-and-flags> +/// +/// # Examples +/// +/// ```ignore +/// # use datafusion::prelude::*; +/// # use datafusion::error::Result; +/// # #[tokio::main] +/// # async fn main() -> Result<()> { +/// let ctx = SessionContext::new(); +/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?; +/// +/// // use the regexp_like function to test col 'values', +/// // against patterns in col 'patterns' without flags +/// let df = df.with_column( +/// "a", +/// regexp_like(vec![col("values"), col("patterns")]) +/// )?; +/// // use the regexp_like function to test col 'values', +/// // against patterns in col 'patterns' with flags +/// let df = df.with_column( +/// "b", +/// regexp_like(vec![col("values"), col("patterns"), col("flags")]) +/// )?; +/// // literals can be used as well with dataframe calls +/// let df = df.with_column( +/// "c", +/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]) +/// )?; +/// +/// df.show().await?; +/// +/// # Ok(()) +/// # } +/// ``` +pub fn regexp_like(args: &[ArrayRef]) -> Result { + match args.len() { + 2 => { + let values = as_generic_string_array::(&args[0])?; + let regex = as_generic_string_array::(&args[1])?; + let array = regexp::regexp_is_match_utf8(values, regex, None) + .map_err(|e| arrow_datafusion_err!(e))?; + + Ok(Arc::new(array) as ArrayRef) + } + 3 => { + let values = as_generic_string_array::(&args[0])?; + let regex = as_generic_string_array::(&args[1])?; + let flags = as_generic_string_array::(&args[2])?; + + if flags.iter().any(|s| s == Some("g")) { + return plan_err!("regexp_like() does not support the \"global\" option"); + } + + let array = regexp::regexp_is_match_utf8(values, regex, Some(flags)) + .map_err(|e| arrow_datafusion_err!(e))?; + + Ok(Arc::new(array) as ArrayRef) + } + other => exec_err!( + "regexp_like was called with {other} arguments. 
It requires at least 2 and at most 3." + ), + } +} +mod tests { + use arrow::array::*; + + use datafusion_common::ScalarValue; + + use super::*; + #[test] + fn test_case_sensitive_regexp_like() { + let values = StringArray::from(vec!["abc"; 5]); + + let patterns = + StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); + + let mut expected_builder: BooleanBuilder = BooleanBuilder::new(); + expected_builder.append_value(true); + expected_builder.append_value(false); + expected_builder.append_value(true); + expected_builder.append_value(false); + expected_builder.append_value(false); + let expected = expected_builder.finish(); + + let re = regexp_like::(&[Arc::new(values), Arc::new(patterns)]).unwrap(); + + assert_eq!(re.as_ref(), &expected); + } + + #[test] + fn test_case_insensitive_regexp_like() { + let values = StringArray::from(vec!["abc"; 5]); + let patterns = + StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); + let flags = StringArray::from(vec!["i"; 5]); + + let mut expected_builder: BooleanBuilder = BooleanBuilder::new(); + expected_builder.append_value(true); + expected_builder.append_value(true); + expected_builder.append_value(true); + expected_builder.append_value(true); + expected_builder.append_value(false); + let expected = expected_builder.finish(); + + let re = + regexp_like::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) + .unwrap(); + + assert_eq!(re.as_ref(), &expected); + } + + #[test] + fn test_unsupported_global_flag_regexp_like() { + let values = StringArray::from(vec!["abc"]); + let patterns = StringArray::from(vec!["^(a)"]); + let flags = StringArray::from(vec!["g"]); + + let re_err = + regexp_like::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) + .expect_err("unsupported flag should have failed"); + + assert_eq!( + re_err.strip_backtrace(), + "Error during planning: regexp_like() does not support the \"global\" option" + ); + } +} diff --git 
a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 8a2180f00be74..b9e7be125ccb0 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -139,3 +139,72 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { ), } } +mod tests { + use arrow::array::*; + + use datafusion_common::ScalarValue; + + use super::*; + + #[test] + fn test_case_sensitive_regexp_match() { + let values = StringArray::from(vec!["abc"; 5]); + let patterns = + StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); + + let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + expected_builder.values().append_value("a"); + expected_builder.append(true); + expected_builder.append(false); + expected_builder.values().append_value("b"); + expected_builder.append(true); + expected_builder.append(false); + expected_builder.append(false); + let expected = expected_builder.finish(); + + let re = regexp_match::(&[Arc::new(values), Arc::new(patterns)]).unwrap(); + + assert_eq!(re.as_ref(), &expected); + } + + #[test] + fn test_case_insensitive_regexp_match() { + let values = StringArray::from(vec!["abc"; 5]); + let patterns = + StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); + let flags = StringArray::from(vec!["i"; 5]); + + let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + expected_builder.values().append_value("a"); + expected_builder.append(true); + expected_builder.values().append_value("a"); + expected_builder.append(true); + expected_builder.values().append_value("b"); + expected_builder.append(true); + expected_builder.values().append_value("b"); + expected_builder.append(true); + expected_builder.append(false); + let expected = expected_builder.finish(); + + let re = + regexp_match::(&[Arc::new(values), 
Arc::new(patterns), Arc::new(flags)]) + .unwrap(); + + assert_eq!(re.as_ref(), &expected); + } + + #[test] + fn test_unsupported_global_flag_regexp_match() { + let values = StringArray::from(vec!["abc"]); + let patterns = StringArray::from(vec!["^(a)"]); + let flags = StringArray::from(vec!["g"]); + + let re_err = + regexp_match::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) + .expect_err("unsupported flag should have failed"); + + assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option"); + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 2552381a79b0f..55c81b9deb310 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -593,27 +593,6 @@ pub fn create_physical_fun( _ => unreachable!(), }, }), - BuiltinScalarFunction::RegexpLike => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_on_array_if_regex_expressions_feature_flag!( - regexp_like, - i32, - "regexp_like" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_on_array_if_regex_expressions_feature_flag!( - regexp_like, - i64, - "regexp_like" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function regexp_like") - } - }), BuiltinScalarFunction::RegexpReplace => { Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -3089,73 +3068,73 @@ mod tests { Ok(()) } - #[test] - #[cfg(feature = "regex_expressions")] - fn test_regexp_like() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); - let execution_props = ExecutionProps::new(); - - let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); - let pattern = lit(r".*-(\d*)"); - let columns: Vec = vec![col_value]; - let expr = create_physical_expr_with_type_coercion( - 
&BuiltinScalarFunction::RegexpLike, - &[col("a", &schema)?, pattern], - &schema, - &execution_props, - )?; - - // type is correct - assert_eq!(expr.data_type(&schema)?, DataType::Boolean); - - // evaluate works - let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; - let result = expr - .evaluate(&batch)? - .into_array(batch.num_rows()) - .expect("Failed to convert to array"); - - let result = as_boolean_array(&result)?; - - // value is correct - assert!(result.value(0)); - - Ok(()) - } - - #[test] - #[cfg(feature = "regex_expressions")] - fn test_regexp_like_all_literals() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let execution_props = ExecutionProps::new(); - - let col_value = lit("aaa-555"); - let pattern = lit(r".*-(\d*)"); - let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; - let expr = create_physical_expr_with_type_coercion( - &BuiltinScalarFunction::RegexpLike, - &[col_value, pattern], - &schema, - &execution_props, - )?; - - // type is correct - assert_eq!(expr.data_type(&schema)?, DataType::Boolean); - - // evaluate works - let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; - let result = expr - .evaluate(&batch)? 
- .into_array(batch.num_rows()) - .expect("Failed to convert to array"); - - let result = as_boolean_array(&result)?; - - // value is correct - assert!(result.value(0)); - - Ok(()) - } + // #[test] + // #[cfg(feature = "regex_expressions")] + // fn test_regexp_like() -> Result<()> { + // let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); + // let execution_props = ExecutionProps::new(); + + // let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); + // let pattern = lit(r".*-(\d*)"); + // let columns: Vec = vec![col_value]; + // let expr = create_physical_expr_with_type_coercion( + // &BuiltinScalarFunction::RegexpLike, + // &[col("a", &schema)?, pattern], + // &schema, + // &execution_props, + // )?; + + // // type is correct + // assert_eq!(expr.data_type(&schema)?, DataType::Boolean); + + // // evaluate works + // let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; + // let result = expr + // .evaluate(&batch)? + // .into_array(batch.num_rows()) + // .expect("Failed to convert to array"); + + // let result = as_boolean_array(&result)?; + + // // value is correct + // assert!(result.value(0)); + + // Ok(()) + // } + + // #[test] + // #[cfg(feature = "regex_expressions")] + // fn test_regexp_like_all_literals() -> Result<()> { + // let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + // let execution_props = ExecutionProps::new(); + + // let col_value = lit("aaa-555"); + // let pattern = lit(r".*-(\d*)"); + // let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; + // let expr = create_physical_expr_with_type_coercion( + // &BuiltinScalarFunction::RegexpLike, + // &[col_value, pattern], + // &schema, + // &execution_props, + // )?; + + // // type is correct + // assert_eq!(expr.data_type(&schema)?, DataType::Boolean); + + // // evaluate works + // let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; + // let result = expr + // .evaluate(&batch)? 
+ // .into_array(batch.num_rows()) + // .expect("Failed to convert to array"); + + // let result = as_boolean_array(&result)?; + + // // value is correct + // assert!(result.value(0)); + + // Ok(()) + // } // Helper function just for testing. // Returns `expressions` coerced to types compatible with diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index 846e5801af1c7..99e6597dad823 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -53,78 +53,6 @@ macro_rules! fetch_string_arg { }}; } -/// Tests a string using a regular expression returning true if at -/// least one match, false otherwise. -/// -/// The full list of supported features and syntax can be found at -/// -/// -/// Supported flags can be found at -/// -/// -/// # Examples -/// -/// ```ignore -/// # use datafusion::prelude::*; -/// # use datafusion::error::Result; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let ctx = SessionContext::new(); -/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?; -/// -/// // use the regexp_like function to test col 'values', -/// // against patterns in col 'patterns' without flags -/// let df = df.with_column( -/// "a", -/// regexp_like(vec![col("values"), col("patterns")]) -/// )?; -/// // use the regexp_like function to test col 'values', -/// // against patterns in col 'patterns' with flags -/// let df = df.with_column( -/// "b", -/// regexp_like(vec![col("values"), col("patterns"), col("flags")]) -/// )?; -/// // literals can be used as well with dataframe calls -/// let df = df.with_column( -/// "c", -/// regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]) -/// )?; -/// -/// df.show().await?; -/// -/// # Ok(()) -/// # } -/// ``` -pub fn regexp_like(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let values = as_generic_string_array::(&args[0])?; - let regex = 
as_generic_string_array::(&args[1])?; - let array = arrow_string::regexp::regexp_is_match_utf8(values, regex, None) - .map_err(|e| arrow_datafusion_err!(e))?; - - Ok(Arc::new(array) as ArrayRef) - } - 3 => { - let values = as_generic_string_array::(&args[0])?; - let regex = as_generic_string_array::(&args[1])?; - let flags = as_generic_string_array::(&args[2])?; - - if flags.iter().any(|s| s == Some("g")) { - return plan_err!("regexp_like() does not support the \"global\" option"); - } - - let array = arrow_string::regexp::regexp_is_match_utf8(values, regex, Some(flags)) - .map_err(|e| arrow_datafusion_err!(e))?; - - Ok(Arc::new(array) as ArrayRef) - } - other => exec_err!( - "regexp_like was called with {other} arguments. It requires at least 2 and at most 3." - ), - } -} - /// Extract a specific group from a string column, using a regular expression. /// /// The full list of supported features and syntax can be found at @@ -487,64 +415,6 @@ mod tests { use super::*; - #[test] - fn test_case_sensitive_regexp_like() { - let values = StringArray::from(vec!["abc"; 5]); - - let patterns = - StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); - - let mut expected_builder: BooleanBuilder = BooleanBuilder::new(); - expected_builder.append_value(true); - expected_builder.append_value(false); - expected_builder.append_value(true); - expected_builder.append_value(false); - expected_builder.append_value(false); - let expected = expected_builder.finish(); - - let re = regexp_like::(&[Arc::new(values), Arc::new(patterns)]).unwrap(); - - assert_eq!(re.as_ref(), &expected); - } - - #[test] - fn test_case_insensitive_regexp_like() { - let values = StringArray::from(vec!["abc"; 5]); - let patterns = - StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); - let flags = StringArray::from(vec!["i"; 5]); - - let mut expected_builder: BooleanBuilder = BooleanBuilder::new(); - expected_builder.append_value(true); - expected_builder.append_value(true); - 
expected_builder.append_value(true); - expected_builder.append_value(true); - expected_builder.append_value(false); - let expected = expected_builder.finish(); - - let re = - regexp_like::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) - .unwrap(); - - assert_eq!(re.as_ref(), &expected); - } - - #[test] - fn test_unsupported_global_flag_regexp_like() { - let values = StringArray::from(vec!["abc"]); - let patterns = StringArray::from(vec!["^(a)"]); - let flags = StringArray::from(vec!["g"]); - - let re_err = - regexp_like::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) - .expect_err("unsupported flag should have failed"); - - assert_eq!( - re_err.strip_backtrace(), - "Error during planning: regexp_like() does not support the \"global\" option" - ); - } - #[test] fn test_case_sensitive_regexp_match() { let values = StringArray::from(vec!["abc"; 5]); diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index a4a06bab854c2..d0c880aa1b6b4 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -680,7 +680,7 @@ enum ScalarFunction { InStr = 132; MakeDate = 133; ArrayReverse = 134; - RegexpLike = 135; + /// 135 is RegexpLike ToChar = 136; } diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 443597bebc20c..dd9ac5478e811 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22447,7 +22447,6 @@ impl serde::Serialize for ScalarFunction { Self::InStr => "InStr", Self::MakeDate => "MakeDate", Self::ArrayReverse => "ArrayReverse", - Self::RegexpLike => "RegexpLike", Self::ToChar => "ToChar", }; serializer.serialize_str(variant) @@ -22586,7 +22585,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "InStr", "MakeDate", "ArrayReverse", - "RegexpLike", "ToChar", ]; @@ -22754,7 +22752,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "InStr" => 
Ok(ScalarFunction::InStr), "MakeDate" => Ok(ScalarFunction::MakeDate), "ArrayReverse" => Ok(ScalarFunction::ArrayReverse), - "RegexpLike" => Ok(ScalarFunction::RegexpLike), "ToChar" => Ok(ScalarFunction::ToChar), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index c0d234443c94d..8999f6558e711 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2768,7 +2768,7 @@ pub enum ScalarFunction { InStr = 132, MakeDate = 133, ArrayReverse = 134, - RegexpLike = 135, + /// / 135 is RegexpLike ToChar = 136, } impl ScalarFunction { @@ -2904,7 +2904,6 @@ impl ScalarFunction { ScalarFunction::InStr => "InStr", ScalarFunction::MakeDate => "MakeDate", ScalarFunction::ArrayReverse => "ArrayReverse", - ScalarFunction::RegexpLike => "RegexpLike", ScalarFunction::ToChar => "ToChar", } } @@ -3037,7 +3036,6 @@ impl ScalarFunction { "InStr" => Some(Self::InStr), "MakeDate" => Some(Self::MakeDate), "ArrayReverse" => Some(Self::ArrayReverse), - "RegexpLike" => Some(Self::RegexpLike), "ToChar" => Some(Self::ToChar), _ => None, } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index c89b3d1ed0f2f..f7491af1373aa 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -61,12 +61,12 @@ use datafusion_expr::{ lcm, left, levenshtein, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, radians, - random, regexp_like, regexp_replace, repeat, replace, reverse, right, round, rpad, - rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, - starts_with, string_to_array, strpos, struct_fun, substr, substr_index, substring, - tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, - BinaryExpr, 
BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, - GetFieldAccess, GetIndexedField, GroupingSet, + random, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, + sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with, + string_to_array, strpos, struct_fun, substr, substr_index, substring, tan, tanh, + to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, + BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, + GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -532,7 +532,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Left => Self::Left, ScalarFunction::Lpad => Self::Lpad, ScalarFunction::Random => Self::Random, - ScalarFunction::RegexpLike => Self::RegexpLike, ScalarFunction::RegexpReplace => Self::RegexpReplace, ScalarFunction::Repeat => Self::Repeat, ScalarFunction::Replace => Self::Replace, @@ -1622,12 +1621,6 @@ pub fn parse_expr( .map(|expr| parse_expr(expr, registry)) .collect::, _>>()?, )), - ScalarFunction::RegexpLike => Ok(regexp_like( - args.to_owned() - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::, _>>()?, - )), ScalarFunction::RegexpReplace => Ok(regexp_replace( args.to_owned() .iter() diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index b98be075f314d..0e55c0b2cc90e 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1515,7 +1515,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, BuiltinScalarFunction::Uuid => Self::Uuid, - BuiltinScalarFunction::RegexpLike => Self::RegexpLike, BuiltinScalarFunction::RegexpReplace => Self::RegexpReplace, 
BuiltinScalarFunction::Repeat => Self::Repeat, BuiltinScalarFunction::Replace => Self::Replace, diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index a80b08c41ee3f..19966be2095b4 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -124,6 +124,10 @@ SELECT regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i'); ---- false +query B +select regexp_like('aaa-555', '.*-(\d*)'); +---- +true # # regexp_match tests