Skip to content

Commit

Permalink
fix: using simple string match replace regex match for contains udf (#…
Browse files Browse the repository at this point in the history
…12931)

* fix: using simple string match replace regex match

* doc: update doc of contains

* test: add case for contains udf

---------

Co-authored-by: zhuliquan <[email protected]>
  • Loading branch information
zhuliquan and zhuliquan authored Oct 17, 2024
1 parent 3d1d28d commit 0e2023d
Showing 1 changed file with 34 additions and 21 deletions.
55 changes: 34 additions & 21 deletions datafusion/functions/src/string/contains.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
// under the License.

use crate::utils::make_scalar_function;
use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray};
use arrow::compute::regexp_is_match;
use arrow::array::{Array, ArrayRef, AsArray};
use arrow::compute::contains as arrow_contains;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View};
use datafusion_common::exec_err;
Expand Down Expand Up @@ -102,44 +102,57 @@ fn get_contains_doc() -> &'static Documentation {
})
}

/// Evaluates the `contains` function over two string arrays.
///
/// `args[0]` holds the strings to search in and `args[1]` holds the strings
/// to search for. The work is delegated to `arrow::compute::contains`, so the
/// search string is matched as plain text and is never compiled as a regular
/// expression. Characters such as `?`, `(` or `)` therefore carry no special
/// meaning and cannot cause a pattern-compilation error.
///
/// # Errors
///
/// Returns an execution error when the two arguments are not both `Utf8View`,
/// both `Utf8`, or both `LargeUtf8`, and propagates any error reported by the
/// arrow kernel.
///
/// # Panics
///
/// Panics if `args` holds fewer than two arrays, because the arguments are
/// accessed by index.
pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
    let haystack = &args[0];
    let needle = &args[1];

    // Each arm only downcasts to the concrete string layout; the comparison
    // itself is the same kernel call for every supported type pair.
    let matched = match (haystack.data_type(), needle.data_type()) {
        (Utf8View, Utf8View) => {
            arrow_contains(haystack.as_string_view(), needle.as_string_view())?
        }
        (Utf8, Utf8) => {
            arrow_contains(haystack.as_string::<i32>(), needle.as_string::<i32>())?
        }
        (LargeUtf8, LargeUtf8) => {
            arrow_contains(haystack.as_string::<i64>(), needle.as_string::<i64>())?
        }
        other => {
            return exec_err!("Unsupported data type {other:?} for function `contains`.");
        }
    };

    Ok(Arc::new(matched) as ArrayRef)
}

#[cfg(test)]
mod test {
    use super::ContainsFunc;
    use arrow::array::{BooleanArray, StringArray};
    use datafusion_common::ScalarValue;
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
    use std::sync::Arc;

    /// The search string is full of regex metacharacters (`?`, `(`); the
    /// function must still treat it as plain text.
    #[test]
    fn test_contains_udf() {
        // Column of strings to search in.
        let haystacks = StringArray::from(vec![Some("xxx?()"), Some("yyy?()")]);
        // Single search string applied to every row.
        let needle = ScalarValue::Utf8(Some("x?(".to_string()));

        let actual = ContainsFunc::new()
            .invoke(&[
                ColumnarValue::Array(Arc::new(haystacks)),
                ColumnarValue::Scalar(needle),
            ])
            .unwrap();

        // Only the first row contains the literal text "x?(".
        let expect = ColumnarValue::Array(Arc::new(BooleanArray::from(vec![
            Some(true),
            Some(false),
        ])));

        assert_eq!(
            *actual.into_array(2).unwrap(),
            *expect.into_array(2).unwrap()
        );
    }
}

0 comments on commit 0e2023d

Please sign in to comment.