Skip to content

Commit

Permalink
feat: Add regexp_count function (#12970)
Browse files Browse the repository at this point in the history
* Implement regexp_count

* Update document

* fix check

* add more tests

* Update the world to 1.80

* Fix doc format

* Add null tests

* Add utf8 support and bench

* Refactoring regexp_count

* Refactoring regexp_count

* Revert ci change

* Fix ci

* Updates for documentation, minor improvements.

* Updates for documentation, minor improvements.

* updates to fix scalar tests, doc updates.

* updated regex and string features to remove deps on other features.

---------

Co-authored-by: Xin Li <[email protected]>
  • Loading branch information
Omega359 and xinlifoobar authored Oct 18, 2024
1 parent 3405234 commit 73ba4c4
Show file tree
Hide file tree
Showing 6 changed files with 1,382 additions and 15 deletions.
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ math_expressions = []
# enable regular expressions
regex_expressions = ["regex"]
# enable string functions
string_expressions = ["regex_expressions", "uuid"]
string_expressions = ["uuid"]
# enable unicode functions
unicode_expressions = ["hashbrown", "unicode-segmentation"]

Expand Down
54 changes: 53 additions & 1 deletion datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
extern crate criterion;

use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, AsArray, StringArray};
use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexpcount::regexp_count_func;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
Expand Down Expand Up @@ -59,6 +62,15 @@ fn regex(rng: &mut ThreadRng) -> StringArray {
StringArray::from(data)
}

/// Builds the benchmark's `start` column: 1000 random `i64` values, each drawn
/// from the half-open range `1..5`, packed into an `Int64Array`.
fn start(rng: &mut ThreadRng) -> Int64Array {
    // One `gen_range` draw per element, in order, exactly 1000 times.
    let positions: Vec<i64> = (0..1000).map(|_| rng.gen_range(1..5)).collect();
    Int64Array::from(positions)
}

fn flags(rng: &mut ThreadRng) -> StringArray {
let samples = [Some("i".to_string()), Some("im".to_string()), None];
let mut sb = StringBuilder::new();
Expand All @@ -75,6 +87,46 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("regexp_count_1000 string", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8"),
)
})
});

c.bench_function("regexp_count_1000 utf8view", |b| {
let mut rng = rand::thread_rng();
let data = cast(&data(&mut rng), &DataType::Utf8View).unwrap();
let regex = cast(&regex(&mut rng), &DataType::Utf8View).unwrap();
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = cast(&flags(&mut rng), &DataType::Utf8View).unwrap();

b.iter(|| {
black_box(
regexp_count_func(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on utf8view"),
)
})
});

c.bench_function("regexp_like_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down
27 changes: 26 additions & 1 deletion datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
use std::sync::Arc;

pub mod regexpcount;
pub mod regexplike;
pub mod regexpmatch;
pub mod regexpreplace;

// create UDFs
make_udf_function!(regexpcount::RegexpCountFunc, REGEXP_COUNT, regexp_count);
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
make_udf_function!(
Expand All @@ -35,6 +37,24 @@ make_udf_function!(
pub mod expr_fn {
use datafusion_expr::Expr;

/// Returns the number of consecutive occurrences of a regular expression in a string.
pub fn regexp_count(
values: Expr,
regex: Expr,
start: Option<Expr>,
flags: Option<Expr>,
) -> Expr {
let mut args = vec![values, regex];
if let Some(start) = start {
args.push(start);
};

if let Some(flags) = flags {
args.push(flags);
};
super::regexp_count().call(args)
}

/// Returns a list of regular expression matches in a string.
pub fn regexp_match(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
let mut args = vec![values, regex];
Expand Down Expand Up @@ -70,5 +90,10 @@ pub mod expr_fn {

/// Returns all DataFusion scalar UDFs defined in this regex package, each as a
/// shared `Arc<ScalarUDF>`.
pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
vec![regexp_match(), regexp_like(), regexp_replace()]
vec![
regexp_count(),
regexp_match(),
regexp_like(),
regexp_replace(),
]
}
Loading

0 comments on commit 73ba4c4

Please sign in to comment.