diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 44d44933b98e..40f9bd9dd4b5 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1296,7 +1296,6 @@ dependencies = [
   "datafusion-common",
   "datafusion-execution",
   "datafusion-expr",
- "datafusion-functions",
   "half",
   "hashbrown 0.14.3",
   "hex",
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index e890c9623ca3..e15694fabd79 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -68,3 +68,7 @@ tokio = { workspace = true, features = ["macros", "rt", "sync"] }
 [[bench]]
 harness = false
 name = "to_timestamp"
+
+[[bench]]
+harness = false
+name = "regx"
diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs
new file mode 100644
index 000000000000..390676f8f249
--- /dev/null
+++ b/datafusion/functions/benches/regx.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use std::sync::Arc;
+
+use arrow_array::builder::StringBuilder;
+use arrow_array::{ArrayRef, StringArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_functions::regex::regexplike::regexp_like;
+use datafusion_functions::regex::regexpmatch::regexp_match;
+use rand::distributions::Alphanumeric;
+use rand::rngs::ThreadRng;
+use rand::seq::SliceRandom;
+use rand::Rng;
+
+/// Builds 1000 random 7-character alphanumeric strings to match against.
+fn data(rng: &mut ThreadRng) -> StringArray {
+    let mut data: Vec<String> = Vec::with_capacity(1000);
+    for _ in 0..1000 {
+        data.push(
+            rng.sample_iter(&Alphanumeric)
+                .take(7)
+                .map(char::from)
+                .collect(),
+        );
+    }
+
+    StringArray::from(data)
+}
+
+/// Builds 1000 regex patterns, each picked at random from a fixed sample set.
+fn regex(rng: &mut ThreadRng) -> StringArray {
+    let samples = vec![
+        ".*([A-Z]{1}).*".to_string(),
+        "^(A).*".to_string(),
+        r#"[\p{Letter}-]+"#.to_string(),
+        r#"[\p{L}-]+"#.to_string(),
+        "[a-zA-Z]_[a-zA-Z]{2}".to_string(),
+    ];
+    let mut data: Vec<String> = Vec::with_capacity(1000);
+    for _ in 0..1000 {
+        data.push(samples.choose(rng).unwrap().to_string());
+    }
+
+    StringArray::from(data)
+}
+
+/// Builds 1000 regex flag values (`"i"`, `"im"` or NULL), picked at random.
+fn flags(rng: &mut ThreadRng) -> StringArray {
+    let samples = vec![Some("i".to_string()), Some("im".to_string()), None];
+    let mut sb = StringBuilder::new();
+    for _ in 0..1000 {
+        // Borrow the chosen sample; no clone is needed to append it.
+        match samples.choose(rng).unwrap() {
+            Some(flag) => sb.append_value(flag),
+            None => sb.append_null(),
+        }
+    }
+
+    sb.finish()
+}
+
+/// Benchmarks `regexp_like` and `regexp_match` over 1000-row string arrays.
+fn criterion_benchmark(c: &mut Criterion) {
+    c.bench_function("regexp_like_1000", |b| {
+        let mut rng = rand::thread_rng();
+        let data = Arc::new(data(&mut rng)) as ArrayRef;
+        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
+        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
+
+        b.iter(|| {
+            black_box(
+                regexp_like::<i32>(&[data.clone(), regex.clone(), flags.clone()])
+                    .expect("regexp_like should work on valid values"),
+            )
+        })
+    });
+
+    c.bench_function("regexp_match_1000", |b| {
+        let mut rng = rand::thread_rng();
+        let data = Arc::new(data(&mut rng)) as ArrayRef;
+        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
+        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
+
+        b.iter(|| {
+            black_box(
+                regexp_match::<i32>(&[data.clone(), regex.clone(), flags.clone()])
+                    .expect("regexp_match should work on valid values"),
+            )
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
index 035c5b83b813..b0abad318058 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Encoding expressions
+//! Regex expressions
 use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
 use arrow::compute::kernels::regexp;
 use arrow::datatypes::DataType;
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index 4032ff0c14d0..f34502af35b7 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Encoding expressions
+//! Regex expressions
 use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
 use arrow::compute::kernels::regexp;
 use arrow::datatypes::DataType;
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 7e59d617b3f1..0ff7bd595c5b 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -61,7 +61,6 @@ chrono = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
-datafusion-functions = { workspace = true }
 half = { workspace = true }
 hashbrown = { version = "0.14", features = ["raw"] }
 hex = { version = "0.4", optional = true }
diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/physical-expr/benches/regexp.rs
index 6e5191d0f370..32acd6ca8f28 100644
--- a/datafusion/physical-expr/benches/regexp.rs
+++ b/datafusion/physical-expr/benches/regexp.rs
@@ -23,7 +23,6 @@ use std::sync::Arc;
 use arrow_array::builder::StringBuilder;
 use arrow_array::{ArrayRef, StringArray};
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use datafusion_functions::regex::regexplike::regexp_like;
 use datafusion_physical_expr::regex_expressions::{regexp_match, regexp_replace};
 use rand::distributions::Alphanumeric;
 use rand::rngs::ThreadRng;
@@ -75,20 +74,6 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
 }
 
 fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("regexp_like_1000", |b| {
-        let mut rng = rand::thread_rng();
-        let data = Arc::new(data(&mut rng)) as ArrayRef;
-        let regex = Arc::new(regex(&mut rng)) as ArrayRef;
-        let flags = Arc::new(flags(&mut rng)) as ArrayRef;
-
-        b.iter(|| {
-            black_box(
-                regexp_like::<i32>(&[data.clone(), regex.clone(), flags.clone()])
-                    .expect("regexp_like should work on valid values"),
-            )
-        })
-    });
-
     c.bench_function("regexp_match_1000", |b| {
         let mut rng = rand::thread_rng();
         let data = Arc::new(data(&mut rng)) as ArrayRef;