Skip to content

Commit ec5e038

Browse files
richoxzhangli20
authored andcommitted
Improve substr() performance by avoiding using owned string (apache#13688)
Co-authored-by: zhangli20 <[email protected]>
1 parent 021a500 commit ec5e038

File tree

1 file changed

+40
-37
lines changed

1 file changed

+40
-37
lines changed

datafusion/functions/src/unicode/substr.rs

+40-37
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ use std::sync::{Arc, OnceLock};
2121
use crate::strings::{make_and_append_view, StringArrayType};
2222
use crate::utils::{make_scalar_function, utf8_to_str_type};
2323
use arrow::array::{
24-
Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait,
25-
StringViewArray,
24+
Array, ArrayIter, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
25+
OffsetSizeTrait, StringViewArray,
2626
};
2727
use arrow::datatypes::DataType;
2828
use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
@@ -448,10 +448,9 @@ where
448448
match args.len() {
449449
1 => {
450450
let iter = ArrayIter::new(string_array);
451-
452-
let result = iter
453-
.zip(start_array.iter())
454-
.map(|(string, start)| match (string, start) {
451+
let mut result_builder = GenericStringBuilder::<T>::new();
452+
for (string, start) in iter.zip(start_array.iter()) {
453+
match (string, start) {
455454
(Some(string), Some(start)) => {
456455
let (start, end) = get_true_start_end(
457456
string,
@@ -460,47 +459,51 @@ where
460459
enable_ascii_fast_path,
461460
); // start, end is byte-based
462461
let substr = &string[start..end];
463-
Some(substr.to_string())
462+
result_builder.append_value(substr);
464463
}
465-
_ => None,
466-
})
467-
.collect::<GenericStringArray<T>>();
468-
Ok(Arc::new(result) as ArrayRef)
464+
_ => {
465+
result_builder.append_null();
466+
}
467+
}
468+
}
469+
Ok(Arc::new(result_builder.finish()) as ArrayRef)
469470
}
470471
2 => {
471472
let iter = ArrayIter::new(string_array);
472473
let count_array = count_array_opt.unwrap();
474+
let mut result_builder = GenericStringBuilder::<T>::new();
473475

474-
let result = iter
475-
.zip(start_array.iter())
476-
.zip(count_array.iter())
477-
.map(|((string, start), count)| {
478-
match (string, start, count) {
479-
(Some(string), Some(start), Some(count)) => {
480-
if count < 0 {
481-
exec_err!(
476+
for ((string, start), count) in
477+
iter.zip(start_array.iter()).zip(count_array.iter())
478+
{
479+
match (string, start, count) {
480+
(Some(string), Some(start), Some(count)) => {
481+
if count < 0 {
482+
return exec_err!(
482483
"negative substring length not allowed: substr(<str>, {start}, {count})"
483-
)
484-
} else {
485-
if start == i64::MIN {
486-
return exec_err!("negative overflow when calculating skip value");
487-
}
488-
let (start, end) = get_true_start_end(
489-
string,
490-
start,
491-
Some(count as u64),
492-
enable_ascii_fast_path,
493-
); // start, end is byte-based
494-
let substr = &string[start..end];
495-
Ok(Some(substr.to_string()))
484+
);
485+
} else {
486+
if start == i64::MIN {
487+
return exec_err!(
488+
"negative overflow when calculating skip value"
489+
);
496490
}
491+
let (start, end) = get_true_start_end(
492+
string,
493+
start,
494+
Some(count as u64),
495+
enable_ascii_fast_path,
496+
); // start, end is byte-based
497+
let substr = &string[start..end];
498+
result_builder.append_value(substr);
497499
}
498-
_ => Ok(None),
499500
}
500-
})
501-
.collect::<Result<GenericStringArray<T>>>()?;
502-
503-
Ok(Arc::new(result) as ArrayRef)
501+
_ => {
502+
result_builder.append_null();
503+
}
504+
}
505+
}
506+
Ok(Arc::new(result_builder.finish()) as ArrayRef)
504507
}
505508
other => {
506509
exec_err!("substr was called with {other} arguments. It requires 2 or 3.")

0 commit comments

Comments
 (0)