Skip to content

Commit

Permalink
Improve performance of trim for string view (10%) (#12395)
Browse files Browse the repository at this point in the history
* draft.

* add unit tests for xTrim.

* fix fmt.

* tmp copy for ci.

* move `make_and_append_view` to common.

* fix sting view trim about the process of empty string.

* fix compile.

* eliminate some repeated codes.

* add sql test case about string view trim.

* remove unused imports.

* fix tests.

* remove stale file.

* Avoid unecessary unsafe

* add unit test cases with a unlined string view output.

* fix tests.

* improve comments.

* add todo and the related issue.

* use the safe way to get `start_offset` after trimming.

* fix comments.

* Remove redundant test

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
Rachelint and alamb authored Sep 25, 2024
1 parent 21df68c commit dbfde67
Show file tree
Hide file tree
Showing 6 changed files with 605 additions and 70 deletions.
152 changes: 151 additions & 1 deletion datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ impl ScalarUDFImpl for BTrimFunc {
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
utf8_to_str_type(&arg_types[0], "btrim")
if arg_types[0] == DataType::Utf8View {
Ok(DataType::Utf8View)
} else {
utf8_to_str_type(&arg_types[0], "btrim")
}
}

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
Expand All @@ -106,3 +110,149 @@ impl ScalarUDFImpl for BTrimFunc {
&self.aliases
}
}

#[cfg(test)]
mod tests {
use arrow::array::{Array, StringArray, StringViewArray};
use arrow::datatypes::DataType::{Utf8, Utf8View};

use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use crate::string::btrim::BTrimFunc;
use crate::utils::test::test_function;

#[test]
fn test_functions() {
// String view cases for checking normal logic
test_function!(
BTrimFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
String::from("alphabet ")
))),],
Ok(Some("alphabet")),
&str,
Utf8View,
StringViewArray
);
test_function!(
BTrimFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
String::from(" alphabet ")
))),],
Ok(Some("alphabet")),
&str,
Utf8View,
StringViewArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
"alphabet"
)))),
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("t")))),
],
Ok(Some("alphabe")),
&str,
Utf8View,
StringViewArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
"alphabet"
)))),
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
"alphabe"
)))),
],
Ok(Some("t")),
&str,
Utf8View,
StringViewArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
"alphabet"
)))),
ColumnarValue::Scalar(ScalarValue::Utf8View(None)),
],
Ok(None),
&str,
Utf8View,
StringViewArray
);
// Special string view case for checking unlined output(len > 12)
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
"xxxalphabetalphabetxxx"
)))),
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("x")))),
],
Ok(Some("alphabetalphabet")),
&str,
Utf8View,
StringViewArray
);
// String cases
test_function!(
BTrimFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
String::from("alphabet ")
))),],
Ok(Some("alphabet")),
&str,
Utf8,
StringArray
);
test_function!(
BTrimFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
String::from("alphabet ")
))),],
Ok(Some("alphabet")),
&str,
Utf8,
StringArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("t")))),
],
Ok(Some("alphabe")),
&str,
Utf8,
StringArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabe")))),
],
Ok(Some("t")),
&str,
Utf8,
StringArray
);
test_function!(
BTrimFunc::new(),
&[
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("alphabet")))),
ColumnarValue::Scalar(ScalarValue::Utf8(None)),
],
Ok(None),
&str,
Utf8,
StringArray
);
}
}
Loading

0 comments on commit dbfde67

Please sign in to comment.