Skip to content

Commit

Permalink
Add cudf::strings::contains_multiple
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Sep 24, 2024
1 parent 951616e commit e446371
Show file tree
Hide file tree
Showing 5 changed files with 538 additions and 4 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ add_library(
src/strings/replace/replace_slice.cu
src/strings/reverse.cu
src/strings/scan/scan_inclusive.cu
src/strings/search/contains_multiple.cu
src/strings/search/findall.cu
src/strings/search/find.cu
src/strings/search/find_multiple.cu
Expand Down
25 changes: 24 additions & 1 deletion cpp/benchmarks/string/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,28 @@ static void bench_find_string(nvbench::state& state)
} else if (api == "contains") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
} else if (api == "contains_multi") {
constexpr int iters = 10;
std::vector<std::string> match_targets({" abc",
"W43",
"0987 5W43",
"123 abc",
"23 abc",
"3 abc",
"é",
"7 5W43",
"87 5W43",
"987 5W43"});
auto multi_targets = std::vector<std::string>{};
for (int i = 0; i < iters; i++) {
multi_targets.emplace_back(match_targets[i % match_targets.size()]);
}
cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(),
multi_targets.end());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column));
});
} else if (api == "starts_with") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); });
Expand All @@ -84,7 +106,8 @@ static void bench_find_string(nvbench::state& state)

NVBENCH_BENCH(bench_find_string)
.set_name("find_string")
.add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
.add_string_axis("api",
{"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"})
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
.add_int64_axis("hit_rate", {20, 80}); // percentage
33 changes: 33 additions & 0 deletions cpp/include/cudf/strings/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,39 @@ std::unique_ptr<column> contains(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns a table of columns of boolean values for each string where true indicates
* the target string was found within that string in the provided column
*
* Each column in the result table corresponds to the result for the target string at the same
* ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th,
* etc.
*
* If the target is not found for a string, false is returned for that entry in the output column.
* If the target is an empty string, true is returned for all non-null entries in the output column.
*
* Any null string entries return corresponding null entries in the output columns.
*
* @code{.pseudo}
* input = ["a", "b", "c"]
* targets = ["a", "c"]
* output is a table with two boolean columns:
* column 0: [true, false, false]
* column 1: [false, false, true]
* @endcode
*
* @param input Strings instance for this operation
* @param targets UTF-8 encoded strings to search for in each string in `input`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New BOOL8 column
*/
std::unique_ptr<table> contains_multiple(
strings_column_view const& input,
strings_column_view const& targets,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a column of boolean values for each string where true indicates
* the target string was found at the beginning of that string in the provided column.
Expand Down
Loading

0 comments on commit e446371

Please sign in to comment.