Skip to content

Commit

Permalink
Add cudf::strings::contains_multiple (#16900)
Browse files Browse the repository at this point in the history
Add new `cudf::strings::contains_multiple` API to search multiple targets within a strings column.
The output is a table with one boolean column per target; each row of a column indicates whether that target was found in the corresponding input row.
This PR is to help in collaboration with #16641

Authors:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Chong Gao (https://github.com/res-life)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Chong Gao (https://github.com/res-life)
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Bradley Dice (https://github.com/bdice)

URL: #16900
  • Loading branch information
davidwendt authored Nov 12, 2024
1 parent 7682edb commit 796de4b
Show file tree
Hide file tree
Showing 9 changed files with 592 additions and 21 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,7 @@ add_library(
src/strings/replace/replace_slice.cu
src/strings/reverse.cu
src/strings/scan/scan_inclusive.cu
src/strings/search/contains_multiple.cu
src/strings/search/findall.cu
src/strings/search/find.cu
src/strings/search/find_multiple.cu
Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ ConfigureNVBench(
string/count.cpp
string/extract.cpp
string/find.cpp
string/find_multiple.cpp
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
Expand Down
14 changes: 3 additions & 11 deletions cpp/benchmarks/string/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
#include <cudf_test/column_wrapper.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

Expand All @@ -44,15 +42,13 @@ static void bench_find_string(nvbench::state& state)
auto const col = create_string_column(n_rows, row_width, hit_rate);
auto const input = cudf::strings_column_view(col->view());

std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
cudf::string_scalar target(h_targets[2]);
cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
cudf::string_scalar target("0987 5W43");

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto const chars_size = input.chars_size(stream);
state.add_element_count(chars_size, "chars_size");
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
if (api.substr(0, 4) == "find") {
if (api == "find") {
state.add_global_memory_writes<nvbench::int32_t>(input.size());
} else {
state.add_global_memory_writes<nvbench::int8_t>(input.size());
Expand All @@ -61,10 +57,6 @@ static void bench_find_string(nvbench::state& state)
if (api == "find") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::find(input, target); });
} else if (api == "find_multi") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
});
} else if (api == "contains") {
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
Expand All @@ -79,7 +71,7 @@ static void bench_find_string(nvbench::state& state)

NVBENCH_BENCH(bench_find_string)
.set_name("find_string")
.add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
.add_string_axis("api", {"find", "contains", "starts_with", "ends_with"})
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
.add_int64_axis("hit_rate", {20, 80}); // percentage
77 changes: 77 additions & 0 deletions cpp/benchmarks/string/find_multiple.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/find.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks the multi-target string search APIs.
 *
 * Reads the `num_rows`, `row_width`, `hit_rate`, `targets` and `api` axis values from
 * `state`, creates the input column with `create_string_column` and a targets column of
 * `targets` strings, then times `cudf::strings::find_multiple` when `api == "find"` or
 * `cudf::strings::contains_multiple` when `api == "contains"`.
 * Any other `api` value records the memory counters but executes nothing.
 *
 * @param state nvbench state that supplies the axis values and collects the measurements
 */
static void bench_find_string(nvbench::state& state)
{
  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width    = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
  auto const target_count = static_cast<cudf::size_type>(state.get_int64("targets"));
  auto const api          = state.get_string("api");

  auto const stream = cudf::get_default_stream();
  auto const col    = create_string_column(n_rows, row_width, hit_rate);
  auto const input  = cudf::strings_column_view(col->view());

  // Note that these all match the first row of the raw_data in create_string_column.
  // This is so the hit_rate can be properly accounted for.
  std::vector<std::string> const target_data(
    {" abc", "W43", "0987 5W43", "123 abc", "23 abc", "3 abc", "7 5W43", "87 5W43", "987 5W43"});
  // Cycle through target_data until the requested number of targets is reached.
  auto h_targets = std::vector<std::string>{};
  for (cudf::size_type i = 0; i < target_count; i++) {
    h_targets.emplace_back(target_data[i % target_data.size()]);
  }
  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
  // Build the view once so the timed lambdas only contain the API call itself.
  auto const targets_view = cudf::strings_column_view(targets);

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  auto const chars_size = input.chars_size(stream);
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);

  // Both APIs produce one result element per (row, target) pair: find_multiple returns
  // targets.size() positions for every input row and contains_multiple returns one boolean
  // column per target. Widen before multiplying so larger axis values cannot overflow
  // cudf::size_type.
  auto const output_elements = static_cast<nvbench::int64_t>(input.size()) * target_count;
  if (api == "find") {
    state.add_global_memory_writes<nvbench::int32_t>(output_elements);
  } else {
    state.add_global_memory_writes<nvbench::int8_t>(output_elements);
  }

  if (api == "find") {
    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cudf::strings::find_multiple(input, targets_view);
    });
  } else if (api == "contains") {
    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cudf::strings::contains_multiple(input, targets_view);
    });
  }
}

// Registers bench_find_string under the name "find_multiple" and declares the axes whose
// values the benchmark reads back from its nvbench::state.
NVBENCH_BENCH(bench_find_string)
.set_name("find_multiple")
// "find" runs cudf::strings::find_multiple; "contains" runs cudf::strings::contains_multiple
.add_string_axis("api", {"find", "contains"})
// number of target strings searched for in each row
.add_int64_axis("targets", {10, 20, 40})
// forwarded as the second argument of create_string_column
.add_int64_axis("row_width", {32, 64, 128, 256})
// forwarded as the first argument of create_string_column
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("hit_rate", {20, 80}); // percentage
40 changes: 37 additions & 3 deletions cpp/include/cudf/strings/find_multiple.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,42 @@ namespace strings {
*/

/**
* @brief Returns a lists column with character position values where each
* of the target strings are found in each string.
* @brief Searches for the given target strings within each string in the provided column
*
* Each column in the result table corresponds to the result for the target string at the same
* ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1st for 1st,
* etc.
*
* If the target is not found for a string, false is returned for that entry in the output column.
* If the target is an empty string, true is returned for all non-null entries in the output column.
*
* Any null input strings return corresponding null entries in the output columns.
*
* @code{.pseudo}
* input = ["a", "b", "c"]
* targets = ["a", "c"]
* output is a table with two boolean columns:
* column 0: [true, false, false]
* column 1: [false, false, true]
* @endcode
*
* @throw std::invalid_argument if `targets` is empty or contains nulls
*
* @param input Strings instance for this operation
* @param targets UTF-8 encoded strings to search for in each string in `input`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
* @return Table of BOOL8 columns
*/
std::unique_ptr<table> contains_multiple(
strings_column_view const& input,
strings_column_view const& targets,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Searches for the given target strings within each string in the provided column
* and returns the position the targets were found
*
* The size of the output column is `input.size()`.
* Each row of the output column is of size `targets.size()`.
Expand All @@ -45,7 +79,7 @@ namespace strings {
* [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1
* @endcode
*
* @throw cudf::logic_error if `targets` is empty or contains nulls
* @throw std::invalid_argument if `targets` is empty or contains nulls
*
* @param input Strings instance for this operation
* @param targets Strings to search for in each string
Expand Down
Loading

0 comments on commit 796de4b

Please sign in to comment.