diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 84b462bb884..38ee5389cf2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -603,6 +603,7 @@ add_library( src/strings/replace/replace_slice.cu src/strings/reverse.cu src/strings/scan/scan_inclusive.cu + src/strings/search/contains_multiple.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..baa34b13e2e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -73,6 +73,28 @@ static void bench_find_string(nvbench::state& state) } else if (api == "contains") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (api == "contains_multi") { + constexpr int iters = 10; + std::vector match_targets({" abc", + "W43", + "0987 5W43", + "123 abc", + "23 abc", + "3 abc", + "é", + "7 5W43", + "87 5W43", + "987 5W43"}); + auto multi_targets = std::vector{}; + for (int i = 0; i < iters; i++) { + multi_targets.emplace_back(match_targets[i % match_targets.size()]); + } + cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(), + multi_targets.end()); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::contains_multiple(input, cudf::strings_column_view(multi_targets_column)); + }); } else if (api == "starts_with") { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); @@ -84,7 +106,8 @@ static void bench_find_string(nvbench::state& state) NVBENCH_BENCH(bench_find_string) .set_name("find_string") - .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"}) + .add_string_axis("api", + {"find", "find_multi", "contains", "contains_multi", "starts_with", "ends_with"}) .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) .add_int64_axis("hit_rate", {20, 80}); // percentage diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index e024b116a71..497c263cca8 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -163,6 +163,39 @@ std::unique_ptr contains( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns a table of columns of boolean values for each string where true indicates + * the target string was found within that string in the provided column + * + * Each column in the result table corresponds to the result for the target string at the same + * ordinal. i.e. 0th column is the BOOL8 column result for the 0th target string, 1th for 1th, + * etc. + * + * If the target is not found for a string, false is returned for that entry in the output column. + * If the target is an empty string, true is returned for all non-null entries in the output column. + * + * Any null string entries return corresponding null entries in the output columns. + * + * @code{.pseudo} + * input = ["a", "b", "c"] + * targets = ["a", "c"] + * output is a table with two boolean columns: + * column 0: [true, false, false] + * column 1: [false, false, true] + * @endcode + * + * @param input Strings instance for this operation + * @param targets UTF-8 encoded strings to search for in each string in `input` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column + */ +std::unique_ptr contains_multiple( + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found at the beginning of that string in the provided column. diff --git a/cpp/src/strings/search/contains_multiple.cu b/cpp/src/strings/search/contains_multiple.cu new file mode 100644 index 00000000000..e4673016e28 --- /dev/null +++ b/cpp/src/strings/search/contains_multiple.cu @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * a warp-parallel function is used. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; + +CUDF_KERNEL void multi_contains_warp_parallel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + cudf::device_span d_results) +{ + auto const num_targets = d_targets.size(); + auto const num_rows = d_strings.size(); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= num_rows) { return; } + if (d_strings.is_null(str_idx)) { return; } + // get the string for this warp + auto const d_str = d_strings.element(str_idx); + + // size of shared_bools = targets_size * block_size + // each thread uses targets_size bools + extern __shared__ bool shared_bools[]; + auto const lane_idx = idx % cudf::detail::warp_size; + + // initialize result: set true if target is empty, false otherwise + for (int target_idx = 0; target_idx < num_targets; target_idx++) { + auto const d_target = d_targets.element(target_idx); + shared_bools[threadIdx.x * num_targets + target_idx] = d_target.empty(); + } + + auto const last_ptr = d_first_bytes + unique_count; + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += cudf::detail::warp_size) { + // search for byte in first_bytes array + auto const chr = static_cast(*(d_str.data() + str_byte_idx)); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = + (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + // check for targets that begin with chr + while ((map_idx < num_targets) && (offset_idx < last_idx)) { + auto target_idx = d_indices[map_idx++]; + int temp_result_idx = threadIdx.x * num_targets + target_idx; + if (!shared_bools[temp_result_idx]) { // not found before + auto const d_target = d_targets.element(target_idx); + if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + // first char already checked, only need to check the [2nd, end) chars if has. + bool found = true; + for (auto i = 1; i < d_target.size_bytes() && found; i++) { + if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } + } + if (found) { shared_bools[temp_result_idx] = true; } + } + } + ++offset_idx; + } + } + + // wait all lanes are done in a warp + __syncwarp(); + + if (lane_idx == 0) { + for (int target_idx = 0; target_idx < num_targets; target_idx++) { + bool found = false; + for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) { + int temp_idx = (threadIdx.x + lane_idx) * num_targets + target_idx; + if (shared_bools[temp_idx]) { + found = true; + break; + } + } + d_results[target_idx][str_idx] = found; + } + } +} + +CUDF_KERNEL void multi_contains_row_parallel(column_device_view const d_strings, + column_device_view const d_targets, + u_char const* d_first_bytes, + size_type const* d_indices, + size_type const* d_offsets, + size_type unique_count, + cudf::device_span d_results) +{ + auto const str_idx = static_cast(cudf::detail::grid_1d::global_thread_id()); + auto const num_targets = d_targets.size(); + auto const num_rows = d_strings.size(); + if (str_idx >= num_rows) { return; } + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + + // initialize output; the result of searching empty target is true + for (auto target_idx = 0; target_idx < num_targets; ++target_idx) { + auto const d_target = d_targets.element(target_idx); + d_results[target_idx][str_idx] = d_target.empty(); + } + + // process each byte of the current string + auto const last_ptr = d_first_bytes + unique_count; + for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) { + // search for byte in first_bytes array + auto const chr = static_cast(*(d_str.data() + str_byte_idx)); + auto const byte_ptr = thrust::lower_bound(thrust::seq, d_first_bytes, last_ptr, chr); + // if not found, continue to next byte + if ((byte_ptr == last_ptr) || (*byte_ptr != chr)) { continue; } + // compute index of matched byte + auto offset_idx = static_cast(thrust::distance(d_first_bytes, byte_ptr)); + auto map_idx = d_offsets[offset_idx]; + auto const last_idx = + (offset_idx + 1) < unique_count ? d_offsets[offset_idx + 1] : unique_count; + // check for targets that begin with chr + while ((map_idx < num_targets) && (offset_idx < last_idx)) { + auto target_idx = d_indices[map_idx++]; + if (!d_results[target_idx][str_idx]) { // not found before + auto const d_target = d_targets.element(target_idx); + if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + // first char already checked, only need to check the [2nd, end) chars + bool found = true; + for (auto i = 1; i < d_target.size_bytes() && found; i++) { + if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { found = false; } + } + if (found) { d_results[target_idx][str_idx] = true; } + } + } + ++offset_idx; + } + } +} + +std::vector> multi_contains(bool warp_parallel, + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_targets = static_cast(targets.size()); + + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_targets = column_device_view::create(targets.parent(), stream); + + // copy the first byte of each target and sort the first bytes + auto first_bytes = rmm::device_uvector(targets.size(), stream); + auto indices = rmm::device_uvector(targets.size(), stream); + { + auto tgt_itr = thrust::make_transform_iterator( + d_targets->begin(), [] __device__(auto const& d_tgt) -> u_char { + return d_tgt.empty() ? u_char{0} : static_cast(d_tgt.data()[0]); + }); + auto count_itr = thrust::make_counting_iterator(0); + auto keys_out = first_bytes.begin(); + auto vals_out = indices.begin(); + auto cmp_op = thrust::less(); + auto sv = stream.value(); + + std::size_t tmp_bytes = 0; + cub::DeviceMergeSort::SortPairsCopy( + nullptr, tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + cub::DeviceMergeSort::SortPairsCopy( + tmp_stg.data(), tmp_bytes, tgt_itr, count_itr, keys_out, vals_out, num_targets, cmp_op, sv); + } + + // remove duplicates to speed up lower_bound + auto offsets = rmm::device_uvector(targets.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), offsets.begin(), offsets.end()); + auto end = thrust::unique_by_key( + rmm::exec_policy_nosync(stream), first_bytes.begin(), first_bytes.end(), offsets.begin()); + auto ucount = static_cast(thrust::distance(first_bytes.begin(), end.first)); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + }); + auto results_list = + std::vector>(results_iter, results_iter + targets.size()); + auto device_results_list = [&] { + auto host_results_pointer_iter = + thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + results_list.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + }(); + + constexpr cudf::thread_index_type block_size = 256; + + auto d_first_bytes = first_bytes.data(); + auto d_indices = indices.data(); + auto d_offsets = offsets.data(); + + if (warp_parallel) { + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + int shared_mem_size = block_size * targets.size(); + multi_contains_warp_parallel<<>>( + *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + } else { + cudf::detail::grid_1d grid{static_cast(input.size()), block_size}; + multi_contains_row_parallel<<>>( + *d_strings, *d_targets, d_first_bytes, d_indices, d_offsets, ucount, device_results_list); + } + + return results_list; +} + +} // namespace + +std::unique_ptr
contains_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); + CUDF_EXPECTS(not targets.has_nulls(), "Target strings cannot be null"); + + if ((input.null_count() == input.size()) || + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)) { + // Small strings. Searching for multiple targets in one thread seems to work fastest. + return std::make_unique
( + multi_contains(/**warp parallel**/ false, input, targets, stream, mr)); + } + + // Long strings + // Use warp parallel when the average string width is greater than the threshold + static constexpr size_type target_group_size = 16; // perhaps can be calculated + if (targets.size() <= target_group_size) { + return std::make_unique
( + multi_contains(/**warp parallel**/ true, input, targets, stream, mr)); + } + + // Too many targets will consume more shared memory, so split targets + // TODO: test with large working memory (instead of shared-memory) + std::vector> ret_columns; + auto const num_groups = cudf::util::div_rounding_up_safe(targets.size(), target_group_size); + for (size_type group_idx = 0; group_idx < num_groups; group_idx++) { + auto const start_target = group_idx * target_group_size; + auto const end_target = std::min(start_target + target_group_size, targets.size()); + + auto target_group = cudf::detail::slice(targets.parent(), start_target, end_target, stream); + auto bool_columns = multi_contains( + /**warp parallel**/ true, input, strings_column_view(target_group), stream, mr); + for (auto& c : bool_columns) { + ret_columns.push_back(std::move(c)); // transfer ownership + } + } + return std::make_unique
(std::move(ret_columns)); +} + +} // namespace detail + +std::unique_ptr
contains_multiple(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_multiple(strings, targets, stream, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 2da95ba5c27..698fea50bad 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -17,16 +17,15 @@ #include #include #include +#include +#include -#include #include #include #include #include #include -#include - #include struct StringsFindTest : public cudf::test::BaseFixture {}; @@ -198,6 +197,154 @@ TEST_F(StringsFindTest, ContainsLongStrings) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsFindTest, MultiContains) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 9 rows: + std::vector s = { + "Héllo, there world and goodbye", + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving", + "the following code snippet demonstrates how to use search for values in an ordered range", + "it returns the last position where value could be inserted without violating the ordering", + "algorithms execution is parallelized as determined by an execution policy. t", + "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", + "", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 8, 8 + 1 * 9, 8 + 2 * 9 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + auto strings_view = cudf::strings_column_view(strings); + std::vector match_targets({" the ", "a", "", "é"}); + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = + cudf::strings::contains_multiple(strings_view, cudf::strings_column_view(multi_targets_column)); + + std::vector ret_0 = {0, 1, 0, 1, 0, 0, 0, 0, 0}; + std::vector ret_1 = {1, 1, 1, 1, 1, 1, 1, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {1, 0, 0, 0, 0, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + + auto expected = cudf::table_view({expected_0, expected_1, expected_2, expected_3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindTest, MultiContainsMoreTargets) +{ + auto const strings = cudf::test::strings_column_wrapper{ + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position"}; + auto strings_view = cudf::strings_column_view(strings); + std::vector targets({"lazy brown", "non-exist", ""}); + + std::vector> expects; + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({0, 0, 0})); + expects.push_back(cudf::test::fixed_width_column_wrapper({1, 1, 1})); + + std::vector match_targets; + int max_num_targets = 50; + + for (int num_targets = 1; num_targets < max_num_targets; num_targets++) { + match_targets.clear(); + for (int i = 0; i < num_targets; i++) { + match_targets.push_back(targets[i % targets.size()]); + } + + cudf::test::strings_column_wrapper multi_targets_column(match_targets.begin(), + match_targets.end()); + auto results = cudf::strings::contains_multiple( + strings_view, cudf::strings_column_view(multi_targets_column)); + EXPECT_EQ(results->num_columns(), num_targets); + for (int i = 0; i < num_targets; i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(i), expects[i % expects.size()]); + } + } +} + +TEST_F(StringsFindTest, MultiContainsLongStrings) +{ + constexpr int num_rows = 1024 + 1; + // replicate the following 7 rows: + std::vector s = { + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "quick brown fox jumped", + "the following code snippet demonstrates how to use search for values in an ordered rangethe " + "following code snippet", + "thé it returns the last position where value could be inserted without violating ordering thé " + "it returns the last position", + "algorithms execution is parallelized as determined by an execution policy. t algorithms " + "execution is parallelized as ", + "he this is a continuation of previous row to make sure string boundaries are honored he this " + "is a continuation of previous row", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ " + "!@#$%^&*()~abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKL", + ""}; + + // replicate strings + auto string_itr = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s[i % s.size()]; }); + + // nulls: 6, 6 + 1 * 7, 6 + 2 * 7 ...... + auto string_v = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i + 1) % s.size() != 0; }); + + auto const strings = + cudf::test::strings_column_wrapper(string_itr, string_itr + num_rows, string_v); + + auto sv = cudf::strings_column_view(strings); + auto targets = cudf::test::strings_column_wrapper({" the ", "search", "", "string", "ox", "é "}); + auto results = cudf::strings::contains_multiple(sv, cudf::strings_column_view(targets)); + + std::vector ret_0 = {1, 0, 1, 0, 0, 0, 0}; + std::vector ret_1 = {0, 1, 0, 0, 0, 0, 0}; + std::vector ret_2 = {1, 1, 1, 1, 1, 1, 0}; + std::vector ret_3 = {0, 0, 0, 0, 1, 0, 0}; + std::vector ret_4 = {1, 0, 0, 0, 0, 0, 0}; + std::vector ret_5 = {0, 0, 1, 0, 0, 0, 0}; + + auto make_bool_col_fn = [&string_v, &num_rows](std::vector bools) { + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return bools[i % bools.size()]; }); + return cudf::test::fixed_width_column_wrapper(iter, iter + num_rows, string_v); + }; + + auto expected_0 = make_bool_col_fn(ret_0); + auto expected_1 = make_bool_col_fn(ret_1); + auto expected_2 = make_bool_col_fn(ret_2); + auto expected_3 = make_bool_col_fn(ret_3); + auto expected_4 = make_bool_col_fn(ret_4); + auto expected_5 = make_bool_col_fn(ret_5); + + auto expected = + cudf::table_view({expected_0, expected_1, expected_2, expected_3, expected_4, expected_5}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(results->view(), expected); +} + TEST_F(StringsFindTest, StartsWith) { cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},