diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 6858a3fc69f..9c3a05a2f5f 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -278,7 +278,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
   TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
new file mode 100644
index 00000000000..6922b7214ff
--- /dev/null
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/char_types/char_types.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvtext/tokenize.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <limits>
+
+static void bench_vocab_tokenize(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  auto const column = [num_rows, row_width] {
+    data_profile const profile = data_profile_builder().no_validity().distribution(
+      cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+    return cudf::strings::filter_characters_of_type(
+      cudf::strings_column_view(col->view()),
+      cudf::strings::string_character_types::ALL_TYPES,
+      cudf::string_scalar(" "),
+      cudf::strings::string_character_types::ALPHANUM);
+  }();
+  cudf::strings_column_view input(column->view());
+
+  auto const vocab_col = [] {
+    data_profile const profile = data_profile_builder().no_validity().distribution(
+      cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+    auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
+    return cudf::strings::filter_characters_of_type(
+      cudf::strings_column_view(col->view()),
+      cudf::strings::string_character_types::ALL_TYPES,
+      cudf::string_scalar(""),
+      cudf::strings::string_character_types::ALPHANUM);
+  }();
+  auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view()));
+
+  auto token_count = [input] {
+    auto const counts = nvtext::count_tokens(input);
+    auto const agg    = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+    auto const count  = cudf::reduce(counts->view(), *agg, counts->type());
+    return static_cast<cudf::numeric_scalar<cudf::size_type>*>(count.get())
+      ->value(cudf::get_default_stream());
+  }();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size();
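+  // transfer stats: every input and vocabulary character is read and
+  // one integer token id is written out per token found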
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(token_count);
+
+  auto const delimiter = cudf::string_scalar("");
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter);
+  });
+}
+
+NVBENCH_BENCH(bench_vocab_tokenize)
+  .set_name("vocab_tokenize")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216});
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index f998c9ec239..41f8c0a8731 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -21,10 +21,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/hashing/detail/hash_allocator.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -37,6 +39,15 @@
 
 #include <cuco/static_map.cuh>
 
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/logical.h>
+#include <thrust/transform.h>
+
+#include <cub/cub.cuh>
+
 namespace nvtext {
 namespace detail {
 namespace {
@@ -162,6 +173,119 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(cudf::strings_column_view c
 
 namespace detail {
 namespace {
 
+/**
+ * @brief Threshold to decide on using string- or warp-parallel functions.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * the warp-parallel function is used to compute the output sizes.
+ * Otherwise, a regular string-parallel function is used.
+ *
+ * This value was found using the vocab_tokenize benchmark results.
+ */
+constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128;
+
+constexpr int block_size = 256;
+
+__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr)
+{
+  return d_delimiters.empty() ? (chr <= ' ') :  // whitespace check
+           thrust::any_of(thrust::seq,
+                          d_delimiters.begin(),
+                          d_delimiters.end(),
+                          [chr] __device__(cudf::char_utf8 c) { return c == chr; });
+}
+
+struct mark_delimiters_fn {
+  char const* d_chars;
+  cudf::string_view const d_delimiter;
+  int8_t* d_results;
+
+  __device__ void operator()(cudf::size_type idx) const
+  {
+    auto const ptr = d_chars + idx;
+    if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; }
+    cudf::char_utf8 chr = 0;
+    auto ch_size        = cudf::strings::detail::to_char_utf8(ptr, chr);
+    auto const output   = is_delimiter(d_delimiter, chr);
+    // mark every byte of this character with the delimiter result
+    while (ch_size > 0) {
+      d_results[idx++] = output;
+      --ch_size;
+    }
+  }
+};
+
+__global__ void token_counts_fn(cudf::column_device_view const d_strings,
+                                cudf::string_view const d_delimiter,
+                                cudf::size_type* d_counts,
+                                int8_t* d_results)
+{
+  // string per warp
+  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
+  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
+              static_cast<std::size_t>(cudf::detail::warp_size))) {
+    return;
+  }
+  auto const str_idx  = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
+
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_counts[str_idx] = 0;
+    return;
+  }
+
+  auto const offsets =
+    d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
+  auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
+  auto const chars_begin =
+    d_strings.child(cudf::strings_column_view::chars_column_index).data<char>() +
+    offsets[d_strings.offset()];
+
+  auto const begin        = d_str.data();
+  auto const end          = begin + d_str.size_bytes();
+  auto const d_output     = d_results + offset;
+  auto const d_output_end = d_output + d_str.size_bytes();
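+
+  // each lane scans a strided slice of this string's delimiter marks and
+  // counts token edges; the per-lane counts are summed with WarpReduce below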
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage warp_storage;
+
+  cudf::size_type count = 0;
+  if (lane_idx == 0) {
+    cudf::char_utf8 chr = 0;
+    auto ch_size        = cudf::strings::detail::to_char_utf8(begin, chr);
+    auto output         = 1;
+    if (begin > chars_begin) {
+      // check if the character preceding this string is a delimiter
+      auto ptr = begin - 1;
+      while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) {
+        --ptr;
+      }
+      cudf::strings::detail::to_char_utf8(ptr, chr);
+      output = !is_delimiter(d_delimiter, chr);
+    }
+    auto ptr = d_output;
+    while (ch_size > 0) {
+      *ptr++ = output;
+      --ch_size;
+    }
+    count = ((begin + ch_size) == end);
+  }
+  __syncwarp();
+
+  for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
+    // add one if at the edge of a token or at the string's end
+    count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
+  }
+  __syncwarp();
+
+  // add up the counts from the other threads to compute the total token count for this string
+  auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum());
+  if (lane_idx == 0) { d_counts[str_idx] = total_count; }
+}
+
 /**
  * @brief Tokenizes each string and uses the map to assign token id values
  *
@@ -197,6 +321,33 @@ struct vocabulary_tokenizer_fn {
   }
 };
 
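+/**
+ * @brief Returns the token id for the single token in the given string
+ *
+ * Leading delimiter characters are skipped so the remainder of the string
+ * is the token to look up in the vocabulary map. The default_id is returned
+ * when the token is not found in the map.
+ */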
+template <typename MapRefType>
+struct transform_tokenizer_fn {
+  cudf::string_view const d_delimiter;
+  MapRefType d_map;
+  cudf::size_type const default_id;
+
+  __device__ cudf::size_type operator()(cudf::string_view d_str) const
+  {
+    auto const begin = d_str.data();
+    auto const end   = begin + d_str.size_bytes();
+
+    auto itr = begin;
+    while (itr < end) {
+      cudf::char_utf8 chr = 0;
+      auto const ch_size  = cudf::strings::detail::to_char_utf8(itr, chr);
+      if (!is_delimiter(d_delimiter, chr)) break;
+      itr += ch_size;
+    }
+
+    auto const size  = static_cast<cudf::size_type>(thrust::distance(itr, end));
+    auto const token = cudf::string_view{itr, size};
+    // lookup token in map
+    auto const fitr = d_map.find(token);
+    return (fitr != d_map.end()) ? fitr->second : default_id;
+  }
+};
+
 }  // namespace
 
 std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view const& input,
@@ -209,28 +360,94 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
   auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
-  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+  if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); }
 
   // count the tokens per string and build the offsets from the counts
   auto const d_strings   = cudf::column_device_view::create(input.parent(), stream);
   auto const d_delimiter = delimiter.value(stream);
-  auto const sizes_itr =
-    cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter});
-  auto [token_offsets, total_count] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto map_ref           = vocabulary._impl->get_map_ref();
+  auto const zero_itr    = thrust::make_counting_iterator<cudf::size_type>(0);
+
+  if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+    auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
+      0, strings_tokenizer{*d_strings, d_delimiter});
+    auto [token_offsets, total_count] =
+      cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+
+    // build the output column to hold all the token ids
+    auto tokens = cudf::make_numeric_column(
+      output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
+    auto d_tokens  = tokens->mutable_view().data<cudf::size_type>();
+    auto d_offsets = token_offsets->view().data<cudf::size_type>();
+    vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
+      *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
+    thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
+    return cudf::make_lists_column(input.size(),
+                                   std::move(token_offsets),
+                                   std::move(tokens),
+                                   input.null_count(),
+                                   cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                   stream,
+                                   mr);
+  }
+
+  // longer strings perform better with the warp-parallel approach
+
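+  // compute the chars byte range for the input column; a non-zero offset
+  // means the column is a sliced view and does not begin at byte 0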
+  auto const first_offset  = (input.offset() == 0) ? 0
+                                                   : cudf::detail::get_value<cudf::size_type>(
+                                                       input.offsets(), input.offset(), stream);
+  auto const last_offset   = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
+                               ? input.chars().size()
+                               : cudf::detail::get_value<cudf::size_type>(
+                                   input.offsets(), input.size() + input.offset(), stream);
+  auto const chars_size    = last_offset - first_offset;
+  auto const d_input_chars = input.chars().data<char>() + first_offset;
+
+  rmm::device_uvector<cudf::size_type> d_token_counts(input.size(), stream);
+  rmm::device_uvector<int8_t> d_marks(chars_size, stream);
+
+  // mark position of all delimiters
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     zero_itr,
+                     chars_size,
+                     mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()});
+
+  // launch warp per string to compute token counts
+  cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
+  token_counts_fn<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    *d_strings, d_delimiter, d_token_counts.data(), d_marks.data());
+  auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column(
+    d_token_counts.begin(), d_token_counts.end(), stream, mr);
+
+  rmm::device_uvector<cudf::size_type> d_tmp_offsets(total_count + 1, stream);
+  d_tmp_offsets.set_element(total_count, chars_size, stream);
+  thrust::copy_if(rmm::exec_policy(stream),
+                  zero_itr,
+                  thrust::counting_iterator<cudf::size_type>(chars_size),
+                  d_tmp_offsets.begin(),
+                  [d_marks = d_marks.data()] __device__(auto idx) {
+                    if (idx == 0) return true;
+                    return d_marks[idx] && !d_marks[idx - 1];
+                  });
+
+  auto tmp_offsets =
+    std::make_unique<cudf::column>(std::move(d_tmp_offsets), rmm::device_buffer{}, 0);
+  auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0);
+  auto const tmp_input = cudf::column_view(
+    input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars});
+
+  auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
 
-  // build the output column to hold all the token ids
   auto tokens =
     cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
-  auto map_ref   = vocabulary._impl->get_map_ref();
-  auto d_offsets = token_offsets->view().data<cudf::size_type>();
-  auto d_tokens  = tokens->mutable_view().data<cudf::size_type>();
-  vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
-    *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     tokenizer);
+  auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
+
+  transform_tokenizer_fn<decltype(map_ref)> tokenizer{d_delimiter, map_ref, default_id};
+  thrust::transform(rmm::exec_policy(stream),
+                    d_tmp_strings->begin<cudf::string_view>(),
+                    d_tmp_strings->end<cudf::string_view>(),
+                    d_tokens,
+                    tokenizer);
 
   return cudf::make_lists_column(input.size(),
                                  std::move(token_offsets),
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index fbc706ea290..8118183a458 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -208,14 +208,16 @@ TEST_F(TextTokenizeTest, Vocabulary)
     {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
   auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));
 
-  auto validity = cudf::test::iterators::null_at(4);
-  cudf::test::strings_column_wrapper input({"the fox jumped over the dog",
-                                            "the dog chased the cat",
-                                            "the cat chased the mouse",
-                                            "the mousé ate cheese",
-                                            "",
-                                            ""},
-                                           validity);
+  auto validity = cudf::test::iterators::null_at(5);
+  auto input    = cudf::test::strings_column_wrapper({" the fox jumped over the dog ",
+                                                      " the dog chased the cat",
+                                                      "",
+                                                      "the cat chased the mouse ",
+                                                      "the mousé ate cheese",
+                                                      "",
+                                                      "dog"},
+                                                     validity);
+  auto input_view = cudf::strings_column_view(input);
 
   auto delimiter  = cudf::string_scalar(" ");
   auto default_id = -7;  // should be the token for the missing 'cat'
@@ -225,12 +227,55 @@ TEST_F(TextTokenizeTest, Vocabulary)
 
   // clang-format off
   LCW expected({LCW{ 9, 4, 5, 8, 9, 3},
                 LCW{ 9, 3, 1, 9,-7},
+                LCW{},
                 LCW{ 9,-7, 1, 9, 6},
                 LCW{ 9, 7, 0, 2},
-                LCW{}, LCW{}},
+                LCW{}, LCW{3}},
                validity);
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto sliced          = cudf::slice(input, {1, 4}).front();
+  auto sliced_expected = cudf::slice(expected, {1, 4}).front();
+
+  input_view = cudf::strings_column_view(sliced);
+
+  results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
+}
+
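+// these strings are longer than AVG_CHAR_BYTES_THRESHOLD to exercise the warp-parallel path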
+TEST_F(TextTokenizeTest, VocabularyLongStrings)
+{
+  cudf::test::strings_column_wrapper vocabulary(  // leaving out 'cat' on purpose
+    {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
+  auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));
+
+  std::vector<std::string> h_strings(
+    4,
+    "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
+    "jumped over the mouse house with the dog");
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
+  auto input_view = cudf::strings_column_view(input);
+  auto delimiter  = cudf::string_scalar(" ");
+  auto default_id = -1;
+  auto results    = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id);
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
+  // clang-format off
+  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto sliced          = cudf::slice(input, {1, 3}).front();
+  auto sliced_expected = cudf::slice(expected, {1, 3}).front();
+
+  input_view = cudf::strings_column_view(sliced);
+
+  results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
 }
 
 TEST_F(TextTokenizeTest, TokenizeErrors)