Improve performance of nvtext::tokenize_with_vocabulary for long strings (#14336)

Improves `nvtext::tokenize_with_vocabulary` performance for long strings. Also adds more tests and an nvbench benchmark.
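
For reference, a minimal sketch of the API being tuned, mirroring the calls made in the new benchmark below. The example strings and the `vocab_tokenize_example` wrapper are illustrative and not part of this change:

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/tokenize.hpp>

#include <memory>

// Build a small vocabulary, then tokenize a strings column against it.
// Tokens not found in the vocabulary are assigned the default id.
std::unique_ptr<cudf::column> vocab_tokenize_example()
{
  cudf::test::strings_column_wrapper vocabulary({"fox", "jumps", "the"});
  cudf::test::strings_column_wrapper input(
    {"the quick brown fox", "jumps over the lazy dog"});

  auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

  // An empty delimiter means tokens are separated by whitespace.
  auto const delimiter = cudf::string_scalar("");

  // Returns a LIST column of INT32 token ids, one list per input row.
  return nvtext::tokenize_with_vocabulary(cudf::strings_column_view(input), *vocab, delimiter);
}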

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: #14336
davidwendt authored Nov 3, 2023
1 parent 56fe5db commit f97e74f
Showing 4 changed files with 375 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -278,7 +278,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
)

# ##################################################################################################
88 changes: 88 additions & 0 deletions cpp/benchmarks/text/vocab.cpp
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/reduction.hpp>
#include <nvtext/tokenize.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_vocab_tokenize(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

auto const column = [num_rows, row_width] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
cudf::strings::string_character_types::ALL_TYPES,
cudf::string_scalar(" "),
cudf::strings::string_character_types::ALPHANUM);
}();
cudf::strings_column_view input(column->view());

auto const vocab_col = [] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
cudf::strings::string_character_types::ALL_TYPES,
cudf::string_scalar(""),
cudf::strings::string_character_types::ALPHANUM);
}();
auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view()));

auto token_count = [input] {
auto const counts = nvtext::count_tokens(input);
auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
auto const count = cudf::reduce(counts->view(), *agg, counts->type());
return static_cast<cudf::scalar_type_t<cudf::size_type>*>(count.get())
->value(cudf::get_default_stream());
}();

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(token_count);

auto const delimiter = cudf::string_scalar("");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter);
});
}

NVBENCH_BENCH(bench_vocab_tokenize)
.set_name("vocab_tokenize")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216});
247 changes: 232 additions & 15 deletions cpp/src/text/vocabulary_tokenize.cu
@@ -21,10 +21,12 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/hash_allocator.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/strings/string_view.cuh>
@@ -37,6 +39,15 @@

#include <cuco/static_map.cuh>

#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

#include <cub/cub.cuh>

namespace nvtext {
namespace detail {
namespace {
@@ -162,6 +173,119 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(cudf::strings_column_view c
namespace detail {
namespace {

/**
* @brief Threshold to decide on using string or warp parallel functions.
*
* If the average byte length of a string in a column exceeds this value then
* the warp-parallel function is used to compute the output sizes.
* Otherwise, a regular string-parallel function is used.
*
* This value was found using the vocab_tokenize benchmark results.
*/
constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128;

constexpr int block_size = 256;

__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr)
{
return d_delimiters.empty() ? (chr <= ' ') : // whitespace check
thrust::any_of(thrust::seq,
d_delimiters.begin(),
d_delimiters.end(),
[chr] __device__(cudf::char_utf8 c) { return c == chr; });
}

struct mark_delimiters_fn {
char const* d_chars;
cudf::string_view const d_delimiter;
int8_t* d_results;

__device__ void operator()(cudf::size_type idx) const
{
auto const ptr = d_chars + idx;
if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; }
cudf::char_utf8 chr = 0;
auto ch_size = cudf::strings::detail::to_char_utf8(ptr, chr);
auto const output = is_delimiter(d_delimiter, chr);
while (ch_size > 0) {
d_results[idx++] = output;
--ch_size;
}
}
};

__global__ void token_counts_fn(cudf::column_device_view const d_strings,
cudf::string_view const d_delimiter,
cudf::size_type* d_counts,
int8_t* d_results)
{
// string per warp
auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
if (idx >= (static_cast<std::size_t>(d_strings.size()) *
static_cast<std::size_t>(cudf::detail::warp_size))) {
return;
}
auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

if (d_strings.is_null(str_idx)) {
d_counts[str_idx] = 0;
return;
}
auto const d_str = d_strings.element<cudf::string_view>(str_idx);
if (d_str.empty()) {
d_counts[str_idx] = 0;
return;
}

auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const chars_begin =
d_strings.child(cudf::strings_column_view::chars_column_index).data<char>() +
offsets[d_strings.offset()];

auto const begin = d_str.data();
auto const end = begin + d_str.size_bytes();
auto const d_output = d_results + offset;
auto const d_output_end = d_output + d_str.size_bytes();

using warp_reduce = cub::WarpReduce<cudf::size_type>;
__shared__ typename warp_reduce::TempStorage warp_storage;

cudf::size_type count = 0;
if (lane_idx == 0) {
cudf::char_utf8 chr = 0;
auto ch_size = cudf::strings::detail::to_char_utf8(begin, chr);
auto output = 1;
if (begin > chars_begin) {
auto ptr = begin - 1;
while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) {
--ptr;
}
cudf::strings::detail::to_char_utf8(ptr, chr);
output = !is_delimiter(d_delimiter, chr);
}
auto ptr = d_output;
while (ch_size > 0) {
*ptr++ = output;
--ch_size;
}
count = ((begin + ch_size) == end);
}
__syncwarp();

for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
// add one if at the edge of a token or at the string's end
count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
}
__syncwarp();

// add up the counts from the other threads to compute the total token count for this string
auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum());
if (lane_idx == 0) { d_counts[str_idx] = total_count; }
}

/**
* @brief Tokenizes each string and uses the map to assign token id values
*
@@ -197,6 +321,33 @@ struct vocabulary_tokenizer_fn {
}
};

template <typename MapRefType>
struct transform_tokenizer_fn {
cudf::string_view const d_delimiter;
MapRefType d_map;
cudf::size_type const default_id;

__device__ cudf::size_type operator()(cudf::string_view d_str) const
{
auto const begin = d_str.data();
auto const end = begin + d_str.size_bytes();

auto itr = begin;
while (itr < end) {
cudf::char_utf8 chr = 0;
auto const ch_size = cudf::strings::detail::to_char_utf8(itr, chr);
if (!is_delimiter(d_delimiter, chr)) break;
itr += ch_size;
}

auto const size = static_cast<cudf::size_type>(thrust::distance(itr, end));
auto const token = cudf::string_view{itr, size};
// lookup token in map
auto const fitr = d_map.find(token);
return (fitr != d_map.end()) ? fitr->second : default_id;
}
};

} // namespace

std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view const& input,
@@ -209,28 +360,94 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");

auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
if (input.is_empty()) { return cudf::make_empty_column(output_type); }
if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); }

// count the tokens per string and build the offsets from the counts
auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
auto const d_delimiter = delimiter.value(stream);
auto const sizes_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter});
auto [token_offsets, total_count] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto map_ref = vocabulary._impl->get_map_ref();
auto const zero_itr = thrust::make_counting_iterator<cudf::size_type>(0);

if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
auto const sizes_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter});
auto [token_offsets, total_count] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);

// build the output column to hold all the token ids
auto tokens = cudf::make_numeric_column(
output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
return cudf::make_lists_column(input.size(),
std::move(token_offsets),
std::move(tokens),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
stream,
mr);
}

// longer strings perform better with warp-parallel approach

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars().size()
: cudf::detail::get_value<cudf::size_type>(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars().data<char>() + first_offset;

rmm::device_uvector<cudf::size_type> d_token_counts(input.size(), stream);
rmm::device_uvector<int8_t> d_marks(chars_size, stream);

// mark position of all delimiters
thrust::for_each_n(rmm::exec_policy(stream),
zero_itr,
chars_size,
mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()});

// launch warp per string to compute token counts
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
token_counts_fn<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, d_delimiter, d_token_counts.data(), d_marks.data());
auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column(
d_token_counts.begin(), d_token_counts.end(), stream, mr);

rmm::device_uvector<cudf::size_type> d_tmp_offsets(total_count + 1, stream);
d_tmp_offsets.set_element(total_count, chars_size, stream);
thrust::copy_if(rmm::exec_policy(stream),
zero_itr,
thrust::counting_iterator<cudf::size_type>(chars_size),
d_tmp_offsets.begin(),
[d_marks = d_marks.data()] __device__(auto idx) {
if (idx == 0) return true;
return d_marks[idx] && !d_marks[idx - 1];
});

auto tmp_offsets =
std::make_unique<cudf::column>(std::move(d_tmp_offsets), rmm::device_buffer{}, 0);
auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0);
auto const tmp_input = cudf::column_view(
input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars});

auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);

// build the output column to hold all the token ids
auto tokens =
cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto map_ref = vocabulary._impl->get_map_ref();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
input.size(),
tokenizer);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();

transform_tokenizer_fn<decltype(map_ref)> tokenizer{d_delimiter, map_ref, default_id};
thrust::transform(rmm::exec_policy(stream),
d_tmp_strings->begin<cudf::string_view>(),
d_tmp_strings->end<cudf::string_view>(),
d_tokens,
tokenizer);

return cudf::make_lists_column(input.size(),
std::move(token_offsets),
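
In short, the long-string path added above works in stages: delimiter positions are marked across the entire chars buffer, a warp per string counts its tokens from those marks, the token boundary positions become a temporary offsets/strings column over the same character data, and each resulting piece is looked up in the vocabulary map after skipping its leading delimiter characters. The host-side sketch below illustrates the same boundary-marking and lookup idea in plain C++; it is an illustration only, not a port of the CUDA kernels, and `tokenize_with_vocab` and its containers are hypothetical.

#include <cstddef>
#include <string_view>
#include <unordered_map>
#include <vector>

// Host-side illustration only: find token boundaries by marking delimiter bytes,
// then map each token to its vocabulary id (or a default id if it is unknown).
// This sketch handles ASCII whitespace only and is not a port of the CUDA kernels.
std::vector<int> tokenize_with_vocab(std::string_view str,
                                     std::unordered_map<std::string_view, int> const& vocab,
                                     int default_id = -1)
{
  auto const is_delim = [](unsigned char ch) { return ch <= ' '; };  // whitespace check

  // 1) mark each byte: 1 = delimiter, 0 = part of a token
  std::vector<char> marks(str.size());
  for (std::size_t i = 0; i < str.size(); ++i) { marks[i] = is_delim(str[i]); }

  // 2) a token starts at a non-delimiter byte whose predecessor is a delimiter
  //    (or at the start of the string); look each token up in the vocabulary
  std::vector<int> ids;
  for (std::size_t i = 0; i < str.size(); ++i) {
    if (marks[i] || (i > 0 && !marks[i - 1])) { continue; }  // not a token start
    auto end = i;
    while (end < str.size() && !marks[end]) { ++end; }
    auto const token = str.substr(i, end - i);
    auto const found = vocab.find(token);
    ids.push_back(found != vocab.end() ? found->second : default_id);
  }
  return ids;
}

For example, tokenizing "the quick brown fox" against the vocabulary {fox: 0, the: 1} with this sketch yields {1, -1, -1, 0}.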
(Fourth changed file not expanded in this view: the additional tests mentioned in the commit description.)
