Improve performance of nvtext::tokenize_with_vocabulary for long strings (#14336)

Improves `nvtext::tokenize_with_vocabulary` performance for long strings. Also adds more tests and an nvbench benchmark.
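
For reference, a minimal sketch of the API being tuned, mirroring the calls made in the new benchmark below. The example strings and the `vocab_tokenize_example` wrapper are illustrative and not part of this change:

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/tokenize.hpp>

#include <memory>

// Build a small vocabulary, then tokenize a strings column against it.
// Tokens not found in the vocabulary are assigned the default id.
std::unique_ptr<cudf::column> vocab_tokenize_example()
{
  cudf::test::strings_column_wrapper vocabulary({"fox", "jumps", "the"});
  cudf::test::strings_column_wrapper input(
    {"the quick brown fox", "jumps over the lazy dog"});

  auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

  // An empty delimiter means tokens are separated by whitespace.
  auto const delimiter = cudf::string_scalar("");

  // Returns a LIST column of INT32 token ids, one list per input row.
  return nvtext::tokenize_with_vocabulary(cudf::strings_column_view(input), *vocab, delimiter);
}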

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: #14336
davidwendt authored Nov 3, 2023
1 parent 56fe5db commit f97e74f
Showing 4 changed files with 375 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -278,7 +278,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
)

# ##################################################################################################
88 changes: 88 additions & 0 deletions cpp/benchmarks/text/vocab.cpp
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/reduction.hpp>
#include <nvtext/tokenize.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_vocab_tokenize(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

auto const column = [num_rows, row_width] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
cudf::strings::string_character_types::ALL_TYPES,
cudf::string_scalar(" "),
cudf::strings::string_character_types::ALPHANUM);
}();
cudf::strings_column_view input(column->view());

auto const vocab_col = [] {
data_profile const profile = data_profile_builder().no_validity().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
return cudf::strings::filter_characters_of_type(
cudf::strings_column_view(col->view()),
cudf::strings::string_character_types::ALL_TYPES,
cudf::string_scalar(""),
cudf::strings::string_character_types::ALPHANUM);
}();
auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view()));

auto token_count = [input] {
auto const counts = nvtext::count_tokens(input);
auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
auto const count = cudf::reduce(counts->view(), *agg, counts->type());
return static_cast<cudf::scalar_type_t<cudf::size_type>*>(count.get())
->value(cudf::get_default_stream());
}();

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(token_count);

auto const delimiter = cudf::string_scalar("");
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter);
});
}

NVBENCH_BENCH(bench_vocab_tokenize)
.set_name("vocab_tokenize")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216});
247 changes: 232 additions & 15 deletions cpp/src/text/vocabulary_tokenize.cu
@@ -21,10 +21,12 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/hash_allocator.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/strings/string_view.cuh>
@@ -37,6 +39,15 @@

#include <cuco/static_map.cuh>

#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

#include <cub/cub.cuh>

namespace nvtext {
namespace detail {
namespace {
@@ -162,6 +173,119 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(cudf::strings_column_view c
namespace detail {
namespace {

/**
* @brief Threshold to decide on using string or warp parallel functions.
*
* If the average byte length of a string in a column exceeds this value then
* the warp-parallel function is used to compute the output sizes.
* Otherwise, a regular string-parallel function is used.
*
* This value was found using the vocab_tokenize benchmark results.
*/
constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128;

constexpr int block_size = 256;

__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr)
{
return d_delimiters.empty() ? (chr <= ' ') : // whitespace check
thrust::any_of(thrust::seq,
d_delimiters.begin(),
d_delimiters.end(),
[chr] __device__(cudf::char_utf8 c) { return c == chr; });
}

struct mark_delimiters_fn {
char const* d_chars;
cudf::string_view const d_delimiter;
int8_t* d_results;

__device__ void operator()(cudf::size_type idx) const
{
auto const ptr = d_chars + idx;
if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; }
cudf::char_utf8 chr = 0;
auto ch_size = cudf::strings::detail::to_char_utf8(ptr, chr);
auto const output = is_delimiter(d_delimiter, chr);
while (ch_size > 0) {
d_results[idx++] = output;
--ch_size;
}
}
};

__global__ void token_counts_fn(cudf::column_device_view const d_strings,
cudf::string_view const d_delimiter,
cudf::size_type* d_counts,
int8_t* d_results)
{
// string per warp
auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
if (idx >= (static_cast<std::size_t>(d_strings.size()) *
static_cast<std::size_t>(cudf::detail::warp_size))) {
return;
}
auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

if (d_strings.is_null(str_idx)) {
d_counts[str_idx] = 0;
return;
}
auto const d_str = d_strings.element<cudf::string_view>(str_idx);
if (d_str.empty()) {
d_counts[str_idx] = 0;
return;
}

auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const chars_begin =
d_strings.child(cudf::strings_column_view::chars_column_index).data<char>() +
offsets[d_strings.offset()];

auto const begin = d_str.data();
auto const end = begin + d_str.size_bytes();
auto const d_output = d_results + offset;
auto const d_output_end = d_output + d_str.size_bytes();

using warp_reduce = cub::WarpReduce<cudf::size_type>;
__shared__ typename warp_reduce::TempStorage warp_storage;

cudf::size_type count = 0;
if (lane_idx == 0) {
cudf::char_utf8 chr = 0;
auto ch_size = cudf::strings::detail::to_char_utf8(begin, chr);
auto output = 1;
if (begin > chars_begin) {
auto ptr = begin - 1;
while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) {
--ptr;
}
cudf::strings::detail::to_char_utf8(ptr, chr);
output = !is_delimiter(d_delimiter, chr);
}
auto ptr = d_output;
while (ch_size > 0) {
*ptr++ = output;
--ch_size;
}
count = ((begin + ch_size) == end);
}
__syncwarp();

for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
// add one if at the edge of a token or at the string's end
count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
}
__syncwarp();

// add up the counts from the other threads to compute the total token count for this string
auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum());
if (lane_idx == 0) { d_counts[str_idx] = total_count; }
}

/**
* @brief Tokenizes each string and uses the map to assign token id values
*
@@ -197,6 +321,33 @@ struct vocabulary_tokenizer_fn {
}
};

template <typename MapRefType>
struct transform_tokenizer_fn {
cudf::string_view const d_delimiter;
MapRefType d_map;
cudf::size_type const default_id;

__device__ cudf::size_type operator()(cudf::string_view d_str) const
{
auto const begin = d_str.data();
auto const end = begin + d_str.size_bytes();

auto itr = begin;
while (itr < end) {
cudf::char_utf8 chr = 0;
auto const ch_size = cudf::strings::detail::to_char_utf8(itr, chr);
if (!is_delimiter(d_delimiter, chr)) break;
itr += ch_size;
}

auto const size = static_cast<cudf::size_type>(thrust::distance(itr, end));
auto const token = cudf::string_view{itr, size};
// lookup token in map
auto const fitr = d_map.find(token);
return (fitr != d_map.end()) ? fitr->second : default_id;
}
};

} // namespace

std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view const& input,
@@ -209,28 +360,94 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");

auto const output_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
if (input.is_empty()) { return cudf::make_empty_column(output_type); }
if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); }

// count the tokens per string and build the offsets from the counts
auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
auto const d_delimiter = delimiter.value(stream);
auto const sizes_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter});
auto [token_offsets, total_count] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
auto map_ref = vocabulary._impl->get_map_ref();
auto const zero_itr = thrust::make_counting_iterator<cudf::size_type>(0);

if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
auto const sizes_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter});
auto [token_offsets, total_count] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);

// build the output column to hold all the token ids
auto tokens = cudf::make_numeric_column(
output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
return cudf::make_lists_column(input.size(),
std::move(token_offsets),
std::move(tokens),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
stream,
mr);
}

// longer strings perform better with warp-parallel approach

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars().size()
: cudf::detail::get_value<cudf::size_type>(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars().data<char>() + first_offset;

rmm::device_uvector<cudf::size_type> d_token_counts(input.size(), stream);
rmm::device_uvector<int8_t> d_marks(chars_size, stream);

// mark position of all delimiters
thrust::for_each_n(rmm::exec_policy(stream),
zero_itr,
chars_size,
mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()});

// launch warp per string to compute token counts
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
token_counts_fn<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, d_delimiter, d_token_counts.data(), d_marks.data());
auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column(
d_token_counts.begin(), d_token_counts.end(), stream, mr);

rmm::device_uvector<cudf::size_type> d_tmp_offsets(total_count + 1, stream);
d_tmp_offsets.set_element(total_count, chars_size, stream);
thrust::copy_if(rmm::exec_policy(stream),
zero_itr,
thrust::counting_iterator<cudf::size_type>(chars_size),
d_tmp_offsets.begin(),
[d_marks = d_marks.data()] __device__(auto idx) {
if (idx == 0) return true;
return d_marks[idx] && !d_marks[idx - 1];
});

auto tmp_offsets =
std::make_unique<cudf::column>(std::move(d_tmp_offsets), rmm::device_buffer{}, 0);
auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0);
auto const tmp_input = cudf::column_view(
input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars});

auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);

// build the output column to hold all the token ids
auto tokens =
cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto map_ref = vocabulary._impl->get_map_ref();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
input.size(),
tokenizer);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();

transform_tokenizer_fn<decltype(map_ref)> tokenizer{d_delimiter, map_ref, default_id};
thrust::transform(rmm::exec_policy(stream),
d_tmp_strings->begin<cudf::string_view>(),
d_tmp_strings->end<cudf::string_view>(),
d_tokens,
tokenizer);

return cudf::make_lists_column(input.size(),
std::move(token_offsets),
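
In short, the long-string path added above works in stages: delimiter positions are marked across the entire chars buffer, a warp per string counts its tokens from those marks, the token boundary positions become a temporary offsets/strings column over the same character data, and each resulting piece is looked up in the vocabulary map after skipping its leading delimiter characters. The host-side sketch below illustrates the same boundary-marking and lookup idea in plain C++; it is an illustration only, not a port of the CUDA kernels, and `tokenize_with_vocab` and its containers are hypothetical.

#include <cstddef>
#include <string_view>
#include <unordered_map>
#include <vector>

// Host-side illustration only: find token boundaries by marking delimiter bytes,
// then map each token to its vocabulary id (or a default id if it is unknown).
// This sketch handles ASCII whitespace only and is not a port of the CUDA kernels.
std::vector<int> tokenize_with_vocab(std::string_view str,
                                     std::unordered_map<std::string_view, int> const& vocab,
                                     int default_id = -1)
{
  auto const is_delim = [](unsigned char ch) { return ch <= ' '; };  // whitespace check

  // 1) mark each byte: 1 = delimiter, 0 = part of a token
  std::vector<char> marks(str.size());
  for (std::size_t i = 0; i < str.size(); ++i) { marks[i] = is_delim(str[i]); }

  // 2) a token starts at a non-delimiter byte whose predecessor is a delimiter
  //    (or at the start of the string); look each token up in the vocabulary
  std::vector<int> ids;
  for (std::size_t i = 0; i < str.size(); ++i) {
    if (marks[i] || (i > 0 && !marks[i - 1])) { continue; }  // not a token start
    auto end = i;
    while (end < str.size() && !marks[end]) { ++end; }
    auto const token = str.substr(i, end - i);
    auto const found = vocab.find(token);
    ids.push_back(found != vocab.end() ? found->second : default_id);
  }
  return ids;
}

For example, tokenizing "the quick brown fox" against the vocabulary {fox: 0, the: 1} with this sketch yields {1, -1, -1, 0}.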
(Fourth changed file not expanded in this view: the additional tests mentioned in the commit description.)
