Skip to content

Commit

Permalink
Use experimental make_strings_children in nvtext APIs (#15595)
Browse files Browse the repository at this point in the history
Updates the nvtext replace, ngram, normalize, and detokenize functions to call the new experimental `make_strings_children`, which supports building large strings, in place of the existing `make_strings_children`.

Reference #15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: #15595
  • Loading branch information
davidwendt authored May 1, 2024
1 parent 4aabf51 commit fe4b92c
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 35 deletions.
19 changes: 10 additions & 9 deletions cpp/src/text/detokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sorting.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -48,12 +48,13 @@ namespace {
* the same row. The `d_separator` is appended between each token.
*/
struct detokenizer_fn {
cudf::column_device_view const d_strings; // these are the tokens
cudf::size_type const* d_row_map; // indices sorted by output row
cudf::size_type const* d_token_offsets; // to each input token array
cudf::string_view const d_separator; // append after each token
cudf::size_type* d_offsets{}; // offsets to output buffer d_chars
char* d_chars{}; // output buffer for characters
cudf::column_device_view const d_strings; // these are the tokens
cudf::size_type const* d_row_map; // indices sorted by output row
cudf::size_type const* d_token_offsets; // to each input token array
cudf::string_view const d_separator; // append after each token
cudf::size_type* d_sizes{}; // output sizes
char* d_chars{}; // output buffer for characters
cudf::detail::input_offsetalator d_offsets; // for addressing output row data in d_chars

__device__ void operator()(cudf::size_type idx)
{
Expand All @@ -75,7 +76,7 @@ struct detokenizer_fn {
nbytes += d_separator.size_bytes();
}
}
if (!d_chars) { d_offsets[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
}
};

Expand Down Expand Up @@ -157,7 +158,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string

cudf::string_view const d_separator(separator.data(), separator.size());

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator},
output_count,
stream,
Expand Down
20 changes: 11 additions & 9 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -57,8 +57,9 @@ struct ngram_generator_fn {
cudf::column_device_view const d_strings;
cudf::size_type ngrams;
cudf::string_view const d_separator;
cudf::size_type* d_offsets{};
cudf::size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Build ngram for each string.
Expand All @@ -81,7 +82,7 @@ struct ngram_generator_fn {
bytes += d_separator.size_bytes();
if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
}
if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -141,7 +142,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
// compute the number of strings of ngrams
auto const ngrams_count = strings_count - ngrams + 1;

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr);

// make the output strings column from the offsets and chars column
Expand Down Expand Up @@ -175,8 +176,9 @@ struct character_ngram_generator_fn {
cudf::column_device_view const d_strings;
cudf::size_type ngrams;
cudf::size_type const* d_ngram_offsets{};
cudf::size_type* d_offsets{};
cudf::size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(cudf::size_type idx)
{
Expand All @@ -186,16 +188,16 @@ struct character_ngram_generator_fn {
auto itr = d_str.begin();
auto const ngram_offset = d_ngram_offsets[idx];
auto const ngram_count = d_ngram_offsets[idx + 1] - ngram_offset;
auto d_sizes = d_offsets + ngram_offset;
auto out_ptr = d_chars ? d_chars + *d_sizes : nullptr;
auto d_output_sizes = d_sizes + ngram_offset;
auto out_ptr = d_chars ? d_chars + d_offsets[ngram_offset] : nullptr;
for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
auto const begin = itr.byte_offset();
auto const end = (itr + ngrams).byte_offset();
if (d_chars) {
out_ptr =
cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
} else {
*d_sizes++ = end - begin;
*d_output_sizes++ = end - begin;
}
}
}
Expand Down Expand Up @@ -233,7 +235,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
"Insufficient number of characters in each string to generate ngrams");

character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
generator, strings_count, total_ngrams, stream, mr);

auto output = cudf::make_strings_column(
Expand Down
20 changes: 11 additions & 9 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -59,13 +59,14 @@ namespace {
*/
struct normalize_spaces_fn {
cudf::column_device_view const d_strings; // strings to normalize
cudf::size_type* d_offsets{}; // offsets into d_chars
cudf::size_type* d_sizes{}; // size of each output row
char* d_chars{}; // output buffer for characters
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(cudf::size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
cudf::string_view const single_space(" ", 1);
Expand Down Expand Up @@ -93,7 +94,7 @@ struct normalize_spaces_fn {
nbytes += token.size_bytes() + 1; // token size plus a single space
}
// remove trailing space
if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? nbytes - 1 : 0; }
}
};

Expand All @@ -109,8 +110,9 @@ struct codepoint_to_utf8_fn {
cudf::column_device_view const d_strings; // input strings
uint32_t const* cp_data; // full code-point array
int64_t const* d_cp_offsets{}; // offsets to each string's code-point array
cudf::size_type* d_offsets{}; // offsets for the output strings
cudf::size_type* d_sizes{}; // size of output string
char* d_chars{}; // buffer for the output strings column
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Return the number of bytes for the output string given its code-point array.
Expand All @@ -133,14 +135,14 @@ struct codepoint_to_utf8_fn {
__device__ void operator()(cudf::size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const offset = d_cp_offsets[idx];
auto const count = d_cp_offsets[idx + 1] - offset; // number of code-points
auto str_cps = cp_data + offset; // code-points for this string
if (!d_chars) {
d_offsets[idx] = compute_output_size(str_cps, count);
d_sizes[idx] = compute_output_size(str_cps, count);
return;
}
// convert each code-point to 1-4 UTF-8 encoded bytes
Expand Down Expand Up @@ -183,7 +185,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
auto d_strings = cudf::column_device_view::create(strings.parent(), stream);

// build offsets and children using the normalize_space_fn
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
normalize_spaces_fn{*d_strings}, strings.size(), stream, mr);

return cudf::make_strings_column(strings.size(),
Expand Down Expand Up @@ -225,7 +227,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
auto d_strings = cudf::column_device_view::create(strings.parent(), stream);

// build offsets and children using the codepoint_to_utf8_fn
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr);

return cudf::make_strings_column(strings.size(),
Expand Down
18 changes: 10 additions & 8 deletions cpp/src/text/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand All @@ -48,8 +48,9 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
struct base_token_replacer_fn {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing
cudf::size_type* d_offsets{}; ///< for locating output string in d_chars
cudf::size_type* d_sizes{}; ///< for output string size
char* d_chars{}; ///< output buffer
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Tokenizes each string and calls the provided `replacer` function
Expand All @@ -63,7 +64,7 @@ struct base_token_replacer_fn {
__device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand Down Expand Up @@ -95,10 +96,11 @@ struct base_token_replacer_fn {
}

// copy the remainder of the string's bytes to the output buffer
if (out_ptr)
if (out_ptr) {
memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
else
d_offsets[idx] = nbytes;
} else {
d_sizes[idx] = nbytes;
}
}
};

Expand Down Expand Up @@ -230,7 +232,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st

// this utility calls replacer to build the offsets and chars columns
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr);
cudf::strings::detail::experimental::make_strings_children(replacer, strings_count, stream, mr);

// return new strings column
return cudf::make_strings_column(strings_count,
Expand Down Expand Up @@ -263,7 +265,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str

// this utility calls filterer to build the offsets and chars columns
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);

// return new strings column
return cudf::make_strings_column(strings_count,
Expand Down

0 comments on commit fe4b92c

Please sign in to comment.