Update strings/text source to use grid_1d for thread/block/stride calculations (#17404)

Replaces `threadIdx.x + blockDim.x * blockIdx.x` logic with `grid_1d::global_thread_id()` and `blockDim.x * gridDim.x` with `grid_1d::grid_stride()` in libcudf strings and text source.

Reference #10368
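For context, a minimal sketch of the grid-stride pattern these helpers enable (illustrative only, not code from this commit; the `<cudf/detail/utilities/cuda.cuh>` include path and the example kernel are assumptions):

```cpp
// Hypothetical example kernel: doubles each element of an input array,
// using the cudf grid_1d helpers in place of raw threadIdx/blockIdx math.
#include <cudf/detail/utilities/cuda.cuh>  // assumed location of grid_1d
#include <cudf/types.hpp>                  // cudf::size_type

__global__ void double_elements(int const* input, int* output, cudf::size_type size)
{
  // Replaces: threadIdx.x + blockDim.x * blockIdx.x
  auto const tid = cudf::detail::grid_1d::global_thread_id();
  // Replaces: blockDim.x * gridDim.x
  auto const stride = cudf::detail::grid_1d::grid_stride();

  // Grid-stride loop: each thread handles elements tid, tid + stride, ...
  for (auto idx = tid; idx < size; idx += stride) {
    output[idx] = input[idx] * 2;
  }
}
```

Besides being shorter, the helpers return a `cudf::thread_index_type` (a 64-bit index type), which avoids the overflow risk of computing the global index in 32-bit arithmetic on very large grids.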

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: #17404
davidwendt authored Nov 26, 2024
1 parent 79a9860 commit d10eae7
Showing 10 changed files with 38 additions and 56 deletions.
10 changes: 5 additions & 5 deletions cpp/include/cudf/strings/detail/gather.cuh
@@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin,
   constexpr size_t out_datatype_size = sizeof(uint4);
   constexpr size_t in_datatype_size  = sizeof(uint);

-  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
-  int warp_lane        = global_thread_id % cudf::detail::warp_size;
-  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
+  auto const global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  auto const warp_lane        = global_thread_id % cudf::detail::warp_size;
+  auto const nwarps = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size;

   auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
   uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);

-  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+  for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
     auto const out_start = out_offsets[istring];
     auto const out_end   = out_offsets[istring + 1];

6 changes: 4 additions & 2 deletions cpp/src/strings/convert/convert_urls.cu
@@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];

   // Loop through strings, and assign each string to a warp.
@@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];

   // Loop through strings, and assign each string to a warp

10 changes: 5 additions & 5 deletions cpp/src/strings/copying/concatenate.cu
@@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
   bitmask_type* output_mask,
   size_type* out_valid_count)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
-  size_type warp_valid_count           = 0;
+  auto output_index          = cudf::detail::grid_1d::global_thread_id();
+  size_type warp_valid_count = 0;

   unsigned active_mask;
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
@@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
       warp_valid_count += __popc(new_word);
     }

-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
     if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); }
   }

@@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
   size_type const output_size,
   char* output_data)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
+  auto output_index = cudf::detail::grid_1d::global_thread_id();

   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
@@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
     auto const first_char     = input_offsets_data[input_view.offset()];
     output_data[output_index] = input_chars_data[offset_index + first_char];

-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
   }
 }

4 changes: 2 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
@@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);

-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {
@@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);

-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {

16 changes: 7 additions & 9 deletions cpp/src/strings/search/find.cu
@@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
   size_type const stop,
   size_type* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-
-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx = idx / cudf::detail::warp_size;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;

   if (d_strings.is_null(str_idx)) { return; }
@@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
   string_view const d_target,
   bool* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  using warp_reduce   = cub::WarpReduce<bool>;
+  auto const idx    = cudf::detail::grid_1d::global_thread_id();
+  using warp_reduce = cub::WarpReduce<bool>;
   __shared__ typename warp_reduce::TempStorage temp_storage;

-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx = idx / cudf::detail::warp_size;
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp

9 changes: 3 additions & 6 deletions cpp/src/text/minhash.cu
@@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   cudf::size_type width,
   hash_value_type* d_hashes)
 {
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
+  auto const idx = cudf::detail::grid_1d::global_thread_id();

-  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

   if (d_strings.is_null(str_idx)) { return; }

5 changes: 2 additions & 3 deletions cpp/src/text/subword/data_normalizer.cu
@@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings,
   constexpr uint32_t init_val                     = (1 << FILTER_BIT);
   uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

-  cudf::thread_index_type const char_for_thread =
-    threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
-  uint32_t num_new_chars = 0;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();
+  uint32_t num_new_chars     = 0;

   if (char_for_thread < total_bytes) {
     auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);

11 changes: 3 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata(
   uint32_t* attn_mask,
   uint32_t* metadata)
 {
-  cudf::thread_index_type const output_idx =
-    threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
-                    static_cast<cudf::thread_index_type>(blockDim.x);
-  if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
-                     static_cast<cudf::thread_index_type>(max_sequence_length))) {
-    return;
-  }
+  auto const output_idx = cudf::detail::grid_1d::global_thread_id();

-  uint32_t const absolute_row_id = output_idx / max_sequence_length;
+  uint32_t const absolute_row_id = output_idx / max_sequence_length;
+  if (absolute_row_id >= nrows_tensor_token_ids) { return; }
   uint32_t const tensor_id                = row2tensor[absolute_row_id];
   uint32_t const row_within_tensor        = row2row_within_tensor[absolute_row_id];
   uint32_t const offset_token_ids_tensor  = offsets[tensor_id];

14 changes: 4 additions & 10 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -83,9 +83,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points,
   uint32_t* token_ids,
   uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                              static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                            threadIdx.x;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();

   // Deal with the start_word_indices array
   if (char_for_thread < num_code_points) {
@@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points,
   uint32_t* end_word_indices,
   uint32_t num_strings)
 {
-  cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                  static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                threadIdx.x;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
   // Ensure the starting character of each strings is written to the word start array.
   if (idx <= num_strings) {
     auto const offset = strings_offsets[idx];
@@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
   uint32_t* token_ids,
   uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                               static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                             threadIdx.x;
+  auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id();

-  if (word_to_tokenize >= total_words) return;
+  if (word_to_tokenize >= total_words) { return; }
   // Each thread gets the start code_point offset for each word and resets the token_id memory to
   // the default value. In a post processing step, all of these values will be removed.
   auto const token_start = word_starts[word_to_tokenize];

9 changes: 3 additions & 6 deletions cpp/src/text/vocabulary_tokenize.cu
@@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
   int8_t* d_results)
 {
   // string per warp
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
-  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

   if (d_strings.is_null(str_idx)) {
