Update strings/text source to use grid_1d for thread/block/stride calculations (#17404)

Replaces `threadIdx.x + blockDim.x * blockIdx.x` logic with `grid_1d::global_thread_id()` and `blockDim.x * gridDim.x` with `grid_1d::grid_stride()` in libcudf strings and text source.

Reference #10368
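For context, a minimal sketch of the grid-stride pattern these helpers enable (illustrative only, not code from this commit; the `<cudf/detail/utilities/cuda.cuh>` include path and the example kernel are assumptions):

```cpp
// Hypothetical example kernel: doubles each element of an input array,
// using the cudf grid_1d helpers in place of raw threadIdx/blockIdx math.
#include <cudf/detail/utilities/cuda.cuh>  // assumed location of grid_1d
#include <cudf/types.hpp>                  // cudf::size_type

__global__ void double_elements(int const* input, int* output, cudf::size_type size)
{
  // Replaces: threadIdx.x + blockDim.x * blockIdx.x
  auto const tid = cudf::detail::grid_1d::global_thread_id();
  // Replaces: blockDim.x * gridDim.x
  auto const stride = cudf::detail::grid_1d::grid_stride();

  // Grid-stride loop: each thread handles elements tid, tid + stride, ...
  for (auto idx = tid; idx < size; idx += stride) {
    output[idx] = input[idx] * 2;
  }
}
```

Besides being shorter, the helpers return a `cudf::thread_index_type` (a 64-bit index type), which avoids the overflow risk of computing the global index in 32-bit arithmetic on very large grids.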

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: #17404
davidwendt authored Nov 26, 2024
1 parent 79a9860 commit d10eae7
Showing 10 changed files with 38 additions and 56 deletions.
10 changes: 5 additions & 5 deletions cpp/include/cudf/strings/detail/gather.cuh
@@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin,
   constexpr size_t out_datatype_size = sizeof(uint4);
   constexpr size_t in_datatype_size  = sizeof(uint);

-  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
-  int warp_lane        = global_thread_id % cudf::detail::warp_size;
-  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
+  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
+  auto const global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  auto const warp_lane        = global_thread_id % cudf::detail::warp_size;
+  auto const nwarps = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size;

   auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
   uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);

-  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+  for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
     auto const out_start = out_offsets[istring];
     auto const out_end   = out_offsets[istring + 1];

6 changes: 4 additions & 2 deletions cpp/src/strings/convert/convert_urls.cu
@@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];

   // Loop through strings, and assign each string to a warp.
@@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
   auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
   auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
-  auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
+  auto const nwarps =
+    static_cast<size_type>(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];

   // Loop through strings, and assign each string to a warp

10 changes: 5 additions & 5 deletions cpp/src/strings/copying/concatenate.cu
@@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
   bitmask_type* output_mask,
   size_type* out_valid_count)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
-  size_type warp_valid_count           = 0;
+  auto output_index          = cudf::detail::grid_1d::global_thread_id();
+  size_type warp_valid_count = 0;

   unsigned active_mask;
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
@@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel(
       warp_valid_count += __popc(new_word);
     }

-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
     if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); }
   }

@@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
   size_type const output_size,
   char* output_data)
 {
-  cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x;
+  auto output_index = cudf::detail::grid_1d::global_thread_id();

   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
@@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
     auto const first_char     = input_offsets_data[input_view.offset()];
     output_data[output_index] = input_chars_data[offset_index + first_char];

-    output_index += blockDim.x * gridDim.x;
+    output_index += cudf::detail::grid_1d::grid_stride();
   }
 }

4 changes: 2 additions & 2 deletions cpp/src/strings/regex/utilities.cuh
@@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);

-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {
@@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn,
   __syncthreads();
   auto const s_prog = reprog_device::load(d_prog, shmem);

-  auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const thread_idx = cudf::detail::grid_1d::global_thread_id();
   auto const stride = s_prog.thread_count();
   if (thread_idx < stride) {
     for (auto idx = thread_idx; idx < size; idx += stride) {

16 changes: 7 additions & 9 deletions cpp/src/strings/search/find.cu
@@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
   size_type const stop,
   size_type* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-
-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx = idx / cudf::detail::warp_size;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;

   if (d_strings.is_null(str_idx)) { return; }
@@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
   string_view const d_target,
   bool* d_results)
 {
-  size_type const idx = static_cast<size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  using warp_reduce   = cub::WarpReduce<bool>;
+  auto const idx    = cudf::detail::grid_1d::global_thread_id();
+  using warp_reduce = cub::WarpReduce<bool>;
   __shared__ typename warp_reduce::TempStorage temp_storage;

-  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
-
-  auto const str_idx = idx / cudf::detail::warp_size;
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = idx % cudf::detail::warp_size;
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp

9 changes: 3 additions & 6 deletions cpp/src/text/minhash.cu
@@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   cudf::size_type width,
   hash_value_type* d_hashes)
 {
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
+  auto const idx = cudf::detail::grid_1d::global_thread_id();

-  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

   if (d_strings.is_null(str_idx)) { return; }

5 changes: 2 additions & 3 deletions cpp/src/text/subword/data_normalizer.cu
@@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings,
   constexpr uint32_t init_val                     = (1 << FILTER_BIT);
   uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val};

-  cudf::thread_index_type const char_for_thread =
-    threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x);
-  uint32_t num_new_chars = 0;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();
+  uint32_t num_new_chars     = 0;

   if (char_for_thread < total_bytes) {
     auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread);

11 changes: 3 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata(
   uint32_t* attn_mask,
   uint32_t* metadata)
 {
-  cudf::thread_index_type const output_idx =
-    threadIdx.x + static_cast<cudf::thread_index_type>(blockIdx.x) *
-                    static_cast<cudf::thread_index_type>(blockDim.x);
-  if (output_idx >= (static_cast<cudf::thread_index_type>(nrows_tensor_token_ids) *
-                     static_cast<cudf::thread_index_type>(max_sequence_length))) {
-    return;
-  }
+  auto const output_idx = cudf::detail::grid_1d::global_thread_id();

-  uint32_t const absolute_row_id = output_idx / max_sequence_length;
+  uint32_t const absolute_row_id = output_idx / max_sequence_length;
+  if (absolute_row_id >= nrows_tensor_token_ids) { return; }
   uint32_t const tensor_id                = row2tensor[absolute_row_id];
   uint32_t const row_within_tensor        = row2row_within_tensor[absolute_row_id];
   uint32_t const offset_token_ids_tensor  = offsets[tensor_id];

14 changes: 4 additions & 10 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -83,9 +83,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points,
   uint32_t* token_ids,
   uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type char_for_thread = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                              static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                            threadIdx.x;
+  auto const char_for_thread = cudf::detail::grid_1d::global_thread_id();

   // Deal with the start_word_indices array
   if (char_for_thread < num_code_points) {
@@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points,
   uint32_t* end_word_indices,
   uint32_t num_strings)
 {
-  cudf::thread_index_type idx = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                  static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                threadIdx.x;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
   // Ensure the starting character of each strings is written to the word start array.
   if (idx <= num_strings) {
     auto const offset = strings_offsets[idx];
@@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
   uint32_t* token_ids,
   uint8_t* tokens_per_word)
 {
-  cudf::thread_index_type word_to_tokenize = static_cast<cudf::thread_index_type>(blockDim.x) *
-                                               static_cast<cudf::thread_index_type>(blockIdx.x) +
-                                             threadIdx.x;
+  auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id();

-  if (word_to_tokenize >= total_words) return;
+  if (word_to_tokenize >= total_words) { return; }
   // Each thread gets the start code_point offset for each word and resets the token_id memory to
   // the default value. In a post processing step, all of these values will be removed.
   auto const token_start = word_starts[word_to_tokenize];

9 changes: 3 additions & 6 deletions cpp/src/text/vocabulary_tokenize.cu
@@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
   int8_t* d_results)
 {
   // string per warp
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
-  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (str_idx >= d_strings.size()) { return; }
   auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);

   if (d_strings.is_null(str_idx)) {
