From 22a96df3a85acbb76f1564dee86ce452c1ee1960 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Tue, 15 Mar 2022 15:22:43 -0500
Subject: [PATCH 01/10] Grid stride loop index overflow fix.

---
 cpp/include/cudf/detail/valid_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index ee9e4b2c687..28e5ad9a8ad 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,7 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  size_type i = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);

From 797c723a9dcc5834eb2ea28fa86d29fa480d8675 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 11:25:39 -0500
Subject: [PATCH 02/10] Fix 2 size_type overflows in scalar cudf::scatter()

---
 cpp/src/copying/scatter.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 9a364451b3b..4eba43dca19 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  size_type row = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t row = threadIdx.x + blockIdx.x * blockDim.x;
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
@@ -351,8 +351,10 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>
 
   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
+  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in` > (2^31)/2,
+  // but the end result after the final (% n_rows) will fit. so we'll do the computation using a signed 64 bit value.
   auto scatter_iter = thrust::make_transform_iterator(
-    map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
+    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type { return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows; });
 
   // Dispatch over data type per column
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
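Why the comment in PATCH 02 talks about (2^31)/2: `in % n_rows` can be as large as n_rows - 1, so the intermediate `(in % n_rows) + n_rows` approaches 2 * n_rows and exceeds INT32_MAX once n_rows passes 2^30. A standalone host-side sketch of the failure and of the 64-bit fix (the numbers are hypothetical; this is an illustration, not code from the series):

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    int32_t const n_rows = (1 << 30) + 1;  // past the 2^30 threshold named in the comment
    int32_t const in     = 1 << 30;        // a legal positive index

    // 32-bit: (in % n_rows) + n_rows == 2^31 + 1, past INT32_MAX -> signed
    // overflow, undefined behavior, even though the final % n_rows would fit.
    // int32_t bad = ((in % n_rows) + n_rows) % n_rows;

    // 64-bit, as the patch does: every intermediate fits comfortably.
    int64_t const n = static_cast<int64_t>(n_rows);
    auto const wrapped = static_cast<int32_t>(((static_cast<int64_t>(in) % n) + n) % n);

    printf("wrapped = %d\n", wrapped);  // positive indices map to themselves: 2^30
    return 0;
  }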
From 34cf254dfa437684c9c1801ebf77afb62d5d2029 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 16:38:10 -0500
Subject: [PATCH 03/10] Fix grid stride overflow in gpu_rolling.

---
 cpp/src/rolling/rolling_detail.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index a121e247258..36952be31a6 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,7 +1008,7 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  size_type i      = blockIdx.x * block_size + threadIdx.x;
+  std::size_t i = blockIdx.x * block_size + threadIdx.x;
   size_type stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
@@ -1021,9 +1021,9 @@ __launch_bounds__(block_size) __global__
 
     // compute bounds
     auto start = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
+      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
     auto end = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
+      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);

From 67aa4e3295524ed791ad673d2012bf01eb557fcb Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 17:40:21 -0500
Subject: [PATCH 04/10] Fix grid stride overflow issues in replace_nulls() and
 replace_nulls_strings() kernels.

---
 cpp/src/replace/nulls.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index d41bdb6ca5a..4ee259da608 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -63,7 +63,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i     = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -115,7 +115,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i     = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -247,6 +247,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<
   std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
     sizes_view.begin<cudf::size_type>(), sizes_view.end<cudf::size_type>(), stream, mr);
+
   auto offsets_view = offsets->mutable_view();
 
   auto const bytes =
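Patches 01, 03, and 04 are all instances of the same grid-stride-loop bug: the loop index is declared as a 32-bit `size_type`, so on inputs approaching 2^31 rows the index arithmetic overflows. A minimal sketch of the pattern and the fix (hypothetical kernel, not from the series):

  #include <cstddef>

  __global__ void copy_kernel(int const* in, int* out, std::size_t size)
  {
    // widen before multiplying: blockIdx.x * blockDim.x is otherwise computed
    // in 32-bit unsigned arithmetic and can wrap on very large grids
    std::size_t i            = threadIdx.x + blockIdx.x * static_cast<std::size_t>(blockDim.x);
    std::size_t const stride = static_cast<std::size_t>(blockDim.x) * gridDim.x;
    while (i < size) {
      out[i] = in[i];
      i += stride;
    }
  }

With a 32-bit index, a thread near the end of a ~2^31-row column steps past INT32_MAX on its final `i += stride`, which is signed overflow before the loop condition ever gets a chance to reject it.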
From e5d12b41cc0d16fe2dff205ed86ba5864602e9cf Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 18:22:07 -0500
Subject: [PATCH 05/10] Fix grid stride overflow issue in compute_column_kernel.

---
 cpp/src/transform/compute_column.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index bf109dbe1e5..70a61036392 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -72,7 +72,7 @@ __launch_bounds__(max_block_size) __global__
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);
 
-  for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
+  for (std::size_t row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
     evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }

From 11b4c611f4d18d152060a2336c907332cfee8649 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 19:38:54 -0500
Subject: [PATCH 06/10] Fix grid stride overflow issue in gpu_rolling_new.

---
 cpp/src/rolling/jit/kernel.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 52e397b9351..b5dc8359a4b 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::size_type i      = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i          = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::size_type stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
@@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start       = min(nrows, max(0, i - preceding_window + 1));
-    cudf::size_type end         = min(nrows, max(0, i + following_window + 1));
+    cudf::size_type start       = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
+    cudf::size_type end         = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
     cudf::size_type start_index = min(start, end);
     cudf::size_type end_index   = max(start, end);
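A subtlety in patches 03 and 06: once `i` becomes unsigned (`std::size_t`), an expression like `i - preceding_window + 1` is computed in unsigned arithmetic and wraps instead of going negative, which would defeat the `max(0, ...)` clamp. That is what the `static_cast`s back to a signed type are for. A host-side sketch with hypothetical numbers:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main()
  {
    std::size_t const i     = 0;    // first row
    int64_t const preceding = 5;    // window reaches 4 rows before i
    int64_t const nrows     = 100;

    // BAD: with unsigned i the subtraction wraps to ~2^64, so the clamp never fires:
    // auto bad = std::max<std::size_t>(0, i - preceding + 1);

    // As in the patches: pull i back into signed 64-bit first, then clamp.
    int64_t const start =
      std::min(nrows, std::max(int64_t{0}, static_cast<int64_t>(i) - preceding + 1));
    printf("start = %lld\n", static_cast<long long>(start));  // 0, as intended
    return 0;
  }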
From 75f01bd8beaa88ef3df399b7e2e67e0b7d4976f8 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 19:44:34 -0500
Subject: [PATCH 07/10] Code formatting.

---
 cpp/src/copying/scatter.cu         |  9 ++++++---
 cpp/src/replace/nulls.cu           |  4 ++--
 cpp/src/rolling/jit/kernel.cu      |  4 ++--
 cpp/src/rolling/rolling_detail.cuh | 12 +++++++-----
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 4eba43dca19..45e338f904e 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -351,10 +351,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>
 
   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
-  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in` > (2^31)/2,
-  // but the end result after the final (% n_rows) will fit. so we'll do the computation using a signed 64 bit value.
+  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
+  // > (2^31)/2, but the end result after the final (% n_rows) will fit. so we'll do the computation
+  // using a signed 64 bit value.
   auto scatter_iter = thrust::make_transform_iterator(
-    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type { return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows; });
+    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
+      return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
+    });
 
   // Dispatch over data type per column
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 4ee259da608..b9c5a8a319c 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -63,7 +63,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -115,7 +115,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index b5dc8359a4b..0f140ef1a79 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start       = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
-    cudf::size_type end         = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
+    cudf::size_type start = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
+    cudf::size_type end   = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
     cudf::size_type start_index = min(start, end);
     cudf::size_type end_index   = max(start, end);
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 36952be31a6..2c8890ab860 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,7 +1008,7 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  std::size_t i = blockIdx.x * block_size + threadIdx.x;
+  std::size_t i    = blockIdx.x * block_size + threadIdx.x;
   size_type stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
@@ -1020,10 +1020,12 @@ __launch_bounds__(block_size) __global__
     int64_t following_window = following_window_begin[i];
 
     // compute bounds
-    auto start = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) + following_window + 1)));
+    auto start =
+      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                 max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
+    auto end =
+      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                 max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);
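The formatting pass closes out the one-off fixes; the next commit then replaces the ad-hoc `std::size_t` declarations with a named alias. A condensed sketch of where the series is headed (the kernel below is hypothetical; only the alias itself comes from the patch):

  #include <cstdint>

  namespace cudf {
  using thread_index_type = int64_t;  // the alias PATCH 08 adds to cudf/types.hpp
  }

  __global__ void example_kernel(float const* in, float* out, cudf::thread_index_type size)
  {
    cudf::thread_index_type i =
      threadIdx.x + blockIdx.x * static_cast<cudf::thread_index_type>(blockDim.x);
    cudf::thread_index_type const stride =
      static_cast<cudf::thread_index_type>(blockDim.x) * gridDim.x;
    for (; i < size; i += stride) {
      out[i] = 2.0f * in[i];
    }
  }

A signed 64-bit type (rather than `std::size_t`) keeps window arithmetic such as `i - preceding_window + 1` well behaved without the re-casting shown earlier, and hoisting `stride` into a const local spells the loop the same way everywhere.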
From 3de4b512a156c1d70cdf55da11b97fea6113c5f0 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Mon, 21 Mar 2022 10:57:42 -0500
Subject: [PATCH 08/10] Add thread_index_type for grid stride loops.

---
 cpp/include/cudf/detail/valid_if.cuh |  5 +++--
 cpp/include/cudf/types.hpp           |  9 +++++----
 cpp/src/copying/scatter.cu           |  5 +++--
 cpp/src/replace/nulls.cu             | 14 ++++++++------
 cpp/src/rolling/jit/kernel.cu        |  6 +++---
 cpp/src/rolling/rolling_detail.cuh   |  4 ++--
 cpp/src/transform/compute_column.cu  |  9 +++++----
 7 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 28e5ad9a8ad..aa4421bb4ed 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,8 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type i            = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
@@ -58,7 +59,7 @@ __global__ void valid_if_kernel(
       output[cudf::word_index(i)] = ballot;
       warp_valid_count += __popc(ballot);
     }
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < size);
   }

diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 76e2589a5a9..f6496980f17 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -83,10 +83,11 @@ class mutable_table_view;
 * @file
 */

-using size_type    = int32_t;
-using bitmask_type = uint32_t;
-using valid_type   = uint8_t;
-using offset_type  = int32_t;
+using size_type         = int32_t;
+using bitmask_type      = uint32_t;
+using valid_type        = uint8_t;
+using offset_type       = int32_t;
+using thread_index_type = int64_t;

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 45e338f904e..98a90518bcb 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -47,7 +47,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  std::size_t row = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type row          = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
@@ -58,7 +59,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
       destination.set_null(output_row);
     }
 
-    row += blockDim.x * gridDim.x;
+    row += stride;
   }
 }
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index b9c5a8a319c..8707a89d9c9 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -62,8 +62,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       char* chars,
                                       cudf::size_type* valid_counter)
 {
-  cudf::size_type nrows = input.size();
-  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::size_type nrows                = input.size();
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -98,7 +99,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
       if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
     }
 
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < nrows);
   }
 
@@ -114,8 +115,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::mutable_column_device_view output,
                              cudf::size_type* output_valid_count)
 {
-  cudf::size_type nrows = input.size();
-  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::size_type nrows                = input.size();
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -141,7 +143,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
       }
     }
 
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < nrows);
   }
   if (replacement_has_nulls) {
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 0f140ef1a79..1b8ff5c6a95 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  std::size_t i          = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::size_type stride = blockDim.x * gridDim.x;
+  thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 2c8890ab860..f7528c6c930 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,8 +1008,8 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  std::size_t i    = blockIdx.x * block_size + threadIdx.x;
-  size_type stride = block_size * gridDim.x;
+  thread_index_type i            = blockIdx.x * block_size + threadIdx.x;
+  thread_index_type const stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 70a61036392..bc3678380be 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -67,12 +67,13 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  auto const stride    = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
+  auto const start_idx =
+    static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
+  auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);
 
-  for (std::size_t row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
+  for (thread_index_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
     evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }
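Several of the kernels touched above (valid_if_kernel, both replace_nulls kernels, gpu_rolling) share the same warp-ballot bookkeeping around the grid-stride loop, which is why each fix has to re-ballot after `i += stride`. A reduced sketch of that pattern (hypothetical kernel and predicate; assumes blockDim.x is a multiple of the 32-thread warp size so each warp owns one bitmask word):

  __global__ void validity_sketch(float const* in,
                                  unsigned int* mask_words,
                                  int* valid_count,
                                  long long size)
  {
    long long i            = threadIdx.x + blockIdx.x * static_cast<long long>(blockDim.x);
    long long const stride = static_cast<long long>(blockDim.x) * gridDim.x;
    int warp_valid_count   = 0;

    // every thread that might still be in bounds has to keep voting, hence the
    // ballot both before the loop and again after each stride
    unsigned int active_mask = __ballot_sync(0xffffffffu, i < size);
    while (i < size) {
      unsigned int const ballot = __ballot_sync(active_mask, in[i] != 0.0f);
      if (threadIdx.x % 32 == 0) {  // leader lane writes the warp's word
        mask_words[i / 32] = ballot;
        warp_valid_count += __popc(ballot);
      }
      i += stride;
      active_mask = __ballot_sync(active_mask, i < size);
    }
    if (threadIdx.x % 32 == 0 && warp_valid_count > 0) { atomicAdd(valid_count, warp_valid_count); }
  }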
From b167b07a28f0c861b5b21eb9069b50a2d9811333 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Mon, 21 Mar 2022 11:14:07 -0500
Subject: [PATCH 09/10] PR review comments.

---
 cpp/src/rolling/rolling_detail.cuh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index f7528c6c930..917ce4b6663 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1020,12 +1020,10 @@ __launch_bounds__(block_size) __global__
     int64_t following_window = following_window_begin[i];
 
     // compute bounds
-    auto start =
-      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                 max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end =
-      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                 max(0L, static_cast<int64_t>(i) + following_window + 1)));
+    auto start = static_cast<size_type>(min(
+      static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
+    auto end   = static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                            max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);
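One cleanup in the final commit below is worth calling out: `max(0L, ...)` becomes `max(int64_t{0}, ...)`. Our reading (an inference, not stated in the commit message): `long` is 32 bits on LLP64 platforms such as 64-bit Windows, so `0L` does not reliably match an `int64_t` operand, while a braced `int64_t` literal does on every platform:

  #include <algorithm>
  #include <cstdint>

  int64_t clamp_to_zero(int64_t x)
  {
    // return std::max(0L, x);      // fails to deduce on LLP64: long vs int64_t
    return std::max(int64_t{0}, x);  // portable: both arguments are int64_t
  }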
From e1bdea4379443a1be3a35fb1e621da4250475933 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Tue, 22 Mar 2022 13:29:06 -0500
Subject: [PATCH 10/10] Clean up some declarations and casting.

---
 cpp/src/rolling/jit/kernel.cu      | 24 +++++++++++++-----------
 cpp/src/rolling/rolling_detail.cuh | 18 +++++++++---------
 2 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 1b8ff5c6a95..4122f3a56a2 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -25,13 +25,13 @@ namespace rolling {
 namespace jit {
 
 template <typename WindowType>
-cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
+cudf::size_type __device__ get_window(WindowType window, cudf::thread_index_type index)
 {
   return window[index];
 }
 
 template <>
-cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
+cudf::size_type __device__ get_window(cudf::size_type window, cudf::thread_index_type index)
 {
   return window;
 }
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
 
@@ -62,14 +62,16 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     // for CUDA 10.0 and below (fixed in CUDA 10.1)
     volatile cudf::size_type count = 0;
 
-    cudf::size_type preceding_window = get_window(preceding_window_begin, i);
-    cudf::size_type following_window = get_window(following_window_begin, i);
+    int64_t const preceding_window = get_window(preceding_window_begin, i);
+    int64_t const following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
-    cudf::size_type end   = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
-    cudf::size_type start_index = min(start, end);
-    cudf::size_type end_index   = max(start, end);
+    auto const start = static_cast<cudf::size_type>(
+      min(static_cast<int64_t>(nrows), max(int64_t{0}, i - preceding_window + 1)));
+    auto const end = static_cast<cudf::size_type>(
+      min(static_cast<int64_t>(nrows), max(int64_t{0}, i + following_window + 1)));
+    auto const start_index = min(start, end);
+    auto const end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
@@ -79,7 +81,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
 
     // check if we have enough input samples
-    bool output_is_valid = (count >= min_periods);
+    bool const output_is_valid = (count >= min_periods);
 
     // set the mask
     const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 917ce4b6663..0ab8fff9a88 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1016,16 +1016,16 @@ __launch_bounds__(block_size) __global__
   auto active_threads = __ballot_sync(0xffffffff, i < input.size());
   while (i < input.size()) {
     // to prevent overflow issues when computing bounds use int64_t
-    int64_t preceding_window = preceding_window_begin[i];
-    int64_t following_window = following_window_begin[i];
+    int64_t const preceding_window = preceding_window_begin[i];
+    int64_t const following_window = following_window_begin[i];
 
     // compute bounds
-    auto start = static_cast<size_type>(min(
-      static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end   = static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                            max(0L, static_cast<int64_t>(i) + following_window + 1)));
-    size_type start_index = min(start, end);
-    size_type end_index   = max(start, end);
+    auto const start = static_cast<size_type>(
+      min(static_cast<int64_t>(input.size()), max(int64_t{0}, i - preceding_window + 1)));
+    auto const end = static_cast<size_type>(
+      min(static_cast<int64_t>(input.size()), max(int64_t{0}, i + following_window + 1)));
+    auto const start_index = min(start, end);
+    auto const end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
@@ -1037,7 +1037,7 @@ __launch_bounds__(block_size) __global__
         input, default_outputs, output, start_index, end_index, i);
 
     // set the mask
-    cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)};
+    cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)};
 
     // only one thread writes the mask
     if (0 == threadIdx.x % cudf::detail::warp_size) {
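To close, worked numbers (hypothetical launch, illustrative only) for the failure mode this series removes: even when the row count is a perfectly legal `size_type`, the last `i += stride` of the old 32-bit loops could step past INT32_MAX.

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    int32_t const num_rows = INT32_MAX;        // 2^31 - 1: the largest legal column
    int64_t const stride   = 256LL * 1048576;  // blockDim.x * gridDim.x for a large launch
    int64_t i              = num_rows - 1;     // a thread on its final iteration

    i += stride;  // the old kernels did this in 32 bits: signed overflow, UB
    printf("next index %lld exceeds INT32_MAX (%d)\n", static_cast<long long>(i), num_rows);
    return 0;
  }

With the 64-bit `thread_index_type`, the increment is well defined and the `i < size` test simply fails, ending the loop.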