From db2b1d10a82ec63d4278feca9c92d96a586e51d6 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 17 Oct 2024 22:10:42 +0000
Subject: [PATCH 1/7] Allow templated begin value to
 `counting_transform_iterator`

---
 cpp/include/cudf/detail/iterator.cuh         | 4 ++--
 cpp/src/io/parquet/reader_impl_preprocess.cu | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 4349e1b70fd..b46d50314a2 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -66,8 +66,8 @@ namespace detail {
  * @param f The unary function to apply to the counting iterator.
  * @return A transform iterator that applies `f` to a counting iterator
  */
-template <typename UnaryFunction>
-CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start,
+template <typename CountingIterType, typename UnaryFunction>
+CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {
   return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f);
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 5138a92ac14..d8f7d339a7f 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1629,10 +1629,7 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         get_page_nesting_size{
           d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
 
-      // Manually create a int64_t `key_start` compatible counting_transform_iterator to avoid
-      // implicit casting to size_type.
-      auto const reduction_keys = thrust::make_transform_iterator(
-        thrust::make_counting_iterator<size_t>(key_start), get_reduction_key{subpass.pages.size()});
+      auto const reduction_keys = cudf::detail::make_counting_transform_iterator(key_start, get_reduction_key{subpass.pages.size()});
 
       // Find the size of each column
       thrust::reduce_by_key(rmm::exec_policy_nosync(_stream),

From ac82d73d46942acc3ca21e6769e9d4f7aed1a2e4 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 17 Oct 2024 22:19:52 +0000
Subject: [PATCH 2/7] style fix

---
 cpp/src/io/parquet/reader_impl_preprocess.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index d8f7d339a7f..f4d1d036ecd 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1629,7 +1629,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         get_page_nesting_size{
           d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
 
-      auto const reduction_keys = cudf::detail::make_counting_transform_iterator(key_start, get_reduction_key{subpass.pages.size()});
+      auto const reduction_keys = cudf::detail::make_counting_transform_iterator(
+        key_start, get_reduction_key{subpass.pages.size()});
 
       // Find the size of each column
       thrust::reduce_by_key(rmm::exec_policy_nosync(_stream),

From ed31c32580baa5f6d413e12b8cc22a97c522e7b8 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 18 Oct 2024 18:33:49 +0000
Subject: [PATCH 3/7] Force int32_t bound for counting transform iterator

---
 cpp/include/cudf/detail/iterator.cuh            | 11 ++++++++---
 .../cudf/detail/utilities/batched_memset.hpp    |  4 ++--
 .../cudf/strings/detail/strings_children.cuh    | 17 +++++++++--------
 cpp/src/io/parquet/page_data.cu                 |  7 ++++---
 cpp/src/io/parquet/reader_impl_preprocess.cu    |  6 ++++--
 cpp/tests/io/orc_chunked_reader_test.cu         |  9 +++++----
 6 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index b46d50314a2..b4a4f01522e 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -49,7 +49,8 @@ namespace cudf {
 namespace detail {
 /**
  * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a
- * `thrust::counting_iterator`.
+ * `thrust::counting_iterator` of type smaller than or equal to int32_t in size.
+ *
  *
  * Example:
  * @code{.cpp}
@@ -62,11 +63,15 @@ namespace detail {
  * iter[n] == n * n
  * @endcode
  *
- * @param start The starting value of the counting iterator
+ * @param start The starting value of the counting iterator (must be int32_t or smaller type).
  * @param f The unary function to apply to the counting iterator.
  * @return A transform iterator that applies `f` to a counting iterator
  */
-template <typename CountingIterType, typename UnaryFunction>
+template <typename CountingIterType,
+          typename UnaryFunction,
+          typename = std::enable_if_t<std::is_integral_v<CountingIterType> and
+                                      std::numeric_limits<CountingIterType>::max() <=
+                                        std::numeric_limits<cudf::size_type>::max()>>
 CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {
diff --git a/cpp/include/cudf/detail/utilities/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp
index 75f738f7529..78be5b91248 100644
--- a/cpp/include/cudf/detail/utilities/batched_memset.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp
@@ -53,8 +53,8 @@ void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
     cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref());
 
   // get a vector with the sizes of all buffers
-  auto sizes = cudf::detail::make_counting_transform_iterator(
-    static_cast<std::size_t>(0),
+  auto sizes = thrust::make_transform_iterator(
+    thrust::counting_iterator<std::size_t>(0),
     cuda::proclaim_return_type<std::size_t>(
       [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); }));
 
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index fb0b25cf9f1..de2f1770e28 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -65,19 +65,20 @@ rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
   auto chars_data      = rmm::device_uvector<char>(chars_size, stream, mr);
   auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
 
-  auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
-    0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
+  auto const src_ptrs = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
+    cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
       // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
       // we have to use `const_cast` to remove `const` qualifier from the source pointer.
       // This should be fine as long as we only read but not write anything to the source.
       return reinterpret_cast<void*>(const_cast<char*>(begin[idx].first));
     }));
-  auto const src_sizes = cudf::detail::make_counting_transform_iterator(
-    0u, cuda::proclaim_return_type<size_type>([begin] __device__(uint32_t idx) {
-      return begin[idx].second;
-    }));
-  auto const dst_ptrs = cudf::detail::make_counting_transform_iterator(
-    0u,
+  auto const src_sizes = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
+    cuda::proclaim_return_type<size_type>(
+      [begin] __device__(uint32_t idx) { return begin[idx].second; }));
+  auto const dst_ptrs = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
     cuda::proclaim_return_type<char*>([offsets = d_offsets, output = chars_data.data()] __device__(
                                         uint32_t idx) { return output + offsets[idx]; }));
 
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index b3276c81c1f..0d24fa4236f 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -21,6 +21,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 
 namespace cudf::io::parquet::detail {
@@ -476,9 +477,9 @@ void WriteFinalOffsets(host_span<size_type const> offsets,
   auto d_src_data = cudf::detail::make_device_uvector_async(
     offsets, stream, cudf::get_current_device_resource_ref());
   // Iterator for the source (scalar) data
-  auto src_iter = cudf::detail::make_counting_transform_iterator(
-    static_cast<std::size_t>(0),
-    cuda::proclaim_return_type<size_type*>(
+  auto src_iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<std::size_t>(0),
+    cuda::proclaim_return_type<cudf::size_type*>(
       [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; }));
 
   // Copy buffer addresses to device and create an iterator
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index f4d1d036ecd..60e35465793 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1629,8 +1629,10 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         get_page_nesting_size{
           d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
 
-      auto const reduction_keys = cudf::detail::make_counting_transform_iterator(
-        key_start, get_reduction_key{subpass.pages.size()});
+      // Manually create a size_t `key_start` compatible counting_transform_iterator.
+      auto const reduction_keys =
+        thrust::make_transform_iterator(thrust::make_counting_iterator<std::size_t>(key_start),
+                                        get_reduction_key{subpass.pages.size()});
 
       // Find the size of each column
       thrust::reduce_by_key(rmm::exec_policy_nosync(_stream),
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 8ad1fea649d..5f1aea71f73 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow)
   int64_t constexpr total_rows  = num_rows * num_reps;
   static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
 
-  auto const it  = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) {
-    return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
-  });
-  auto const col = data_col(it, it + num_rows);
+  auto const it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<int64_t>(0), [num_rows](int64_t i) {
+      return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
+    });
+  auto const col         = data_col(it, it + num_rows);
   auto const chunk_table = cudf::table_view{{col}};
 
   std::vector<char> data_buffer;

From 5a27bc5745906c5c532a835b008f88de461d5a68 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 18 Oct 2024 14:19:34 -0700
Subject: [PATCH 4/7] Apply suggestions from code review

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 cpp/include/cudf/detail/iterator.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index b4a4f01522e..77f049fa9aa 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -49,7 +49,7 @@ namespace cudf {
 namespace detail {
 /**
  * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a
- * `thrust::counting_iterator` of type smaller than or equal to int32_t in size.
+ * `thrust::counting_iterator` within the range [0, INT_MAX].
  *
  *
  * Example:
@@ -69,9 +69,9 @@ namespace detail {
  */
 template <typename CountingIterType,
           typename UnaryFunction,
-          typename = std::enable_if_t<std::is_integral_v<CountingIterType> and
-                                      std::numeric_limits<CountingIterType>::max() <=
-                                        std::numeric_limits<cudf::size_type>::max()>>
+          typename = cuda::std::enable_if_t<cuda::std::is_integral_v<CountingIterType> and
+                                      cuda::std::numeric_limits<CountingIterType>::max() <=
+                                        cuda::std::numeric_limits<cudf::size_type>::max()>>
 CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {

From 201d76173f302093f910c912038bf4ae24ab52d5 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 18 Oct 2024 21:27:14 +0000
Subject: [PATCH 5/7] style fix and cuda std headers

---
 cpp/include/cudf/detail/iterator.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 77f049fa9aa..4607d921043 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -38,13 +38,13 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 
 #include <cuda/std/optional>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
 
-#include <utility>
-
 namespace cudf {
 namespace detail {
 /**
@@ -70,8 +70,8 @@ namespace detail {
 template <typename CountingIterType,
           typename UnaryFunction,
           typename = cuda::std::enable_if_t<cuda::std::is_integral_v<CountingIterType> and
-                                      cuda::std::numeric_limits<CountingIterType>::max() <=
-                                        cuda::std::numeric_limits<cudf::size_type>::max()>>
+                                            cuda::std::numeric_limits<CountingIterType>::max() <=
+                                              cuda::std::numeric_limits<cudf::size_type>::max()>>
 CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {

From 0024a64b3a57799bb6273afc5ee97879909d76f9 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 18 Oct 2024 21:28:12 +0000
Subject: [PATCH 6/7] Apply suggestion from reviews

---
 cpp/include/cudf/detail/iterator.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 4607d921043..9eefb90756e 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -63,7 +63,7 @@ namespace detail {
  * iter[n] == n * n
  * @endcode
  *
- * @param start The starting value of the counting iterator (must be int32_t or smaller type).
+ * @param start The starting value of the counting iterator (must be size_type or smaller type).
  * @param f The unary function to apply to the counting iterator.
  * @return A transform iterator that applies `f` to a counting iterator
  */

From 50f252e896214b926c5fbff6b635de0b9af5842d Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 21 Oct 2024 17:24:15 +0000
Subject: [PATCH 7/7] Use static_assert instead of enable_if

---
 cpp/include/cudf/detail/iterator.cuh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 9eefb90756e..30f36d6a5da 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -67,14 +67,17 @@ namespace detail {
  * @param f The unary function to apply to the counting iterator.
  * @return A transform iterator that applies `f` to a counting iterator
  */
-template <typename CountingIterType,
-          typename UnaryFunction,
-          typename = cuda::std::enable_if_t<cuda::std::is_integral_v<CountingIterType> and
-                                            cuda::std::numeric_limits<CountingIterType>::max() <=
-                                              cuda::std::numeric_limits<cudf::size_type>::max()>>
+template <typename CountingIterType, typename UnaryFunction>
 CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {
+  // Check if the `start` for counting_iterator is of size_type or a smaller integral type
+  static_assert(
+    cuda::std::is_integral_v<CountingIterType> and
+      cuda::std::numeric_limits<CountingIterType>::digits <=
+        cuda::std::numeric_limits<cudf::size_type>::digits,
+    "The `start` for the counting_transform_iterator must be size_type or smaller type");
+
   return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f);
 }