From 74ee6ae526785d03ca1453b83dbc02094c962225 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 03:20:19 +0000 Subject: [PATCH 01/24] Add capability to batch memcpy the last offsets to str and list out_bufs --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 85 +++++++++++++++++++ cpp/src/io/parquet/page_data.cu | 13 +++ cpp/src/io/parquet/parquet_gpu.hpp | 15 ++++ cpp/src/io/parquet/reader_impl.cpp | 20 ++--- 4 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 cpp/include/cudf/io/detail/batched_memcpy.hpp diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp new file mode 100644 index 00000000000..318f833a77c --- /dev/null +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { + +/** + * @brief A helper function that copies a vector of host scalar data to the corresponding device + * addresses in a batched manner. + * + * + * @param[in] src_data A vector of host scalar data + * @param[in] dst_addrs A vector of device destination addresses + * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] stream CUDA stream to use + */ +template +void batched_memcpy(std::vector const& src_data, + std::vector const& dst_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream) +{ + // Number of elements to copy + auto const num_elems = src_data.size(); + + // Copy src data to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async(src_data, stream, mr); + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + + // Copy dst addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async(dst_addrs, stream, mr); + auto dst_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); + + // Scalar src data so size_iter is simply a constant iterator. 
+ auto size_iter = thrust::make_constant_iterator(sizeof(T)); + + // Get temp storage needed for cub::DeviceMemcpy::Batched + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); + + // Allocate temporary storage + auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream.value(), mr}; + + // Run cub::DeviceMemcpy::Batched + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_elems, + stream.value()); +} + +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..69d58bab05d 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,6 +17,8 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include #include @@ -466,4 +468,15 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +/** + * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched + */ +void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, + std::vector const& buff_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream) +{ + return cudf::io::detail::batched_memcpy(offsets, buff_addrs, mr, stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1390339c1ae..b1bcdfac48f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -796,6 +796,21 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner. + * + * + * @param[in] offsets A vector of finals offsets + * @param[in] buff_addrs A vector of corresponding buffer end addresses + * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] stream CUDA stream to use + */ +void WriteFinalOffsetsBatched(std::vector const& offsets, + std::vector const& buff_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..6127c4d91e2 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -378,6 +378,10 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,24 +397,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } + // Write the final offsets for all list and string buffers in a batched manner + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); } // update null counts in the final column buffers From cab885d1b1c7295812ef003eddcab7eb9399bdb7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:51:17 +0000 Subject: [PATCH 02/24] Move `WriteFinalOffsetsBatched` out of the for loop --- cpp/src/io/parquet/reader_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6127c4d91e2..86863a2562b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,9 +409,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } } - // Write the final offsets for all list and string buffers in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); } + // Write the final offsets for all list and string buffers in a batched manner + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { From b15e3d3096195e4a6d2ab03bbae5116b1caa2e56 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:22:53 +0000 Subject: [PATCH 03/24] Generalize the API and ORC changes by @vuule --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 44 ++++--------- cpp/src/io/orc/stripe_enc.cu | 66 +++++++++++++------ cpp/src/io/parquet/page_data.cu | 20 +++++- 3 files changed, 78 insertions(+), 52 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index 318f833a77c..abbc096ce39 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -28,48 +28,32 @@ namespace CUDF_EXPORT cudf { namespace io::detail { /** - * @brief A helper function that copies a vector of host scalar data to the corresponding device - * addresses in a batched manner. + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
* + * @tparam SrcIterator The type of the source address iterator + * @tparam DstIterator The type of the destination address iterator + * @tparam Sizeiterator The type of the buffer size iterator * - * @param[in] src_data A vector of host scalar data - * @param[in] dst_addrs A vector of device destination addresses - * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] src_iter Iterator to source addresses + * @param[in] dst_iter Iterator to destination addresses + * @param[in] size_iter Iterator to the vector sizes (in bytes) * @param[in] stream CUDA stream to use */ -template -void batched_memcpy(std::vector const& src_data, - std::vector const& dst_addrs, - rmm::device_async_resource_ref mr, +template +void batched_memcpy(SrcIterator src_iter, + DstIterator dst_iter, + Sizeiterator size_iter, + size_t num_elems, rmm::cuda_stream_view stream) { - // Number of elements to copy - auto const num_elems = src_data.size(); - - // Copy src data to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(src_data, stream, mr); - auto src_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); - - // Copy dst addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(dst_addrs, stream, mr); - auto dst_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); - - // Scalar src data so size_iter is simply a constant iterator. - auto size_iter = thrust::make_constant_iterator(sizeof(T)); - // Get temp storage needed for cub::DeviceMemcpy::Batched size_t temp_storage_bytes = 0; cub::DeviceMemcpy::Batched( nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); // Allocate temporary storage - auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream.value(), mr}; + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; // Run cub::DeviceMemcpy::Batched cub::DeviceMemcpy::Batched(d_temp_storage.data(), diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..b5d86e7197c 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1087,37 +1088,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = 
blockIdx.x * compact_streams_block_size + threadIdx.x; auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // Multiplying by sizeof(uint8_t) = 1 is redundant here. + sizes[out_id] = len; dst_ptr += len; } } @@ -1326,8 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, rmm::cuda_stream_view stream) { dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + + dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
+ cudf::io::detail::batched_memcpy( + srcs.data(), dsts.data(), lengths.data(), lengths.size(), stream); } std::optional CompressOrcDataStreams( @@ -1438,4 +1462,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, } // namespace gpu } // namespace orc } // namespace io -} // namespace cudf +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 69d58bab05d..fd7849c1327 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -476,7 +476,25 @@ void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, rmm::device_async_resource_ref mr, rmm::cuda_stream_view stream) { - return cudf::io::detail::batched_memcpy(offsets, buff_addrs, mr, stream); + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, mr); + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, mr); + auto dst_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); + + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::io::detail::batched_memcpy(src_iter, dst_iter, size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail From 50dcd7155feef660f4a2c18af24f008d80075c78 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:29:13 +0000 Subject: [PATCH 04/24] Use make_zeroed_device_uvector_async instead --- cpp/src/io/orc/stripe_enc.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b5d86e7197c..a8e216f2700 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1337,11 +1337,11 @@ void CompactOrcDataStreams(device_2dspan strm_desc, auto const num_streams = strm_desc.size().second; auto const num_stripes = strm_desc.size().first; auto const num_chunks = num_rowgroups * num_streams; - auto srcs = cudf::detail::make_zeroed_device_uvector_sync( + auto srcs = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - auto dsts = cudf::detail::make_zeroed_device_uvector_sync( + auto dsts = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - auto lengths = cudf::detail::make_zeroed_device_uvector_sync( + auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), @@ -1462,4 +1462,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, } // namespace gpu } // namespace orc } // namespace io -} // namespace cudf \ No newline at end of file +} // namespace cudf From 800b2717a4f35fd35ed2f11dae8d02df7825867d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 26 Sep 2024 00:46:20 +0000 
Subject: [PATCH 05/24] Add gtest for batched_memcpy --- cpp/tests/CMakeLists.txt | 3 +- .../utilities_tests/batched_memcpy_tests.cu | 141 ++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 cpp/tests/utilities_tests/batched_memcpy_tests.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -385,6 +385,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +397,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..0986469d314 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf_test/column_utilities.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::io::detail::batched_memcpy( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + 
expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} From 31a755b01d9d9d92bf0b5d2c8c08970ff5301920 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:20:37 -0700 Subject: [PATCH 06/24] Update cpp/include/cudf/io/detail/batched_memcpy.hpp --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index abbc096ce39..fc340021d92 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -40,10 +40,10 @@ namespace io::detail { * @param[in] size_iter Iterator to the vector sizes (in bytes) * @param[in] stream CUDA stream to use */ -template +template void batched_memcpy(SrcIterator src_iter, DstIterator dst_iter, - Sizeiterator size_iter, + SizeIterator size_iter, size_t num_elems, rmm::cuda_stream_view stream) { From b29329b72ed7d2cd08d4de875542b5e69478ddce Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:20:41 -0700 Subject: [PATCH 07/24] Update cpp/include/cudf/io/detail/batched_memcpy.hpp --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index fc340021d92..8065743922e 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -33,7 +33,7 @@ namespace io::detail { * * @tparam SrcIterator The type of the source address iterator * @tparam DstIterator The type of the destination address iterator - * @tparam Sizeiterator The type of the buffer size iterator + * @tparam SizeIterator The type of the buffer size iterator * * @param[in] src_iter Iterator to source addresses * @param[in] dst_iter Iterator to destination addresses From 4efb989525d8cc1c4744b0c32af08d649fbf2b1d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:42:11 -0700 Subject: [PATCH 08/24] Comments update --- cpp/src/io/parquet/reader_impl.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 86863a2562b..151359695a9 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -371,9 +371,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. 
// Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span @@ -410,7 +408,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } } - // Write the final offsets for all list and string buffers in a batched manner + // Write the final offsets for list and string columns in a batched manner cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); // update null counts in the final column buffers From cc2829f42c86bbdb59a592e7b05395515e0dc8ff Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:03:38 +0000 Subject: [PATCH 09/24] Address reviewer comments --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 17 +++++++++-------- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/parquet/page_data.cu | 13 ++++++------- cpp/src/io/parquet/parquet_gpu.hpp | 11 ++++------- cpp/src/io/parquet/reader_impl.cpp | 2 +- .../utilities_tests/batched_memcpy_tests.cu | 2 +- 6 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index 8065743922e..281908705db 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -38,30 +38,31 @@ namespace io::detail { * @param[in] src_iter Iterator to source addresses * @param[in] dst_iter Iterator to destination addresses * @param[in] size_iter Iterator to the vector sizes (in bytes) + * @param[in] num_buffs Number of buffers to be copied * @param[in] stream CUDA stream to use */ template -void batched_memcpy(SrcIterator src_iter, - DstIterator dst_iter, - SizeIterator size_iter, - size_t num_elems, - rmm::cuda_stream_view stream) +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) { // Get temp storage needed for cub::DeviceMemcpy::Batched size_t temp_storage_bytes = 0; cub::DeviceMemcpy::Batched( - nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); // Allocate temporary storage rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; - // Run cub::DeviceMemcpy::Batched + // Perform copies cub::DeviceMemcpy::Batched(d_temp_storage.data(), temp_storage_bytes, src_iter, dst_iter, size_iter, - num_elems, + num_buffs, stream.value()); } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index a8e216f2700..00fd1767554 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1350,8 +1350,8 @@ void CompactOrcDataStreams(device_2dspan strm_desc, strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. 
- cudf::io::detail::batched_memcpy( - srcs.data(), dsts.data(), lengths.data(), lengths.size(), stream); + cudf::io::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index fd7849c1327..413da57293d 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -471,20 +471,19 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, /** * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched */ -void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, - std::vector const& buff_addrs, - rmm::device_async_resource_ref mr, - rmm::cuda_stream_view stream) +void WriteFinalOffsetsBatched(host_span offsets, + host_span const& buff_addrs, + rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, mr); + auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, cudf::get_current_device_resource_ref()); auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, mr); + auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, cudf::get_current_device_resource_ref()); auto dst_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( @@ -494,7 +493,7 @@ void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. - cudf::io::detail::batched_memcpy(src_iter, dst_iter, size_iter, offsets.size(), stream); + cudf::io::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b1bcdfac48f..33fb9a9456a 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -800,15 +800,12 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a * batched manner. 
* - * - * @param[in] offsets A vector of finals offsets - * @param[in] buff_addrs A vector of corresponding buffer end addresses - * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] offsets Host span of final offsets + * @param[in] buff_addrs Host span of corresponding output col buffer end addresses * @param[in] stream CUDA stream to use */ -void WriteFinalOffsetsBatched(std::vector const& offsets, - std::vector const& buff_addrs, - rmm::device_async_resource_ref mr, +void WriteFinalOffsetsBatched(host_span offsets, + host_span const& buff_addrs, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 151359695a9..c02e4027fed 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,7 +409,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } // Write the final offsets for list and string columns in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 0986469d314..63717b72658 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -121,7 +121,7 @@ TEST(BatchedMemcpyTest, BasicTest) auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); // Run the batched memcpy - cudf::io::detail::batched_memcpy( + cudf::io::detail::batched_memcpy_async( d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); // Expected giant destination buffer after the memcpy From 78d68a818dc735ef8de60595c7fda7db76eaf735 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:09:01 +0000 Subject: [PATCH 10/24] Style fix --- cpp/src/io/parquet/page_data.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 413da57293d..71c1d23a70b 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -476,15 +476,17 @@ void WriteFinalOffsetsBatched(host_span offsets, rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, cudf::get_current_device_resource_ref()); - auto src_iter = cudf::detail::make_counting_transform_iterator( + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, cudf::get_current_device_resource_ref()); - auto dst_iter = cudf::detail::make_counting_transform_iterator( + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + auto dst_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [dst = 
d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); From d42da4522b249fd846f6ea366d2543adc20e1ef6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:46:40 +0000 Subject: [PATCH 11/24] Remove the unnecessary iterator --- cpp/src/io/parquet/page_data.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 71c1d23a70b..07e3be8187a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -478,24 +478,21 @@ void WriteFinalOffsetsBatched(host_span offsets, // Copy offsets to device and create an iterator auto d_src_data = cudf::detail::make_device_uvector_async( offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( - [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator auto d_dst_addrs = cudf::detail::make_device_uvector_async( buff_addrs, stream, cudf::get_current_device_resource_ref()); - auto dst_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); - // size_iter is simply a constant iterator of sizeof(size_type) bytes. auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. - cudf::io::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, offsets.size(), stream); + cudf::io::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail From 8d5640da649b9e8971bd0530348c2f6c5a0c7672 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 17:00:24 +0000 Subject: [PATCH 12/24] Move batched_memxxx to include/detail/utilities --- cpp/benchmarks/CMakeLists.txt | 5 - .../io/utilities/batched_memset_bench.cpp | 101 ------------------ .../utilities}/batched_memcpy.hpp | 4 +- .../utilities}/batched_memset.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/parquet/page_data.cu | 4 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +- .../utilities_tests/batched_memcpy_tests.cu | 4 +- .../utilities_tests/batched_memset_tests.cu | 4 +- 9 files changed, 15 insertions(+), 121 deletions(-) delete mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp rename cpp/include/cudf/{io/detail => detail/utilities}/batched_memcpy.hpp (98%) rename cpp/include/cudf/{io/detail => detail/utilities}/batched_memset.hpp (98%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..110b4557840 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # 
################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - 
.compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memcpy.hpp rename to cpp/include/cudf/detail/utilities/batched_memcpy.hpp index 281908705db..22fef6f4815 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -25,7 +25,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that copies a vector of vectors from source to destination addresses in @@ -66,5 +66,5 @@ void batched_memcpy_async(SrcIterator src_iter, stream.value()); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..75f738f7529 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 00fd1767554..5652039eb3f 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,10 +20,10 @@ #include "orc_gpu.hpp" #include +#include #include #include #include -#include #include #include #include @@ -1350,7 +1350,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. - cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 07e3be8187a..9fdad113f49 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,7 +17,7 @@ #include "page_data.cuh" #include "page_decode.cuh" -#include +#include #include @@ -491,7 +491,7 @@ void WriteFinalOffsetsBatched(host_span offsets, auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. 
- cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8e67f233213..3060e8739f9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -1660,9 +1660,9 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 63717b72658..3c411adc517 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -20,8 +20,8 @@ #include #include +#include #include -#include #include #include #include @@ -121,7 +121,7 @@ TEST(BatchedMemcpyTest, BasicTest) auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); // Run the batched memcpy - cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); // Expected giant destination buffer after the memcpy diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( From 9e063af0d2132174e7df390bbab4c7c02e12957d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 30 Sep 2024 21:08:41 +0000 Subject: [PATCH 13/24] Minor changes from reviews --- cpp/src/io/parquet/page_data.cu | 6 +++--- cpp/src/io/parquet/parquet_gpu.hpp | 12 ++++++------ cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/tests/utilities_tests/batched_memcpy_tests.cu | 2 -- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 9fdad113f49..7de5ebbe23a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -469,10 +469,10 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } /** - * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched + * @copydoc cudf::io::parquet::detail::WriteFinalOffsets */ -void WriteFinalOffsetsBatched(host_span offsets, - host_span const& buff_addrs, +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 33fb9a9456a..2a73c3df41c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ 
b/cpp/src/io/parquet/parquet_gpu.hpp @@ -800,13 +800,13 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a * batched manner. * - * @param[in] offsets Host span of final offsets - * @param[in] buff_addrs Host span of corresponding output col buffer end addresses - * @param[in] stream CUDA stream to use + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use */ -void WriteFinalOffsetsBatched(host_span offsets, - host_span const& buff_addrs, - rmm::cuda_stream_view stream); +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for reading the string column data stored in the pages diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c02e4027fed..1b69ccb7742 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,7 +409,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } // Write the final offsets for list and string columns in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _stream); + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 3c411adc517..98657f8e224 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include "cudf_test/column_utilities.hpp" - #include #include #include From 2372fbb5115a530a048c160423ef5294a558b732 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 30 Sep 2024 21:41:51 +0000 Subject: [PATCH 14/24] Minor updates --- cpp/src/io/orc/stripe_enc.cu | 9 ++++----- cpp/src/io/parquet/page_data.cu | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5652039eb3f..afb1a906cf5 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1331,8 +1331,6 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(compact_streams_block_size, 1); - auto const num_rowgroups = enc_streams.size().second; auto const num_streams = strm_desc.size().second; auto const num_stripes = strm_desc.size().first; @@ -1344,9 +1342,10 @@ void CompactOrcDataStreams(device_2dspan strm_desc, auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), - strm_desc.size().second); - gpuInitBatchedMemcpy<<>>( + dim3 dim_block(compact_streams_block_size, 1); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. 
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 7de5ebbe23a..e46f2976f05 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -472,8 +472,8 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @copydoc cudf::io::parquet::detail::WriteFinalOffsets */ void WriteFinalOffsets(host_span offsets, - host_span buff_addrs, - rmm::cuda_stream_view stream) + host_span buff_addrs, + rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator auto d_src_data = cudf::detail::make_device_uvector_async( From 4ea0930195ecf3618dfe9434f085b2ea09c55f91 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 00:53:44 +0000 Subject: [PATCH 15/24] Minor comment update --- cpp/src/io/orc/stripe_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index afb1a906cf5..a0382318cd1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1122,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; auto const len = streams[ss.column_id][group].lengths[cid]; - // Multiplying by sizeof(uint8_t) = 1 is redundant here. + // len is the size (in bytes) of current stream data. sizes[out_id] = len; dst_ptr += len; } From 3eea6e224dddf2843add6057e92d9539d4876185 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 00:54:04 +0000 Subject: [PATCH 16/24] Minor comment update --- cpp/src/io/orc/stripe_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index a0382318cd1..7cdd6deedb1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1122,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; auto const len = streams[ss.column_id][group].lengths[cid]; - // len is the size (in bytes) of current stream data. + // len is the size (in bytes) of the current stream. sizes[out_id] = len; dst_ptr += len; } From 6d078c2177adca6c4624b3e987703d757eaf47aa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 01:44:01 +0000 Subject: [PATCH 17/24] Style fix and add to CI. 
---
 ci/run_cudf_examples.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 0819eacf636..830bb610cc8 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -26,4 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv
 compute-sanitizer --tool memcheck parquet_io
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
+compute-sanitizer --tool memcheck parquet_io_multithreaded
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2
+
 exit ${EXITCODE}

From 1cc4e1f193b1c875e6a03fa3ab9658947b53f69b Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 30 Sep 2024 18:50:09 -0700
Subject: [PATCH 18/24] Revert erroneous commit

---
 ci/run_cudf_examples.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 830bb610cc8..0819eacf636 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -26,7 +26,4 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv
 compute-sanitizer --tool memcheck parquet_io
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
-compute-sanitizer --tool memcheck parquet_io_multithreaded
-compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2
-
 exit ${EXITCODE}

From 042cfc09e5fdd259360dc488e2b81ca855eb50c1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 09:57:12 -0700
Subject: [PATCH 19/24] Update cpp/include/cudf/detail/utilities/batched_memcpy.hpp

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index 22fef6f4815..e213d692394 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -35,11 +35,11 @@ namespace detail {
  * @tparam DstIterator The type of the destination address iterator
  * @tparam SizeIterator The type of the buffer size iterator
  *
- * @param[in] src_iter Iterator to source addresses
- * @param[in] dst_iter Iterator to destination addresses
- * @param[in] size_iter Iterator to the vector sizes (in bytes)
- * @param[in] num_buffs Number of buffers to be copied
- * @param[in] stream CUDA stream to use
+ * @param src_iter Iterator to source addresses
+ * @param dst_iter Iterator to destination addresses
+ * @param size_iter Iterator to the vector sizes (in bytes)
+ * @param num_buffs Number of buffers to be copied
+ * @param stream CUDA stream to use
  */
 template <typename SrcIterator, typename DstIterator, typename SizeIterator>
 void batched_memcpy_async(SrcIterator src_iter,

From eee6f6d7f702dea2e0210671ec1000986909d7a0 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:17:31 +0000
Subject: [PATCH 20/24] Apply suggestions from review

---
 cpp/src/io/orc/stripe_enc.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 7cdd6deedb1..ed0b6969154 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -21,6 +21,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -1102,7 +1103,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size)
   device_span dsts,
   device_span sizes)
 {
-  auto const stripe_id = blockIdx.x * compact_streams_block_size + threadIdx.x;
+  auto const stripe_id = cudf::detail::grid_1d::global_thread_id();
   auto const stream_id = blockIdx.y;
 
   if (stripe_id >= strm_desc.size().first) { return; }

From 828e0acd1da902201f7c233947a39524d1e87ba3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:18:47 +0000
Subject: [PATCH 21/24] Minor updates from review

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index e213d692394..2038e77127a 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -56,7 +56,6 @@ void batched_memcpy_async(SrcIterator src_iter,
   // Allocate temporary storage
   rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()};
 
-  // Perform copies
   cub::DeviceMemcpy::Batched(d_temp_storage.data(),
                              temp_storage_bytes,
                              src_iter,

From ecc425233ea303c03af58495b6a2a0b022075bdc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:20:39 +0000
Subject: [PATCH 22/24] Minor

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index 2038e77127a..b809f1e11c5 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -48,12 +48,10 @@ void batched_memcpy_async(SrcIterator src_iter,
                           size_t num_buffs,
                           rmm::cuda_stream_view stream)
 {
-  // Get temp storage needed for cub::DeviceMemcpy::Batched
   size_t temp_storage_bytes = 0;
   cub::DeviceMemcpy::Batched(
     nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value());
 
-  // Allocate temporary storage
   rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()};
 
   cub::DeviceMemcpy::Batched(d_temp_storage.data(),

From 871854baf33bb4e756050e0a88319aaeda69e9bb Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:48:52 -0700
Subject: [PATCH 23/24] Update cpp/src/io/parquet/page_data.cu

Co-authored-by: Vyas Ramasubramani
---
 cpp/src/io/parquet/page_data.cu | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index e46f2976f05..b3276c81c1f 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -468,9 +468,6 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages,
   }
 }
 
-/**
- * @copydoc cudf::io::parquet::detail::WriteFinalOffsets
- */
 void WriteFinalOffsets(host_span offsets,
                        host_span buff_addrs,
                        rmm::cuda_stream_view stream)

From 3e30777af2a50b2261b46e0d208fe3ef789daaa1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 3 Oct 2024 00:58:54 +0000
Subject: [PATCH 24/24] Comments update.

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index b809f1e11c5..ed0ab9e6e5b 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -31,13 +31,13 @@ namespace detail {
  * @brief A helper function that copies a vector of vectors from source to destination addresses in
  * a batched manner.
  *
- * @tparam SrcIterator The type of the source address iterator
- * @tparam DstIterator The type of the destination address iterator
- * @tparam SizeIterator The type of the buffer size iterator
+ * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator
+ * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator
+ * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator
  *
- * @param src_iter Iterator to source addresses
- * @param dst_iter Iterator to destination addresses
- * @param size_iter Iterator to the vector sizes (in bytes)
+ * @param src_iter Device-accessible iterator to source addresses
+ * @param dst_iter Device-accessible iterator to destination addresses
+ * @param size_iter Device-accessible iterator to the buffer sizes (in bytes)
  * @param num_buffs Number of buffers to be copied
  * @param stream CUDA stream to use
  */
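
Note: the sketch below is not part of the patch series; it is a minimal, hedged illustration of how the batched_memcpy_async helper documented above might be invoked. The wrapper function copy_buffers_batched, its parameter names, and the staging of the pointer/size arrays are assumptions made for the example; only cudf::detail::batched_memcpy_async and cudf::detail::make_device_uvector_async come from the cuDF detail headers touched in these patches.

// Illustrative usage sketch (assumed wrapper, not the library's API): copy a batch of
// device buffers of varying sizes in one cub::DeviceMemcpy::Batched launch via
// cudf::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, num_buffs, stream).
#include <cudf/detail/utilities/batched_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstddef>
#include <vector>

void copy_buffers_batched(std::vector<void const*> const& h_srcs,   // device source pointers
                          std::vector<void*> const& h_dsts,         // device destination pointers
                          std::vector<std::size_t> const& h_sizes,  // per-buffer sizes in bytes
                          rmm::cuda_stream_view stream)
{
  auto mr = rmm::mr::get_current_device_resource();

  // Stage the host-side pointer and size arrays on the device so the iterators handed to
  // cub::DeviceMemcpy::Batched are device-accessible, as the updated docs require.
  auto d_srcs  = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
  auto d_dsts  = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
  auto d_sizes = cudf::detail::make_device_uvector_async(h_sizes, stream, mr);

  // Raw device pointers serve as device-accessible random-access iterators here; fancy
  // iterators (counting/transform) work equally well.
  cudf::detail::batched_memcpy_async(
    d_srcs.data(), d_dsts.data(), d_sizes.data(), h_srcs.size(), stream);

  // Keep the host vectors and staged device arrays alive until the copies complete.
  stream.synchronize();
}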