From 74ee6ae526785d03ca1453b83dbc02094c962225 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 03:20:19 +0000 Subject: [PATCH 01/24] Add capability to batch memcpy the last offsets to str and list out_bufs --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 85 +++++++++++++++++++ cpp/src/io/parquet/page_data.cu | 13 +++ cpp/src/io/parquet/parquet_gpu.hpp | 15 ++++ cpp/src/io/parquet/reader_impl.cpp | 20 ++--- 4 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 cpp/include/cudf/io/detail/batched_memcpy.hpp diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp new file mode 100644 index 00000000000..318f833a77c --- /dev/null +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { + +/** + * @brief A helper function that copies a vector of host scalar data to the corresponding device + * addresses in a batched manner. + * + * + * @param[in] src_data A vector of host scalar data + * @param[in] dst_addrs A vector of device destination addresses + * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] stream CUDA stream to use + */ +template +void batched_memcpy(std::vector const& src_data, + std::vector const& dst_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream) +{ + // Number of elements to copy + auto const num_elems = src_data.size(); + + // Copy src data to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async(src_data, stream, mr); + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + + // Copy dst addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async(dst_addrs, stream, mr); + auto dst_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); + + // Scalar src data so size_iter is simply a constant iterator. 
+ auto size_iter = thrust::make_constant_iterator(sizeof(T)); + + // Get temp storage needed for cub::DeviceMemcpy::Batched + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); + + // Allocate temporary storage + auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream.value(), mr}; + + // Run cub::DeviceMemcpy::Batched + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_elems, + stream.value()); +} + +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..69d58bab05d 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,6 +17,8 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include #include @@ -466,4 +468,15 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +/** + * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched + */ +void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, + std::vector const& buff_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream) +{ + return cudf::io::detail::batched_memcpy(offsets, buff_addrs, mr, stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1390339c1ae..b1bcdfac48f 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -796,6 +796,21 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner. + * + * + * @param[in] offsets A vector of finals offsets + * @param[in] buff_addrs A vector of corresponding buffer end addresses + * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] stream CUDA stream to use + */ +void WriteFinalOffsetsBatched(std::vector const& offsets, + std::vector const& buff_addrs, + rmm::device_async_resource_ref mr, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..6127c4d91e2 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -378,6 +378,10 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,24 +397,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } + // Write the final offsets for all list and string buffers in a batched manner + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); } // update null counts in the final column buffers From cab885d1b1c7295812ef003eddcab7eb9399bdb7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:51:17 +0000 Subject: [PATCH 02/24] Move `WriteFinalOffsetsBatched` out of the for loop --- cpp/src/io/parquet/reader_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6127c4d91e2..86863a2562b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,9 +409,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } } - // Write the final offsets for all list and string buffers in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); } + // Write the final offsets for all list and string buffers in a batched manner + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { From b15e3d3096195e4a6d2ab03bbae5116b1caa2e56 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:22:53 +0000 Subject: [PATCH 03/24] Generalize the API and ORC changes by @vuule --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 44 ++++--------- cpp/src/io/orc/stripe_enc.cu | 66 +++++++++++++------ cpp/src/io/parquet/page_data.cu | 20 +++++- 3 files changed, 78 insertions(+), 52 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index 318f833a77c..abbc096ce39 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -28,48 +28,32 @@ namespace CUDF_EXPORT cudf { namespace io::detail { /** - * @brief A helper function that copies a vector of host scalar data to the corresponding device - * addresses in a batched manner. + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
* + * @tparam SrcIterator The type of the source address iterator + * @tparam DstIterator The type of the destination address iterator + * @tparam Sizeiterator The type of the buffer size iterator * - * @param[in] src_data A vector of host scalar data - * @param[in] dst_addrs A vector of device destination addresses - * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] src_iter Iterator to source addresses + * @param[in] dst_iter Iterator to destination addresses + * @param[in] size_iter Iterator to the vector sizes (in bytes) * @param[in] stream CUDA stream to use */ -template -void batched_memcpy(std::vector const& src_data, - std::vector const& dst_addrs, - rmm::device_async_resource_ref mr, +template +void batched_memcpy(SrcIterator src_iter, + DstIterator dst_iter, + Sizeiterator size_iter, + size_t num_elems, rmm::cuda_stream_view stream) { - // Number of elements to copy - auto const num_elems = src_data.size(); - - // Copy src data to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(src_data, stream, mr); - auto src_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); - - // Copy dst addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(dst_addrs, stream, mr); - auto dst_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); - - // Scalar src data so size_iter is simply a constant iterator. - auto size_iter = thrust::make_constant_iterator(sizeof(T)); - // Get temp storage needed for cub::DeviceMemcpy::Batched size_t temp_storage_bytes = 0; cub::DeviceMemcpy::Batched( nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); // Allocate temporary storage - auto d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream.value(), mr}; + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; // Run cub::DeviceMemcpy::Batched cub::DeviceMemcpy::Batched(d_temp_storage.data(), diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..b5d86e7197c 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1087,37 +1088,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = 
blockIdx.x * compact_streams_block_size + threadIdx.x; auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // Multiplying by sizeof(uint8_t) = 1 is redundant here. + sizes[out_id] = len; dst_ptr += len; } } @@ -1326,8 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, rmm::cuda_stream_view stream) { dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_sync( + num_chunks, stream, rmm::mr::get_current_device_resource()); + + dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
+ cudf::io::detail::batched_memcpy( + srcs.data(), dsts.data(), lengths.data(), lengths.size(), stream); } std::optional CompressOrcDataStreams( @@ -1438,4 +1462,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, } // namespace gpu } // namespace orc } // namespace io -} // namespace cudf +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 69d58bab05d..fd7849c1327 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -476,7 +476,25 @@ void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, rmm::device_async_resource_ref mr, rmm::cuda_stream_view stream) { - return cudf::io::detail::batched_memcpy(offsets, buff_addrs, mr, stream); + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, mr); + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, mr); + auto dst_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); + + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::io::detail::batched_memcpy(src_iter, dst_iter, size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail From 50dcd7155feef660f4a2c18af24f008d80075c78 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:29:13 +0000 Subject: [PATCH 04/24] Use make_zeroed_device_uvector_async instead --- cpp/src/io/orc/stripe_enc.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b5d86e7197c..a8e216f2700 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1337,11 +1337,11 @@ void CompactOrcDataStreams(device_2dspan strm_desc, auto const num_streams = strm_desc.size().second; auto const num_stripes = strm_desc.size().first; auto const num_chunks = num_rowgroups * num_streams; - auto srcs = cudf::detail::make_zeroed_device_uvector_sync( + auto srcs = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - auto dsts = cudf::detail::make_zeroed_device_uvector_sync( + auto dsts = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - auto lengths = cudf::detail::make_zeroed_device_uvector_sync( + auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), @@ -1462,4 +1462,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, } // namespace gpu } // namespace orc } // namespace io -} // namespace cudf \ No newline at end of file +} // namespace cudf From 800b2717a4f35fd35ed2f11dae8d02df7825867d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 26 Sep 2024 00:46:20 +0000 
Subject: [PATCH 05/24] Add gtest for batched_memcpy --- cpp/tests/CMakeLists.txt | 3 +- .../utilities_tests/batched_memcpy_tests.cu | 141 ++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 cpp/tests/utilities_tests/batched_memcpy_tests.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -385,6 +385,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +397,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..0986469d314 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf_test/column_utilities.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::io::detail::batched_memcpy( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + 
expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} From 31a755b01d9d9d92bf0b5d2c8c08970ff5301920 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:20:37 -0700 Subject: [PATCH 06/24] Update cpp/include/cudf/io/detail/batched_memcpy.hpp --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index abbc096ce39..fc340021d92 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -40,10 +40,10 @@ namespace io::detail { * @param[in] size_iter Iterator to the vector sizes (in bytes) * @param[in] stream CUDA stream to use */ -template +template void batched_memcpy(SrcIterator src_iter, DstIterator dst_iter, - Sizeiterator size_iter, + SizeIterator size_iter, size_t num_elems, rmm::cuda_stream_view stream) { From b29329b72ed7d2cd08d4de875542b5e69478ddce Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:20:41 -0700 Subject: [PATCH 07/24] Update cpp/include/cudf/io/detail/batched_memcpy.hpp --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index fc340021d92..8065743922e 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -33,7 +33,7 @@ namespace io::detail { * * @tparam SrcIterator The type of the source address iterator * @tparam DstIterator The type of the destination address iterator - * @tparam Sizeiterator The type of the buffer size iterator + * @tparam SizeIterator The type of the buffer size iterator * * @param[in] src_iter Iterator to source addresses * @param[in] dst_iter Iterator to destination addresses From 4efb989525d8cc1c4744b0c32af08d649fbf2b1d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:42:11 -0700 Subject: [PATCH 08/24] Comments update --- cpp/src/io/parquet/reader_impl.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 86863a2562b..151359695a9 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -371,9 +371,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. 
// Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span @@ -410,7 +408,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } } - // Write the final offsets for all list and string buffers in a batched manner + // Write the final offsets for list and string columns in a batched manner cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); // update null counts in the final column buffers From cc2829f42c86bbdb59a592e7b05395515e0dc8ff Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:03:38 +0000 Subject: [PATCH 09/24] Address reviewer comments --- cpp/include/cudf/io/detail/batched_memcpy.hpp | 17 +++++++++-------- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/parquet/page_data.cu | 13 ++++++------- cpp/src/io/parquet/parquet_gpu.hpp | 11 ++++------- cpp/src/io/parquet/reader_impl.cpp | 2 +- .../utilities_tests/batched_memcpy_tests.cu | 2 +- 6 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/io/detail/batched_memcpy.hpp index 8065743922e..281908705db 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/io/detail/batched_memcpy.hpp @@ -38,30 +38,31 @@ namespace io::detail { * @param[in] src_iter Iterator to source addresses * @param[in] dst_iter Iterator to destination addresses * @param[in] size_iter Iterator to the vector sizes (in bytes) + * @param[in] num_buffs Number of buffers to be copied * @param[in] stream CUDA stream to use */ template -void batched_memcpy(SrcIterator src_iter, - DstIterator dst_iter, - SizeIterator size_iter, - size_t num_elems, - rmm::cuda_stream_view stream) +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) { // Get temp storage needed for cub::DeviceMemcpy::Batched size_t temp_storage_bytes = 0; cub::DeviceMemcpy::Batched( - nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_elems, stream.value()); + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); // Allocate temporary storage rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; - // Run cub::DeviceMemcpy::Batched + // Perform copies cub::DeviceMemcpy::Batched(d_temp_storage.data(), temp_storage_bytes, src_iter, dst_iter, size_iter, - num_elems, + num_buffs, stream.value()); } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index a8e216f2700..00fd1767554 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1350,8 +1350,8 @@ void CompactOrcDataStreams(device_2dspan strm_desc, strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. 
- cudf::io::detail::batched_memcpy( - srcs.data(), dsts.data(), lengths.data(), lengths.size(), stream); + cudf::io::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index fd7849c1327..413da57293d 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -471,20 +471,19 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, /** * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched */ -void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, - std::vector const& buff_addrs, - rmm::device_async_resource_ref mr, - rmm::cuda_stream_view stream) +void WriteFinalOffsetsBatched(host_span offsets, + host_span const& buff_addrs, + rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, mr); + auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, cudf::get_current_device_resource_ref()); auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, mr); + auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, cudf::get_current_device_resource_ref()); auto dst_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( @@ -494,7 +493,7 @@ void __host__ WriteFinalOffsetsBatched(std::vector const& offsets, auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. - cudf::io::detail::batched_memcpy(src_iter, dst_iter, size_iter, offsets.size(), stream); + cudf::io::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b1bcdfac48f..33fb9a9456a 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -800,15 +800,12 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a * batched manner. 
* - * - * @param[in] offsets A vector of finals offsets - * @param[in] buff_addrs A vector of corresponding buffer end addresses - * @param[in] mr Device memory resource to allocate temporary memory + * @param[in] offsets Host span of final offsets + * @param[in] buff_addrs Host span of corresponding output col buffer end addresses * @param[in] stream CUDA stream to use */ -void WriteFinalOffsetsBatched(std::vector const& offsets, - std::vector const& buff_addrs, - rmm::device_async_resource_ref mr, +void WriteFinalOffsetsBatched(host_span offsets, + host_span const& buff_addrs, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 151359695a9..c02e4027fed 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,7 +409,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } // Write the final offsets for list and string columns in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _mr, _stream); + cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 0986469d314..63717b72658 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -121,7 +121,7 @@ TEST(BatchedMemcpyTest, BasicTest) auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); // Run the batched memcpy - cudf::io::detail::batched_memcpy( + cudf::io::detail::batched_memcpy_async( d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); // Expected giant destination buffer after the memcpy From 78d68a818dc735ef8de60595c7fda7db76eaf735 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:09:01 +0000 Subject: [PATCH 10/24] Style fix --- cpp/src/io/parquet/page_data.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 413da57293d..71c1d23a70b 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -476,15 +476,17 @@ void WriteFinalOffsetsBatched(host_span offsets, rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator - auto d_src_data = cudf::detail::make_device_uvector_async(offsets, stream, cudf::get_current_device_resource_ref()); - auto src_iter = cudf::detail::make_counting_transform_iterator( + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator - auto d_dst_addrs = cudf::detail::make_device_uvector_async(buff_addrs, stream, cudf::get_current_device_resource_ref()); - auto dst_iter = cudf::detail::make_counting_transform_iterator( + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + auto dst_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( [dst = 
d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); From d42da4522b249fd846f6ea366d2543adc20e1ef6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 01:46:40 +0000 Subject: [PATCH 11/24] Remove the unnecessary iterator --- cpp/src/io/parquet/page_data.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 71c1d23a70b..07e3be8187a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -478,24 +478,21 @@ void WriteFinalOffsetsBatched(host_span offsets, // Copy offsets to device and create an iterator auto d_src_data = cudf::detail::make_device_uvector_async( offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data auto src_iter = cudf::detail::make_counting_transform_iterator( static_cast(0), cuda::proclaim_return_type( - [src = d_src_data.data()] __device__(std::size_t i) { return src + i; })); + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator auto d_dst_addrs = cudf::detail::make_device_uvector_async( buff_addrs, stream, cudf::get_current_device_resource_ref()); - auto dst_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( - [dst = d_dst_addrs.data()] __device__(std::size_t i) { return dst[i]; })); - // size_iter is simply a constant iterator of sizeof(size_type) bytes. auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. - cudf::io::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, offsets.size(), stream); + cudf::io::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); } } // namespace cudf::io::parquet::detail From 8d5640da649b9e8971bd0530348c2f6c5a0c7672 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 27 Sep 2024 17:00:24 +0000 Subject: [PATCH 12/24] Move batched_memxxx to include/detail/utilities --- cpp/benchmarks/CMakeLists.txt | 5 - .../io/utilities/batched_memset_bench.cpp | 101 ------------------ .../utilities}/batched_memcpy.hpp | 4 +- .../utilities}/batched_memset.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/parquet/page_data.cu | 4 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +- .../utilities_tests/batched_memcpy_tests.cu | 4 +- .../utilities_tests/batched_memset_tests.cu | 4 +- 9 files changed, 15 insertions(+), 121 deletions(-) delete mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp rename cpp/include/cudf/{io/detail => detail/utilities}/batched_memcpy.hpp (98%) rename cpp/include/cudf/{io/detail => detail/utilities}/batched_memset.hpp (98%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..110b4557840 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # 
################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - 
.compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/include/cudf/io/detail/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memcpy.hpp rename to cpp/include/cudf/detail/utilities/batched_memcpy.hpp index 281908705db..22fef6f4815 100644 --- a/cpp/include/cudf/io/detail/batched_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -25,7 +25,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that copies a vector of vectors from source to destination addresses in @@ -66,5 +66,5 @@ void batched_memcpy_async(SrcIterator src_iter, stream.value()); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..75f738f7529 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 00fd1767554..5652039eb3f 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,10 +20,10 @@ #include "orc_gpu.hpp" #include +#include #include #include #include -#include #include #include #include @@ -1350,7 +1350,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. - cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 07e3be8187a..9fdad113f49 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,7 +17,7 @@ #include "page_data.cuh" #include "page_decode.cuh" -#include +#include #include @@ -491,7 +491,7 @@ void WriteFinalOffsetsBatched(host_span offsets, auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); // Copy offsets to buffers in batched manner. 
- cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8e67f233213..3060e8739f9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -1660,9 +1660,9 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 63717b72658..3c411adc517 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -20,8 +20,8 @@ #include #include +#include #include -#include #include #include #include @@ -121,7 +121,7 @@ TEST(BatchedMemcpyTest, BasicTest) auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); // Run the batched memcpy - cudf::io::detail::batched_memcpy_async( + cudf::detail::batched_memcpy_async( d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); // Expected giant destination buffer after the memcpy diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( From 9e063af0d2132174e7df390bbab4c7c02e12957d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 30 Sep 2024 21:08:41 +0000 Subject: [PATCH 13/24] Minor changes from reviews --- cpp/src/io/parquet/page_data.cu | 6 +++--- cpp/src/io/parquet/parquet_gpu.hpp | 12 ++++++------ cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/tests/utilities_tests/batched_memcpy_tests.cu | 2 -- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 9fdad113f49..7de5ebbe23a 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -469,10 +469,10 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } /** - * @copydoc cudf::io::parquet::detail::WriteOutputBufferSizesBatched + * @copydoc cudf::io::parquet::detail::WriteFinalOffsets */ -void WriteFinalOffsetsBatched(host_span offsets, - host_span const& buff_addrs, +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 33fb9a9456a..2a73c3df41c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ 
b/cpp/src/io/parquet/parquet_gpu.hpp @@ -800,13 +800,13 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a * batched manner. * - * @param[in] offsets Host span of final offsets - * @param[in] buff_addrs Host span of corresponding output col buffer end addresses - * @param[in] stream CUDA stream to use + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use */ -void WriteFinalOffsetsBatched(host_span offsets, - host_span const& buff_addrs, - rmm::cuda_stream_view stream); +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for reading the string column data stored in the pages diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c02e4027fed..1b69ccb7742 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -409,7 +409,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } // Write the final offsets for list and string columns in a batched manner - cudf::io::parquet::detail::WriteFinalOffsetsBatched(final_offsets, out_buffers, _stream); + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu index 3c411adc517..98657f8e224 100644 --- a/cpp/tests/utilities_tests/batched_memcpy_tests.cu +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -14,8 +14,6 @@ * limitations under the License. */ -#include "cudf_test/column_utilities.hpp" - #include #include #include From 2372fbb5115a530a048c160423ef5294a558b732 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 30 Sep 2024 21:41:51 +0000 Subject: [PATCH 14/24] Minor updates --- cpp/src/io/orc/stripe_enc.cu | 9 ++++----- cpp/src/io/parquet/page_data.cu | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5652039eb3f..afb1a906cf5 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1331,8 +1331,6 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(compact_streams_block_size, 1); - auto const num_rowgroups = enc_streams.size().second; auto const num_streams = strm_desc.size().second; auto const num_stripes = strm_desc.size().first; @@ -1344,9 +1342,10 @@ void CompactOrcDataStreams(device_2dspan strm_desc, auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - dim3 dim_grid_alt(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), - strm_desc.size().second); - gpuInitBatchedMemcpy<<>>( + dim3 dim_block(compact_streams_block_size, 1); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. 
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 7de5ebbe23a..e46f2976f05 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -472,8 +472,8 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, * @copydoc cudf::io::parquet::detail::WriteFinalOffsets */ void WriteFinalOffsets(host_span offsets, - host_span buff_addrs, - rmm::cuda_stream_view stream) + host_span buff_addrs, + rmm::cuda_stream_view stream) { // Copy offsets to device and create an iterator auto d_src_data = cudf::detail::make_device_uvector_async( From 4ea0930195ecf3618dfe9434f085b2ea09c55f91 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 00:53:44 +0000 Subject: [PATCH 15/24] Minor comment update --- cpp/src/io/orc/stripe_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index afb1a906cf5..a0382318cd1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1122,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; auto const len = streams[ss.column_id][group].lengths[cid]; - // Multiplying by sizeof(uint8_t) = 1 is redundant here. + // len is the size (in bytes) of current stream data. sizes[out_id] = len; dst_ptr += len; } From 3eea6e224dddf2843add6057e92d9539d4876185 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 00:54:04 +0000 Subject: [PATCH 16/24] Minor comment update --- cpp/src/io/orc/stripe_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index a0382318cd1..7cdd6deedb1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1122,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; auto const len = streams[ss.column_id][group].lengths[cid]; - // len is the size (in bytes) of current stream data. + // len is the size (in bytes) of the current stream. sizes[out_id] = len; dst_ptr += len; } From 6d078c2177adca6c4624b3e987703d757eaf47aa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Tue, 1 Oct 2024 01:44:01 +0000 Subject: [PATCH 17/24] Style fix and add to CI. 
---
 ci/run_cudf_examples.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 0819eacf636..830bb610cc8 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -26,4 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv
 compute-sanitizer --tool memcheck parquet_io
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
+compute-sanitizer --tool memcheck parquet_io_multithreaded
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2
+
 exit ${EXITCODE}

From 1cc4e1f193b1c875e6a03fa3ab9658947b53f69b Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 30 Sep 2024 18:50:09 -0700
Subject: [PATCH 18/24] Revert erroneous commit

---
 ci/run_cudf_examples.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 830bb610cc8..0819eacf636 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -26,7 +26,4 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv
 compute-sanitizer --tool memcheck parquet_io
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
-compute-sanitizer --tool memcheck parquet_io_multithreaded
-compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2
-
 exit ${EXITCODE}

From 042cfc09e5fdd259360dc488e2b81ca855eb50c1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 09:57:12 -0700
Subject: [PATCH 19/24] Update cpp/include/cudf/detail/utilities/batched_memcpy.hpp

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index 22fef6f4815..e213d692394 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -35,11 +35,11 @@ namespace detail {
  * @tparam DstIterator The type of the destination address iterator
  * @tparam SizeIterator The type of the buffer size iterator
  *
- * @param[in] src_iter Iterator to source addresses
- * @param[in] dst_iter Iterator to destination addresses
- * @param[in] size_iter Iterator to the vector sizes (in bytes)
- * @param[in] num_buffs Number of buffers to be copied
- * @param[in] stream CUDA stream to use
+ * @param src_iter Iterator to source addresses
+ * @param dst_iter Iterator to destination addresses
+ * @param size_iter Iterator to the vector sizes (in bytes)
+ * @param num_buffs Number of buffers to be copied
+ * @param stream CUDA stream to use
  */
 template <typename SrcIterator, typename DstIterator, typename SizeIterator>
 void batched_memcpy_async(SrcIterator src_iter,

From eee6f6d7f702dea2e0210671ec1000986909d7a0 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:17:31 +0000
Subject: [PATCH 20/24] Apply suggestions from review

---
 cpp/src/io/orc/stripe_enc.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 7cdd6deedb1..ed0b6969154 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -21,6 +21,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -1102,7 +1103,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size)
   device_span dsts,
   device_span sizes)
 {
-  auto const stripe_id = blockIdx.x * compact_streams_block_size + threadIdx.x;
+  auto const stripe_id = cudf::detail::grid_1d::global_thread_id();
   auto const stream_id = blockIdx.y;
 
   if (stripe_id >= strm_desc.size().first) { return; }

From 828e0acd1da902201f7c233947a39524d1e87ba3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:18:47 +0000
Subject: [PATCH 21/24] Minor updates from review

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index e213d692394..2038e77127a 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -56,7 +56,6 @@ void batched_memcpy_async(SrcIterator src_iter,
   // Allocate temporary storage
   rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()};
 
-  // Perform copies
   cub::DeviceMemcpy::Batched(d_temp_storage.data(),
                              temp_storage_bytes,
                              src_iter,

From ecc425233ea303c03af58495b6a2a0b022075bdc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:20:39 +0000
Subject: [PATCH 22/24] Minor

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index 2038e77127a..b809f1e11c5 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -48,12 +48,10 @@ void batched_memcpy_async(SrcIterator src_iter,
                           size_t num_buffs,
                           rmm::cuda_stream_view stream)
 {
-  // Get temp storage needed for cub::DeviceMemcpy::Batched
   size_t temp_storage_bytes = 0;
   cub::DeviceMemcpy::Batched(
     nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value());
 
-  // Allocate temporary storage
   rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()};
 
   cub::DeviceMemcpy::Batched(d_temp_storage.data(),

From 871854baf33bb4e756050e0a88319aaeda69e9bb Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 17:48:52 -0700
Subject: [PATCH 23/24] Update cpp/src/io/parquet/page_data.cu

Co-authored-by: Vyas Ramasubramani
---
 cpp/src/io/parquet/page_data.cu | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index e46f2976f05..b3276c81c1f 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -468,9 +468,6 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages,
   }
 }
 
-/**
- * @copydoc cudf::io::parquet::detail::WriteFinalOffsets
- */
 void WriteFinalOffsets(host_span offsets,
                        host_span buff_addrs,
                        rmm::cuda_stream_view stream)

From 3e30777af2a50b2261b46e0d208fe3ef789daaa1 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 3 Oct 2024 00:58:54 +0000
Subject: [PATCH 24/24] Comments update.

---
 cpp/include/cudf/detail/utilities/batched_memcpy.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
index b809f1e11c5..ed0ab9e6e5b 100644
--- a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp
@@ -31,13 +31,13 @@ namespace detail {
  * @brief A helper function that copies a vector of vectors from source to destination addresses in
  * a batched manner.
  *
- * @tparam SrcIterator The type of the source address iterator
- * @tparam DstIterator The type of the destination address iterator
- * @tparam SizeIterator The type of the buffer size iterator
+ * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator
+ * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator
+ * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator
  *
- * @param src_iter Iterator to source addresses
- * @param dst_iter Iterator to destination addresses
- * @param size_iter Iterator to the vector sizes (in bytes)
+ * @param src_iter Device-accessible iterator to source addresses
+ * @param dst_iter Device-accessible iterator to destination addresses
+ * @param size_iter Device-accessible iterator to the buffer sizes (in bytes)
  * @param num_buffs Number of buffers to be copied
  * @param stream CUDA stream to use
  */
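
Note: the sketch below is not part of the patch series; it is a minimal, hedged illustration of how the batched_memcpy_async helper documented above might be invoked. The wrapper function copy_buffers_batched, its parameter names, and the staging of the pointer/size arrays are assumptions made for the example; only cudf::detail::batched_memcpy_async and cudf::detail::make_device_uvector_async come from the cuDF detail headers touched in these patches.

// Illustrative usage sketch (assumed wrapper, not the library's API): copy a batch of
// device buffers of varying sizes in one cub::DeviceMemcpy::Batched launch via
// cudf::detail::batched_memcpy_async(src_iter, dst_iter, size_iter, num_buffs, stream).
#include <cudf/detail/utilities/batched_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstddef>
#include <vector>

void copy_buffers_batched(std::vector<void const*> const& h_srcs,   // device source pointers
                          std::vector<void*> const& h_dsts,         // device destination pointers
                          std::vector<std::size_t> const& h_sizes,  // per-buffer sizes in bytes
                          rmm::cuda_stream_view stream)
{
  auto mr = rmm::mr::get_current_device_resource();

  // Stage the host-side pointer and size arrays on the device so the iterators handed to
  // cub::DeviceMemcpy::Batched are device-accessible, as the updated docs require.
  auto d_srcs  = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
  auto d_dsts  = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
  auto d_sizes = cudf::detail::make_device_uvector_async(h_sizes, stream, mr);

  // Raw device pointers serve as device-accessible random-access iterators here; fancy
  // iterators (counting/transform) work equally well.
  cudf::detail::batched_memcpy_async(
    d_srcs.data(), d_dsts.data(), d_sizes.data(), h_srcs.size(), stream);

  // Keep the host vectors and staged device arrays alive until the copies complete.
  stream.synchronize();
}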