From 3801e7496914dec453f0d3cb49aef7c60ab636aa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 12:18:49 -0800 Subject: [PATCH] Replace direct `cudaMemcpyAsync` calls with utility functions (within `/include`) (#17557) Replaced the calls to `cudaMemcpyAsync` with the new `cuda_memcpy`/`cuda_memcpy_async` utility, which optionally avoids using the copy engine. Also took the opportunity to use `cudf::detail::host_vector` and its factories to enable wider pinned memory use. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17557 --- cpp/include/cudf/detail/get_value.cuh | 11 +++++------ cpp/include/cudf/table/table_device_view.cuh | 10 +++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 16d532ea2b8..4f6238b5fe7 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include #include @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is then copied to device memory // and the pointer is set in the d_columns member. - std::vector h_buffer(padded_views_size_bytes); + auto h_buffer = cudf::detail::make_host_vector(padded_views_size_bytes, stream); // Each ColumnDeviceView instance may have child objects which may // require setting some internal device pointers before being copied // from CPU to device. @@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st auto d_columns = detail::child_columns_to_device_array( source_view.begin(), source_view.end(), h_ptr, d_ptr); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const h_span = host_span{h_buffer}.subspan( + static_cast(h_ptr) - h_buffer.data(), views_size_bytes); + auto const d_span = device_span{static_cast(d_ptr), views_size_bytes}; + cudf::detail::cuda_memcpy(d_span, h_span, stream); return std::make_tuple(std::move(descendant_storage), d_columns); }