From 248a9bf9aaf50031d4ba9461f27c7356abde83ae Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Mon, 18 Apr 2022 09:07:15 +0000 Subject: [PATCH] add support FutureValue for reduce --- cub/device/device_reduce.cuh | 59 +++++++++++++++++++------ cub/device/dispatch/dispatch_reduce.cuh | 23 +++++----- 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 9f70a111a4..7fc1f2e079 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -155,14 +155,45 @@ struct DeviceReduce // Signed integer type for global offsets typedef int OffsetT; - return DispatchReduce::Dispatch( + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, - init, + detail::InputValue(init), + stream, + debug_synchronous); + } + + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename InitValueT, + typename InitValueIterT = InitValueT *> + CUB_RUNTIME_FUNCTION static cudaError_t Reduce( + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + FutureValue init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + return DispatchReduce>::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + detail::InputValue(init), stream, debug_synchronous); } @@ -239,14 +270,14 @@ struct DeviceReduce cub::detail::non_void_value_t>; - return DispatchReduce::Dispatch( + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), - OutputT(), // zero-initialize + detail::InputValue(OutputT{}), // zero-initialize stream, debug_synchronous); } @@ -314,14 +345,15 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + auto init_val = Traits::Max(); + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + detail::InputValue(init_val), // replace with std::numeric_limits::max() when C++11 support is more prevalent stream, debug_synchronous); } @@ -407,15 +439,15 @@ struct DeviceReduce // Initial value OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( + + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), - initial_value, + detail::InputValue(initial_value), stream, debug_synchronous); } @@ -483,14 +515,15 @@ struct DeviceReduce // The input value type using InputT = cub::detail::value_t; - return DispatchReduce::Dispatch( + auto init_val = Traits::Lowest(); + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + detail::InputValue(init_val), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent stream, debug_synchronous); } @@ -577,14 +610,14 @@ struct DeviceReduce // Initial value OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - return DispatchReduce::Dispatch( + return DispatchReduce>::Dispatch( d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), - initial_value, + detail::InputValue(initial_value), stream, debug_synchronous); } diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index e0470ccb1e..06870bbc24 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -105,15 +105,17 @@ template < typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator typename OffsetT, ///< Signed integer type for global offsets typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT + typename InitValT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) __global__ void DeviceReduceSingleTileKernel( InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output aggregate OffsetT num_items, ///< [in] Total number of input data items ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction + InitValT init) ///< [in] The initial value of the reduction { + using RealInitValT = typename InitValT::value_type; + RealInitValT real_init = init; // Thread block type for reducing input tiles typedef AgentReduce< typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, @@ -130,18 +132,18 @@ __global__ void DeviceReduceSingleTileKernel( if (num_items == 0) { if (threadIdx.x == 0) - *d_out = init; + *d_out = real_init; return; } // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + RealInitValT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( OffsetT(0), num_items); // Output result if (threadIdx.x == 0) - *d_out = reduction_op(init, block_aggregate); + *d_out = reduction_op(real_init, block_aggregate); } @@ -317,6 +319,7 @@ template < typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator typename OffsetT, ///< Signed integer type for global offsets typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename InitValT, typename OutputT = ///< Data type of the output iterator cub::detail::non_void_value_t< OutputIteratorT, @@ -339,7 +342,7 @@ struct DispatchReduce : OutputIteratorT d_out; ///< [out] Pointer to the output aggregate OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction + InitValT init; ///< [in] The initial value of the reduction cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. int ptx_version; ///< [in] PTX version @@ -357,7 +360,7 @@ struct DispatchReduce : OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - OutputT init, + InitValT init, cudaStream_t stream, bool debug_synchronous, int ptx_version) @@ -570,14 +573,14 @@ struct DispatchReduce : { // Small, single tile size return InvokeSingleTile( - DeviceReduceSingleTileKernel); + DeviceReduceSingleTileKernel); } else { // Regular size return InvokePasses( DeviceReduceKernel, - DeviceReduceSingleTileKernel); + DeviceReduceSingleTileKernel); } } @@ -597,7 +600,7 @@ struct DispatchReduce : OutputIteratorT d_out, ///< [out] Pointer to the output aggregate OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction + InitValT init, ///< [in] The initial value of the reduction cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. {