Add more CUB transform benchmarks #2906

Merged · 4 commits · Nov 26, 2024
cub/benchmarks/bench/transform/babelstream1.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/babelstream2.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/babelstream3.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/complex_cmp.cu (new file, 48 additions)
@@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark reads from overlapping memory regions and is compute intensive.

template <typename OffsetT>
static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>)
{
  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
  thrust::device_vector<complex> in = generate(n);
  thrust::device_vector<bool> out(n - 1);

  state.add_element_count(n);
  state.add_global_memory_reads<complex>(n);
  state.add_global_memory_writes<bool>(n);

  // the complex comparison needs lots of compute, and the transform reads from overlapping input
  using compare_op = less_t;
  bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{});
}

// TODO(bgruber): hardcode OffsetT?
NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types))
  .set_name("compare_complex")
  .set_type_axes_names({"OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
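
For reference, here is a standalone sketch (not part of this PR) of the operation compare_complex measures: out[i] = (in[i] < in[i + 1]), where both input ranges are views of the same buffer shifted by one element. It uses thrust::transform purely for illustration, and a magnitude-based comparison is assumed as a stand-in for the helper's less_t on complex values; the benchmark itself dispatches through bench_transform to CUB's transform implementation.

// Illustration only: comparing shifted, overlapping views of one buffer.
// less_abs_t is an assumed stand-in for the benchmark helper's less_t.
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cuda/std/complex>

struct less_abs_t
{
  __host__ __device__ bool operator()(cuda::std::complex<float> a, cuda::std::complex<float> b) const
  {
    return abs(a) < abs(b); // each abs() costs a sqrt, so this is compute heavy
  }
};

int main()
{
  const int n = 1 << 16;
  thrust::device_vector<cuda::std::complex<float>> in(n);
  thrust::device_vector<bool> out(n - 1);
  // both inputs alias `in`, offset by one element, just like the benchmark's tuple
  thrust::transform(in.begin(), in.end() - 1, in.begin() + 1, out.begin(), less_abs_t{});
  return 0;
}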
cub/benchmarks/bench/transform/fib.cu (new file, 76 additions)
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark is compute intensive and has diverging threads

template <class IndexT, class OutputT>
struct fib_t
{
  __device__ OutputT operator()(IndexT n)
  {
    OutputT t1 = 0;
    OutputT t2 = 1;

    if (n < 1)
    {
      return t1;
    }
    if (n == 1)
    {
      return t1;
    }
    if (n == 2)
    {
      return t2;
    }
    for (IndexT i = 3; i <= n; ++i)
    {
      const auto next = t1 + t2;
      t1 = t2;
      t2 = next;
    }
    return t2;
  }
};

template <typename OffsetT>
static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>)
{
  using index_t = int64_t;
  using output_t = uint32_t;
  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
  thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42});
  thrust::device_vector<output_t> out(n);

  state.add_element_count(n);
  state.add_global_memory_reads<index_t>(n);
  state.add_global_memory_writes<output_t>(n);

  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<index_t, output_t>{});
}

NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types))
  .set_name("fibonacci")
  .set_type_axes_names({"OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
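
A quick host-side mirror of fib_t (illustration only, not part of the PR) shows the convention the functor implements: 0 for n <= 1, then 1, 1, 2, 3, 5, ... from n == 2 onward, with uint32_t arithmetic wrapping on overflow. Because the loop trip count grows with n, neighboring threads drawing different inputs from [0, 42] diverge, which is exactly what this benchmark stresses.

// Host mirror of fib_t, useful for eyeballing the sequence it produces.
#include <cstdint>
#include <cstdio>

static std::uint32_t fib_host(std::int64_t n)
{
  std::uint32_t t1 = 0;
  std::uint32_t t2 = 1;
  if (n <= 1) // fib_t returns t1 for both n < 1 and n == 1
  {
    return t1;
  }
  if (n == 2)
  {
    return t2;
  }
  for (std::int64_t i = 3; i <= n; ++i)
  {
    const auto next = t1 + t2; // unsigned arithmetic wraps around on overflow
    t1 = t2;
    t2 = next;
  }
  return t2;
}

int main()
{
  for (int n = 0; n <= 8; ++n)
  {
    std::printf("fib(%d) = %u\n", n, fib_host(n)); // prints 0 0 1 1 2 3 5 8 13
  }
  return 0;
}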
cub/benchmarks/bench/transform/heavy.cu (new file, 79 additions)
@@ -0,0 +1,79 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
struct heavy_functor
{
  // we need to use an unsigned type so overflow in arithmetic wraps around
  __device__ std::uint32_t operator()(std::uint32_t data) const
  {
    std::uint32_t reg[N];
    reg[0] = data;
    for (int i = 1; i < N; ++i)
    {
      reg[i] = reg[i - 1] * reg[i - 1] + 1;
    }
    for (int i = 0; i < N; ++i)
    {
      reg[i] = (reg[i] * reg[i]) % 19;
    }
    for (int i = 0; i < N; ++i)
    {
      reg[i] = reg[N - i - 1] * reg[i];
    }
    std::uint32_t x = 0;
    for (int i = 0; i < N; ++i)
    {
      x += reg[i];
    }
    return x;
  }
};

template <typename Heaviness>
static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
{
  using value_t = std::uint32_t;
  using offset_t = int;
  const auto n = narrow<offset_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<value_t> in = generate(n);
  thrust::device_vector<value_t> out(n);

  state.add_element_count(n);
  state.add_global_memory_reads<value_t>(n);
  state.add_global_memory_writes<value_t>(n);

  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
}

template <int I>
using ic = ::cuda::std::integral_constant<int, I>;

NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
  .set_name("heavy")
  .set_type_axes_names({"Heaviness{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
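
A host mirror of heavy_functor<N> (illustration only) makes the register pressure easier to see: all N elements of reg[] stay live across the three passes, so the device compiler must keep roughly N values in registers, and the ic<32> through ic<256> axis sweeps from moderate pressure to heavy spilling.

// Host mirror of heavy_functor<N>; on the device, reg[] lives in registers.
#include <cstdint>
#include <cstdio>

template <int N>
std::uint32_t heavy_host(std::uint32_t data)
{
  std::uint32_t reg[N];
  reg[0] = data;
  for (int i = 1; i < N; ++i)
  {
    reg[i] = reg[i - 1] * reg[i - 1] + 1; // each value depends on the previous one
  }
  for (int i = 0; i < N; ++i)
  {
    reg[i] = (reg[i] * reg[i]) % 19;
  }
  for (int i = 0; i < N; ++i)
  {
    reg[i] = reg[N - i - 1] * reg[i]; // reversed access keeps all N values live
  }
  std::uint32_t x = 0;
  for (int i = 0; i < N; ++i)
  {
    x += reg[i];
  }
  return x;
}

int main()
{
  std::printf("%u\n", heavy_host<32>(42u));
  return 0;
}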
cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh (13 additions)
@@ -32,6 +32,19 @@
 NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");

+template <typename T, T I>
+struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
+{
+  static std::string input_string()
+  {
+    return std::to_string(I);
+  }
+  static std::string description()
+  {
+    return "integral_constant<" + type_strings<T>::description() + ", " + std::to_string(I) + ">";
+  }
+};
+
 namespace detail
 {
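
The new specialization gives integral_constant types readable labels on nvbench axes, which is what lets heavy.cu name its variants "32", "64", "128", and "256" on the Heaviness{ct} axis. A small standalone check (illustration only; assumes nvbench and nvbench_helper.cuh are on the include path):

// Illustration only: what the specialization renders for heavy.cu's axis values.
#include <nvbench_helper.cuh>

#include <cassert>
#include <string>

int main()
{
  using c32 = ::cuda::std::integral_constant<int, 32>;
  // the short form becomes the axis value label, e.g. "32"
  assert(nvbench::type_strings<c32>::input_string() == "32");
  // the long form embeds type_strings<int>::description() and the value
  assert(nvbench::type_strings<c32>::description().find("32") != std::string::npos);
  return 0;
}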