Add more CUB transform benchmarks #2906

Merged · 4 commits · Nov 26, 2024
cub/benchmarks/bench/transform/babelstream1.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/babelstream2.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/babelstream3.cu (1 addition, 1 deletion)
@@ -15,7 +15,7 @@
 #  endif
 #endif

-#include "babelstream.h"
+#include "common.h"

 #if !TUNE_BASE
 #  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
cub/benchmarks/bench/transform/complex_cmp.cu (new file, 48 additions)
@@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark reads from overlapping memory regions and is compute intensive.

template <typename OffsetT>
static void compare_complex(nvbench::state& state, nvbench::type_list<OffsetT>)
{
  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
  thrust::device_vector<complex> in = generate(n);
  thrust::device_vector<bool> out(n - 1);

  state.add_element_count(n);
  state.add_global_memory_reads<complex>(n);
  state.add_global_memory_writes<bool>(n);

  // the complex comparison needs lots of compute, and the transform reads from overlapping input
  using compare_op = less_t;
  bench_transform(state, ::cuda::std::tuple{in.begin(), in.begin() + 1}, out.begin(), n - 1, compare_op{});
}

// TODO(bgruber): hardcode OffsetT?
NVBENCH_BENCH_TYPES(compare_complex, NVBENCH_TYPE_AXES(offset_types))
  .set_name("compare_complex")
  .set_type_axes_names({"OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
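
For reference, here is a standalone sketch (not part of this PR) of the operation compare_complex measures: out[i] = (in[i] < in[i + 1]), where both input ranges are views of the same buffer shifted by one element. It uses thrust::transform purely for illustration, and a magnitude-based comparison is assumed as a stand-in for the helper's less_t on complex values; the benchmark itself dispatches through bench_transform to CUB's transform implementation.

// Illustration only: comparing shifted, overlapping views of one buffer.
// less_abs_t is an assumed stand-in for the benchmark helper's less_t.
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cuda/std/complex>

struct less_abs_t
{
  __host__ __device__ bool operator()(cuda::std::complex<float> a, cuda::std::complex<float> b) const
  {
    return abs(a) < abs(b); // each abs() costs a sqrt, so this is compute heavy
  }
};

int main()
{
  const int n = 1 << 16;
  thrust::device_vector<cuda::std::complex<float>> in(n);
  thrust::device_vector<bool> out(n - 1);
  // both inputs alias `in`, offset by one element, just like the benchmark's tuple
  thrust::transform(in.begin(), in.end() - 1, in.begin() + 1, out.begin(), less_abs_t{});
  return 0;
}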
cub/benchmarks/bench/transform/fib.cu (new file, 76 additions)
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark is compute intensive and has diverging threads

template <class IndexT, class OutputT>
struct fib_t
{
  __device__ OutputT operator()(IndexT n)
  {
    OutputT t1 = 0;
    OutputT t2 = 1;

    if (n < 1)
    {
      return t1;
    }
    if (n == 1)
    {
      return t1;
    }
    if (n == 2)
    {
      return t2;
    }
    for (IndexT i = 3; i <= n; ++i)
    {
      const auto next = t1 + t2;
      t1 = t2;
      t2 = next;
    }
    return t2;
  }
};

template <typename OffsetT>
static void fibonacci(nvbench::state& state, nvbench::type_list<OffsetT>)
{
  using index_t = int64_t;
  using output_t = uint32_t;
  const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
  thrust::device_vector<index_t> in = generate(n, bit_entropy::_1_000, index_t{0}, index_t{42});
  thrust::device_vector<output_t> out(n);

  state.add_element_count(n);
  state.add_global_memory_reads<index_t>(n);
  state.add_global_memory_writes<output_t>(n);

  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, fib_t<index_t, output_t>{});
}

NVBENCH_BENCH_TYPES(fibonacci, NVBENCH_TYPE_AXES(offset_types))
  .set_name("fibonacci")
  .set_type_axes_names({"OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
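
A quick host-side mirror of fib_t (illustration only, not part of the PR) shows the convention the functor implements: 0 for n <= 1, then 1, 1, 2, 3, 5, ... from n == 2 onward, with uint32_t arithmetic wrapping on overflow. Because the loop trip count grows with n, neighboring threads drawing different inputs from [0, 42] diverge, which is exactly what this benchmark stresses.

// Host mirror of fib_t, useful for eyeballing the sequence it produces.
#include <cstdint>
#include <cstdio>

static std::uint32_t fib_host(std::int64_t n)
{
  std::uint32_t t1 = 0;
  std::uint32_t t2 = 1;
  if (n <= 1) // fib_t returns t1 for both n < 1 and n == 1
  {
    return t1;
  }
  if (n == 2)
  {
    return t2;
  }
  for (std::int64_t i = 3; i <= n; ++i)
  {
    const auto next = t1 + t2; // unsigned arithmetic wraps around on overflow
    t1 = t2;
    t2 = next;
  }
  return t2;
}

int main()
{
  for (int n = 0; n <= 8; ++n)
  {
    std::printf("fib(%d) = %u\n", n, fib_host(n)); // prints 0 0 1 1 2 3 5 8 13
  }
  return 0;
}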
cub/benchmarks/bench/transform/heavy.cu (new file, 79 additions)
@@ -0,0 +1,79 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
#  if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
#    error "Cannot compile the ublkcp algorithm (TUNE_ALGORITHM == 1) below sm90"
#  endif

#  if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
#    error "Cannot tune for the ublkcp algorithm, which is not provided by CUB (old CTK?)"
#  endif
#endif

#include "common.h"

#if !TUNE_BASE
#  if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
#    error "This benchmark does not support being compiled for multiple architectures"
#  endif
#endif

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
struct heavy_functor
{
  // we need to use an unsigned type so overflow in arithmetic wraps around
  __device__ std::uint32_t operator()(std::uint32_t data) const
  {
    std::uint32_t reg[N];
    reg[0] = data;
    for (int i = 1; i < N; ++i)
    {
      reg[i] = reg[i - 1] * reg[i - 1] + 1;
    }
    for (int i = 0; i < N; ++i)
    {
      reg[i] = (reg[i] * reg[i]) % 19;
    }
    for (int i = 0; i < N; ++i)
    {
      reg[i] = reg[N - i - 1] * reg[i];
    }
    std::uint32_t x = 0;
    for (int i = 0; i < N; ++i)
    {
      x += reg[i];
    }
    return x;
  }
};

template <typename Heaviness>
static void heavy(nvbench::state& state, nvbench::type_list<Heaviness>)
{
  using value_t = std::uint32_t;
  using offset_t = int;
  const auto n = narrow<offset_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<value_t> in = generate(n);
  thrust::device_vector<value_t> out(n);

  state.add_element_count(n);
  state.add_global_memory_reads<value_t>(n);
  state.add_global_memory_writes<value_t>(n);

  bench_transform(state, ::cuda::std::tuple{in.begin()}, out.begin(), n, heavy_functor<Heaviness::value>{});
}

template <int I>
using ic = ::cuda::std::integral_constant<int, I>;

NVBENCH_BENCH_TYPES(heavy, NVBENCH_TYPE_AXES(nvbench::type_list<ic<32>, ic<64>, ic<128>, ic<256>>))
  .set_name("heavy")
  .set_type_axes_names({"Heaviness{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
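
A host mirror of heavy_functor<N> (illustration only) makes the register pressure easier to see: all N elements of reg[] stay live across the three passes, so the device compiler must keep roughly N values in registers, and the ic<32> through ic<256> axis sweeps from moderate pressure to heavy spilling.

// Host mirror of heavy_functor<N>; on the device, reg[] lives in registers.
#include <cstdint>
#include <cstdio>

template <int N>
std::uint32_t heavy_host(std::uint32_t data)
{
  std::uint32_t reg[N];
  reg[0] = data;
  for (int i = 1; i < N; ++i)
  {
    reg[i] = reg[i - 1] * reg[i - 1] + 1; // each value depends on the previous one
  }
  for (int i = 0; i < N; ++i)
  {
    reg[i] = (reg[i] * reg[i]) % 19;
  }
  for (int i = 0; i < N; ++i)
  {
    reg[i] = reg[N - i - 1] * reg[i]; // reversed access keeps all N values live
  }
  std::uint32_t x = 0;
  for (int i = 0; i < N; ++i)
  {
    x += reg[i];
  }
  return x;
}

int main()
{
  std::printf("%u\n", heavy_host<32>(42u));
  return 0;
}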
cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh (13 additions)
@@ -32,6 +32,19 @@
 NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::false_type, "false", "false_type");
 NVBENCH_DECLARE_TYPE_STRINGS(::cuda::std::true_type, "true", "true_type");

+template <typename T, T I>
+struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
+{
+  static std::string input_string()
+  {
+    return std::to_string(I);
+  }
+  static std::string description()
+  {
+    return "integral_constant<" + type_strings<T>::description() + ", " + std::to_string(I) + ">";
+  }
+};
+
 namespace detail
 {
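
The new specialization gives integral_constant types readable labels on nvbench axes, which is what lets heavy.cu name its variants "32", "64", "128", and "256" on the Heaviness{ct} axis. A small standalone check (illustration only; assumes nvbench and nvbench_helper.cuh are on the include path):

// Illustration only: what the specialization renders for heavy.cu's axis values.
#include <nvbench_helper.cuh>

#include <cassert>
#include <string>

int main()
{
  using c32 = ::cuda::std::integral_constant<int, 32>;
  // the short form becomes the axis value label, e.g. "32"
  assert(nvbench::type_strings<c32>::input_string() == "32");
  // the long form embeds type_strings<int>::description() and the value
  assert(nvbench::type_strings<c32>::description().find("32") != std::string::npos);
  return 0;
}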