Add benchmark

NVIDIA · Jan 25, 2024 · 1c780c2 · 1c780c2
1 parent b21dcd1
commit 1c780c2
Show file tree

Hide file tree

Showing 7 changed files with 146 additions and 0 deletions.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -84,3 +84,8 @@ ConfigureBench(DYNAMIC_MAP_BENCH
 # - hash function benchmarks ----------------------------------------------------------------------
 ConfigureBench(HASH_BENCH
   hash_bench.cu)
+
+###################################################################################################
+# - distinct_count_estimator benchmarks -----------------------------------------------------------
+ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH
+  distinct_count_estimator_bench.cu)
diff --git a/benchmarks/distinct_count_estimator_bench.cu b/benchmarks/distinct_count_estimator_bench.cu
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+#include <cmath>
+#include <cstddef>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief Computes the exact number of distinct items in the range `[first, last)`
+ *
+ * All items are inserted into a `cuco::static_set` whose capacity argument equals the number of
+ * input items; the resulting set size is returned. An empty range yields `0` without building a
+ * set.
+ *
+ * @note `-1` is passed as the set's empty-key sentinel, so presumably the input must not contain
+ * `-1` -- TODO confirm against `cuco::static_set` requirements.
+ *
+ * @tparam T Item type used for the set's empty-key sentinel
+ * @tparam InputIt Iterator type of the input range
+ *
+ * @param first Beginning of the input range
+ * @param last End of the input range
+ *
+ * @return Number of distinct items in `[first, last)`
+ */
+template <typename T, typename InputIt>
+[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last)
+{
+  // TODO don't use detail ns in user land
+  auto const range_size = cuco::detail::distance(first, last);
+
+  if (range_size != 0) {
+    auto unique_items = cuco::static_set{range_size, cuco::empty_key<T>{-1}};
+    unique_items.insert(first, last);
+    return unique_items.size();
+  }
+
+  // Nothing to insert: an empty range has no distinct items
+  return 0;
+}
+
+/**
+ * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
+ *
+ * Before timing, the estimator is run once over the generated input and its estimate is compared
+ * against `exact_distinct_count`; the resulting relative error is attached to the benchmark state
+ * as the `RelativeError` summary. The timed region then covers `add_async` followed by `estimate`
+ * on the launch stream; the estimator is cleared outside the timed region on every iteration.
+ *
+ * @tparam Estimator Estimator type under test; must expose a `value_type` member
+ * @tparam Dist Key distribution tag forwarded to `dist_from_state`
+ *
+ * @param state nvbench state providing the `NumInputs` axis (defaults to `1ull << 30`)
+ */
+template <typename Estimator, typename Dist>
+void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+{
+  using T = typename Estimator::value_type;
+
+  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);
+
+  thrust::device_vector<T> items(num_items);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());
+
+  state.add_element_count(num_items);
+  state.add_global_memory_reads<T>(num_items, "InputSize");
+
+  // Untimed accuracy pass: compare the estimate against the exact distinct count
+  Estimator estimator;
+  estimator.add(items.begin(), items.end());
+
+  double estimated_cardinality  = estimator.estimate();
+  double const true_cardinality = exact_distinct_count<T>(items.begin(), items.end());
+  // Qualified `std::abs` guarantees the floating-point overload is selected instead of relying on
+  // whichever unqualified `abs` happens to be visible
+  auto const relative_error =
+    std::abs(true_cardinality - estimated_cardinality) / true_cardinality;
+
+  auto& summ = state.add_summary("RelativeError");
+  summ.set_string("hint", "RelErr");
+  summ.set_string("short_name", "RelativeError");
+  summ.set_string("description", "Relative approximation error.");
+  summ.set_float64("value", relative_error);
+
+  // Reset the estimator so the accuracy pass does not leak into the timed runs
+  estimator.clear();
+  // NOTE(review): the `sync` tag is presumably needed because `estimate` returns a host-side value
+  // and therefore blocks on the stream -- confirm against the nvbench exec tag documentation
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               // Clearing happens before the timer starts and is therefore not measured
+               estimator.clear_async({launch.get_stream()});
+
+               timer.start();
+               estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
+               estimated_cardinality = estimator.estimate({launch.get_stream()});
+               timer.stop();
+             });
+}
+
+/**
+ * @brief A benchmark evaluating `cuco::distinct_count_estimator::add` performance
+ *
+ * Only the `add_async` call lies inside the timed region. The estimator is cleared on the launch
+ * stream before the timer starts on every iteration.
+ *
+ * @tparam Estimator Estimator type under test; must expose a `value_type` member
+ * @tparam Dist Key distribution tag forwarded to `dist_from_state`
+ *
+ * @param state nvbench state providing the `NumInputs` axis (defaults to `1ull << 30`)
+ */
+template <typename Estimator, typename Dist>
+void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
+{
+  using T = typename Estimator::value_type;
+
+  auto const input_size = state.get_int64_or_default("NumInputs", 1ull << 30);
+
+  // Throughput metrics depend on the input size only, so report them up front
+  state.add_element_count(input_size);
+  state.add_global_memory_reads<T>(input_size, "InputSize");
+
+  // Fill the device-side input according to the requested distribution
+  thrust::device_vector<T> input(input_size);
+  key_generator generator;
+  generator.generate(dist_from_state<Dist>(state), input.begin(), input.end());
+
+  Estimator sketch;
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    // Reset outside of the timed region
+    sketch.clear_async({launch.get_stream()});
+
+    timer.start();
+    sketch.add_async(input.begin(), input.end(), {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+// Estimator configurations under test: 32-bit items with precisions 8 through 13 and 64-bit items
+// with precisions 11 and 12
+using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 8>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 9>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 10>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 11>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 12>,
+                                           cuco::distinct_count_estimator<nvbench::int32_t, 13>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 11>,
+                                           cuco::distinct_count_estimator<nvbench::int64_t, 12>>;
+
+// Key distribution axis shared by both benchmarks below
+using DISTRIBUTION_RANGE = nvbench::type_list<distribution::unique>;
+
+// End-to-end benchmark (`add_async` followed by `estimate`)
+NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, DISTRIBUTION_RANGE))
+  .set_name("distinct_count_estimator")
+  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
+
+// Insertion-only benchmark (`add_async`)
+NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
+                    NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, DISTRIBUTION_RANGE))
+  .set_name("distinct_count_estimator::add")
+  .set_type_axes_names({"Estimator", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE);
diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp
@@ -21,6 +21,8 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <cuda/std/atomic>  // thread_scope
+
 namespace cuco::benchmark {
 
 template <typename Dist>

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -55,6 +55,7 @@ class hyperloglog {
                                                                    ///< type
 
   using allocator_type         = Allocator;                          ///< Allocator type
+  using value_type             = typename ref_type<>::value_type;    ///< Type of items to count
   using storage_type           = typename ref_type<>::storage_type;  ///< Storage type
   using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
     storage_type>;  ///< Storage allocator type

diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -49,6 +49,7 @@ class hyperloglog_ref {
   static constexpr auto thread_scope = Scope;      ///< CUDA thread scope
   static constexpr auto precision    = Precision;  ///< Precision
 
+  using value_type   = T;                                       ///< Type of items to count
   using storage_type = hyperloglog_dense_registers<Precision>;  ///< Storage type
 
   template <cuda::thread_scope NewScope>

diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
@@ -59,6 +59,7 @@ class distinct_count_estimator {
     cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>;  ///< Non-owning reference
                                                                        ///< type
 
+  using value_type     = typename impl_type::value_type;      ///< Type of items to count
   using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
   using storage_type   = typename impl_type::storage_type;    ///< Storage type
 

diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
@@ -44,6 +44,7 @@ class distinct_count_estimator_ref {
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
   static constexpr auto precision    = impl_type::precision;     ///< Precision
 
+  using value_type   = typename impl_type::value_type;    ///< Type of items to count
   using storage_type = typename impl_type::storage_type;  ///< Storage type
 
   template <cuda::thread_scope NewScope>