Skip to content

Commit

Permalink
Add benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
sleeepyjack committed Jan 25, 2024
1 parent b21dcd1 commit 1c780c2
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 0 deletions.
5 changes: 5 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,8 @@ ConfigureBench(DYNAMIC_MAP_BENCH
# - hash function benchmarks ----------------------------------------------------------------------
ConfigureBench(HASH_BENCH
hash_bench.cu)

###################################################################################################
# - distinct_count_estimator benchmarks -----------------------------------------------------------
ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH
distinct_count_estimator_bench.cu)
135 changes: 135 additions & 0 deletions benchmarks/distinct_count_estimator_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/distinct_count_estimator.cuh>
#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.cuh>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

#include <cmath>
#include <cstddef>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
 * @brief Computes the exact number of distinct items in the range `[first, last)`
 *
 * The items are inserted into a `cuco::static_set` constructed with the input size and `-1` as
 * the empty-key sentinel; the resulting set size is returned.
 *
 * NOTE(review): presumably the input must not contain the sentinel value `-1` — confirm.
 *
 * @tparam T Item type used for the set's empty-key sentinel
 * @tparam InputIt Iterator type of the input range
 *
 * @param first Beginning of the input range
 * @param last End of the input range
 *
 * @return Number of distinct items, or `0` if the range is empty
 */
template <typename T, typename InputIt>
[[nodiscard]] std::size_t exact_distinct_count(InputIt first, InputIt last)
{
  // TODO don't use detail ns in user land
  auto const input_size = cuco::detail::distance(first, last);

  // Only build the reference set when there is something to insert.
  if (input_size != 0) {
    auto reference_set = cuco::static_set{input_size, cuco::empty_key<T>{-1}};
    reference_set.insert(first, last);
    return reference_set.size();
  }

  return 0;
}

/**
 * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
 *
 * Generates `NumInputs` items (default `1 << 30`) following the `Dist` distribution, records the
 * relative error of the estimate against an exact distinct count as the "RelativeError" summary,
 * and then times `add_async` followed by `estimate` on the launch stream.
 *
 * @tparam Estimator Estimator type under test; must expose a `value_type` member
 * @tparam Dist Key distribution tag forwarded to `dist_from_state`
 *
 * @param state The nvbench state used to read axis values, record summaries and run the
 * measurement
 */
template <typename Estimator, typename Dist>
void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
{
  using T = typename Estimator::value_type;

  auto const num_items = state.get_int64_or_default("NumInputs", 1ull << 30);

  thrust::device_vector<T> items(num_items);

  key_generator gen;
  gen.generate(dist_from_state<Dist>(state), items.begin(), items.end());

  state.add_element_count(num_items);
  state.add_global_memory_reads<T>(num_items, "InputSize");

  // Accuracy is evaluated once, outside of the timed region.
  Estimator estimator;
  estimator.add(items.begin(), items.end());

  double estimated_cardinality = estimator.estimate();
  auto const true_cardinality =
    static_cast<double>(exact_distinct_count<T>(items.begin(), items.end()));
  // Use `std::abs` so the floating-point overload is always selected; an unqualified `abs` may
  // resolve to the C `int abs(int)` and silently truncate the error.
  // NOTE(review): the result is not finite when `true_cardinality` is 0 (empty input).
  auto const relative_error =
    std::abs(true_cardinality - estimated_cardinality) / true_cardinality;

  auto& summ = state.add_summary("RelativeError");
  summ.set_string("hint", "RelErr");
  summ.set_string("short_name", "RelativeError");
  summ.set_string("description", "Relative approximation error.");
  summ.set_float64("value", relative_error);

  estimator.clear();
  // NOTE(review): `exec_tag::sync` is presumably required because `estimate` synchronizes with
  // the host — confirm against the nvbench and cuco documentation.
  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
             [&](nvbench::launch& launch, auto& timer) {
               // Reset the estimator before the timer starts so each run begins empty.
               estimator.clear_async({launch.get_stream()});

               timer.start();
               estimator.add_async(items.begin(), items.end(), {launch.get_stream()});
               estimated_cardinality = estimator.estimate({launch.get_stream()});
               timer.stop();
             });
}

/**
 * @brief A benchmark evaluating `cuco::distinct_count_estimator::add` performance
 *
 * Generates `NumInputs` items (default `1 << 30`) following the `Dist` distribution and times
 * only the `add_async` call; the estimator is cleared on the launch stream before the timer
 * starts.
 *
 * @tparam Estimator Estimator type under test; must expose a `value_type` member
 * @tparam Dist Key distribution tag forwarded to `dist_from_state`
 *
 * @param state The nvbench state used to read axis values and run the measurement
 */
template <typename Estimator, typename Dist>
void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<Estimator, Dist>)
{
  using item_type = typename Estimator::value_type;

  auto const input_size = state.get_int64_or_default("NumInputs", 1ull << 30);

  // Device-side input, filled according to the requested distribution.
  thrust::device_vector<item_type> input(input_size);
  key_generator generator;
  generator.generate(dist_from_state<Dist>(state), input.begin(), input.end());

  state.add_element_count(input_size);
  state.add_global_memory_reads<item_type>(input_size, "InputSize");

  Estimator sketch;
  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
    auto const& stream = launch.get_stream();

    // Resetting the estimator is excluded from the measurement.
    sketch.clear_async({stream});

    timer.start();
    sketch.add_async(input.begin(), input.end(), {stream});
    timer.stop();
  });
}

// Estimator instantiations swept by the benchmarks in this file: `nvbench::int32_t` items with
// the second template argument ranging over 8..13, and `nvbench::int64_t` items with 11 and 12.
// NOTE(review): the second template argument is presumably the estimator's precision — confirm
// against `cuco::distinct_count_estimator`.
using ESTIMATOR_RANGE = nvbench::type_list<cuco::distinct_count_estimator<nvbench::int32_t, 8>,
cuco::distinct_count_estimator<nvbench::int32_t, 9>,
cuco::distinct_count_estimator<nvbench::int32_t, 10>,
cuco::distinct_count_estimator<nvbench::int32_t, 11>,
cuco::distinct_count_estimator<nvbench::int32_t, 12>,
cuco::distinct_count_estimator<nvbench::int32_t, 13>,
cuco::distinct_count_estimator<nvbench::int64_t, 11>,
cuco::distinct_count_estimator<nvbench::int64_t, 12>>;

// Registers the end-to-end benchmark for every estimator in `ESTIMATOR_RANGE`, using the
// `distribution::unique` key distribution only.
NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
.set_name("distinct_count_estimator")
.set_type_axes_names({"Estimator", "Distribution"})
.set_max_noise(defaults::MAX_NOISE);

// Registers the `add`-only benchmark over the same estimator and distribution axes.
NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
NVBENCH_TYPE_AXES(ESTIMATOR_RANGE, nvbench::type_list<distribution::unique>))
.set_name("distinct_count_estimator::add")
.set_type_axes_names({"Estimator", "Distribution"})
.set_max_noise(defaults::MAX_NOISE);
2 changes: 2 additions & 0 deletions benchmarks/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

#include <nvbench/nvbench.cuh>

#include <cuda/std/atomic> // thread_scope

namespace cuco::benchmark {

template <typename Dist>
Expand Down
1 change: 1 addition & 0 deletions include/cuco/detail/hyperloglog/hyperloglog.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class hyperloglog {
///< type

using allocator_type = Allocator; ///< Allocator type
using value_type = typename ref_type<>::value_type; ///< Type of items to count
using storage_type = typename ref_type<>::storage_type; ///< Storage type
using storage_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
storage_type>; ///< Storage allocator type
Expand Down
1 change: 1 addition & 0 deletions include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class hyperloglog_ref {
static constexpr auto thread_scope = Scope; ///< CUDA thread scope
static constexpr auto precision = Precision; ///< Precision

using value_type = T; ///< Type of items to count
using storage_type = hyperloglog_dense_registers<Precision>; ///< Storage type

template <cuda::thread_scope NewScope>
Expand Down
1 change: 1 addition & 0 deletions include/cuco/distinct_count_estimator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class distinct_count_estimator {
cuco::distinct_count_estimator_ref<T, Precision, NewScope, Hash>; ///< Non-owning reference
///< type

using value_type = typename impl_type::value_type; ///< Type of items to count
using allocator_type = typename impl_type::allocator_type; ///< Allocator type
using storage_type = typename impl_type::storage_type; ///< Storage type

Expand Down
1 change: 1 addition & 0 deletions include/cuco/distinct_count_estimator_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class distinct_count_estimator_ref {
static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope
static constexpr auto precision = impl_type::precision; ///< Precision

using value_type = typename impl_type::value_type; ///< Type of items to count
using storage_type = typename impl_type::storage_type; ///< Storage type

template <cuda::thread_scope NewScope>
Expand Down

0 comments on commit 1c780c2

Please sign in to comment.