From 21d05d73a66c0bc0009ff378beb58fb4f0f2bf2d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:40:14 -0400 Subject: [PATCH] Move apply_boolean_mask benchmark to nvbench (#16616) Reworks the `apply_boolean_mask` benchmark as an nvbench benchmark under the `STREAM_COMPACTION_NVBENCH` module. `cudf::string_view` was added as a type to help measure the performance improvement in a follow-on PR for `apply_boolean_mask` for strings. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16616 --- cpp/benchmarks/CMakeLists.txt | 5 +- .../stream_compaction/apply_boolean_mask.cpp | 138 ++++++------------ 2 files changed, 48 insertions(+), 95 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7f3edfa0a01..99ef9e2976f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -151,14 +151,11 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp) # * transpose benchmark --------------------------------------------------------------------------- ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) -# ################################################################################################## -# * apply_boolean_mask benchmark ------------------------------------------------------------------ -ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp) - # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( STREAM_COMPACTION_NVBENCH + stream_compaction/apply_boolean_mask.cpp stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp stream_compaction/stable_distinct.cpp diff --git 
a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index 492237474ff..fa017ca9e29 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -15,120 +15,76 @@ */ #include +#include #include +#include -#include -#include +#include namespace { -constexpr cudf::size_type hundredM = 1e8; -constexpr cudf::size_type tenM = 1e7; -constexpr cudf::size_type tenK = 1e4; -constexpr cudf::size_type fifty_percent = 50; - -void percent_range(benchmark::internal::Benchmark* b) -{ - b->Unit(benchmark::kMillisecond); - for (int percent = 0; percent <= 100; percent += 10) - b->Args({hundredM, percent}); -} - -void size_range(benchmark::internal::Benchmark* b) -{ - b->Unit(benchmark::kMillisecond); - for (int size = tenK; size <= hundredM; size *= 10) - b->Args({size, fifty_percent}); -} - template -void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) +void calculate_bandwidth(nvbench::state& state) { - cudf::size_type const column_size{static_cast(state.range(0))}; - cudf::size_type const percent_true{static_cast(state.range(1))}; - - float const fraction = percent_true / 100.f; - cudf::size_type const column_size_out = fraction * column_size; - int64_t const mask_size = - sizeof(bool) * column_size + cudf::bitmask_allocation_size_bytes(column_size); - int64_t const validity_bytes_in = (fraction >= 1.0f / 32) - ? 
cudf::bitmask_allocation_size_bytes(column_size) - : 4 * column_size_out; - int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(column_size_out); - int64_t const column_bytes_out = sizeof(T) * column_size_out; + auto const n_rows = static_cast(state.get_int64("rows")); + auto const n_cols = static_cast(state.get_int64("columns")); + auto const percent_true = static_cast(state.get_int64("hits_%")); + + double const fraction = percent_true / 100.0; + cudf::size_type const output_size = fraction * n_rows; + int64_t const mask_size = sizeof(bool) * n_rows + cudf::bitmask_allocation_size_bytes(n_rows); + int64_t const validity_bytes_in = + (fraction >= 1.0 / 32) ? cudf::bitmask_allocation_size_bytes(n_rows) : 4 * output_size; + int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(output_size); + int64_t const column_bytes_out = sizeof(T) * output_size; int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs - int64_t const bytes_read = - (column_bytes_in + validity_bytes_in) * num_columns + // reading columns - mask_size; // reading boolean mask + int64_t const bytes_read = (column_bytes_in + validity_bytes_in) * n_cols + // reading columns + mask_size; // reading boolean mask int64_t const bytes_written = - (column_bytes_out + validity_bytes_out) * num_columns; // writing columns + (column_bytes_out + validity_bytes_out) * n_cols; // writing columns - state.SetItemsProcessed(state.iterations() * column_size * num_columns); - state.SetBytesProcessed(static_cast(state.iterations()) * (bytes_read + bytes_written)); + state.add_element_count(n_rows * n_cols); + state.add_global_memory_reads(bytes_read); + state.add_global_memory_writes(bytes_written); } } // namespace -template -void BM_apply_boolean_mask(benchmark::State& state, cudf::size_type num_columns) +template +void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) { - cudf::size_type const column_size{static_cast(state.range(0))}; 
- cudf::size_type const percent_true{static_cast(state.range(1))}; + auto const n_rows = static_cast(state.get_int64("rows")); + auto const n_cols = static_cast(state.get_int64("columns")); + auto const percent_true = static_cast(state.get_int64("hits_%")); - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto const input_type = cudf::type_to_id(); + data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( + input_type, distribution_id::UNIFORM, 0, 20); - auto source_table = create_random_table( - cycle_dtypes({cudf::type_to_id()}, num_columns), row_count{column_size}, profile); + auto source_table = + create_random_table(cycle_dtypes({input_type}, n_cols), row_count{n_rows}, profile); profile.set_bool_probability_true(percent_true / 100.0); profile.set_null_probability(std::nullopt); // no null mask - auto mask = create_random_column(cudf::type_id::BOOL8, row_count{column_size}, profile); + auto mask = create_random_column(cudf::type_id::BOOL8, row_count{n_rows}, profile); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + calculate_bandwidth(state); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = cudf::apply_boolean_mask(*source_table, mask->view()); - } + state.exec(nvbench::exec_tag::sync, [&source_table, &mask](nvbench::launch& launch) { + cudf::apply_boolean_mask(*source_table, mask->view()); + }); - calculate_bandwidth(state, num_columns); + set_throughputs(state); } -template -class ApplyBooleanMask : public cudf::benchmark { - public: - using TypeParam = T; -}; - -#define ABM_BENCHMARK_DEFINE(name, type, n_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(ApplyBooleanMask, name, type)(::benchmark::State & st) \ - { \ - BM_apply_boolean_mask(st, n_columns); \ - } - -ABM_BENCHMARK_DEFINE(float_1_col, float, 1); 
-ABM_BENCHMARK_DEFINE(float_2_col, float, 2); -ABM_BENCHMARK_DEFINE(float_4_col, float, 4); - -// shmoo 1, 2, 4 column float across percentage true -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(percent_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(percent_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(percent_range); - -// shmoo 1, 2, 4 column float across column sizes with 50% true -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(size_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(size_range); -BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(size_range); - -// spot benchmark other types -ABM_BENCHMARK_DEFINE(int8_1_col, int8_t, 1); -ABM_BENCHMARK_DEFINE(int16_1_col, int16_t, 1); -ABM_BENCHMARK_DEFINE(int32_1_col, int32_t, 1); -ABM_BENCHMARK_DEFINE(int64_1_col, int64_t, 1); -ABM_BENCHMARK_DEFINE(double_1_col, double, 1); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int8_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int16_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int32_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, int64_1_col)->Args({tenM, fifty_percent}); -BENCHMARK_REGISTER_F(ApplyBooleanMask, double_1_col)->Args({tenM, fifty_percent}); +using data_type = nvbench::type_list; +NVBENCH_BENCH_TYPES(apply_boolean_mask_benchmark, NVBENCH_TYPE_AXES(data_type)) + .set_name("apply_boolean_mask") + .set_type_axes_names({"type"}) + .add_int64_axis("columns", {1, 4}) + .add_int64_axis("rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("hits_%", {10, 50, 100});