From 976a746edee29434b036bc7b09a1f96ec7001c33 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 9 Mar 2022 09:41:17 -0800 Subject: [PATCH 001/152] dynamic map erase, still needs work --- benchmarks/hash_table/dynamic_map_bench.cu | 2 + benchmarks/hash_table/static_map_bench.cu | 119 ++++++++++++++++++++ include/cuco/detail/dynamic_map.inl | 48 ++++++++ include/cuco/detail/dynamic_map_kernels.cuh | 61 ++++++++++ include/cuco/detail/static_map.inl | 3 + include/cuco/dynamic_map.cuh | 10 ++ include/cuco/static_map.cuh | 5 + tests/CMakeLists.txt | 3 +- tests/dynamic_map/erase_test.cu | 90 +++++++++++++++ tests/static_map/erase_test.cu | 2 + 10 files changed, 342 insertions(+), 1 deletion(-) create mode 100644 tests/dynamic_map/erase_test.cu diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 90446ea57..d42aae755 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -147,6 +147,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Apply(gen_final_size) ->UseManualTime(); +/* BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -196,3 +197,4 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); +*/ \ No newline at end of file diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index e2b15b05e..63c2976d4 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -155,6 +155,49 @@ static void BM_static_map_search_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_static_map_search_none(::benchmark::State& state) +{ + using map_type = cuco::static_map; + + std::size_t num_keys = state.range(0); + float occupancy = 
state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + map_type map{size, -1, -1}; + auto view = map.get_device_mutable_view(); + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + // diff keys + for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys; + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); + + map.insert(d_pairs.begin(), d_pairs.end()); + + for (auto _ : state) { + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + template static void BM_static_map_erase_all(::benchmark::State& state) { @@ -200,6 +243,82 @@ static void BM_static_map_erase_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_static_map_erase_none(::benchmark::State& state) +{ + using map_type = cuco::static_map; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + map_type map{size, -1, -1}; + auto view = map.get_device_mutable_view(); + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + + // diff keys + for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys; + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> 
d_pairs(h_pairs); + + for (auto _ : state) { + //state.ResumeTiming(); + state.PauseTiming(); + map.insert(d_pairs.begin(), d_pairs.end()); + state.ResumeTiming(); + + map.erase(d_keys.begin(), d_keys.end()); + + //state.PauseTiming(); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + +/* +BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); +*/ + + +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); +/* BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0c1d2e377..28857f547 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -38,6 +38,8 @@ dynamic_map::dynamic_map( submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); + submap_num_successes_.push_back(submaps_[0]->get_num_successes()); + CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, 
sizeof(atomic_ctr_type))); } // namespace cuco @@ -69,6 +71,8 @@ void dynamic_map::reserve(std::size_t n) alloc_)); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); + + submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); capacity_ *= 2; } @@ -128,6 +132,50 @@ void dynamic_map::insert(InputIt first, } } +template +template +void dynamic_map::erase(InputIt first, + InputIt last, + Hash hash, + KeyEqual key_equal) +{ + std::size_t num_keys = std::distance(first, last); + + auto const block_size = 128; + auto const stride = 1; + auto const tile_size = 4; + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + *num_successes_ = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + + // TODO: hacky, improve this + thrust::device_vector d_submap_num_successes(submap_num_successes_); + + detail::erase> + <<>>(first, + first + num_keys, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + num_successes_, + d_submap_num_successes.data().get(), + submaps_.size(), + hash, + key_equal); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); + size_ -= h_num_successes; + + for(int i = 0; i < submaps_.size(); ++i) { + //std::size_t h_num_submap_successes = submap_num_successes_[i]->load(cuda::std::memory_order_relaxed); + //submaps_[i]->size_ -= h_num_submap_successes; + } + +} + template template void dynamic_map::find( diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index f261b49aa..3bc8a0d8a 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -186,6 +186,67 @@ __global__ void insert(InputIt first, if 
(threadIdx.x == 0) { *num_successes += block_num_successes; } } +template +__global__ void erase(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, + atomicT* num_successes, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // TODO: hack for up to 4 submaps, make this better + __shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; + + std::size_t thread_num_successes = 0; + std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto it = first + tid / tile_size; + + while (it < last) { + auto key = *(first + key_idx); + auto erased = false; + + // manually check for duplicates in those submaps we are not inserting into + int i; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, key, hash, key_equal); + if (erased) { break; } + } + if (erased && tile.thread_rank() == 0) { + thread_num_successes++; + //submap_thread_num_successes[i]++; + } + + it += (gridDim.x * blockDim.x) / tile_size; + } + + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { *num_successes += block_num_successes; } + + // update submap thread counts + for(int i = 0; i < num_submaps; ++i) { + //std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); + //if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; } + } +} + /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
* diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 09e9d05dd..b451f9089 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -650,6 +650,9 @@ __device__ bool static_map::device_mutable_view::e bool status; if (g.thread_rank() == src_lane) { + // only fetch value once necessary + auto existing_value = current_slot->second.load(cuda::std::memory_order_relaxed); + if constexpr (cuco::detail::is_packable()) { auto slot = reinterpret_cast< cuda::atomic::packed_type>*>( diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 866f94819..1e347239b 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -171,6 +171,11 @@ class dynamic_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + + template , + typename KeyEqual = thrust::equal_to> + void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
@@ -251,6 +256,9 @@ class dynamic_map { private: key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot + + // TODO: initialize this + key_type erased_key_sentinel_{}; std::size_t size_{}; ///< Number of keys in the map std::size_t capacity_{}; ///< Maximum number of keys that can be inserted float max_load_factor_{}; ///< Max load factor before capacity growth @@ -263,6 +271,8 @@ class dynamic_map { std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + + std::vector submap_num_successes_; }; } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 1daad9965..3ef487b7c 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1414,6 +1414,11 @@ class static_map { sentinel::erased_key{erased_key_sentinel_}); } + atomic_ctr_type* get_num_successes() const noexcept + { + return num_successes_; + } + private: pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2d1d25526..a7b40300c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -68,7 +68,8 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/unique_sequence_test.cu) + dynamic_map/unique_sequence_test.cu + dynamic_map/erase_test.cu) ################################################################################################### # - static_multimap tests 
------------------------------------------------------------------------- diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu new file mode 100644 index 000000000..c00013961 --- /dev/null +++ b/tests/dynamic_map/erase_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + + +TEMPLATE_TEST_CASE_SIG( + "erase key", "", ((typename T), T), (int32_t)) +{ + using Key = T; + using Value = T; + + unsigned long num_keys = 1'000'000; + cuco::dynamic_map map{num_keys * 2, -1, -1}; + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector d_keys_exist(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + SECTION( + "Check basic insert/erase") + { + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.erase(d_keys.begin(), d_keys.end()); + + // delete decreases count correctly + REQUIRE(map.get_size() == 0); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + // keys were actaully deleted + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.end(), + [] __device__(const bool 
key_found) { return key_found; })); + + printf("cow\n"); + + // ensures that map is reusing deleted slots + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + printf("cow2\n"); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + // erase can act selectively + map.erase(d_keys.begin(), d_keys.begin() + num_keys/2); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.begin() + num_keys/2, + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2, + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + } +} \ No newline at end of file diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index b5641539c..a4b956305 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -56,10 +56,12 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.erase(d_keys.begin(), d_keys.end()); + // delete decreases count correctly REQUIRE(map.get_size() == 0); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + // keys were actaully deleted REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); From eead8b8c9d08ae9cc6c1ca0bd2648e46af45b271 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 9 Mar 2022 11:43:40 -0800 Subject: [PATCH 002/152] minor clarity changes --- benchmarks/hash_table/static_map_bench.cu | 44 ++++------------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 63c2976d4..363899a46 100644 --- 
a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -293,33 +293,28 @@ static void BM_static_map_erase_none(::benchmark::State& state) } /* -BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -*/ - -BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); +*/ -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); -/* -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) ->UseManualTime(); @@ -351,28 +346,3 @@ BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Apply(generate_size_and_occupancy) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, 
int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); From 20ac7a33c5136f29a4b24826fab0f4fb49404b4a Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Mon, 4 Apr 2022 18:22:55 -0700 Subject: [PATCH 003/152] erase bug fix --- include/cuco/detail/dynamic_map.inl | 47 +++++++++++++++++---- include/cuco/detail/dynamic_map_kernels.cuh | 10 ++--- include/cuco/detail/static_map.inl | 5 +-- include/cuco/dynamic_map.cuh | 8 +++- tests/CMakeLists.txt | 2 +- tests/dynamic_map/erase_test.cu | 8 ++-- tests/static_map/erase_test.cu | 2 - 7 files changed, 56 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 28857f547..f5625bd72 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -17,13 +17,13 @@ namespace cuco { template -dynamic_map::dynamic_map( - std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) - : empty_key_sentinel_(empty_key_sentinel.value), - empty_value_sentinel_(empty_value_sentinel.value), 
+dynamic_map::dynamic_map(std::size_t initial_capacity, + Key empty_key_sentinel, + Value empty_value_sentinel, + Allocator const& alloc) + : empty_key_sentinel_(empty_key_sentinel), + empty_value_sentinel_(empty_value_sentinel), + erased_key_sentinel_(empty_value_sentinel), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -41,7 +41,37 @@ dynamic_map::dynamic_map( submap_num_successes_.push_back(submaps_[0]->get_num_successes()); CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); -} // namespace cuco +} + +template +dynamic_map::dynamic_map(std::size_t initial_capacity, + Key empty_key_sentinel, + Value empty_value_sentinel, + Key erased_key_sentinel, + Allocator const& alloc) + : empty_key_sentinel_(empty_key_sentinel), + empty_value_sentinel_(empty_value_sentinel), + erased_key_sentinel_(erased_key_sentinel), + size_(0), + capacity_(initial_capacity), + min_insert_size_(1E4), + max_load_factor_(0.60), + alloc_{alloc} +{ + submaps_.push_back(std::make_unique>( + initial_capacity, + sentinel::empty_key{empty_key_sentinel}, + sentinel::empty_value{empty_value_sentinel}, + sentinel::erased_key{erased_key_sentinel}, + alloc)); + submap_views_.push_back(submaps_[0]->get_device_view()); + submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); + + submap_num_successes_.push_back(submaps_[0]->get_num_successes()); + + CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); +} + template dynamic_map::~dynamic_map() @@ -123,6 +153,7 @@ void dynamic_map::insert(InputIt first, CUCO_CUDA_TRY(cudaDeviceSynchronize()); std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); + submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 3bc8a0d8a..46fae21b5 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ 
b/include/cuco/detail/dynamic_map_kernels.cuh @@ -209,24 +209,22 @@ __global__ void erase(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; // TODO: hack for up to 4 submaps, make this better - __shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; + //__shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; std::size_t thread_num_successes = 0; std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; while (it < last) { - auto key = *(first + key_idx); - auto erased = false; + auto erased = false; // manually check for duplicates in those submaps we are not inserting into int i; for (i = 0; i < num_submaps; ++i) { - erased = submap_mutable_views[i].erase(tile, key, hash, key_equal); + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); if (erased) { break; } } if (erased && tile.thread_rank() == 0) { diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index b451f9089..42c163550 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -650,9 +650,6 @@ __device__ bool static_map::device_mutable_view::e bool status; if (g.thread_rank() == src_lane) { - // only fetch value once necessary - auto existing_value = current_slot->second.load(cuda::std::memory_order_relaxed); - if constexpr (cuco::detail::is_packable()) { auto slot = reinterpret_cast< cuda::atomic::packed_type>*>( @@ -855,4 +852,4 @@ static_map::device_view::contains(CG const& g, current_slot = next_slot(g, current_slot); } } -} // namespace cuco +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 1e347239b..649eb3d01 100644 --- a/include/cuco/dynamic_map.cuh +++ 
b/include/cuco/dynamic_map.cuh @@ -136,6 +136,12 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}); + + dynamic_map(std::size_t initial_capacity, + Key empty_key_sentinel, + Value empty_value_sentinel, + Key erased_key_sentinel, + Allocator const& alloc = Allocator{}); /** * @brief Destroy the map and frees its contents @@ -256,9 +262,9 @@ class dynamic_map { private: key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot + key_type erased_key_sentinel_{}; // TODO: initialize this - key_type erased_key_sentinel_{}; std::size_t size_{}; ///< Number of keys in the map std::size_t capacity_{}; ///< Maximum number of keys that can be inserted float max_load_factor_{}; ///< Max load factor before capacity growth diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a7b40300c..ae5dfd5af 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -68,7 +68,7 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/unique_sequence_test.cu + #dynamic_map/unique_sequence_test.cu dynamic_map/erase_test.cu) ################################################################################################### diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index c00013961..ec2ca44b4 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -29,7 +29,7 @@ TEMPLATE_TEST_CASE_SIG( using Value = T; unsigned long num_keys = 1'000'000; - cuco::dynamic_map map{num_keys * 2, -1, -1}; + cuco::dynamic_map map{num_keys * 2, -1, -1, -2}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ 
-48,6 +48,7 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(map.get_size() == num_keys); + map.erase(d_keys.begin(), d_keys.end()); // delete decreases count correctly @@ -60,13 +61,11 @@ TEMPLATE_TEST_CASE_SIG( d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); - printf("cow\n"); - // ensures that map is reusing deleted slots map.insert(pairs_begin, pairs_begin + num_keys); REQUIRE(map.get_size() == num_keys); - +/* printf("cow2\n"); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); @@ -86,5 +85,6 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2, d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); + */ } } \ No newline at end of file diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index a4b956305..b5641539c 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -56,12 +56,10 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.erase(d_keys.begin(), d_keys.end()); - // delete decreases count correctly REQUIRE(map.get_size() == 0); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - // keys were actaully deleted REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); From ede50d68a1e4f14383654248f3a0e64993186273 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Tue, 5 Apr 2022 11:19:57 -0700 Subject: [PATCH 004/152] dynamic map erase working, only 4 submaps for now --- include/cuco/detail/dynamic_map.inl | 24 ++++++--- include/cuco/detail/dynamic_map_kernels.cuh | 8 +-- tests/dynamic_map/erase_test.cu | 55 +++++++++++++++++++-- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index f5625bd72..1599f90fd 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -23,7 
+23,7 @@ dynamic_map::dynamic_map(std::size_t initial_capac Allocator const& alloc) : empty_key_sentinel_(empty_key_sentinel), empty_value_sentinel_(empty_value_sentinel), - erased_key_sentinel_(empty_value_sentinel), + erased_key_sentinel_(empty_key_sentinel), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -60,9 +60,9 @@ dynamic_map::dynamic_map(std::size_t initial_capac { submaps_.push_back(std::make_unique>( initial_capacity, - sentinel::empty_key{empty_key_sentinel}, - sentinel::empty_value{empty_value_sentinel}, - sentinel::erased_key{erased_key_sentinel}, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + sentinel::erased_key{erased_key_sentinel_}, alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); @@ -98,6 +98,7 @@ void dynamic_map::reserve(std::size_t n) submap_capacity, sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, + sentinel::erased_key{erased_key_sentinel_}, alloc_)); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); @@ -128,6 +129,7 @@ void dynamic_map::insert(InputIt first, max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size(); // If we are tying to insert some of the remaining keys into this submap, we can insert // only if we meet the minimum insert size. 
+ if (capacity_remaining >= min_insert_size_) { *num_successes_ = 0; int device_id; @@ -182,6 +184,11 @@ void dynamic_map::erase(InputIt first, CUCO_CUDA_TRY(cudaGetDevice(&device_id)); CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + for(int i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); + } + // TODO: hacky, improve this thrust::device_vector d_submap_num_successes(submap_num_successes_); @@ -201,10 +208,13 @@ void dynamic_map::erase(InputIt first, size_ -= h_num_successes; for(int i = 0; i < submaps_.size(); ++i) { - //std::size_t h_num_submap_successes = submap_num_successes_[i]->load(cuda::std::memory_order_relaxed); - //submaps_[i]->size_ -= h_num_submap_successes; + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpy( + &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + + CUCO_CUDA_TRY(cudaDeviceSynchronize()); // stream sync to ensure h_num_successes is updated + submaps_[i]->size_ -= h_submap_num_successes; } - } template diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 46fae21b5..624a6a85a 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -209,7 +209,7 @@ __global__ void erase(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; // TODO: hack for up to 4 submaps, make this better - //__shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; + __shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; std::size_t thread_num_successes = 0; std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; @@ -229,7 +229,7 @@ __global__ void erase(InputIt first, } if (erased && tile.thread_rank() == 0) { thread_num_successes++; - //submap_thread_num_successes[i]++; + 
submap_thread_num_successes[i]++; } it += (gridDim.x * blockDim.x) / tile_size; @@ -240,8 +240,8 @@ __global__ void erase(InputIt first, // update submap thread counts for(int i = 0; i < num_submaps; ++i) { - //std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); - //if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; } + std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); + if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; } } } diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index ec2ca44b4..e84c6f35e 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -44,6 +44,10 @@ TEMPLATE_TEST_CASE_SIG( SECTION( "Check basic insert/erase") { + // ***************************************** + // first, check single submap works properly + // ***************************************** + map.insert(pairs_begin, pairs_begin + num_keys); REQUIRE(map.get_size() == num_keys); @@ -65,8 +69,6 @@ TEMPLATE_TEST_CASE_SIG( map.insert(pairs_begin, pairs_begin + num_keys); REQUIRE(map.get_size() == num_keys); -/* - printf("cow2\n"); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); @@ -85,6 +87,53 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2, d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); - */ + + // clear map + map.erase(d_keys.begin()+num_keys/2, d_keys.end()); + + // ************************************************* + // second, check multiple submaps case works properly + // ************************************************* + + thrust::device_vector d_keys2(4 * num_keys); + thrust::device_vector d_values2(4 * num_keys); + thrust::device_vector d_keys_exist2(4 * num_keys); + + thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1); + 
thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1); + + auto pairs_begin2 = + thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin())); + + map.insert(pairs_begin2, pairs_begin2 + 4*num_keys); + + // map should resize twice if the erased slots are successfully reused + REQUIRE(map.get_capacity() == 8*num_keys); + + // check that keys can be successfully deleted from only the first and second submaps + map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys); + + map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), + d_keys_exist2.begin() + 2*num_keys, + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2*num_keys, + d_keys_exist2.end(), + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(map.get_size() == 2*num_keys); + + // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases) + map.erase(d_keys2.begin(), d_keys2.end()); + + map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), + d_keys_exist2.end(), + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(map.get_size() == 0); } } \ No newline at end of file From 1d8fbd0d54c24c248cbdb340e99506d61179c545 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Tue, 5 Apr 2022 11:57:06 -0700 Subject: [PATCH 005/152] type wrappers added --- include/cuco/detail/dynamic_map.inl | 44 +++++++++++++---------- include/cuco/dynamic_map.cuh | 6 ++-- tests/CMakeLists.txt | 2 +- tests/dynamic_map/erase_test.cu | 5 ++- tests/dynamic_map/unique_sequence_test.cu | 6 ++-- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 1599f90fd..ecccf82dc 100644 --- a/include/cuco/detail/dynamic_map.inl +++ 
b/include/cuco/detail/dynamic_map.inl @@ -18,12 +18,12 @@ namespace cuco { template dynamic_map::dynamic_map(std::size_t initial_capacity, - Key empty_key_sentinel, - Value empty_value_sentinel, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, Allocator const& alloc) - : empty_key_sentinel_(empty_key_sentinel), - empty_value_sentinel_(empty_value_sentinel), - erased_key_sentinel_(empty_key_sentinel), + : empty_key_sentinel_(empty_key_sentinel.value), + empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(empty_key_sentinel.value), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -45,13 +45,13 @@ dynamic_map::dynamic_map(std::size_t initial_capac template dynamic_map::dynamic_map(std::size_t initial_capacity, - Key empty_key_sentinel, - Value empty_value_sentinel, - Key erased_key_sentinel, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + sentinel::erased_key erased_key_sentinel, Allocator const& alloc) - : empty_key_sentinel_(empty_key_sentinel), - empty_value_sentinel_(empty_value_sentinel), - erased_key_sentinel_(erased_key_sentinel), + : empty_key_sentinel_(empty_key_sentinel.value), + empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(erased_key_sentinel.value), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -66,7 +66,6 @@ dynamic_map::dynamic_map(std::size_t initial_capac alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->get_num_successes()); CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); @@ -94,15 +93,22 @@ void dynamic_map::reserve(std::size_t n) // if the submap does not exist yet, create it else { submap_capacity = capacity_; - submaps_.push_back(std::make_unique>( - submap_capacity, - sentinel::empty_key{empty_key_sentinel_}, - 
sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}, - alloc_)); + if(erased_key_sentinel_ != empty_key_sentinel_) { + submaps_.push_back(std::make_unique>( + submap_capacity, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + sentinel::erased_key{erased_key_sentinel_}, + alloc_)); + } else { + submaps_.push_back(std::make_unique>( + submap_capacity, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + alloc_)); + } submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); capacity_ *= 2; diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 649eb3d01..bbe8c664b 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -138,9 +138,9 @@ class dynamic_map { Allocator const& alloc = Allocator{}); dynamic_map(std::size_t initial_capacity, - Key empty_key_sentinel, - Value empty_value_sentinel, - Key erased_key_sentinel, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + sentinel::erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}); /** diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae5dfd5af..a7b40300c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -68,7 +68,7 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - #dynamic_map/unique_sequence_test.cu + dynamic_map/unique_sequence_test.cu dynamic_map/erase_test.cu) ################################################################################################### diff --git 
a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index e84c6f35e..385b2e426 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -29,7 +29,10 @@ TEMPLATE_TEST_CASE_SIG( using Value = T; unsigned long num_keys = 1'000'000; - cuco::dynamic_map map{num_keys * 2, -1, -1, -2}; + cuco::dynamic_map map{num_keys * 2, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index de26bb3dc..24a2041aa 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -38,8 +38,10 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", (int64_t, int64_t)) { constexpr std::size_t num_keys{50'000'000}; - cuco::dynamic_map map{ - 30'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + + cuco::dynamic_map map{30'000'000, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); From 63dd4eb07fbb4b145f7075f47b2fa9d64c3b3538 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Tue, 5 Apr 2022 12:55:31 -0700 Subject: [PATCH 006/152] prevent implicit type conversion of sentinels during construction --- benchmarks/hash_table/dynamic_map_bench.cu | 9 +++++++-- include/cuco/dynamic_map.cuh | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index d42aae755..8545a47c6 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -86,8 +86,13 @@ static void BM_dynamic_insert(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { +<<<<<<< HEAD map_type map{ initial_size, 
cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; +======= + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; +>>>>>>> prevent implicit type conversion of sentinels during construction { cuda_event_timer raii{state}; for (std::size_t i = 0; i < num_keys; i += batch_size) { @@ -124,8 +129,8 @@ static void BM_dynamic_search_all(::benchmark::State& state) thrust::device_vector> d_pairs(h_pairs); thrust::device_vector d_results(num_keys); - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); for (auto _ : state) { diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index bbe8c664b..267910b43 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -109,6 +109,15 @@ class dynamic_map { dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; + + template + dynamic_map(std::size_t, T1, T2, + Allocator const& = Allocator{}) = delete; + + template + dynamic_map(std::size_t, T1, T2, T3, + Allocator const& = Allocator{}) = delete; + dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; From 52d83f6cb0d6bd4575d43621cd15baee4baa0870 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Tue, 5 Apr 2022 16:34:16 -0700 Subject: [PATCH 007/152] erase benchmark added --- benchmarks/hash_table/dynamic_map_bench.cu | 53 ++++++++++++++++++++- include/cuco/detail/dynamic_map.inl | 23 +++++---- include/cuco/detail/dynamic_map_kernels.cuh | 4 +- 3 files changed, 66 insertions(+), 14 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 8545a47c6..afdd3644b 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -57,7 +57,7 @@ 
static void generate_keys(OutputIt output_begin, OutputIt output_end) static void gen_final_size(benchmark::internal::Benchmark* b) { - for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) { + for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) { b->Args({size}); } } @@ -142,15 +142,64 @@ static void BM_dynamic_search_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_dynamic_erase_all(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_keys(h_keys); + + std::size_t batch_size = 1E6; + for (auto _ : state) { + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; + for (auto i = 0; i < num_keys; i += batch_size) { + map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); + } + { + cuda_event_timer raii{state}; + for (auto i = 0; i < num_keys; i += batch_size) { + map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); + } + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +/* BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); +*/ +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + 
->Apply(gen_final_size) + ->UseManualTime(); /* BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index ecccf82dc..ba9a7bf8b 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -197,17 +197,22 @@ void dynamic_map::erase(InputIt first, // TODO: hacky, improve this thrust::device_vector d_submap_num_successes(submap_num_successes_); + + // TODO: hack (how to get size on host?) + constexpr size_t temp_storage_size_one_block = 48; + auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block; detail::erase> - <<>>(first, - first + num_keys, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - num_successes_, - d_submap_num_successes.data().get(), - submaps_.size(), - hash, - key_equal); + <<>>( + first, + first + num_keys, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + num_successes_, + d_submap_num_successes.data().get(), + submaps_.size(), + hash, + key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 624a6a85a..599d1d68b 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -206,11 +206,9 @@ __global__ void erase(InputIt first, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; + extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[]; __shared__ typename BlockReduce::TempStorage temp_storage; - // TODO: hack for up to 4 submaps, make this better - __shared__ typename BlockReduce::TempStorage temp_submap_storage[4]; - std::size_t thread_num_successes = 0; std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; From 0878216aed25ad34fcae870b17138859ca7dee3d Mon Sep 17 00:00:00 2001 
From: Nico Iskos Date: Tue, 5 Apr 2022 23:29:16 -0700 Subject: [PATCH 008/152] num_successes managed pointer updated --- benchmarks/hash_table/dynamic_map_bench.cu | 2 +- include/cuco/detail/dynamic_map.inl | 50 +- include/cuco/detail/dynamic_map_kernels.cuh | 2 +- include/cuco/detail/nvtx3.hpp | 2045 +++++++++++++++++++ include/cuco/dynamic_map.cuh | 18 +- 5 files changed, 2085 insertions(+), 32 deletions(-) create mode 100644 include/cuco/detail/nvtx3.hpp diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index afdd3644b..c0306f901 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) static void gen_final_size(benchmark::internal::Benchmark* b) { - for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) { + for (auto size = 10'000'000; size <= 10'000'000; size += 20'000'000) { b->Args({size}); } } diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index ba9a7bf8b..2bb1459bc 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +//#include "nvtx3.hpp" + namespace cuco { template @@ -28,7 +30,8 @@ dynamic_map::dynamic_map(std::size_t initial_capac capacity_(initial_capacity), min_insert_size_(1E4), max_load_factor_(0.60), - alloc_{alloc} + alloc_{alloc}, + counter_allocator_{alloc} { submaps_.push_back(std::make_unique>( initial_capacity, @@ -39,8 +42,8 @@ dynamic_map::dynamic_map(std::size_t initial_capac submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); + + num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } template @@ -56,7 +59,8 @@ dynamic_map::dynamic_map(std::size_t initial_capac capacity_(initial_capacity), min_insert_size_(1E4), max_load_factor_(0.60), - alloc_{alloc} + alloc_{alloc}, + counter_allocator_{alloc} { submaps_.push_back(std::make_unique>( initial_capacity, @@ -68,14 +72,14 @@ dynamic_map::dynamic_map(std::size_t initial_capac submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); + num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } template dynamic_map::~dynamic_map() { - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); + std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); } template @@ -126,7 +130,10 @@ void dynamic_map::insert(InputIt first, Hash hash, KeyEqual key_equal) { + //nvtx3::thread_range r{"insert"}; + std::size_t num_to_insert = std::distance(first, last); + reserve(size_ + num_to_insert); uint32_t submap_idx = 0; @@ -137,11 +144,10 @@ void dynamic_map::insert(InputIt first, // only if we meet the minimum insert size. 
if (capacity_remaining >= min_insert_size_) { - *num_successes_ = 0; - int device_id; - CUCO_CUDA_TRY(cudaGetDevice(&device_id)); - CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); - + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); + auto n = std::min(capacity_remaining, num_to_insert); auto const block_size = 128; auto const stride = 1; @@ -158,9 +164,10 @@ void dynamic_map::insert(InputIt first, submaps_.size(), hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); + std::size_t h_num_successes; + CUCO_CUDA_TRY(cudaMemcpy( + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; @@ -178,6 +185,7 @@ void dynamic_map::erase(InputIt first, Hash hash, KeyEqual key_equal) { + //nvtx3::thread_range r{"erase"}; std::size_t num_keys = std::distance(first, last); auto const block_size = 128; @@ -185,10 +193,9 @@ void dynamic_map::erase(InputIt first, auto const tile_size = 4; auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - *num_successes_ = 0; - int device_id; - CUCO_CUDA_TRY(cudaGetDevice(&device_id)); - CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); for(int i = 0; i < submaps_.size(); ++i) { @@ -213,17 +220,16 @@ void dynamic_map::erase(InputIt first, submaps_.size(), hash, key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - - std::size_t h_num_successes = 
num_successes_->load(cuda::std::memory_order_relaxed); + + std::size_t h_num_successes; + CUCO_CUDA_TRY(cudaMemcpy( + &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); size_ -= h_num_successes; for(int i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; CUCO_CUDA_TRY(cudaMemcpy( &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); - - CUCO_CUDA_TRY(cudaDeviceSynchronize()); // stream sync to ensure h_num_successes is updated submaps_[i]->size_ -= h_submap_num_successes; } } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 599d1d68b..c5605d463 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -201,7 +201,7 @@ __global__ void erase(InputIt first, mutableViewT* submap_mutable_views, atomicT* num_successes, atomicT** submap_num_successes, - uint32_t num_submaps, + const uint32_t num_submaps, Hash hash, KeyEqual key_equal) { diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp new file mode 100644 index 000000000..08a02153b --- /dev/null +++ b/include/cuco/detail/nvtx3.hpp @@ -0,0 +1,2045 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0 +#error \ + "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." +#endif + +/** + * @brief Semantic minor version number. + * + * Major version number is hardcoded into the "nvtx3" namespace/prefix. + * + * If this value is incremented, the above version include guard needs to be + * updated. + * + */ +#define NVTX3_MINOR_VERSION 0 + +#include + +#include + +/** + * @file nvtx3.hpp + * + * @brief Provides C++ constructs making the NVTX library safer and easier to + * use with zero overhead. + */ + +/** + * \mainpage + * \tableofcontents + * + * \section QUICK_START Quick Start + * + * To add NVTX ranges to your code, use the `nvtx3::thread_range` RAII object. A + * range begins when the object is created, and ends when the object is + * destroyed. 
+ * + * \code{.cpp} + * #include "nvtx3.hpp" + * void some_function(){ + * // Begins a NVTX range with the messsage "some_function" + * // The range ends when some_function() returns and `r` is destroyed + * nvtx3::thread_range r{"some_function"}; + * + * for(int i = 0; i < 6; ++i){ + * nvtx3::thread_range loop{"loop range"}; + * std::this_thread::sleep_for(std::chrono::seconds{1}); + * } + * } // Range ends when `r` is destroyed + * \endcode + * + * The example code above generates the following timeline view in Nsight + * Systems: + * + * \image html + * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png + * + * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add + * ranges to your code that automatically use the name of the enclosing function + * as the range's message. + * + * \code{.cpp} + * #include "nvtx3.hpp" + * void some_function(){ + * // Creates a range with a message "some_function" that ends when the + * enclosing + * // function returns + * NVTX3_FUNC_RANGE(); + * ... + * } + * \endcode + * + * + * \section Overview + * + * The NVTX library provides a set of functions for users to annotate their code + * to aid in performance profiling and optimization. These annotations provide + * information to tools like Nsight Systems to improve visualization of + * application timelines. + * + * \ref RANGES are one of the most commonly used NVTX constructs for annotating + * a span of time. For example, imagine a user wanted to see every time a + * function, `my_function`, is called and how long it takes to execute. 
This can + * be accomplished with an NVTX range created on the entry to the function and + * terminated on return from `my_function` using the push/pop C APIs: + * + * ``` + * void my_function(...){ + * nvtxRangePushA("my_function"); // Begins NVTX range + * // do work + * nvtxRangePop(); // Ends NVTX range + * } + * ``` + * + * One of the challenges with using the NVTX C API is that it requires manually + * terminating the end of the range with `nvtxRangePop`. This can be challenging + * if `my_function()` has multiple returns or can throw exceptions as it + * requires calling `nvtxRangePop()` before all possible return points. + * + * NVTX++ solves this inconvenience through the "RAII" technique by providing a + * `nvtx3::thread_range` class that begins a range at construction and ends the + * range on destruction. The above example then becomes: + * + * ``` + * void my_function(...){ + * nvtx3::thread_range r{"my_function"}; // Begins NVTX range + * // do work + * } // Range ends on exit from `my_function` when `r` is destroyed + * ``` + * + * The range object `r` is deterministically destroyed whenever `my_function` + * returns---ending the NVTX range without manual intervention. For more + * information, see \ref RANGES and `nvtx3::domain_thread_range`. + * + * Another inconvenience of the NVTX C APIs are the several constructs where the + * user is expected to initialize an object at the beginning of an application + * and reuse that object throughout the lifetime of the application. For example + * Domains, Categories, and Registered messages. + * + * Example: + * ``` + * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); + * // Reuse `D` throughout the rest of the application + * ``` + * + * This can be problematic if the user application or library does not have an + * explicit initialization function called before all other functions to + * ensure that these long-lived objects are initialized before being used. 
+ * + * NVTX++ makes use of the "construct on first use" technique to alleviate this + * inconvenience. In short, a function local static object is constructed upon + * the first invocation of a function and returns a reference to that object on + * all future invocations. See the documentation for + * `nvtx3::registered_message`, `nvtx3::domain`, `nvtx3::named_category`, and + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more + * information. + * + * Using construct on first use, the above example becomes: + * ``` + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * // The first invocation of `domain::get` for the type `my_domain` will + * // construct a `nvtx3::domain` object and return a reference to it. Future + * // invocations simply return a reference. + * nvtx3::domain const& D = nvtx3::domain::get(); + * ``` + * For more information about NVTX and how it can be used, see + * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and + * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + * for more information. + * + * \section RANGES Ranges + * + * Ranges are used to describe a span of time during the execution of an + * application. Common examples are using ranges to annotate the time it takes + * to execute a function or an iteration of a loop. + * + * NVTX++ uses RAII to automate the generation of ranges that are tied to the + * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard + * Template Library. + * + * \subsection THREAD_RANGE Thread Range + * + * `nvtx3::domain_thread_range` is a class that begins a range upon construction + * and ends the range at destruction. This is one of the most commonly used + * constructs in NVTX++ and is useful for annotating spans of time on a + * particular thread. These ranges can be nested to arbitrary depths. 
+ * + * `nvtx3::thread_range` is an alias for a `nvtx3::domain_thread_range` in the + * global NVTX domain. For more information about Domains, see \ref DOMAINS. + * + * Various attributes of a range can be configured constructing a + * `nvtx3::domain_thread_range` with a `nvtx3::event_attributes` object. For + * more information, see \ref ATTRIBUTES. + * + * Example: + * + * \code{.cpp} + * void some_function(){ + * // Creates a range for the duration of `some_function` + * nvtx3::thread_range r{}; + * + * while(true){ + * // Creates a range for every loop iteration + * // `loop_range` is nested inside `r` + * nvtx3::thread_range loop_range{}; + * } + * } + * \endcode + * + * \subsection PROCESS_RANGE Process Range + * + * `nvtx3::domain_process_range` is identical to `nvtx3::domain_thread_range` + * with the exception that a `domain_process_range` can be created and destroyed + * on different threads. This is useful to annotate spans of time that can + * bridge multiple threads. + * + * `nvtx3::domain_thread_range`s should be preferred unless one needs the + * ability to begin and end a range on different threads. + * + * \section MARKS Marks + * + * `nvtx3::mark` allows annotating an instantaneous event in an application's + * timeline. For example, indicating when a mutex is locked or unlocked. + * + * \code{.cpp} + * std::mutex global_lock; + * void lock_mutex(){ + * global_lock.lock(); + * // Marks an event immediately after the mutex is locked + * nvtx3::mark("lock_mutex"); + * } + * \endcode + * + * \section DOMAINS Domains + * + * Similar to C++ namespaces, Domains allow for scoping NVTX events. By default, + * all NVTX events belong to the "global" domain. Libraries and applications + * should scope their events to use a custom domain to differentiate where the + * events originate from. + * + * It is common for a library or application to have only a single domain and + * for the name of that domain to be known at compile time. 
Therefore, Domains + * in NVTX++ are represented by _tag types_. + * + * For example, to define a custom domain, simply define a new concrete type + * (a `class` or `struct`) with a `static` member called `name` that contains + * the desired name of the domain. + * + * ``` + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * ``` + * + * For any NVTX++ construct that can be scoped to a domain, the type `my_domain` + * can be passed as an explicit template argument to scope it to the custom + * domain. + * + * The tag type `nvtx3::domain::global` represents the global NVTX domain. + * + * \code{.cpp} + * // By default, `domain_thread_range` belongs to the global domain + * nvtx3::domain_thread_range<> r0{}; + * + * // Alias for a `domain_thread_range` in the global domain + * nvtx3::thread_range r1{}; + * + * // `r` belongs to the custom domain + * nvtx3::domain_thread_range r{}; + * \endcode + * + * When using a custom domain, it is reccomended to define type aliases for NVTX + * constructs in the custom domain. + * ``` + * using my_thread_range = nvtx3::domain_thread_range; + * using my_registered_message = nvtx3::registered_message; + * using my_named_category = nvtx3::named_category; + * ``` + * + * See `nvtx3::domain` for more information. + * + * \section ATTRIBUTES Event Attributes + * + * NVTX events can be customized with various attributes to provide additional + * information (such as a custom message) or to control visualization of the + * event (such as the color used). These attributes can be specified per-event + * via arguments to a `nvtx3::event_attributes` object. + * + * NVTX events can be customized via four "attributes": + * - \ref COLOR : color used to visualize the event in tools. + * - \ref MESSAGES : Custom message string. + * - \ref PAYLOAD : User-defined numerical value. + * - \ref CATEGORY : Intra-domain grouping. 
+ *
+ * It is possible to construct a `nvtx3::event_attributes` from any number of
+ * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload,
+ * nvtx3::category) in any order. If an attribute is not specified, a tool
+ * specific default value is used. See `nvtx3::event_attributes` for more
+ * information.
+ *
+ * \code{.cpp}
+ * // Custom color, message
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                       "message"};
+ *
+ * // Custom color, message, payload, category
+ * event_attributes attr{nvtx3::rgb{127, 255, 0},
+ *                       nvtx3::payload{42},
+ *                       "message",
+ *                       nvtx3::category{1}};
+ *
+ * // Arguments can be in any order
+ * event_attributes attr{nvtx3::payload{42},
+ *                       nvtx3::category{1},
+ *                       "message",
+ *                       nvtx3::rgb{127, 255, 0}};
+ *
+ * // "First wins" with multiple arguments of the same type
+ * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload is
+ * 42 \endcode
+ *
+ * \subsection MESSAGES message
+ *
+ * A `nvtx3::message` allows associating a custom message string with an NVTX
+ * event.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Create an `event_attributes` with the custom message "my message"
+ * nvtx3::event_attributes attr{nvtx3::message{"my message"}};
+ *
+ * // strings and string literals implicitly assumed to be a `nvtx3::message`
+ * nvtx3::event_attributes attr{"my message"};
+ * \endcode
+ *
+ * \subsubsection REGISTERED_MESSAGE Registered Messages
+ *
+ * Associating a `nvtx3::message` with an event requires copying the contents of
+ * the message every time the message is used, i.e., copying the entire message
+ * string. This may cause non-trivial overhead in performance sensitive code.
+ *
+ * To eliminate this overhead, NVTX allows registering a message string,
+ * yielding a "handle" that is inexpensive to copy that may be used in place of
+ * a message string. When visualizing the events, tools such as Nsight Systems
+ * will take care of mapping the message handle to its string.
+ *
+ * A message should be registered once and the handle reused throughout the rest
+ * of the application. This can be done by either explicitly creating static
+ * `nvtx3::registered_message` objects, or using the
+ * `nvtx3::registered_message::get` construct on first use helper (recommended).
+ *
+ * Similar to \ref DOMAINS, `nvtx3::registered_message::get` requires defining a
+ * custom tag type with a static `message` member whose value will be the
+ * contents of the registered string.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Explicitly constructed, static `registered_message`
+ * static registered_message<my_domain> static_message{"my message"};
+ *
+ * // Or use construct on first use:
+ * // Define a tag type with a `message` member string to register
+ * struct my_message{ static constexpr char const* message{ "my message" }; };
+ *
+ * // Uses construct on first use to register the contents of
+ * // `my_message::message`
+ * nvtx3::registered_message<my_domain> const& msg =
+ * nvtx3::registered_message<my_domain>::get<my_message>(); \endcode
+ *
+ * \subsection COLOR color
+ *
+ * Associating a `nvtx3::color` with an event allows controlling how the event
+ * is visualized in a tool such as Nsight Systems. This is a convenient way to
+ * visually differentiate among different events.
+ *
+ * \code{.cpp}
+ * // Define a color via rgb color values
+ * nvtx3::color c{nvtx3::rgb{127, 255, 0}};
+ * nvtx3::event_attributes attr{c};
+ *
+ * // rgb color values can be passed directly to an `event_attributes`
+ * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}};
+ * \endcode
+ *
+ * \subsection CATEGORY category
+ *
+ * A `nvtx3::category` is simply an integer id that allows for fine-grain
+ * grouping of NVTX events. For example, one might use separate categories for
+ * IO, memory allocation, compute, etc.
+ * + * \code{.cpp} + * nvtx3::event_attributes{nvtx3::category{1}}; + * \endcode + * + * \subsubsection NAMED_CATEGORIES Named Categories + * + * Associates a `name` string with a category `id` to help differentiate among + * categories. + * + * For any given category id `Id`, a `named_category{Id, "name"}` should only + * be constructed once and reused throughout an application. This can be done by + * either explicitly creating static `nvtx3::named_category` objects, or using + * the `nvtx3::named_category::get` construct on first use helper (recommended). + * + * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a + * custom tag type with static `name` and `id` members. + * + * \code{.cpp} + * // Explicitly constructed, static `named_category` + * static nvtx3::named_category static_category{42, "my category"}; + * + * // OR use construct on first use: + * // Define a tag type with `name` and `id` members + * struct my_category{ + * static constexpr char const* name{"my category"}; // category name + * static constexpr category::id_type id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category" + * nvtx3::named_category const& my_category = + * named_category::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::event_attributes attr{my_category}; + * \endcode + * + * \subsection PAYLOAD payload + * + * Allows associating a user-defined numerical value with an event. 
+ * + * ``` + * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload + * from + * // the `int32_t` value 42 + * ``` + * + * + * \section EXAMPLE Example + * + * Putting it all together: + * \code{.cpp} + * // Define a custom domain tag type + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * // Define a named category tag type + * struct my_category{ + * static constexpr char const* name{"my category"}; + * static constexpr uint32_t id{42}; + * }; + * + * // Define a registered message tag type + * struct my_message{ static constexpr char const* message{"my message"}; }; + * + * // For convenience, use aliases for domain scoped objects + * using my_thread_range = nvtx3::domain_thread_range; + * using my_registered_message = nvtx3::registered_message; + * using my_named_category = nvtx3::named_category; + * + * // Default values for all attributes + * nvtx3::event_attributes attr{}; + * my_thread_range r0{attr}; + * + * // Custom (unregistered) message, and unnamed category + * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; + * my_thread_range r1{attr1}; + * + * // Alternatively, pass arguments of `event_attributes` ctor directly to + * // `my_thread_range` + * my_thread_range r2{"message", nvtx3::category{2}}; + * + * // construct on first use a registered message + * auto msg = my_registered_message::get(); + * + * // construct on first use a named category + * auto category = my_named_category::get(); + * + * // Use registered message and named category + * my_thread_range r3{msg, category, nvtx3::rgb{127, 255, 0}, + * nvtx3::payload{42}}; + * + * // Any number of arguments in any order + * my_thread_range r{nvtx3::rgb{127, 255,0}, msg}; + * + * \endcode + * \section MACROS Convenience Macros + * + * Oftentimes users want to quickly and easily add NVTX ranges to their library + * or application to aid in profiling and optimization. 
+ * + * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and + * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an + * `nvtx3::domain_thread_range` with the name of the enclosing function as the + * range's message. + * + * \code{.cpp} + * void some_function(){ + * // Automatically generates an NVTX range for the duration of the function + * // using "some_function" as the event's message. + * NVTX3_FUNC_RANGE(); + * } + * \endcode + * + */ + +/** + * @brief Enables the use of constexpr when support for C++14 relaxed constexpr + * is present. + * + * Initializing a legacy-C (i.e., no constructor) union member requires + * initializing in the constructor body. Non-empty constexpr constructors + * require C++14 relaxed constexpr. + * + */ +#if __cpp_constexpr >= 201304L +#define NVTX3_RELAXED_CONSTEXPR constexpr +#else +#define NVTX3_RELAXED_CONSTEXPR +#endif + +namespace nvtx3 { +namespace detail { + +/** + * @brief Verifies if a type `T` contains a member `T::name` of type `const + * char*` or `const wchar_t*`. + * + * @tparam T The type to verify + * @return True if `T` contains a member `T::name` of type `const char*` or + * `const wchar_t*`. + */ +template +constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) { + return (std::is_same::type>::value or + std::is_same::type>::value); +} +} // namespace detail + +/** + * @brief `domain`s allow for grouping NVTX events into a single scope to + * differentiate them from events in other `domain`s. + * + * By default, all NVTX constructs are placed in the "global" NVTX domain. + * + * A custom `domain` may be used in order to differentiate a library's or + * application's NVTX events from other events. + * + * `domain`s are expected to be long-lived and unique to a library or + * application. As such, it is assumed a domain's name is known at compile + * time. 
Therefore, all NVTX constructs that can be associated with a domain + * require the domain to be specified via a *type* `DomainName` passed as an + * explicit template parameter. + * + * The type `domain::global` may be used to indicate that the global NVTX + * domain should be used. + * + * None of the C++ NVTX constructs require the user to manually construct a + * `domain` object. Instead, if a custom domain is desired, the user is + * expected to define a type `DomainName` that contains a member + * `DomainName::name` which resolves to either a `char const*` or `wchar_t + * const*`. The value of `DomainName::name` is used to name and uniquely + * identify the custom domain. + * + * Upon the first use of an NVTX construct associated with the type + * `DomainName`, the "construct on first use" pattern is used to construct a + * function local static `domain` object. All future NVTX constructs + * associated with `DomainType` will use a reference to the previously + * constructed `domain` object. See `domain::get`. + * + * Example: + * ``` + * // The type `my_domain` defines a `name` member used to name and identify + * the + * // `domain` object identified by `my_domain`. + * struct my_domain{ static constexpr char const* name{"my_domain"}; }; + * + * // The NVTX range `r` will be grouped with all other NVTX constructs + * // associated with `my_domain`. 
+ * nvtx3::domain_thread_range r{}; + * + * // An alias can be created for a `domain_thread_range` in the custom domain + * using my_thread_range = nvtx3::domain_thread_range; + * my_thread_range my_range{}; + * + * // `domain::global` indicates that the global NVTX domain is used + * nvtx3::domain_thread_range r2{}; + * + * // For convenience, `nvtx3::thread_range` is an alias for a range in the + * // global domain + * nvtx3::thread_range r3{}; + * ``` + */ +class domain { + public: + domain(domain const&) = delete; + domain& operator=(domain const&) = delete; + domain(domain&&) = delete; + domain& operator=(domain&&) = delete; + + /** + * @brief Returns reference to an instance of a function local static + * `domain` object. + * + * Uses the "construct on first use" idiom to safely ensure the `domain` + * object is initialized exactly once upon first invocation of + * `domain::get()`. All following invocations will return a + * reference to the previously constructed `domain` object. See + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use + * + * None of the constructs in this header require the user to directly invoke + * `domain::get`. It is automatically invoked when constructing objects like + * a `domain_thread_range` or `category`. Advanced users may wish to use + * `domain::get` for the convenience of the "construct on first use" idiom + * when using domains with their own use of the NVTX C API. + * + * This function is threadsafe as of C++11. If two or more threads call + * `domain::get` concurrently, exactly one of them is guaranteed + * to construct the `domain` object and the other(s) will receive a + * reference to the object after it is fully constructed. + * + * The domain's name is specified via the type `DomainName` pass as an + * explicit template parameter. `DomainName` is required to contain a + * member `DomainName::name` that resolves to either a `char const*` or + * `wchar_t const*`. 
The value of `DomainName::name` is used to name and + * uniquely identify the `domain`. + * + * Example: + * ``` + * // The type `my_domain` defines a `name` member used to name and identify + * // the `domain` object identified by `my_domain`. + * struct my_domain{ static constexpr char const* name{"my domain"}; }; + * + * auto D = domain::get(); // First invocation constructs a + * // `domain` with the name "my domain" + * + * auto D1 = domain::get(); // Simply returns reference to + * // previously constructed `domain`. + * ``` + * + * @tparam DomainName Type that contains a `DomainName::name` member used to + * name the `domain` object. + * @return Reference to the `domain` corresponding to the type `DomainName`. + */ + template + static domain const& get() { + static_assert(detail::has_name_member(), + "Type used to identify a domain must contain a name member of" + "type const char* or const wchar_t*"); + static domain const d{DomainName::name}; + return d; + } + + /** + * @brief Conversion operator to `nvtxDomainHandle_t`. + * + * Allows transparently passing a domain object into an API expecting a + * native `nvtxDomainHandle_t` object. + */ + operator nvtxDomainHandle_t() const noexcept { return _domain; } + + /** + * @brief Tag type for the "global" NVTX domain. + * + * This type may be passed as a template argument to any function/class + * expecting a type to identify a domain to indicate that the global domain + * should be used. + * + * All NVTX events in the global domain across all libraries and + * applications will be grouped together. + * + */ + struct global {}; + + private: + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. 
+ * + * @param name A unique name identifying the domain + */ + explicit domain(char const* name) noexcept + : _domain{nvtxDomainCreateA(name)} {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(wchar_t const* name) noexcept + : _domain{nvtxDomainCreateW(name)} {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(std::string const& name) noexcept : domain{name.c_str()} {} + + /** + * @brief Construct a new domain with the specified `name`. + * + * This constructor is private as it is intended that `domain` objects only + * be created through the `domain::get` function. + * + * @param name A unique name identifying the domain + */ + explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {} + + /** + * @brief Default constructor creates a `domain` representing the + * "global" NVTX domain. + * + * All events not associated with a custom `domain` are grouped in the + * "global" NVTX domain. + * + */ + domain() = default; + + /** + * @brief Destroy the domain object, unregistering and freeing all domain + * specific resources. + */ + ~domain() noexcept { nvtxDomainDestroy(_domain); } + + private: + nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle +}; + +/** + * @brief Returns reference to the `domain` object that represents the global + * NVTX domain. + * + * This specialization for `domain::global` returns a default constructed, + * `domain` object for use when the "global" domain is desired. 
+ * + * All NVTX events in the global domain across all libraries and applications + * will be grouped together. + * + * @return Reference to the `domain` corresponding to the global NVTX domain. + * + */ +template <> +inline domain const& domain::get() { + static domain const d{}; + return d; +} + +/** + * @brief Indicates the values of the red, green, blue color channels for + * a rgb color code. + * + */ +struct rgb { + /// Type used for component values + using component_type = uint8_t; + + /** + * @brief Construct a rgb with red, green, and blue channels + * specified by `red_`, `green_`, and `blue_`, respectively. + * + * Valid values are in the range `[0,255]`. + * + * @param red_ Value of the red channel + * @param green_ Value of the green channel + * @param blue_ Value of the blue channel + */ + constexpr rgb(component_type red_, component_type green_, + component_type blue_) noexcept + : red{red_}, green{green_}, blue{blue_} {} + + component_type const red{}; ///< Red channel value + component_type const green{}; ///< Green channel value + component_type const blue{}; ///< Blue channel value +}; + +/** + * @brief Indicates the value of the alpha, red, green, and blue color + * channels for an argb color code. + * + */ +struct argb final : rgb { + /** + * @brief Construct an argb with alpha, red, green, and blue channels + * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. + * + * Valid values are in the range `[0,255]`. 
+ * + * @param alpha_ Value of the alpha channel (opacity) + * @param red_ Value of the red channel + * @param green_ Value of the green channel + * @param blue_ Value of the blue channel + * + */ + constexpr argb(component_type alpha_, component_type red_, + component_type green_, component_type blue_) noexcept + : rgb{red_, green_, blue_}, alpha{alpha_} {} + + component_type const alpha{}; ///< Alpha channel value +}; + +/** + * @brief Represents a custom color that can be associated with an NVTX event + * via it's `event_attributes`. + * + * Specifying colors for NVTX events is a convenient way to visually + * differentiate among different events in a visualization tool such as Nsight + * Systems. + * + */ +class color { + public: + /// Type used for the color's value + using value_type = uint32_t; + + /** + * @brief Constructs a `color` using the value provided by `hex_code`. + * + * `hex_code` is expected to be a 4 byte argb hex code. + * + * The most significant byte indicates the value of the alpha channel + * (opacity) (0-255) + * + * The next byte indicates the value of the red channel (0-255) + * + * The next byte indicates the value of the green channel (0-255) + * + * The least significant byte indicates the value of the blue channel + * (0-255) + * + * @param hex_code The hex code used to construct the `color` + */ + constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {} + + /** + * @brief Construct a `color` using the alpha, red, green, blue components + * in `argb`. + * + * @param argb The alpha, red, green, blue components of the desired `color` + */ + constexpr color(argb argb) noexcept + : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, + argb.blue)} {} + + /** + * @brief Construct a `color` using the red, green, blue components in + * `rgb`. + * + * Uses maximum value for the alpha channel (opacity) of the `color`. 
+ * + * @param rgb The red, green, blue components of the desired `color` + */ + constexpr color(rgb rgb) noexcept + : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} {} + + /** + * @brief Returns the `color`s argb hex code + * + */ + constexpr value_type get_value() const noexcept { return _value; } + + /** + * @brief Return the NVTX color type of the color. + * + */ + constexpr nvtxColorType_t get_type() const noexcept { return _type; } + + color() = delete; + ~color() = default; + color(color const&) = default; + color& operator=(color const&) = default; + color(color&&) = default; + color& operator=(color&&) = default; + + private: + /** + * @brief Constructs an unsigned, 4B integer from the component bytes in + * most to least significant byte order. + * + */ + constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3, + uint8_t byte2, + uint8_t byte1, + uint8_t byte0) noexcept { + return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | + uint32_t{byte1} << 8 | uint32_t{byte0}; + } + + value_type const _value{}; ///< color's argb color code + nvtxColorType_t const _type{NVTX_COLOR_ARGB}; ///< NVTX color type code +}; + +/** + * @brief Object for intra-domain grouping of NVTX events. + * + * A `category` is simply an integer id that allows for fine-grain grouping of + * NVTX events. For example, one might use separate categories for IO, memory + * allocation, compute, etc. + * + * Example: + * \code{.cpp} + * nvtx3::category cat1{1}; + * + * // Range `r1` belongs to the category identified by the value `1`. + * nvtx3::thread_range r1{cat1}; + * + * // Range `r2` belongs to the same category as `r1` + * nvtx3::thread_range r2{nvtx3::category{1}}; + * \endcode + * + * To associate a name string with a category id, see `named_category`. + * + */ +class category { + public: + /// Type used for `category`s integer id. + using id_type = uint32_t; + + /** + * @brief Construct a `category` with the specified `id`. 
+ * + * The `category` will be unnamed and identified only by its `id` value. + * + * All `category` objects sharing the same `id` are equivalent. + * + * @param[in] id The `category`'s identifying value + */ + constexpr explicit category(id_type id) noexcept : id_{id} {} + + /** + * @brief Returns the id of the category. + * + */ + constexpr id_type get_id() const noexcept { return id_; } + + category() = delete; + ~category() = default; + category(category const&) = default; + category& operator=(category const&) = default; + category(category&&) = default; + category& operator=(category&&) = default; + + private: + id_type const id_{}; ///< category's unique identifier +}; + +/** + * @brief A `category` with an associated name string. + * + * Associates a `name` string with a category `id` to help differentiate among + * categories. + * + * For any given category id `Id`, a `named_category(Id, "name")` should only + * be constructed once and reused throughout an application. This can be done + * by either explicitly creating static `named_category` objects, or using the + * `named_category::get` construct on first use helper (recommended). + * + * Creating two or more `named_category` objects with the same value for `id` + * in the same domain results in undefined behavior. + * + * Similarly, behavior is undefined when a `named_category` and `category` + * share the same value of `id`. 
+ * + * Example: + * \code{.cpp} + * // Explicitly constructed, static `named_category` + * static nvtx3::named_category static_category{42, "my category"}; + * + * // Range `r` associated with category id `42` + * nvtx3::thread_range r{static_category}; + * + * // OR use construct on first use: + * + * // Define a type with `name` and `id` members + * struct my_category{ + * static constexpr char const* name{"my category"}; // category name + * static constexpr category::id_type id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category" + * auto my_category = named_category::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::thread_range r{my_category}; + * \endcode + * + * `named_category`'s association of a name to a category id is local to the + * domain specified by the type `D`. An id may have a different name in + * another domain. + * + * @tparam D Type containing `name` member used to identify the `domain` to + * which the `named_category` belongs. Else, `domain::global` to indicate + * that the global NVTX domain should be used. + */ +template +class named_category final : public category { + public: + /** + * @brief Returns a global instance of a `named_category` as a + * function-local static. + * + * Creates a `named_category` with name and id specified by the contents of + * a type `C`. `C::name` determines the name and `C::id` determines the + * category id. + * + * This function is useful for constructing a named `category` exactly once + * and reusing the same instance throughout an application. 
+ * + * Example: + * \code{.cpp} + * // Define a type with `name` and `id` members + * struct my_category{ + * static constexpr char const* name{"my category"}; // category name + * static constexpr uint32_t id{42}; // category id + * }; + * + * // Use construct on first use to name the category id `42` + * // with name "my category" + * auto cat = named_category::get(); + * + * // Range `r` associated with category id `42` + * nvtx3::thread_range r{cat}; + * \endcode + * + * Uses the "construct on first use" idiom to safely ensure the `category` + * object is initialized exactly once. See + * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use + * + * @tparam C Type containing a member `C::name` that resolves to either a + * `char const*` or `wchar_t const*` and `C::id`. + */ + template + static named_category const& get() noexcept { + static_assert(detail::has_name_member(), + "Type used to name a category must contain a name member."); + static named_category const category{C::id, C::name}; + return category; + } + /** + * @brief Construct a `category` with the specified `id` and `name`. + * + * The name `name` will be registered with `id`. + * + * Every unique value of `id` should only be named once. + * + * @param[in] id The category id to name + * @param[in] name The name to associated with `id` + */ + named_category(id_type id, char const* name) noexcept : category{id} { + nvtxDomainNameCategoryA(domain::get(), get_id(), name); + }; + + /** + * @brief Construct a `category` with the specified `id` and `name`. + * + * The name `name` will be registered with `id`. + * + * Every unique value of `id` should only be named once. + * + * @param[in] id The category id to name + * @param[in] name The name to associated with `id` + */ + named_category(id_type id, wchar_t const* name) noexcept : category{id} { + nvtxDomainNameCategoryW(domain::get(), get_id(), name); + }; +}; + +/** + * @brief A message registered with NVTX. 
+ * + * Normally, associating a `message` with an NVTX event requires copying the + * contents of the message string. This may cause non-trivial overhead in + * highly performance sensitive regions of code. + * + * message registration is an optimization to lower the overhead of + * associating a message with an NVTX event. Registering a message yields a + * handle that is inexpensive to copy that may be used in place of a message + * string. + * + * A particular message should only be registered once and the handle + * reused throughout the rest of the application. This can be done by either + * explicitly creating static `registered_message` objects, or using the + * `registered_message::get` construct on first use helper (recommended). + * + * Example: + * \code{.cpp} + * // Explicitly constructed, static `registered_message` + * static registered_message static_message{"message"}; + * + * // "message" is associated with the range `r` + * nvtx3::thread_range r{static_message}; + * + * // Or use construct on first use: + * + * // Define a type with a `message` member that defines the contents of the + * // registered message + * struct my_message{ static constexpr char const* message{ "my message" }; }; + * + * // Uses construct on first use to register the contents of + * // `my_message::message` + * auto msg = registered_message::get(); + * + * // "my message" is associated with the range `r` + * nvtx3::thread_range r{msg}; + * \endcode + * + * `registered_message`s are local to a particular domain specified via + * the type `D`. + * + * @tparam D Type containing `name` member used to identify the `domain` to + * which the `registered_message` belongs. Else, `domain::global` to indicate + * that the global NVTX domain should be used. + */ +template +class registered_message { + public: + /** + * @brief Returns a global instance of a `registered_message` as a function + * local static. 
+ * + * Provides a convenient way to register a message with NVTX without having + * to explicitly register the message. + * + * Upon first invocation, constructs a `registered_message` whose contents + * are specified by `message::message`. + * + * All future invocations will return a reference to the object constructed + * in the first invocation. + * + * Example: + * \code{.cpp} + * // Define a type with a `message` member that defines the contents of the + * // registered message + * struct my_message{ static constexpr char const* message{ "my message" }; + * }; + * + * // Uses construct on first use to register the contents of + * // `my_message::message` + * auto msg = registered_message::get(); + * + * // "my message" is associated with the range `r` + * nvtx3::thread_range r{msg}; + * \endcode + * + * @tparam M Type required to contain a member `M::message` that + * resolves to either a `char const*` or `wchar_t const*` used as the + * registered message's contents. + * @return Reference to a `registered_message` associated with the type `M`. + */ + template + static registered_message const& get() noexcept { + static registered_message const registered_message{M::message}; + return registered_message; + } + + /** + * @brief Constructs a `registered_message` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_message(char const* msg) noexcept + : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} {} + + /** + * @brief Constructs a `registered_message` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. 
+ * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_message(std::string const& msg) noexcept + : registered_message{msg.c_str()} {} + + /** + * @brief Constructs a `registered_message` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_message(wchar_t const* msg) noexcept + : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} {} + + /** + * @brief Constructs a `registered_message` from the specified `msg` string. + * + * Registers `msg` with NVTX and associates a handle with the registered + * message. + * + * A particular message should only be registered once and the handle + * reused throughout the rest of the application. + * + * @param msg The contents of the message + */ + explicit registered_message(std::wstring const& msg) noexcept + : registered_message{msg.c_str()} {} + + /** + * @brief Returns the registered message's handle + * + */ + nvtxStringHandle_t get_handle() const noexcept { return handle_; } + + registered_message() = delete; + ~registered_message() = default; + registered_message(registered_message const&) = default; + registered_message& operator=(registered_message const&) = default; + registered_message(registered_message&&) = default; + registered_message& operator=(registered_message&&) = default; + + private: + nvtxStringHandle_t const handle_{}; ///< The handle returned from + ///< registering the message with NVTX +}; + +/** + * @brief Allows associating a message string with an NVTX event via + * its `EventAttribute`s. 
+ *
+ * Associating a `message` with an NVTX event through its `event_attributes`
+ * allows for naming events to easily differentiate them from other events.
+ *
+ * Every time an NVTX event is created with an associated `message`, the
+ * contents of the message string must be copied. This may cause non-trivial
+ * overhead in highly performance sensitive sections of code. Use of a
+ * `nvtx3::registered_message` is recommended in these situations.
+ *
+ * Example:
+ * \code{.cpp}
+ * // Creates an `event_attributes` with message "message 0"
+ * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}};
+ *
+ * // `range0` contains message "message 0"
+ * nvtx3::thread_range range0{attr0};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // Creates an `event_attributes` with message "message 1"
+ * nvtx3::event_attributes attr1{"message 1"};
+ *
+ * // `range1` contains message "message 1"
+ * nvtx3::thread_range range1{attr1};
+ *
+ * // `range2` contains message "message 2"
+ * nvtx3::thread_range range2{nvtx3::message{"message 2"}};
+ *
+ * // `std::string` and string literals are implicitly assumed to be
+ * // the contents of an `nvtx3::message`
+ * // `range3` contains message "message 3"
+ * nvtx3::thread_range range3{"message 3"};
+ * \endcode
+ */
+class message {
+ public:
+  using value_type = nvtxMessageValue_t;
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`.
+   *
+   * @param msg The contents of the message
+   */
+  NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept
+      : type_{NVTX_MESSAGE_TYPE_ASCII} {
+    value_.ascii = msg;
+  }
+
+  /**
+   * @brief Construct a `message` whose contents are specified by `msg`. 
+ * + * @param msg The contents of the message + */ + message(std::string const& msg) noexcept : message{msg.c_str()} {} + + /** + * @brief Disallow construction for `std::string` r-value + * + * `message` is a non-owning type and therefore cannot take ownership of an + * r-value. Therefore, constructing from an r-value is disallowed to prevent + * a dangling pointer. + * + */ + message(std::string&&) = delete; + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept + : type_{NVTX_MESSAGE_TYPE_UNICODE} { + value_.unicode = msg; + } + + /** + * @brief Construct a `message` whose contents are specified by `msg`. + * + * @param msg The contents of the message + */ + message(std::wstring const& msg) noexcept : message{msg.c_str()} {} + + /** + * @brief Disallow construction for `std::wstring` r-value + * + * `message` is a non-owning type and therefore cannot take ownership of an + * r-value. Therefore, constructing from an r-value is disallowed to prevent + * a dangling pointer. + * + */ + message(std::wstring&&) = delete; + + /** + * @brief Construct a `message` from a `registered_message`. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `registered_message` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + * @param msg The message that has already been registered with NVTX. + */ + template + NVTX3_RELAXED_CONSTEXPR message(registered_message const& msg) noexcept + : type_{NVTX_MESSAGE_TYPE_REGISTERED} { + value_.registered = msg.get_handle(); + } + + /** + * @brief Return the union holding the value of the message. + * + */ + NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { + return value_; + } + + /** + * @brief Return the type information about the value the union holds. 
+ * + */ + NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { + return type_; + } + + private: + nvtxMessageType_t const type_{}; ///< message type + nvtxMessageValue_t value_{}; ///< message contents +}; + +/** + * @brief A numerical value that can be associated with an NVTX event via + * its `event_attributes`. + * + * Example: + * ``` + * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload + * from + * // the `int32_t` value 42 + * + * // `range0` will have an int32_t payload of 42 + * nvtx3::thread_range range0{attr}; + * + * // range1 has double payload of 3.14 + * nvtx3::thread_range range1{ nvtx3::payload{3.14} }; + * ``` + */ +class payload { + public: + using value_type = typename nvtxEventAttributes_v2::payload_t; + + /** + * @brief Construct a `payload` from a signed, 8 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} { + value_.llValue = value; + } + + /** + * @brief Construct a `payload` from a signed, 4 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} { + value_.iValue = value; + } + + /** + * @brief Construct a `payload` from an unsigned, 8 byte integer. + * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} { + value_.ullValue = value; + } + + /** + * @brief Construct a `payload` from an unsigned, 4 byte integer. 
+ * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} { + value_.uiValue = value; + } + + /** + * @brief Construct a `payload` from a single-precision floating point + * value. + * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept + : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} { + value_.fValue = value; + } + + /** + * @brief Construct a `payload` from a double-precision floating point + * value. + * + * @param value Value to use as contents of the payload + */ + NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept + : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} { + value_.dValue = value; + } + + /** + * @brief Return the union holding the value of the payload + * + */ + NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { + return value_; + } + + /** + * @brief Return the information about the type the union holds. + * + */ + NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { + return type_; + } + + private: + nvtxPayloadType_t const type_; ///< Type of the payload value + value_type value_; ///< Union holding the payload value +}; + +/** + * @brief Describes the attributes of a NVTX event. + * + * NVTX events can be customized via four "attributes": + * + * - color: color used to visualize the event in tools such as Nsight + * Systems. See `color`. + * - message: Custom message string. See `message`. + * - payload: User-defined numerical value. See `payload`. + * - category: Intra-domain grouping. See `category`. + * + * These component attributes are specified via an `event_attributes` object. + * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and + * `nvtx3::category` for how these individual attributes are constructed. 
+ * + * While it is possible to specify all four attributes, it is common to want + * to only specify a subset of attributes and use default values for the + * others. For convenience, `event_attributes` can be constructed from any + * number of attribute components in any order. + * + * Example: + * \code{.cpp} + * event_attributes attr{}; // No arguments, use defaults for all attributes + * + * event_attributes attr{"message"}; // Custom message, rest defaulted + * + * // Custom color & message + * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; + * + * /// Custom color & message, can use any order of arguments + * event_attributes attr{nvtx3::rgb{127, 255, 0}, "message"}; + * + * + * // Custom color, message, payload, category + * event_attributes attr{nvtx3::rgb{127, 255, 0}, + * "message", + * nvtx3::payload{42}, + * nvtx3::category{1}}; + * + * // Custom color, message, payload, category, can use any order of arguments + * event_attributes attr{nvtx3::payload{42}, + * nvtx3::category{1}, + * "message", + * nvtx3::rgb{127, 255, 0}}; + * + * // Multiple arguments of the same type are allowed, but only the first is + * // used. All others are ignored + * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload + * is 42 + * + * // Range `r` will be customized according the attributes in `attr` + * nvtx3::thread_range r{attr}; + * + * // For convenience, the arguments that can be passed to the + * `event_attributes` + * // constructor may be passed to the `domain_thread_range` contructor where + * // they will be forwarded to the `EventAttribute`s constructor + * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; + * \endcode + * + */ +class event_attributes { + public: + using value_type = nvtxEventAttributes_t; + + /** + * @brief Default constructor creates an `event_attributes` with no + * category, color, payload, nor message. 
+ */ + constexpr event_attributes() noexcept + : attributes_{ + NVTX_VERSION, // version + sizeof(nvtxEventAttributes_t), // size + 0, // category + NVTX_COLOR_UNKNOWN, // color type + 0, // color value + NVTX_PAYLOAD_UNKNOWN, // payload type + 0, // payload value (union) + NVTX_MESSAGE_UNKNOWN, // message type + 0 // message value (union) + } {} + + /** + * @brief Variadic constructor where the first argument is a `category`. + * + * Sets the value of the `EventAttribute`s category based on `c` and + * forwards the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_RELAXED_CONSTEXPR explicit event_attributes( + category const& c, Args const&... args) noexcept + : event_attributes(args...) { + attributes_.category = c.get_id(); + } + + /** + * @brief Variadic constructor where the first argument is a `color`. + * + * Sets the value of the `EventAttribute`s color based on `c` and forwards + * the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_RELAXED_CONSTEXPR explicit event_attributes( + color const& c, Args const&... args) noexcept + : event_attributes(args...) { + attributes_.color = c.get_value(); + attributes_.colorType = c.get_type(); + } + + /** + * @brief Variadic constructor where the first argument is a `payload`. + * + * Sets the value of the `EventAttribute`s payload based on `p` and forwards + * the remaining variadic parameter pack to the next constructor. + * + */ + template + NVTX3_RELAXED_CONSTEXPR explicit event_attributes( + payload const& p, Args const&... args) noexcept + : event_attributes(args...) { + attributes_.payload = p.get_value(); + attributes_.payloadType = p.get_type(); + } + + /** + * @brief Variadic constructor where the first argument is a `message`. + * + * Sets the value of the `EventAttribute`s message based on `m` and forwards + * the remaining variadic parameter pack to the next constructor. 
+ * + */ + template + NVTX3_RELAXED_CONSTEXPR explicit event_attributes( + message const& m, Args const&... args) noexcept + : event_attributes(args...) { + attributes_.message = m.get_value(); + attributes_.messageType = m.get_type(); + } + + ~event_attributes() = default; + event_attributes(event_attributes const&) = default; + event_attributes& operator=(event_attributes const&) = default; + event_attributes(event_attributes&&) = default; + event_attributes& operator=(event_attributes&&) = default; + + /** + * @brief Get raw pointer to underlying NVTX attributes object. + * + */ + constexpr value_type const* get() const noexcept { return &attributes_; } + + private: + value_type attributes_{}; ///< The NVTX attributes structure +}; + +/** + * @brief A RAII object for creating a NVTX range local to a thread within a + * domain. + * + * When constructed, begins a nested NVTX range on the calling thread in the + * specified domain. Upon destruction, ends the NVTX range. + * + * Behavior is undefined if a `domain_thread_range` object is + * created/destroyed on different threads. + * + * `domain_thread_range` is neither moveable nor copyable. + * + * `domain_thread_range`s may be nested within other ranges. + * + * The domain of the range is specified by the template type parameter `D`. + * By default, the `domain::global` is used, which scopes the range to the + * global NVTX domain. The convenience alias `thread_range` is provided for + * ranges scoped to the global domain. + * + * A custom domain can be defined by creating a type, `D`, with a static + * member `D::name` whose value is used to name the domain associated with + * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` + * + * Example: + * ``` + * // Define a type `my_domain` with a member `name` used to name the domain + * // associated with the type `my_domain`. 
+ * struct my_domain{ + * static constexpr const char * name{"my domain"}; + * }; + * ``` + * + * Usage: + * ``` + * nvtx3::domain_thread_range<> r0{"range 0"}; // Range in global domain + * + * nvtx3::thread_range r1{"range 1"}; // Alias for range in global domain + * + * nvtx3::domain_thread_range r2{"range 2"}; // Range in custom + * domain + * + * // specify an alias to a range that uses a custom domain + * using my_thread_range = nvtx3::domain_thread_range; + * + * my_thread_range r3{"range 3"}; // Alias for range in custom domain + * ``` + */ +template +class domain_thread_range { + public: + /** + * @brief Construct a `domain_thread_range` with the specified + * `event_attributes` + * + * Example: + * ``` + * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; + * nvtx3::domain_thread_range<> range{attr}; // Creates a range with message + * contents + * // "msg" and green color + * ``` + * + * @param[in] attr `event_attributes` that describes the desired attributes + * of the range. + */ + explicit domain_thread_range(event_attributes const& attr) noexcept { + nvtxDomainRangePushEx(domain::get(), attr.get()); + } + + /** + * @brief Constructs a `domain_thread_range` from the constructor arguments + * of an `event_attributes`. + * + * Forwards the arguments `first, args...` to construct an + * `event_attributes` object. The `event_attributes` object is then + * associated with the `domain_thread_range`. + * + * For more detail, see `event_attributes` documentation. + * + * Example: + * ``` + * // Creates a range with message "message" and green color + * nvtx3::domain_thread_range<> r{"message", nvtx3::rgb{127,255,0}}; + * ``` + * + * @note To prevent making needless copies of `event_attributes` objects, + * this constructor is disabled when the first argument is an + * `event_attributes` object, instead preferring the explicit + * `domain_thread_range(event_attributes const&)` constructor. 
+ * + * @param[in] first First argument to forward to the `event_attributes` + * constructor. + * @param[in] args Variadic parameter pack of additional arguments to + * forward. + * + */ + template >::value>> + explicit domain_thread_range(First const& first, Args const&... args) noexcept + : domain_thread_range{event_attributes{first, args...}} {} + + /** + * @brief Default constructor creates a `domain_thread_range` with no + * message, color, payload, nor category. + * + */ + domain_thread_range() : domain_thread_range{event_attributes{}} {} + + domain_thread_range(domain_thread_range const&) = delete; + domain_thread_range& operator=(domain_thread_range const&) = delete; + domain_thread_range(domain_thread_range&&) = delete; + domain_thread_range& operator=(domain_thread_range&&) = delete; + + /** + * @brief Destroy the domain_thread_range, ending the NVTX range event. + */ + ~domain_thread_range() noexcept { nvtxDomainRangePop(domain::get()); } +}; + +/** + * @brief Alias for a `domain_thread_range` in the global NVTX domain. + * + */ +using thread_range = domain_thread_range<>; + +/** + * @brief Handle used for correlating explicit range start and end events. + * + */ +struct range_handle { + /// Type used for the handle's value + using value_type = nvtxRangeId_t; + + /** + * @brief Construct a `range_handle` from the given id. + * + */ + constexpr range_handle(value_type id) noexcept : _range_id{id} {} + + /** + * @brief Returns the `range_handle`'s value + * + * @return value_type The handle's value + */ + constexpr value_type get_value() const noexcept { return _range_id; } + +private: + value_type _range_id{}; ///< The underlying NVTX range id +}; + +/** + * @brief Manually begin an NVTX range. + * + * Explicitly begins an NVTX range and returns a unique handle. To end the + * range, pass the handle to `end_range()`. + * + * `start_range/end_range` are the most explicit and lowest level APIs provided + * for creating ranges. 
Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * Example:
+ * ```
+ * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}};
+ * nvtx3::range_handle h = nvtx3::start_range(attr); // Manually begins a range
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * @tparam D Type containing `name` member used to identify the `domain`
+ * to which the range belongs. Else, `domain::global` to indicate that the
+ * global NVTX domain should be used.
+ * @param[in] attr `event_attributes` that describes the desired attributes
+ * of the range.
+ * @return Unique handle to be passed to `end_range` to end the range.
+ */
+template
+range_handle start_range(event_attributes const &attr) noexcept {
+  return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())};
+}
+
+/**
+ * @brief Manually begin an NVTX range.
+ *
+ * Explicitly begins an NVTX range and returns a unique handle. To end the
+ * range, pass the handle to `end_range()`.
+ *
+ * Forwards the arguments `first, args...` to construct an `event_attributes`
+ * object. The `event_attributes` object is then associated with the range.
+ *
+ * For more detail, see `event_attributes` documentation.
+ *
+ * Example:
+ * ```
+ * nvtx3::range_handle h = nvtx3::start_range("msg", nvtx3::rgb{127,255,0}); //
+ * Begin range
+ * ...
+ * nvtx3::end_range(h); // Ends the range
+ * ```
+ *
+ * `start_range/end_range` are the most explicit and lowest level APIs provided
+ * for creating ranges. Use of `nvtx3::domain_process_range` should be
+ * preferred unless one is unable to tie the range to the lifetime of an object.
+ *
+ * @param[in] first First argument to pass to an `event_attributes`
+ * @param[in] args Variadic parameter pack of the rest of the arguments for an
+ * `event_attributes`.
+ * @return Unique handle to be passed to `end_range` to end the range. 
+ */ +template >::value>> +range_handle start_range(First const &first, Args const &... args) noexcept { + return start_range(event_attributes{first, args...}); +} + +/** + * @brief Manually end the range associated with the handle `r`. + * + * Explicitly ends the NVTX range indicated by the handle `r` returned from a + * prior call to `start_range`. The range may end on a different thread from + * where it began. + * + * This function does not have a Domain tag type template parameter as the + * handle `r` already indicates the domain to which the range belongs. + * + * @param r Handle to a range started by a prior call to `start_range`. + */ +void end_range(range_handle r) { nvtxRangeEnd(r.get_value()); } + +/** + * @brief A RAII object for creating a NVTX range within a domain that can + * be created and destroyed on different threads. + * + * When constructed, begins a NVTX range in the specified domain. Upon + * destruction, ends the NVTX range. + * + * Similar to `nvtx3::domain_thread_range`, the only difference being that + * `domain_process_range` can start and end on different threads. + * + * Use of `nvtx3::domain_thread_range` should be preferred unless one needs + * the ability to start and end a range on different threads. + * + * `domain_process_range` is moveable, but not copyable. + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `domain_process_range` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + */ +template class domain_process_range { + public: + /** + * @brief Construct a new domain process range object + * + * @param attr + */ + explicit domain_process_range(event_attributes const &attr) noexcept + : handle_{start_range(attr)} {} + + /** + * @brief Construct a new domain process range object + * + * @param first + * @param args + */ + template >::value>> + explicit domain_process_range(First const &first, + Args const &... 
args) noexcept + : domain_process_range{event_attributes{first, args...}} {} + + /** + * @brief Construct a new domain process range object + * + */ + constexpr domain_process_range() noexcept + : domain_process_range{event_attributes{}} {} + + /** + * @brief Destroy the `domain_process_range` ending the range. + * + */ + ~domain_process_range() noexcept { + if (not moved_from_) { + end_range(handle_); + } + } + + /** + * @brief Move constructor allows taking ownership of the NVTX range from + * another `domain_process_range`. + * + * @param other + */ + domain_process_range(domain_process_range &&other) noexcept + : handle_{other.handle_} { + other.moved_from_ = true; + } + + /** + * @brief Move assignment operator allows taking ownership of an NVTX range + * from another `domain_process_range`. + * + * @param other + * @return domain_process_range& + */ + domain_process_range &operator=(domain_process_range &&other) noexcept { + handle_ = other.handle_; + other.moved_from_ = true; + } + + /// Copy construction is not allowed to prevent multiple objects from owning + /// the same range handle + domain_process_range(domain_process_range const &) = delete; + + /// Copy assignment is not allowed to prevent multiple objects from owning the + /// same range handle + domain_process_range &operator=(domain_process_range const &) = delete; + + private: + range_handle handle_; ///< Range handle used to correlate + ///< the start/end of the range + bool moved_from_{false}; ///< Indicates if the object has had + ///< it's contents moved from it, + ///< indicating it should not attempt + ///< to end the NVTX range. +}; + +/** + * @brief Alias for a `domain_process_range` in the global NVTX domain. + * + */ +using process_range = domain_process_range<>; + +/** + * @brief Annotates an instantaneous point in time with the attributes specified + * by `attr`. + * + * Unlike a "range", a mark is an instantaneous event in an application, e.g., + * locking/unlocking a mutex. 
+ * + * \code{.cpp} + * std::mutex global_lock; + * void lock_mutex(){ + * global_lock.lock(); + * nvtx3::mark("lock_mutex"); + * } + * \endcode + * + * @tparam D Type containing `name` member used to identify the `domain` + * to which the `domain_process_range` belongs. Else, `domain::global` to + * indicate that the global NVTX domain should be used. + * @param[in] attr `event_attributes` that describes the desired attributes + * of the mark. + */ +template +inline void mark(event_attributes const& attr) noexcept { + nvtxDomainMarkEx(domain::get(), attr.get()); +} + +} // namespace nvtx3 + +/** + * @brief Convenience macro for generating a range in the specified `domain` + * from the lifetime of a function + * + * This macro is useful for generating an NVTX range in `domain` from + * the entry point of a function to its exit. It is intended to be the first + * line of the function. + * + * Constructs a static `registered_message` using the name of the immediately + * enclosing function returned by `__func__` and constructs a + * `nvtx3::thread_range` using the registered function name as the range's + * message. + * + * Example: + * ``` + * struct my_domain{static constexpr char const* name{"my_domain"};}; + * + * void foo(...){ + * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() + * // do stuff + * ... + * } // Range ends on return from foo() + * ``` + * + * @param[in] D Type containing `name` member used to identify the + * `domain` to which the `registered_message` belongs. Else, + * `domain::global` to indicate that the global NVTX domain should be used. + */ +#define NVTX3_FUNC_RANGE_IN(D) \ + static ::nvtx3::registered_message const nvtx3_func_name__{__func__}; \ + static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; + +/** + * @brief Convenience macro for generating a range in the global domain from the + * lifetime of a function. 
+ * + * This macro is useful for generating an NVTX range in the global domain from + * the entry point of a function to its exit. It is intended to be the first + * line of the function. + * + * Constructs a static `registered_message` using the name of the immediately + * enclosing function returned by `__func__` and constructs a + * `nvtx3::thread_range` using the registered function name as the range's + * message. + * + * Example: + * ``` + * void foo(...){ + * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() + * // do stuff + * ... + * } // Range ends on return from foo() + * ``` + */ +#define NVTX3_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(::nvtx3::domain::global) \ No newline at end of file diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 267910b43..01399a610 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -99,14 +99,15 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Type of atomic counters - using view_type = typename static_map::device_view; ///< Device view type - using mutable_view_type = typename static_map::device_mutable_view; - ///< Device mutable view type - + using value_type = cuco::pair_type; + using key_type = Key; + using mapped_type = Value; + using atomic_ctr_type = cuda::atomic; + using view_type = typename static_map::device_view; + using mutable_view_type = typename static_map::device_mutable_view; + using counter_allocator_type = + typename std::allocator_traits::rebind_alloc; + dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; @@ -286,6 +287,7 @@ class dynamic_map { std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< 
number of successfully inserted keys on insert Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` std::vector submap_num_successes_; }; From 7eac9d1ffe79ccf5256aebfe60523db1c223e702 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 6 Apr 2022 10:46:25 -0700 Subject: [PATCH 009/152] more efficient block reduce --- benchmarks/hash_table/dynamic_map_bench.cu | 111 +++++++++++++++++++- include/cuco/detail/dynamic_map.inl | 2 +- include/cuco/detail/dynamic_map_kernels.cuh | 8 +- include/cuco/dynamic_map.cuh | 5 +- 4 files changed, 117 insertions(+), 9 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index c0306f901..e6c29ede6 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) static void gen_final_size(benchmark::internal::Benchmark* b) { - for (auto size = 10'000'000; size <= 10'000'000; size += 20'000'000) { + for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) { b->Args({size}); } } @@ -142,6 +142,43 @@ static void BM_dynamic_search_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_dynamic_search_none(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i] + num_keys; + Value val = h_keys[i] + num_keys; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + + map_type 
map{initial_size, + cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map.insert(d_pairs.begin(), d_pairs.end()); + + for (auto _ : state) { + cuda_event_timer raii{state}; + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + template static void BM_dynamic_erase_all(::benchmark::State& state) { @@ -186,21 +223,66 @@ static void BM_dynamic_erase_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_dynamic_erase_none(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (auto i = 0; i < num_keys; ++i) { + Key key = h_keys[i] + num_keys; + Value val = h_keys[i] + num_keys; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_keys(h_keys); + + std::size_t batch_size = 1E6; + for (auto _ : state) { + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; + for (auto i = 0; i < num_keys; i += batch_size) { + map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); + } + { + cuda_event_timer raii{state}; + map.erase(d_keys.begin(), d_keys.end()); + //for (auto i = 0; i < num_keys; i += batch_size) { + // map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); + //} + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} +/* BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -/* + BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, 
dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); */ + BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - /* BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) @@ -221,7 +303,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +*/ +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); +/* BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -231,6 +318,22 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); +*/ + +BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); +/* +BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); +/* +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 2bb1459bc..7b3ac6ec1 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -220,7 +220,7 @@ void dynamic_map::erase(InputIt first, submaps_.size(), hash, key_equal); - + std::size_t h_num_successes; CUCO_CUDA_TRY(cudaMemcpy( 
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index c5605d463..fcfa8c921 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -234,12 +234,16 @@ __global__ void erase(InputIt first, } std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } // update submap thread counts for(int i = 0; i < num_submaps; ++i) { std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); - if(threadIdx.x == 0) {*submap_num_successes[i] += submap_block_num_successes; } + if(threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(submap_block_num_successes, cuda::std::memory_order_relaxed); + } } } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 01399a610..7ac06d61a 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -16,11 +16,12 @@ #pragma once -#include + #include #include #include - +#include +#include #include #include From 4d10631e2d2cc14136d8a497b7f197b911a0f32d Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 6 Apr 2022 11:46:44 -0700 Subject: [PATCH 010/152] doc changes --- benchmarks/hash_table/dynamic_map_bench.cu | 20 ++++++++++---------- include/cuco/detail/dynamic_map.inl | 12 ++++++------ include/cuco/detail/dynamic_map_kernels.cuh | 3 +++ include/cuco/dynamic_map.cuh | 3 +-- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index e6c29ede6..46eb0aa18 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu 
@@ -258,9 +258,6 @@ static void BM_dynamic_erase_none(::benchmark::State& state) { cuda_event_timer raii{state}; map.erase(d_keys.begin(), d_keys.end()); - //for (auto i = 0; i < num_keys; i += batch_size) { - // map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); - //} } } @@ -277,13 +274,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -*/ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -/* + BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -303,12 +299,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -*/ + BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -/* + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -318,14 +314,18 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -*/ BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -/* -BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIQUE) + +BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); +*/ +BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); diff --git 
a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 7b3ac6ec1..d0d4f0ba7 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -14,8 +14,6 @@ * limitations under the License. */ -//#include "nvtx3.hpp" - namespace cuco { template @@ -40,7 +38,6 @@ dynamic_map::dynamic_map(std::size_t initial_capac alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->get_num_successes()); num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); @@ -130,8 +127,6 @@ void dynamic_map::insert(InputIt first, Hash hash, KeyEqual key_equal) { - //nvtx3::thread_range r{"insert"}; - std::size_t num_to_insert = std::distance(first, last); reserve(size_ + num_to_insert); @@ -185,7 +180,6 @@ void dynamic_map::erase(InputIt first, Hash hash, KeyEqual key_equal) { - //nvtx3::thread_range r{"erase"}; std::size_t num_keys = std::distance(first, last); auto const block_size = 128; @@ -197,15 +191,18 @@ void dynamic_map::erase(InputIt first, static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); + // zero out submap success counters static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); for(int i = 0; i < submaps_.size(); ++i) { CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } // TODO: hacky, improve this + // provide device-accessible vector for each submap num_successes variable thrust::device_vector d_submap_num_successes(submap_num_successes_); // TODO: hack (how to get size on host?) 
+ // use dynamic shared memory to hold block reduce space for each submap's erases constexpr size_t temp_storage_size_one_block = 48; auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block; @@ -221,11 +218,14 @@ void dynamic_map::erase(InputIt first, hash, key_equal); + // update total dynamic map size std::size_t h_num_successes; CUCO_CUDA_TRY(cudaMemcpy( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); size_ -= h_num_successes; + // TODO: if only one submap, skip this step + // update each submap's size for(int i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; CUCO_CUDA_TRY(cudaMemcpy( diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index fcfa8c921..e54b51586 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -210,6 +210,8 @@ __global__ void erase(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; + + // TODO: find permanent solution (only works for four submaps) std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; auto tile = cg::tiled_partition(cg::this_thread_block()); @@ -238,6 +240,7 @@ __global__ void erase(InputIt first, num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); } + // TODO: if there's only one submap, skip this step // update submap thread counts for(int i = 0; i < num_submaps; ++i) { std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 7ac06d61a..977b00de0 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -287,10 +287,9 @@ class dynamic_map { submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert 
atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert + std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` - - std::vector submap_num_successes_; }; } // namespace cuco From b59a16b83b84497262546e7ba0215c042b703c1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Apr 2022 18:52:40 +0000 Subject: [PATCH 011/152] [pre-commit.ci] auto code formatting --- benchmarks/hash_table/dynamic_map_bench.cu | 29 +- benchmarks/hash_table/static_map_bench.cu | 15 +- include/cuco/detail/dynamic_map.inl | 76 ++--- include/cuco/detail/dynamic_map_kernels.cuh | 28 +- include/cuco/detail/nvtx3.hpp | 294 +++++++++++--------- include/cuco/dynamic_map.cuh | 45 +-- include/cuco/static_map.cuh | 5 +- tests/dynamic_map/erase_test.cu | 77 +++-- 8 files changed, 295 insertions(+), 274 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 46eb0aa18..222699abb 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -86,13 +86,8 @@ static void BM_dynamic_insert(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { -<<<<<<< HEAD map_type map{ initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; -======= - map_type map{initial_size, - cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; ->>>>>>> prevent implicit type conversion of sentinels during construction { cuda_event_timer raii{state}; for (std::size_t i = 0; i < num_keys; i += batch_size) { @@ -129,8 +124,8 @@ static void BM_dynamic_search_all(::benchmark::State& state) thrust::device_vector> d_pairs(h_pairs); thrust::device_vector 
d_results(num_keys); - map_type map{initial_size, - cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{ + initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); for (auto _ : state) { @@ -166,8 +161,8 @@ static void BM_dynamic_search_none(::benchmark::State& state) thrust::device_vector> d_pairs(h_pairs); thrust::device_vector d_results(num_keys); - map_type map{initial_size, - cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{ + initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); for (auto _ : state) { @@ -204,10 +199,10 @@ static void BM_dynamic_erase_all(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { - map_type map{initial_size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; for (auto i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); } @@ -248,10 +243,10 @@ static void BM_dynamic_erase_none(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { - map_type map{initial_size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + map_type map{initial_size, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; for (auto i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); } diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 363899a46..1e69c0c4e 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -180,9 +180,10 @@ 
static void BM_static_map_search_none(::benchmark::State& state) h_pairs[i].first = key; h_pairs[i].second = val; } - + // diff keys - for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys; + for (int i = 0; i < num_keys; ++i) + h_keys[i] += num_keys; thrust::device_vector d_keys(h_keys); thrust::device_vector d_results(num_keys); @@ -269,23 +270,23 @@ static void BM_static_map_erase_none(::benchmark::State& state) h_pairs[i].second = val; } - // diff keys - for(int i = 0; i < num_keys; ++i) h_keys[i] += num_keys; + for (int i = 0; i < num_keys; ++i) + h_keys[i] += num_keys; thrust::device_vector d_keys(h_keys); thrust::device_vector d_results(num_keys); thrust::device_vector> d_pairs(h_pairs); for (auto _ : state) { - //state.ResumeTiming(); + // state.ResumeTiming(); state.PauseTiming(); map.insert(d_pairs.begin(), d_pairs.end()); state.ResumeTiming(); map.erase(d_keys.begin(), d_keys.end()); - - //state.PauseTiming(); + + // state.PauseTiming(); } state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index d0d4f0ba7..8be714c3d 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -17,10 +17,11 @@ namespace cuco { template -dynamic_map::dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) +dynamic_map::dynamic_map( + std::size_t initial_capacity, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + Allocator const& alloc) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), erased_key_sentinel_(empty_key_sentinel.value), @@ -39,16 +40,17 @@ dynamic_map::dynamic_map(std::size_t initial_capac submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); 
submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - + num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } template -dynamic_map::dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, - Allocator const& alloc) +dynamic_map::dynamic_map( + std::size_t initial_capacity, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + sentinel::erased_key erased_key_sentinel, + Allocator const& alloc) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), erased_key_sentinel_(erased_key_sentinel.value), @@ -72,7 +74,6 @@ dynamic_map::dynamic_map(std::size_t initial_capac num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } - template dynamic_map::~dynamic_map() { @@ -94,7 +95,7 @@ void dynamic_map::reserve(std::size_t n) // if the submap does not exist yet, create it else { submap_capacity = capacity_; - if(erased_key_sentinel_ != empty_key_sentinel_) { + if (erased_key_sentinel_ != empty_key_sentinel_) { submaps_.push_back(std::make_unique>( submap_capacity, sentinel::empty_key{empty_key_sentinel_}, @@ -142,7 +143,7 @@ void dynamic_map::insert(InputIt first, // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); - + auto n = std::min(capacity_remaining, num_to_insert); auto const block_size = 128; auto const stride = 1; @@ -176,9 +177,9 @@ void dynamic_map::insert(InputIt first, template template void dynamic_map::erase(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) + InputIt last, + Hash hash, + KeyEqual key_equal) { std::size_t num_keys = std::distance(first, last); @@ -190,13 +191,13 @@ void dynamic_map::erase(InputIt first, // TODO: memset an atomic variable is unsafe 
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); - + // zero out submap success counters static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - for(int i = 0; i < submaps_.size(); ++i) { + for (int i = 0; i < submaps_.size(); ++i) { CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } - + // TODO: hacky, improve this // provide device-accessible vector for each submap num_successes variable thrust::device_vector d_submap_num_successes(submap_num_successes_); @@ -204,32 +205,33 @@ void dynamic_map::erase(InputIt first, // TODO: hack (how to get size on host?) // use dynamic shared memory to hold block reduce space for each submap's erases constexpr size_t temp_storage_size_one_block = 48; - auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block; - + auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block; + detail::erase> - <<>>( - first, - first + num_keys, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - num_successes_, - d_submap_num_successes.data().get(), - submaps_.size(), - hash, - key_equal); + <<>>(first, + first + num_keys, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + num_successes_, + d_submap_num_successes.data().get(), + submaps_.size(), + hash, + key_equal); // update total dynamic map size std::size_t h_num_successes; - CUCO_CUDA_TRY(cudaMemcpy( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY( + cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); size_ -= h_num_successes; - + // TODO: if only one submap, skip this step // update each submap's size - for(int i = 0; i < submaps_.size(); ++i) { + for (int i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpy( - &h_submap_num_successes, 
submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost)); submaps_[i]->size_ -= h_submap_num_successes; } } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index e54b51586..61f32bda7 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -196,14 +196,14 @@ template __global__ void erase(InputIt first, - InputIt last, - viewT* submap_views, - mutableViewT* submap_mutable_views, - atomicT* num_successes, - atomicT** submap_num_successes, - const uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, + atomicT* num_successes, + atomicT** submap_num_successes, + const uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[]; @@ -219,7 +219,7 @@ __global__ void erase(InputIt first, auto it = first + tid / tile_size; while (it < last) { - auto erased = false; + auto erased = false; // manually check for duplicates in those submaps we are not inserting into int i; @@ -242,10 +242,12 @@ __global__ void erase(InputIt first, // TODO: if there's only one submap, skip this step // update submap thread counts - for(int i = 0; i < num_submaps; ++i) { - std::size_t submap_block_num_successes = BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); - if(threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(submap_block_num_successes, cuda::std::memory_order_relaxed); + for (int i = 0; i < num_submaps; ++i) { + std::size_t submap_block_num_successes = + BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); + if (threadIdx.x == 0) { + 
submap_num_successes[i]->fetch_add(submap_block_num_successes, + cuda::std::memory_order_relaxed); } } } diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp index 08a02153b..075c6e5d4 100644 --- a/include/cuco/detail/nvtx3.hpp +++ b/include/cuco/detail/nvtx3.hpp @@ -17,7 +17,7 @@ #if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0 #error \ - "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." + "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." #endif /** @@ -219,7 +219,7 @@ * * `nvtx3::mark` allows annotating an instantaneous event in an application's * timeline. For example, indicating when a mutex is locked or unlocked. - * + * * \code{.cpp} * std::mutex global_lock; * void lock_mutex(){ @@ -526,11 +526,10 @@ namespace detail { * `const wchar_t*`. 
*/ template -constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) { - return (std::is_same::type>::value or - std::is_same::type>::value); +constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) +{ + return (std::is_same::type>::value or + std::is_same::type>::value); } } // namespace detail @@ -592,7 +591,7 @@ class domain { public: domain(domain const&) = delete; domain& operator=(domain const&) = delete; - domain(domain&&) = delete; + domain(domain&&) = delete; domain& operator=(domain&&) = delete; /** @@ -640,7 +639,8 @@ class domain { * @return Reference to the `domain` corresponding to the type `DomainName`. */ template - static domain const& get() { + static domain const& get() + { static_assert(detail::has_name_member(), "Type used to identify a domain must contain a name member of" "type const char* or const wchar_t*"); @@ -667,7 +667,8 @@ class domain { * applications will be grouped together. * */ - struct global {}; + struct global { + }; private: /** @@ -678,8 +679,7 @@ class domain { * * @param name A unique name identifying the domain */ - explicit domain(char const* name) noexcept - : _domain{nvtxDomainCreateA(name)} {} + explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {} /** * @brief Construct a new domain with the specified `name`. @@ -689,8 +689,7 @@ class domain { * * @param name A unique name identifying the domain */ - explicit domain(wchar_t const* name) noexcept - : _domain{nvtxDomainCreateW(name)} {} + explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {} /** * @brief Construct a new domain with the specified `name`. 
@@ -746,7 +745,8 @@ class domain { * */ template <> -inline domain const& domain::get() { +inline domain const& domain::get() +{ static domain const d{}; return d; } @@ -770,9 +770,10 @@ struct rgb { * @param green_ Value of the green channel * @param blue_ Value of the blue channel */ - constexpr rgb(component_type red_, component_type green_, - component_type blue_) noexcept - : red{red_}, green{green_}, blue{blue_} {} + constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept + : red{red_}, green{green_}, blue{blue_} + { + } component_type const red{}; ///< Red channel value component_type const green{}; ///< Green channel value @@ -797,9 +798,13 @@ struct argb final : rgb { * @param blue_ Value of the blue channel * */ - constexpr argb(component_type alpha_, component_type red_, - component_type green_, component_type blue_) noexcept - : rgb{red_, green_, blue_}, alpha{alpha_} {} + constexpr argb(component_type alpha_, + component_type red_, + component_type green_, + component_type blue_) noexcept + : rgb{red_, green_, blue_}, alpha{alpha_} + { + } component_type const alpha{}; ///< Alpha channel value }; @@ -844,8 +849,9 @@ class color { * @param argb The alpha, red, green, blue components of the desired `color` */ constexpr color(argb argb) noexcept - : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, - argb.blue)} {} + : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, argb.blue)} + { + } /** * @brief Construct a `color` using the red, green, blue components in @@ -856,7 +862,9 @@ class color { * @param rgb The red, green, blue components of the desired `color` */ constexpr color(rgb rgb) noexcept - : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} {} + : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} + { + } /** * @brief Returns the `color`s argb hex code @@ -870,11 +878,11 @@ class color { */ constexpr nvtxColorType_t get_type() const noexcept { return _type; } - 
color() = delete; - ~color() = default; + color() = delete; + ~color() = default; color(color const&) = default; color& operator=(color const&) = default; - color(color&&) = default; + color(color&&) = default; color& operator=(color&&) = default; private: @@ -886,9 +894,9 @@ class color { constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3, uint8_t byte2, uint8_t byte1, - uint8_t byte0) noexcept { - return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | - uint32_t{byte1} << 8 | uint32_t{byte0}; + uint8_t byte0) noexcept + { + return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; } value_type const _value{}; ///< color's argb color code @@ -938,11 +946,11 @@ class category { */ constexpr id_type get_id() const noexcept { return id_; } - category() = delete; - ~category() = default; + category() = delete; + ~category() = default; category(category const&) = default; category& operator=(category const&) = default; - category(category&&) = default; + category(category&&) = default; category& operator=(category&&) = default; private: @@ -1036,7 +1044,8 @@ class named_category final : public category { * `char const*` or `wchar_t const*` and `C::id`. 
*/ template - static named_category const& get() noexcept { + static named_category const& get() noexcept + { static_assert(detail::has_name_member(), "Type used to name a category must contain a name member."); static named_category const category{C::id, C::name}; @@ -1052,7 +1061,8 @@ class named_category final : public category { * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ - named_category(id_type id, char const* name) noexcept : category{id} { + named_category(id_type id, char const* name) noexcept : category{id} + { nvtxDomainNameCategoryA(domain::get(), get_id(), name); }; @@ -1066,7 +1076,8 @@ class named_category final : public category { * @param[in] id The category id to name * @param[in] name The name to associated with `id` */ - named_category(id_type id, wchar_t const* name) noexcept : category{id} { + named_category(id_type id, wchar_t const* name) noexcept : category{id} + { nvtxDomainNameCategoryW(domain::get(), get_id(), name); }; }; @@ -1154,7 +1165,8 @@ class registered_message { * @return Reference to a `registered_message` associated with the type `M`. */ template - static registered_message const& get() noexcept { + static registered_message const& get() noexcept + { static registered_message const registered_message{M::message}; return registered_message; } @@ -1171,7 +1183,9 @@ class registered_message { * @param msg The contents of the message */ explicit registered_message(char const* msg) noexcept - : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} {} + : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} + { + } /** * @brief Constructs a `registered_message` from the specified `msg` string. 
@@ -1184,8 +1198,7 @@ class registered_message { * * @param msg The contents of the message */ - explicit registered_message(std::string const& msg) noexcept - : registered_message{msg.c_str()} {} + explicit registered_message(std::string const& msg) noexcept : registered_message{msg.c_str()} {} /** * @brief Constructs a `registered_message` from the specified `msg` string. @@ -1199,7 +1212,9 @@ class registered_message { * @param msg The contents of the message */ explicit registered_message(wchar_t const* msg) noexcept - : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} {} + : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} + { + } /** * @brief Constructs a `registered_message` from the specified `msg` string. @@ -1212,8 +1227,7 @@ class registered_message { * * @param msg The contents of the message */ - explicit registered_message(std::wstring const& msg) noexcept - : registered_message{msg.c_str()} {} + explicit registered_message(std::wstring const& msg) noexcept : registered_message{msg.c_str()} {} /** * @brief Returns the registered message's handle @@ -1221,11 +1235,11 @@ class registered_message { */ nvtxStringHandle_t get_handle() const noexcept { return handle_; } - registered_message() = delete; - ~registered_message() = default; + registered_message() = delete; + ~registered_message() = default; registered_message(registered_message const&) = default; registered_message& operator=(registered_message const&) = default; - registered_message(registered_message&&) = default; + registered_message(registered_message&&) = default; registered_message& operator=(registered_message&&) = default; private: @@ -1279,8 +1293,8 @@ class message { * * @param msg The contents of the message */ - NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept - : type_{NVTX_MESSAGE_TYPE_ASCII} { + NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII} + { value_.ascii = msg; } @@ -1306,8 +1320,8 @@ class message { * * @param 
msg The contents of the message */ - NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept - : type_{NVTX_MESSAGE_TYPE_UNICODE} { + NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE} + { value_.unicode = msg; } @@ -1338,7 +1352,8 @@ class message { */ template NVTX3_RELAXED_CONSTEXPR message(registered_message const& msg) noexcept - : type_{NVTX_MESSAGE_TYPE_REGISTERED} { + : type_{NVTX_MESSAGE_TYPE_REGISTERED} + { value_.registered = msg.get_handle(); } @@ -1346,17 +1361,13 @@ class message { * @brief Return the union holding the value of the message. * */ - NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { - return value_; - } + NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; } /** * @brief Return the type information about the value the union holds. * */ - NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { - return type_; - } + NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { return type_; } private: nvtxMessageType_t const type_{}; ///< message type @@ -1390,7 +1401,8 @@ class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} + { value_.llValue = value; } @@ -1400,7 +1412,8 @@ class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} + { value_.iValue = value; } @@ -1410,7 +1423,8 @@ class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} + { value_.ullValue = value; } @@ -1420,7 +1434,8 @@ 
class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} + { value_.uiValue = value; } @@ -1431,7 +1446,8 @@ class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept - : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} + { value_.fValue = value; } @@ -1442,7 +1458,8 @@ class payload { * @param value Value to use as contents of the payload */ NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept - : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} { + : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} + { value_.dValue = value; } @@ -1450,17 +1467,13 @@ class payload { * @brief Return the union holding the value of the payload * */ - NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { - return value_; - } + NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; } /** * @brief Return the information about the type the union holds. * */ - NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { - return type_; - } + NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { return type_; } private: nvtxPayloadType_t const type_; ///< Type of the payload value @@ -1537,17 +1550,19 @@ class event_attributes { * category, color, payload, nor message. 
*/ constexpr event_attributes() noexcept - : attributes_{ - NVTX_VERSION, // version - sizeof(nvtxEventAttributes_t), // size - 0, // category - NVTX_COLOR_UNKNOWN, // color type - 0, // color value - NVTX_PAYLOAD_UNKNOWN, // payload type - 0, // payload value (union) - NVTX_MESSAGE_UNKNOWN, // message type - 0 // message value (union) - } {} + : attributes_{ + NVTX_VERSION, // version + sizeof(nvtxEventAttributes_t), // size + 0, // category + NVTX_COLOR_UNKNOWN, // color type + 0, // color value + NVTX_PAYLOAD_UNKNOWN, // payload type + 0, // payload value (union) + NVTX_MESSAGE_UNKNOWN, // message type + 0 // message value (union) + } + { + } /** * @brief Variadic constructor where the first argument is a `category`. @@ -1557,9 +1572,9 @@ class event_attributes { * */ template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes( - category const& c, Args const&... args) noexcept - : event_attributes(args...) { + NVTX3_RELAXED_CONSTEXPR explicit event_attributes(category const& c, Args const&... args) noexcept + : event_attributes(args...) + { attributes_.category = c.get_id(); } @@ -1571,10 +1586,10 @@ class event_attributes { * */ template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes( - color const& c, Args const&... args) noexcept - : event_attributes(args...) { - attributes_.color = c.get_value(); + NVTX3_RELAXED_CONSTEXPR explicit event_attributes(color const& c, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.color = c.get_value(); attributes_.colorType = c.get_type(); } @@ -1586,10 +1601,10 @@ class event_attributes { * */ template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes( - payload const& p, Args const&... args) noexcept - : event_attributes(args...) { - attributes_.payload = p.get_value(); + NVTX3_RELAXED_CONSTEXPR explicit event_attributes(payload const& p, Args const&... args) noexcept + : event_attributes(args...) 
+ { + attributes_.payload = p.get_value(); attributes_.payloadType = p.get_type(); } @@ -1601,17 +1616,17 @@ class event_attributes { * */ template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes( - message const& m, Args const&... args) noexcept - : event_attributes(args...) { - attributes_.message = m.get_value(); + NVTX3_RELAXED_CONSTEXPR explicit event_attributes(message const& m, Args const&... args) noexcept + : event_attributes(args...) + { + attributes_.message = m.get_value(); attributes_.messageType = m.get_type(); } - ~event_attributes() = default; + ~event_attributes() = default; event_attributes(event_attributes const&) = default; event_attributes& operator=(event_attributes const&) = default; - event_attributes(event_attributes&&) = default; + event_attributes(event_attributes&&) = default; event_attributes& operator=(event_attributes&&) = default; /** @@ -1689,7 +1704,8 @@ class domain_thread_range { * @param[in] attr `event_attributes` that describes the desired attributes * of the range. */ - explicit domain_thread_range(event_attributes const& attr) noexcept { + explicit domain_thread_range(event_attributes const& attr) noexcept + { nvtxDomainRangePushEx(domain::get(), attr.get()); } @@ -1720,11 +1736,14 @@ class domain_thread_range { * forward. * */ - template >::value>> + template >::value>> explicit domain_thread_range(First const& first, Args const&... 
args) noexcept - : domain_thread_range{event_attributes{first, args...}} {} + : domain_thread_range{event_attributes{first, args...}} + { + } /** * @brief Default constructor creates a `domain_thread_range` with no @@ -1735,7 +1754,7 @@ class domain_thread_range { domain_thread_range(domain_thread_range const&) = delete; domain_thread_range& operator=(domain_thread_range const&) = delete; - domain_thread_range(domain_thread_range&&) = delete; + domain_thread_range(domain_thread_range&&) = delete; domain_thread_range& operator=(domain_thread_range&&) = delete; /** @@ -1771,8 +1790,8 @@ struct range_handle { */ constexpr value_type get_value() const noexcept { return _range_id; } -private: - value_type _range_id{}; ///< The underlying NVTX range id + private: + value_type _range_id{}; ///< The underlying NVTX range id }; /** @@ -1801,7 +1820,8 @@ struct range_handle { * @return Unique handle to be passed to `end_range` to end the range. */ template -range_handle start_range(event_attributes const &attr) noexcept { +range_handle start_range(event_attributes const& attr) noexcept +{ return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; } @@ -1833,10 +1853,12 @@ range_handle start_range(event_attributes const &attr) noexcept { * `event_attributes`. * @return Unique handle to be passed to `end_range` to end the range. */ -template >::value>> -range_handle start_range(First const &first, Args const &... args) noexcept { +template >::value>> +range_handle start_range(First const& first, Args const&... args) noexcept +{ return start_range(event_attributes{first, args...}); } @@ -1873,15 +1895,17 @@ void end_range(range_handle r) { nvtxRangeEnd(r.get_value()); } * to which the `domain_process_range` belongs. Else, `domain::global` to * indicate that the global NVTX domain should be used. 
*/ -template class domain_process_range { +template +class domain_process_range { public: /** * @brief Construct a new domain process range object * * @param attr */ - explicit domain_process_range(event_attributes const &attr) noexcept - : handle_{start_range(attr)} {} + explicit domain_process_range(event_attributes const& attr) noexcept : handle_{start_range(attr)} + { + } /** * @brief Construct a new domain process range object @@ -1889,28 +1913,28 @@ template class domain_process_range { * @param first * @param args */ - template >::value>> - explicit domain_process_range(First const &first, - Args const &... args) noexcept - : domain_process_range{event_attributes{first, args...}} {} + template >::value>> + explicit domain_process_range(First const& first, Args const&... args) noexcept + : domain_process_range{event_attributes{first, args...}} + { + } /** * @brief Construct a new domain process range object * */ - constexpr domain_process_range() noexcept - : domain_process_range{event_attributes{}} {} + constexpr domain_process_range() noexcept : domain_process_range{event_attributes{}} {} /** * @brief Destroy the `domain_process_range` ending the range. 
* */ - ~domain_process_range() noexcept { - if (not moved_from_) { - end_range(handle_); - } + ~domain_process_range() noexcept + { + if (not moved_from_) { end_range(handle_); } } /** @@ -1919,8 +1943,8 @@ template class domain_process_range { * * @param other */ - domain_process_range(domain_process_range &&other) noexcept - : handle_{other.handle_} { + domain_process_range(domain_process_range&& other) noexcept : handle_{other.handle_} + { other.moved_from_ = true; } @@ -1931,23 +1955,24 @@ template class domain_process_range { * @param other * @return domain_process_range& */ - domain_process_range &operator=(domain_process_range &&other) noexcept { - handle_ = other.handle_; + domain_process_range& operator=(domain_process_range&& other) noexcept + { + handle_ = other.handle_; other.moved_from_ = true; } /// Copy construction is not allowed to prevent multiple objects from owning /// the same range handle - domain_process_range(domain_process_range const &) = delete; + domain_process_range(domain_process_range const&) = delete; /// Copy assignment is not allowed to prevent multiple objects from owning the /// same range handle - domain_process_range &operator=(domain_process_range const &) = delete; + domain_process_range& operator=(domain_process_range const&) = delete; private: - range_handle handle_; ///< Range handle used to correlate + range_handle handle_; ///< Range handle used to correlate ///< the start/end of the range - bool moved_from_{false}; ///< Indicates if the object has had + bool moved_from_{false}; ///< Indicates if the object has had ///< it's contents moved from it, ///< indicating it should not attempt ///< to end the NVTX range. @@ -1981,7 +2006,8 @@ using process_range = domain_process_range<>; * of the mark. 
*/ template -inline void mark(event_attributes const& attr) noexcept { +inline void mark(event_attributes const& attr) noexcept +{ nvtxDomainMarkEx(domain::get(), attr.get()); } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 977b00de0..35eb4898f 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -17,11 +17,13 @@ #pragma once +#include +#include +#include #include #include #include #include -#include #include #include @@ -100,26 +102,24 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; - using key_type = Key; - using mapped_type = Value; - using atomic_ctr_type = cuda::atomic; - using view_type = typename static_map::device_view; - using mutable_view_type = typename static_map::device_mutable_view; + using value_type = cuco::pair_type; + using key_type = Key; + using mapped_type = Value; + using atomic_ctr_type = cuda::atomic; + using view_type = typename static_map::device_view; + using mutable_view_type = typename static_map::device_mutable_view; using counter_allocator_type = typename std::allocator_traits::rebind_alloc; dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; - template - dynamic_map(std::size_t, T1, T2, - Allocator const& = Allocator{}) = delete; - - template - dynamic_map(std::size_t, T1, T2, T3, - Allocator const& = Allocator{}) = delete; - + template + dynamic_map(std::size_t, T1, T2, Allocator const& = Allocator{}) = delete; + + template + dynamic_map(std::size_t, T1, T2, T3, Allocator const& = Allocator{}) = delete; + dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; @@ -147,7 +147,7 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}); - + dynamic_map(std::size_t initial_capacity, sentinel::empty_key 
empty_key_sentinel, sentinel::empty_value empty_value_sentinel, @@ -188,7 +188,7 @@ class dynamic_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); - + template , typename KeyEqual = thrust::equal_to> @@ -276,9 +276,9 @@ class dynamic_map { key_type erased_key_sentinel_{}; // TODO: initialize this - std::size_t size_{}; ///< Number of keys in the map - std::size_t capacity_{}; ///< Maximum number of keys that can be inserted - float max_load_factor_{}; ///< Max load factor before capacity growth + std::size_t size_{}; ///< Number of keys in the map + std::size_t capacity_{}; ///< Maximum number of keys that can be inserted + float max_load_factor_{}; ///< Max load factor before capacity growth std::vector>> submaps_; ///< vector of pointers to each submap @@ -287,8 +287,9 @@ class dynamic_map { submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert - std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + std::vector + submap_num_successes_; ///< number of succesfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 3ef487b7c..f72ce41c5 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1414,10 +1414,7 @@ class static_map { sentinel::erased_key{erased_key_sentinel_}); } - atomic_ctr_type* get_num_successes() const noexcept - { - return 
num_successes_; - } + atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; } private: pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 385b2e426..64e4cce38 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -21,18 +21,16 @@ #include - -TEMPLATE_TEST_CASE_SIG( - "erase key", "", ((typename T), T), (int32_t)) +TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) { using Key = T; using Value = T; - + unsigned long num_keys = 1'000'000; - cuco::dynamic_map map{num_keys * 2, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::dynamic_map map{num_keys * 2, + cuco::sentinel::empty_key{-1}, + cuco::sentinel::empty_value{-1}, + cuco::sentinel::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -40,12 +38,11 @@ TEMPLATE_TEST_CASE_SIG( thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - + auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); - SECTION( - "Check basic insert/erase") + SECTION("Check basic insert/erase") { // ***************************************** // first, check single submap works properly @@ -55,7 +52,6 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(map.get_size() == num_keys); - map.erase(d_keys.begin(), d_keys.end()); // delete decreases count correctly @@ -68,7 +64,7 @@ TEMPLATE_TEST_CASE_SIG( d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); - // ensures that map is reusing deleted slots + // ensures that map is reusing deleted slots map.insert(pairs_begin, pairs_begin + num_keys); REQUIRE(map.get_size() == num_keys); @@ -76,63 +72,64 @@ TEMPLATE_TEST_CASE_SIG( map.contains(d_keys.begin(), d_keys.end(), 
d_keys_exist.begin()); REQUIRE(cuco::test::all_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); // erase can act selectively - map.erase(d_keys.begin(), d_keys.begin() + num_keys/2); + map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + num_keys/2, + d_keys_exist.begin() + num_keys / 2, [] __device__(const bool key_found) { return key_found; })); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys/2, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); - + REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2, + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + // clear map - map.erase(d_keys.begin()+num_keys/2, d_keys.end()); - + map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); + // ************************************************* // second, check multiple submaps case works properly // ************************************************* - + thrust::device_vector d_keys2(4 * num_keys); thrust::device_vector d_values2(4 * num_keys); thrust::device_vector d_keys_exist2(4 * num_keys); - + thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1); thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1); - + auto pairs_begin2 = thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin())); - map.insert(pairs_begin2, pairs_begin2 + 4*num_keys); - + map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys); + // map should resize twice if the erased slots are successfully reused - REQUIRE(map.get_capacity() == 8*num_keys); + REQUIRE(map.get_capacity() == 8 * num_keys); // check that keys can be successfully deleted from only the first and second 
submaps - map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys); + map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys); map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); - + REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), - d_keys_exist2.begin() + 2*num_keys, + d_keys_exist2.begin() + 2 * num_keys, [] __device__(const bool key_found) { return key_found; })); - REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2*num_keys, - d_keys_exist2.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys, + d_keys_exist2.end(), + [] __device__(const bool key_found) { return key_found; })); - REQUIRE(map.get_size() == 2*num_keys); + REQUIRE(map.get_size() == 2 * num_keys); - // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases) + // check that keys can be successfully deleted from all submaps (some will be unsuccessful + // erases) map.erase(d_keys2.begin(), d_keys2.end()); - + map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); - + REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), d_keys_exist2.end(), [] __device__(const bool key_found) { return key_found; })); From b00fcba685e924b102058abac785a5f2fde74157 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 6 Apr 2022 22:54:24 -0700 Subject: [PATCH 012/152] shared mem atomics to keep track of per-submap erases --- benchmarks/hash_table/dynamic_map_bench.cu | 12 +- include/cuco/detail/dynamic_map.inl | 67 +++++------ include/cuco/detail/dynamic_map_kernels.cuh | 122 ++++++++++++++++---- include/cuco/dynamic_map.cuh | 8 +- tests/dynamic_map/erase_test.cu | 12 +- 5 files changed, 141 insertions(+), 80 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 222699abb..e150c02be 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -269,12 
+269,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +*/ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +/* BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -309,22 +309,22 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +*/ BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - +/* BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -*/ + BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -/* + BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 8be714c3d..c97622433 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -39,8 +39,6 @@ dynamic_map::dynamic_map( alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } @@ -70,7 +68,7 @@ dynamic_map::dynamic_map( submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); 
submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - + d_submap_num_successes_ = submap_num_successes_; num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } @@ -102,6 +100,8 @@ void dynamic_map::reserve(std::size_t n) sentinel::empty_value{empty_value_sentinel_}, sentinel::erased_key{erased_key_sentinel_}, alloc_)); + submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); + d_submap_num_successes_ = submap_num_successes_; } else { submaps_.push_back(std::make_unique>( submap_capacity, @@ -111,8 +111,6 @@ void dynamic_map::reserve(std::size_t n) } submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); - capacity_ *= 2; } @@ -164,7 +162,6 @@ void dynamic_map::insert(InputIt first, std::size_t h_num_successes; CUCO_CUDA_TRY(cudaMemcpy( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); - submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; @@ -193,46 +190,42 @@ void dynamic_map::erase(InputIt first, CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); // zero out submap success counters - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - for (int i = 0; i < submaps_.size(); ++i) { - CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); + if(submaps_.size() > 1) { + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + for(int i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); + } } - - // TODO: hacky, improve this - // provide device-accessible vector for each submap num_successes variable - thrust::device_vector d_submap_num_successes(submap_num_successes_); - - // TODO: hack (how to get size on host?) 
- // use dynamic shared memory to hold block reduce space for each submap's erases - constexpr size_t temp_storage_size_one_block = 48; - auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block; + + auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); detail::erase> - <<>>(first, - first + num_keys, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - num_successes_, - d_submap_num_successes.data().get(), - submaps_.size(), - hash, - key_equal); + <<>>( + first, + first + num_keys, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + num_successes_, + d_submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); // update total dynamic map size std::size_t h_num_successes; CUCO_CUDA_TRY( cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); size_ -= h_num_successes; - - // TODO: if only one submap, skip this step - // update each submap's size - for (int i = 0; i < submaps_.size(); ++i) { - std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, - submap_num_successes_[i], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost)); - submaps_[i]->size_ -= h_submap_num_successes; + + if(submaps_.size() == 1) { + submaps_[0]->size_ -= h_num_successes; + } else { + for(int i = 0; i < submaps_.size(); ++i) { + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpy( + &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + submaps_[i]->size_ -= h_submap_num_successes; + } } } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 61f32bda7..fbc7f9e35 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -187,7 +187,6 @@ __global__ void insert(InputIt first, } template BlockReduce; - extern __shared__ typename BlockReduce::TempStorage 
temp_submap_storage[]; __shared__ typename BlockReduce::TempStorage temp_storage; + extern __shared__ unsigned long long submap_block_num_successes[]; std::size_t thread_num_successes = 0; - // TODO: find permanent solution (only works for four submaps) - std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0}; + auto tid = block_size * blockIdx.x + threadIdx.x; + auto it = first + tid; + + if(num_submaps > 1) { + for(int i = threadIdx.x; i < num_submaps; i += block_size) + submap_block_num_successes[i] = 0; + __syncthreads(); + + while (it < last) { + int i; + for (i = 0; i < num_submaps; ++i) { + if (submap_mutable_views[i].erase(*it, hash, key_equal)) { + thread_num_successes++; + atomicAdd(&submap_block_num_successes[i], 1); + break; + } + } + it += gridDim.x * blockDim.x; + } + } else { + while (it < last) { + if(submap_mutable_views[0].erase(*it, hash, key_equal)) + thread_num_successes++; + it += gridDim.x * blockDim.x; + } + } + + std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } + + if(num_submaps > 1) { + for(int i = 0; i < num_submaps; ++i) { + if(threadIdx.x == 0) { + submap_num_successes[i]->fetch_add( + static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); + } + } + } +} + +template +__global__ void erase(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, + atomicT* num_successes, + atomicT** submap_num_successes, + const uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + extern __shared__ unsigned long long submap_block_num_successes[]; + + std::size_t thread_num_successes = 0; auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; - while 
(it < last) { - auto erased = false; - - // manually check for duplicates in those submaps we are not inserting into - int i; - for (i = 0; i < num_submaps; ++i) { - erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); - if (erased) { break; } - } - if (erased && tile.thread_rank() == 0) { - thread_num_successes++; - submap_thread_num_successes[i]++; + if(num_submaps > 1) { + for(int i = threadIdx.x; i < num_submaps; i += block_size) + submap_block_num_successes[i] = 0; + __syncthreads(); + + while (it < last) { + auto erased = false; + int i; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); + if (erased) { break; } + } + if (erased && tile.thread_rank() == 0) { + thread_num_successes++; + atomicAdd(&submap_block_num_successes[i], 1); + } + it += (gridDim.x * blockDim.x) / tile_size; } + } else { + while (it < last) { + auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal); + if (erased && tile.thread_rank() == 0) + thread_num_successes++; - it += (gridDim.x * blockDim.x) / tile_size; + it += (gridDim.x * blockDim.x) / tile_size; + } } std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); @@ -240,14 +314,12 @@ __global__ void erase(InputIt first, num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); } - // TODO: if there's only one submap, skip this step - // update submap thread counts - for (int i = 0; i < num_submaps; ++i) { - std::size_t submap_block_num_successes = - BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]); - if (threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(submap_block_num_successes, - cuda::std::memory_order_relaxed); + if(num_submaps > 1) { + for(int i = 0; i < num_submaps; ++i) { + if(threadIdx.x == 0) { + submap_num_successes[i]->fetch_add( + static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); + } } } } diff --git 
a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 35eb4898f..bb197f6dc 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -19,11 +19,11 @@ #include #include -#include #include #include #include #include +#include #include #include @@ -287,9 +287,9 @@ class dynamic_map { submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert - std::vector - submap_num_successes_; ///< number of succesfully erased keys for each submap - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap + thrust::device_vector d_submap_num_successes_; + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; } // namespace cuco diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 64e4cce38..e5753b544 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -107,11 +107,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys); // map should resize twice if the erased slots are successfully reused - REQUIRE(map.get_capacity() == 8 * num_keys); - + REQUIRE(map.get_capacity() == 8*num_keys); // check that keys can be successfully deleted from only the first and second submaps - map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys); - + map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys); map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), @@ -122,10 +120,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), 
(int32_t)) d_keys_exist2.end(), [] __device__(const bool key_found) { return key_found; })); - REQUIRE(map.get_size() == 2 * num_keys); - - // check that keys can be successfully deleted from all submaps (some will be unsuccessful - // erases) + REQUIRE(map.get_size() == 2*num_keys); + // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases) map.erase(d_keys2.begin(), d_keys2.end()); map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); From c146f9d92e0988b76f442a4d7abd4354a5140aaa Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 29 Apr 2022 17:01:25 -0700 Subject: [PATCH 013/152] doc improvements --- benchmarks/hash_table/dynamic_map_bench.cu | 82 +-------------------- include/cuco/detail/dynamic_map.inl | 1 - include/cuco/detail/dynamic_map_kernels.cuh | 55 +++++++++++++- include/cuco/detail/static_map_kernels.cuh | 41 +++++++++++ include/cuco/dynamic_map.cuh | 61 ++++++++++++++- 5 files changed, 152 insertions(+), 88 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index e150c02be..3a846b23d 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -259,7 +259,7 @@ static void BM_dynamic_erase_none(::benchmark::State& state) state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * int64_t(state.range(0))); } -/* + BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -269,84 +269,8 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -*/ -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); -/* -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) - 
->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); -*/ -BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); -/* -BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - 
->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) - ->UseManualTime(); -*/ \ No newline at end of file + ->UseManualTime(); \ No newline at end of file diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index c97622433..c648c7029 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -203,7 +203,6 @@ void dynamic_map::erase(InputIt first, <<>>( first, first + num_keys, - submap_views_.data().get(), submap_mutable_views_.data().get(), num_successes_, d_submap_num_successes_.data().get(), diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index fbc7f9e35..6614cfe28 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -186,17 +186,40 @@ __global__ void insert(InputIt first, if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. 
+ * @tparam block_size The size of the thread block + * @tparam pair_type Type of the pairs contained in the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase(InputIt first, InputIt last, - viewT* submap_views, mutableViewT* submap_mutable_views, atomicT* num_successes, atomicT** submap_num_successes, @@ -252,18 +275,42 @@ __global__ void erase(InputIt first, } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. 
+ * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam pair_type Type of the pairs contained in the map + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase(InputIt first, InputIt last, - viewT* submap_views, mutableViewT* submap_mutable_views, atomicT* num_successes, atomicT** submap_num_successes, diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 7a3ca0dfa..2ebcd4c91 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -163,6 +163,26 @@ __global__ void insert( if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. 
+ * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}); - + + /** + * @brief Construct a dynamically-sized map with erase capability. + * + * The capacity of the map will automatically increase as the user adds key/value pairs using + * `insert`. + * + * Capacity increases by a factor of growth_factor each time the size of the map exceeds a + * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the + * map's capacity grows. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. 
+ * + * @param initial_capacity The initial number of slots in the map + * @param growth_factor The factor by which the capacity increases when resizing + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param erased_key_sentinel The reserved key value for erased slots + * @param alloc Allocator used to allocate submap device storage + * + * @throw std::runtime error if the empty key sentinel and erased key sentinel + * are the same value + */ dynamic_map(std::size_t initial_capacity, sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, @@ -188,7 +212,36 @@ class dynamic_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); - + + /** + * @brief Erases keys in the range `[first, last)`. + * + * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's + * associated value from the map. Else, no effect. + * + * Side-effects: + * - `contains(k) == false` + * - `find(k) == end()` + * - `insert({k,v}) == true` + * - `get_size()` is reduced by the total number of erased keys + * + * This function synchronizes `stream`. + * + * Keep in mind that `erase` does not cause the map to shrink its memory allocation. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels + * + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + */ template , typename KeyEqual = thrust::equal_to> @@ -273,7 +326,7 @@ class dynamic_map { private: key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot - key_type erased_key_sentinel_{}; + key_type erased_key_sentinel_{}; ///< Key value that represents an erased slot // TODO: initialize this std::size_t size_{}; ///< Number of keys in the map @@ -288,7 +341,7 @@ class dynamic_map { std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap - thrust::device_vector d_submap_num_successes_; + thrust::device_vector d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; From faf82240f287bd382268a985a4872295e96b5b4f Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 31 Aug 2022 11:37:26 -0700 Subject: [PATCH 014/152] warning fixes --- benchmarks/hash_table/dynamic_map_bench.cu | 6 +++--- include/cuco/detail/dynamic_map.inl | 4 ++-- tests/dynamic_map/erase_test.cu | 2 ++ 3 files 
changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 3a846b23d..079018005 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -187,7 +187,7 @@ static void BM_dynamic_erase_all(::benchmark::State& state) generate_keys(h_keys.begin(), h_keys.end()); - for (auto i = 0; i < num_keys; ++i) { + for (uint32_t i = 0; i < num_keys; ++i) { Key key = h_keys[i]; Value val = h_keys[i]; h_pairs[i].first = key; @@ -203,12 +203,12 @@ static void BM_dynamic_erase_all(::benchmark::State& state) cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}, cuco::sentinel::erased_key{-2}}; - for (auto i = 0; i < num_keys; i += batch_size) { + for (uint32_t i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); } { cuda_event_timer raii{state}; - for (auto i = 0; i < num_keys; i += batch_size) { + for (uint32_t i = 0; i < num_keys; i += batch_size) { map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); } } diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index c648c7029..97a628d15 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -192,7 +192,7 @@ void dynamic_map::erase(InputIt first, // zero out submap success counters if(submaps_.size() > 1) { static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - for(int i = 0; i < submaps_.size(); ++i) { + for(uint32_t i = 0; i < submaps_.size(); ++i) { CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } } @@ -219,7 +219,7 @@ void dynamic_map::erase(InputIt first, if(submaps_.size() == 1) { submaps_[0]->size_ -= h_num_successes; } else { - for(int i = 0; i < submaps_.size(); ++i) { + for(uint32_t i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; CUCO_CUDA_TRY(cudaMemcpy( &h_submap_num_successes, 
submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index e5753b544..2254523c7 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -16,6 +16,8 @@ #include #include +#include +#include #include From e4b548e954ec60f7bee1a849e9a935733ca6584b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Aug 2022 18:38:09 +0000 Subject: [PATCH 015/152] [pre-commit.ci] auto code formatting --- benchmarks/hash_table/static_map_bench.cu | 1 - include/cuco/detail/dynamic_map.inl | 35 +++++++------ include/cuco/detail/dynamic_map_kernels.cuh | 58 ++++++++++----------- include/cuco/dynamic_map.cuh | 17 +++--- tests/dynamic_map/erase_test.cu | 11 ++-- tests/dynamic_map/unique_sequence_test.cu | 5 +- 6 files changed, 63 insertions(+), 64 deletions(-) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 1e69c0c4e..ce1015b8d 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -346,4 +346,3 @@ BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) ->UseManualTime(); - diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 97a628d15..0e0020e97 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -190,39 +190,40 @@ void dynamic_map::erase(InputIt first, CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); // zero out submap success counters - if(submaps_.size() > 1) { + if (submaps_.size() > 1) { static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - for(uint32_t i = 0; i < submaps_.size(); ++i) { + for (uint32_t i = 0; i < submaps_.size(); ++i) { 
CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } } - + auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); detail::erase> - <<>>( - first, - first + num_keys, - submap_mutable_views_.data().get(), - num_successes_, - d_submap_num_successes_.data().get(), - submaps_.size(), - hash, - key_equal); + <<>>(first, + first + num_keys, + submap_mutable_views_.data().get(), + num_successes_, + d_submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); // update total dynamic map size std::size_t h_num_successes; CUCO_CUDA_TRY( cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); size_ -= h_num_successes; - - if(submaps_.size() == 1) { + + if (submaps_.size() == 1) { submaps_[0]->size_ -= h_num_successes; } else { - for(uint32_t i = 0; i < submaps_.size(); ++i) { + for (uint32_t i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpy( - &h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost)); submaps_[i]->size_ -= h_submap_num_successes; } } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 6614cfe28..913149021 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -233,14 +233,14 @@ __global__ void erase(InputIt first, std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + auto tid = block_size * blockIdx.x + threadIdx.x; + auto it = first + tid; - if(num_submaps > 1) { - for(int i = threadIdx.x; i < num_submaps; i += block_size) + if (num_submaps > 1) { + for (int i = threadIdx.x; i < num_submaps; i += block_size) submap_block_num_successes[i] = 0; 
__syncthreads(); - + while (it < last) { int i; for (i = 0; i < num_submaps; ++i) { @@ -254,8 +254,7 @@ __global__ void erase(InputIt first, } } else { while (it < last) { - if(submap_mutable_views[0].erase(*it, hash, key_equal)) - thread_num_successes++; + if (submap_mutable_views[0].erase(*it, hash, key_equal)) thread_num_successes++; it += gridDim.x * blockDim.x; } } @@ -265,11 +264,11 @@ __global__ void erase(InputIt first, num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); } - if(num_submaps > 1) { - for(int i = 0; i < num_submaps; ++i) { - if(threadIdx.x == 0) { - submap_num_successes[i]->fetch_add( - static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); + if (num_submaps > 1) { + for (int i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); } } } @@ -310,13 +309,13 @@ template __global__ void erase(InputIt first, - InputIt last, - mutableViewT* submap_mutable_views, - atomicT* num_successes, - atomicT** submap_num_successes, - const uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) + InputIt last, + mutableViewT* submap_mutable_views, + atomicT* num_successes, + atomicT** submap_num_successes, + const uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -328,13 +327,13 @@ __global__ void erase(InputIt first, auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; - if(num_submaps > 1) { - for(int i = threadIdx.x; i < num_submaps; i += block_size) + if (num_submaps > 1) { + for (int i = threadIdx.x; i < num_submaps; i += block_size) submap_block_num_successes[i] = 0; __syncthreads(); - + while (it < last) { - auto erased = false; + auto erased = false; int i; for (i = 0; i < num_submaps; ++i) { erased = submap_mutable_views[i].erase(tile, *it, hash, 
key_equal); @@ -349,8 +348,7 @@ __global__ void erase(InputIt first, } else { while (it < last) { auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal); - if (erased && tile.thread_rank() == 0) - thread_num_successes++; + if (erased && tile.thread_rank() == 0) thread_num_successes++; it += (gridDim.x * blockDim.x) / tile_size; } @@ -361,11 +359,11 @@ __global__ void erase(InputIt first, num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); } - if(num_submaps > 1) { - for(int i = 0; i < num_submaps; ++i) { - if(threadIdx.x == 0) { - submap_num_successes[i]->fetch_add( - static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); + if (num_submaps > 1) { + for (int i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); } } } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 0efd87f4b..f34eb3d86 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -16,14 +16,13 @@ #pragma once - #include #include +#include #include #include #include #include -#include #include #include @@ -110,7 +109,7 @@ class dynamic_map { using mutable_view_type = typename static_map::device_mutable_view; using counter_allocator_type = typename std::allocator_traits::rebind_alloc; - + dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; @@ -147,7 +146,7 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}); - + /** * @brief Construct a dynamically-sized map with erase capability. 
* @@ -212,7 +211,7 @@ class dynamic_map { typename Hash = cuco::detail::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); - + /** * @brief Erases keys in the range `[first, last)`. * @@ -340,9 +339,11 @@ class dynamic_map { submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert - std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap - thrust::device_vector d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + std::vector + submap_num_successes_; ///< number of succesfully erased keys for each submap + thrust::device_vector + d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; } // namespace cuco diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 2254523c7..1c81f400a 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -16,8 +16,8 @@ #include #include -#include #include +#include #include @@ -109,9 +109,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys); // map should resize twice if the erased slots are successfully reused - REQUIRE(map.get_capacity() == 8*num_keys); + REQUIRE(map.get_capacity() == 8 * num_keys); // check that keys can be successfully deleted from only the first and second submaps - map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys); + 
map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys); map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), @@ -122,8 +122,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) d_keys_exist2.end(), [] __device__(const bool key_found) { return key_found; })); - REQUIRE(map.get_size() == 2*num_keys); - // check that keys can be successfully deleted from all submaps (some will be unsuccessful erases) + REQUIRE(map.get_size() == 2 * num_keys); + // check that keys can be successfully deleted from all submaps (some will be unsuccessful + // erases) map.erase(d_keys2.begin(), d_keys2.end()); map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index 24a2041aa..fea8de53d 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -39,9 +39,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { constexpr std::size_t num_keys{50'000'000}; - cuco::dynamic_map map{30'000'000, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}}; + cuco::dynamic_map map{ + 30'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); From 93b79837db16cfb5ff85d3552db260bae78273bf Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Wed, 31 Aug 2022 13:13:20 -0700 Subject: [PATCH 016/152] removed nvtx file --- include/cuco/detail/nvtx3.hpp | 2071 --------------------------------- 1 file changed, 2071 deletions(-) delete mode 100644 include/cuco/detail/nvtx3.hpp diff --git a/include/cuco/detail/nvtx3.hpp b/include/cuco/detail/nvtx3.hpp deleted file mode 100644 index 075c6e5d4..000000000 --- a/include/cuco/detail/nvtx3.hpp +++ /dev/null @@ -1,2071 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#if defined(NVTX3_MINOR_VERSION) and NVTX3_MINOR_VERSION < 0 -#error \ - "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead." -#endif - -/** - * @brief Semantic minor version number. - * - * Major version number is hardcoded into the "nvtx3" namespace/prefix. - * - * If this value is incremented, the above version include guard needs to be - * updated. - * - */ -#define NVTX3_MINOR_VERSION 0 - -#include - -#include - -/** - * @file nvtx3.hpp - * - * @brief Provides C++ constructs making the NVTX library safer and easier to - * use with zero overhead. - */ - -/** - * \mainpage - * \tableofcontents - * - * \section QUICK_START Quick Start - * - * To add NVTX ranges to your code, use the `nvtx3::thread_range` RAII object. A - * range begins when the object is created, and ends when the object is - * destroyed. 
- * - * \code{.cpp} - * #include "nvtx3.hpp" - * void some_function(){ - * // Begins a NVTX range with the messsage "some_function" - * // The range ends when some_function() returns and `r` is destroyed - * nvtx3::thread_range r{"some_function"}; - * - * for(int i = 0; i < 6; ++i){ - * nvtx3::thread_range loop{"loop range"}; - * std::this_thread::sleep_for(std::chrono::seconds{1}); - * } - * } // Range ends when `r` is destroyed - * \endcode - * - * The example code above generates the following timeline view in Nsight - * Systems: - * - * \image html - * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png - * - * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add - * ranges to your code that automatically use the name of the enclosing function - * as the range's message. - * - * \code{.cpp} - * #include "nvtx3.hpp" - * void some_function(){ - * // Creates a range with a message "some_function" that ends when the - * enclosing - * // function returns - * NVTX3_FUNC_RANGE(); - * ... - * } - * \endcode - * - * - * \section Overview - * - * The NVTX library provides a set of functions for users to annotate their code - * to aid in performance profiling and optimization. These annotations provide - * information to tools like Nsight Systems to improve visualization of - * application timelines. - * - * \ref RANGES are one of the most commonly used NVTX constructs for annotating - * a span of time. For example, imagine a user wanted to see every time a - * function, `my_function`, is called and how long it takes to execute. 
This can - * be accomplished with an NVTX range created on the entry to the function and - * terminated on return from `my_function` using the push/pop C APIs: - * - * ``` - * void my_function(...){ - * nvtxRangePushA("my_function"); // Begins NVTX range - * // do work - * nvtxRangePop(); // Ends NVTX range - * } - * ``` - * - * One of the challenges with using the NVTX C API is that it requires manually - * terminating the end of the range with `nvtxRangePop`. This can be challenging - * if `my_function()` has multiple returns or can throw exceptions as it - * requires calling `nvtxRangePop()` before all possible return points. - * - * NVTX++ solves this inconvenience through the "RAII" technique by providing a - * `nvtx3::thread_range` class that begins a range at construction and ends the - * range on destruction. The above example then becomes: - * - * ``` - * void my_function(...){ - * nvtx3::thread_range r{"my_function"}; // Begins NVTX range - * // do work - * } // Range ends on exit from `my_function` when `r` is destroyed - * ``` - * - * The range object `r` is deterministically destroyed whenever `my_function` - * returns---ending the NVTX range without manual intervention. For more - * information, see \ref RANGES and `nvtx3::domain_thread_range`. - * - * Another inconvenience of the NVTX C APIs are the several constructs where the - * user is expected to initialize an object at the beginning of an application - * and reuse that object throughout the lifetime of the application. For example - * Domains, Categories, and Registered messages. - * - * Example: - * ``` - * nvtxDomainHandle_t D = nvtxDomainCreateA("my domain"); - * // Reuse `D` throughout the rest of the application - * ``` - * - * This can be problematic if the user application or library does not have an - * explicit initialization function called before all other functions to - * ensure that these long-lived objects are initialized before being used. 
- * - * NVTX++ makes use of the "construct on first use" technique to alleviate this - * inconvenience. In short, a function local static object is constructed upon - * the first invocation of a function and returns a reference to that object on - * all future invocations. See the documentation for - * `nvtx3::registered_message`, `nvtx3::domain`, `nvtx3::named_category`, and - * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use for more - * information. - * - * Using construct on first use, the above example becomes: - * ``` - * struct my_domain{ static constexpr char const* name{"my domain"}; }; - * - * // The first invocation of `domain::get` for the type `my_domain` will - * // construct a `nvtx3::domain` object and return a reference to it. Future - * // invocations simply return a reference. - * nvtx3::domain const& D = nvtx3::domain::get(); - * ``` - * For more information about NVTX and how it can be used, see - * https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx and - * https://devblogs.nvidia.com/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ - * for more information. - * - * \section RANGES Ranges - * - * Ranges are used to describe a span of time during the execution of an - * application. Common examples are using ranges to annotate the time it takes - * to execute a function or an iteration of a loop. - * - * NVTX++ uses RAII to automate the generation of ranges that are tied to the - * lifetime of objects. Similar to `std::lock_guard` in the C++ Standard - * Template Library. - * - * \subsection THREAD_RANGE Thread Range - * - * `nvtx3::domain_thread_range` is a class that begins a range upon construction - * and ends the range at destruction. This is one of the most commonly used - * constructs in NVTX++ and is useful for annotating spans of time on a - * particular thread. These ranges can be nested to arbitrary depths. 
- * - * `nvtx3::thread_range` is an alias for a `nvtx3::domain_thread_range` in the - * global NVTX domain. For more information about Domains, see \ref DOMAINS. - * - * Various attributes of a range can be configured constructing a - * `nvtx3::domain_thread_range` with a `nvtx3::event_attributes` object. For - * more information, see \ref ATTRIBUTES. - * - * Example: - * - * \code{.cpp} - * void some_function(){ - * // Creates a range for the duration of `some_function` - * nvtx3::thread_range r{}; - * - * while(true){ - * // Creates a range for every loop iteration - * // `loop_range` is nested inside `r` - * nvtx3::thread_range loop_range{}; - * } - * } - * \endcode - * - * \subsection PROCESS_RANGE Process Range - * - * `nvtx3::domain_process_range` is identical to `nvtx3::domain_thread_range` - * with the exception that a `domain_process_range` can be created and destroyed - * on different threads. This is useful to annotate spans of time that can - * bridge multiple threads. - * - * `nvtx3::domain_thread_range`s should be preferred unless one needs the - * ability to begin and end a range on different threads. - * - * \section MARKS Marks - * - * `nvtx3::mark` allows annotating an instantaneous event in an application's - * timeline. For example, indicating when a mutex is locked or unlocked. - * - * \code{.cpp} - * std::mutex global_lock; - * void lock_mutex(){ - * global_lock.lock(); - * // Marks an event immediately after the mutex is locked - * nvtx3::mark("lock_mutex"); - * } - * \endcode - * - * \section DOMAINS Domains - * - * Similar to C++ namespaces, Domains allow for scoping NVTX events. By default, - * all NVTX events belong to the "global" domain. Libraries and applications - * should scope their events to use a custom domain to differentiate where the - * events originate from. - * - * It is common for a library or application to have only a single domain and - * for the name of that domain to be known at compile time. 
Therefore, Domains - * in NVTX++ are represented by _tag types_. - * - * For example, to define a custom domain, simply define a new concrete type - * (a `class` or `struct`) with a `static` member called `name` that contains - * the desired name of the domain. - * - * ``` - * struct my_domain{ static constexpr char const* name{"my domain"}; }; - * ``` - * - * For any NVTX++ construct that can be scoped to a domain, the type `my_domain` - * can be passed as an explicit template argument to scope it to the custom - * domain. - * - * The tag type `nvtx3::domain::global` represents the global NVTX domain. - * - * \code{.cpp} - * // By default, `domain_thread_range` belongs to the global domain - * nvtx3::domain_thread_range<> r0{}; - * - * // Alias for a `domain_thread_range` in the global domain - * nvtx3::thread_range r1{}; - * - * // `r` belongs to the custom domain - * nvtx3::domain_thread_range r{}; - * \endcode - * - * When using a custom domain, it is reccomended to define type aliases for NVTX - * constructs in the custom domain. - * ``` - * using my_thread_range = nvtx3::domain_thread_range; - * using my_registered_message = nvtx3::registered_message; - * using my_named_category = nvtx3::named_category; - * ``` - * - * See `nvtx3::domain` for more information. - * - * \section ATTRIBUTES Event Attributes - * - * NVTX events can be customized with various attributes to provide additional - * information (such as a custom message) or to control visualization of the - * event (such as the color used). These attributes can be specified per-event - * via arguments to a `nvtx3::event_attributes` object. - * - * NVTX events can be customized via four "attributes": - * - \ref COLOR : color used to visualize the event in tools. - * - \ref MESSAGES : Custom message string. - * - \ref PAYLOAD : User-defined numerical value. - * - \ref CATEGORY : Intra-domain grouping. 
- * - * It is possible to construct a `nvtx3::event_attributes` from any number of - * attribute objects (nvtx3::color, nvtx3::message, nvtx3::payload, - * nvtx3::category) in any order. If an attribute is not specified, a tool - * specific default value is used. See `nvtx3::event_attributes` for more - * information. - * - * \code{.cpp} - * // Custom color, message - * event_attributes attr{nvtx3::rgb{127, 255, 0}, - * "message"}; - * - * // Custom color, message, payload, category - * event_attributes attr{nvtx3::rgb{127, 255, 0}, - * nvtx3::payload{42}, - * "message", - * nvtx3::category{1}}; - * - * // Arguments can be in any order - * event_attributes attr{nvtx3::payload{42}, - * nvtx3::category{1}, - * "message", - * nvtx3::rgb{127, 255, 0}}; - * - * // "First wins" with multiple arguments of the same type - * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload is - * 42 \endcode - * - * \subsection MESSAGES message - * - * A `nvtx3::message` allows associating a custom message string with an NVTX - * event. - * - * Example: - * \code{.cpp} - * // Create an `event_attributes` with the custom message "my message" - * nvtx3::event_attributes attr{nvtx3::Mesage{"my message"}}; - * - * // strings and string literals implicitly assumed to be a `nvtx3::message` - * nvtx3::event_attributes attr{"my message"}; - * \endcode - * - * \subsubsection REGISTERED_MESSAGE Registered Messages - * - * Associating a `nvtx3::message` with an event requires copying the contents of - * the message every time the message is used, i.e., copying the entire message - * string. This may cause non-trivial overhead in performance sensitive code. - * - * To eliminate this overhead, NVTX allows registering a message string, - * yielding a "handle" that is inexpensive to copy that may be used in place of - * a message string. When visualizing the events, tools such as Nsight Systems - * will take care of mapping the message handle to its string. 
- * - * A message should be registered once and the handle reused throughout the rest - * of the application. This can be done by either explicitly creating static - * `nvtx3::registered_message` objects, or using the - * `nvtx3::registered_message::get` construct on first use helper (recommended). - * - * Similar to \ref DOMAINS, `nvtx3::registered_message::get` requires defining a - * custom tag type with a static `message` member whose value will be the - * contents of the registered string. - * - * Example: - * \code{.cpp} - * // Explicitly constructed, static `registered_message` - * static registered_message static_message{"my message"}; - * - * // Or use construct on first use: - * // Define a tag type with a `message` member string to register - * struct my_message{ static constexpr char const* message{ "my message" }; }; - * - * // Uses construct on first use to register the contents of - * // `my_message::message` - * nvtx3::registered_message const& msg = - * nvtx3::registered_message::get(); \endcode - * - * \subsection COLOR color - * - * Associating a `nvtx3::color` with an event allows controlling how the event - * is visualized in a tool such as Nsight Systems. This is a convenient way to - * visually differentiate among different events. - * - * \code{.cpp} - * // Define a color via rgb color values - * nvtx3::color c{nvtx3::rgb{127, 255, 0}}; - * nvtx3::event_attributes attr{c}; - * - * // rgb color values can be passed directly to an `event_attributes` - * nvtx3::event_attributes attr1{nvtx3::rgb{127,255,0}}; - * \endcode - * - * \subsection CATEGORY category - * - * A `nvtx3::category` is simply an integer id that allows for fine-grain - * grouping of NVTX events. For example, one might use separate categories for - * IO, memory allocation, compute, etc. 
- * - * \code{.cpp} - * nvtx3::event_attributes{nvtx3::category{1}}; - * \endcode - * - * \subsubsection NAMED_CATEGORIES Named Categories - * - * Associates a `name` string with a category `id` to help differentiate among - * categories. - * - * For any given category id `Id`, a `named_category{Id, "name"}` should only - * be constructed once and reused throughout an application. This can be done by - * either explicitly creating static `nvtx3::named_category` objects, or using - * the `nvtx3::named_category::get` construct on first use helper (recommended). - * - * Similar to \ref DOMAINS, `nvtx3::named_category::get` requires defining a - * custom tag type with static `name` and `id` members. - * - * \code{.cpp} - * // Explicitly constructed, static `named_category` - * static nvtx3::named_category static_category{42, "my category"}; - * - * // OR use construct on first use: - * // Define a tag type with `name` and `id` members - * struct my_category{ - * static constexpr char const* name{"my category"}; // category name - * static constexpr category::id_type id{42}; // category id - * }; - * - * // Use construct on first use to name the category id `42` - * // with name "my category" - * nvtx3::named_category const& my_category = - * named_category::get(); - * - * // Range `r` associated with category id `42` - * nvtx3::event_attributes attr{my_category}; - * \endcode - * - * \subsection PAYLOAD payload - * - * Allows associating a user-defined numerical value with an event. 
- * - * ``` - * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload - * from - * // the `int32_t` value 42 - * ``` - * - * - * \section EXAMPLE Example - * - * Putting it all together: - * \code{.cpp} - * // Define a custom domain tag type - * struct my_domain{ static constexpr char const* name{"my domain"}; }; - * - * // Define a named category tag type - * struct my_category{ - * static constexpr char const* name{"my category"}; - * static constexpr uint32_t id{42}; - * }; - * - * // Define a registered message tag type - * struct my_message{ static constexpr char const* message{"my message"}; }; - * - * // For convenience, use aliases for domain scoped objects - * using my_thread_range = nvtx3::domain_thread_range; - * using my_registered_message = nvtx3::registered_message; - * using my_named_category = nvtx3::named_category; - * - * // Default values for all attributes - * nvtx3::event_attributes attr{}; - * my_thread_range r0{attr}; - * - * // Custom (unregistered) message, and unnamed category - * nvtx3::event_attributes attr1{"message", nvtx3::category{2}}; - * my_thread_range r1{attr1}; - * - * // Alternatively, pass arguments of `event_attributes` ctor directly to - * // `my_thread_range` - * my_thread_range r2{"message", nvtx3::category{2}}; - * - * // construct on first use a registered message - * auto msg = my_registered_message::get(); - * - * // construct on first use a named category - * auto category = my_named_category::get(); - * - * // Use registered message and named category - * my_thread_range r3{msg, category, nvtx3::rgb{127, 255, 0}, - * nvtx3::payload{42}}; - * - * // Any number of arguments in any order - * my_thread_range r{nvtx3::rgb{127, 255,0}, msg}; - * - * \endcode - * \section MACROS Convenience Macros - * - * Oftentimes users want to quickly and easily add NVTX ranges to their library - * or application to aid in profiling and optimization. 
- * - * A convenient way to do this is to use the \ref NVTX3_FUNC_RANGE and - * \ref NVTX3_FUNC_RANGE_IN macros. These macros take care of constructing an - * `nvtx3::domain_thread_range` with the name of the enclosing function as the - * range's message. - * - * \code{.cpp} - * void some_function(){ - * // Automatically generates an NVTX range for the duration of the function - * // using "some_function" as the event's message. - * NVTX3_FUNC_RANGE(); - * } - * \endcode - * - */ - -/** - * @brief Enables the use of constexpr when support for C++14 relaxed constexpr - * is present. - * - * Initializing a legacy-C (i.e., no constructor) union member requires - * initializing in the constructor body. Non-empty constexpr constructors - * require C++14 relaxed constexpr. - * - */ -#if __cpp_constexpr >= 201304L -#define NVTX3_RELAXED_CONSTEXPR constexpr -#else -#define NVTX3_RELAXED_CONSTEXPR -#endif - -namespace nvtx3 { -namespace detail { - -/** - * @brief Verifies if a type `T` contains a member `T::name` of type `const - * char*` or `const wchar_t*`. - * - * @tparam T The type to verify - * @return True if `T` contains a member `T::name` of type `const char*` or - * `const wchar_t*`. - */ -template -constexpr auto has_name_member() noexcept -> decltype(T::name, bool()) -{ - return (std::is_same::type>::value or - std::is_same::type>::value); -} -} // namespace detail - -/** - * @brief `domain`s allow for grouping NVTX events into a single scope to - * differentiate them from events in other `domain`s. - * - * By default, all NVTX constructs are placed in the "global" NVTX domain. - * - * A custom `domain` may be used in order to differentiate a library's or - * application's NVTX events from other events. - * - * `domain`s are expected to be long-lived and unique to a library or - * application. As such, it is assumed a domain's name is known at compile - * time. 
Therefore, all NVTX constructs that can be associated with a domain - * require the domain to be specified via a *type* `DomainName` passed as an - * explicit template parameter. - * - * The type `domain::global` may be used to indicate that the global NVTX - * domain should be used. - * - * None of the C++ NVTX constructs require the user to manually construct a - * `domain` object. Instead, if a custom domain is desired, the user is - * expected to define a type `DomainName` that contains a member - * `DomainName::name` which resolves to either a `char const*` or `wchar_t - * const*`. The value of `DomainName::name` is used to name and uniquely - * identify the custom domain. - * - * Upon the first use of an NVTX construct associated with the type - * `DomainName`, the "construct on first use" pattern is used to construct a - * function local static `domain` object. All future NVTX constructs - * associated with `DomainType` will use a reference to the previously - * constructed `domain` object. See `domain::get`. - * - * Example: - * ``` - * // The type `my_domain` defines a `name` member used to name and identify - * the - * // `domain` object identified by `my_domain`. - * struct my_domain{ static constexpr char const* name{"my_domain"}; }; - * - * // The NVTX range `r` will be grouped with all other NVTX constructs - * // associated with `my_domain`. 
- * nvtx3::domain_thread_range r{}; - * - * // An alias can be created for a `domain_thread_range` in the custom domain - * using my_thread_range = nvtx3::domain_thread_range; - * my_thread_range my_range{}; - * - * // `domain::global` indicates that the global NVTX domain is used - * nvtx3::domain_thread_range r2{}; - * - * // For convenience, `nvtx3::thread_range` is an alias for a range in the - * // global domain - * nvtx3::thread_range r3{}; - * ``` - */ -class domain { - public: - domain(domain const&) = delete; - domain& operator=(domain const&) = delete; - domain(domain&&) = delete; - domain& operator=(domain&&) = delete; - - /** - * @brief Returns reference to an instance of a function local static - * `domain` object. - * - * Uses the "construct on first use" idiom to safely ensure the `domain` - * object is initialized exactly once upon first invocation of - * `domain::get()`. All following invocations will return a - * reference to the previously constructed `domain` object. See - * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use - * - * None of the constructs in this header require the user to directly invoke - * `domain::get`. It is automatically invoked when constructing objects like - * a `domain_thread_range` or `category`. Advanced users may wish to use - * `domain::get` for the convenience of the "construct on first use" idiom - * when using domains with their own use of the NVTX C API. - * - * This function is threadsafe as of C++11. If two or more threads call - * `domain::get` concurrently, exactly one of them is guaranteed - * to construct the `domain` object and the other(s) will receive a - * reference to the object after it is fully constructed. - * - * The domain's name is specified via the type `DomainName` pass as an - * explicit template parameter. `DomainName` is required to contain a - * member `DomainName::name` that resolves to either a `char const*` or - * `wchar_t const*`. 
The value of `DomainName::name` is used to name and - * uniquely identify the `domain`. - * - * Example: - * ``` - * // The type `my_domain` defines a `name` member used to name and identify - * // the `domain` object identified by `my_domain`. - * struct my_domain{ static constexpr char const* name{"my domain"}; }; - * - * auto D = domain::get(); // First invocation constructs a - * // `domain` with the name "my domain" - * - * auto D1 = domain::get(); // Simply returns reference to - * // previously constructed `domain`. - * ``` - * - * @tparam DomainName Type that contains a `DomainName::name` member used to - * name the `domain` object. - * @return Reference to the `domain` corresponding to the type `DomainName`. - */ - template - static domain const& get() - { - static_assert(detail::has_name_member(), - "Type used to identify a domain must contain a name member of" - "type const char* or const wchar_t*"); - static domain const d{DomainName::name}; - return d; - } - - /** - * @brief Conversion operator to `nvtxDomainHandle_t`. - * - * Allows transparently passing a domain object into an API expecting a - * native `nvtxDomainHandle_t` object. - */ - operator nvtxDomainHandle_t() const noexcept { return _domain; } - - /** - * @brief Tag type for the "global" NVTX domain. - * - * This type may be passed as a template argument to any function/class - * expecting a type to identify a domain to indicate that the global domain - * should be used. - * - * All NVTX events in the global domain across all libraries and - * applications will be grouped together. - * - */ - struct global { - }; - - private: - /** - * @brief Construct a new domain with the specified `name`. - * - * This constructor is private as it is intended that `domain` objects only - * be created through the `domain::get` function. 
- * - * @param name A unique name identifying the domain - */ - explicit domain(char const* name) noexcept : _domain{nvtxDomainCreateA(name)} {} - - /** - * @brief Construct a new domain with the specified `name`. - * - * This constructor is private as it is intended that `domain` objects only - * be created through the `domain::get` function. - * - * @param name A unique name identifying the domain - */ - explicit domain(wchar_t const* name) noexcept : _domain{nvtxDomainCreateW(name)} {} - - /** - * @brief Construct a new domain with the specified `name`. - * - * This constructor is private as it is intended that `domain` objects only - * be created through the `domain::get` function. - * - * @param name A unique name identifying the domain - */ - explicit domain(std::string const& name) noexcept : domain{name.c_str()} {} - - /** - * @brief Construct a new domain with the specified `name`. - * - * This constructor is private as it is intended that `domain` objects only - * be created through the `domain::get` function. - * - * @param name A unique name identifying the domain - */ - explicit domain(std::wstring const& name) noexcept : domain{name.c_str()} {} - - /** - * @brief Default constructor creates a `domain` representing the - * "global" NVTX domain. - * - * All events not associated with a custom `domain` are grouped in the - * "global" NVTX domain. - * - */ - domain() = default; - - /** - * @brief Destroy the domain object, unregistering and freeing all domain - * specific resources. - */ - ~domain() noexcept { nvtxDomainDestroy(_domain); } - - private: - nvtxDomainHandle_t const _domain{}; ///< The `domain`s NVTX handle -}; - -/** - * @brief Returns reference to the `domain` object that represents the global - * NVTX domain. - * - * This specialization for `domain::global` returns a default constructed, - * `domain` object for use when the "global" domain is desired. 
- * - * All NVTX events in the global domain across all libraries and applications - * will be grouped together. - * - * @return Reference to the `domain` corresponding to the global NVTX domain. - * - */ -template <> -inline domain const& domain::get() -{ - static domain const d{}; - return d; -} - -/** - * @brief Indicates the values of the red, green, blue color channels for - * a rgb color code. - * - */ -struct rgb { - /// Type used for component values - using component_type = uint8_t; - - /** - * @brief Construct a rgb with red, green, and blue channels - * specified by `red_`, `green_`, and `blue_`, respectively. - * - * Valid values are in the range `[0,255]`. - * - * @param red_ Value of the red channel - * @param green_ Value of the green channel - * @param blue_ Value of the blue channel - */ - constexpr rgb(component_type red_, component_type green_, component_type blue_) noexcept - : red{red_}, green{green_}, blue{blue_} - { - } - - component_type const red{}; ///< Red channel value - component_type const green{}; ///< Green channel value - component_type const blue{}; ///< Blue channel value -}; - -/** - * @brief Indicates the value of the alpha, red, green, and blue color - * channels for an argb color code. - * - */ -struct argb final : rgb { - /** - * @brief Construct an argb with alpha, red, green, and blue channels - * specified by `alpha_`, `red_`, `green_`, and `blue_`, respectively. - * - * Valid values are in the range `[0,255]`. 
- * - * @param alpha_ Value of the alpha channel (opacity) - * @param red_ Value of the red channel - * @param green_ Value of the green channel - * @param blue_ Value of the blue channel - * - */ - constexpr argb(component_type alpha_, - component_type red_, - component_type green_, - component_type blue_) noexcept - : rgb{red_, green_, blue_}, alpha{alpha_} - { - } - - component_type const alpha{}; ///< Alpha channel value -}; - -/** - * @brief Represents a custom color that can be associated with an NVTX event - * via it's `event_attributes`. - * - * Specifying colors for NVTX events is a convenient way to visually - * differentiate among different events in a visualization tool such as Nsight - * Systems. - * - */ -class color { - public: - /// Type used for the color's value - using value_type = uint32_t; - - /** - * @brief Constructs a `color` using the value provided by `hex_code`. - * - * `hex_code` is expected to be a 4 byte argb hex code. - * - * The most significant byte indicates the value of the alpha channel - * (opacity) (0-255) - * - * The next byte indicates the value of the red channel (0-255) - * - * The next byte indicates the value of the green channel (0-255) - * - * The least significant byte indicates the value of the blue channel - * (0-255) - * - * @param hex_code The hex code used to construct the `color` - */ - constexpr explicit color(value_type hex_code) noexcept : _value{hex_code} {} - - /** - * @brief Construct a `color` using the alpha, red, green, blue components - * in `argb`. - * - * @param argb The alpha, red, green, blue components of the desired `color` - */ - constexpr color(argb argb) noexcept - : color{from_bytes_msb_to_lsb(argb.alpha, argb.red, argb.green, argb.blue)} - { - } - - /** - * @brief Construct a `color` using the red, green, blue components in - * `rgb`. - * - * Uses maximum value for the alpha channel (opacity) of the `color`. 
- * - * @param rgb The red, green, blue components of the desired `color` - */ - constexpr color(rgb rgb) noexcept - : color{from_bytes_msb_to_lsb(0xFF, rgb.red, rgb.green, rgb.blue)} - { - } - - /** - * @brief Returns the `color`s argb hex code - * - */ - constexpr value_type get_value() const noexcept { return _value; } - - /** - * @brief Return the NVTX color type of the color. - * - */ - constexpr nvtxColorType_t get_type() const noexcept { return _type; } - - color() = delete; - ~color() = default; - color(color const&) = default; - color& operator=(color const&) = default; - color(color&&) = default; - color& operator=(color&&) = default; - - private: - /** - * @brief Constructs an unsigned, 4B integer from the component bytes in - * most to least significant byte order. - * - */ - constexpr static value_type from_bytes_msb_to_lsb(uint8_t byte3, - uint8_t byte2, - uint8_t byte1, - uint8_t byte0) noexcept - { - return uint32_t{byte3} << 24 | uint32_t{byte2} << 16 | uint32_t{byte1} << 8 | uint32_t{byte0}; - } - - value_type const _value{}; ///< color's argb color code - nvtxColorType_t const _type{NVTX_COLOR_ARGB}; ///< NVTX color type code -}; - -/** - * @brief Object for intra-domain grouping of NVTX events. - * - * A `category` is simply an integer id that allows for fine-grain grouping of - * NVTX events. For example, one might use separate categories for IO, memory - * allocation, compute, etc. - * - * Example: - * \code{.cpp} - * nvtx3::category cat1{1}; - * - * // Range `r1` belongs to the category identified by the value `1`. - * nvtx3::thread_range r1{cat1}; - * - * // Range `r2` belongs to the same category as `r1` - * nvtx3::thread_range r2{nvtx3::category{1}}; - * \endcode - * - * To associate a name string with a category id, see `named_category`. - * - */ -class category { - public: - /// Type used for `category`s integer id. - using id_type = uint32_t; - - /** - * @brief Construct a `category` with the specified `id`. 
- * - * The `category` will be unnamed and identified only by its `id` value. - * - * All `category` objects sharing the same `id` are equivalent. - * - * @param[in] id The `category`'s identifying value - */ - constexpr explicit category(id_type id) noexcept : id_{id} {} - - /** - * @brief Returns the id of the category. - * - */ - constexpr id_type get_id() const noexcept { return id_; } - - category() = delete; - ~category() = default; - category(category const&) = default; - category& operator=(category const&) = default; - category(category&&) = default; - category& operator=(category&&) = default; - - private: - id_type const id_{}; ///< category's unique identifier -}; - -/** - * @brief A `category` with an associated name string. - * - * Associates a `name` string with a category `id` to help differentiate among - * categories. - * - * For any given category id `Id`, a `named_category(Id, "name")` should only - * be constructed once and reused throughout an application. This can be done - * by either explicitly creating static `named_category` objects, or using the - * `named_category::get` construct on first use helper (recommended). - * - * Creating two or more `named_category` objects with the same value for `id` - * in the same domain results in undefined behavior. - * - * Similarly, behavior is undefined when a `named_category` and `category` - * share the same value of `id`. 
- * - * Example: - * \code{.cpp} - * // Explicitly constructed, static `named_category` - * static nvtx3::named_category static_category{42, "my category"}; - * - * // Range `r` associated with category id `42` - * nvtx3::thread_range r{static_category}; - * - * // OR use construct on first use: - * - * // Define a type with `name` and `id` members - * struct my_category{ - * static constexpr char const* name{"my category"}; // category name - * static constexpr category::id_type id{42}; // category id - * }; - * - * // Use construct on first use to name the category id `42` - * // with name "my category" - * auto my_category = named_category::get(); - * - * // Range `r` associated with category id `42` - * nvtx3::thread_range r{my_category}; - * \endcode - * - * `named_category`'s association of a name to a category id is local to the - * domain specified by the type `D`. An id may have a different name in - * another domain. - * - * @tparam D Type containing `name` member used to identify the `domain` to - * which the `named_category` belongs. Else, `domain::global` to indicate - * that the global NVTX domain should be used. - */ -template -class named_category final : public category { - public: - /** - * @brief Returns a global instance of a `named_category` as a - * function-local static. - * - * Creates a `named_category` with name and id specified by the contents of - * a type `C`. `C::name` determines the name and `C::id` determines the - * category id. - * - * This function is useful for constructing a named `category` exactly once - * and reusing the same instance throughout an application. 
- * - * Example: - * \code{.cpp} - * // Define a type with `name` and `id` members - * struct my_category{ - * static constexpr char const* name{"my category"}; // category name - * static constexpr uint32_t id{42}; // category id - * }; - * - * // Use construct on first use to name the category id `42` - * // with name "my category" - * auto cat = named_category::get(); - * - * // Range `r` associated with category id `42` - * nvtx3::thread_range r{cat}; - * \endcode - * - * Uses the "construct on first use" idiom to safely ensure the `category` - * object is initialized exactly once. See - * https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use - * - * @tparam C Type containing a member `C::name` that resolves to either a - * `char const*` or `wchar_t const*` and `C::id`. - */ - template - static named_category const& get() noexcept - { - static_assert(detail::has_name_member(), - "Type used to name a category must contain a name member."); - static named_category const category{C::id, C::name}; - return category; - } - /** - * @brief Construct a `category` with the specified `id` and `name`. - * - * The name `name` will be registered with `id`. - * - * Every unique value of `id` should only be named once. - * - * @param[in] id The category id to name - * @param[in] name The name to associated with `id` - */ - named_category(id_type id, char const* name) noexcept : category{id} - { - nvtxDomainNameCategoryA(domain::get(), get_id(), name); - }; - - /** - * @brief Construct a `category` with the specified `id` and `name`. - * - * The name `name` will be registered with `id`. - * - * Every unique value of `id` should only be named once. - * - * @param[in] id The category id to name - * @param[in] name The name to associated with `id` - */ - named_category(id_type id, wchar_t const* name) noexcept : category{id} - { - nvtxDomainNameCategoryW(domain::get(), get_id(), name); - }; -}; - -/** - * @brief A message registered with NVTX. 
- * - * Normally, associating a `message` with an NVTX event requires copying the - * contents of the message string. This may cause non-trivial overhead in - * highly performance sensitive regions of code. - * - * message registration is an optimization to lower the overhead of - * associating a message with an NVTX event. Registering a message yields a - * handle that is inexpensive to copy that may be used in place of a message - * string. - * - * A particular message should only be registered once and the handle - * reused throughout the rest of the application. This can be done by either - * explicitly creating static `registered_message` objects, or using the - * `registered_message::get` construct on first use helper (recommended). - * - * Example: - * \code{.cpp} - * // Explicitly constructed, static `registered_message` - * static registered_message static_message{"message"}; - * - * // "message" is associated with the range `r` - * nvtx3::thread_range r{static_message}; - * - * // Or use construct on first use: - * - * // Define a type with a `message` member that defines the contents of the - * // registered message - * struct my_message{ static constexpr char const* message{ "my message" }; }; - * - * // Uses construct on first use to register the contents of - * // `my_message::message` - * auto msg = registered_message::get(); - * - * // "my message" is associated with the range `r` - * nvtx3::thread_range r{msg}; - * \endcode - * - * `registered_message`s are local to a particular domain specified via - * the type `D`. - * - * @tparam D Type containing `name` member used to identify the `domain` to - * which the `registered_message` belongs. Else, `domain::global` to indicate - * that the global NVTX domain should be used. - */ -template -class registered_message { - public: - /** - * @brief Returns a global instance of a `registered_message` as a function - * local static. 
- * - * Provides a convenient way to register a message with NVTX without having - * to explicitly register the message. - * - * Upon first invocation, constructs a `registered_message` whose contents - * are specified by `message::message`. - * - * All future invocations will return a reference to the object constructed - * in the first invocation. - * - * Example: - * \code{.cpp} - * // Define a type with a `message` member that defines the contents of the - * // registered message - * struct my_message{ static constexpr char const* message{ "my message" }; - * }; - * - * // Uses construct on first use to register the contents of - * // `my_message::message` - * auto msg = registered_message::get(); - * - * // "my message" is associated with the range `r` - * nvtx3::thread_range r{msg}; - * \endcode - * - * @tparam M Type required to contain a member `M::message` that - * resolves to either a `char const*` or `wchar_t const*` used as the - * registered message's contents. - * @return Reference to a `registered_message` associated with the type `M`. - */ - template - static registered_message const& get() noexcept - { - static registered_message const registered_message{M::message}; - return registered_message; - } - - /** - * @brief Constructs a `registered_message` from the specified `msg` string. - * - * Registers `msg` with NVTX and associates a handle with the registered - * message. - * - * A particular message should should only be registered once and the handle - * reused throughout the rest of the application. - * - * @param msg The contents of the message - */ - explicit registered_message(char const* msg) noexcept - : handle_{nvtxDomainRegisterStringA(domain::get(), msg)} - { - } - - /** - * @brief Constructs a `registered_message` from the specified `msg` string. - * - * Registers `msg` with NVTX and associates a handle with the registered - * message. 
- * - * A particular message should should only be registered once and the handle - * reused throughout the rest of the application. - * - * @param msg The contents of the message - */ - explicit registered_message(std::string const& msg) noexcept : registered_message{msg.c_str()} {} - - /** - * @brief Constructs a `registered_message` from the specified `msg` string. - * - * Registers `msg` with NVTX and associates a handle with the registered - * message. - * - * A particular message should should only be registered once and the handle - * reused throughout the rest of the application. - * - * @param msg The contents of the message - */ - explicit registered_message(wchar_t const* msg) noexcept - : handle_{nvtxDomainRegisterStringW(domain::get(), msg)} - { - } - - /** - * @brief Constructs a `registered_message` from the specified `msg` string. - * - * Registers `msg` with NVTX and associates a handle with the registered - * message. - * - * A particular message should only be registered once and the handle - * reused throughout the rest of the application. - * - * @param msg The contents of the message - */ - explicit registered_message(std::wstring const& msg) noexcept : registered_message{msg.c_str()} {} - - /** - * @brief Returns the registered message's handle - * - */ - nvtxStringHandle_t get_handle() const noexcept { return handle_; } - - registered_message() = delete; - ~registered_message() = default; - registered_message(registered_message const&) = default; - registered_message& operator=(registered_message const&) = default; - registered_message(registered_message&&) = default; - registered_message& operator=(registered_message&&) = default; - - private: - nvtxStringHandle_t const handle_{}; ///< The handle returned from - ///< registering the message with NVTX -}; - -/** - * @brief Allows associating a message string with an NVTX event via - * its `EventAttribute`s. 
- * - * Associating a `message` with an NVTX event through its `event_attributes` - * allows for naming events to easily differentiate them from other events. - * - * Every time an NVTX event is created with an associated `message`, the - * contents of the message string must be copied. This may cause non-trivial - * overhead in highly performance sensitive sections of code. Use of a - * `nvtx3::registered_message` is recommended in these situations. - * - * Example: - * \code{.cpp} - * // Creates an `event_attributes` with message "message 0" - * nvtx3::event_attributes attr0{nvtx3::message{"message 0"}}; - * - * // `range0` contains message "message 0" - * nvtx3::thread_range range0{attr0}; - * - * // `std::string` and string literals are implicitly assumed to be - * // the contents of an `nvtx3::message` - * // Creates an `event_attributes` with message "message 1" - * nvtx3::event_attributes attr1{"message 1"}; - * - * // `range1` contains message "message 1" - * nvtx3::thread_range range1{attr1}; - * - * // `range2` contains message "message 2" - * nvtx3::thread_range range2{nvtx3::Mesage{"message 2"}}; - * - * // `std::string` and string literals are implicitly assumed to be - * // the contents of an `nvtx3::message` - * // `range3` contains message "message 3" - * nvtx3::thread_range range3{"message 3"}; - * \endcode - */ -class message { - public: - using value_type = nvtxMessageValue_t; - - /** - * @brief Construct a `message` whose contents are specified by `msg`. - * - * @param msg The contents of the message - */ - NVTX3_RELAXED_CONSTEXPR message(char const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_ASCII} - { - value_.ascii = msg; - } - - /** - * @brief Construct a `message` whose contents are specified by `msg`. 
- * - * @param msg The contents of the message - */ - message(std::string const& msg) noexcept : message{msg.c_str()} {} - - /** - * @brief Disallow construction for `std::string` r-value - * - * `message` is a non-owning type and therefore cannot take ownership of an - * r-value. Therefore, constructing from an r-value is disallowed to prevent - * a dangling pointer. - * - */ - message(std::string&&) = delete; - - /** - * @brief Construct a `message` whose contents are specified by `msg`. - * - * @param msg The contents of the message - */ - NVTX3_RELAXED_CONSTEXPR message(wchar_t const* msg) noexcept : type_{NVTX_MESSAGE_TYPE_UNICODE} - { - value_.unicode = msg; - } - - /** - * @brief Construct a `message` whose contents are specified by `msg`. - * - * @param msg The contents of the message - */ - message(std::wstring const& msg) noexcept : message{msg.c_str()} {} - - /** - * @brief Disallow construction for `std::wstring` r-value - * - * `message` is a non-owning type and therefore cannot take ownership of an - * r-value. Therefore, constructing from an r-value is disallowed to prevent - * a dangling pointer. - * - */ - message(std::wstring&&) = delete; - - /** - * @brief Construct a `message` from a `registered_message`. - * - * @tparam D Type containing `name` member used to identify the `domain` - * to which the `registered_message` belongs. Else, `domain::global` to - * indicate that the global NVTX domain should be used. - * @param msg The message that has already been registered with NVTX. - */ - template - NVTX3_RELAXED_CONSTEXPR message(registered_message const& msg) noexcept - : type_{NVTX_MESSAGE_TYPE_REGISTERED} - { - value_.registered = msg.get_handle(); - } - - /** - * @brief Return the union holding the value of the message. - * - */ - NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; } - - /** - * @brief Return the type information about the value the union holds. 
- * - */ - NVTX3_RELAXED_CONSTEXPR nvtxMessageType_t get_type() const noexcept { return type_; } - - private: - nvtxMessageType_t const type_{}; ///< message type - nvtxMessageValue_t value_{}; ///< message contents -}; - -/** - * @brief A numerical value that can be associated with an NVTX event via - * its `event_attributes`. - * - * Example: - * ``` - * nvtx3:: event_attributes attr{nvtx3::payload{42}}; // Constructs a payload - * from - * // the `int32_t` value 42 - * - * // `range0` will have an int32_t payload of 42 - * nvtx3::thread_range range0{attr}; - * - * // range1 has double payload of 3.14 - * nvtx3::thread_range range1{ nvtx3::payload{3.14} }; - * ``` - */ -class payload { - public: - using value_type = typename nvtxEventAttributes_v2::payload_t; - - /** - * @brief Construct a `payload` from a signed, 8 byte integer. - * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(int64_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_INT64}, value_{} - { - value_.llValue = value; - } - - /** - * @brief Construct a `payload` from a signed, 4 byte integer. - * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(int32_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_INT32}, value_{} - { - value_.iValue = value; - } - - /** - * @brief Construct a `payload` from an unsigned, 8 byte integer. - * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(uint64_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT64}, value_{} - { - value_.ullValue = value; - } - - /** - * @brief Construct a `payload` from an unsigned, 4 byte integer. 
- * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(uint32_t value) noexcept - : type_{NVTX_PAYLOAD_TYPE_UNSIGNED_INT32}, value_{} - { - value_.uiValue = value; - } - - /** - * @brief Construct a `payload` from a single-precision floating point - * value. - * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(float value) noexcept - : type_{NVTX_PAYLOAD_TYPE_FLOAT}, value_{} - { - value_.fValue = value; - } - - /** - * @brief Construct a `payload` from a double-precision floating point - * value. - * - * @param value Value to use as contents of the payload - */ - NVTX3_RELAXED_CONSTEXPR explicit payload(double value) noexcept - : type_{NVTX_PAYLOAD_TYPE_DOUBLE}, value_{} - { - value_.dValue = value; - } - - /** - * @brief Return the union holding the value of the payload - * - */ - NVTX3_RELAXED_CONSTEXPR value_type get_value() const noexcept { return value_; } - - /** - * @brief Return the information about the type the union holds. - * - */ - NVTX3_RELAXED_CONSTEXPR nvtxPayloadType_t get_type() const noexcept { return type_; } - - private: - nvtxPayloadType_t const type_; ///< Type of the payload value - value_type value_; ///< Union holding the payload value -}; - -/** - * @brief Describes the attributes of a NVTX event. - * - * NVTX events can be customized via four "attributes": - * - * - color: color used to visualize the event in tools such as Nsight - * Systems. See `color`. - * - message: Custom message string. See `message`. - * - payload: User-defined numerical value. See `payload`. - * - category: Intra-domain grouping. See `category`. - * - * These component attributes are specified via an `event_attributes` object. - * See `nvtx3::color`, `nvtx3::message`, `nvtx3::payload`, and - * `nvtx3::category` for how these individual attributes are constructed. 
- * - * While it is possible to specify all four attributes, it is common to want - * to only specify a subset of attributes and use default values for the - * others. For convenience, `event_attributes` can be constructed from any - * number of attribute components in any order. - * - * Example: - * \code{.cpp} - * event_attributes attr{}; // No arguments, use defaults for all attributes - * - * event_attributes attr{"message"}; // Custom message, rest defaulted - * - * // Custom color & message - * event_attributes attr{"message", nvtx3::rgb{127, 255, 0}}; - * - * /// Custom color & message, can use any order of arguments - * event_attributes attr{nvtx3::rgb{127, 255, 0}, "message"}; - * - * - * // Custom color, message, payload, category - * event_attributes attr{nvtx3::rgb{127, 255, 0}, - * "message", - * nvtx3::payload{42}, - * nvtx3::category{1}}; - * - * // Custom color, message, payload, category, can use any order of arguments - * event_attributes attr{nvtx3::payload{42}, - * nvtx3::category{1}, - * "message", - * nvtx3::rgb{127, 255, 0}}; - * - * // Multiple arguments of the same type are allowed, but only the first is - * // used. All others are ignored - * event_attributes attr{ nvtx3::payload{42}, nvtx3::payload{7} }; // payload - * is 42 - * - * // Range `r` will be customized according the attributes in `attr` - * nvtx3::thread_range r{attr}; - * - * // For convenience, the arguments that can be passed to the - * `event_attributes` - * // constructor may be passed to the `domain_thread_range` contructor where - * // they will be forwarded to the `EventAttribute`s constructor - * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; - * \endcode - * - */ -class event_attributes { - public: - using value_type = nvtxEventAttributes_t; - - /** - * @brief Default constructor creates an `event_attributes` with no - * category, color, payload, nor message. 
- */ - constexpr event_attributes() noexcept - : attributes_{ - NVTX_VERSION, // version - sizeof(nvtxEventAttributes_t), // size - 0, // category - NVTX_COLOR_UNKNOWN, // color type - 0, // color value - NVTX_PAYLOAD_UNKNOWN, // payload type - 0, // payload value (union) - NVTX_MESSAGE_UNKNOWN, // message type - 0 // message value (union) - } - { - } - - /** - * @brief Variadic constructor where the first argument is a `category`. - * - * Sets the value of the `EventAttribute`s category based on `c` and - * forwards the remaining variadic parameter pack to the next constructor. - * - */ - template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes(category const& c, Args const&... args) noexcept - : event_attributes(args...) - { - attributes_.category = c.get_id(); - } - - /** - * @brief Variadic constructor where the first argument is a `color`. - * - * Sets the value of the `EventAttribute`s color based on `c` and forwards - * the remaining variadic parameter pack to the next constructor. - * - */ - template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes(color const& c, Args const&... args) noexcept - : event_attributes(args...) - { - attributes_.color = c.get_value(); - attributes_.colorType = c.get_type(); - } - - /** - * @brief Variadic constructor where the first argument is a `payload`. - * - * Sets the value of the `EventAttribute`s payload based on `p` and forwards - * the remaining variadic parameter pack to the next constructor. - * - */ - template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes(payload const& p, Args const&... args) noexcept - : event_attributes(args...) - { - attributes_.payload = p.get_value(); - attributes_.payloadType = p.get_type(); - } - - /** - * @brief Variadic constructor where the first argument is a `message`. - * - * Sets the value of the `EventAttribute`s message based on `m` and forwards - * the remaining variadic parameter pack to the next constructor. 
- * - */ - template - NVTX3_RELAXED_CONSTEXPR explicit event_attributes(message const& m, Args const&... args) noexcept - : event_attributes(args...) - { - attributes_.message = m.get_value(); - attributes_.messageType = m.get_type(); - } - - ~event_attributes() = default; - event_attributes(event_attributes const&) = default; - event_attributes& operator=(event_attributes const&) = default; - event_attributes(event_attributes&&) = default; - event_attributes& operator=(event_attributes&&) = default; - - /** - * @brief Get raw pointer to underlying NVTX attributes object. - * - */ - constexpr value_type const* get() const noexcept { return &attributes_; } - - private: - value_type attributes_{}; ///< The NVTX attributes structure -}; - -/** - * @brief A RAII object for creating a NVTX range local to a thread within a - * domain. - * - * When constructed, begins a nested NVTX range on the calling thread in the - * specified domain. Upon destruction, ends the NVTX range. - * - * Behavior is undefined if a `domain_thread_range` object is - * created/destroyed on different threads. - * - * `domain_thread_range` is neither moveable nor copyable. - * - * `domain_thread_range`s may be nested within other ranges. - * - * The domain of the range is specified by the template type parameter `D`. - * By default, the `domain::global` is used, which scopes the range to the - * global NVTX domain. The convenience alias `thread_range` is provided for - * ranges scoped to the global domain. - * - * A custom domain can be defined by creating a type, `D`, with a static - * member `D::name` whose value is used to name the domain associated with - * `D`. `D::name` must resolve to either `char const*` or `wchar_t const*` - * - * Example: - * ``` - * // Define a type `my_domain` with a member `name` used to name the domain - * // associated with the type `my_domain`. 
- * struct my_domain{ - * static constexpr const char * name{"my domain"}; - * }; - * ``` - * - * Usage: - * ``` - * nvtx3::domain_thread_range<> r0{"range 0"}; // Range in global domain - * - * nvtx3::thread_range r1{"range 1"}; // Alias for range in global domain - * - * nvtx3::domain_thread_range r2{"range 2"}; // Range in custom - * domain - * - * // specify an alias to a range that uses a custom domain - * using my_thread_range = nvtx3::domain_thread_range; - * - * my_thread_range r3{"range 3"}; // Alias for range in custom domain - * ``` - */ -template -class domain_thread_range { - public: - /** - * @brief Construct a `domain_thread_range` with the specified - * `event_attributes` - * - * Example: - * ``` - * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; - * nvtx3::domain_thread_range<> range{attr}; // Creates a range with message - * contents - * // "msg" and green color - * ``` - * - * @param[in] attr `event_attributes` that describes the desired attributes - * of the range. - */ - explicit domain_thread_range(event_attributes const& attr) noexcept - { - nvtxDomainRangePushEx(domain::get(), attr.get()); - } - - /** - * @brief Constructs a `domain_thread_range` from the constructor arguments - * of an `event_attributes`. - * - * Forwards the arguments `first, args...` to construct an - * `event_attributes` object. The `event_attributes` object is then - * associated with the `domain_thread_range`. - * - * For more detail, see `event_attributes` documentation. - * - * Example: - * ``` - * // Creates a range with message "message" and green color - * nvtx3::domain_thread_range<> r{"message", nvtx3::rgb{127,255,0}}; - * ``` - * - * @note To prevent making needless copies of `event_attributes` objects, - * this constructor is disabled when the first argument is an - * `event_attributes` object, instead preferring the explicit - * `domain_thread_range(event_attributes const&)` constructor. 
- * - * @param[in] first First argument to forward to the `event_attributes` - * constructor. - * @param[in] args Variadic parameter pack of additional arguments to - * forward. - * - */ - template >::value>> - explicit domain_thread_range(First const& first, Args const&... args) noexcept - : domain_thread_range{event_attributes{first, args...}} - { - } - - /** - * @brief Default constructor creates a `domain_thread_range` with no - * message, color, payload, nor category. - * - */ - domain_thread_range() : domain_thread_range{event_attributes{}} {} - - domain_thread_range(domain_thread_range const&) = delete; - domain_thread_range& operator=(domain_thread_range const&) = delete; - domain_thread_range(domain_thread_range&&) = delete; - domain_thread_range& operator=(domain_thread_range&&) = delete; - - /** - * @brief Destroy the domain_thread_range, ending the NVTX range event. - */ - ~domain_thread_range() noexcept { nvtxDomainRangePop(domain::get()); } -}; - -/** - * @brief Alias for a `domain_thread_range` in the global NVTX domain. - * - */ -using thread_range = domain_thread_range<>; - -/** - * @brief Handle used for correlating explicit range start and end events. - * - */ -struct range_handle { - /// Type used for the handle's value - using value_type = nvtxRangeId_t; - - /** - * @brief Construct a `range_handle` from the given id. - * - */ - constexpr range_handle(value_type id) noexcept : _range_id{id} {} - - /** - * @brief Returns the `range_handle`'s value - * - * @return value_type The handle's value - */ - constexpr value_type get_value() const noexcept { return _range_id; } - - private: - value_type _range_id{}; ///< The underlying NVTX range id -}; - -/** - * @brief Manually begin an NVTX range. - * - * Explicitly begins an NVTX range and returns a unique handle. To end the - * range, pass the handle to `end_range()`. - * - * `start_range/end_range` are the most explicit and lowest level APIs provided - * for creating ranges. 
Use of `nvtx3::domain_process_range` should be - * preferred unless one is unable to tie the range to the lifetime of an object. - * - * Example: - * ``` - * nvtx3::event_attributes attr{"msg", nvtx3::rgb{127,255,0}}; - * nvtx3::range_handle h = nvxt3::start_range(attr); // Manually begins a range - * ... - * nvtx3::end_range(h); // Ends the range - * ``` - * - * @tparam D Type containing `name` member used to identify the `domain` - * to which the range belongs. Else, `domain::global` to indicate that the - * global NVTX domain should be used. - * @param[in] attr `event_attributes` that describes the desired attributes - * of the range. - * @return Unique handle to be passed to `end_range` to end the range. - */ -template -range_handle start_range(event_attributes const& attr) noexcept -{ - return range_handle{nvtxDomainRangeStartEx(domain::get(), attr.get())}; -} - -/** - * @brief Manually begin an NVTX range. - * - * Explicitly begins an NVTX range and returns a unique handle. To end the - * range, pass the handle to `end_range()`. - * - * Forwards the arguments `first, args...` to construct an `event_attributes` - * object. The `event_attributes` object is then associated with the range. - * - * For more detail, see `event_attributes` documentation. - * - * Example: - * ``` - * nvtx3::range_handle h = nvxt3::start_range("msg", nvtx3::rgb{127,255,0}); // - * Begin range - * ... - * nvtx3::end_range(h); // Ends the range - * ``` - * - * `start_range/end_range` are the most explicit and lowest level APIs provided - * for creating ranges. Use of `nvtx3::domain_process_range` should be - * preferred unless one is unable to tie the range to the lifetime of an object. - * - * @param first[in] First argument to pass to an `event_attributes` - * @param args[in] Variadiac parameter pack of the rest of the arguments for an - * `event_attributes`. - * @return Unique handle to be passed to `end_range` to end the range. 
- */ -template >::value>> -range_handle start_range(First const& first, Args const&... args) noexcept -{ - return start_range(event_attributes{first, args...}); -} - -/** - * @brief Manually end the range associated with the handle `r`. - * - * Explicitly ends the NVTX range indicated by the handle `r` returned from a - * prior call to `start_range`. The range may end on a different thread from - * where it began. - * - * This function does not have a Domain tag type template parameter as the - * handle `r` already indicates the domain to which the range belongs. - * - * @param r Handle to a range started by a prior call to `start_range`. - */ -void end_range(range_handle r) { nvtxRangeEnd(r.get_value()); } - -/** - * @brief A RAII object for creating a NVTX range within a domain that can - * be created and destroyed on different threads. - * - * When constructed, begins a NVTX range in the specified domain. Upon - * destruction, ends the NVTX range. - * - * Similar to `nvtx3::domain_thread_range`, the only difference being that - * `domain_process_range` can start and end on different threads. - * - * Use of `nvtx3::domain_thread_range` should be preferred unless one needs - * the ability to start and end a range on different threads. - * - * `domain_process_range` is moveable, but not copyable. - * - * @tparam D Type containing `name` member used to identify the `domain` - * to which the `domain_process_range` belongs. Else, `domain::global` to - * indicate that the global NVTX domain should be used. - */ -template -class domain_process_range { - public: - /** - * @brief Construct a new domain process range object - * - * @param attr - */ - explicit domain_process_range(event_attributes const& attr) noexcept : handle_{start_range(attr)} - { - } - - /** - * @brief Construct a new domain process range object - * - * @param first - * @param args - */ - template >::value>> - explicit domain_process_range(First const& first, Args const&... 
args) noexcept - : domain_process_range{event_attributes{first, args...}} - { - } - - /** - * @brief Construct a new domain process range object - * - */ - constexpr domain_process_range() noexcept : domain_process_range{event_attributes{}} {} - - /** - * @brief Destroy the `domain_process_range` ending the range. - * - */ - ~domain_process_range() noexcept - { - if (not moved_from_) { end_range(handle_); } - } - - /** - * @brief Move constructor allows taking ownership of the NVTX range from - * another `domain_process_range`. - * - * @param other - */ - domain_process_range(domain_process_range&& other) noexcept : handle_{other.handle_} - { - other.moved_from_ = true; - } - - /** - * @brief Move assignment operator allows taking ownership of an NVTX range - * from another `domain_process_range`. - * - * @param other - * @return domain_process_range& - */ - domain_process_range& operator=(domain_process_range&& other) noexcept - { - handle_ = other.handle_; - other.moved_from_ = true; - } - - /// Copy construction is not allowed to prevent multiple objects from owning - /// the same range handle - domain_process_range(domain_process_range const&) = delete; - - /// Copy assignment is not allowed to prevent multiple objects from owning the - /// same range handle - domain_process_range& operator=(domain_process_range const&) = delete; - - private: - range_handle handle_; ///< Range handle used to correlate - ///< the start/end of the range - bool moved_from_{false}; ///< Indicates if the object has had - ///< it's contents moved from it, - ///< indicating it should not attempt - ///< to end the NVTX range. -}; - -/** - * @brief Alias for a `domain_process_range` in the global NVTX domain. - * - */ -using process_range = domain_process_range<>; - -/** - * @brief Annotates an instantaneous point in time with the attributes specified - * by `attr`. - * - * Unlike a "range", a mark is an instantaneous event in an application, e.g., - * locking/unlocking a mutex. 
- * - * \code{.cpp} - * std::mutex global_lock; - * void lock_mutex(){ - * global_lock.lock(); - * nvtx3::mark("lock_mutex"); - * } - * \endcode - * - * @tparam D Type containing `name` member used to identify the `domain` - * to which the `domain_process_range` belongs. Else, `domain::global` to - * indicate that the global NVTX domain should be used. - * @param[in] attr `event_attributes` that describes the desired attributes - * of the mark. - */ -template -inline void mark(event_attributes const& attr) noexcept -{ - nvtxDomainMarkEx(domain::get(), attr.get()); -} - -} // namespace nvtx3 - -/** - * @brief Convenience macro for generating a range in the specified `domain` - * from the lifetime of a function - * - * This macro is useful for generating an NVTX range in `domain` from - * the entry point of a function to its exit. It is intended to be the first - * line of the function. - * - * Constructs a static `registered_message` using the name of the immediately - * enclosing function returned by `__func__` and constructs a - * `nvtx3::thread_range` using the registered function name as the range's - * message. - * - * Example: - * ``` - * struct my_domain{static constexpr char const* name{"my_domain"};}; - * - * void foo(...){ - * NVTX3_FUNC_RANGE_IN(my_domain); // Range begins on entry to foo() - * // do stuff - * ... - * } // Range ends on return from foo() - * ``` - * - * @param[in] D Type containing `name` member used to identify the - * `domain` to which the `registered_message` belongs. Else, - * `domain::global` to indicate that the global NVTX domain should be used. - */ -#define NVTX3_FUNC_RANGE_IN(D) \ - static ::nvtx3::registered_message const nvtx3_func_name__{__func__}; \ - static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; - -/** - * @brief Convenience macro for generating a range in the global domain from the - * lifetime of a function. 
- * - * This macro is useful for generating an NVTX range in the global domain from - * the entry point of a function to its exit. It is intended to be the first - * line of the function. - * - * Constructs a static `registered_message` using the name of the immediately - * enclosing function returned by `__func__` and constructs a - * `nvtx3::thread_range` using the registered function name as the range's - * message. - * - * Example: - * ``` - * void foo(...){ - * NVTX3_FUNC_RANGE(); // Range begins on entry to foo() - * // do stuff - * ... - * } // Range ends on return from foo() - * ``` - */ -#define NVTX3_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(::nvtx3::domain::global) \ No newline at end of file From b0d96a08385cbfb0ad3028196e973d2fd52867c0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 20 Sep 2022 11:57:46 +0000 Subject: [PATCH 017/152] Fix for incorrect CUDART_VERSION scheme. --- include/cuco/detail/__config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 40eb75aa2..c76a1bbef 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -25,7 +25,7 @@ #define CUCO_HAS_CUDA_BARRIER #endif -#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11100) +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11010) #define CUCO_HAS_CG_MEMCPY_ASYNC #endif From fe942065b181e1cf37a97975b5a6e4a599455b04 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 20 Sep 2022 20:05:08 +0000 Subject: [PATCH 018/152] Cast output iterator to raw pointer. 
--- .../cuco/detail/static_multimap/device_view_impl.inl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c6612a7c8..d07154445 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -22,6 +22,7 @@ #include #include +#include namespace cuco { template ::device_view_ #if defined(CUCO_HAS_CUDA_BARRIER) cooperative_groups::memcpy_async( g, - output_begin + offset, + &thrust::raw_reference_cast(*(output_begin + offset)), output_buffer, cuda::aligned_size_t(sizeof(value_type) * num_outputs)); #else cooperative_groups::memcpy_async( - g, output_begin + offset, output_buffer, sizeof(value_type) * num_outputs); + g, + &thrust::raw_reference_cast(*(output_begin + offset)), + output_buffer, + sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER return; #endif // end CUCO_HAS_CG_MEMCPY_ASYNC } + #pragma nv_diag_suppress 128 // warning: unreachable for (auto index = lane_id; index < num_outputs; index += g.size()) { *(output_begin + offset + index) = output_buffer[index]; } + #pragma nv_diag_default 128 } /** From a0b2b670a417f2638062379e59b0fcf47da2ef80 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 20 Sep 2022 20:05:33 +0000 Subject: [PATCH 019/152] Remove unused include. 
--- include/cuco/detail/static_multimap/kernels.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index f3820bf64..019e66e31 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -23,8 +23,6 @@ #include -#include - #include namespace cuco { From d9100cd63d9f9e2f01d8fc01f7e540d48ccb9c7d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Sep 2022 20:05:58 +0000 Subject: [PATCH 020/152] [pre-commit.ci] auto code formatting --- .../detail/static_multimap/device_view_impl.inl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index d07154445..4c450b335 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -507,20 +507,19 @@ class static_multimap::device_view_ output_buffer, cuda::aligned_size_t(sizeof(value_type) * num_outputs)); #else - cooperative_groups::memcpy_async( - g, - &thrust::raw_reference_cast(*(output_begin + offset)), - output_buffer, - sizeof(value_type) * num_outputs); + cooperative_groups::memcpy_async(g, + &thrust::raw_reference_cast(*(output_begin + offset)), + output_buffer, + sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER return; #endif // end CUCO_HAS_CG_MEMCPY_ASYNC } - #pragma nv_diag_suppress 128 // warning: unreachable +#pragma nv_diag_suppress 128 // warning: unreachable for (auto index = lane_id; index < num_outputs; index += g.size()) { *(output_begin + offset + index) = output_buffer[index]; } - #pragma nv_diag_default 128 +#pragma nv_diag_default 128 } /** From b97aa9fbac3a16f4ebb80b880e764d24a5e3bdbc Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 20 Sep 2022 23:45:13 +0200 Subject: [PATCH 021/152] Guard fallback flush method to prevent compiler warnings. Co-authored-by: Yunsong Wang --- include/cuco/detail/static_multimap/device_view_impl.inl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 4c450b335..f132a8fa8 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -512,14 +512,13 @@ class static_multimap::device_view_ output_buffer, sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER - return; #endif // end CUCO_HAS_CG_MEMCPY_ASYNC } -#pragma nv_diag_suppress 128 // warning: unreachable - for (auto index = lane_id; index < num_outputs; index += g.size()) { - *(output_begin + offset + index) = output_buffer[index]; + if constexpr (not thrust::is_contiguous_iterator_v) { + for (auto index = lane_id; index < num_outputs; index += g.size()) { + *(output_begin + offset + index) = output_buffer[index]; + } } -#pragma nv_diag_default 128 } /** From 1130a8bce088fc9dacc60a29018d8ad1f4bcc280 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 20 Sep 2022 22:13:32 +0000 Subject: [PATCH 022/152] Reorder logic in flush_output_buffer. 
--- .../cuco/detail/static_multimap/device_view_impl.inl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index f132a8fa8..9e354b096 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -498,8 +498,13 @@ class static_multimap::device_view_ } offset = g.shfl(offset, 0); - if constexpr (thrust::is_contiguous_iterator_v) { #if defined(CUCO_HAS_CG_MEMCPY_ASYNC) + constexpr bool uses_memcpy_async = thrust::is_contiguous_iterator_v; +#else + constexpr bool uses_memcpy_async = false; +#endif // end CUCO_HAS_CG_MEMCPY_ASYNC + + if constexpr (uses_memcpy_async) { #if defined(CUCO_HAS_CUDA_BARRIER) cooperative_groups::memcpy_async( g, @@ -512,9 +517,9 @@ class static_multimap::device_view_ output_buffer, sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER -#endif // end CUCO_HAS_CG_MEMCPY_ASYNC } - if constexpr (not thrust::is_contiguous_iterator_v) { + + if constexpr (not uses_memcpy_async) { for (auto index = lane_id; index < num_outputs; index += g.size()) { *(output_begin + offset + index) = output_buffer[index]; } From d664744dfe9d072e1cb466596ab6f4c4bbe63702 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Sep 2022 22:14:13 +0000 Subject: [PATCH 023/152] [pre-commit.ci] auto code formatting --- include/cuco/detail/static_multimap/device_view_impl.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 9e354b096..9e328898d 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -502,7 +502,7 @@ class static_multimap::device_view_ constexpr bool 
uses_memcpy_async = thrust::is_contiguous_iterator_v; #else constexpr bool uses_memcpy_async = false; -#endif // end CUCO_HAS_CG_MEMCPY_ASYNC +#endif // end CUCO_HAS_CG_MEMCPY_ASYNC if constexpr (uses_memcpy_async) { #if defined(CUCO_HAS_CUDA_BARRIER) From 9fa08684b9334ed9ac0470683cc901579a685416 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 23 Sep 2022 15:20:44 -0400 Subject: [PATCH 024/152] Document that minimum required CMake version is now 3.23.1 With rapids-cmake now requiring CMake 3.23.1 update consumers to correctly express this requirement --- CMakeLists.txt | 2 +- README.md | 2 +- benchmarks/CMakeLists.txt | 2 +- ci/gpu/build.sh | 2 +- examples/CMakeLists.txt | 2 +- tests/CMakeLists.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e1b5055d9..981d790ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake diff --git a/README.md b/README.md index dc8d4db80..a5283b82a 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ We recommend using [CMake Package Manager (CPM)](https://github.com/TheLartians/ With CPM, getting `cuCollections` is easy: ``` -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(path/to/CPM.cmake) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index a037dc603..b70105d7d 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) CPMAddPackage( NAME benchmark diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8ae26bcf4..be4c72f92 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -32,7 +32,7 @@ nvidia-smi gpuci_logger "Install Dependencies" . /opt/conda/etc/profile.d/conda.sh -conda create -y -n cuda -c nvidia -c conda-forge "cudatoolkit=${CUDA_VER}" "cmake>=3.18.*" +conda create -y -n cuda -c nvidia -c conda-forge "cudatoolkit=${CUDA_VER}" "cmake>=3.23.1" conda activate cuda gpuci_logger "Check versions" diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0a83a3cb1..1205c774d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2d1d25526..dd3ea3bc3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(CTest) From cd2119096ddbfd38705f6b9326e0ce81a1288fe0 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 7 Oct 2022 11:02:44 -0700 Subject: [PATCH 025/152] num_successes_ removed --- benchmarks/hash_table/static_map_bench.cu | 40 +++---- include/cuco/detail/dynamic_map.inl | 49 ++++----- include/cuco/detail/dynamic_map_kernels.cuh | 110 +++++++------------- include/cuco/dynamic_map.cuh | 1 - tests/dynamic_map/erase_test.cu | 11 +- 5 files changed, 85 insertions(+), 126 deletions(-) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index ce1015b8d..04b0e5372 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -293,56 +293,60 @@ static void BM_static_map_erase_none(::benchmark::State& state) int64_t(state.range(0))); } -/* -BENCHMARK_TEMPLATE(BM_static_map_erase_none, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) 
->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); + ->Apply(generate_size_and_occupancy) + ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int64_t, int64_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); + ->Apply(generate_size_and_occupancy) + ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_search_none, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -*/ BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) 
->Apply(generate_size_and_occupancy) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) +BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0e0020e97..4b857256a 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -39,7 +39,8 @@ dynamic_map::dynamic_map( alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); + submap_num_successes_.push_back(submaps_[0]->get_num_successes()); + d_submap_num_successes_ = submap_num_successes_; } template @@ -69,13 +70,11 @@ dynamic_map::dynamic_map( submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); d_submap_num_successes_ = submap_num_successes_; - num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); } template dynamic_map::~dynamic_map() { - std::allocator_traits::deallocate(counter_allocator_, num_successes_, 1); } template @@ -108,6 +107,8 @@ void dynamic_map::reserve(std::size_t n) sentinel::empty_key{empty_key_sentinel_}, 
sentinel::empty_value{empty_value_sentinel_}, alloc_)); + submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); + d_submap_num_successes_ = submap_num_successes_; } submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); @@ -140,7 +141,7 @@ void dynamic_map::insert(InputIt first, if (capacity_remaining >= min_insert_size_) { // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); + CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type))); auto n = std::min(capacity_remaining, num_to_insert); auto const block_size = 128; @@ -153,7 +154,8 @@ void dynamic_map::insert(InputIt first, first + n, submap_views_.data().get(), submap_mutable_views_.data().get(), - num_successes_, + //num_successes_, + d_submap_num_successes_.data().get(), submap_idx, submaps_.size(), hash, @@ -161,7 +163,7 @@ void dynamic_map::insert(InputIt first, std::size_t h_num_successes; CUCO_CUDA_TRY(cudaMemcpy( - &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + &h_num_successes, submap_num_successes_[submap_idx], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; @@ -187,14 +189,11 @@ void dynamic_map::erase(InputIt first, // TODO: memset an atomic variable is unsafe static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type))); // zero out submap success counters - if (submaps_.size() > 1) { - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); - for (uint32_t i = 0; i < submaps_.size(); ++i) { - CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); - } + static_assert(sizeof(std::size_t) == 
sizeof(atomic_ctr_type)); + for (uint32_t i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); @@ -203,29 +202,19 @@ void dynamic_map::erase(InputIt first, <<>>(first, first + num_keys, submap_mutable_views_.data().get(), - num_successes_, d_submap_num_successes_.data().get(), submaps_.size(), hash, key_equal); - // update total dynamic map size - std::size_t h_num_successes; - CUCO_CUDA_TRY( - cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); - size_ -= h_num_successes; - - if (submaps_.size() == 1) { - submaps_[0]->size_ -= h_num_successes; - } else { - for (uint32_t i = 0; i < submaps_.size(); ++i) { - std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, - submap_num_successes_[i], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost)); - submaps_[i]->size_ -= h_submap_num_successes; - } + for (uint32_t i = 0; i < submaps_.size(); ++i) { + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost)); + submaps_[i]->size_ -= h_submap_num_successes; + size_ -= h_submap_num_successes; } } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 913149021..7e2f84fce 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -147,7 +147,8 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - atomicT* num_successes, + //atomicT* num_successes, + atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, Hash hash, @@ -183,7 +184,10 @@ __global__ void insert(InputIt first, } std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if 
(threadIdx.x == 0) { *num_successes += block_num_successes; } + if (threadIdx.x == 0) { + //*num_successes += block_num_successes; + *submap_num_successes[insert_idx] += block_num_successes; + } } /** @@ -221,55 +225,37 @@ template BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; extern __shared__ unsigned long long submap_block_num_successes[]; - std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid; - if (num_submaps > 1) { - for (int i = threadIdx.x; i < num_submaps; i += block_size) - submap_block_num_successes[i] = 0; - __syncthreads(); + for (int i = threadIdx.x; i < num_submaps; i += block_size) + submap_block_num_successes[i] = 0; + __syncthreads(); - while (it < last) { - int i; - for (i = 0; i < num_submaps; ++i) { - if (submap_mutable_views[i].erase(*it, hash, key_equal)) { - thread_num_successes++; - atomicAdd(&submap_block_num_successes[i], 1); - break; - } + while (it < last) { + int i; + for (i = 0; i < num_submaps; ++i) { + if (submap_mutable_views[i].erase(*it, hash, key_equal)) { + atomicAdd(&submap_block_num_successes[i], 1); + break; } - it += gridDim.x * blockDim.x; - } - } else { - while (it < last) { - if (submap_mutable_views[0].erase(*it, hash, key_equal)) thread_num_successes++; - it += gridDim.x * blockDim.x; } + it += gridDim.x * blockDim.x; } + __syncthreads(); - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } - - if (num_submaps > 1) { - for (int i = 0; i < num_submaps; ++i) { - if (threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); - } + for (int i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); } } 
} @@ -311,60 +297,40 @@ template BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; extern __shared__ unsigned long long submap_block_num_successes[]; - std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; - if (num_submaps > 1) { - for (int i = threadIdx.x; i < num_submaps; i += block_size) - submap_block_num_successes[i] = 0; - __syncthreads(); + for (int i = threadIdx.x; i < num_submaps; i += block_size) + submap_block_num_successes[i] = 0; + __syncthreads(); - while (it < last) { - auto erased = false; - int i; - for (i = 0; i < num_submaps; ++i) { - erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); - if (erased) { break; } - } - if (erased && tile.thread_rank() == 0) { - thread_num_successes++; - atomicAdd(&submap_block_num_successes[i], 1); - } - it += (gridDim.x * blockDim.x) / tile_size; + while (it < last) { + auto erased = false; + int i; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); + if (erased) { break; } } - } else { - while (it < last) { - auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal); - if (erased && tile.thread_rank() == 0) thread_num_successes++; - - it += (gridDim.x * blockDim.x) / tile_size; + if (erased && tile.thread_rank() == 0) { + atomicAdd(&submap_block_num_successes[i], 1); } + it += (gridDim.x * blockDim.x) / tile_size; } + __syncthreads(); - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } - - if (num_submaps > 1) { - for (int i = 0; i < num_submaps; ++i) { - if (threadIdx.x == 0) { - submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); - } + for (int i = 0; i < num_submaps; 
++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); } } } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index f34eb3d86..d22ff1d8c 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -338,7 +338,6 @@ class dynamic_map { thrust::device_vector submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert std::vector submap_num_successes_; ///< number of succesfully erased keys for each submap thrust::device_vector diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 1c81f400a..4d046a89b 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#include +#include +#include + #include #include #include -#include +#include -#include TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) { @@ -121,7 +122,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys, d_keys_exist2.end(), [] __device__(const bool key_found) { return key_found; })); - + REQUIRE(map.get_size() == 2 * num_keys); // check that keys can be successfully deleted from all submaps (some will be unsuccessful // erases) From 4c1952da326eaf38ca581b7bc2b5085ba9a2fdfc Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 7 Oct 2022 16:17:34 -0700 Subject: [PATCH 026/152] doxygen warning fixes --- include/cuco/detail/dynamic_map.inl | 50 ++++++++++++++--------------- include/cuco/dynamic_map.cuh | 45 ++++++++++++++++---------- include/cuco/static_map.cuh | 6 ++++ 3 files changed, 58 insertions(+), 43 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 4b857256a..33def1f83 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -21,7 +21,8 @@ dynamic_map::dynamic_map( std::size_t initial_capacity, sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), erased_key_sentinel_(empty_key_sentinel.value), @@ -36,7 +37,7 @@ dynamic_map::dynamic_map( initial_capacity, sentinel::empty_key{empty_key_sentinel}, sentinel::empty_value{empty_value_sentinel}, - alloc)); + alloc, stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); @@ -49,7 +50,8 @@ dynamic_map::dynamic_map( sentinel::empty_key empty_key_sentinel, 
sentinel::empty_value empty_value_sentinel, sentinel::erased_key erased_key_sentinel, - Allocator const& alloc) + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), erased_key_sentinel_(erased_key_sentinel.value), @@ -65,7 +67,7 @@ dynamic_map::dynamic_map( sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, sentinel::erased_key{erased_key_sentinel_}, - alloc)); + alloc, stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); @@ -73,12 +75,7 @@ dynamic_map::dynamic_map( } template -dynamic_map::~dynamic_map() -{ -} - -template -void dynamic_map::reserve(std::size_t n) +void dynamic_map::reserve(std::size_t n, cudaStream_t stream) { int64_t num_elements_remaining = n; uint32_t submap_idx = 0; @@ -98,18 +95,16 @@ void dynamic_map::reserve(std::size_t n) sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, sentinel::erased_key{erased_key_sentinel_}, - alloc_)); - submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); - d_submap_num_successes_ = submap_num_successes_; + alloc_, stream)); } else { submaps_.push_back(std::make_unique>( submap_capacity, sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, - alloc_)); - submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); - d_submap_num_successes_ = submap_num_successes_; + alloc_, stream)); } + submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); + d_submap_num_successes_ = submap_num_successes_; submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); capacity_ *= 2; @@ -125,11 +120,12 @@ template void 
dynamic_map::insert(InputIt first, InputIt last, Hash hash, - KeyEqual key_equal) + KeyEqual key_equal, + cudaStream_t stream) { std::size_t num_to_insert = std::distance(first, last); - reserve(size_ + num_to_insert); + reserve(size_ + num_to_insert, stream); uint32_t submap_idx = 0; while (num_to_insert > 0) { @@ -150,11 +146,10 @@ void dynamic_map::insert(InputIt first, auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); detail::insert> - <<>>(first, + <<>>(first, first + n, submap_views_.data().get(), submap_mutable_views_.data().get(), - //num_successes_, d_submap_num_successes_.data().get(), submap_idx, submaps_.size(), @@ -178,7 +173,8 @@ template void dynamic_map::erase(InputIt first, InputIt last, Hash hash, - KeyEqual key_equal) + KeyEqual key_equal, + cudaStream_t stream) { std::size_t num_keys = std::distance(first, last); @@ -199,7 +195,7 @@ void dynamic_map::erase(InputIt first, auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); detail::erase> - <<>>(first, + <<>>(first, first + num_keys, submap_mutable_views_.data().get(), d_submap_num_successes_.data().get(), @@ -221,7 +217,8 @@ void dynamic_map::erase(InputIt first, template template void dynamic_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); auto const block_size = 128; @@ -229,7 +226,7 @@ void dynamic_map::find( auto const tile_size = 4; auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - detail::find<<>>( + detail::find<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } @@ -237,7 +234,8 @@ void dynamic_map::find( template template void dynamic_map::contains( - InputIt first, InputIt last, OutputIt 
output_begin, Hash hash, KeyEqual key_equal) + InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); auto const block_size = 128; @@ -245,7 +243,7 @@ void dynamic_map::contains( auto const tile_size = 4; auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - detail::contains<<>>( + detail::contains<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index d22ff1d8c..f2239b75e 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -101,14 +101,14 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; - using key_type = Key; - using mapped_type = Value; - using atomic_ctr_type = cuda::atomic; - using view_type = typename static_map::device_view; - using mutable_view_type = typename static_map::device_mutable_view; - using counter_allocator_type = - typename std::allocator_traits::rebind_alloc; + using value_type = cuco::pair_type; ///< Type of key/value pairs + using key_type = Key; ///< Key type + using mapped_type = Value; ///< Type of mapped values + using atomic_ctr_type = cuda::atomic; ///< Atomic counter type + using view_type = typename static_map::device_view; ///< Type for submap device view + using mutable_view_type = typename static_map::device_mutable_view; ///< Type for submap mutable device view + using counter_allocator_type = typename std::allocator_traits::rebind_alloc< + atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; @@ -141,11 +141,13 @@ class dynamic_map { * @param empty_key_sentinel The reserved key value for empty slots * 
@param empty_value_sentinel The reserved mapped value for empty slots * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels */ dynamic_map(std::size_t initial_capacity, sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + Allocator const& alloc = Allocator{}, + cudaStream_t stream = 0); /** * @brief Construct a dynamically-sized map with erase capability. @@ -162,11 +164,11 @@ class dynamic_map { * that contains either. * * @param initial_capacity The initial number of slots in the map - * @param growth_factor The factor by which the capacity increases when resizing * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param erased_key_sentinel The reserved key value for erased slots * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels * * @throw std::runtime error if the empty key sentinel and erased key sentinel * are the same value @@ -175,13 +177,14 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, sentinel::erased_key erased_key_sentinel, - Allocator const& alloc = Allocator{}); + Allocator const& alloc = Allocator{}, + cudaStream_t stream = 0); /** * @brief Destroy the map and frees its contents * */ - ~dynamic_map(); + ~dynamic_map() {} /** * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. @@ -189,8 +192,9 @@ class dynamic_map { * If there is already enough space for `n` key/value pairs, the capacity remains the same. 
* * @param n The number of key value pairs for which there must be space + * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n); + void reserve(std::size_t n, cudaStream_t stream = 0); /** * @brief Inserts all key/value pairs in the range `[first, last)`. @@ -206,11 +210,13 @@ class dynamic_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Erases keys in the range `[first, last)`. @@ -244,7 +250,8 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. 
@@ -263,6 +270,7 @@ class dynamic_map { * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template {erased_key_sentinel_}); } + /** + * @brief Gets the number of successfully inserted/erased keys from the last + * insert/erase operation + * + * @return Number of successfully inserted/erased keys from the last insert/erase operation + */ atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; } private: From 80f4d14265a63c35ce5e8ea4eb0f9893c33f7980 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Oct 2022 23:17:51 +0000 Subject: [PATCH 027/152] [pre-commit.ci] auto code formatting --- include/cuco/detail/dynamic_map.inl | 84 +++++++++++---------- include/cuco/detail/dynamic_map_kernels.cuh | 12 ++- include/cuco/dynamic_map.cuh | 43 ++++++----- tests/dynamic_map/erase_test.cu | 5 +- 4 files changed, 78 insertions(+), 66 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 33def1f83..8dcfd89cb 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -37,7 +37,8 @@ dynamic_map::dynamic_map( initial_capacity, sentinel::empty_key{empty_key_sentinel}, sentinel::empty_value{empty_value_sentinel}, - alloc, stream)); + alloc, + stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); @@ -67,7 +68,8 @@ dynamic_map::dynamic_map( sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, sentinel::erased_key{erased_key_sentinel_}, - alloc, stream)); + alloc, + stream)); 
submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); submap_num_successes_.push_back(submaps_[0]->get_num_successes()); @@ -95,13 +97,15 @@ void dynamic_map::reserve(std::size_t n, cudaStrea sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, sentinel::erased_key{erased_key_sentinel_}, - alloc_, stream)); + alloc_, + stream)); } else { submaps_.push_back(std::make_unique>( submap_capacity, sentinel::empty_key{empty_key_sentinel_}, sentinel::empty_value{empty_value_sentinel_}, - alloc_, stream)); + alloc_, + stream)); } submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); d_submap_num_successes_ = submap_num_successes_; @@ -117,11 +121,8 @@ void dynamic_map::reserve(std::size_t n, cudaStrea template template -void dynamic_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) +void dynamic_map::insert( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { std::size_t num_to_insert = std::distance(first, last); @@ -147,18 +148,20 @@ void dynamic_map::insert(InputIt first, detail::insert> <<>>(first, - first + n, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - d_submap_num_successes_.data().get(), - submap_idx, - submaps_.size(), - hash, - key_equal); + first + n, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + d_submap_num_successes_.data().get(), + submap_idx, + submaps_.size(), + hash, + key_equal); std::size_t h_num_successes; - CUCO_CUDA_TRY(cudaMemcpy( - &h_num_successes, submap_num_successes_[submap_idx], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaMemcpy(&h_num_successes, + submap_num_successes_[submap_idx], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost)); submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; @@ -170,11 +173,8 @@ void 
dynamic_map::insert(InputIt first, template template -void dynamic_map::erase(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) +void dynamic_map::erase( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { std::size_t num_keys = std::distance(first, last); @@ -196,19 +196,19 @@ void dynamic_map::erase(InputIt first, detail::erase> <<>>(first, - first + num_keys, - submap_mutable_views_.data().get(), - d_submap_num_successes_.data().get(), - submaps_.size(), - hash, - key_equal); + first + num_keys, + submap_mutable_views_.data().get(), + d_submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); for (uint32_t i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, - submap_num_successes_[i], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost)); + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost)); submaps_[i]->size_ -= h_submap_num_successes; size_ -= h_submap_num_successes; } @@ -216,9 +216,12 @@ void dynamic_map::erase(InputIt first, template template -void dynamic_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, - cudaStream_t stream) +void dynamic_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); auto const block_size = 128; @@ -233,9 +236,12 @@ void dynamic_map::find( template template -void dynamic_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal, - cudaStream_t stream) +void dynamic_map::contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto num_keys = std::distance(first, last); auto const block_size = 128; diff --git a/include/cuco/detail/dynamic_map_kernels.cuh 
b/include/cuco/detail/dynamic_map_kernels.cuh index 7e2f84fce..0eeb1a632 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -147,7 +147,7 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - //atomicT* num_successes, + // atomicT* num_successes, atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, @@ -184,7 +184,7 @@ __global__ void insert(InputIt first, } std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { + if (threadIdx.x == 0) { //*num_successes += block_num_successes; *submap_num_successes[insert_idx] += block_num_successes; } @@ -255,7 +255,7 @@ __global__ void erase(InputIt first, for (int i = 0; i < num_submaps; ++i) { if (threadIdx.x == 0) { submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); + cuda::std::memory_order_relaxed); } } } @@ -320,9 +320,7 @@ __global__ void erase(InputIt first, erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); if (erased) { break; } } - if (erased && tile.thread_rank() == 0) { - atomicAdd(&submap_block_num_successes[i], 1); - } + if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } it += (gridDim.x * blockDim.x) / tile_size; } __syncthreads(); @@ -330,7 +328,7 @@ __global__ void erase(InputIt first, for (int i = 0; i < num_submaps; ++i) { if (threadIdx.x == 0) { submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), - cuda::std::memory_order_relaxed); + cuda::std::memory_order_relaxed); } } } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index f2239b75e..3386208aa 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -101,14 +101,17 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); 
public: - using value_type = cuco::pair_type; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Atomic counter type - using view_type = typename static_map::device_view; ///< Type for submap device view - using mutable_view_type = typename static_map::device_mutable_view; ///< Type for submap mutable device view + using value_type = cuco::pair_type; ///< Type of key/value pairs + using key_type = Key; ///< Key type + using mapped_type = Value; ///< Type of mapped values + using atomic_ctr_type = cuda::atomic; ///< Atomic counter type + using view_type = + typename static_map::device_view; ///< Type for submap device view + using mutable_view_type = + typename static_map::device_mutable_view; ///< Type for submap mutable + ///< device view using counter_allocator_type = typename std::allocator_traits::rebind_alloc< - atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters + atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; @@ -147,7 +150,7 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); + cudaStream_t stream = 0); /** * @brief Construct a dynamically-sized map with erase capability. 
@@ -178,7 +181,7 @@ class dynamic_map { sentinel::empty_value empty_value_sentinel, sentinel::erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); + cudaStream_t stream = 0); /** * @brief Destroy the map and frees its contents @@ -215,7 +218,10 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** @@ -250,7 +256,10 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + void erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** @@ -279,8 +288,8 @@ class dynamic_map { void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** @@ -308,8 +317,8 @@ class dynamic_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = 0); /** @@ -347,8 +356,8 @@ class dynamic_map { submaps_; ///< vector of pointers to each submap thrust::device_vector submap_views_; ///< vector of device views for each submap thrust::device_vector - submap_mutable_views_; ///< vector of mutable device views for each submap - std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert + submap_mutable_views_; ///< vector of mutable device views for each submap + std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert std::vector submap_num_successes_; ///< number of succesfully erased keys for each 
submap thrust::device_vector diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 4d046a89b..0e53197ea 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include @@ -23,7 +23,6 @@ #include - TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) { using Key = T; @@ -122,7 +121,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys, d_keys_exist2.end(), [] __device__(const bool key_found) { return key_found; })); - + REQUIRE(map.get_size() == 2 * num_keys); // check that keys can be successfully deleted from all submaps (some will be unsuccessful // erases) From 79075af992216e3b14f0dca88b53ff6acbfbc739 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 4 Nov 2022 11:05:48 -0400 Subject: [PATCH 028/152] Update rapids-cmake version to get support for Ada and Hopper --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 981d790ac..f77bc54dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) From eef6c1263e5e04e3b3e69cf4a120e123a8f2a1c0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 9 Nov 2022 19:35:04 -0500 Subject: [PATCH 029/152] Make vars and retrieve_all const --- include/cuco/detail/static_map.inl | 12 ++++++------ include/cuco/static_map.cuh | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff 
--git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 09e9d05dd..6f18a4f8d 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -92,7 +92,7 @@ template void static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -130,7 +130,7 @@ void static_map::insert_if(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -161,7 +161,7 @@ void static_map::erase( CUCO_RUNTIME_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), "You must provide a unique erased key sentinel value at map construction."); - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -194,7 +194,7 @@ void static_map::find(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -210,7 +210,7 @@ void static_map::find(InputIt first, template template std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cudaStream_t stream) + KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const { static_assert(sizeof(pair_atomic_type) == sizeof(value_type)); auto slots_begin = reinterpret_cast(slots_); @@ -263,7 +263,7 @@ void static_map::contains(InputIt first, KeyEqual key_equal, cudaStream_t stream) const { - auto num_keys = std::distance(first, last); + auto const num_keys = std::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; diff 
--git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 1daad9965..4ca5755d7 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -382,7 +382,7 @@ class static_map { template std::pair retrieve_all(KeyOut keys_out, ValueOut values_out, - cudaStream_t stream = 0); + cudaStream_t stream = 0) const; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. From 62fbaf031a6eb81698271491e85c454be5edc3a3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 9 Nov 2022 19:44:33 -0500 Subject: [PATCH 030/152] Fix range bug in insert functions --- include/cuco/detail/static_map_kernels.cuh | 24 ++++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 7a3ca0dfa..90aa124a6 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -90,13 +92,13 @@ __global__ void insert( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + auto const n = thrust::distance(first, last); + auto tid = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - typename viewT::value_type const insert_pair{*it}; + while (tid < n) { + typename viewT::value_type const insert_pair{*(first + tid)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + tid += gridDim.x * block_size; } // compute number of successfully inserted elements for each block @@ -144,17 +146,17 @@ __global__ void insert( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = 
first + tid / tile_size; + auto const n = thrust::distance(first, last); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += (gridDim.x * block_size) / tile_size; } // compute number of successfully inserted elements for each block From 81b75e8ac9e196120cab47d5d9745ea28acb5948 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 9 Nov 2022 19:54:58 -0500 Subject: [PATCH 031/152] Fix iterator range bugs in static map kernels --- include/cuco/detail/static_map_kernels.cuh | 69 +++++++++++----------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 90aa124a6..80cf1142e 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -93,12 +93,12 @@ __global__ void insert( std::size_t thread_num_successes = 0; auto const n = thrust::distance(first, last); - auto tid = block_size * blockIdx.x + threadIdx.x; + auto idx = block_size * blockIdx.x + threadIdx.x; - while (tid < n) { - typename viewT::value_type const insert_pair{*(first + tid)}; + while (idx < n) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - tid += gridDim.x * block_size; + idx += gridDim.x * block_size; } // compute number of successfully inserted elements for each block @@ -178,12 +178,12 @@ __global__ void erase( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * 
blockIdx.x + threadIdx.x; - auto it = first + tid; + auto const n = thrust::distance(first, last); + auto idx = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - if (view.erase(*it, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + while (idx < n) { + if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } + idx += gridDim.x * block_size; } // compute number of successfully inserted elements for each block @@ -208,15 +208,15 @@ __global__ void erase( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto const n = thrust::distance(first, last); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { - if (view.erase(tile, *it, hash, key_equal) and tile.thread_rank() == 0) { + while (idx < n) { + if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += (gridDim.x * block_size) / tile_size; } // compute number of successfully inserted elements for each block @@ -278,17 +278,16 @@ __global__ void insert_if_n(InputIt first, std::size_t thread_num_successes = 0; auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(stencil + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(stencil + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } } - i += (gridDim.x * 
block_size) / tile_size; + idx += (gridDim.x * block_size) / tile_size; } // compute number of successfully inserted elements for each block @@ -330,11 +329,11 @@ template (cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ Value writeBuffer[block_size]; + auto key_idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ Value writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { + while (key_idx < n) { auto key = *(first + key_idx); auto found = view.find(tile, key, hash, key_equal); @@ -449,11 +448,11 @@ template (cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto key_idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { + while (key_idx < n) { auto key = *(first + key_idx); auto found = view.contains(tile, key, hash, key_equal); From fed4f43d11ca6ccf8b404654c89dd43fdf550905 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 13:21:06 -0500 Subject: [PATCH 032/152] Apply suggestions from code review Co-authored-by: Jake Hemstad --- include/cuco/detail/static_map_kernels.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 80cf1142e..775b042ec 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -93,7 +93,7 @@ __global__ void insert( std::size_t thread_num_successes = 0; auto const n = thrust::distance(first, last); - auto idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { typename viewT::value_type const insert_pair{*(first + idx)}; @@ -148,7 +148,7 @@ __global__ void insert( auto const n = 
thrust::distance(first, last); auto tile = cg::tiled_partition(cg::this_thread_block()); - auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { // force conversion to value_type @@ -179,7 +179,7 @@ __global__ void erase( std::size_t thread_num_successes = 0; auto const n = thrust::distance(first, last); - auto idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } @@ -210,7 +210,7 @@ __global__ void erase( auto const n = thrust::distance(first, last); auto tile = cg::tiled_partition(cg::this_thread_block()); - auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { @@ -278,7 +278,7 @@ __global__ void insert_if_n(InputIt first, std::size_t thread_num_successes = 0; auto tile = cg::tiled_partition(cg::this_thread_block()); - auto idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (pred(*(stencil + idx))) { From 0c780604631035072d02b5057e4496ae8c35a8a0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Nov 2022 18:21:18 +0000 Subject: [PATCH 033/152] [pre-commit.ci] auto code formatting --- include/cuco/detail/static_map_kernels.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 775b042ec..0d7a7127f 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -93,7 +93,7 @@ __global__ void insert( std::size_t 
thread_num_successes = 0; auto const n = thrust::distance(first, last); - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { typename viewT::value_type const insert_pair{*(first + idx)}; @@ -148,7 +148,7 @@ __global__ void insert( auto const n = thrust::distance(first, last); auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { // force conversion to value_type @@ -179,7 +179,7 @@ __global__ void erase( std::size_t thread_num_successes = 0; auto const n = thrust::distance(first, last); - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } @@ -210,7 +210,7 @@ __global__ void erase( auto const n = thrust::distance(first, last); auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { @@ -277,8 +277,8 @@ __global__ void insert_if_n(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (pred(*(stencil + idx))) { From d691e78533e64ee0248b2a560cd39892292655a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 14:29:46 -0500 Subject: [PATCH 034/152] Update kernels to take int64_t n --- 
include/cuco/detail/static_map.inl | 23 ++-- include/cuco/detail/static_map_kernels.cuh | 120 ++++++++++----------- include/cuco/detail/utils.hpp | 15 ++- 3 files changed, 83 insertions(+), 75 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 6f18a4f8d..8e27ca0d3 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -92,7 +93,7 @@ template void static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -106,8 +107,8 @@ void static_map::insert( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::insert<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::insert + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -130,7 +131,7 @@ void static_map::insert_if(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -161,7 +162,7 @@ void static_map::erase( CUCO_RUNTIME_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), "You must provide a unique erased key sentinel value at map construction."); - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -175,8 +176,8 @@ void static_map::erase( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, 
sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::erase<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::erase + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -194,7 +195,7 @@ void static_map::find(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -204,7 +205,7 @@ void static_map::find(InputIt first, auto view = get_device_view(); detail::find - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template @@ -263,7 +264,7 @@ void static_map::contains(InputIt first, KeyEqual key_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -273,7 +274,7 @@ void static_map::contains(InputIt first, auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 0d7a7127f..5ef1644c3 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -15,8 +15,6 @@ */ #pragma once -#include - #include #include @@ -38,6 +36,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all 
values in `slots` are initialized @@ -49,13 +48,13 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { - auto tid = block_size * blockIdx.x + threadIdx.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += gridDim.x * block_size; } } @@ -72,8 +71,9 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -86,14 +86,13 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto const n = thrust::distance(first, last); - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { typename viewT::value_type const insert_pair{*(first + idx)}; @@ -125,8 +124,9 @@ __global__ void insert( * @tparam viewT Type of device view 
allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -140,15 +140,14 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto const n = thrust::distance(first, last); - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { // force conversion to value_type @@ -172,14 +171,13 @@ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto const n = thrust::distance(first, last); - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } @@ -202,15 +200,14 @@ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash 
hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto const n = thrust::distance(first, last); - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { @@ -246,6 +243,7 @@ __global__ void erase( * and argument type is convertible from `std::iterator_traits::value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param n Number of elements to insert * @param num_successes The number of successfully inserted key/value pairs @@ -265,7 +263,7 @@ template __global__ void insert_if_n(InputIt first, - std::size_t n, + int64_t n, atomicT* num_successes, viewT view, StencilIt stencil, @@ -312,8 +310,9 @@ __global__ void insert_if_n(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -327,14 +326,13 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - 
auto const n = thrust::distance(first, last); - auto key_idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ Value writeBuffer[block_size]; - while (key_idx < n) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(key, hash, key_equal); /* @@ -348,8 +346,8 @@ __global__ void find( ? view.get_empty_value_sentinel() : found->second.load(cuda::std::memory_order_relaxed); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += gridDim.x * block_size; } } @@ -372,8 +370,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -388,15 +387,14 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto const n = thrust::distance(first, last); - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto key_idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ Value writeBuffer[block_size / tile_size]; - while (key_idx < n) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(tile, key, hash, key_equal); /* @@ -412,10 
+410,8 @@ __global__ void find( : found->second.load(cuda::std::memory_order_relaxed); } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += (gridDim.x * block_size) / tile_size; } } @@ -432,8 +428,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -446,14 +443,13 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto const n = thrust::distance(first, last); - auto key_idx = block_size * blockIdx.x + threadIdx.x; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ bool writeBuffer[block_size]; - while (key_idx < n) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); /* * The ld.relaxed.gpu instruction used in view.find causes L1 to @@ -464,8 +460,8 @@ __global__ void contains( */ writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += gridDim.x * block_size; } } @@ -487,8 +483,9 @@ __global__ void contains( * @tparam viewT Type of device view allowing access of hash map 
storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -502,15 +499,14 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto const n = thrust::distance(first, last); - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto key_idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ bool writeBuffer[block_size / tile_size]; - while (key_idx < n) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.contains(tile, key, hash, key_equal); /* @@ -522,10 +518,8 @@ __global__ void contains( */ if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += (gridDim.x * block_size) / tile_size; } } diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 40697ff5c..b477ed520 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,11 @@ #pragma once +#include + +#include +#include + namespace cuco { namespace detail { @@ -49,5 +54,13 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm return grid_size; } +template +constexpr inline int64_t distance(Iterator begin, Iterator end) +{ + auto const res = std::distance(begin, end); + CUCO_RUNTIME_EXPECTS(res >= 0, "Potential overflow"); + return static_cast(res); +} + } // namespace detail } // namespace cuco From 6e1b0e84aeac24c95f0fd6517c913330a830f84c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 14:36:28 -0500 Subject: [PATCH 035/152] Minor cleanup --- include/cuco/detail/utils.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index b477ed520..4724e7156 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -57,9 +57,8 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm template constexpr inline int64_t distance(Iterator begin, Iterator end) { - auto const res = std::distance(begin, end); - CUCO_RUNTIME_EXPECTS(res >= 0, "Potential overflow"); - return static_cast(res); + // `int64_t` instead of arch-dependant `long int` + return static_cast(std::distance(begin, end)); } } // namespace detail From 6f5e8c5eead0d4798cedee2b9bbb7d74b122cecf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 14:45:45 -0500 Subject: [PATCH 036/152] Use loop_stride const var --- include/cuco/detail/static_map_kernels.cuh | 60 +++++++++++++--------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 5ef1644c3..705aae7fb 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ 
b/include/cuco/detail/static_map_kernels.cuh @@ -50,11 +50,12 @@ template __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < size) { new (&slots[idx].first) atomic_key_type{k}; new (&slots[idx].second) atomic_mapped_type{v}; - idx += gridDim.x * block_size; + idx += loop_stride; } } @@ -92,12 +93,13 @@ __global__ void insert( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - int64_t idx = block_size * blockIdx.x + threadIdx.x; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - idx += gridDim.x * block_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -146,8 +148,9 @@ __global__ void insert( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { // force conversion to value_type @@ -155,7 +158,7 @@ __global__ void insert( if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -177,11 +180,12 @@ __global__ void erase( __shared__ typename BlockReduce::TempStorage temp_storage; 
std::size_t thread_num_successes = 0; - int64_t idx = block_size * blockIdx.x + threadIdx.x; + const int64_t loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; while (idx < n) { if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } - idx += gridDim.x * block_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -206,14 +210,15 @@ __global__ void erase( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -275,8 +280,9 @@ __global__ void insert_if_n(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; while (idx < n) { if (pred(*(stencil + idx))) { @@ -285,7 +291,7 @@ __global__ void insert_if_n(InputIt first, thread_num_successes++; } } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -328,7 +334,8 @@ template second.load(cuda::std::memory_order_relaxed); 
__syncthreads(); *(output_begin + idx) = writeBuffer[threadIdx.x]; - idx += gridDim.x * block_size; + idx += loop_stride; } } @@ -389,8 +396,9 @@ template (cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ Value writeBuffer[block_size / tile_size]; while (idx < n) { @@ -411,7 +419,7 @@ __global__ void find( } __syncthreads(); if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -445,7 +453,8 @@ template (cg::this_thread_block()); - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ bool writeBuffer[block_size / tile_size]; while (idx < n) { @@ -519,7 +529,7 @@ __global__ void contains( if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } From 331421697188a54f23a8cfdfdf5557fc5389bc78 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 15:29:00 -0500 Subject: [PATCH 037/152] Fix iterator range bugs for multimap --- .../cuco/detail/static_multimap/kernels.cuh | 140 +++++++++--------- .../static_multimap/static_multimap.inl | 68 +++++---- 2 files changed, 112 insertions(+), 96 deletions(-) diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index 019e66e31..c010fa8f3 100644 --- 
a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -40,6 +40,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all values in `slots` are initialized @@ -50,13 +51,14 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * blockDim.x; + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += loop_stride; } } @@ -76,21 +78,21 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of key/value pairs to insert * @param view Mutable device view used to access the hash map's slot storage */ template -__global__ void insert(InputIt first, InputIt last, viewT view) +__global__ void insert(InputIt first, int64_t n, viewT view) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + 
threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; view.insert(tile, insert_pair); - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -115,6 +117,7 @@ __global__ void insert(InputIt first, InputIt last, viewT view) * @tparam viewT Type of device view allowing access of hash map storage * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and * argument type is convertible from `std::iterator_traits::value_type`. + * * @param first Beginning of the sequence of key/value pairs * @param s Beginning of the stencil sequence * @param n Number of elements to insert @@ -127,19 +130,19 @@ template -__global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT view, Predicate pred) +__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto const tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(s + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(s + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; // force conversion to value_type view.insert(tile, insert_pair); } - i += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -162,7 +165,7 @@ __global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT vie * @tparam Equal Binary callable type * * @param first Beginning of the sequence of elements - * @param last End of the sequence of elements + * @param n Number of elements to 
query * @param output_begin Beginning of the sequence of booleans for the presence of each element * @param view Device view used to access the hash map's slot storage * @param equal The binary function to compare input element and slot content for equality @@ -174,15 +177,14 @@ template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Equal equal) +__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + idx < last) { + while (idx < n) { typename std::iterator_traits::value_type element = *(first + idx); auto found = [&]() { if constexpr (is_pair_contains) { return view.pair_contains(tile, element, equal); } @@ -199,7 +201,7 @@ __global__ void contains( if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -219,8 +221,9 @@ __global__ void contains( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable + * * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count + * @param n Number of the keys to query * @param num_matches The number of all the matches for a sequence of keys * @param view Device view used to access the hash map's slot storage * @param key_equal Binary function to 
compare two keys for equality @@ -233,24 +236,24 @@ template __global__ void count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.count_outer(tile, key, key_equal); } else { thread_num_matches += view.count(tile, key, key_equal); } - key_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -277,8 +280,9 @@ __global__ void count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable + * * @param first Beginning of the sequence of pairs to count - * @param last End of the sequence of pairs to count + * @param n Number of the pairs to query * @param num_matches The number of all the matches for a sequence of pairs * @param view Device view used to access the hash map's slot storage * @param pair_equal Binary function to compare two pairs for equality @@ -291,24 +295,24 @@ template __global__ void pair_count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, PairEqual pair_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - 
auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + pair_idx < last) { - typename viewT::value_type const pair = *(first + pair_idx); + while (idx < n) { + typename viewT::value_type const pair = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.pair_count_outer(tile, pair, pair_equal); } else { thread_num_matches += view.pair_count(tile, pair, pair_equal); } - pair_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -341,8 +345,9 @@ __global__ void pair_count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of the keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param num_matches Size of the output sequence * @param view Device view used to access the hash map's slot storage @@ -359,7 +364,7 @@ template __global__ void retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt output_begin, atomicT* num_matches, viewT view, @@ -370,10 +375,10 @@ __global__ void retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / 
probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type output_buffer[num_flushing_cgs][buffer_size]; // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization @@ -382,12 +387,12 @@ __global__ void retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + key_idx < last)) { - bool active_flag = first + key_idx < last; + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - auto key = *(first + key_idx); + auto key = *(first + idx); if constexpr (is_outer) { view.retrieve_outer(active_flushing_cg, probing_cg, @@ -408,7 +413,7 @@ __global__ void retrieve(InputIt first, key_equal); } } - key_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } // Final flush of output buffer @@ -448,8 +453,9 @@ __global__ void retrieve(InputIt first, * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param probe_output_begin Beginning of the sequence of the matched probe pairs * @param contained_output_begin Beginning of the sequence of the matched contained pairs * @param num_matches Size of the output sequence @@ -468,7 +474,7 @@ template __global__ void pair_retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, atomicT* num_matches, @@ -480,10 +486,10 @@ __global__ void 
pair_retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type probe_output_buffer[num_flushing_cgs][buffer_size]; __shared__ pair_type contained_output_buffer[num_flushing_cgs][buffer_size]; @@ -493,12 +499,12 @@ __global__ void pair_retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + pair_idx < last)) { - bool active_flag = first + pair_idx < last; + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - pair_type pair = *(first + pair_idx); + pair_type pair = *(first + idx); if constexpr (is_outer) { view.pair_retrieve_outer(active_flushing_cg, probing_cg, @@ -523,7 +529,7 @@ __global__ void pair_retrieve(InputIt first, pair_equal); } } - pair_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } // Final flush of output buffer diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index ddec2e4a2..b3523e74b 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -66,7 +66,7 @@ void static_multimap::insert(InputI InputIt last, cudaStream_t stream) { - auto const num_keys = std::distance(first, 
last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -75,7 +75,7 @@ void static_multimap::insert(InputI auto view = get_device_mutable_view(); detail::insert - <<>>(first, first + num_keys, view); + <<>>(first, num_keys, view); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -88,7 +88,7 @@ template void static_multimap::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -110,7 +110,7 @@ template void static_multimap::contains( InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr is_pair_contains = false; @@ -120,7 +120,7 @@ void static_multimap::contains( auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, key_equal); + <<>>(first, num_keys, output_begin, view, key_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -134,7 +134,7 @@ void static_multimap::pair_contains InputIt first, InputIt last, OutputIt output_begin, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return; } auto constexpr is_pair_contains = true; @@ -144,7 +144,7 @@ void static_multimap::pair_contains auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, pair_equal); + <<>>(first, num_pairs, output_begin, view, pair_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -157,7 +157,7 @@ template std::size_t static_multimap::count( InputIt first, InputIt last, 
cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = false; @@ -171,7 +171,7 @@ std::size_t static_multimap::count( std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -188,7 +188,7 @@ template std::size_t static_multimap::count_outer( InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = true; @@ -202,7 +202,7 @@ std::size_t static_multimap::count_ std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -219,21 +219,21 @@ template std::size_t static_multimap::pair_count( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = false; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); cudaMemsetAsync(d_counter_.get(), 
0, sizeof(atomic_ctr_type), stream); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -250,21 +250,21 @@ template std::size_t static_multimap::pair_count_outer( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = true; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -281,7 +281,7 @@ template OutputIt static_multimap::retrieve( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -312,7 +312,7 @@ OutputIt static_multimap::retrieve( detail::retrieve <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + first, num_keys, 
output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -331,7 +331,7 @@ template OutputIt static_multimap::retrieve_outer( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -362,7 +362,7 @@ OutputIt static_multimap::retrieve_ detail::retrieve <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -387,7 +387,7 @@ static_multimap::pair_retrieve( PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -407,8 +407,13 @@ static_multimap::pair_retrieve( std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -432,7 +437,7 @@ static_multimap::pair_retrieve_oute PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return 
std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -452,8 +457,13 @@ static_multimap::pair_retrieve_oute std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); From db7c06451d144ce8da5ed2025f10334f9445488f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Nov 2022 15:33:03 -0500 Subject: [PATCH 038/152] Remove unused headers --- include/cuco/detail/utils.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 4724e7156..4e6913f7a 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -15,10 +15,7 @@ #pragma once -#include - #include -#include namespace cuco { namespace detail { From e06e6988175c66f49c623bfcb8d9407354c8199b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 13 Nov 2022 13:21:32 -0500 Subject: [PATCH 039/152] Add static assert for random access iterator check --- include/cuco/detail/utils.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 4e6913f7a..14077ccc6 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include namespace cuco { namespace detail { @@ -54,6 +55,9 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm template constexpr inline int64_t distance(Iterator begin, Iterator end) { + using category = typename std::iterator_traits::iterator_category; + static_assert(std::is_base_of_v, + "Input iterator should be a random access iterator."); // 
`int64_t` instead of arch-dependant `long int` return static_cast(std::distance(begin, end)); } From 66168897e2acc8f2cc3cf9a52259ee68e8ad2d10 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Sun, 13 Nov 2022 22:33:00 -0800 Subject: [PATCH 040/152] code cleanup --- include/cuco/detail/dynamic_map.inl | 13 +++++--- include/cuco/detail/dynamic_map_kernels.cuh | 7 +++-- include/cuco/dynamic_map.cuh | 34 ++++++++++----------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 8dcfd89cb..b712a9f41 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -63,6 +63,9 @@ dynamic_map::dynamic_map( alloc_{alloc}, counter_allocator_{alloc} { + CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value."); + submaps_.push_back(std::make_unique>( initial_capacity, sentinel::empty_key{empty_key_sentinel_}, @@ -124,6 +127,10 @@ template void dynamic_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { + // TODO: memset an atomic variable is unsafe + CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + std::size_t num_to_insert = std::distance(first, last); reserve(size_ + num_to_insert, stream); @@ -136,8 +143,6 @@ void dynamic_map::insert( // only if we meet the minimum insert size. 
if (capacity_remaining >= min_insert_size_) { - // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type))); auto n = std::min(capacity_remaining, num_to_insert); @@ -184,10 +189,10 @@ void dynamic_map::erase( auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); // TODO: memset an atomic variable is unsafe - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); + CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); // zero out submap success counters - static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); for (uint32_t i = 0; i < submaps_.size(); ++i) { CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 0eeb1a632..2ae519220 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -305,13 +305,14 @@ __global__ void erase(InputIt first, typedef cub::BlockReduce BlockReduce; extern __shared__ unsigned long long submap_block_num_successes[]; + auto block = cg::this_thread_block(); auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; + auto tid = block_size * block.group_index().x + block.thread_rank(); auto it = first + tid / tile_size; for (int i = threadIdx.x; i < num_submaps; i += block_size) submap_block_num_successes[i] = 0; - __syncthreads(); + block.sync(); while (it < last) { auto erased = false; @@ -323,7 +324,7 @@ __global__ void erase(InputIt first, if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } it += (gridDim.x * blockDim.x) / tile_size; } - __syncthreads(); + block.sync(); for (int i = 0; i 
< num_submaps; ++i) { if (threadIdx.x == 0) { diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 3386208aa..1c47ab2c1 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -150,7 +150,11 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, +<<<<<<< HEAD cudaStream_t stream = 0); +======= + cudaStream_t stream = nullptr); +>>>>>>> code cleanup /** * @brief Construct a dynamically-sized map with erase capability. @@ -181,7 +185,7 @@ class dynamic_map { sentinel::empty_value empty_value_sentinel, sentinel::erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); + cudaStream_t stream = nullptr); /** * @brief Destroy the map and frees its contents @@ -197,7 +201,7 @@ class dynamic_map { * @param n The number of key value pairs for which there must be space * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n, cudaStream_t stream = 0); + void reserve(std::size_t n, cudaStream_t stream = nullptr); /** * @brief Inserts all key/value pairs in the range `[first, last)`. @@ -218,11 +222,8 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Erases keys in the range `[first, last)`. 
@@ -256,11 +257,8 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void erase(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -288,9 +286,9 @@ class dynamic_map { void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. @@ -317,9 +315,9 @@ class dynamic_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Gets the current number of elements in the map From 2df247c63bb37fe1f195de697d8dd658af72a9d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Nov 2022 06:35:52 +0000 Subject: [PATCH 041/152] [pre-commit.ci] auto code formatting --- include/cuco/detail/dynamic_map.inl | 2 +- include/cuco/detail/dynamic_map_kernels.cuh | 6 +++--- include/cuco/dynamic_map.cuh | 20 +++++++++++++------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index b712a9f41..ce800653e 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -130,7 +130,7 @@ void dynamic_map::insert( // TODO: memset an atomic variable is unsafe CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type), "sizeof(atomic_ctr_type) 
must be equal to sizeof(std:size_t)."); - + std::size_t num_to_insert = std::distance(first, last); reserve(size_ + num_to_insert, stream); diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 2ae519220..3feadbd34 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -306,9 +306,9 @@ __global__ void erase(InputIt first, extern __shared__ unsigned long long submap_block_num_successes[]; auto block = cg::this_thread_block(); - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * block.group_index().x + block.thread_rank(); - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * block.group_index().x + block.thread_rank(); + auto it = first + tid / tile_size; for (int i = threadIdx.x; i < num_submaps; i += block_size) submap_block_num_successes[i] = 0; diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 1c47ab2c1..874fdeab1 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -185,7 +185,7 @@ class dynamic_map { sentinel::empty_value empty_value_sentinel, sentinel::erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = nullptr); + cudaStream_t stream = nullptr); /** * @brief Destroy the map and frees its contents @@ -222,7 +222,10 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = nullptr); /** @@ -257,7 +260,10 @@ class dynamic_map { template , typename KeyEqual = thrust::equal_to> - void erase(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}, + void erase(InputIt first, + InputIt last, + Hash hash = 
Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = nullptr); /** @@ -286,8 +292,8 @@ class dynamic_map { void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = nullptr); /** @@ -315,8 +321,8 @@ class dynamic_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, cudaStream_t stream = nullptr); /** From af7706d8268050b884b1a5703f6a92f296be62cc Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Mon, 14 Nov 2022 14:19:38 -0800 Subject: [PATCH 042/152] switched typedef to using --- include/cuco/detail/dynamic_map_kernels.cuh | 8 ++++---- include/cuco/dynamic_map.cuh | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 3feadbd34..aefe8c873 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -71,7 +71,7 @@ __global__ void insert(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -154,7 +154,7 @@ __global__ void insert(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -230,7 +230,7 @@ __global__ void erase(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; extern __shared__ unsigned long long submap_block_num_successes[]; auto tid = block_size * blockIdx.x + threadIdx.x; @@ -302,7 +302,7 @@ 
__global__ void erase(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; extern __shared__ unsigned long long submap_block_num_successes[]; auto block = cg::this_thread_block(); diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 874fdeab1..dcfa192c5 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -150,11 +150,7 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, -<<<<<<< HEAD - cudaStream_t stream = 0); -======= cudaStream_t stream = nullptr); ->>>>>>> code cleanup /** * @brief Construct a dynamically-sized map with erase capability. From 54ae2548d7f1bc18af020f0038d94baa976b0d6d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Nov 2022 22:22:26 +0000 Subject: [PATCH 043/152] [pre-commit.ci] auto code formatting --- include/cuco/dynamic_map.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index dcfa192c5..5dbd9c2f7 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -150,7 +150,7 @@ class dynamic_map { sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, - cudaStream_t stream = nullptr); + cudaStream_t stream = nullptr); /** * @brief Construct a dynamically-sized map with erase capability. 
From 593fe127e91df2f9c12197abc2087f94c8cf9ae7 Mon Sep 17 00:00:00 2001 From: Nico Iskos Date: Fri, 18 Nov 2022 11:24:43 -0800 Subject: [PATCH 044/152] responding to PR comments --- benchmarks/hash_table/dynamic_map_bench.cu | 75 +++++++++++++++++++++ include/cuco/detail/dynamic_map.inl | 2 +- include/cuco/detail/dynamic_map_kernels.cuh | 17 +++-- include/cuco/detail/static_map_kernels.cuh | 8 ++- include/cuco/dynamic_map.cuh | 13 +--- 5 files changed, 97 insertions(+), 18 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 079018005..de2317957 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -271,6 +271,81 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->UseManualTime(); BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + 
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); \ No newline at end of file diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index ce800653e..66c130899 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -199,7 +199,7 @@ void dynamic_map::erase( auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); - detail::erase> + detail::erase <<>>(first, first + num_keys, submap_mutable_views_.data().get(), diff --git a/include/cuco/detail/dynamic_map_kernels.cuh 
b/include/cuco/detail/dynamic_map_kernels.cuh index aefe8c873..37bcbc547 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -41,6 +41,7 @@ namespace cg = cooperative_groups; * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to @@ -122,6 +123,7 @@ __global__ void insert(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to @@ -196,14 +198,15 @@ __global__ void insert(InputIt first, * If the key `*(first + i)` exists in the map, its slot is erased and made available for future insertions. * Else, no effect. 
+ * * @tparam block_size The size of the thread block - * @tparam pair_type Type of the pairs contained in the map * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `key_type` * @tparam mutableViewT Type of device view allowing modification of hash map storage * @tparam atomicT Type of atomic storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to @@ -216,7 +219,6 @@ __global__ void insert(InputIt first, * @param key_equal The binary function to compare two keys for equality */ template -#include #include #include #include @@ -26,8 +24,6 @@ #include #include -#include - #include #include #include @@ -116,12 +112,6 @@ class dynamic_map { dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; - template - dynamic_map(std::size_t, T1, T2, Allocator const& = Allocator{}) = delete; - - template - dynamic_map(std::size_t, T1, T2, T3, Allocator const& = Allocator{}) = delete; - dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; @@ -244,6 +234,7 @@ class dynamic_map { * convertible to the map's `value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param hash The unary function to apply to hash each key @@ -274,6 +265,7 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -303,6 +295,7 @@ class dynamic_map { * convertible to the map's `mapped_type` * 
@tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key From 7598e47184e6f66d4163aab61c9cff9f290f8224 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Nov 2022 19:25:58 +0000 Subject: [PATCH 045/152] [pre-commit.ci] auto code formatting --- benchmarks/hash_table/dynamic_map_bench.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index de2317957..420187b08 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -274,7 +274,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -289,7 +289,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -304,7 +304,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -319,7 +319,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, 
int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -334,7 +334,7 @@ BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); - + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) From cd27320a20295b5e979adce03afd2f341018badc Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 23 Nov 2022 18:19:26 +0000 Subject: [PATCH 046/152] Inline sentinel namespace + add missing implicit conversion operators --- include/cuco/sentinel.cuh | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/include/cuco/sentinel.cuh b/include/cuco/sentinel.cuh index 58317d179..e50dc9e2f 100644 --- a/include/cuco/sentinel.cuh +++ b/include/cuco/sentinel.cuh @@ -17,7 +17,7 @@ #pragma once namespace cuco { -namespace sentinel { +inline namespace sentinel { /** * @brief A strong type wrapper used to denote the empty key sentinel. * @@ -31,6 +31,14 @@ struct empty_key { * @param v The empty key sentinel value */ __host__ __device__ explicit constexpr empty_key(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. + * + * @return Sentinel as underlying value type + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + T value; ///< Empty key sentinel }; @@ -47,6 +55,14 @@ struct empty_value { * @param v The empty value sentinel value */ __host__ __device__ explicit constexpr empty_value(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. 
+ * + * @return Sentinel as underlying value type + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + T value; ///< Empty value sentinel }; @@ -63,6 +79,14 @@ struct erased_key { * @param v The erased key sentinel value */ __host__ __device__ explicit constexpr erased_key(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. + * + * @return Sentinel as underlying value type + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + T value; ///< Erased key sentinel }; From 4c476e314b67890f2ea943f60e488e13a195802b Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 2 Dec 2022 15:13:42 +0000 Subject: [PATCH 047/152] Add strong_type base class --- include/cuco/detail/utils.cuh | 24 ++++++++++++++++++++ include/cuco/sentinel.cuh | 42 ++++++++--------------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 5b02cef96..3aadbb848 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -76,5 +76,29 @@ struct slot_is_filled { } }; +/** + * @brief A strong type wrapper. + * + * @tparam T Type of the mapped values + */ +template +struct strong_type { + /** + * @brief Constructs a strong type. + * + * @param v Value to be wrapped as a strong type + */ + __host__ __device__ explicit constexpr strong_type(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. 
+ * + * @return Underlying value + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + + T value; ///< Underlying value +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/sentinel.cuh b/include/cuco/sentinel.cuh index e50dc9e2f..a440e5b2c 100644 --- a/include/cuco/sentinel.cuh +++ b/include/cuco/sentinel.cuh @@ -16,30 +16,24 @@ #pragma once +#include + namespace cuco { inline namespace sentinel { + /** * @brief A strong type wrapper used to denote the empty key sentinel. * * @tparam T Type of the key values */ template -struct empty_key { +struct empty_key : public cuco::detail::strong_type { /** * @brief Constructs an empty key sentinel with the given `v`. * * @param v The empty key sentinel value */ - __host__ __device__ explicit constexpr empty_key(T v) : value{v} {} - - /** - * @brief Implicit conversion operator to the underlying value. - * - * @return Sentinel as underlying value type - */ - __host__ __device__ constexpr operator T() const noexcept { return value; } - - T value; ///< Empty key sentinel + __host__ __device__ explicit constexpr empty_key(T v) : cuco::detail::strong_type(v) {} }; /** @@ -48,22 +42,13 @@ struct empty_key { * @tparam T Type of the mapped values */ template -struct empty_value { +struct empty_value : public cuco::detail::strong_type { /** * @brief Constructs an empty value sentinel with the given `v`. * * @param v The empty value sentinel value */ - __host__ __device__ explicit constexpr empty_value(T v) : value{v} {} - - /** - * @brief Implicit conversion operator to the underlying value. 
- * - * @return Sentinel as underlying value type - */ - __host__ __device__ constexpr operator T() const noexcept { return value; } - - T value; ///< Empty value sentinel + __host__ __device__ explicit constexpr empty_value(T v) : cuco::detail::strong_type(v) {} }; /** @@ -72,22 +57,13 @@ struct empty_value { * @tparam T Type of the key values */ template -struct erased_key { +struct erased_key : public cuco::detail::strong_type { /** * @brief Constructs an erased key sentinel with the given `v`. * * @param v The erased key sentinel value */ - __host__ __device__ explicit constexpr erased_key(T v) : value{v} {} - - /** - * @brief Implicit conversion operator to the underlying value. - * - * @return Sentinel as underlying value type - */ - __host__ __device__ constexpr operator T() const noexcept { return value; } - - T value; ///< Erased key sentinel + __host__ __device__ explicit constexpr erased_key(T v) : cuco::detail::strong_type(v) {} }; } // namespace sentinel From 56cee10a602058f449b83df2cffc9288369f5887 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 3 Dec 2022 14:10:32 -0500 Subject: [PATCH 048/152] Include missing header --- include/cuco/detail/bitwise_compare.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 3038943a0..6598ddbda 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,8 @@ #pragma once +#include + #include #include From 235dacec09dc7521a0a460f74cee5a9fcb1bd770 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 5 Dec 2022 09:07:58 -0500 Subject: [PATCH 049/152] Remove the use of sentinel namespace --- benchmarks/hash_table/dynamic_map_bench.cu | 6 ++---- benchmarks/hash_table/static_map_bench.cu | 10 ++++------ benchmarks/hash_table/static_multimap/count_bench.cu | 2 +- benchmarks/hash_table/static_multimap/insert_bench.cu | 2 +- .../static_multimap/optimal_retrieve_bench.cu | 2 +- .../hash_table/static_multimap/pair_retrieve_bench.cu | 2 +- benchmarks/hash_table/static_multimap/query_bench.cu | 2 +- .../hash_table/static_multimap/retrieve_bench.cu | 2 +- examples/static_map/custom_type_example.cu | 4 +--- examples/static_map/device_view_example.cu | 5 ++--- examples/static_map/host_bulk_example.cu | 5 ++--- examples/static_multimap/host_bulk_example.cu | 4 +--- tests/dynamic_map/unique_sequence_test.cu | 2 +- tests/static_map/custom_type_test.cu | 5 ++--- tests/static_map/duplicate_keys_test.cu | 2 +- tests/static_map/erase_test.cu | 6 ++---- tests/static_map/heterogeneous_lookup_test.cu | 5 ++--- tests/static_map/insert_and_find_test.cu | 4 ++-- tests/static_map/key_sentinel_test.cu | 2 +- tests/static_map/shared_memory_test.cu | 9 +++------ tests/static_map/stream_test.cu | 4 ++-- tests/static_map/unique_sequence_test.cu | 2 +- tests/static_multimap/custom_pair_retrieve_test.cu | 5 ++--- tests/static_multimap/custom_type_test.cu | 9 +++------ tests/static_multimap/heterogeneous_lookup_test.cu | 4 +--- tests/static_multimap/insert_if_test.cu | 4 ++-- tests/static_multimap/multiplicity_test.cu | 4 ++-- tests/static_multimap/non_match_test.cu | 4 ++-- tests/static_multimap/pair_function_test.cu | 5 ++--- 29 files changed, 49 insertions(+), 73 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 90446ea57..5b4009d3b 100644 --- 
a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -86,8 +86,7 @@ static void BM_dynamic_insert(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; { cuda_event_timer raii{state}; for (std::size_t i = 0; i < num_keys; i += batch_size) { @@ -124,8 +123,7 @@ static void BM_dynamic_search_all(::benchmark::State& state) thrust::device_vector> d_pairs(h_pairs); thrust::device_vector d_results(num_keys); - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); for (auto _ : state) { diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index e2b15b05e..c384e649f 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -92,7 +92,7 @@ static void BM_static_map_insert(::benchmark::State& state) thrust::device_vector d_keys(h_keys); for (auto _ : state) { - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -122,7 +122,7 @@ static void BM_static_map_search_all(::benchmark::State& state) float occupancy = state.range(1) / float{100}; std::size_t size = num_keys / occupancy; - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; std::vector h_keys(num_keys); std::vector h_values(num_keys); @@ -165,10 +165,8 @@ static void BM_static_map_erase_all(::benchmark::State& state) std::size_t size = num_keys / occupancy; // static map with erase support 
- map_type map{size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + map_type map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; std::vector h_keys(num_keys); std::vector h_values(num_keys); diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 0659fe742..564a4c2dd 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -58,7 +58,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_c state.add_element_count(num_keys, "NumKeys"); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index 17f8723df..80ff314b3 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -56,7 +56,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_i state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; // Use timers to explicitly mark the target region timer.start(); diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu index a4a202161..78f134158 100644 --- a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu @@ -78,7 +78,7 @@ 
std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( cuco::double_hashing, cuco::detail::MurmurHash3_32>> - map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); diff --git a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu index b341fce76..4bde01c44 100644 --- a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu @@ -71,7 +71,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_p auto const pair_begin = d_pairs.begin(); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(pair_begin, pair_begin + num_input); generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 91c3ca645..3eff33a35 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -58,7 +58,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_q state.add_element_count(num_keys, "NumKeys"); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index d92f3528e..128bcb03d 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu 
+++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -58,7 +58,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_r state.add_element_count(num_keys, "NumKeys"); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu index efc04e0c8..e150a858e 100644 --- a/examples/static_map/custom_type_example.cu +++ b/examples/static_map/custom_type_example.cu @@ -93,9 +93,7 @@ int main(void) // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. cuco::static_map map{ - 100'000, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + 100'000, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Inserts 80,000 pairs into the map by using the custom hasher and custom equality callable map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); diff --git a/examples/static_map/device_view_example.cu b/examples/static_map/device_view_example.cu index a65e12162..f3414e3ff 100644 --- a/examples/static_map/device_view_example.cu +++ b/examples/static_map/device_view_example.cu @@ -135,9 +135,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. 
- cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel auto device_insert_view = map.get_device_mutable_view(); diff --git a/examples/static_map/host_bulk_example.cu b/examples/static_map/host_bulk_example.cu index d682442fb..746857511 100644 --- a/examples/static_map/host_bulk_example.cu +++ b/examples/static_map/host_bulk_example.cu @@ -54,9 +54,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Create a sequence of keys and values {{0,0}, {1,1}, ... {i,i}} thrust::device_vector insert_keys(num_keys); diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index 149abd112..984a05387 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -38,9 +38,7 @@ int main(void) // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, // for an load factor of 50%. 
cuco::static_multimap map{ - N * 2, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + N * 2, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; thrust::device_vector> pairs(N); diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index de26bb3dc..d97bac0a0 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -39,7 +39,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { constexpr std::size_t num_keys{50'000'000}; cuco::dynamic_map map{ - 30'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index e587613d4..062df6c42 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -113,9 +113,8 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; thrust::device_vector insert_keys(num); thrust::device_vector insert_values(num); diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 34a315a1c..1815c52e4 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -39,7 +39,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_keys * 2, cuco::empty_key{-1}, 
cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index b5641539c..1b60406a5 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -33,10 +33,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) constexpr std::size_t num_keys = 1'000'000; constexpr std::size_t capacity = 1'100'000; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::static_map map{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index 766fa9e1f..4a5088891 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -96,9 +96,8 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; auto insert_pairs = thrust::make_transform_iterator( thrust::counting_iterator(0), diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index ec3339c4f..fbd66c3e9 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -59,8 +59,8 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update", (int64_t, int32_t), (int64_t, int64_t)) { - cuco::sentinel::empty_key empty_key_sentinel{-1}; - cuco::sentinel::empty_value empty_value_sentinel{-1}; + cuco::empty_key empty_key_sentinel{-1}; + cuco::empty_value 
empty_value_sentinel{-1}; cuco::static_map m(10 * Iters, empty_key_sentinel, empty_value_sentinel); static constexpr int Blocks = 1024; diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index e52c1405e..f74990367 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -40,7 +40,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{SIZE}; cuco::static_map map{ - SIZE * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + SIZE * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 67ae88d88..9920b7b1d 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -95,7 +95,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", std::vector> maps; for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { maps.push_back(std::make_unique( - map_capacity, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1})); + map_capacity, cuco::empty_key{-1}, cuco::empty_value{-1})); } thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); @@ -161,11 +161,8 @@ __global__ void shared_memory_hash_table_kernel(bool* key_found) using map_type = typename cuco::static_map::device_mutable_view; using find_map_type = typename cuco::static_map::device_view; __shared__ typename map_type::slot_type slots[N]; - auto map = map_type::make_from_uninitialized_slots(cg::this_thread_block(), - &slots[0], - N, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}); + auto map = map_type::make_from_uninitialized_slots( + cg::this_thread_block(), &slots[0], N, cuco::empty_key{-1}, cuco::empty_value{-1}); auto g = cg::this_thread_block(); std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/tests/static_map/stream_test.cu 
b/tests/static_map/stream_test.cu index 5f816410e..cb0358c2b 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -42,8 +42,8 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", constexpr std::size_t num_keys{500'000}; cuco::static_map map{1'000'000, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, + cuco::empty_key{-1}, + cuco::empty_value{-1}, cuco::cuda_allocator{}, stream}; diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 75bb67d61..94ca65c6b 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -40,7 +40,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - 1'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 1'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 5d0329382..58887ba9f 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -202,13 +202,12 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_shmem_pair_retrieve(map, num_pairs); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_shmem_pair_retrieve(map, num_pairs); } } diff --git a/tests/static_multimap/custom_type_test.cu 
b/tests/static_multimap/custom_type_test.cu index 40bdbe8ba..2e565ede2 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -234,15 +234,12 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, hash_key_pair>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; test_custom_key_value_type(map, num_pairs); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_multimap map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; test_custom_key_value_type(map, num_pairs); } } diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index dca3de826..57cda8c44 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -101,9 +101,7 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, custom_hasher>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; auto insert_pairs = thrust::make_transform_iterator( thrust::counting_iterator(0), diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 506563502..8ff7344a6 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -73,12 +73,12 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - 
map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } } diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 3f5581b03..e34593438 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -167,12 +167,12 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_multiplicity_two(map, num_items); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { cuco::static_multimap map{ - 5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 5, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_multiplicity_two(map, num_items); } } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index ef0042012..ea8f24899 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -145,12 +145,12 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { 
cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index c5442533b..e4b7b5b1f 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -138,13 +138,12 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_pair_functions(map, d_pairs.begin(), num_pairs); } if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_pair_functions(map, d_pairs.begin(), num_pairs); } } From 17ffbdfcc2a9c136eda3b796d7236ff585048bc4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 5 Dec 2022 09:16:39 -0500 Subject: [PATCH 050/152] Update godbolt examples --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a5283b82a..293fe31ab 100644 --- a/README.md +++ b/README.md @@ -186,16 +186,16 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ervPzqh64)) -- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/qMWrfE6ET)) -- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/oGfYjzMGT)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/T49P85Mnd)) +- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/dh8bMn3G1)) +- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/7djKevK6e)) ### `static_multimap` `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Po4eTEn1a)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/PrbqG6ae4)) ### `dynamic_map` From c1ceb1a870930adb632790f99803795a222b1db8 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 8 Dec 2022 02:50:39 +0000 Subject: [PATCH 051/152] Add inline qualifier to primes array --- include/cuco/detail/prime.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 93ddde1a0..600e46a4d 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -21,7 +21,7 @@ namespace cuco { namespace detail { -constexpr std::array primes = { +inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, 89, 97, 103, 109, 127, 137, 149, From 378b78944792645710ebd610d04ef78f64bcc231 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 8 Dec 2022 02:57:58 +0000 Subject: [PATCH 052/152] Add missing includes --- include/cuco/detail/prime.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 600e46a4d..234c718ee 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -18,6 +18,10 @@ #include +#include +#include +#include + namespace cuco { namespace detail { From e8ce5de8224d96e3067c5a02f60188c8a1985c9b Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 8 Dec 2022 03:00:11 +0000 Subject: [PATCH 053/152] Remove unused functions in prime.hpp --- include/cuco/detail/prime.hpp | 37 ----------------------------------- 
1 file changed, 37 deletions(-) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 234c718ee..b4ff147a1 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -20133,43 +20133,6 @@ inline constexpr std::array primes = { 17176447243, 17176578343, 17176709449, 17176840529, 17176971601, 17177102693, 17177233783, 17177364857, 17177495953, 17177627053, 17177758133}; -/** - * @brief Indicates whether the input `num` is a prime number. - * - * @param num - * @return A boolean indicating whether the input `num` is a prime number - */ -constexpr bool is_prime(std::size_t num) noexcept -{ - bool flag = true; - // 0 and 1 are not prime numbers - if (num == 0lu || num == 1lu) { - flag = false; - } else { - for (auto i = 2lu; i <= num / 2lu; ++i) { - if (num % i == 0) { - flag = false; - break; - } - } - } - return flag; -} - -/** - * @brief Computes the smallest prime number greater than or equal to `num`. - * - * @param num - * @return The smallest prime number greater than or equal to `num` - */ -constexpr std::size_t compute_prime(std::size_t num) noexcept -{ - while (not is_prime(num)) { - num++; - } - return num; -} - /** * @brief Calculates the valid capacity based on `cg_size` , `vector_width` * and the initial `capacity`. 
From d46eb9374e3766761d2b56073c166033242a3519 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Dec 2022 11:37:33 -0500 Subject: [PATCH 054/152] Move hash function to public cuco namespace --- include/cuco/dynamic_map.cuh | 7 ++++--- include/cuco/hash_functions.cuh | 34 ++++++++++++++++++++++++++++++++ include/cuco/static_map.cuh | 34 ++++++++++++++++---------------- include/cuco/static_multimap.cuh | 3 ++- 4 files changed, 57 insertions(+), 21 deletions(-) create mode 100644 include/cuco/hash_functions.cuh diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index a75512d3c..c9df0fc13 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -170,7 +171,7 @@ class dynamic_map { * @param key_equal The binary function to compare two keys for equality */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); @@ -194,7 +195,7 @@ class dynamic_map { */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -221,7 +222,7 @@ class dynamic_map { */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh new file mode 100644 index 000000000..40490c18d --- /dev/null +++ b/include/cuco/hash_functions.cuh @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { + +using hash_value_type = uint32_t; + +/** + * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_32 : public detail::MurmurHash3_32 { +}; + +} // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 4ca5755d7..615ce7843 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -19,9 +19,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -251,7 +251,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -285,7 +285,7 @@ class static_map { template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void insert_if(InputIt first, InputIt last, @@ -323,7 +323,7 @@ class static_map { * provided at construction */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -352,7 +352,7 @@ class static_map { */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -407,7 +407,7 @@ class static_map { */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, @@ -930,7 +930,7 @@ 
class static_map { * equality * @return `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool insert(value_type const& insert_pair, Hash hash = Hash{}, @@ -961,7 +961,7 @@ class static_map { * @return a pair consisting of an iterator to the element and a bool, * either `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ thrust::pair insert_and_find( value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -986,7 +986,7 @@ class static_map { * @return `true` if the insert was successful, `false` otherwise. */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ bool insert(CG const& g, value_type const& insert_pair, @@ -1007,7 +1007,7 @@ class static_map { * equality * @return `true` if the erasure was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool erase(key_type const& k, Hash hash = Hash{}, @@ -1030,7 +1030,7 @@ class static_map { * @return `true` if the erasure was successful, `false` otherwise. 
*/ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ bool erase(CG const& g, key_type const& k, @@ -1198,7 +1198,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ iterator find(Key const& k, Hash hash = Hash{}, @@ -1218,7 +1218,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ const_iterator find(Key const& k, Hash hash = Hash{}, @@ -1245,7 +1245,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1271,7 +1271,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ const_iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1300,7 +1300,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ bool contains(ProbeKey const& k, Hash hash = Hash{}, @@ -1335,7 +1335,7 @@ class static_map { */ template , + typename Hash = cuco::MurmurHash3_32, typename KeyEqual = thrust::equal_to> __device__ std::enable_if_t, bool> contains( CG const& g, diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index ef43b2175..0c29b41c6 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -131,7 +132,7 @@ template , class ProbeSequence = - 
cuco::double_hashing<8, detail::MurmurHash3_32, detail::MurmurHash3_32>> + cuco::double_hashing<8, cuco::MurmurHash3_32, cuco::MurmurHash3_32>> class static_multimap { static_assert( cuco::is_bitwise_comparable_v, From e184630a071d8109697551a6c563fd3003b026c4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Dec 2022 11:39:04 -0500 Subject: [PATCH 055/152] Cleanups --- include/cuco/detail/hash_functions.cuh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/hash_functions.cuh b/include/cuco/detail/hash_functions.cuh index 7be6cab20..3c3f7403a 100644 --- a/include/cuco/detail/hash_functions.cuh +++ b/include/cuco/detail/hash_functions.cuh @@ -16,11 +16,7 @@ #pragma once -namespace cuco { - -using hash_value_type = uint32_t; - -namespace detail { +namespace cuco::detail { /** * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. @@ -118,5 +114,4 @@ struct MurmurHash3_32 { uint32_t m_seed; }; -} // namespace detail -} // namespace cuco +} // namespace cuco::detail From b66b517c5ad26822fb778dccd75abbd22bada039 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 8 Dec 2022 17:01:03 +0000 Subject: [PATCH 056/152] Consistent integer types --- include/cuco/detail/prime.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index b4ff147a1..9e51bdf5a 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -20,12 +20,12 @@ #include #include -#include +#include namespace cuco { namespace detail { -inline constexpr std::array primes = { +inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, 89, 97, 103, 109, 127, 137, 149, @@ -20144,8 +20144,8 @@ inline constexpr std::array primes = { * @param capacity The initially requested capacity * @return A valid capacity no smaller than the requested 
`capacity` */ -template -constexpr std::size_t get_valid_capacity(std::size_t capacity) noexcept +template +constexpr T get_valid_capacity(T capacity) noexcept { auto const stride = [&]() { if constexpr (uses_vector_load) { return cg_size * vector_width; } From 8001a236411324da7c4f863403bc73c1bf769a6f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 8 Dec 2022 17:02:09 +0000 Subject: [PATCH 057/152] Copyright year --- include/cuco/detail/prime.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 9e51bdf5a..9837d356d 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 1805c08eb08209a0c660236da5022feccdca40ac Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Dec 2022 12:48:02 -0500 Subject: [PATCH 058/152] Renaming + update tests and benchmarks --- .../static_multimap/optimal_retrieve_bench.cu | 13 ++++---- include/cuco/dynamic_map.cuh | 6 ++-- include/cuco/hash_functions.cuh | 5 ++- include/cuco/static_map.cuh | 32 +++++++++---------- include/cuco/static_multimap.cuh | 2 +- tests/static_map/key_sentinel_test.cu | 8 ++--- tests/static_map/stream_test.cu | 2 +- .../custom_pair_retrieve_test.cu | 2 +- tests/static_multimap/insert_if_test.cu | 2 +- tests/static_multimap/multiplicity_test.cu | 2 +- tests/static_multimap/non_match_test.cu | 2 +- tests/static_multimap/pair_function_test.cu | 2 +- 12 files changed, 37 insertions(+), 41 deletions(-) diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu index 78f134158..bca2effa6 100644 --- 
a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu @@ -71,13 +71,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( thrust::device_vector d_keys(h_keys); thrust::device_vector> d_pairs(h_pairs); - cuco::static_multimap, - cuco::double_hashing, - cuco::detail::MurmurHash3_32>> + cuco::static_multimap< + Key, + Value, + cuda::thread_scope_device, + cuco::cuda_allocator, + cuco::double_hashing, cuco::murmurhash3_32>> map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index c9df0fc13..35a3cdf4a 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -171,7 +171,7 @@ class dynamic_map { * @param key_equal The binary function to compare two keys for equality */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); @@ -195,7 +195,7 @@ class dynamic_map { */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -222,7 +222,7 @@ class dynamic_map { */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh index 40490c18d..fd8a73ac7 100644 --- a/include/cuco/hash_functions.cuh +++ b/include/cuco/hash_functions.cuh @@ -23,12 +23,11 @@ namespace cuco { using hash_value_type = uint32_t; /** - * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. + * @brief A `murmurhash3_32` hash function to hash the given argument on host and device. 
* * @tparam Key The type of the values to hash */ template -struct MurmurHash3_32 : public detail::MurmurHash3_32 { -}; +using murmurhash3_32 = detail::MurmurHash3_32; } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 615ce7843..7674d5b20 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -251,7 +251,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -285,7 +285,7 @@ class static_map { template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void insert_if(InputIt first, InputIt last, @@ -323,7 +323,7 @@ class static_map { * provided at construction */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -352,7 +352,7 @@ class static_map { */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -407,7 +407,7 @@ class static_map { */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, @@ -930,7 +930,7 @@ class static_map { * equality * @return `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool insert(value_type const& insert_pair, Hash hash = Hash{}, @@ -961,7 +961,7 @@ class static_map { * @return a pair consisting of an iterator to the element and a bool, * either `true` if the insert was successful, `false` otherwise. 
*/ - template , + template , typename KeyEqual = thrust::equal_to> __device__ thrust::pair insert_and_find( value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -986,7 +986,7 @@ class static_map { * @return `true` if the insert was successful, `false` otherwise. */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ bool insert(CG const& g, value_type const& insert_pair, @@ -1007,7 +1007,7 @@ class static_map { * equality * @return `true` if the erasure was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool erase(key_type const& k, Hash hash = Hash{}, @@ -1030,7 +1030,7 @@ class static_map { * @return `true` if the erasure was successful, `false` otherwise. */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ bool erase(CG const& g, key_type const& k, @@ -1198,7 +1198,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ iterator find(Key const& k, Hash hash = Hash{}, @@ -1218,7 +1218,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ const_iterator find(Key const& k, Hash hash = Hash{}, @@ -1245,7 +1245,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1271,7 +1271,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ const_iterator find(CG g, Key const& k, Hash hash = 
Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1300,7 +1300,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ bool contains(ProbeKey const& k, Hash hash = Hash{}, @@ -1335,7 +1335,7 @@ class static_map { */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> __device__ std::enable_if_t, bool> contains( CG const& g, diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 0c29b41c6..27cb951ba 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -132,7 +132,7 @@ template , class ProbeSequence = - cuco::double_hashing<8, cuco::MurmurHash3_32, cuco::MurmurHash3_32>> + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>> class static_multimap { static_assert( cuco::is_bitwise_comparable_v, diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index f74990367..eeac192b9 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -62,17 +62,15 @@ TEMPLATE_TEST_CASE_SIG( pairs_begin, pairs_begin + num_keys, [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); + return m_view.insert(pair, cuco::murmurhash3_32{}, custom_equals{}); })); } SECTION( "Tests of CG insert: The custom `key_equal` can never be used to compare against sentinel") { - map.insert(pairs_begin, - pairs_begin + num_keys, - cuco::detail::MurmurHash3_32{}, - custom_equals{}); + map.insert( + pairs_begin, pairs_begin + num_keys, cuco::murmurhash3_32{}, custom_equals{}); // All keys inserted via custom `key_equal` should be found REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index cb0358c2b..1f1fe9b1c 100644 --- 
a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -57,7 +57,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", thrust::make_counting_iterator(0), [] __device__(auto i) { return cuco::pair_type(i, i); }); - auto hash_fn = cuco::detail::MurmurHash3_32{}; + auto hash_fn = cuco::murmurhash3_32{}; auto equal_fn = thrust::equal_to{}; // bulk function test cases diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 58887ba9f..feb13dc97 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -201,7 +201,7 @@ TEMPLATE_TEST_CASE_SIG( Value, cuda::thread_scope_device, cuco::cuda_allocator, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + cuco::linear_probing<1, cuco::murmurhash3_32>> map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_shmem_pair_retrieve(map, num_pairs); } diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 8ff7344a6..2a6f3099d 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -72,7 +72,7 @@ TEMPLATE_TEST_CASE_SIG( Value, cuda::thread_scope_device, cuco::cuda_allocator, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + cuco::linear_probing<1, cuco::murmurhash3_32>> map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index e34593438..ae26da5d1 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -166,7 +166,7 @@ TEMPLATE_TEST_CASE_SIG( Value, cuda::thread_scope_device, cuco::cuda_allocator, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + cuco::linear_probing<1, cuco::murmurhash3_32>> map{5, 
cuco::empty_key{-1}, cuco::empty_value{-1}}; test_multiplicity_two(map, num_items); } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index ea8f24899..7232b2832 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -144,7 +144,7 @@ TEMPLATE_TEST_CASE_SIG( Value, cuda::thread_scope_device, cuco::cuda_allocator, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + cuco::linear_probing<1, cuco::murmurhash3_32>> map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index e4b7b5b1f..ae1bb8ae2 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -137,7 +137,7 @@ TEMPLATE_TEST_CASE_SIG( Value, cuda::thread_scope_device, cuco::cuda_allocator, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> + cuco::linear_probing<1, cuco::murmurhash3_32>> map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_pair_functions(map, d_pairs.begin(), num_pairs); } From 6ff1b1319c4af28e33c61aa3abd2d323d61483cf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 8 Dec 2022 12:51:21 -0500 Subject: [PATCH 059/152] Remove unused type alias --- include/cuco/hash_functions.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh index fd8a73ac7..365958d64 100644 --- a/include/cuco/hash_functions.cuh +++ b/include/cuco/hash_functions.cuh @@ -20,8 +20,6 @@ namespace cuco { -using hash_value_type = uint32_t; - /** * @brief A `murmurhash3_32` hash function to hash the given argument on host and device. 
* From 8ba86f48a2e22ac1c309156688b5bdac4f51e64d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 9 Dec 2022 18:43:23 +0000 Subject: [PATCH 060/152] Fix prime array length --- include/cuco/detail/prime.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 9837d356d..2bb448eeb 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -25,7 +25,7 @@ namespace cuco { namespace detail { -inline constexpr std::array primes = { +inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, 89, 97, 103, 109, 127, 137, 149, From 8a14afa39a4f91bd880b84331f4ab19346472a42 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 12 Dec 2022 20:30:28 +0530 Subject: [PATCH 061/152] Fix memory leak in static_map::retrieve_all deallocate temporary memory in cuco::static_map::retrieve_all --- include/cuco/detail/static_map.inl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 8e27ca0d3..837c79046 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -251,7 +251,9 @@ std::pair static_map::retrieve_a CUCO_CUDA_TRY( cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - + temp_allocator_type::deallocate(temp_allocator, d_num_out, sizeof(std::size_t)); + temp_allocator_type::deallocate(temp_allocator, d_temp_storage, temp_storage_bytes); + return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } From 01ac7ccac36627a18f4d7d6602ce4052e994297a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Dec 2022 15:15:16 +0000 Subject: [PATCH 062/152] [pre-commit.ci] 
auto code formatting --- include/cuco/detail/static_map.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 837c79046..947c9e96b 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -253,7 +253,7 @@ std::pair static_map::retrieve_a CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); temp_allocator_type::deallocate(temp_allocator, d_num_out, sizeof(std::size_t)); temp_allocator_type::deallocate(temp_allocator, d_temp_storage, temp_storage_bytes); - + return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } From 0c376c4f2f78f55920813fe879130404a7068ece Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 12 Dec 2022 20:46:08 +0530 Subject: [PATCH 063/152] fix size_t* to char* pointer for deallocation --- include/cuco/detail/static_map.inl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 947c9e96b..8655cf35c 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -251,9 +251,11 @@ std::pair static_map::retrieve_a CUCO_CUDA_TRY( cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - temp_allocator_type::deallocate(temp_allocator, d_num_out, sizeof(std::size_t)); - temp_allocator_type::deallocate(temp_allocator, d_temp_storage, temp_storage_bytes); - + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_num_out), sizeof(std::size_t)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); + return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } From 72a0be41d58c779819d08058a04b587660114fa2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Dec 2022 15:16:18 +0000 Subject: [PATCH 064/152] [pre-commit.ci] auto code formatting --- include/cuco/detail/static_map.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 8655cf35c..ee794e1c2 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -255,7 +255,7 @@ std::pair static_map::retrieve_a temp_allocator, reinterpret_cast(d_num_out), sizeof(std::size_t)); std::allocator_traits::deallocate( temp_allocator, d_temp_storage, temp_storage_bytes); - + return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } From a86889493d59e9914e1adbd86bf4b7be8a06123d Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:37:20 +0000 Subject: [PATCH 065/152] Add TODO regarding CTAD array declaration --- include/cuco/detail/prime.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 2bb448eeb..1180035ae 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -25,6 +25,7 @@ namespace cuco { namespace detail { +// TODO use CTAD instead of explicitly specifying the array size once we drop support for nvcc <11.5 inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, From 1e6ad99de0bd52f501e70cc084e74b013abf98fb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 11:18:31 -0500 Subject: [PATCH 066/152] Add more data types for erase tests --- tests/dynamic_map/erase_test.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 0e53197ea..25033ff14 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -23,11 +23,14 @@ #include 
-TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) +TEMPLATE_TEST_CASE_SIG("erase key", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) { - using Key = T; - using Value = T; - unsigned long num_keys = 1'000'000; cuco::dynamic_map map{num_keys * 2, cuco::sentinel::empty_key{-1}, @@ -135,4 +138,4 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t)) REQUIRE(map.get_size() == 0); } -} \ No newline at end of file +} From 9e324fcafcf1a92b329683f12f56106b6b451028 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 11:19:23 -0500 Subject: [PATCH 067/152] Use public murmurhash --- include/cuco/dynamic_map.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index a677fdc83..794e67f2e 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -209,7 +209,7 @@ class dynamic_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::murmurhash3_32, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, From bb0e4e9c3cdf6a22ade08ed55c8475408fe5aba2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 11:48:40 -0500 Subject: [PATCH 068/152] Update static map benchmark: fix runtime stall bug, remove redundant comments and add erase_none and search_none benchmarks --- benchmarks/hash_table/static_map_bench.cu | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 57d17e222..1538a636a 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -164,8 +164,7 @@ static void BM_static_map_search_none(::benchmark::State& state) float occupancy = state.range(1) / float{100}; std::size_t size = 
num_keys / occupancy; - map_type map{size, -1, -1}; - auto view = map.get_device_mutable_view(); + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; std::vector h_keys(num_keys); std::vector h_values(num_keys); @@ -174,7 +173,7 @@ static void BM_static_map_search_none(::benchmark::State& state) generate_keys(h_keys.begin(), h_keys.end()); - for (auto i = 0; i < num_keys; ++i) { + for (std::size_t i = 0; i < num_keys; ++i) { Key key = h_keys[i]; Value val = h_keys[i]; h_pairs[i].first = key; @@ -182,8 +181,9 @@ static void BM_static_map_search_none(::benchmark::State& state) } // diff keys - for (int i = 0; i < num_keys; ++i) + for (std::size_t i = 0; i < num_keys; ++i) { h_keys[i] += num_keys; + } thrust::device_vector d_keys(h_keys); thrust::device_vector d_results(num_keys); @@ -193,6 +193,9 @@ static void BM_static_map_search_none(::benchmark::State& state) for (auto _ : state) { map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + // TODO: get rid of sync and rewrite the benchmark with `nvbench` + // once https://github.com/NVIDIA/nvbench/pull/80 is merged + cudaDeviceSynchronize(); } state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * @@ -251,8 +254,7 @@ static void BM_static_map_erase_none(::benchmark::State& state) float occupancy = state.range(1) / float{100}; std::size_t size = num_keys / occupancy; - map_type map{size, -1, -1}; - auto view = map.get_device_mutable_view(); + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; std::vector h_keys(num_keys); std::vector h_values(num_keys); @@ -261,7 +263,7 @@ static void BM_static_map_erase_none(::benchmark::State& state) generate_keys(h_keys.begin(), h_keys.end()); - for (auto i = 0; i < num_keys; ++i) { + for (std::size_t i = 0; i < num_keys; ++i) { Key key = h_keys[i]; Value val = h_keys[i]; h_pairs[i].first = key; @@ -269,22 +271,20 @@ static void BM_static_map_erase_none(::benchmark::State& state) } // diff keys - 
for (int i = 0; i < num_keys; ++i) + for (std::size_t i = 0; i < num_keys; ++i) { h_keys[i] += num_keys; + } thrust::device_vector d_keys(h_keys); thrust::device_vector d_results(num_keys); thrust::device_vector> d_pairs(h_pairs); for (auto _ : state) { - // state.ResumeTiming(); state.PauseTiming(); map.insert(d_pairs.begin(), d_pairs.end()); state.ResumeTiming(); map.erase(d_keys.begin(), d_keys.end()); - - // state.PauseTiming(); } state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * @@ -345,6 +345,15 @@ BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSI ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) +// TODO: comprehensive tests for erase_all, erase_none and search_none +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); From 10fd08a658e3019de8cff0a002f1e89ef59980a2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 12:03:39 -0500 Subject: [PATCH 069/152] Update dynamic map benchmark: fix conversion warning, add search_none and erase_none benchmarks and get rid of sentinel namespace --- benchmarks/hash_table/dynamic_map_bench.cu | 34 ++++++++++++++-------- tests/dynamic_map/erase_test.cu | 6 ++-- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index c31dde8ad..8fbb804de 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ 
b/benchmarks/hash_table/dynamic_map_bench.cu @@ -148,7 +148,7 @@ static void BM_dynamic_search_none(::benchmark::State& state) generate_keys(h_keys.begin(), h_keys.end()); - for (auto i = 0; i < num_keys; ++i) { + for (std::size_t i = 0; i < num_keys; ++i) { Key key = h_keys[i] + num_keys; Value val = h_keys[i] + num_keys; h_pairs[i].first = key; @@ -159,8 +159,7 @@ static void BM_dynamic_search_none(::benchmark::State& state) thrust::device_vector> d_pairs(h_pairs); thrust::device_vector d_results(num_keys); - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(d_pairs.begin(), d_pairs.end()); for (auto _ : state) { @@ -198,9 +197,9 @@ static void BM_dynamic_erase_all(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { map_type map{initial_size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; for (uint32_t i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); } @@ -229,7 +228,7 @@ static void BM_dynamic_erase_none(::benchmark::State& state) generate_keys(h_keys.begin(), h_keys.end()); - for (auto i = 0; i < num_keys; ++i) { + for (std::size_t i = 0; i < num_keys; ++i) { Key key = h_keys[i] + num_keys; Value val = h_keys[i] + num_keys; h_pairs[i].first = key; @@ -242,10 +241,10 @@ static void BM_dynamic_erase_none(::benchmark::State& state) std::size_t batch_size = 1E6; for (auto _ : state) { map_type map{initial_size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; - for (auto i = 0; i < num_keys; i += batch_size) { + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + for (std::size_t i = 0; i < num_keys; i += batch_size) { map.insert(d_pairs.begin() + i, 
d_pairs.begin() + i + batch_size); } { @@ -346,4 +345,15 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) - ->UseManualTime(); \ No newline at end of file + ->UseManualTime(); + +// TODO: comprehensive tests for erase_none and search_none? +BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 25033ff14..f25caed30 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -33,9 +33,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", { unsigned long num_keys = 1'000'000; cuco::dynamic_map map{num_keys * 2, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); From 788ad29ec8c92446666dfca9b8c14c7b9c431171 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 13:37:35 -0500 Subject: [PATCH 070/152] Cleanups: get rid of host-side counter vector, remove get_ prefixes and async instructions when possible --- include/cuco/detail/dynamic_map.inl | 36 ++++++++++++++--------------- include/cuco/dynamic_map.cuh | 6 ++--- include/cuco/static_map.cuh | 4 ++-- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 66c130899..08aa4dd1c 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -41,8 +41,7 @@ 
dynamic_map::dynamic_map( stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - d_submap_num_successes_ = submap_num_successes_; + submap_num_successes_.push_back(submaps_[0]->num_successes()); } template @@ -75,8 +74,7 @@ dynamic_map::dynamic_map( stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->get_num_successes()); - d_submap_num_successes_ = submap_num_successes_; + submap_num_successes_.push_back(submaps_[0]->num_successes()); } template @@ -110,8 +108,7 @@ void dynamic_map::reserve(std::size_t n, cudaStrea alloc_, stream)); } - submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes()); - d_submap_num_successes_ = submap_num_successes_; + submap_num_successes_.push_back(submaps_[submap_idx]->num_successes()); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); capacity_ *= 2; @@ -143,7 +140,8 @@ void dynamic_map::insert( // only if we meet the minimum insert size. 
if (capacity_remaining >= min_insert_size_) { - CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type))); + CUCO_CUDA_TRY( + cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream)); auto n = std::min(capacity_remaining, num_to_insert); auto const block_size = 128; @@ -156,17 +154,18 @@ void dynamic_map::insert( first + n, submap_views_.data().get(), submap_mutable_views_.data().get(), - d_submap_num_successes_.data().get(), + submap_num_successes_.data().get(), submap_idx, submaps_.size(), hash, key_equal); std::size_t h_num_successes; - CUCO_CUDA_TRY(cudaMemcpy(&h_num_successes, - submap_num_successes_[submap_idx], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes, + submap_num_successes_[submap_idx], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; @@ -194,7 +193,7 @@ void dynamic_map::erase( // zero out submap success counters for (uint32_t i = 0; i < submaps_.size(); ++i) { - CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type))); + CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream)); } auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); @@ -203,17 +202,18 @@ void dynamic_map::erase( <<>>(first, first + num_keys, submap_mutable_views_.data().get(), - d_submap_num_successes_.data().get(), + submap_num_successes_.data().get(), submaps_.size(), hash, key_equal); for (uint32_t i = 0; i < submaps_.size(); ++i) { std::size_t h_submap_num_successes; - CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes, - submap_num_successes_[i], - sizeof(atomic_ctr_type), - cudaMemcpyDeviceToHost)); + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); submaps_[i]->size_ -= 
h_submap_num_successes; size_ -= h_submap_num_successes; } diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 794e67f2e..c0c76bde0 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -354,11 +354,9 @@ class dynamic_map { thrust::device_vector submap_mutable_views_; ///< vector of mutable device views for each submap std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - std::vector - submap_num_successes_; ///< number of succesfully erased keys for each submap thrust::device_vector - d_submap_num_successes_; ///< device-side number of successfully erased keys for each submap - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + submap_num_successes_; ///< Number of successfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 89af45d41..316a5d77d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1420,10 +1420,10 @@ class static_map { * * @return Number of successfully inserted/erased keys from the last insert/erase operation */ - atomic_ctr_type* get_num_successes() const noexcept { return num_successes_; } + atomic_ctr_type* num_successes() const noexcept { return num_successes_; } private: - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots std::size_t size_{}; ///< Number of keys in map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot From c71dd607684ec7b37457c97ff2d54736b6923983 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 13:50:19 -0500 Subject: [PATCH 071/152] Get rid of 
num_successes getter --- include/cuco/detail/dynamic_map.inl | 6 +++--- include/cuco/static_map.cuh | 8 -------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 08aa4dd1c..c50d5e3a5 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -41,7 +41,7 @@ dynamic_map::dynamic_map( stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->num_successes()); + submap_num_successes_.push_back(submaps_[0]->num_successes_); } template @@ -74,7 +74,7 @@ dynamic_map::dynamic_map( stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - submap_num_successes_.push_back(submaps_[0]->num_successes()); + submap_num_successes_.push_back(submaps_[0]->num_successes_); } template @@ -108,7 +108,7 @@ void dynamic_map::reserve(std::size_t n, cudaStrea alloc_, stream)); } - submap_num_successes_.push_back(submaps_[submap_idx]->num_successes()); + submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); capacity_ *= 2; diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 316a5d77d..7a240da03 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1414,14 +1414,6 @@ class static_map { sentinel::erased_key{erased_key_sentinel_}); } - /** - * @brief Gets the number of successfully inserted/erased keys from the last - * insert/erase operation - * - * @return Number of successfully inserted/erased keys from the last insert/erase operation - */ - atomic_ctr_type* num_successes() const noexcept { return num_successes_; } - private: pair_atomic_type* 
slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots From ab4ef0c9058f564a925396ee7a89134883e42f4e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 13:53:23 -0500 Subject: [PATCH 072/152] Fix comments --- include/cuco/dynamic_map.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index c0c76bde0..a35aee893 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -119,8 +119,8 @@ class dynamic_map { dynamic_map& operator=(dynamic_map&&) = delete; /** - * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and - * sentinel values. + * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor + * and sentinel values. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. @@ -146,7 +146,7 @@ class dynamic_map { cudaStream_t stream = nullptr); /** - * @brief Construct a dynamically-sized map with erase capability. + * @brief Constructs a dynamically-sized map with erase capability. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. 
@@ -177,7 +177,7 @@ class dynamic_map { cudaStream_t stream = nullptr); /** - * @brief Destroy the map and frees its contents + * @brief Destroys the map and frees its contents * */ ~dynamic_map() {} From d72e40303ff6af01f76f5a78ca904d0381aadaba Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 14:06:34 -0500 Subject: [PATCH 073/152] Update tests --- tests/dynamic_map/erase_test.cu | 72 ++++++++++++++++----------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index f25caed30..fc3dc3c28 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -14,9 +14,10 @@ * limitations under the License. */ -#include #include +#include + #include #include #include @@ -31,27 +32,23 @@ TEMPLATE_TEST_CASE_SIG("erase key", (int64_t, int32_t), (int64_t, int64_t)) { - unsigned long num_keys = 1'000'000; + constexpr std::size_t num_keys = 1'000'000; cuco::dynamic_map map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - thrust::device_vector d_keys(num_keys); - thrust::device_vector d_values(num_keys); - thrust::device_vector d_keys_exist(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); - thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + SECTION("Check single submap insert/erase") + { + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector d_keys_exist(num_keys); - auto pairs_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - SECTION("Check basic insert/erase") - { - // ***************************************** - // first, check single submap works properly - // ***************************************** + auto pairs_begin = + 
thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); map.insert(pairs_begin, pairs_begin + num_keys); @@ -94,46 +91,47 @@ TEMPLATE_TEST_CASE_SIG("erase key", // clear map map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); + } - // ************************************************* - // second, check multiple submaps case works properly - // ************************************************* + SECTION("Check multiple submaps insert/erase") + { + constexpr std::size_t num = 4 * num_keys; - thrust::device_vector d_keys2(4 * num_keys); - thrust::device_vector d_values2(4 * num_keys); - thrust::device_vector d_keys_exist2(4 * num_keys); + thrust::device_vector d_keys(num); + thrust::device_vector d_values(num); + thrust::device_vector d_keys_exist(num); - thrust::sequence(thrust::device, d_keys2.begin(), d_keys2.end(), 1); - thrust::sequence(thrust::device, d_values2.begin(), d_values2.end(), 1); + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); - auto pairs_begin2 = - thrust::make_zip_iterator(thrust::make_tuple(d_keys2.begin(), d_values2.begin())); + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); - map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys); + map.insert(pairs_begin, pairs_begin + num); // map should resize twice if the erased slots are successfully reused - REQUIRE(map.get_capacity() == 8 * num_keys); + REQUIRE(map.get_capacity() == 2 * num); // check that keys can be successfully deleted from only the first and second submaps - map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys); - map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); + map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), - d_keys_exist2.begin() + 2 * num_keys, + 
REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.begin() + 2 * num_keys, [] __device__(const bool key_found) { return key_found; })); - REQUIRE(cuco::test::all_of(d_keys_exist2.begin() + 2 * num_keys, - d_keys_exist2.end(), + REQUIRE(cuco::test::all_of(d_keys_exist.begin() + 2 * num_keys, + d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); REQUIRE(map.get_size() == 2 * num_keys); // check that keys can be successfully deleted from all submaps (some will be unsuccessful // erases) - map.erase(d_keys2.begin(), d_keys2.end()); + map.erase(d_keys.begin(), d_keys.end()); - map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin()); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist2.begin(), - d_keys_exist2.end(), + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.end(), [] __device__(const bool key_found) { return key_found; })); REQUIRE(map.get_size() == 0); From 9478650e5e6af934d5b6463a3d96ea9996004daa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 15:35:44 -0500 Subject: [PATCH 074/152] Update include/cuco/detail/dynamic_map_kernels.cuh --- include/cuco/detail/dynamic_map_kernels.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 37bcbc547..b98516160 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -149,7 +149,6 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - // atomicT* num_successes, atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, From 82e0f2e8f51c2b5593e8dc3cac27f4f505ad6e09 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 21:47:44 -0500 Subject: [PATCH 075/152] Cleanups: relaxed memory atomic, static_assert instead of runtime expect, constexpr when 
possible --- include/cuco/detail/dynamic_map.inl | 56 +++++++++++---------- include/cuco/detail/dynamic_map_kernels.cuh | 36 ++++++------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index c50d5e3a5..989225eea 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -125,8 +125,12 @@ void dynamic_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { // TODO: memset an atomic variable is unsafe - CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type), - "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; std::size_t num_to_insert = std::distance(first, last); @@ -138,16 +142,12 @@ void dynamic_map::insert( max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size(); // If we are tying to insert some of the remaining keys into this submap, we can insert // only if we meet the minimum insert size. 
- if (capacity_remaining >= min_insert_size_) { CUCO_CUDA_TRY( cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream)); - auto n = std::min(capacity_remaining, num_to_insert); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); + auto const n = std::min(capacity_remaining, num_to_insert); + auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); detail::insert> <<>>(first, @@ -180,16 +180,16 @@ template void dynamic_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - std::size_t num_keys = std::distance(first, last); + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; - // TODO: memset an atomic variable is unsafe - CUCO_RUNTIME_EXPECTS(sizeof(std::size_t) == sizeof(atomic_ctr_type), - "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); // zero out submap success counters for (uint32_t i = 0; i < submaps_.size(); ++i) { @@ -228,11 +228,12 @@ void dynamic_map::find(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 
128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); detail::find<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); @@ -248,11 +249,12 @@ void dynamic_map::contains(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); detail::contains<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 37bcbc547..566576e1e 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -98,8 +98,10 @@ __global__ void insert(InputIt first, tid += gridDim.x * blockDim.x; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } } /** @@ -130,7 +132,7 @@ __global__ void insert(InputIt first, * perform `contains` operations on each underlying `static_map` * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap - * @param 
num_successes The number of successfully inserted key/value pairs + * @param submap_num_successes The number of successfully inserted key/value pairs for each submap * @param insert_idx The index of the submap we are inserting into * @param num_submaps The total number of submaps in the map * @param hash The unary function to apply to hash each key @@ -149,7 +151,6 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - // atomicT* num_successes, atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, @@ -185,10 +186,10 @@ __global__ void insert(InputIt first, it += (gridDim.x * blockDim.x) / tile_size; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); if (threadIdx.x == 0) { - //*num_successes += block_num_successes; - *submap_num_successes[insert_idx] += block_num_successes; + submap_num_successes[insert_idx]->fetch_add(block_num_successes, + cuda::std::memory_order_relaxed); } } @@ -228,23 +229,22 @@ __global__ void erase(InputIt first, InputIt last, mutableViewT* submap_mutable_views, atomicT** submap_num_successes, - const uint32_t num_submaps, + uint32_t num_submaps, Hash hash, KeyEqual key_equal) { - using BlockReduce = cub::BlockReduce; extern __shared__ unsigned long long submap_block_num_successes[]; auto tid = block_size * blockIdx.x + threadIdx.x; auto it = first + tid; - for (int i = threadIdx.x; i < num_submaps; i += block_size) + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { submap_block_num_successes[i] = 0; + } __syncthreads(); while (it < last) { - int i; - for (i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { if (submap_mutable_views[i].erase(*it, hash, key_equal)) { atomicAdd(&submap_block_num_successes[i], 1); break; @@ -254,7 +254,7 @@ __global__ void erase(InputIt first, } __syncthreads(); - 
for (int i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { if (threadIdx.x == 0) { submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); @@ -300,11 +300,10 @@ __global__ void erase(InputIt first, InputIt last, mutableViewT* submap_mutable_views, atomicT** submap_num_successes, - const uint32_t num_submaps, + uint32_t num_submaps, Hash hash, KeyEqual key_equal) { - using BlockReduce = cub::BlockReduce; extern __shared__ unsigned long long submap_block_num_successes[]; auto block = cg::this_thread_block(); @@ -312,13 +311,14 @@ __global__ void erase(InputIt first, auto tid = block_size * block.group_index().x + block.thread_rank(); auto it = first + tid / tile_size; - for (int i = threadIdx.x; i < num_submaps; i += block_size) + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { submap_block_num_successes[i] = 0; + } block.sync(); while (it < last) { auto erased = false; - int i; + int i = 0; for (i = 0; i < num_submaps; ++i) { erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); if (erased) { break; } @@ -328,7 +328,7 @@ __global__ void erase(InputIt first, } block.sync(); - for (int i = 0; i < num_submaps; ++i) { + for (auto i = 0; i < num_submaps; ++i) { if (threadIdx.x == 0) { submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), cuda::std::memory_order_relaxed); From f5ec677e85b23e4a74c89fa1afd40bbe980d7623 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Dec 2022 21:55:52 -0500 Subject: [PATCH 076/152] Reorder header groups + remove unused counter allocator --- include/cuco/detail/dynamic_map.inl | 6 ++---- include/cuco/dynamic_map.cuh | 7 +++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 989225eea..bb7986071 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -30,8 +30,7 @@ 
dynamic_map::dynamic_map( capacity_(initial_capacity), min_insert_size_(1E4), max_load_factor_(0.60), - alloc_{alloc}, - counter_allocator_{alloc} + alloc_{alloc} { submaps_.push_back(std::make_unique>( initial_capacity, @@ -59,8 +58,7 @@ dynamic_map::dynamic_map( capacity_(initial_capacity), min_insert_size_(1E4), max_load_factor_(0.60), - alloc_{alloc}, - counter_allocator_{alloc} + alloc_{alloc} { CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, "The empty key sentinel and erased key sentinel cannot be the same value."); diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index a35aee893..f2285efc8 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -21,10 +21,12 @@ #include #include #include -#include + #include #include +#include + #include #include #include @@ -109,8 +111,6 @@ class dynamic_map { using mutable_view_type = typename static_map::device_mutable_view; ///< Type for submap mutable ///< device view - using counter_allocator_type = typename std::allocator_traits::rebind_alloc< - atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; @@ -357,7 +357,6 @@ class dynamic_map { thrust::device_vector submap_num_successes_; ///< Number of successfully erased keys for each submap Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage - counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_` }; } // namespace cuco From c970a05e4253b23e8abcfda1141c65d76db49ca7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Dec 2022 18:16:25 -0500 Subject: [PATCH 077/152] Get rid of the use of sentinel namespace --- include/cuco/detail/dynamic_map.inl | 17 ++-- include/cuco/detail/static_map.inl | 24 +++--- .../static_multimap/static_multimap.inl | 4 +- include/cuco/dynamic_map.cuh | 8 +- include/cuco/static_map.cuh | 86 
+++++++++---------- include/cuco/static_multimap.cuh | 29 +++---- 6 files changed, 81 insertions(+), 87 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0c1d2e377..d436302ea 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -17,11 +17,10 @@ namespace cuco { template -dynamic_map::dynamic_map( - std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), size_(0), @@ -32,8 +31,8 @@ dynamic_map::dynamic_map( { submaps_.push_back(std::make_unique>( initial_capacity, - sentinel::empty_key{empty_key_sentinel}, - sentinel::empty_value{empty_value_sentinel}, + empty_key{empty_key_sentinel}, + empty_value{empty_value_sentinel}, alloc)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); @@ -64,8 +63,8 @@ void dynamic_map::reserve(std::size_t n) submap_capacity = capacity_; submaps_.push_back(std::make_unique>( submap_capacity, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, alloc_)); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index ee794e1c2..49fb645e5 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -28,12 +28,11 @@ namespace cuco { template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key 
empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, empty_value_sentinel_{empty_value_sentinel.value}, @@ -53,13 +52,12 @@ static_map::static_map( } template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, empty_value_sentinel_{empty_value_sentinel.value}, diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index b3523e74b..f42d9ab2e 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -33,8 +33,8 @@ template static_multimap::static_multimap( std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream, Allocator const& alloc) : capacity_{cuco::detail::get_valid_capacity( diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 35a3cdf4a..5deb0920a 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -67,8 +67,8 @@ namespace cuco { * // within the second 
insert. * * dynamic_map m{100'000, - * sentinel::empty_key{empty_key_sentinel}, - * sentinel::empty_value{empty_value_sentinel}}; + * empty_key{empty_key_sentinel}, + * empty_value{empty_value_sentinel}}; * * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}} * thrust::device_vector> pairs_0(50'000); @@ -136,8 +136,8 @@ class dynamic_map { * @param alloc Allocator used to allocate submap device storage */ dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}); /** diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 7674d5b20..b6410b16a 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -200,8 +200,8 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -220,9 +220,9 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -435,8 +435,8 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -447,9 
+447,9 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -768,11 +768,10 @@ class static_map { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -789,9 +788,9 @@ class static_map { */ __host__ __device__ device_mutable_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -878,8 +877,8 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value); @@ -887,7 +886,7 @@ class static_map { capacity, empty_key_sentinel, empty_value_sentinel, - 
sentinel::erased_key{empty_key_sentinel.value}}; + erased_key{empty_key_sentinel.value}}; } /** @@ -909,9 +908,9 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel, empty_value_sentinel); @@ -1070,8 +1069,8 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1088,9 +1087,9 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -1104,9 +1103,9 @@ class static_map { __host__ __device__ explicit device_view(device_mutable_view mutable_map) : device_view_base{mutable_map.get_slots(), mutable_map.get_capacity(), - sentinel::empty_key{mutable_map.get_empty_key_sentinel()}, - sentinel::empty_value{mutable_map.get_empty_value_sentinel()}, - sentinel::erased_key{mutable_map.get_erased_key_sentinel()}} + empty_key{mutable_map.get_empty_key_sentinel()}, + empty_value{mutable_map.get_empty_value_sentinel()}, + erased_key{mutable_map.get_erased_key_sentinel()}} { } @@ -1175,12 +1174,11 @@ class static_map { g.sync(); 
#endif - return device_view( - memory_to_use, - source_device_view.get_capacity(), - sentinel::empty_key{source_device_view.get_empty_key_sentinel()}, - sentinel::empty_value{source_device_view.get_empty_value_sentinel()}, - sentinel::erased_key{source_device_view.get_erased_key_sentinel()}); + return device_view(memory_to_use, + source_device_view.get_capacity(), + empty_key{source_device_view.get_empty_key_sentinel()}, + empty_value{source_device_view.get_empty_value_sentinel()}, + erased_key{source_device_view.get_erased_key_sentinel()}); } /** @@ -1395,9 +1393,9 @@ class static_map { { return device_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } /** @@ -1409,9 +1407,9 @@ class static_map { { return device_mutable_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } private: diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 27cb951ba..b0917dc8e 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -225,8 +225,8 @@ class static_multimap { * @param alloc Allocator used for allocating device storage */ static_multimap(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream = 0, Allocator const& alloc = Allocator{}); @@ -611,8 +611,8 @@ class static_multimap { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value 
empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : impl_{slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value} { } @@ -714,11 +714,10 @@ class static_multimap { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -770,8 +769,8 @@ class static_multimap { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1325,8 +1324,8 @@ class static_multimap { { return device_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } /** @@ -1339,8 +1338,8 @@ class static_multimap { { return device_mutable_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } private: From ab73e2744585ef1e43e6b011a6fa3af88879742c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 13 Jan 2023 18:43:14 -0500 Subject: [PATCH 078/152] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 293fe31ab..619dbb58a 100644 
--- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith We recommend using [CMake Package Manager (CPM)](https://github.com/TheLartians/CPM.cmake) to fetch `cuCollections` into your project. With CPM, getting `cuCollections` is easy: -``` +```cmake cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(path/to/CPM.cmake) From 52230640b29384e9b1d5da9fedb0fb6a297fc05e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 14:14:19 -0500 Subject: [PATCH 079/152] Upgrade to Catch2.v3 --- tests/CMakeLists.txt | 17 ++++++----------- tests/catch_main.cpp | 6 ------ tests/dynamic_map/erase_test.cu | 4 ++-- tests/dynamic_map/unique_sequence_test.cu | 4 ++-- tests/static_map/custom_type_test.cu | 4 ++-- tests/static_map/duplicate_keys_test.cu | 4 ++-- tests/static_map/erase_test.cu | 11 ++++++----- tests/static_map/heterogeneous_lookup_test.cu | 4 ++-- tests/static_map/insert_and_find_test.cu | 4 ++-- tests/static_map/key_sentinel_test.cu | 4 ++-- tests/static_map/shared_memory_test.cu | 4 ++-- tests/static_map/stream_test.cu | 4 ++-- tests/static_map/unique_sequence_test.cu | 4 ++-- .../custom_pair_retrieve_test.cu | 4 ++-- tests/static_multimap/custom_type_test.cu | 4 ++-- .../heterogeneous_lookup_test.cu | 4 ++-- tests/static_multimap/insert_if_test.cu | 4 ++-- tests/static_multimap/multiplicity_test.cu | 4 ++-- tests/static_multimap/non_match_test.cu | 4 ++-- tests/static_multimap/pair_function_test.cu | 4 ++-- 20 files changed, 46 insertions(+), 56 deletions(-) delete mode 100644 tests/catch_main.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bfb9cfbf0..52c4cd9db 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,23 +23,18 @@ include(CTest) CPMAddPackage( NAME Catch2 GITHUB_REPOSITORY catchorg/Catch2 - VERSION 2.13.9 + VERSION 3.3.0 ) +# Header for catch_discover_tests if(Catch2_ADDED) - include(${Catch2_SOURCE_DIR}/contrib/Catch.cmake) + include(${Catch2_SOURCE_DIR}/extras/Catch.cmake) endif() -# catch_main.cpp defines `CATCH_CONFIG_MAIN` which provides main() -# Compiles it to be linked into test executables -add_library(CatchMain OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/catch_main.cpp) -target_link_libraries(CatchMain Catch2::Catch2) - ################################################################################################### function(ConfigureTest TEST_NAME) - add_executable(${TEST_NAME} ${ARGN} - $) # Link in the CatchMain object file - target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) + add_executable(${TEST_NAME} ${ARGN}) + target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") diff --git a/tests/catch_main.cpp b/tests/catch_main.cpp deleted file mode 100644 index a7cc18e23..000000000 --- a/tests/catch_main.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// In a Catch project with multiple files, dedicate one file to compile the -// source code of Catch itself and reuse the resulting object file for linking. - -// Let Catch provide main(): -#define CATCH_CONFIG_MAIN -#include diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index fc3dc3c28..44e9e8fb5 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("erase key", "", diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index b42a7fa5a..820fb95f8 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index 062df6c42..ac743037a 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 1815c52e4..54d1c42f1 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Duplicate keys", "", diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 1b60406a5..1315a5cba 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,17 @@ * limitations under the License. */ -#include +#include + +#include + #include #include #include #include #include -#include - -#include +#include TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) { diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index 4a5088891..17b7d5662 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index fbd66c3e9..1a2fa1907 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -1,6 +1,6 @@ /* * Copyright (c) 2022, Jonas Hahnfeld, CERN. - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ #include #include -#include +#include static constexpr int Iters = 10'000; diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index eeac192b9..b611d38e1 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ #include #include -#include +#include #define SIZE 10 __device__ int A[SIZE]; diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 9920b7b1d..8b9d35390 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 1f1fe9b1c..290c176b5 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", "", diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 94ca65c6b..678fe4098 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index feb13dc97..1b0a346ee 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ #include #include -#include +#include #include diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index 2e565ede2..f3cee280f 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include #include diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index 57cda8c44..9b724d43c 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 2a6f3099d..734a93505 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ #include #include -#include +#include template __inline__ void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index ae26da5d1..f1255aaca 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include template __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index 7232b2832..38c310d25 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include #include -#include +#include template __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t num_keys) diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index ae1bb8ae2..694c968ae 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include // Custom pair equal template From 928375bd7e8e54d1621ec5600e75e7a7de3dd70d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Jan 2023 09:48:38 -0500 Subject: [PATCH 080/152] Update rapids-cmake to v23.02 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f77bc54dc..5f7c21a8d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) From e85155635b80964190ca419c9e17da4167b7c007 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Jan 2023 15:33:41 -0500 Subject: [PATCH 081/152] Guard CUDA runtime APIs with error checking --- benchmarks/hash_table/static_map_bench.cu | 18 ++++---- benchmarks/reduce_by_key/reduce_by_key.cu | 6 ++- benchmarks/synchronization.hpp | 41 ++++++++----------- .../static_multimap/static_multimap.inl | 18 ++++---- include/cuco/detail/utils.hpp | 11 +++-- tests/static_map/insert_and_find_test.cu | 2 +- tests/static_map/key_sentinel_test.cu | 2 +- tests/static_map/stream_test.cu | 5 +-- tests/static_multimap/pair_function_test.cu | 2 +- tests/utils.hpp | 24 ++++++----- 10 files changed, 63 insertions(+), 66 deletions(-) diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index 1538a636a..22822a6bc 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -95,16 +95,16 @@ static void BM_static_map_insert(::benchmark::State& state) map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + CUCO_CUDA_TRY(cudaEventCreate(&start)); + CUCO_CUDA_TRY(cudaEventCreate(&stop)); - cudaEventRecord(start); + CUCO_CUDA_TRY(cudaEventRecord(start)); map.insert(d_pairs.begin(), d_pairs.end()); - cudaEventRecord(stop); - cudaEventSynchronize(stop); + CUCO_CUDA_TRY(cudaEventRecord(stop)); + CUCO_CUDA_TRY(cudaEventSynchronize(stop)); float ms; - cudaEventElapsedTime(&ms, start, stop); + CUCO_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop)); state.SetIterationTime(ms / 1000); } @@ -148,7 +148,7 @@ static void BM_static_map_search_all(::benchmark::State& state) map.find(d_keys.begin(), d_keys.end(), d_results.begin()); // TODO: get rid of sync and rewrite the benchmark with `nvbench` // once https://github.com/NVIDIA/nvbench/pull/80 is merged - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); } state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * @@ -195,7 +195,7 @@ static void BM_static_map_search_none(::benchmark::State& state) map.find(d_keys.begin(), d_keys.end(), d_results.begin()); // TODO: get rid of sync and rewrite the benchmark with `nvbench` // once https://github.com/NVIDIA/nvbench/pull/80 is merged - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); } state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu index 1de05a42f..30e25905b 100644 --- a/benchmarks/reduce_by_key/reduce_by_key.cu +++ b/benchmarks/reduce_by_key/reduce_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include @@ -75,7 +77,7 @@ static void BM_thrust(::benchmark::State& state) thrust::device_vector values(state.range(0)); state.ResumeTiming(); thrust_reduce_by_key(keys.begin(), keys.end(), values.begin()); - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); } } BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t) diff --git a/benchmarks/synchronization.hpp b/benchmarks/synchronization.hpp index f0d7807be..ecf57138b 100644 --- a/benchmarks/synchronization.hpp +++ b/benchmarks/synchronization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once +#include + // Google Benchmark library #include @@ -23,17 +25,6 @@ #include -#define BENCH_CUDA_TRY(call) \ - do { \ - auto const status = (call); \ - if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \ - } while (0) - -#define BENCH_ASSERT_CUDA_SUCCESS(expr) \ - do { \ - cudaError_t const status = (expr); \ - assert(cudaSuccess == status); \ - } while (0) /** * @brief This class serves as a wrapper for using `cudaEvent_t` as the user * defined timer within the framework of google benchmark @@ -90,24 +81,24 @@ class cuda_event_timer { // flush all of L2$ if (flush_l2_cache) { int current_device = 0; - BENCH_CUDA_TRY(cudaGetDevice(¤t_device)); + CUCO_CUDA_TRY(cudaGetDevice(¤t_device)); int l2_cache_bytes = 0; - BENCH_CUDA_TRY( + CUCO_CUDA_TRY( cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); if (l2_cache_bytes > 0) { const int memset_value = 0; int* l2_cache_buffer = nullptr; - 
BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes)); - BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_)); - BENCH_CUDA_TRY(cudaFree(l2_cache_buffer)); + CUCO_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes)); + CUCO_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_)); + CUCO_CUDA_TRY(cudaFree(l2_cache_buffer)); } } - BENCH_CUDA_TRY(cudaEventCreate(&start_)); - BENCH_CUDA_TRY(cudaEventCreate(&stop_)); - BENCH_CUDA_TRY(cudaEventRecord(start_, stream_)); + CUCO_CUDA_TRY(cudaEventCreate(&start_)); + CUCO_CUDA_TRY(cudaEventCreate(&stop_)); + CUCO_CUDA_TRY(cudaEventRecord(start_, stream_)); } cuda_event_timer() = delete; @@ -118,13 +109,13 @@ class cuda_event_timer { */ ~cuda_event_timer() { - BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_)); float milliseconds = 0.0f; - BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_)); p_state->SetIterationTime(milliseconds / (1000.0f)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_)); } private: diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index f42d9ab2e..969765e07 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -167,7 +167,7 @@ std::size_t static_multimap::count( auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count @@ -198,7 +198,7 @@ std::size_t static_multimap::count_ auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count @@ -229,7 +229,7 @@ std::size_t static_multimap::pair_c auto view = get_device_view(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count @@ -260,7 +260,7 @@ std::size_t static_multimap::pair_c auto view = get_device_view(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count @@ -307,7 +307,7 @@ OutputIt static_multimap::retrieve( KeyEqual>, block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::retrieve @@ -357,7 +357,7 @@ OutputIt 
static_multimap::retrieve_ KeyEqual>, block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::retrieve @@ -403,7 +403,7 @@ static_multimap::pair_retrieve( }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve @@ -453,7 +453,7 @@ static_multimap::pair_retrieve_oute }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 14077ccc6..d06216c54 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ #pragma once +#include + #include #include @@ -43,11 +45,12 @@ template auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0) { int grid_size{-1}; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes); + CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &grid_size, kernel, block_size, dynamic_smem_bytes)); int dev_id{-1}; - cudaGetDevice(&dev_id); + CUCO_CUDA_TRY(cudaGetDevice(&dev_id)); int num_sms{-1}; - cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id); + CUCO_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); grid_size *= num_sms; return grid_size; } diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 1a2fa1907..5784f786f 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -66,7 +66,7 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update", static constexpr int Blocks = 1024; static constexpr int Threads = 128; parallel_sum<<>>(m.get_device_mutable_view()); - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); thrust::device_vector d_keys(Iters); thrust::device_vector d_values(Iters); diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index b611d38e1..dcf88c99c 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG( for (int i = 0; i < SIZE; i++) { h_A[i] = i; } - cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); + CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int))); auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 290c176b5..35cd7e821 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -38,7 
+38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", (int64_t, int64_t)) { cudaStream_t stream; - cudaStreamCreate(&stream); + CUCO_CUDA_TRY(cudaStreamCreate(&stream)); constexpr std::size_t num_keys{500'000}; cuco::static_map map{1'000'000, @@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream); map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream); - // cudaStreamSynchronize(stream); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); REQUIRE(cuco::test::all_of( @@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream)); } - cudaStreamDestroy(stream); + CUCO_CUDA_TRY(cudaStreamDestroy(stream)); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index 694c968ae..26cc5bbd2 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -43,7 +43,7 @@ template __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs) { map.insert(pair_begin, pair_begin + num_pairs); - cudaStreamSynchronize(0); + CUCO_CUDA_TRY(cudaStreamSynchronize(0)); auto res = map.get_size(); REQUIRE(res == num_pairs); diff --git a/tests/utils.hpp b/tests/utils.hpp index dd2f6545f..a94b04a57 100644 --- a/tests/utils.hpp +++ b/tests/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include #include @@ -39,19 +41,19 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) auto const grid_size = (size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin, end, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); auto res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res; } @@ -85,19 +87,19 @@ bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cuda auto const grid_size = (size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin1, end1, begin2, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); auto res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res == size; } From e088afb4f9127ba3755cc6ff17a072979a9c94d2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 1 Feb 2023 14:27:12 -0500 Subject: [PATCH 082/152] Get rid of thrust::device_vector::data --- .../hash_table/static_multimap/optimal_retrieve_bench.cu | 4 ++-- benchmarks/hash_table/static_multimap/query_bench.cu | 4 ++-- benchmarks/hash_table/static_multimap/retrieve_bench.cu | 4 ++-- examples/static_multimap/host_bulk_example.cu | 7 +++---- 
tests/static_multimap/non_match_test.cu | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu index bca2effa6..2315d5fc1 100644 --- a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -84,7 +84,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( thrust::device_vector> d_results(output_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); }); } diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 3eff33a35..41e88647c 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -66,7 +66,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_q state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto count = map.count_outer(d_keys.begin(), d_keys.end(), launch.get_stream()); - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); }); } diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index 128bcb03d..25ddae575 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,7 +65,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_r thrust::device_vector> d_results(output_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); }); } diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index 984a05387..a7d5a95a7 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -64,9 +64,8 @@ int main(void) // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results` // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel` - auto output_end = - map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.data().get()); - auto retrieve_size = output_end - d_results.data().get(); + auto output_end = map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.begin()); + auto retrieve_size = output_end - d_results.begin(); // The total number of outer matches should be `N + N / 2` assert(not(output_size == retrieve_size == N + N / 2)); diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index 38c310d25..3fdb60c14 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -43,7 +43,7 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s REQUIRE(num == num_keys); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); @@ -75,7 +75,7 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s REQUIRE(num == (num_keys + num_keys / 2)); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); From cb05a8f077059c356932ffe286160a477c5acd26 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 2 Feb 2023 14:28:27 -0500 Subject: [PATCH 083/152] Add CUCO_FAIL macro --- include/cuco/detail/error.hpp | 54 ++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index 
45f78a2e0..41b3feced 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,27 @@ #include namespace cuco { +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * CUCO_EXPECTS macro. + */ +struct logic_error : public std::logic_error { + /** + * @brief Constructs a logic_error with the error message. + * + * @param message Message to be associated with the exception + */ + logic_error(char const* const message) : std::logic_error(message) {} + + /** + * @brief Construct a new logic error object with error message + * + * @param message Message to be associated with the exception + */ + logic_error(std::string const& message) : std::logic_error(message) {} +}; /** * @brief Exception thrown when a CUDA error is encountered. * @@ -111,3 +132,34 @@ struct cuda_error : public std::runtime_error { (!!(cond)) ? static_cast(0) \ : throw std::runtime_error("cuco failure at: " __FILE__ \ ":" CUCO_STRINGIFY(__LINE__) ": " reason) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * Example usage: + * ```c++ + * // Throws `cuco::logic_error` + * CUCO_FAIL("Unsupported code path"); + * + * // Throws `std::runtime_error` + * CUCO_FAIL("Unsupported code path", std::runtime_error); + * ``` + * + * @param ... This macro accepts either one or two arguments: + * - The first argument is a string literal used to construct the `what` of + * the exception. + * - When given, the second argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). + */ +#define CUCO_FAIL(...) 
\ + GET_CUCO_FAIL_MACRO(__VA_ARGS__, CUCO_FAIL_2, CUCO_FAIL_1) \ + (__VA_ARGS__) + +#define GET_CUCO_FAIL_MACRO(_1, _2, NAME, ...) NAME + +#define CUCO_FAIL_2(_what, _exception_type) \ + /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \ + throw _exception_type { "CUDF failure at:" __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _what } + +#define CUCO_FAIL_1(_what) CUCO_FAIL_2(_what, cuco::logic_error) From 31e1fd2dcb4d1f1fc496be787104dd3f072762ee Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 2 Feb 2023 14:34:03 -0500 Subject: [PATCH 084/152] Deprecate CUCO_RUNTIME_EXPECTS by using CUCO_EXPECTS --- include/cuco/detail/dynamic_map.inl | 5 ++-- include/cuco/detail/error.hpp | 46 +++++++++++++++++++++-------- include/cuco/detail/static_map.inl | 12 ++++---- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 4369f32c2..17585d754 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -59,8 +59,9 @@ dynamic_map::dynamic_map( max_load_factor_(0.60), alloc_{alloc} { - CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value."); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); submaps_.push_back(std::make_unique>( initial_capacity, diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index 41b3feced..b3301eafd 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -114,24 +114,44 @@ struct cuda_error : public std::runtime_error { } while (0) /** - * @brief Macro for checking runtime conditions that throws an exception when + * @brief Macro for checking (pre-)conditions that throws an exception when * a condition is violated. 
* - * Example usage: + * Defaults to throwing `cuco::logic_error`, but a custom exception may also be + * specified. * - * @code - * CUCO_RUNTIME_EXPECTS(key == value, "Key value mismatch"); - * @endcode + * Example usage: + * ``` + * // throws cuco::logic_error + * CUCO_EXPECTS(p != nullptr, "Unexpected null pointer"); * - * @param[in] cond Expression that evaluates to true or false - * @param[in] reason String literal description of the reason that cond is - * expected to be true - * @throw std::runtime_error if the condition evaluates to false. + * // throws std::runtime_error + * CUCO_EXPECTS(p != nullptr, "Unexpected nullptr", std::runtime_error); + * ``` + * @param ... This macro accepts either two or three arguments: + * - The first argument must be an expression that evaluates to true or + * false, and is the condition being checked. + * - The second argument is a string literal used to construct the `what` of + * the exception. + * - When given, the third argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). */ -#define CUCO_RUNTIME_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw std::runtime_error("cuco failure at: " __FILE__ \ - ":" CUCO_STRINGIFY(__LINE__) ": " reason) +#define CUCO_EXPECTS(...) \ + GET_CUCO_EXPECTS_MACRO(__VA_ARGS__, CUCO_EXPECTS_3, CUCO_EXPECTS_2) \ + (__VA_ARGS__) + +#define GET_CUCO_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME + +#define CUCO_EXPECTS_3(_condition, _reason, _exception_type) \ + do { \ + static_assert(std::is_base_of_v); \ + (_condition) ? static_cast(0) \ + : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \ + {"CUDF failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _reason}; \ + } while (0) + +#define CUCO_EXPECTS_2(_condition, _reason) CUCO_EXPECTS_3(_condition, _reason, cuco::logic_error) /** * @brief Indicates that an erroneous code path has been taken. 
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 8137b62d5..cd7f8c079 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -65,8 +65,9 @@ static_map::static_map(std::size_t capacity, slot_allocator_{alloc}, counter_allocator_{alloc} { - CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value."); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); @@ -157,8 +158,9 @@ template void static_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - CUCO_RUNTIME_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), - "You must provide a unique erased key sentinel value at map construction."); + CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), + "You must provide a unique erased key sentinel value at map construction.", + std::runtime_error); auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -855,4 +857,4 @@ static_map::device_view::contains(CG const& g, current_slot = next_slot(g, current_slot); } } -} // namespace cuco \ No newline at end of file +} // namespace cuco From 435ec32e136c0282ec0147830e56dbb765c8887a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 2 Feb 2023 14:36:47 -0500 Subject: [PATCH 085/152] Fix typos --- include/cuco/detail/error.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index b3301eafd..dc43ba03c 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -148,7 +148,7 @@ struct cuda_error : public 
std::runtime_error { static_assert(std::is_base_of_v); \ (_condition) ? static_cast(0) \ : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \ - {"CUDF failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _reason}; \ + {"CUCO failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _reason}; \ } while (0) #define CUCO_EXPECTS_2(_condition, _reason) CUCO_EXPECTS_3(_condition, _reason, cuco::logic_error) @@ -180,6 +180,6 @@ struct cuda_error : public std::runtime_error { #define CUCO_FAIL_2(_what, _exception_type) \ /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \ - throw _exception_type { "CUDF failure at:" __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _what } + throw _exception_type { "CUCO failure at:" __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _what } #define CUCO_FAIL_1(_what) CUCO_FAIL_2(_what, cuco::logic_error) From 38246043047e0167096195c7ad4c81691b25f0e1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Feb 2023 13:34:17 -0500 Subject: [PATCH 086/152] Remove the use of sentinel namespace --- include/cuco/detail/dynamic_map.inl | 19 +++++++++---------- include/cuco/dynamic_map.cuh | 6 +++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 4369f32c2..9acd34b14 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -43,13 +43,12 @@ dynamic_map::dynamic_map(std::size_t initial_capac } template -dynamic_map::dynamic_map( - std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), 
empty_value_sentinel_(empty_value_sentinel.value), erased_key_sentinel_(erased_key_sentinel.value), @@ -64,9 +63,9 @@ dynamic_map::dynamic_map( submaps_.push_back(std::make_unique>( initial_capacity, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}, alloc, stream)); submap_views_.push_back(submaps_[0]->get_device_view()); diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 19b38ef0e..bbdefddad 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,8 +140,8 @@ class dynamic_map { * @param stream Stream used for executing the kernels */ dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = nullptr); From 5e689784db4a3a106dc8fccb298de7f2e70d3e2f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 9 Feb 2023 18:56:42 +0000 Subject: [PATCH 087/152] Remove reduce-by-key benchmarks --- benchmarks/CMakeLists.txt | 5 -- benchmarks/reduce_by_key/reduce_by_key.cu | 91 ----------------------- 2 files changed, 96 deletions(-) delete mode 100644 benchmarks/reduce_by_key/reduce_by_key.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index b70105d7d..e59566ad8 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -98,8 +98,3 @@ ConfigureNVBench(STATIC_MULTIMAP_BENCH 
ConfigureNVBench(RETRIEVE_BENCH hash_table/static_multimap/optimal_retrieve_bench.cu) - -################################################################################################### -# - reduce_by_key benchmarks ---------------------------------------------------------------------- -set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") -ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu deleted file mode 100644 index 30e25905b..000000000 --- a/benchmarks/reduce_by_key/reduce_by_key.cu +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * @brief Generates input sizes and number of unique keys - * - */ -static void generate_size_and_num_unique(benchmark::internal::Benchmark* b) -{ - for (auto num_unique = 64; num_unique <= 1 << 20; num_unique <<= 1) { - for (auto size = 10'000'000; size <= 10'000'000; size *= 10) { - b->Args({size, num_unique}); - } - } -} - -template -void thrust_reduce_by_key(KeyRandomIterator keys_begin, - KeyRandomIterator keys_end, - ValueRandomIterator values_begin) -{ - using Key = typename thrust::iterator_traits::value_type; - using Value = typename thrust::iterator_traits::value_type; - - // Exact size of output is unknown (number of unique keys), but upper bounded - // by the number of keys - auto maximum_output_size = thrust::distance(keys_begin, keys_end); - thrust::device_vector output_keys(maximum_output_size); - thrust::device_vector output_values(maximum_output_size); - - thrust::sort_by_key(thrust::device, keys_begin, keys_end, values_begin); - thrust::reduce_by_key( - thrust::device, keys_begin, keys_end, values_begin, output_keys.begin(), output_values.end()); -} - -template -static void BM_thrust(::benchmark::State& state) -{ - auto const num_unique_keys = state.range(1); - for (auto _ : state) { - state.PauseTiming(); - thrust::device_vector keys(state.range(0)); - auto begin = thrust::make_counting_iterator(0); - thrust::transform( - begin, begin + state.range(0), keys.begin(), [num_unique_keys] __device__(auto i) { - return i % num_unique_keys; - }); - - thrust::device_vector values(state.range(0)); - state.ResumeTiming(); - thrust_reduce_by_key(keys.begin(), keys.end(), values.begin()); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - } -} -BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -BENCHMARK_TEMPLATE(BM_thrust, int64_t, int64_t) - 
->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -// TODO: Hash based reduce by key benchmark From a09a08683bea9c58c7716a6d8e5f7cb01bdeabf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 17 Feb 2023 17:43:55 +0100 Subject: [PATCH 088/152] New key generator to be used in benchmarks (#271) Provides a new random key generator based on `thrust` random distributions which is between 8-12x faster than the former host-sided key generator. --------- Co-authored-by: Yunsong Wang --- benchmarks/CMakeLists.txt | 8 +- benchmarks/defaults.hpp | 41 +++ benchmarks/distribution.hpp | 62 ++++ .../hash_table/static_multimap/count_bench.cu | 128 ++++---- .../static_multimap/insert_bench.cu | 107 +++---- .../static_multimap/optimal_retrieve_bench.cu | 121 -------- .../static_multimap/pair_retrieve_bench.cu | 127 -------- .../hash_table/static_multimap/query_bench.cu | 131 ++++---- .../static_multimap/retrieve_bench.cu | 129 ++++---- benchmarks/key_generator.hpp | 287 +++++++++++++----- 10 files changed, 536 insertions(+), 605 deletions(-) create mode 100644 benchmarks/defaults.hpp create mode 100644 benchmarks/distribution.hpp delete mode 100644 benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu delete mode 100644 benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index e59566ad8..283d50244 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -90,11 +90,7 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- ConfigureNVBench(STATIC_MULTIMAP_BENCH - hash_table/static_multimap/count_bench.cu hash_table/static_multimap/insert_bench.cu - 
hash_table/static_multimap/pair_retrieve_bench.cu + hash_table/static_multimap/retrieve_bench.cu hash_table/static_multimap/query_bench.cu - hash_table/static_multimap/retrieve_bench.cu) - -ConfigureNVBench(RETRIEVE_BENCH - hash_table/static_multimap/optimal_retrieve_bench.cu) + hash_table/static_multimap/count_bench.cu) diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp new file mode 100644 index 000000000..447a5d031 --- /dev/null +++ b/benchmarks/defaults.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace cuco::benchmark::defaults { + +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; + +auto constexpr N = 100'000'000; +auto constexpr OCCUPANCY = 0.5; +auto constexpr MULTIPLICITY = 8; +auto constexpr MATCHING_RATE = 0.5; +auto constexpr MAX_NOISE = 3; +auto constexpr SKEW = 0.5; + +auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); +auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; +auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); +auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1); + +} // namespace cuco::benchmark::defaults \ No newline at end of file diff --git a/benchmarks/distribution.hpp b/benchmarks/distribution.hpp new file mode 100644 index 000000000..b7ad25ba4 --- /dev/null +++ b/benchmarks/distribution.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cuco::benchmark { + +namespace dist_type { +struct unique { +}; + +struct uniform { + int64_t multiplicity; // TODO assert >0 +}; + +struct gaussian { + double skew; // TODO assert >0 +}; +} // namespace dist_type + +template +auto dist_from_state(nvbench::state const& state) +{ + if constexpr (std::is_same_v) { + return Dist{}; + } else if constexpr (std::is_same_v) { + auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); + return Dist{multiplicity}; + } else if constexpr (std::is_same_v) { + auto const skew = state.get_float64_or_default("Skew", defaults::SKEW); + return Dist{skew}; + } else { + CUCO_FAIL("Unexpected distribution type"); + } +} + +} // namespace cuco::benchmark + +NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::unique, "UNIQUE", "dist_type::unique"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::uniform, "UNIFORM", "dist_type::uniform"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::gaussian, + "GAUSSIAN", + "dist_type::gaussian"); \ No newline at end of file diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 564a4c2dd..46a045275 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include +#include #include #include @@ -21,95 +23,81 @@ #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `count` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating multi-value `count` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair_type; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", N); + auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform( + thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), matching_rate); 
- state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); + state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count(d_keys.begin(), d_keys.end(), launch.get_stream()); + auto count = map.count(keys.begin(), keys.end(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_count_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. 
- .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index 80ff314b3..becd6cdbf 100644 --- 
a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include +#include #include #include @@ -21,37 +23,38 @@ #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `insert` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating multi-value `insert` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); + using pair_type = cuco::pair_type; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", N); + auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector pairs(num_keys); + thrust::transform( + thrust::device, 
keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); + state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { @@ -60,41 +63,41 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_i // Use timers to explicitly mark the target region timer.start(); - map.insert(d_pairs.begin(), d_pairs.end(), launch.get_stream()); + map.insert(pairs.begin(), pairs.end(), launch.get_stream()); timer.stop(); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_insert_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_insert_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)); +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("Skew", SKEW_RANGE); diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu deleted file mode 100644 index 2315d5fc1..000000000 --- a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include - -/** - * @brief Generates input keys by a given number of repetitions per key. - * - */ -template -static void generate_multikeys(OutputIt output_begin, - OutputIt output_end, - size_t const multiplicity) -{ - auto num_keys = std::distance(output_begin, output_end); - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = (i % (num_keys / multiplicity)) + 1; - } -} - -/** - * @brief A benchmark evaluating multi-value retrieval performance by varing number of repetitions - * per key: - * - 100'000'000 keys are inserted - * - Map occupancy is fixed at 0.4 - * - Number of repetitions per key: 1, ... 
, 128, 256 - * - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - std::size_t const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - std::size_t const size = num_keys / occupancy; - std::size_t const multiplicity = state.get_int64("Multiplicity"); - - state.add_element_count(num_keys, "NumKeys"); - state.add_global_memory_writes(num_keys * 2); - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_multikeys(h_keys.begin(), h_keys.end(), multiplicity); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - - cuco::static_multimap< - Key, - Value, - cuda::thread_scope_device, - cuco::cuda_allocator, - cuco::double_hashing, cuco::murmurhash3_32>> - map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using cg_size = nvbench::enum_type_list<1, 2, 4, 8, 16, 32>; -using buffer_size = nvbench::enum_type_list<1, 2, 4, 8, 16>; - -NVBENCH_BENCH_TYPES(nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, cg_size, nvbench::enum_type_list<2>)) - .set_type_axes_names({"Key", "Value", 
"CGSize", "BufferSize"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); - -NVBENCH_BENCH_TYPES( - nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, nvbench::enum_type_list<8>, buffer_size)) - .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); diff --git a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu deleted file mode 100644 index 4bde01c44..000000000 --- a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace { -// Custom pair equal -template -struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const - { - return lhs.first == rhs.first; - } -}; -} // anonymous namespace - -/** - * @brief A benchmark evaluating `pair_retrieve` performance: - * - CG size: 8 - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - auto constexpr matching_rate = 0.5; - auto constexpr occupancy = 0.5; - auto constexpr dist = dist_type::UNIFORM; - - auto const num_input = state.get_int64("NumInputs"); - - std::size_t const size = num_input / occupancy; - - std::vector h_keys(num_input); - std::vector> h_pairs(num_input); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_input; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - auto const pair_begin = d_pairs.begin(); - - cuco::static_multimap map{ - size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(pair_begin, pair_begin + num_input); - - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); - thrust::device_vector d_keys(h_keys); - - thrust::transform( - thrust::device, d_keys.begin(), d_keys.begin() + num_input, pair_begin, [] __device__(Key i) { - return cuco::pair_type{i, i}; - }); - - state.add_element_count(num_input, "NumInputs"); - - auto const output_size = - map.pair_count(pair_begin, pair_begin + num_input, pair_equal{}); - thrust::device_vector> d_results(output_size); - - auto out1_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - auto out2_begin = thrust::make_zip_iterator( - 
thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto [out1_end, out2_end] = map.pair_retrieve( - pair_begin, pair_begin + num_input, out1_begin, out2_begin, pair_equal{}); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_pair_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, multiplicity)) - .set_name("staic_multimap_pair_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", - {1'000, - 100'000, - 1'000'000, - 10'000'000, - 100'000'000}); // Total number of key/value pairs: 100'000'000 diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 41e88647c..34577e29c 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include +#include #include #include @@ -21,99 +23,82 @@ #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value query (`count` + `retrieve`) performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating multi-value query (`count` + `retrieve`) performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair_type; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", N); + auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform( + thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + 
gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); + state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count_outer(d_keys.begin(), d_keys.end(), launch.get_stream()); - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); + auto count = map.count_outer(keys.begin(), keys.end(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_query_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); \ No newline at end of file diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index 25ddae575..e7ea00e0b 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include +#include #include #include @@ -21,98 +23,81 @@ #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `retrieve` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating multi-value `retrieve` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair_type; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", N); + auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform( + thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), 
matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); + state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.begin(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); \ No newline at end of file diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp index bd90e6caa..4bf8491f1 100644 --- a/benchmarks/key_generator.hpp +++ b/benchmarks/key_generator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,99 +16,218 @@ #pragma once +#include +#include + +#include + #include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include #include -#include - -enum class dist_type { GAUSSIAN, GEOMETRIC, UNIFORM }; - -NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - // Enum type: - dist_type, - // Callable to generate input strings: - // Short identifier used for tables, command-line args, etc. - // Used when context is available to figure out the enum type. - [](dist_type d) { - switch (d) { - case dist_type::GAUSSIAN: return "GAUSSIAN"; - case dist_type::GEOMETRIC: return "GEOMETRIC"; - case dist_type::UNIFORM: return "UNIFORM"; - default: return "ERROR"; - } - }, - // Callable to generate descriptions: - // If non-empty, these are used in `--list` to describe values. - // Used when context may not be available to figure out the type from the - // input string. - // Just use `[](auto) { return std::string{}; }` if you don't want these. - [](auto) { return std::string{}; }) - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::GAUSSIAN: { - auto const mean = static_cast(num_keys / 2); - auto const dev = static_cast(num_keys / 5); - - std::normal_distribution<> distribution{mean, dev}; - - for (auto i = 0; i < num_keys; ++i) { - auto k = distribution(gen); - while (k >= num_keys) { - k = distribution(gen); - } - output_begin[i] = k; - } - break; - } - case dist_type::GEOMETRIC: { - auto const max = std::numeric_limits::max(); - auto const coeff = static_cast(num_keys) / static_cast(max); - // Random sampling in range [0, INT32_MAX] - std::geometric_distribution distribution{1e-9}; - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen) * coeff; - } - break; +#include +#include +#include + +namespace cuco::benchmark { + 
+/** + * @brief Random key generator. + * + * @tparam RNG Pseudo-random number generator + */ +template +class key_generator { + public: + /** + * @brief Construct a new key generator object. + * + * @param seed Seed for the random number generator + */ + key_generator(uint32_t seed = static_cast(time(nullptr))) : rng_(seed) {} + + /** + * @brief Generates a sequence of random keys in the interval [0, N). + * + * @tparam Dist Key distribution type + * @tparam OutputIt Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param dist Random distribution to use + * @param output_begin Start of the output sequence + * @param output_end End of the output sequence + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void generate(Dist dist, OutputIt out_begin, OutputIt out_end, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + thrust::sequence(exec_policy, out_begin, out_end, 0); + thrust::shuffle(exec_policy, out_begin, out_end, this->rng_); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seeds(this->rng_()); + + thrust::transform(exec_policy, + seeds, + seeds + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution uniform_dist( + 1, num_keys / dist.multiplicity); + rng.seed(seed); + return uniform_dist(rng); + }); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seq(this->rng_()); + + thrust::transform(exec_policy, + seq, + seq + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::normal_distribution<> normal_dist( + static_cast(num_keys / 
2), num_keys * dist.skew); + rng.seed(seed); + auto val = normal_dist(rng); + while (val < 0 or val >= num_keys) { + // Re-sample if the value is outside the range [0, N) + // This is necessary because the normal distribution is not bounded + // might be a better way to do this, e.g., discard(n) + val = normal_dist(rng); + } + return val; + }); + } else { + CUCO_FAIL("Unexpected distribution type"); } - case dist_type::UNIFORM: { - std::uniform_int_distribution distribution{1, static_cast(num_keys / Multiplicity)}; + } + + /** + * @brief Overload of 'generate' which automatically selects a suitable execution policy + */ + template + void generate(Dist dist, OutputIt out_begin, OutputIt out_end) + { + using thrust::system::detail::generic::select_system; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen); - } - break; + typedef typename thrust::iterator_system::type System; + System system; + + generate(dist, out_begin, out_end, select_system(system)); + } + + /** + * @brief Overload of 'generate' which uses 'thrust::cuda::par_nosync' execution policy on CUDA + * stream 'stream' + */ + template + void generate(Dist dist, OutputIt out_begin, OutputIt out_end, cudaStream_t stream) + { + generate(dist, out_begin, out_end, thrust::cuda::par_nosync.on(stream)); + } + + /** + * @brief Randomly replaces previously generated keys with new keys outside the input + * distribution. 
+ * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void dropout(InOutIt begin, InOutIt end, double keep_prob, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + if (keep_prob >= 1.0) { + size_t num_keys = thrust::distance(begin, end); + + thrust::counting_iterator seeds(rng_()); + + thrust::transform_if( + exec_policy, + seeds, + seeds + num_keys, + begin, + [num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution non_match_dist{ + static_cast(num_keys), std::numeric_limits::max()}; + rng.seed(seed); + return non_match_dist(rng); + }, + [keep_prob] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_real_distribution rate_dist(0.0, 1.0); + rng.seed(seed); + return (rate_dist(rng) > keep_prob); + }); } - } // switch -} -template -static void generate_probe_keys(double const matching_rate, - OutputIt output_begin, - OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - auto const max = std::numeric_limits::max(); + thrust::shuffle(exec_policy, begin, end, rng_); + } + + /** + * @brief Overload of 'dropout' which automatically selects a suitable execution policy + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob) + { + using thrust::system::detail::generic::select_system; - std::random_device rd; - std::mt19937 gen{rd()}; + typedef typename thrust::iterator_system::type System; + System system; - std::uniform_real_distribution rate_dist(0.0, 1.0); - std::uniform_int_distribution non_match_dist{static_cast(num_keys), max}; + dropout(begin, 
end, keep_prob, select_system(system)); + } - for (auto i = 0; i < num_keys; ++i) { - auto const tmp_rate = rate_dist(gen); + /** + * @brief Overload of 'dropout' which uses 'thrust::cuda::par_nosync' execution policy on CUDA + * stream 'stream' + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob, cudaStream_t stream) + { + using thrust::system::detail::generic::select_system; - if (tmp_rate > matching_rate) { output_begin[i] = non_match_dist(gen); } + typedef typename thrust::iterator_system::type System; + System system; + + dropout(begin, end, keep_prob, thrust::cuda::par_nosync.on(stream)); } - std::random_shuffle(output_begin, output_end); -} + private: + RNG rng_; ///< Random number generator +}; + +} // namespace cuco::benchmark \ No newline at end of file From 74f19ad7e3681be402d666cdfad73e5673ea00d5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 20 Feb 2023 13:09:44 -0500 Subject: [PATCH 089/152] Rewrite `static_map` benchmarks with nvbench (#270) This PR rewrites `static_map` benchmarks with nvbench. The goal is to create a baseline to evaluate the performance of the new `static_set` and compare the new and old implementations of `static_map`. 
--- .gitignore | 3 + benchmarks/CMakeLists.txt | 10 +- benchmarks/defaults.hpp | 2 +- .../hash_table/static_map/contains_bench.cu | 95 +++++ .../hash_table/static_map/erase_bench.cu | 115 ++++++ .../hash_table/static_map/find_bench.cu | 94 +++++ .../hash_table/static_map/insert_bench.cu | 109 ++++++ benchmarks/hash_table/static_map_bench.cu | 359 ------------------ benchmarks/key_generator.hpp | 2 +- 9 files changed, 424 insertions(+), 365 deletions(-) create mode 100644 benchmarks/hash_table/static_map/contains_bench.cu create mode 100644 benchmarks/hash_table/static_map/erase_bench.cu create mode 100644 benchmarks/hash_table/static_map/find_bench.cu create mode 100644 benchmarks/hash_table/static_map/insert_bench.cu delete mode 100644 benchmarks/hash_table/static_map_bench.cu diff --git a/.gitignore b/.gitignore index 4146530ed..e57f3b30c 100644 --- a/.gitignore +++ b/.gitignore @@ -140,3 +140,6 @@ ENV/ # clang compile_commands.json + +# figures +*.eps diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 283d50244..cd4183da7 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -65,7 +65,6 @@ function(ConfigureNVBench BENCH_NAME) RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") - #"${NVBench_SOURCE_DIR}") target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) target_link_libraries(${BENCH_NAME} PRIVATE nvbench::main @@ -84,8 +83,11 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- -set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") -ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +ConfigureNVBench(STATIC_MAP_BENCH + hash_table/static_map/insert_bench.cu + hash_table/static_map/find_bench.cu + hash_table/static_map/contains_bench.cu + hash_table/static_map/erase_bench.cu) ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp index 447a5d031..9aa9f3587 100644 --- a/benchmarks/defaults.hpp +++ b/benchmarks/defaults.hpp @@ -38,4 +38,4 @@ auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1); -} // namespace cuco::benchmark::defaults \ No newline at end of file +} // namespace cuco::benchmark::defaults diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu new file mode 100644 index 000000000..ef1dcfa26 --- /dev/null +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; + +/** + * @brief A benchmark evaluating `contains` performance: + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", N); + auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + auto pairs_begin = thrust::make_transform_iterator( + keys.begin(), [] __device__(auto i) { return cuco::pair_type(i, {}); }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs_begin, pairs_begin + num_keys); + CUCO_CUDA_TRY(cudaStreamSynchronize(nullptr)); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys, "NumInputs"); + state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), + keys.end(), + thrust::make_discard_iterator(), + 
cuco::murmurhash3_32{}, + thrust::equal_to{}, + launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu new file mode 100644 index 000000000..777276624 --- /dev/null +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; + +/** + * @brief A benchmark evaluating `erase` performance: + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { + return pair_type(i, {}); + }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys, "NumInputs"); + state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + // static map with erase support + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + map.insert(pairs.begin(), + pairs.end(), + cuco::murmurhash3_32{}, + thrust::equal_to{}, + launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), + keys.end(), + cuco::murmurhash3_32{}, + thrust::equal_to{}, + launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, 
+ nvbench::type_list)) + .set_name("static_map_erase_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu new file mode 100644 index 000000000..8efc73acd --- /dev/null +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; + +/** + * @brief A benchmark evaluating `find` performance: + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + auto pairs_begin = thrust::make_transform_iterator( + keys.begin(), [] __device__(auto i) { return cuco::pair_type(i, {}); }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs_begin, pairs_begin + num_keys); + CUCO_CUDA_TRY(cudaStreamSynchronize(nullptr)); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys, "NumInputs"); + state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), + keys.end(), + thrust::make_discard_iterator(), + cuco::murmurhash3_32{}, + thrust::equal_to{}, + launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_occupancy") + .set_type_axes_names({"Key", 
"Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu new file mode 100644 index 000000000..0cc370403 --- /dev/null +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::benchmark::defaults; + +/** + * @brief A benchmark evaluating `insert` performance: + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys, "NumInputs"); + state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::static_map map{size, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::cuda_allocator{}, + launch.get_stream()}; + + // Use timers to explicitly mark the target region + timer.start(); + map.insert(pairs.begin(), + pairs.end(), + cuco::murmurhash3_32{}, + thrust::equal_to{}, + launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) // Custom noise: 3%. 
By default: 0.5%. + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, + VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_gaussian") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. + .add_float64_axis("Skew", SKEW_RANGE); diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu deleted file mode 100644 index 22822a6bc..000000000 --- a/benchmarks/hash_table/static_map_bench.cu +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -/** - * @brief Generates input sizes and hash table occupancies - * - */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) -{ - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { - b->Args({size, occupancy}); - } - } -} - -template -static void BM_static_map_insert(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_keys(h_keys); - - for (auto _ : state) { - map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - cudaEvent_t start, stop; - CUCO_CUDA_TRY(cudaEventCreate(&start)); - CUCO_CUDA_TRY(cudaEventCreate(&stop)); - - CUCO_CUDA_TRY(cudaEventRecord(start)); - map.insert(d_pairs.begin(), d_pairs.end()); - 
CUCO_CUDA_TRY(cudaEventRecord(stop)); - CUCO_CUDA_TRY(cudaEventSynchronize(stop)); - - float ms; - CUCO_CUDA_TRY(cudaEventElapsedTime(&ms, start, stop)); - - state.SetIterationTime(ms / 1000); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_search_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - // TODO: get rid of sync and rewrite the benchmark with `nvbench` - // once https://github.com/NVIDIA/nvbench/pull/80 is merged - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_search_none(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); 
- - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - // diff keys - for (std::size_t i = 0; i < num_keys; ++i) { - h_keys[i] += num_keys; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - // TODO: get rid of sync and rewrite the benchmark with `nvbench` - // once https://github.com/NVIDIA/nvbench/pull/80 is merged - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_erase_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - // static map with erase support - map_type map{ - size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - for (auto _ : state) { - state.PauseTiming(); - map.insert(d_pairs.begin(), d_pairs.end()); - state.ResumeTiming(); - - map.erase(d_keys.begin(), d_keys.end()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - 
-template -static void BM_static_map_erase_none(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - // diff keys - for (std::size_t i = 0; i < num_keys; ++i) { - h_keys[i] += num_keys; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - for (auto _ : state) { - state.PauseTiming(); - map.insert(d_pairs.begin(), d_pairs.end()); - state.ResumeTiming(); - - map.erase(d_keys.begin(), d_keys.end()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - 
->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -// TODO: comprehensive tests for erase_all, erase_none and search_none -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp index 4bf8491f1..e07bca2be 100644 --- a/benchmarks/key_generator.hpp +++ b/benchmarks/key_generator.hpp @@ -230,4 +230,4 @@ class key_generator { 
RNG rng_; ///< Random number generator }; -} // namespace cuco::benchmark \ No newline at end of file +} // namespace cuco::benchmark From 6fd6f77f1cad940573b59984dd3872fe854c13c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 21 Feb 2023 15:29:43 +0100 Subject: [PATCH 090/152] Benchmark cleanups (#273) This PR fixes some minor issues with the new key generator. The main goal is to remove the `nvbench` include from the key generator, so it can be used in other projects without parsing any `nvbench` source. --- .../hash_table/static_map/contains_bench.cu | 2 +- .../hash_table/static_map/erase_bench.cu | 1 + .../hash_table/static_map/find_bench.cu | 1 + .../hash_table/static_map/insert_bench.cu | 2 +- .../hash_table/static_multimap/count_bench.cu | 2 +- .../static_multimap/insert_bench.cu | 2 +- .../hash_table/static_multimap/query_bench.cu | 2 +- .../static_multimap/retrieve_bench.cu | 2 +- benchmarks/key_generator.hpp | 39 +++++++++++++------ benchmarks/{distribution.hpp => utils.hpp} | 17 +------- 10 files changed, 38 insertions(+), 32 deletions(-) rename benchmarks/{distribution.hpp => utils.hpp} (89%) diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu index ef1dcfa26..dfcc378f0 100644 --- a/benchmarks/hash_table/static_map/contains_bench.cu +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu index 777276624..947c29fa3 100644 --- a/benchmarks/hash_table/static_map/erase_bench.cu +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -16,6 +16,7 @@ #include #include +#include #include diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu index 8efc73acd..7a1161143 100644 --- 
a/benchmarks/hash_table/static_map/find_bench.cu +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -16,6 +16,7 @@ #include #include +#include #include diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu index 0cc370403..6d9c8acf4 100644 --- a/benchmarks/hash_table/static_map/insert_bench.cu +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 46a045275..4895e1ea6 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index becd6cdbf..212cb764d 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 34577e29c..557099e0a 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index e7ea00e0b..56404793b 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp index e07bca2be..225efb7bb 100644 --- 
a/benchmarks/key_generator.hpp +++ b/benchmarks/key_generator.hpp @@ -16,12 +16,8 @@ #pragma once -#include -#include - #include - -#include +#include #include #include @@ -33,15 +29,34 @@ #include #include -#include +#include #include -#include -#include #include #include namespace cuco::benchmark { +namespace dist_type { + +struct unique { +}; + +struct uniform : public cuco::detail::strong_type { + uniform(int64_t multiplicity) : cuco::detail::strong_type{multiplicity} + { + CUCO_EXPECTS(multiplicity > 0, "Multiplicity must be greater than 0"); + } +}; + +struct gaussian : public cuco::detail::strong_type { + gaussian(double skew) : cuco::detail::strong_type{skew} + { + CUCO_EXPECTS(skew > 0, "Skew must be greater than 0"); + } +}; + +} // namespace dist_type + /** * @brief Random key generator. * @@ -93,7 +108,7 @@ class key_generator { [*this, dist, num_keys] __host__ __device__(size_t const seed) { RNG rng; thrust::uniform_int_distribution uniform_dist( - 1, num_keys / dist.multiplicity); + 1, num_keys / dist.value); rng.seed(seed); return uniform_dist(rng); }); @@ -109,7 +124,7 @@ class key_generator { [*this, dist, num_keys] __host__ __device__(size_t const seed) { RNG rng; thrust::normal_distribution<> normal_dist( - static_cast(num_keys / 2), num_keys * dist.skew); + static_cast(num_keys / 2), num_keys * dist.value); rng.seed(seed); auto val = normal_dist(rng); while (val < 0 or val >= num_keys) { @@ -169,7 +184,9 @@ class key_generator { { using value_type = typename std::iterator_traits::value_type; - if (keep_prob >= 1.0) { + CUCO_EXPECTS(keep_prob >= 0.0 and keep_prob <= 1.0, "Probability needs to be between 0 and 1"); + + if (keep_prob < 1.0) { size_t num_keys = thrust::distance(begin, end); thrust::counting_iterator seeds(rng_()); diff --git a/benchmarks/distribution.hpp b/benchmarks/utils.hpp similarity index 89% rename from benchmarks/distribution.hpp rename to benchmarks/utils.hpp index b7ad25ba4..1d9a112e9 100644 --- 
a/benchmarks/distribution.hpp +++ b/benchmarks/utils.hpp @@ -16,27 +16,14 @@ #pragma once +#include + #include #include -#include - namespace cuco::benchmark { -namespace dist_type { -struct unique { -}; - -struct uniform { - int64_t multiplicity; // TODO assert >0 -}; - -struct gaussian { - double skew; // TODO assert >0 -}; -} // namespace dist_type - template auto dist_from_state(nvbench::state const& state) { From 8266f7d7a240a99e82e8e97e3d88e13d875e2bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 23 Feb 2023 16:24:54 +0100 Subject: [PATCH 091/152] Benchmark parity (#274) This PR aims to achieve parity/consistency between each benchmark for better readability. --- .../hash_table/static_map/contains_bench.cu | 53 +++++++-------- .../hash_table/static_map/erase_bench.cu | 52 ++++---------- .../hash_table/static_map/find_bench.cu | 47 ++++++------- .../hash_table/static_map/insert_bench.cu | 68 ++++++++----------- .../hash_table/static_multimap/count_bench.cu | 41 ++++++----- .../static_multimap/insert_bench.cu | 50 +++++++------- .../hash_table/static_multimap/query_bench.cu | 41 ++++++----- .../static_multimap/retrieve_bench.cu | 41 ++++++----- 8 files changed, 171 insertions(+), 222 deletions(-) diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu index dfcc378f0..f9076fcc6 100644 --- a/benchmarks/hash_table/static_map/contains_bench.cu +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -23,21 +23,22 @@ #include #include -#include +#include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating `contains` performance: + * @brief A benchmark evaluating `cuco::static_map::contains` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( nvbench::state& state, nvbench::type_list) { - auto const num_keys = 
state.get_int64_or_default("NumInputs", N); - auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); - auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); std::size_t const size = num_keys / occupancy; @@ -46,24 +47,22 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( key_generator gen; gen.generate(dist_from_state(state), keys.begin(), keys.end()); - auto pairs_begin = thrust::make_transform_iterator( - keys.begin(), [] __device__(auto i) { return cuco::pair_type(i, {}); }); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(pairs_begin, pairs_begin + num_keys); - CUCO_CUDA_TRY(cudaStreamSynchronize(nullptr)); + map.insert(pairs.begin(), pairs.end()); gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumInputs"); - state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.contains(keys.begin(), - keys.end(), - thrust::make_discard_iterator(), - cuco::murmurhash3_32{}, - thrust::equal_to{}, - launch.get_stream()); + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); }); } @@ -75,21 +74,19 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_contains( } NVBENCH_BENCH_TYPES(static_map_contains, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - 
VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_contains_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_map_contains, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_contains_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu index 947c29fa3..0e4b87ddd 100644 --- a/benchmarks/hash_table/static_map/erase_bench.cu +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -26,10 +26,9 @@ #include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating `erase` performance: + * @brief A benchmark evaluating `cuco::static_map::erase` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( @@ -49,31 +48,22 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform(thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { - return pair_type(i, {}); - 
}); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumInputs"); - state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); + state.add_element_count(num_keys); + state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { // static map with erase support cuco::static_map map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; - map.insert(pairs.begin(), - pairs.end(), - cuco::murmurhash3_32{}, - thrust::equal_to{}, - launch.get_stream()); + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); timer.start(); - map.erase(keys.begin(), - keys.end(), - cuco::murmurhash3_32{}, - thrust::equal_to{}, - launch.get_stream()); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); timer.stop(); }); } @@ -86,31 +76,19 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_erase( } NVBENCH_BENCH_TYPES(static_map_erase, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, - nvbench::type_list)) - .set_name("static_map_erase_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); - -NVBENCH_BENCH_TYPES(static_map_erase, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_erase_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. 
- .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_map_erase, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_erase_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu index 7a1161143..b530c251a 100644 --- a/benchmarks/hash_table/static_map/find_bench.cu +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -23,18 +23,19 @@ #include #include -#include +#include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating `find` performance: + * @brief A benchmark evaluating `cuco::static_map::find` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( nvbench::state& state, nvbench::type_list) { + using pair_type = cuco::pair_type; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); @@ -46,24 +47,22 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( key_generator gen; gen.generate(dist_from_state(state), keys.begin(), keys.end()); - auto pairs_begin = thrust::make_transform_iterator( - keys.begin(), [] __device__(auto i) { return cuco::pair_type(i, {}); }); + 
thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(pairs_begin, pairs_begin + num_keys); - CUCO_CUDA_TRY(cudaStreamSynchronize(nullptr)); + map.insert(pairs.begin(), pairs.end()); gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumInputs"); - state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.find(keys.begin(), - keys.end(), - thrust::make_discard_iterator(), - cuco::murmurhash3_32{}, - thrust::equal_to{}, - launch.get_stream()); + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); }); } @@ -75,21 +74,19 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_find( } NVBENCH_BENCH_TYPES(static_map_find, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_find_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_map_find, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_find_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. 
- .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu index 6d9c8acf4..21aaebe75 100644 --- a/benchmarks/hash_table/static_map/insert_bench.cu +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -23,13 +23,12 @@ #include #include -#include +#include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating `insert` performance: + * @brief A benchmark evaluating `cuco::static_map::insert` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( @@ -48,30 +47,21 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform( - thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { - return pair_type(key, {}); - }); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); - state.add_element_count(num_keys, "NumInputs"); - state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - cuco::static_map map{size, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::cuda_allocator{}, - launch.get_stream()}; - - // Use timers to explicitly mark the target region - timer.start(); - map.insert(pairs.begin(), - pairs.end(), - cuco::murmurhash3_32{}, - thrust::equal_to{}, - launch.get_stream()); - timer.stop(); - }); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, 
[&](nvbench::launch& launch, auto& timer) { + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, {}, launch.get_stream()}; + + timer.start(); + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); } template @@ -82,28 +72,28 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert( } NVBENCH_BENCH_TYPES(static_map_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_insert_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); NVBENCH_BENCH_TYPES(static_map_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_map_insert_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_map_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) - .set_name("static_map_insert_gaussian") + .set_name("static_map_insert_gaussian_skew") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) // Custom noise: 3%. By default: 0.5%. 
- .add_float64_axis("Skew", SKEW_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 4895e1ea6..70a56edb6 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -26,10 +26,9 @@ #include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `count` performance + * @brief A benchmark evaluating `cuco::static_multimap::count` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( @@ -37,9 +36,9 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( { using pair_type = cuco::pair_type; - auto const num_keys = state.get_int64_or_default("NumInputs", N); - auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); - auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); std::size_t const size = num_keys / occupancy; @@ -49,15 +48,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform( - thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { - return pair_type(key, {}); - }); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); gen.dropout(keys.begin(), keys.end(), matching_rate); state.add_element_count(num_keys); - 
state.set_global_memory_rw_bytes(num_keys * sizeof(Key)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; @@ -76,28 +73,28 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_count( } NVBENCH_BENCH_TYPES(static_multimap_count, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_count_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_multimap_count, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_count_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); NVBENCH_BENCH_TYPES(static_multimap_count, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_count_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index 212cb764d..f62047a40 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -26,10 
+26,9 @@ #include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `insert` performance + * @brief A benchmark evaluating `cuco::static_multimap::insert` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( @@ -37,8 +36,8 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( { using pair_type = cuco::pair_type; - auto const num_keys = state.get_int64_or_default("NumInputs", N); - auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); std::size_t const size = num_keys / occupancy; @@ -48,20 +47,17 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform( - thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { - return pair_type(key, {}); - }); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); state.add_element_count(num_keys); - state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { cuco::static_multimap map{ - size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}, launch.get_stream()}; - // Use timers to explicitly mark the target region timer.start(); map.insert(pairs.begin(), pairs.end(), launch.get_stream()); timer.stop(); @@ -76,28 +72,28 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_insert( } NVBENCH_BENCH_TYPES(static_multimap_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - 
VALUE_TYPE_RANGE, - nvbench::type_list)) - .set_name("static_multimap_insert_unique_occupancy") + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); NVBENCH_BENCH_TYPES(static_multimap_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, - nvbench::type_list)) - .set_name("static_multimap_insert_uniform_multiplicity") + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_multimap_insert, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_insert_gaussian_skew") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("Skew", SKEW_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 557099e0a..b1e7e52ea 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -26,10 +26,9 @@ #include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value query (`count` + `retrieve`) performance + * 
@brief A benchmark evaluating 'cuco::static_multimap::query' (`count` + `retrieve`) performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( @@ -37,9 +36,9 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( { using pair_type = cuco::pair_type; - auto const num_keys = state.get_int64_or_default("NumInputs", N); - auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); - auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); std::size_t const size = num_keys / occupancy; @@ -49,15 +48,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform( - thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { - return pair_type(key, {}); - }); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); gen.dropout(keys.begin(), keys.end(), matching_rate); state.add_element_count(num_keys); - state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; @@ -77,28 +74,28 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_query( } NVBENCH_BENCH_TYPES(static_multimap_query, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_query_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - 
.set_max_noise(MAX_NOISE) - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_multimap_query, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_query_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); NVBENCH_BENCH_TYPES(static_multimap_query, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_query_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); \ No newline at end of file + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); \ No newline at end of file diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index 56404793b..2d0b6a385 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -26,10 +26,9 @@ #include using namespace cuco::benchmark; -using namespace cuco::benchmark::defaults; /** - * @brief A benchmark evaluating multi-value `retrieve` performance + * @brief A benchmark evaluating `cuco::static_multimap::retrieve` performance */ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( @@ -37,9 +36,9 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( { using pair_type = cuco::pair_type; - 
auto const num_keys = state.get_int64_or_default("NumInputs", N); - auto const occupancy = state.get_float64_or_default("Occupancy", OCCUPANCY); - auto const matching_rate = state.get_float64_or_default("MatchingRate", MATCHING_RATE); + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); std::size_t const size = num_keys / occupancy; @@ -49,15 +48,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( gen.generate(dist_from_state(state), keys.begin(), keys.end()); thrust::device_vector pairs(num_keys); - thrust::transform( - thrust::device, keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { - return pair_type(key, {}); - }); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); gen.dropout(keys.begin(), keys.end(), matching_rate); state.add_element_count(num_keys); - state.set_global_memory_rw_bytes(num_keys * sizeof(pair_type)); cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; @@ -76,28 +73,28 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_retrieve( } NVBENCH_BENCH_TYPES(static_multimap_retrieve, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("Occupancy", OCCUPANCY_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); NVBENCH_BENCH_TYPES(static_multimap_retrieve, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, 
+ defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_float64_axis("MatchingRate", MATCHING_RATE_RANGE); + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); NVBENCH_BENCH_TYPES(static_multimap_retrieve, - NVBENCH_TYPE_AXES(KEY_TYPE_RANGE, - VALUE_TYPE_RANGE, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) - .set_max_noise(MAX_NOISE) - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE); \ No newline at end of file + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); \ No newline at end of file From a45f45e6b3df5f5987f9beee4e1e47946e60423b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 23 Feb 2023 18:57:24 +0100 Subject: [PATCH 092/152] Move random key generator to public API (#275) The new key generator is pretty handy when writing standalone PoCs/benchmarks for cuco. So, I would like to move it to the public API, which can then be used directly from the cuco cmake target. 
--- .../hash_table/static_map/contains_bench.cu | 7 ++- .../hash_table/static_map/erase_bench.cu | 7 ++- .../hash_table/static_map/find_bench.cu | 7 ++- .../hash_table/static_map/insert_bench.cu | 9 +-- .../hash_table/static_multimap/count_bench.cu | 9 +-- .../static_multimap/insert_bench.cu | 9 +-- .../hash_table/static_multimap/query_bench.cu | 9 +-- .../static_multimap/retrieve_bench.cu | 9 +-- benchmarks/utils.hpp | 19 +++--- .../cuco/utility}/key_generator.hpp | 61 ++++++++++++++++--- 10 files changed, 99 insertions(+), 47 deletions(-) rename {benchmarks => include/cuco/utility}/key_generator.hpp (80%) diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu index f9076fcc6..09737a136 100644 --- a/benchmarks/hash_table/static_map/contains_bench.cu +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_map::contains` performance @@ -76,7 +77,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_contains( NVBENCH_BENCH_TYPES(static_map_contains, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_contains_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -85,7 +86,7 @@ NVBENCH_BENCH_TYPES(static_map_contains, NVBENCH_BENCH_TYPES(static_map_contains, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_contains_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu 
index 0e4b87ddd..3f26504a7 100644 --- a/benchmarks/hash_table/static_map/erase_bench.cu +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_map::erase` performance @@ -78,7 +79,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_erase( NVBENCH_BENCH_TYPES(static_map_erase, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_erase_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -87,7 +88,7 @@ NVBENCH_BENCH_TYPES(static_map_erase, NVBENCH_BENCH_TYPES(static_map_erase, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_erase_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu index b530c251a..259615e0c 100644 --- a/benchmarks/hash_table/static_map/find_bench.cu +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_map::find` performance @@ -76,7 +77,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_find( NVBENCH_BENCH_TYPES(static_map_find, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_find_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) 
.set_max_noise(defaults::MAX_NOISE) @@ -85,7 +86,7 @@ NVBENCH_BENCH_TYPES(static_map_find, NVBENCH_BENCH_TYPES(static_map_find, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_find_unique_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu index 21aaebe75..b6fadc057 100644 --- a/benchmarks/hash_table/static_map/insert_bench.cu +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_map::insert` performance @@ -74,7 +75,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert( NVBENCH_BENCH_TYPES(static_map_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_insert_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -83,7 +84,7 @@ NVBENCH_BENCH_TYPES(static_map_insert, NVBENCH_BENCH_TYPES(static_map_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_insert_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -92,7 +93,7 @@ NVBENCH_BENCH_TYPES(static_map_insert, NVBENCH_BENCH_TYPES(static_map_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_map_insert_gaussian_skew") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git 
a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 70a56edb6..e087e3243 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_multimap::count` performance @@ -75,7 +76,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_count( NVBENCH_BENCH_TYPES(static_multimap_count, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_count_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -84,7 +85,7 @@ NVBENCH_BENCH_TYPES(static_multimap_count, NVBENCH_BENCH_TYPES(static_multimap_count, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_count_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -93,7 +94,7 @@ NVBENCH_BENCH_TYPES(static_multimap_count, NVBENCH_BENCH_TYPES(static_multimap_count, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_count_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index f62047a40..c045f3a91 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include 
#include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_multimap::insert` performance @@ -74,7 +75,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_insert( NVBENCH_BENCH_TYPES(static_multimap_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_insert_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -83,7 +84,7 @@ NVBENCH_BENCH_TYPES(static_multimap_insert, NVBENCH_BENCH_TYPES(static_multimap_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_insert_unique_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -92,7 +93,7 @@ NVBENCH_BENCH_TYPES(static_multimap_insert, NVBENCH_BENCH_TYPES(static_multimap_insert, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_insert_gaussian_skew") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index b1e7e52ea..783c83556 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating 'cuco::static_multimap::query' (`count` + `retrieve`) performance @@ -76,7 +77,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_query( 
NVBENCH_BENCH_TYPES(static_multimap_query, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_query_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -85,7 +86,7 @@ NVBENCH_BENCH_TYPES(static_multimap_query, NVBENCH_BENCH_TYPES(static_multimap_query, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_query_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -94,7 +95,7 @@ NVBENCH_BENCH_TYPES(static_multimap_query, NVBENCH_BENCH_TYPES(static_multimap_query, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_query_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index 2d0b6a385..432bd3485 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include +#include #include @@ -26,6 +26,7 @@ #include using namespace cuco::benchmark; +using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::static_multimap::retrieve` performance @@ -75,7 +76,7 @@ std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_retrieve( NVBENCH_BENCH_TYPES(static_multimap_retrieve, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ 
-84,7 +85,7 @@ NVBENCH_BENCH_TYPES(static_multimap_retrieve, NVBENCH_BENCH_TYPES(static_multimap_retrieve, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_matching_rate") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) @@ -93,7 +94,7 @@ NVBENCH_BENCH_TYPES(static_multimap_retrieve, NVBENCH_BENCH_TYPES(static_multimap_retrieve, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, - nvbench::type_list)) + nvbench::type_list)) .set_name("static_multimap_retrieve_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp index 1d9a112e9..538b7eefb 100644 --- a/benchmarks/utils.hpp +++ b/benchmarks/utils.hpp @@ -16,9 +16,8 @@ #pragma once -#include - #include +#include #include @@ -27,12 +26,12 @@ namespace cuco::benchmark { template auto dist_from_state(nvbench::state const& state) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return Dist{}; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); return Dist{multiplicity}; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { auto const skew = state.get_float64_or_default("Skew", defaults::SKEW); return Dist{skew}; } else { @@ -42,8 +41,10 @@ auto dist_from_state(nvbench::state const& state) } // namespace cuco::benchmark -NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::unique, "UNIQUE", "dist_type::unique"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::uniform, "UNIFORM", "dist_type::uniform"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::benchmark::dist_type::gaussian, +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::unique, "UNIQUE", 
"distribution::unique"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, + "UNIFORM", + "distribution::uniform"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", - "dist_type::gaussian"); \ No newline at end of file + "distribution::gaussian"); \ No newline at end of file diff --git a/benchmarks/key_generator.hpp b/include/cuco/utility/key_generator.hpp similarity index 80% rename from benchmarks/key_generator.hpp rename to include/cuco/utility/key_generator.hpp index 225efb7bb..deea62a62 100644 --- a/benchmarks/key_generator.hpp +++ b/include/cuco/utility/key_generator.hpp @@ -34,28 +34,43 @@ #include #include -namespace cuco::benchmark { +namespace cuco::utility { -namespace dist_type { +namespace distribution { +/** + * @brief Tag struct representing a random distribution of unique keys. + */ struct unique { }; +/** + * @brief Tag struct representing a uniform distribution. + */ struct uniform : public cuco::detail::strong_type { + /** + * @param multiplicity Average key multiplicity of the distribution. + */ uniform(int64_t multiplicity) : cuco::detail::strong_type{multiplicity} { CUCO_EXPECTS(multiplicity > 0, "Multiplicity must be greater than 0"); } }; +/** + * @brief Tag struct representing a gaussian distribution. + */ struct gaussian : public cuco::detail::strong_type { + /** + * @param skew 0 represents a uniform distribution; ∞ represents a Dirac delta distribution. + */ gaussian(double skew) : cuco::detail::strong_type{skew} { CUCO_EXPECTS(skew > 0, "Skew must be greater than 0"); } }; -} // namespace dist_type +} // namespace distribution /** * @brief Random key generator. 
@@ -81,8 +96,8 @@ class key_generator { * @tparam Enable SFINAE helper * * @param dist Random distribution to use - * @param output_begin Start of the output sequence - * @param output_end End of the output sequence + * @param out_begin Start of the output sequence + * @param out_end End of the output sequence * @param exec_policy Thrust execution policy this operation will be executed with */ template ::value_type; - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { thrust::sequence(exec_policy, out_begin, out_end, 0); thrust::shuffle(exec_policy, out_begin, out_end, this->rng_); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { size_t num_keys = thrust::distance(out_begin, out_end); thrust::counting_iterator seeds(this->rng_()); @@ -112,7 +127,7 @@ class key_generator { rng.seed(seed); return uniform_dist(rng); }); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { size_t num_keys = thrust::distance(out_begin, out_end); thrust::counting_iterator seq(this->rng_()); @@ -142,6 +157,13 @@ class key_generator { /** * @brief Overload of 'generate' which automatically selects a suitable execution policy + * + * @tparam Dist Key distribution type + * @tparam OutputIt Ouput iterator typy which value type is the desired key type + * + * @param dist Random distribution to use + * @param out_begin Start of the output sequence + * @param out_end End of the output sequence */ template void generate(Dist dist, OutputIt out_begin, OutputIt out_end) @@ -157,6 +179,14 @@ class key_generator { /** * @brief Overload of 'generate' which uses 'thrust::cuda::par_nosync' execution policy on CUDA * stream 'stream' + * + * @tparam Dist Key distribution type + * @tparam OutputIt Ouput iterator typy which value type is the desired key type + * + * @param dist Random distribution to use + * @param out_begin Start of the output sequence + * @param out_end End of the output sequence + * @param stream CUDA stream in 
which this operation is executed in */ template void generate(Dist dist, OutputIt out_begin, OutputIt out_end, cudaStream_t stream) @@ -216,6 +246,12 @@ class key_generator { /** * @brief Overload of 'dropout' which automatically selects a suitable execution policy + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept */ template void dropout(InOutIt begin, InOutIt end, double keep_prob) @@ -231,6 +267,13 @@ class key_generator { /** * @brief Overload of 'dropout' which uses 'thrust::cuda::par_nosync' execution policy on CUDA * stream 'stream' + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param stream CUDA stream in which this operation is executed in */ template void dropout(InOutIt begin, InOutIt end, double keep_prob, cudaStream_t stream) @@ -247,4 +290,4 @@ class key_generator { RNG rng_; ///< Random number generator }; -} // namespace cuco::benchmark +} // namespace cuco::utility From b137a7cf39be6fd196bade0c35bb359f49c6ab66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 23 Feb 2023 19:46:54 +0100 Subject: [PATCH 093/152] Fix static_map::find benchmark output type (#277) --- benchmarks/hash_table/static_map/find_bench.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu index 259615e0c..4a1ccca11 100644 --- a/benchmarks/hash_table/static_map/find_bench.cu +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -58,7 +58,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( gen.dropout(keys.begin(), 
keys.end(), matching_rate); - thrust::device_vector result(num_keys); + thrust::device_vector result(num_keys); state.add_element_count(num_keys); From a2fd23fff0675b2a946fc818adcaefd44e1e56c5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Mar 2023 15:52:26 -0800 Subject: [PATCH 094/152] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 619dbb58a..685ba94aa 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Since `cuCollections` is header-only, there is nothing to build to use it. To build the tests, benchmarks, and examples: -``` +```bash cd $CUCO_ROOT mkdir -p build cd build From a4ac019473e3e4e4803025cbc8326c98c963a218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 14 Mar 2023 01:15:30 +0100 Subject: [PATCH 095/152] Port dynamic_map benchmarks to nvbench and remove gbench dependency (#276) This PR ports dynamic_map benchmarks to nvbench and removes deprecated gbench dependency. 
--------- Co-authored-by: Yunsong Wang --- benchmarks/CMakeLists.txt | 55 +-- benchmarks/defaults.hpp | 3 + .../hash_table/dynamic_map/contains_bench.cu | 92 +++++ .../hash_table/dynamic_map/erase_bench.cu | 95 +++++ .../hash_table/dynamic_map/find_bench.cu | 92 +++++ .../hash_table/dynamic_map/insert_bench.cu | 106 ++++++ benchmarks/hash_table/dynamic_map_bench.cu | 359 ------------------ benchmarks/synchronization.hpp | 126 ------ 8 files changed, 402 insertions(+), 526 deletions(-) create mode 100644 benchmarks/hash_table/dynamic_map/contains_bench.cu create mode 100644 benchmarks/hash_table/dynamic_map/erase_bench.cu create mode 100644 benchmarks/hash_table/dynamic_map/find_bench.cu create mode 100644 benchmarks/hash_table/dynamic_map/insert_bench.cu delete mode 100644 benchmarks/hash_table/dynamic_map_bench.cu delete mode 100644 benchmarks/synchronization.hpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index cd4183da7..13f1e3be0 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,19 +15,6 @@ #============================================================================= cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.5.2 - OPTIONS - "BENCHMARK_ENABLE_TESTING Off" - # The REGEX feature test fails when gbench's cmake is run under CPM w/ gcc5.4 because it doesn't assume C++11 - # Additionally, attempting to set the CMAKE_CXX_VERSION here doesn't propogate to the feature test build - # Therefore, we just disable the feature test and assume platforms we care about have a regex impl available - "RUN_HAVE_STD_REGEX 0" # - "BENCHMARK_ENABLE_INSTALL OFF" -) - CPMAddPackage( NAME nvbench GITHUB_REPOSITORY NVIDIA/nvbench @@ -41,49 +28,27 @@ CPMAddPackage( ################################################################################################### 
################################################################################################### -function(ConfigureBench BENCH_NAME BENCH_SRC) - add_executable(${BENCH_NAME} "${BENCH_SRC}") - set_target_properties(${BENCH_NAME} PROPERTIES - POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") - target_include_directories(${BENCH_NAME} PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - target_link_libraries(${BENCH_NAME} PRIVATE - benchmark benchmark_main - pthread - cuco - CUDA::cudart) -endfunction(ConfigureBench) - -################################################################################################### -function(ConfigureNVBench BENCH_NAME) +function(ConfigureBench BENCH_NAME) add_executable(${BENCH_NAME} ${ARGN}) set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) + target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -lineinfo) target_link_libraries(${BENCH_NAME} PRIVATE nvbench::main pthread cuco) -endfunction(ConfigureNVBench) +endfunction(ConfigureBench) ################################################################################################### ### benchmark sources ############################################################################# ################################################################################################### -################################################################################################### -# - dynamic_map benchmarks 
------------------------------------------------------------------------ -set(DYNAMIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/dynamic_map_bench.cu") -ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") - ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- -ConfigureNVBench(STATIC_MAP_BENCH +ConfigureBench(STATIC_MAP_BENCH hash_table/static_map/insert_bench.cu hash_table/static_map/find_bench.cu hash_table/static_map/contains_bench.cu @@ -91,8 +56,16 @@ ConfigureNVBench(STATIC_MAP_BENCH ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- -ConfigureNVBench(STATIC_MULTIMAP_BENCH +ConfigureBench(STATIC_MULTIMAP_BENCH hash_table/static_multimap/insert_bench.cu hash_table/static_multimap/retrieve_bench.cu hash_table/static_multimap/query_bench.cu hash_table/static_multimap/count_bench.cu) + +################################################################################################### +# - dynamic_map benchmarks ------------------------------------------------------------------------ +ConfigureBench(DYNAMIC_MAP_BENCH + hash_table/dynamic_map/insert_bench.cu + hash_table/dynamic_map/find_bench.cu + hash_table/dynamic_map/contains_bench.cu + hash_table/dynamic_map/erase_bench.cu) diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp index 9aa9f3587..d9d35b9a9 100644 --- a/benchmarks/defaults.hpp +++ b/benchmarks/defaults.hpp @@ -32,7 +32,10 @@ auto constexpr MULTIPLICITY = 8; auto constexpr MATCHING_RATE = 0.5; auto constexpr MAX_NOISE = 3; auto constexpr SKEW = 0.5; +auto constexpr BATCH_SIZE = 1'000'000; +auto constexpr INITIAL_SIZE = 50'000'000; +auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); auto const 
OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); diff --git a/benchmarks/hash_table/dynamic_map/contains_bench.cu b/benchmarks/hash_table/dynamic_map/contains_bench.cu new file mode 100644 index 000000000..8e41b8e2d --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/contains_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::contains` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + 
+NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/erase_bench.cu b/benchmarks/hash_table/dynamic_map/erase_bench.cu new file mode 100644 index 000000000..b815515e8 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/erase_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::erase` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + // dynamic map with erase support + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/find_bench.cu b/benchmarks/hash_table/dynamic_map/find_bench.cu new file mode 100644 index 000000000..12576ccc1 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/find_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::find` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_find, + 
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/insert_bench.cu b/benchmarks/hash_table/dynamic_map/insert_bench.cu new file mode 100644 index 000000000..de2fa8a4a --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/insert_bench.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::insert` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair_type; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const batch_size = state.get_int64_or_default("BatchSize", defaults::BATCH_SIZE); + + if (num_keys % batch_size) { state.skip("NumInputs must be divisible by BatchSize."); } + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + {}, + launch.get_stream()}; + + timer.start(); + for (std::size_t i = 0; i < num_keys; i += batch_size) { + map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {}, {}, launch.get_stream()); + } + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu deleted file mode 100644 index 8fbb804de..000000000 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -static void gen_final_size(benchmark::internal::Benchmark* b) -{ - for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) { - b->Args({size}); - } -} - -template -static void BM_dynamic_insert(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - - std::size_t batch_size = 1E6; - for (auto _ : state) { - map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - { - cuda_event_timer raii{state}; - for (std::size_t i = 0; i < num_keys; i += batch_size) { - map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); - } - } - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_search_all(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t 
num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_results(num_keys); - - map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - cuda_event_timer raii{state}; - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_search_none(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i] + num_keys; - Value val = h_keys[i] + num_keys; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_results(num_keys); - - map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - cuda_event_timer raii{state}; - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_erase_all(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t 
initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (uint32_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_keys(h_keys); - - std::size_t batch_size = 1E6; - for (auto _ : state) { - map_type map{initial_size, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; - for (uint32_t i = 0; i < num_keys; i += batch_size) { - map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); - } - { - cuda_event_timer raii{state}; - for (uint32_t i = 0; i < num_keys; i += batch_size) { - map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); - } - } - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_erase_none(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i] + num_keys; - Value val = h_keys[i] + num_keys; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_keys(h_keys); - - std::size_t batch_size = 1E6; - for (auto _ : state) { - map_type map{initial_size, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; - for (std::size_t i = 0; i < num_keys; i += batch_size) { - map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); - } - { - cuda_event_timer raii{state}; - map.erase(d_keys.begin(), d_keys.end()); - } - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) 
* - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - 
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -// TODO: comprehensive tests for erase_none and search_none? -BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); diff --git a/benchmarks/synchronization.hpp b/benchmarks/synchronization.hpp deleted file mode 100644 index ecf57138b..000000000 --- a/benchmarks/synchronization.hpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Google Benchmark library -#include - -#include - -#include - -/** - * @brief This class serves as a wrapper for using `cudaEvent_t` as the user - * defined timer within the framework of google benchmark - * (https://github.com/google/benchmark). - * - * It is built on top of the idea of Resource acquisition is initialization - * (RAII). In the following we show a minimal example of how to use this class. - * - * \code{cpp} - * #include - * - * static void sample_cuda_benchmark(benchmark::State& state) { - * - * for (auto _ : state){ - * cudaStream_t stream = 0; - * - * // Create (Construct) an object of this class. You HAVE to pass in the - * // benchmark::State object you are using. It measures the time from its - * // creation to its destruction that is spent on the specified CUDA stream. - * // It also clears the L2 cache by cudaMemset'ing a device buffer that is of - * // the size of the L2 cache (if flush_l2_cache is set to true and there is - * // an L2 cache on the current device). - * cuda_event_timer raii(state, true, stream); // flush_l2_cache = true - * - * // Now perform the operations that is to be benchmarked - * sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel - * - * } - * } - * - * // Register the function as a benchmark. You will need to set the `UseManualTime()` - * // flag in order to use the timer embeded in this class. 
- * BENCHMARK(sample_cuda_benchmark)->UseManualTime(); - * \endcode - * - * - */ -class cuda_event_timer { - public: - /** - * @brief Constructs a `cuda_event_timer` beginning a manual timing range. - * - * Optionally flushes L2 cache. - * - * @param[in,out] state This is the benchmark::State whose timer we are going - * to update. - * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before - * every iteration. - * @param[in] stream_ The CUDA stream we are measuring time on. - */ - cuda_event_timer(benchmark::State& state, bool flush_l2_cache = false, cudaStream_t stream = 0) - : p_state(&state), stream_(stream) - { - // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - CUCO_CUDA_TRY(cudaGetDevice(¤t_device)); - - int l2_cache_bytes = 0; - CUCO_CUDA_TRY( - cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - const int memset_value = 0; - int* l2_cache_buffer = nullptr; - CUCO_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes)); - CUCO_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_)); - CUCO_CUDA_TRY(cudaFree(l2_cache_buffer)); - } - } - - CUCO_CUDA_TRY(cudaEventCreate(&start_)); - CUCO_CUDA_TRY(cudaEventCreate(&stop_)); - CUCO_CUDA_TRY(cudaEventRecord(start_, stream_)); - } - - cuda_event_timer() = delete; - - /** - * @brief Destroy the `cuda_event_timer` and ending the manual time range. 
- * - */ - ~cuda_event_timer() - { - CUCO_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_)); - float milliseconds = 0.0f; - CUCO_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_)); - p_state->SetIterationTime(milliseconds / (1000.0f)); - CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_)); - } - - private: - cudaEvent_t start_; - cudaEvent_t stop_; - cudaStream_t stream_; - benchmark::State* p_state; -}; From 15196bf13053f5ad5bbd9a48a0268fab86f3a082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 14 Mar 2023 03:44:48 +0100 Subject: [PATCH 096/152] Add static map count_by_key_example.cu (#280) This PR adds an example of how to use `cuco::static_map::insert_and_find` to compute a simple histogram over keys. --- README.md | 1 + examples/CMakeLists.txt | 3 +- examples/static_map/count_by_key_example.cu | 163 ++++++++++++++++++++ 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 examples/static_map/count_by_key_example.cu diff --git a/README.md b/README.md index 685ba94aa..9f3984bc5 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/T49P85Mnd)) - [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/dh8bMn3G1)) - [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/7djKevK6e)) +- [Key 
histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/vecGeYM48)) ### `static_multimap` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 1205c774d..9e02d62ba 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,4 +36,5 @@ endfunction(ConfigureExample) ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") +ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") diff --git a/examples/static_map/count_by_key_example.cu b/examples/static_map/count_by_key_example.cu new file mode 100644 index 000000000..4c8cfdb11 --- /dev/null +++ b/examples/static_map/count_by_key_example.cu @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +/** + * @file count_by_key_example.cu + * @brief Demonstrates usage of the device side APIs for individual operations like insert/find in + * the context of a count-by-key operation, i.e. for a histogram over keys. + * + * Individual operations like a single insert or find can be performed in device code via the + * static_map "device_view" types. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + * + */ + +/** + * @brief Inserts keys and counts how often they occur in the input sequence. 
+ * + * @tparam BlockSize CUDA block size + * @tparam Map Type of the map returned from static_map::get_device_mutable_view + * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type + * @tparam UniqueIter Output iterator whose value_type is convertible to uint64_t + * + * @param[in] map_view View of the map into which inserts will be performed + * @param[in] key_begin The beginning of the range of keys to insert + * @param[in] num_keys The total number of keys and values + * @param[out] num_unique_keys The total number of distinct keys inserted + */ +template +__global__ void count_by_key(Map map_view, + KeyIter keys, + uint64_t num_keys, + UniqueIter num_unique_keys) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int64_t const loop_stride = gridDim.x * BlockSize; + int64_t idx = BlockSize * blockIdx.x + threadIdx.x; + + uint64_t thread_unique_keys = 0; + while (idx < num_keys) { + // insert key into the map with a count of 1 + auto [slot, is_new_key] = map_view.insert_and_find({keys[idx], 1}); + if (is_new_key) { + // first occurrence of the key + thread_unique_keys++; + } else { + // key is already in the map -> increment count + slot->second.fetch_add(1, cuda::memory_order_relaxed); + } + idx += loop_stride; + } + + // compute number of successfully inserted new keys for each block + // and atomically add to the grand total + uint64_t block_unique_keys = BlockReduce(temp_storage).Sum(thread_unique_keys); + if (threadIdx.x == 0) { + cuda::atomic_ref grid_unique_keys( + *thrust::raw_pointer_cast(num_unique_keys)); + grid_unique_keys.fetch_add(block_unique_keys, cuda::memory_order_relaxed); + } +} + +int main(void) +{ + // Note that if (sizeof(Key)+sizeof(Count))>8 then the minimum required CUDA architecture is sm_70 + using Key = uint32_t; + using Count = uint32_t; + + // Empty slots are represented by reserved "sentinel" values. 
These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = static_cast(-1); + Count constexpr empty_value_sentinel = static_cast(-1); + + // Number of keys to be inserted + auto constexpr num_keys = 50'000; + // How often each distinct key occurs in the example input + auto constexpr key_duplicates = 5; + static_assert((num_keys % key_duplicates) == 0, + "For this example, num_keys must be divisible by key_duplicates in order to pass " + "the unit test."); + + thrust::device_vector insert_keys(num_keys); + // Create a sequence of keys. Eeach distinct key has key_duplicates many matches. + thrust::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [] __device__(auto i) { return static_cast(i % (num_keys / key_duplicates)); }); + + // Allocate storage for count of number of unique keys + thrust::device_vector num_unique_keys(1); + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + + // If the number of unique keys is known in advance, we can use it to calculate the map capacity + std::size_t const capacity = std::ceil((num_keys / key_duplicates) / load_factor); + // If we can't give an estimated upper bound on the number of unique keys + // we conservatively assume each key in the input is distinct + // std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map with "capacity" slots. 
+ cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + + // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel + auto device_insert_view = map.get_device_mutable_view(); + + auto constexpr block_size = 256; + auto const grid_size = (num_keys + block_size - 1) / block_size; + count_by_key<<>>( + device_insert_view, insert_keys.begin(), num_keys, num_unique_keys.data()); + + // Retrieve contents of all the non-empty slots in the map + thrust::device_vector result_keys(num_unique_keys[0]); + thrust::device_vector result_counts(num_unique_keys[0]); + map.retrieve_all(result_keys.begin(), result_counts.begin()); + + // Check if the number of result keys is correct + auto num_keys_check = num_unique_keys[0] == (num_keys / key_duplicates); + + // Iterate over all result counts and verify that they are correct + auto counts_check = thrust::all_of( + result_counts.begin(), result_counts.end(), [] __host__ __device__(Count const count) { + return count == key_duplicates; + }); + + if (num_keys_check and counts_check) { std::cout << "Success!\n"; } + + return 0; +} From 356ce59bb9a403437bf7b1be668bc914ad2bf26f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 16 Mar 2023 09:45:22 -0700 Subject: [PATCH 097/152] Reorganize header files (#283) This PR repositions multiple headers to a better location: - Expose error types to public - Move `traits.hpp` and `allocator.hpp` to the public `utility` folder --- benchmarks/utils.hpp | 2 +- include/cuco/detail/bitwise_compare.cuh | 4 +- include/cuco/detail/error.hpp | 49 ++----------------- include/cuco/dynamic_map.cuh | 1 - include/cuco/static_map.cuh | 7 ++- include/cuco/static_multimap.cuh | 7 ++- include/cuco/{ => utility}/allocator.hpp | 2 +- include/cuco/utility/error.hpp | 62 ++++++++++++++++++++++++ include/cuco/{ => utility}/traits.hpp | 0 9 files changed, 75 insertions(+), 59 deletions(-) rename include/cuco/{ => 
utility}/allocator.hpp (97%) create mode 100644 include/cuco/utility/error.hpp rename include/cuco/{ => utility}/traits.hpp (100%) diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp index 538b7eefb..a8a84a3b6 100644 --- a/benchmarks/utils.hpp +++ b/benchmarks/utils.hpp @@ -47,4 +47,4 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, "distribution::uniform"); NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", - "distribution::gaussian"); \ No newline at end of file + "distribution::gaussian"); diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 6598ddbda..979dab829 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index dc43ba03c..1d1ff6135 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -16,52 +16,9 @@ #pragma once -#include - -#include -#include - -namespace cuco { -/** - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * CUCO_EXPECTS macro. - */ -struct logic_error : public std::logic_error { - /** - * @brief Constructs a logic_error with the error message. 
- * - * @param message Message to be associated with the exception - */ - logic_error(char const* const message) : std::logic_error(message) {} +#include - /** - * @brief Construct a new logic error object with error message - * - * @param message Message to be associated with the exception - */ - logic_error(std::string const& message) : std::logic_error(message) {} -}; -/** - * @brief Exception thrown when a CUDA error is encountered. - * - */ -struct cuda_error : public std::runtime_error { - /** - * @brief Constructs a `cuda_error` object with the given `message`. - * - * @param message The error char array used to construct `cuda_error` - */ - cuda_error(const char* message) : std::runtime_error(message) {} - /** - * @brief Constructs a `cuda_error` object with the given `message` string. - * - * @param message The `std::string` used to construct `cuda_error` - */ - cuda_error(std::string const& message) : cuda_error{message.c_str()} {} -}; -} // namespace cuco +#include #define STRINGIFY_DETAIL(x) #x #define CUCO_STRINGIFY(x) STRINGIFY_DETAIL(x) @@ -79,7 +36,7 @@ struct cuda_error : public std::runtime_error { * Example: * ```c++ * - * // Throws `rmm::cuda_error` if `cudaMalloc` fails + * // Throws `cuco::cuda_error` if `cudaMalloc` fails * CUCO_CUDA_TRY(cudaMalloc(&p, 100)); * * // Throws `std::runtime_error` if `cudaMalloc` fails diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index bbdefddad..53b7ec1fe 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 3323e9481..740bb8282 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,14 +16,13 @@ #pragma once -#include #include -#include #include #include #include #include -#include +#include +#include #include diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index b0917dc8e..fe68da32b 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,14 +16,13 @@ #pragma once -#include #include -#include #include #include #include #include -#include +#include +#include #include diff --git a/include/cuco/allocator.hpp b/include/cuco/utility/allocator.hpp similarity index 97% rename from include/cuco/allocator.hpp rename to include/cuco/utility/allocator.hpp index c19552963..583571620 100644 --- a/include/cuco/allocator.hpp +++ b/include/cuco/utility/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/include/cuco/utility/error.hpp b/include/cuco/utility/error.hpp new file mode 100644 index 000000000..eb6a5f2e3 --- /dev/null +++ b/include/cuco/utility/error.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * CUCO_EXPECTS macro. + */ +struct logic_error : public std::logic_error { + /** + * @brief Constructs a logic_error with the error message. + * + * @param message Message to be associated with the exception + */ + logic_error(char const* const message) : std::logic_error(message) {} + + /** + * @brief Construct a new logic error object with error message + * + * @param message Message to be associated with the exception + */ + logic_error(std::string const& message) : std::logic_error(message) {} +}; +/** + * @brief Exception thrown when a CUDA error is encountered. + * + */ +struct cuda_error : public std::runtime_error { + /** + * @brief Constructs a `cuda_error` object with the given `message`. + * + * @param message The error char array used to construct `cuda_error` + */ + cuda_error(const char* message) : std::runtime_error(message) {} + /** + * @brief Constructs a `cuda_error` object with the given `message` string. 
+ * + * @param message The `std::string` used to construct `cuda_error` + */ + cuda_error(std::string const& message) : cuda_error{message.c_str()} {} +}; +} // namespace cuco diff --git a/include/cuco/traits.hpp b/include/cuco/utility/traits.hpp similarity index 100% rename from include/cuco/traits.hpp rename to include/cuco/utility/traits.hpp From 9287726c8893d37a3e873ef9d9e1d081ef413c19 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 3 Apr 2023 15:48:43 -0700 Subject: [PATCH 098/152] Remove outdated doc (#287) The statement of map doesn't support erasing keys is no longer valid and this PR removes the out-of-date documentation. --- include/cuco/dynamic_map.cuh | 4 ++-- include/cuco/static_map.cuh | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 53b7ec1fe..b9dcf9f22 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -43,8 +43,8 @@ namespace cuco { * concurrent insert and find) from threads in device code. * * Current limitations: - * - Requires keys that are Arithmetic - * - Does not support erasing keys + * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true + * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. * - Capacity does not shrink automatically * - Requires the user to specify sentinel values for both key and mapped value * to indicate empty slots diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 740bb8282..ad4b1ae78 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -52,7 +52,6 @@ class dynamic_map; * Current limitations: * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. 
- * - Does not support erasing keys * - Capacity is fixed and will not grow automatically * - Requires the user to specify sentinel values for both key and mapped value to indicate empty * slots From ed620ab4b885676991699c5823a802fd0ad90f99 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 5 Apr 2023 17:45:56 -0700 Subject: [PATCH 099/152] Add data structure base classes and `cuco::static_set` (#278) This is the first PR related to #110. It introduces the concept of: - New probing scheme via probing iterator - Array of Windows storage instead of flat storage to better deal with memory bandwidth-bound workload when hash collisions are present - Dynamic and static extent type for efficient probing - Mixin to encode concurrent device operators - Synchronous and asynchronous host bulk APIs This PR also adds `cuco::static_set` to evaluate the new design. For now, only 2 basic operations, `insert` and `contains`, are supported. --------- Co-authored-by: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 8 + benchmarks/CMakeLists.txt | 7 + .../hash_table/static_set/contains_bench.cu | 75 ++++ .../hash_table/static_set/insert_bench.cu | 81 ++++ .../hash_table/static_set/size_bench.cu | 62 ++++ examples/CMakeLists.txt | 2 + examples/static_set/device_ref_example.cu | 116 ++++++ examples/static_set/host_bulk_example.cu | 72 ++++ include/cuco/detail/equal_wrapper.cuh | 95 +++++ include/cuco/detail/operator.inl | 59 +++ include/cuco/detail/prime.hpp | 1 + include/cuco/detail/probing_scheme_base.cuh | 42 +++ include/cuco/detail/probing_scheme_impl.inl | 146 ++++++++ include/cuco/detail/static_set/kernels.cuh | 287 +++++++++++++++ include/cuco/detail/static_set/static_set.inl | 249 +++++++++++++ .../cuco/detail/static_set/static_set_ref.inl | 348 ++++++++++++++++++ include/cuco/detail/storage/aow_storage.cuh | 247 +++++++++++++ 
.../cuco/detail/storage/counter_storage.cuh | 112 ++++++ include/cuco/detail/storage/kernels.cuh | 55 +++ include/cuco/detail/storage/storage.cuh | 62 ++++ include/cuco/detail/storage/storage_base.cuh | 91 +++++ include/cuco/detail/tuning.cuh | 27 ++ include/cuco/detail/utils.cuh | 36 +- include/cuco/detail/utils.hpp | 43 ++- include/cuco/extent.cuh | 149 ++++++++ include/cuco/operator.hpp | 40 ++ include/cuco/probing_scheme.cuh | 153 ++++++++ include/cuco/static_set.cuh | 289 +++++++++++++++ include/cuco/static_set_ref.cuh | 131 +++++++ include/cuco/storage.cuh | 47 +++ include/cuco/utility/traits.hpp | 8 +- tests/CMakeLists.txt | 17 +- tests/static_set/capacity_test.cu | 112 ++++++ tests/static_set/heterogeneous_lookup_test.cu | 120 ++++++ tests/static_set/large_input_test.cu | 88 +++++ tests/static_set/size_test.cu | 42 +++ tests/static_set/unique_sequence_test.cu | 110 ++++++ tests/utility/extent_test.cu | 56 +++ tests/utility/storage_test.cu | 90 +++++ 39 files changed, 3768 insertions(+), 7 deletions(-) create mode 100644 benchmarks/hash_table/static_set/contains_bench.cu create mode 100644 benchmarks/hash_table/static_set/insert_bench.cu create mode 100644 benchmarks/hash_table/static_set/size_bench.cu create mode 100644 examples/static_set/device_ref_example.cu create mode 100644 examples/static_set/host_bulk_example.cu create mode 100644 include/cuco/detail/equal_wrapper.cuh create mode 100644 include/cuco/detail/operator.inl create mode 100644 include/cuco/detail/probing_scheme_base.cuh create mode 100644 include/cuco/detail/probing_scheme_impl.inl create mode 100644 include/cuco/detail/static_set/kernels.cuh create mode 100644 include/cuco/detail/static_set/static_set.inl create mode 100644 include/cuco/detail/static_set/static_set_ref.inl create mode 100644 include/cuco/detail/storage/aow_storage.cuh create mode 100644 include/cuco/detail/storage/counter_storage.cuh create mode 100644 include/cuco/detail/storage/kernels.cuh create mode 100644 
include/cuco/detail/storage/storage.cuh create mode 100644 include/cuco/detail/storage/storage_base.cuh create mode 100644 include/cuco/detail/tuning.cuh create mode 100644 include/cuco/extent.cuh create mode 100644 include/cuco/operator.hpp create mode 100644 include/cuco/probing_scheme.cuh create mode 100644 include/cuco/static_set.cuh create mode 100644 include/cuco/static_set_ref.cuh create mode 100644 include/cuco/storage.cuh create mode 100644 tests/static_set/capacity_test.cu create mode 100644 tests/static_set/heterogeneous_lookup_test.cu create mode 100644 tests/static_set/large_input_test.cu create mode 100644 tests/static_set/size_test.cu create mode 100644 tests/static_set/unique_sequence_test.cu create mode 100644 tests/utility/extent_test.cu create mode 100644 tests/utility/storage_test.cu diff --git a/README.md b/README.md index 9f3984bc5..ef294f838 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,14 @@ class example_class { We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +### `static_set` + +`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73)) + ### `static_map` `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 13f1e3be0..750c9be86 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -46,6 +46,13 @@ endfunction(ConfigureBench) ### benchmark sources ############################################################################# ################################################################################################### +################################################################################################### +# - static_set benchmarks ------------------------------------------------------------------------- +ConfigureBench(STATIC_SET_BENCH + hash_table/static_set/contains_bench.cu + hash_table/static_set/insert_bench.cu + hash_table/static_set/size_bench.cu) + ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- ConfigureBench(STATIC_MAP_BENCH diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu new file mode 100644 index 000000000..b0c0f34f4 --- /dev/null +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::contains` performance + */ +template +void static_set_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu new file mode 100644 index 000000000..cb5dcf1f8 --- /dev/null +++ 
b/benchmarks/hash_table/static_set/insert_bench.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::insert` performance + */ +template +void static_set_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::experimental::static_set set{ + size, cuco::empty_key{-1}, {}, {}, {}, launch.get_stream()}; + + timer.start(); + set.insert(keys.begin(), keys.end(), launch.get_stream()); + timer.stop(); + }); +} + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", 
defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_gaussian_skew") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu new file mode 100644 index 000000000..ded20fe04 --- /dev/null +++ b/benchmarks/hash_table/static_set/size_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::size` performance + */ +template +void static_set_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); }); +} + +NVBENCH_BENCH_TYPES(static_set_size, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_size_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9e02d62ba..d78627eee 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -33,6 +33,8 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") 
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu new file mode 100644 index 000000000..0179baa83 --- /dev/null +++ b/examples/static_set/device_ref_example.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +// insert a set of keys into a hash set using one cooperative group for each task +template +__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) +{ + namespace cg = cooperative_groups; + + constexpr auto cg_size = SetRef::cg_size; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + + int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; + int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + while (idx < n) { + set.insert(tile, *(keys + idx)); + idx += loop_stride; + } +} + +template +__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) +{ + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + found[idx] = set.contains(*(keys + idx)); + idx += loop_stride; + } +} + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + using set_type = cuco::experimental::static_set; + + // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. 
+ set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Insert the first half of the keys into the set + set.insert(keys.begin(), keys.begin() + num_keys / 2); + + // Insert the second half of keys using a custom CUDA kernel. + custom_cooperative_insert<<<128, 128>>>( + set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are now contained in the set. Note that we pass a reference that already has + // the `contains` operator. + // In general, using two or more reference objects to the same container but with + // a different set of operators concurrently is undefined behavior. + // This does not apply here since the two kernels do not overlap. + custom_contains<<<128, 128>>>( + set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/static_set/host_bulk_example.cu b/examples/static_set/host_bulk_example.cu new file mode 100644 index 000000000..3b8c4deb4 --- /dev/null +++ b/examples/static_set/host_bulk_example.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_set "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `contains` on a set of keys. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::experimental::static_set set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + set.insert(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are contained in the set + set.contains(keys.begin(), keys.end(), found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! 
Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh new file mode 100644 index 000000000..1774e0bf3 --- /dev/null +++ b/include/cuco/detail/equal_wrapper.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Enum of equality comparison results. + */ +enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; + +/** + * @brief Equality wrapper. + * + * User-provided equality binary callable cannot be used to compare against sentinel value. + * + * @tparam T Right-hand side Element type + * @tparam Equal Type of user-provided equality binary callable + */ +template +struct equal_wrapper { + T sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable + + /** + * @brief Equality wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) + : sentinel_{sentinel}, equal_{equal} + { + } + + /** + * @brief Equality check with the given equality callable. 
+ * + * @tparam LHS Left-hand side Element type + * @tparam RHS Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result equal_to(LHS const& lhs, RHS const& rhs) const noexcept + { + return equal_(lhs, rhs) ? equal_result::EQUAL : equal_result::UNEQUAL; + } + + /** + * @brief Order-sensitive equality operator. + * + * This function always compares the left-hand side element against `sentinel_` value first + * then performs an equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. + * + * @note Container (like set or map) keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept + { + return cuco::detail::bitwise_compare(lhs, sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/operator.inl b/include/cuco/detail/operator.inl new file mode 100644 index 000000000..fdd5884e8 --- /dev/null +++ b/include/cuco/detail/operator.inl @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief CRTP mixin which augments a given `Reference` with an `Operator`. + * + * @throw If the operator is not defined in `include/cuco/operator.hpp` + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Reference The reference type. + * + * @note This primary template should never be instantiated. + */ +template +class operator_impl { + static_assert(cuco::dependent_false, + "Operator type is not supported by reference type."); +}; + +/** + * @brief Checks if the given `Operator` is contained in a list of `Operators`. + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Operators List of operators to search in + * + * @return `true` if `Operator` is contained in `Operators`, `false` otherwise. 
+ */ +template +static constexpr bool has_operator() +{ + return ((std::is_same_v) || ...); +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 1180035ae..186a29257 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/probing_scheme_base.cuh b/include/cuco/detail/probing_scheme_base.cuh new file mode 100644 index 000000000..03f712155 --- /dev/null +++ b/include/cuco/detail/probing_scheme_base.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Base class of public probing scheme. + * + * This class should not be used directly. + * + * @tparam CGSize Size of CUDA Cooperative Groups + */ +template +class probing_scheme_base { + public: + /** + * @brief The size of the CUDA cooperative thread group. 
+ */ + static constexpr int32_t cg_size = CGSize; +}; +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl new file mode 100644 index 000000000..4f7e904a1 --- /dev/null +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Probing iterator class. 
+ * + * @tparam Extent Type of Extent + */ +template +class probing_iterator { + public: + using extent_type = Extent; ///< Extent type + using size_type = typename extent_type::value_type; ///< Size type + + /** + * @brief Constructs a probing iterator + * + * @param start Iteration starting point + * @param step_size Double hashing step size + * @param upper_bound Upper bound of the iteration + */ + __host__ __device__ constexpr probing_iterator(size_type start, + size_type step_size, + extent_type upper_bound) noexcept + : curr_index_{start}, step_size_{step_size}, upper_bound_{upper_bound} + { + // TODO: revise this API when introducing quadratic probing into cuco + } + + /** + * @brief Dereference operator + * + * @return Current slot index + */ + __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } + + /** + * @brief Prefix increment operator + * + * @return Current iterator + */ + __host__ __device__ constexpr auto operator++() noexcept + { + // TODO: step_size_ can be a build time constant (e.g. linear probing) + // Worth passing another extent type? 
+ curr_index_ = (curr_index_ + step_size_) % upper_bound_; + return *this; + } + + /** + * @brief Postfix increment operator + * + * @return Old iterator before increment + */ + __host__ __device__ constexpr auto operator++(int32_t) noexcept + { + auto temp = *this; + ++(*this); + return temp; + } + + private: + size_type curr_index_; + size_type step_size_; + extent_type upper_bound_; +}; +} // namespace detail + +template +__host__ __device__ constexpr linear_probing::linear_probing(Hash const& hash) + : hash_{hash} +{ +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{hash_(probe_key) % upper_bound, + 1, // step size is 1 + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash_(probe_key) + g.thread_rank()) % upper_bound, cg_size, upper_bound}; +} + +template +__host__ __device__ constexpr double_hashing::double_hashing( + Hash1 const& hash1, Hash2 const& hash2) + : hash1_{hash1}, hash2_{hash2} +{ +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + hash1_(probe_key) % upper_bound, + hash2_(probe_key) % (upper_bound - 1) + 1, // step size in range [1, prime - 1] + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash1_(probe_key) + g.thread_rank()) % upper_bound, + (hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size, + upper_bound}; +} +} // namespace 
experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh new file mode 100644 index 000000000..e7d52ae27 --- /dev/null +++ b/include/cuco/detail/static_set/kernels.cuh @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + size)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successfully inserted elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(insert_pair)) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successfully inserted elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + namespace cg = cooperative_groups; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(tile, insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + __shared__ bool output_buffer[BlockSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction used in this operation causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + output_buffer[thread_idx] = ref.contains(key); + } + + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + auto const tile_idx = thread_idx / CGSize; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + auto const found = ref.contains(tile, key); + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { output_buffer[tile_idx] = found; } + } + + block.sync(); + if (idx < n and tile.thread_rank() == 0) { *(output_begin + idx) = output_buffer[tile_idx]; } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl new file mode 100644 index 000000000..0443c1184 --- /dev/null +++ b/include/cuco/detail/static_set/static_set.inl @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_set::static_set( + Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{pred}, + probing_scheme_{probing_scheme}, + allocator_{alloc}, + storage_{make_valid_extent(capacity), allocator_} +{ + storage_.initialize(empty_key_sentinel_, stream); +} + +template +template +static_set::size_type +static_set::insert( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = detail::counter_storage{allocator_}; + counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } else { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } + + return counter.load_to_host(stream); +} + +template +template +void static_set::insert_async( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } else { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } +} + +template +template +void static_set::contains( + InputIt first, InputIt last, OutputIt 
output_begin, cudaStream_t stream) const +{ + contains_async(first, last, output_begin, stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); +} + +template +template +void static_set::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } else { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } +} + +template +static_set::size_type +static_set::size( + cudaStream_t stream) const +{ + auto const begin = thrust::make_transform_iterator( + storage_.data(), + cuco::detail::elements_per_window{empty_key_sentinel_}); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{allocator_}; + auto d_size = reinterpret_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); + cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + auto d_temp_storage = + std::allocator_traits::allocate(temp_allocator, temp_storage_bytes); + + cub::DeviceReduce::Sum( + d_temp_storage, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + size_type h_size; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_size, d_size, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_size), sizeof(size_type)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); + + return h_size; +} + +template 
+constexpr auto +static_set::capacity() + const noexcept +{ + return storage_.capacity(); +} + +template +constexpr static_set::key_type +static_set::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +template +template +auto static_set::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{ + cuco::empty_key(empty_key_sentinel_), predicate_, probing_scheme_, storage_.ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl new file mode 100644 index 000000000..51099243f --- /dev/null +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_set_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_set_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{empty_key_sentinel, predicate}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} +{ +} + +template +__host__ __device__ constexpr auto +static_set_ref::capacity() + const noexcept +{ + return storage_ref_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_set_ref::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +namespace detail { + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + static constexpr auto thread_scope = base_type::thread_scope; + + public: + /** + * @brief Inserts an element. 
+ * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + // TODO: perf gain with #pragma unroll since num_windows is build time constant + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, value); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], value)) { + case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return cuco::pair{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + private: + // TODO: this should be a common enum for all data structures + enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + + /** + * @brief Attempts to insert an element into a slot. 
+ * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value) + { + auto& ref_ = static_cast(*this); + + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + value_type const old = [&]() { + value_type expected = ref_.empty_key_sentinel_.value; + value_type val = value; + if constexpr (sizeof(value_type) == sizeof(uint32_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + if constexpr (sizeof(value_type) == sizeof(uint64_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + }(); + 
if (*slot == old) { + // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare + return ref_.predicate_.equal_to(old, value) == detail::equal_result::EQUAL + ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } else { + return insert_result::SUCCESS; + } + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (ref_.predicate_(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param g The Cooperative Group used to perform group contains + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(cooperative_groups::thread_block_tile const& g, + ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (ref_.predicate_(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (g.any(state == detail::equal_result::EQUAL)) { return true; } + if (g.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh new file mode 100644 index 000000000..316f7fbe5 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Base class of array of slot windows open addressing storage. + * + * This should NOT be used directly. + * + * @tparam WindowSize Number of elements in each window + * @tparam T Element type + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. + */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = cuda::std::array; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + explicit constexpr aow_storage_base(Extent size) : storage_base{size} {} + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr extent_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_base::capacity().template multiply(); + } +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ *
+ * @tparam WindowSize Number of slots in each window
+ * @tparam T Storage element type
+ * @tparam Extent Type of extent denoting storage capacity
+ */
+template <int32_t WindowSize, typename T, typename Extent>
+class aow_storage_ref : public aow_storage_base<WindowSize, T, Extent> {
+ public:
+  using base_type = aow_storage_base<WindowSize, T, Extent>;  ///< AoW base class type
+
+  using base_type::window_size;  ///< Number of elements processed per window
+
+  using extent_type = typename base_type::extent_type;  ///< Storage extent type
+  using size_type = typename base_type::size_type;  ///< Storage size type
+  using value_type = typename base_type::value_type;  ///< Slot type
+  using window_type = typename base_type::window_type;  ///< Slot window type
+
+  using base_type::capacity;
+  using base_type::num_windows;
+
+  /**
+   * @brief Constructor of AoW storage ref.
+   *
+   * @param windows Pointer to the windows array
+   * @param num_windows Number of windows
+   */
+  explicit constexpr aow_storage_ref(Extent num_windows, window_type* windows) noexcept
+    : aow_storage_base<WindowSize, T, Extent>{num_windows}, windows_{windows}
+  {
+  }
+
+  /**
+   * @brief Gets windows array.
+   *
+   * @return Pointer to the first window
+   */
+  [[nodiscard]] __device__ constexpr window_type* data() noexcept { return windows_; }
+
+  /**
+   * @brief Gets windows array.
+   *
+   * @return Pointer to the first window
+   */
+  [[nodiscard]] __device__ constexpr window_type* data() const noexcept { return windows_; }
+
+  /**
+   * @brief Returns an array of slots (or a window) for a given index.
+   *
+   * @param index Index of the window
+   * @return An array of slots
+   */
+  [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept
+  {
+    return *reinterpret_cast<window_type*>(
+      __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size));
+  }
+
+ private:
+  window_type* windows_;  ///< Pointer to the windows array
+};
+
+/**
+ * @brief Array of slot Window open addressing storage class.
+ * + * @tparam WindowSize Number of slots in each window + * @tparam T Slot type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template +class aow_storage : public aow_storage_base { + public: + using base_type = aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + using allocator_type = + typename std::allocator_traits::rebind_alloc; ///< Type of the + ///< allocator to + ///< (de)allocate windows + using window_deleter_type = custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator) + : aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} + { + } + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. 
+ * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept { return windows_.get(); } + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept + { + return ref_type{this->num_windows(), this->data()}; + } + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cudaStream_t stream) noexcept + { + auto constexpr stride = 4; + auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh new file mode 100644 index 000000000..bf87357a3 --- /dev/null +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Device atomic counter storage class. + * + * @tparam SizeType Type of storage size + * @tparam Scope The scope in which the counter will be used by individual threads + * @tparam Allocator Type of allocator used for device storage + */ +template +class counter_storage : public storage_base> { + public: + using storage_base>::capacity_; ///< Storage size + + using size_type = SizeType; ///< Size type + using value_type = cuda::atomic; ///< Type of the counter + using allocator_type = typename std::allocator_traits::rebind_alloc< + value_type>; ///< Type of the allocator to (de)allocate counter + using counter_deleter_type = custom_deleter; ///< Type of counter deleter + + /** + * @brief Constructor of counter storage. + * + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr counter_storage(Allocator const& allocator) + : storage_base>{cuco::experimental::extent{}}, + allocator_{allocator}, + counter_deleter_{capacity_, allocator_}, + counter_{allocator_.allocate(capacity_), counter_deleter_} + { + } + + /** + * @brief Asynchronously resets counter to zero. + * + * @param stream CUDA stream used to reset + */ + void reset(cudaStream_t stream) + { + static_assert(sizeof(size_type) == sizeof(value_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + } + + /** + * @brief Gets device atomic counter pointer. 
+ * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() noexcept { return counter_.get(); } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return counter_.get(); } + + /** + * @brief Atomically obtains the value of the device atomic counter and copies it to the host. + * + * @note This API synchronizes the given `stream`. + * + * @param stream CUDA stream used to copy device value to the host + * @return Value of the atomic counter + */ + [[nodiscard]] constexpr size_type load_to_host(cudaStream_t stream) const + { + size_type h_count; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + return h_count; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate counter + counter_deleter_type counter_deleter_; ///< Custom counter deleter + std::unique_ptr counter_; ///< Pointer to counter storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh new file mode 100644 index 000000000..546c58daa --- /dev/null +++ b/include/cuco/detail/storage/kernels.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Initializes each slot in the window storage to contain `value`. + * + * @tparam WindowT Window type + * + * @param windows Pointer to flat storage for windows + * @param n Number of input windows + * @param value Value to which all values in `slots` are initialized + */ +template +__global__ void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) +{ + cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; + cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + auto& window_slots = *(windows + idx); +#pragma unroll + for (auto& slot : window_slots) { + slot = value; + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh new file mode 100644 index 000000000..b4fc86890 --- /dev/null +++ b/include/cuco/detail/storage/storage.cuh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Intermediate class internally used by data structures + * + * @tparam StorageImpl Storage implementation class + * @tparam T Storage element type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage + */ +template +class storage : StorageImpl::template impl { + public: + /// Storage implementation type + using impl_type = typename StorageImpl::template impl; + using ref_type = typename impl_type::ref_type; ///< Storage ref type + using value_type = typename impl_type::value_type; ///< Storage value type + + /// Number of elements per window + static constexpr int window_size = impl_type::window_size; + + using impl_type::capacity; + using impl_type::data; + using impl_type::initialize; + using impl_type::num_windows; + using impl_type::ref; + + /** + * @brief Constructs storage. + * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr storage(Extent size, Allocator const& allocator) : impl_type{size, allocator} + { + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh new file mode 100644 index 000000000..dec443dce --- /dev/null +++ b/include/cuco/detail/storage/storage_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Custom deleter for unique pointer. + * + * @tparam Allocator Type of allocator used for device storage + */ +template +struct custom_deleter { + using pointer = typename Allocator::value_type*; ///< Value pointer type + + /** + * @brief Constructor of custom deleter. + * + * @param size Number of values to deallocate + * @param allocator Allocator used for deallocating device storage + */ + explicit constexpr custom_deleter(std::size_t size, Allocator& allocator) + : size_{size}, allocator_{allocator} + { + } + + /** + * @brief Operator for deallocation + * + * @param ptr Pointer to the first value for deallocation + */ + void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } + + std::size_t size_; ///< Number of values to delete + Allocator& allocator_; ///< Allocator used deallocating values +}; + +/** + * @brief Base class of open addressing storage. + * + * This class should not be used directly. + * + * @tparam Extent Type of extent denoting storage capacity + */ +template +class storage_base { + public: + using extent_type = Extent; ///< Storage extent type + using size_type = typename extent_type::value_type; ///< Storage size type + + /** + * @brief Constructor of base storage. + * + * @param size Number of elements to (de)allocate + */ + explicit constexpr storage_base(Extent size) : capacity_{size} {} + + /** + * @brief Gets the total number of elements in the current storage. 
+ * + * @return The total number of elements + */ + [[nodiscard]] __host__ __device__ constexpr extent_type capacity() const noexcept + { + return capacity_; + } + + protected: + extent_type capacity_; ///< Total number of elements +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/tuning.cuh b/include/cuco/detail/tuning.cuh new file mode 100644 index 000000000..035b60cc5 --- /dev/null +++ b/include/cuco/detail/tuning.cuh @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +static constexpr int CUCO_DEFAULT_BLOCK_SIZE = 128; +static constexpr int CUCO_DEFAULT_STRIDE = 1; + +} // namespace detail +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 3aadbb848..ae55f7830 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ #pragma once +#include + #include namespace cuco { @@ -59,7 +61,7 @@ struct slot_to_tuple { */ template struct slot_is_filled { - Key empty_key_sentinel; ///< The value of the empty key sentinel + Key empty_key_sentinel_; ///< The value of the empty key sentinel /** * @brief Indicates if the target slot `s` is filled. @@ -72,7 +74,35 @@ struct slot_is_filled { template __device__ bool operator()(S const& s) { - return thrust::get<0>(s) != empty_key_sentinel; + return not cuco::detail::bitwise_compare(thrust::get<0>(s), empty_key_sentinel_); + } +}; + +/** + * @brief Device functor returning the number of filled elements per window. + * + * @tparam Sentinel Empty sentinel type + */ +template +struct elements_per_window { + Sentinel empty_key_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Computes the number of filled elements per window. + * + * @tparam Window Window storage type + * + * @param window The window to query + * @return Number of filled elements per window + */ + template + __device__ inline int32_t operator()(Window const& window) const + { + int32_t num = 0; + for (auto const& element : window) { + num += not cuco::detail::bitwise_compare(element, empty_key_sentinel_); + } + return num; } }; diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index d06216c54..513ccd559 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -23,6 +23,8 @@ namespace cuco { namespace detail { +using index_type = int64_t; ///< index type for internal use + /** * @brief Compute the number of bits of a simple type. 
* @@ -56,13 +58,50 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm } template -constexpr inline int64_t distance(Iterator begin, Iterator end) +constexpr inline index_type distance(Iterator begin, Iterator end) { using category = typename std::iterator_traits::iterator_category; static_assert(std::is_base_of_v, "Input iterator should be a random access iterator."); // `int64_t` instead of arch-dependant `long int` - return static_cast(std::distance(begin, end)); + return static_cast(std::distance(begin, end)); +} + +/** + * @brief C++17 constexpr backport of `std::lower_bound`. + * + * @tparam ForwardIt Type of input iterator + * @tparam T Type of `value` + * + * @param first Iterator defining the start of the range to examine + * @param last Iterator defining the start of the range to examine + * @param value Value to compare the elements to + * + * @return Iterator pointing to the first element in the range [first, last) that does not satisfy + * element < value + */ +template +constexpr ForwardIt lower_bound(ForwardIt first, ForwardIt last, const T& value) +{ + using diff_type = typename std::iterator_traits::difference_type; + + ForwardIt it{}; + diff_type count = std::distance(first, last); + diff_type step{}; + + while (count > 0) { + it = first; + step = count / 2; + std::advance(it, step); + + if (static_cast(*it) < value) { + first = ++it; + count -= step + 1; + } else + count = step; + } + + return first; } } // namespace detail diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh new file mode 100644 index 000000000..b825188ed --- /dev/null +++ b/include/cuco/extent.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cuco { +namespace experimental { +static constexpr std::size_t dynamic_extent = static_cast(-1); + +/** + * @brief Static extent class. + * + * @tparam SizeType Size type + * @tparam N Extent + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + constexpr extent() = default; + + /// Constructs from `SizeType` + __host__ __device__ constexpr explicit extent(SizeType) noexcept {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return N; } + + /** + * @brief Multiplies the current extent with the given `Value`. + * + * @tparam Value The input value to multiply with + * + * @return Resulting static extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{}; + } +}; + +/** + * @brief Dynamic extent class. + * + * @tparam SizeType Size type + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + /** + * @brief Constructs extent from a given `size`. + * + * @param size The extent size + */ + __host__ __device__ constexpr extent(SizeType size) noexcept : value_{size} {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return value_; } + + /** + * @brief Multiplies the current extent with the given `Value`. 
+ * + * @tparam Value The input value to multiply with + * + * @return Resulting extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{Value * value_}; + } + + private: + value_type value_; ///< Extent value +}; + +/** + * @brief Computes valid extent based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the input size of the ref. + * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * @tparam N Extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_valid_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = SDIV(ext, CGSize * WindowSize); + if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return extent(*cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp new file mode 100644 index 000000000..abf66e0bd --- /dev/null +++ b/include/cuco/operator.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { +inline namespace op { +// TODO enum class of int32_t instead of struct +// https://github.com/NVIDIA/cuCollections/issues/239 +/** + * @brief `insert` operator tag + */ +struct insert_tag { +} inline constexpr insert; + +/** + * @brief `contains` operator tag + */ +struct contains_tag { +} inline constexpr contains; + +} // namespace op +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh new file mode 100644 index 000000000..0880ee97b --- /dev/null +++ b/include/cuco/probing_scheme.cuh @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public linear probing scheme class. + * + * @note Linear probing is efficient when few collisions are present, e.g., low occupancy or low + * multiplicity. + * + * @note `Hash` should be callable object type. + * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + */ +template +class linear_probing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs linear probing scheme with the hasher callable. 
+ * + * @param hash Hasher + */ + __host__ __device__ constexpr linear_probing(Hash const& hash = {}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash hash_; +}; + +/** + * @brief Public double hashing scheme class. + * + * @note Default probing scheme for cuco data structures. It shows superior performance over linear + * probing especially when dealing with high multiplicty and/or high occupancy use cases. + * + * @note `Hash1` and `Hash2` should be callable object type. + * + * @note `Hash2` needs to be able to construct from an integer value to avoid secondary clustering. + * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +class double_hashing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs double hashing probing scheme with the two hasher callables. 
+ * + * @param hash1 First hasher + * @param hash2 Second hasher + */ + __host__ __device__ constexpr double_hashing(Hash1 const& hash1 = {}, Hash2 const& hash2 = {1}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash1 hash1_; + Hash2 hash2_; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh new file mode 100644 index 000000000..1e6a3f589 --- /dev/null +++ b/include/cuco/static_set.cuh @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include +#include + +namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of unique keys. + * + * The `static_set` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking + * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` + * should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. 
+ * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = experimental::double_hashing<1, // CG size + cuco::murmurhash3_32, + cuco::murmurhash3_32>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::aow_storage<2>> +class static_set { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used to for probing + static constexpr auto window_size = Storage::window_size; ///< Window size used to for probing + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + + using key_type = Key; ///< Key type + using value_type = Key; ///< Key type + /// Extent type + using extent_type = 
decltype(make_valid_extent(std::declval())); + using size_type = typename extent_type::value_type; ///< Size type + using key_equal = KeyEqual; ///< Key equality comparator type + using allocator_type = Allocator; ///< Allocator type + using storage_type = + detail::storage; ///< Storage type + + using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type + using probing_scheme_type = ProbingScheme; ///< Probe scheme type + template + using ref_type = + cuco::experimental::static_set_ref; ///< Non-owning container ref type + + static_set(static_set const&) = delete; + static_set& operator=(static_set const&) = delete; + + static_set(static_set&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_set& operator=(static_set&&) = default; + ~static_set() = default; + + /** + * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * automatically grow the set. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert + * this sentinel value. 
+ * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_set(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cudaStream_t stream = nullptr); + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Asynchonously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the set. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. 
+ * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_set` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + key_type empty_key_sentinel_; ///< Key value that represents an empty slot + key_equal predicate_; ///< Key equality binary predicate + probing_scheme_type probing_scheme_; ///< Probing scheme + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + storage_type storage_; ///< Slot window storage +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh new file mode 100644 index 000000000..7c5ae13dc --- /dev/null +++ b/include/cuco/static_set_ref.cuh @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. 
+ * @note cuCollections data stuctures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_set_ref + : public detail::operator_impl< + Operators, + static_set_ref>... 
{ + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + static constexpr auto thread_scope = Scope; ///< Thread scope + + /** + * @brief Constructs static_set_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_set_ref( + cuco::empty_key empty_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. 
+ * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + private: + cuco::empty_key empty_key_sentinel_; ///< Empty key sentinel + detail::equal_wrapper predicate_; ///< Key equality binary callable + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_ref_type storage_ref_; ///< Slot storage ref + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh new file mode 100644 index 000000000..969b49f37 --- /dev/null +++ b/include/cuco/storage.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public Array of slot Windows storage class. + * + * The window size defines the workload granularity for each CUDA thread, i.e., how many slots a + * thread would concurrently operate on when performing modify or lookup operations. 
cuCollections + * uses the AoW storage to supersede the raw flat slot storage due to its superior granularity + * control: When window size equals one, AoW performs the same as the flat storage. If the + * underlying operation is more memory bandwidth bound, e.g., high occupancy multimap operations, a + * larger window size can reduce the length of probing sequences thus improve runtime performance. + * + * @tparam WindowSize Number of elements per window storage + */ +template +class aow_storage { + public: + /// Number of elements per window storage + static constexpr int32_t window_size = WindowSize; + + /// Type of implementation details + template + using impl = detail::aow_storage; +}; + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp index 445a40daf..78e8dabcb 100644 --- a/include/cuco/utility/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,4 +58,10 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; }; \ } +template +inline constexpr bool dependent_bool_value = value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 52c4cd9db..16f6abacf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,13 +40,28 @@ function(ConfigureTest TEST_NAME) RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - catch_discover_tests(${TEST_NAME}) + catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### +################################################################################################### +# - utility tests --------------------------------------------------------------------------------- +ConfigureTest(UTILITY_TEST + utility/extent_test.cu + utility/storage_test.cu) + +################################################################################################### +# - static_set tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu + static_set/heterogeneous_lookup_test.cu + static_set/large_input_test.cu + static_set/size_test.cu + static_set/unique_sequence_test.cu) + ################################################################################################### # - static_map tests 
------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..e4d3e146a --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + constexpr std::size_t num_keys{400}; + using Key = int32_t; + using ProbeT = + cuco::experimental::double_hashing<1, cuco::murmurhash3_32, cuco::murmurhash3_32>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::aow_storage<2>; + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = 
cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set{ + extent_type{}, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + REQUIRE(set.capacity() == gold_capacity); + + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + AllocatorT, + StorageT>{ + num_keys, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu new file mode 100644 index 000000000..cbc0efac3 --- /dev/null +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// insert key type +template +struct key_pair { + T a; + T b; + + __host__ __device__ key_pair() {} + __host__ __device__ key_pair(T x) : a{x}, b{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_pair const& other) const { return a == other.a and b == other.b; } +}; + +// probe key type +template +struct key_triplet { + T a; + T b; + T c; + + __host__ __device__ key_triplet() {} + __host__ __device__ key_triplet(T x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_triplet const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined device hasher +struct custom_hasher { + template + __device__ uint32_t operator()(CustomKey const& k) const + { + return thrust::raw_reference_cast(k).a; + }; +}; + +// User-defined device key equality +struct custom_key_equal { + template + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const + { + return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Heterogeneous lookup", "", ((typename T, int CGSize), T, 
CGSize), (int32_t, 1), (int32_t, 2)) +{ + using Key = key_pair; + using ProbeKey = key_triplet; + using probe_type = cuco::experimental::double_hashing; + + auto const sentinel_key = Key{-1}; + + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; + auto const probe = probe_type{custom_hasher{}, custom_hasher{}}; + auto my_set = cuco::experimental::static_set, + cuda::thread_scope_device, + custom_key_equal, + probe_type>{ + capacity, cuco::empty_key{sentinel_key}, custom_key_equal{}, probe}; + + auto insert_pairs = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return Key{i}; }); + auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return ProbeKey(i); }); + + SECTION("All inserted keys should be contained") + { + thrust::device_vector contained(num); + my_set.insert(insert_pairs, insert_pairs + num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys should not be contained") + { + thrust::device_vector contained(num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), thrust::identity{})); + } +} diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu new file mode 100644 index 000000000..6f4e5803b --- /dev/null +++ b/tests/static_set/large_input_test.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = thrust::counting_iterator(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::none_of(res_begin, res_begin + num_keys, thrust::identity{})); + } + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Large input", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + + try { + auto set = cuco::experimental:: + static_set, probe>{ + num_keys * 2, + cuco::empty_key{-1}, + 
thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + thrust::device_vector d_contained(num_keys); + test_unique_sequence(set, d_contained.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_set/size_test.cu b/tests/static_set/size_test.cu new file mode 100644 index 000000000..05b1f4f8b --- /dev/null +++ b/tests/static_set/size_test.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_keys{400}; + + cuco::experimental::static_set set{cuco::experimental::extent{400}, + cuco::empty_key{-1}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto const num_successes = set.insert(d_keys.begin(), d_keys.end()); + + auto const size = set.size(); + + REQUIRE(size == num_keys); + REQUIRE(num_successes == num_keys); +} diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu new file mode 100644 index 000000000..c2eab7eff --- /dev/null +++ b/tests/static_set/unique_sequence_test.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto key_begin = d_keys.begin(); + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + set.insert(key_begin, key_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 + ; + + using extent_type = cuco::experimental::extent; + + if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { + using probe = cuco::experimental::linear_probing>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } + + if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } +} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu new file mode 100644 index 000000000..2623a8ae5 --- /dev/null +++ b/tests/utility/extent_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEMPLATE_TEST_CASE_SIG( + "Extent tests", "", ((typename SizeType), SizeType), (int32_t), (int64_t), (std::size_t)) +{ + SizeType constexpr num = 1234; + SizeType constexpr gold_reference = 314; // 157 x 2 + auto constexpr cg_size = 2; + auto constexpr window_size = 4; + + SECTION("Static extent must be evaluated at compile time.") + { + auto const size = cuco::experimental::extent{}; + STATIC_REQUIRE(num == size); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto const size = cuco::experimental::extent(num); + REQUIRE(size == num); + } + + SECTION("Compute static valid extent at compile time.") + { + auto constexpr size = cuco::experimental::extent{}; + auto constexpr res = cuco::experimental::make_valid_extent(size); + STATIC_REQUIRE(gold_reference == res); + } + + SECTION("Compute dynamic valid extent at run time.") + { + auto const size = cuco::experimental::extent{num}; + auto const res = cuco::experimental::make_valid_extent(size); + REQUIRE(gold_reference == res); + } +} diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu new file mode 100644 index 000000000..c82b5ab44 --- /dev/null +++ b/tests/utility/storage_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Storage tests", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int64_t)) +{ + constexpr std::size_t size{1'000}; + constexpr int window_size{2}; + constexpr std::size_t gold_capacity{2'000}; + + using allocator_type = cuco::cuda_allocator; + auto allocator = allocator_type{}; + + SECTION("Allocate array of pairs with AoS storage.") + { + auto s = cuco::experimental::detail::aow_storage, + cuco::experimental::extent, + allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of pairs with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail:: + aow_storage, extent_type, allocator_type>(extent_type{}, + allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage.") + { + auto s = cuco::experimental::detail:: + aow_storage, allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail::aow_storage( + extent_type{}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } +} From 8be28f654a517838063d53df2d861a8d34d70eb9 Mon Sep 
17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Apr 2023 14:01:31 -0700 Subject: [PATCH 100/152] Update the default parameters for `static_set` (#292) This PR sets the default CG size to 4 and the default window size to 1 for better runtime performance. It also falls back the `size()` implementation to use a custom kernel instead of `cub::DeviceReduce::Sum` since the latter doesn't support operations for more than 2^31 elements for now. --- include/cuco/detail/static_set/kernels.cuh | 51 ++++++++++++++++--- include/cuco/detail/static_set/static_set.inl | 40 +++++---------- include/cuco/static_set.cuh | 4 +- tests/static_set/unique_sequence_test.cu | 30 ++++++----- 4 files changed, 77 insertions(+), 48 deletions(-) diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index e7d52ae27..9518ba23e 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -43,7 +43,7 @@ namespace detail { * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param num_successes Number of successful inserted elements - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void insert(InputIterator first, @@ -85,7 +85,7 @@ __global__ void insert(InputIterator first, * * @param first Beginning of the sequence of input elements * @param n Number of input elements - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) @@ -117,7 +117,7 @@ __global__ void insert_async(InputIterator first, cuco::detail::index_type n, Re * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param num_successes Number of successful 
inserted elements - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void insert(InputIterator first, @@ -163,7 +163,7 @@ __global__ void insert(InputIterator first, * * @param first Beginning of the sequence of input elements * @param n Number of input elements - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) @@ -196,7 +196,7 @@ __global__ void insert_async(InputIterator first, cuco::detail::index_type n, Re * @param first Beginning of the sequence of keys * @param n Number of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) @@ -245,7 +245,7 @@ __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt out * @param first Beginning of the sequence of keys * @param n Number of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param ref Non-owing set device ref used to access the slot storage + * @param ref Non-owning set device ref used to access the slot storage */ template __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) @@ -282,6 +282,45 @@ __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt out } } +/** + * @brief Calculates the number of filled slots for the given window storage. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam StorageRef Type of non-owning ref allowing access to storage + * @tparam AtomicT Atomic counter type + * + * @param storage Non-owning device ref used to access the slot storage + * @param empty_sentinel Sentinel indicating empty slots + * @param count Number of filled slots + */ +template +__global__ void size(StorageRef storage, + typename StorageRef::value_type empty_sentinel, + AtomicT* count) +{ + using size_type = typename StorageRef::size_type; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + size_type thread_count = 0; + auto const n = storage.num_windows(); + + while (idx < n) { + auto const window = storage[idx]; +#pragma unroll + for (auto const& it : window) { + thread_count += static_cast(not cuco::detail::bitwise_compare(it, empty_sentinel)); + } + idx += loop_stride; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + size_type const block_count = BlockReduce(temp_storage).Sum(thread_count); + if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } +} + } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 0443c1184..03db92343 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -172,34 +172,20 @@ static_set::siz static_set::size( cudaStream_t stream) const { - auto const begin = thrust::make_transform_iterator( - storage_.data(), - cuco::detail::elements_per_window{empty_key_sentinel_}); - - std::size_t temp_storage_bytes = 0; - using temp_allocator_type = typename std::allocator_traits::rebind_alloc; - auto temp_allocator = temp_allocator_type{allocator_}; - auto d_size = reinterpret_cast( - 
std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); - cub::DeviceReduce::Sum( - nullptr, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); - - auto d_temp_storage = - std::allocator_traits::allocate(temp_allocator, temp_storage_bytes); - - cub::DeviceReduce::Sum( - d_temp_storage, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); - - size_type h_size; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_size, d_size, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - std::allocator_traits::deallocate( - temp_allocator, reinterpret_cast(d_size), sizeof(size_type)); - std::allocator_traits::deallocate( - temp_allocator, d_temp_storage, temp_storage_bytes); + auto counter = detail::counter_storage{allocator_}; + counter.reset(stream); + + auto const grid_size = + (storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - return h_size; + // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to + // v2.1.0 + detail::size + <<>>( + storage_.ref(), this->empty_key_sentinel(), counter.data()); + + return counter.load_to_host(stream); } template , cuda::thread_scope Scope = cuda::thread_scope_device, class KeyEqual = thrust::equal_to, - class ProbingScheme = experimental::double_hashing<1, // CG size + class ProbingScheme = experimental::double_hashing<4, // CG size cuco::murmurhash3_32, cuco::murmurhash3_32>, class Allocator = cuco::cuda_allocator, - class Storage = cuco::experimental::aow_storage<2>> + class Storage = cuco::experimental::aow_storage<1>> class static_set { static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index c2eab7eff..d927e645f 100644 --- a/tests/static_set/unique_sequence_test.cu +++ 
b/tests/static_set/unique_sequence_test.cu @@ -77,16 +77,19 @@ TEMPLATE_TEST_CASE_SIG( : 412 // 103 x 2 x 2 ; - using extent_type = cuco::experimental::extent; + using extent_type = cuco::experimental::extent; + using allocator_type = cuco::cuda_allocator; + using storage_type = cuco::experimental::aow_storage<2>; if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { using probe = cuco::experimental::linear_probing>; - auto set = cuco::experimental:: - static_set, probe>{ - num_keys, - cuco::empty_key{-1}, - thrust::equal_to{}, - probe{cuco::murmurhash3_32{}}}; + auto set = cuco::experimental::static_set, + probe, + allocator_type, + storage_type>{num_keys, cuco::empty_key{-1}}; REQUIRE(set.capacity() == gold_capacity); @@ -96,12 +99,13 @@ TEMPLATE_TEST_CASE_SIG( if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { using probe = cuco::experimental:: double_hashing, cuco::murmurhash3_32>; - auto set = cuco::experimental:: - static_set, probe>{ - num_keys, - cuco::empty_key{-1}, - thrust::equal_to{}, - probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + auto set = cuco::experimental::static_set, + probe, + allocator_type, + storage_type>{num_keys, cuco::empty_key{-1}}; REQUIRE(set.capacity() == gold_capacity); From 002d1baf2052b73ec2b4a9880730b82a0da39814 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 14 Apr 2023 08:46:39 -0700 Subject: [PATCH 101/152] Add `static_set::retrieve_all` (#291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds the `static_set::retrieve_all` function. It also changes the default CG size to 4 and the default window size to 1 for slightly better performance. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- benchmarks/CMakeLists.txt | 1 + .../static_set/retrieve_all_bench.cu | 63 ++++++++++ include/cuco/detail/static_set/functors.cuh | 86 +++++++++++++ include/cuco/detail/static_set/static_set.inl | 49 ++++++++ include/cuco/detail/utils.cuh | 28 ----- include/cuco/static_set.cuh | 20 +++ tests/CMakeLists.txt | 1 + tests/static_set/retrieve_all_test.cu | 116 ++++++++++++++++++ 8 files changed, 336 insertions(+), 28 deletions(-) create mode 100644 benchmarks/hash_table/static_set/retrieve_all_bench.cu create mode 100644 include/cuco/detail/static_set/functors.cuh create mode 100644 tests/static_set/retrieve_all_test.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 750c9be86..6e2c7001f 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -51,6 +51,7 @@ endfunction(ConfigureBench) ConfigureBench(STATIC_SET_BENCH hash_table/static_set/contains_bench.cu hash_table/static_set/insert_bench.cu + hash_table/static_set/retrieve_all_bench.cu hash_table/static_set/size_bench.cu) ################################################################################################### diff --git a/benchmarks/hash_table/static_set/retrieve_all_bench.cu b/benchmarks/hash_table/static_set/retrieve_all_bench.cu new file mode 100644 index 000000000..fb52b251b --- /dev/null +++ b/benchmarks/hash_table/static_set/retrieve_all_bench.cu @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::retrieve_all` performance + */ +template +void static_set_retrieve_all(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto end = set.retrieve_all(result.begin(), launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(static_set_retrieve_all, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_retrieve_all_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/include/cuco/detail/static_set/functors.cuh b/include/cuco/detail/static_set/functors.cuh new file mode 100644 index 000000000..52375f225 --- /dev/null +++ b/include/cuco/detail/static_set/functors.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Device functor returning the content of the slot indexed by `idx`. + * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + get_slot(StorageRef s) : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. + * + * @param idx The slot index + * @return The slot content + */ + __device__ typename StorageRef::value_type operator()(typename StorageRef::size_type idx) const + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + return storage_[window_idx][intra_idx]; + } +}; + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot content type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + slot_is_filled(T s) : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. 
+ * + * @tparam T Slot content type + * + * @param slot The slot + * @return `true` if slot is filled + */ + __device__ bool operator()(T slot) const + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 03db92343..2e81fb847 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,9 +24,11 @@ #include #include +#include #include #include +#include #include @@ -161,6 +164,52 @@ void static_set } } +template +template +OutputIt static_set::retrieve_all( + OutputIt output_begin, cudaStream_t stream) const +{ + auto begin = thrust::make_transform_iterator(thrust::counting_iterator(0), + detail::get_slot(storage_.ref())); + auto filled = detail::slot_is_filled(empty_key_sentinel_); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{allocator_}; + auto d_num_out = reinterpret_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); + CUCO_CUDA_TRY(cub::DeviceSelect::If( + nullptr, temp_storage_bytes, begin, output_begin, d_num_out, capacity(), filled, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + begin, + output_begin, + d_num_out, + capacity(), + filled, + stream)); + + size_type h_num_out; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); + 
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + return output_begin + h_num_out; +} + template -struct elements_per_window { - Sentinel empty_key_sentinel_; ///< The value of the empty key sentinel - - /** - * @brief Computes the number of filled elements per window. - * - * @tparam Window Window storage type - * - * @param window The window to query - * @return Number of filled elements per window - */ - template - __device__ inline int32_t operator()(Window const& window) const - { - int32_t num = 0; - for (auto const& element : window) { - num += not cuco::detail::bitwise_compare(element, empty_key_sentinel_); - } - return num; - } -}; - /** * @brief A strong type wrapper. * diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 5653e6041..c83a76c2e 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -239,6 +239,26 @@ class static_set { OutputIt output_begin, cudaStream_t stream = nullptr) const; + /** + * @brief Retrieves all keys contained in the set. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `keys_out` is smaller than the return + * value of `size()`. + * + * @tparam OutputIt Device accessible random access output iterator whose `value_type` is + * convertible from the container's `key_type`. + * + * @param output_begin Beginning output iterator for keys + * @param stream CUDA stream used for this operation + * + * @return Iterator indicating the end of the output + */ + template + [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cudaStream_t stream = nullptr) const; + /** * @brief Gets the number of elements in the container. 
* diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16f6abacf..4cbd43d22 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -59,6 +59,7 @@ ConfigureTest(STATIC_SET_TEST static_set/capacity_test.cu static_set/heterogeneous_lookup_test.cu static_set/large_input_test.cu + static_set/retrieve_all_test.cu static_set/size_test.cu static_set/unique_sequence_test.cu) diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu new file mode 100644 index 000000000..45fa5e56b --- /dev/null +++ b/tests/static_set/retrieve_all_test.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto key_begin = d_keys.begin(); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + auto key_end = set.retrieve_all(key_begin); + REQUIRE(std::distance(key_begin, key_end) == 0); + } + + set.insert(key_begin, key_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + thrust::device_vector d_res(num_keys); + auto d_res_end = set.retrieve_all(d_res.begin()); + thrust::sort(thrust::device, d_res.begin(), d_res_end); + REQUIRE(cuco::test::equal( + d_res.begin(), d_res_end, thrust::counting_iterator(0), thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Retrieve all", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 
409 // 409 x 1 x 1 + : 422 // 211 x 2 x 1 + ; + + using extent_type = cuco::experimental::extent; + using allocator_type = cuco::cuda_allocator; + using storage_type = cuco::experimental::aow_storage<1>; + + if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { + using probe = cuco::experimental::linear_probing>; + auto set = cuco::experimental::static_set, + probe, + allocator_type, + storage_type>{num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } + + if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + auto set = cuco::experimental::static_set, + probe, + allocator_type, + storage_type>{num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } +} From 5cdcfc39f5ad49b15a12124b213abe59078e0a9e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Apr 2023 11:45:53 -0700 Subject: [PATCH 102/152] Add set conditional insert APIs (#293) This PR adds set `insert_if` and `insert_if_async` APIs. It also includes two improvements for the existing implementation: - Uses `insert_if_n` to unify all kernel implementations of the insert - Distinct CG and non-CG code paths at the kernel level The performance regression of the unified kernel is about 0.1% compared to the dedicated kernels thus considered negligible. 
--------- Co-authored-by: Jake Hemstad --- include/cuco/detail/static_set/kernels.cuh | 143 +++++++----------- include/cuco/detail/static_set/static_set.inl | 78 ++++++++-- include/cuco/static_set.cuh | 52 +++++++ tests/static_set/unique_sequence_test.cu | 24 ++- 4 files changed, 188 insertions(+), 109 deletions(-) diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index 9518ba23e..e0fa30d7d 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -27,79 +27,6 @@ namespace cuco { namespace experimental { namespace detail { -/** - * @brief Inserts all elements in the range `[first, first + n)` and returns the number of - * successful insertions. - * - * If multiple elements in `[first, first + size)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam BlockSize Number of threads in each block - * @tparam InputIterator Device accessible input iterator whose `value_type` is - * convertible to the `value_type` of the data structure - * @tparam AtomicT Atomic counter type - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of input elements - * @param n Number of input elements - * @param num_successes Number of successful inserted elements - * @param ref Non-owning set device ref used to access the slot storage - */ -template -__global__ void insert(InputIterator first, - cuco::detail::index_type n, - AtomicT* num_successes, - Ref ref) -{ - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - typename Ref::size_type thread_num_successes = 0; - - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; - cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; - - while (idx < n) { - typename Ref::value_type const insert_pair{*(first + idx)}; - if (ref.insert(insert_pair)) { thread_num_successes++; }; - idx 
+= loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Inserts all elements in the range `[first, first + n)`. - * - * If multiple elements in `[first, first + n)` compare equal, it is unspecified which - * element is inserted. - * - * @tparam BlockSize Number of threads in each block - * @tparam InputIterator Device accessible input iterator whose `value_type` is - * convertible to the `value_type` of the data structure - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of input elements - * @param n Number of input elements - * @param ref Non-owning set device ref used to access the slot storage - */ -template -__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) -{ - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; - cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; - - while (idx < n) { - typename Ref::value_type const insert_pair{*(first + idx)}; - ref.insert(insert_pair); - idx += loop_stride; - } -} - /** * @brief Inserts all elements in the range `[first, first + n)` and returns the number of * successful insertions. 
@@ -111,33 +38,52 @@ __global__ void insert_async(InputIterator first, cuco::detail::index_type n, Re * @tparam BlockSize Number of threads in each block * @tparam InputIterator Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` * @tparam AtomicT Atomic counter type * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[s, s + n)` * @param num_successes Number of successful inserted elements * @param ref Non-owning set device ref used to access the slot storage */ -template -__global__ void insert(InputIterator first, - cuco::detail::index_type n, - AtomicT* num_successes, - Ref ref) +template +__global__ void insert_if_n(InputIterator first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + AtomicT* num_successes, + Ref ref) { - namespace cg = cooperative_groups; - using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; typename Ref::size_type thread_num_successes = 0; - auto const tile = cg::tiled_partition(cg::this_thread_block()); cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; while (idx < n) { - typename Ref::value_type const insert_pair{*(first + idx)}; - if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + if (pred(*(stencil + idx))) { + typename Ref::value_type const 
insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + if (ref.insert(insert_pair)) { thread_num_successes++; }; + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + } + } idx += loop_stride; } @@ -159,24 +105,41 @@ __global__ void insert(InputIterator first, * @tparam BlockSize Number of threads in each block * @tparam InputIterator Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[s, s + n)` * @param ref Non-owning set device ref used to access the slot storage */ -template -__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +template +__global__ void insert_if_n( + InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) { - namespace cg = cooperative_groups; - - auto tile = cg::tiled_partition(cg::this_thread_block()); cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; while (idx < n) { - typename Ref::value_type const insert_pair{*(first + idx)}; - ref.insert(tile, insert_pair); + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if constexpr (CGSize 
== 1) { + ref.insert(insert_pair); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert(tile, insert_pair); + } + } idx += loop_stride; } } diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 2e81fb847..324387b0e 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -24,6 +24,8 @@ #include #include +#include +#include #include #include @@ -80,15 +82,10 @@ static_set::ins (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - if constexpr (cg_size == 1) { - detail::insert - <<>>( - first, num_keys, counter.data(), ref(op::insert)); - } else { - detail::insert - <<>>( - first, num_keys, counter.data(), ref(op::insert)); - } + auto const always_true = thrust::constant_iterator{true}; + detail::insert_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, counter.data(), ref(op::insert)); return counter.load_to_host(stream); } @@ -111,13 +108,62 @@ void static_set (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - if constexpr (cg_size == 1) { - detail::insert_async - <<>>(first, num_keys, ref(op::insert)); - } else { - detail::insert_async - <<>>(first, num_keys, ref(op::insert)); - } + auto const always_true = thrust::constant_iterator{true}; + detail::insert_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, ref(op::insert)); +} + +template +template +static_set::size_type +static_set::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = detail::counter_storage{allocator_}; + 
counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::insert_if_n + <<>>( + first, num_keys, stencil, pred, counter.data(), ref(op::insert)); + + return counter.load_to_host(stream); +} + +template +template +void static_set::insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::insert_if_n + <<>>( + first, num_keys, stencil, pred, ref(op::insert)); } template void insert_async(InputIt first, InputIt last, cudaStream_t stream = nullptr); + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successfully inserted keys + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = nullptr); + + /** + * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = nullptr); + /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. 
* diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index d927e645f..6ea4a03e3 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -49,14 +49,32 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); } - set.insert(key_begin, key_begin + num_keys); - REQUIRE(set.size() == num_keys); - SECTION("All inserted key/value pairs should be contained.") { + set.insert(key_begin, key_begin + num_keys); + REQUIRE(set.size() == num_keys); + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = set.insert_if(key_begin, + key_begin + num_keys, + thrust::counting_iterator(0), + [] __device__(auto const& key) { return (key % 2) == 0; }); + REQUIRE(inserted == num_keys / 2); + REQUIRE(set.size() == num_keys / 2); + + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } } TEMPLATE_TEST_CASE_SIG( From 1b6dd07a29364aa0867a8b3d903beaad493e7da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 25 Apr 2023 22:28:34 +0200 Subject: [PATCH 103/152] Introduce cuco::cuda_stream_ref (#296) This PR introduces a non-owning wrapper for `cudaStream_t` coined `cuco::cuda_stream_ref` and primarily borrows from [RMM's implementation](https://github.com/rapidsai/rmm/blob/528b283cacdc312ef99052644a0bb33e07338836/include/rmm/cuda_stream_view.hpp). 
--- .../hash_table/static_set/contains_bench.cu | 2 +- .../hash_table/static_set/insert_bench.cu | 4 +- .../static_set/retrieve_all_bench.cu | 2 +- .../hash_table/static_set/size_bench.cu | 2 +- include/cuco/cuda_stream_ref.hpp | 142 ++++++++++++++++++ include/cuco/detail/cuda_stream_ref.inl | 50 ++++++ include/cuco/detail/static_set/static_set.inl | 23 +-- include/cuco/detail/storage/aow_storage.cuh | 3 +- .../cuco/detail/storage/counter_storage.cuh | 7 +- include/cuco/static_set.cuh | 19 +-- 10 files changed, 225 insertions(+), 29 deletions(-) create mode 100644 include/cuco/cuda_stream_ref.hpp create mode 100644 include/cuco/detail/cuda_stream_ref.inl diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu index b0c0f34f4..697b98574 100644 --- a/benchmarks/hash_table/static_set/contains_bench.cu +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -54,7 +54,7 @@ void static_set_contains(nvbench::state& state, nvbench::type_list) state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream()); + set.contains(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); }); } diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu index cb5dcf1f8..48bc37fa4 100644 --- a/benchmarks/hash_table/static_set/insert_bench.cu +++ b/benchmarks/hash_table/static_set/insert_bench.cu @@ -48,10 +48,10 @@ void static_set_insert(nvbench::state& state, nvbench::type_list) state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { cuco::experimental::static_set set{ - size, cuco::empty_key{-1}, {}, {}, {}, launch.get_stream()}; + size, cuco::empty_key{-1}, {}, {}, {}, {launch.get_stream()}}; timer.start(); - set.insert(keys.begin(), keys.end(), launch.get_stream()); + set.insert(keys.begin(), 
keys.end(), {launch.get_stream()}); timer.stop(); }); } diff --git a/benchmarks/hash_table/static_set/retrieve_all_bench.cu b/benchmarks/hash_table/static_set/retrieve_all_bench.cu index fb52b251b..17ea66384 100644 --- a/benchmarks/hash_table/static_set/retrieve_all_bench.cu +++ b/benchmarks/hash_table/static_set/retrieve_all_bench.cu @@ -50,7 +50,7 @@ void static_set_retrieve_all(nvbench::state& state, nvbench::type_list) set.insert(keys.begin(), keys.end()); state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); }); + [&](nvbench::launch& launch) { auto const size = set.size({launch.get_stream()}); }); } NVBENCH_BENCH_TYPES(static_set_size, diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp new file mode 100644 index 000000000..bf0a5dea9 --- /dev/null +++ b/include/cuco/cuda_stream_ref.hpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor. + * + * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps. 
+ */ +class cuda_stream_ref { + public: + constexpr cuda_stream_ref() = default; ///< Default constructor + constexpr cuda_stream_ref(cuda_stream_ref const&) = default; ///< Copy constructor + constexpr cuda_stream_ref(cuda_stream_ref&&) = default; ///< Move constructor + + /** + * @brief Copy-assignment operator. + * + * @return Copy of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default; + + /** + * @brief Move-assignment operator. + * + * @return New location of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default; ///< Move-assignment operator + + ~cuda_stream_ref() = default; + + constexpr cuda_stream_ref(int) = delete; //< Prevent cast from literal 0 + constexpr cuda_stream_ref(std::nullptr_t) = delete; //< Prevent cast from nullptr + + /** + * @brief Implicit conversion from `cudaStream_t`. + * + * @param stream The CUDA stream to reference. + */ + constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {} + + /** + * @brief Get the wrapped stream. + * + * @return The wrapped stream. + */ + [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } + + /** + * @brief Implicit conversion to `cudaStream_t`. + * + * @return The underlying `cudaStream_t`. + */ + constexpr operator cudaStream_t() const noexcept { return value(); } + + /** + * @brief Return true if the wrapped stream is the CUDA per-thread default stream. + * + * @return True if the wrapped stream is the per-thread default stream; else false. + */ + [[nodiscard]] inline bool is_per_thread_default() const noexcept; + + /** + * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. + * + * @return True if the wrapped stream is the default stream; else false. + */ + [[nodiscard]] inline bool is_default() const noexcept; + + /** + * @brief Synchronize the viewed CUDA stream. + * + * Calls `cudaStreamSynchronize()`. 
+ * + * @throw cuco::cuda_error if stream synchronization fails + */ + void synchronize() const; + + private: + cudaStream_t stream_{}; +}; + +/** + * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience + */ +static constexpr cuda_stream_ref cuda_stream_default{}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience + */ +static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience + */ +static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread}; + +// /** +// * @brief Equality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if equal, false if unequal +// */ +// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs) +// { +// return lhs.value() == rhs.value(); +// } + +// /** +// * @brief Inequality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if unequal, false if equal +// */ +// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); } + +} // namespace experimental +} // namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl new file mode 100644 index 000000000..64aa078aa --- /dev/null +++ b/include/cuco/detail/cuda_stream_ref.inl @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { + +[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_per_thread || value() == nullptr; +#else + return value() == cuda_stream_per_thread; +#endif +} + +[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_legacy; +#else + return value() == cuda_stream_legacy || value() == nullptr; +#endif +} + +inline void cuda_stream_ref::synchronize() const +{ + CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_)); +} + +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 324387b0e..0198b91d4 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -50,7 +51,7 @@ constexpr static_set static_set::size_type static_set::insert( - InputIt first, InputIt last, cudaStream_t stream) + InputIt first, InputIt last, cuda_stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -99,7 +100,7 @@ template template void static_set::insert_async( - InputIt first, InputIt last, cudaStream_t stream) + InputIt first, InputIt last, cuda_stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -124,7 +125,7 @@ template static_set::size_type static_set::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -152,7 +153,7 @@ template template void static_set::insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -175,10 +176,10 @@ template template void static_set::contains( - InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const { contains_async(first, last, output_begin, stream); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } template template void static_set::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -219,7 +220,7 @@ template template OutputIt 
static_set::retrieve_all( - OutputIt output_begin, cudaStream_t stream) const + OutputIt output_begin, cuda_stream_ref stream) const { auto begin = thrust::make_transform_iterator(thrust::counting_iterator(0), detail::get_slot(storage_.ref())); @@ -248,7 +249,7 @@ OutputIt static_set::deallocate( temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -265,7 +266,7 @@ template static_set::size_type static_set::size( - cudaStream_t stream) const + cuda_stream_ref stream) const { auto counter = detail::counter_storage{allocator_}; counter.reset(stream); diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index 316f7fbe5..0d35cf49d 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -226,7 +227,7 @@ class aow_storage : public aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize(value_type key, cudaStream_t stream) noexcept + void initialize(value_type key, cuda_stream_ref stream) noexcept { auto constexpr stride = 4; auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index bf87357a3..021e530d9 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -64,7 +65,7 @@ class counter_storage : public storage_basedata(), 0, sizeof(value_type), stream)); @@ -92,12 +93,12 @@ class counter_storage : public storage_basedata(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); 
return h_count; } diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 985626444..256fd1131 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -165,7 +166,7 @@ class static_set { KeyEqual pred = {}, ProbingScheme const& probing_scheme = {}, Allocator const& alloc = {}, - cudaStream_t stream = nullptr); + cuda_stream_ref stream = {}); /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful @@ -185,7 +186,7 @@ class static_set { * @return Number of successfully inserted keys */ template - size_type insert(InputIt first, InputIt last, cudaStream_t stream = nullptr); + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); /** * @brief Asynchonously inserts all keys in the range `[first, last)`. @@ -199,7 +200,7 @@ class static_set { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cudaStream_t stream = nullptr); + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -227,7 +228,7 @@ class static_set { */ template size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = nullptr); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); /** * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -251,7 +252,7 @@ class static_set { */ template void insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = nullptr); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. 
@@ -271,7 +272,7 @@ class static_set { void contains(InputIt first, InputIt last, OutputIt output_begin, - cudaStream_t stream = nullptr) const; + cuda_stream_ref stream = {}) const; /** * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in @@ -289,7 +290,7 @@ class static_set { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cudaStream_t stream = nullptr) const; + cuda_stream_ref stream = {}) const; /** * @brief Retrieves all keys contained in the set. @@ -309,7 +310,7 @@ class static_set { * @return Iterator indicating the end of the output */ template - [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cudaStream_t stream = nullptr) const; + [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; /** * @brief Gets the number of elements in the container. @@ -319,7 +320,7 @@ class static_set { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cudaStream_t stream = nullptr) const; + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; /** * @brief Gets the maximum number of elements the hash map can hold. From 6c997d500e1b1770f5e3b4a342a8ef63c20a98d7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 25 Apr 2023 17:03:03 -0700 Subject: [PATCH 104/152] Add set find APIs (#294) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds host-bulk and device ref APIs related to the `find` operation. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- benchmarks/CMakeLists.txt | 1 + .../hash_table/static_set/find_bench.cu | 77 ++++++++++ include/cuco/detail/probing_scheme_impl.inl | 2 +- include/cuco/detail/static_set/kernels.cuh | 77 ++++++++-- include/cuco/detail/static_set/static_set.inl | 38 +++++ .../cuco/detail/static_set/static_set_ref.inl | 143 +++++++++++++++++- include/cuco/detail/storage/aow_storage.cuh | 101 +++++++++++++ include/cuco/operator.hpp | 6 + include/cuco/static_set.cuh | 40 +++++ tests/static_set/unique_sequence_test.cu | 46 ++++-- 10 files changed, 509 insertions(+), 22 deletions(-) create mode 100644 benchmarks/hash_table/static_set/find_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6e2c7001f..d75052d3e 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -50,6 +50,7 @@ endfunction(ConfigureBench) # - static_set benchmarks ------------------------------------------------------------------------- ConfigureBench(STATIC_SET_BENCH hash_table/static_set/contains_bench.cu + hash_table/static_set/find_bench.cu hash_table/static_set/insert_bench.cu hash_table/static_set/retrieve_all_bench.cu hash_table/static_set/size_bench.cu) diff --git a/benchmarks/hash_table/static_set/find_bench.cu b/benchmarks/hash_table/static_set/find_bench.cu new file mode 100644 index 000000000..53450f771 --- /dev/null +++ b/benchmarks/hash_table/static_set/find_bench.cu @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::find` performance + */ +template +void static_set_find(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + // TODO: would crash if not passing nullptr, why? 
+ gen.dropout(keys.begin(), keys.end(), matching_rate, nullptr); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.find(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl index 4f7e904a1..5a5c5ae23 100644 --- a/include/cuco/detail/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -49,7 +49,7 @@ class probing_iterator { /** * @brief Dereference operator * - * @return Current slot ndex + * @return Current slot index */ __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index e0fa30d7d..5304ebd4a 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -177,11 +177,10 @@ __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt out if (idx < n) { auto const key = *(first + idx); /* - * The ld.relaxed.gpu instruction used in this operation causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. 
- * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased sector + * stores from L2 to global memory. By writing results to shared memory and then synchronizing + * before writing back to global, we no longer rely on L1, preventing the increase in sector + * stores from L2 to global and improving performance. */ output_buffer[thread_idx] = ref.contains(key); } @@ -230,11 +229,10 @@ __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt out auto const key = *(first + idx); auto const found = ref.contains(tile, key); /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased sector + * stores from L2 to global memory. By writing results to shared memory and then synchronizing + * before writing back to global, we no longer rely on L1, preventing the increase in sector + * stores from L2 to global and improving performance. */ if (tile.thread_rank() == 0) { output_buffer[tile_idx] = found; } } @@ -245,6 +243,65 @@ __global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt out } } +/** + * @brief Finds the equivalent set elements of all keys in the range `[first, last)`. + * + * If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin + + * i)`. Else, copies the empty value sentinel. 
Uses the CUDA Cooperative Groups API to leverage + * groups of multiple threads to find each key. This provides a significant boost in throughput + * compared to the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched elements retrieved for each key + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + __shared__ typename Ref::value_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = found == ref.end() ? 
ref.empty_key_sentinel() : *found; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? ref.empty_key_sentinel() : *found; + } + } + } + idx += loop_stride; + } +} + /** * @brief Calculates the number of filled slots for the given window storage. * diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 0198b91d4..1800c8910 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -211,6 +211,44 @@ void static_set } } +template +template +void static_set::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + template -#include #include + +#include + +#include #include namespace cuco { @@ -343,6 +346,144 @@ class operator_impl +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename StorageRef::iterator; + using const_iterator = typename StorageRef::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: 
+ /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.storage_ref_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.storage_ref_.end(); + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], key)) { + case detail::equal_result::EMPTY: { + return this->end(); + } + case detail::equal_result::EQUAL: { + return const_iterator{&(*(ref_.storage_ref_.data() + *probing_iter))[i]}; + } + default: continue; + } + } + ++probing_iter; + } + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. 
+ * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param g The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile const& g, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], key)) { + case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return cuco::pair{detail::equal_result::UNEQUAL, -1}; + }(); + + // Find a match for the probe key, thus return an iterator to the entry + auto const group_finds_match = g.ballot(state == detail::equal_result::EQUAL); + if (group_finds_match) { + auto const src_lane = __ffs(group_finds_match) - 1; + auto const res = g.shfl(reinterpret_cast(&( + *(ref_.storage_ref_.data() + *probing_iter))[intra_window_index]), + src_lane); + return const_iterator{reinterpret_cast(res)}; + } + + // Find an empty slot, meaning that the probe key isn't present in the set + if (g.any(state == detail::equal_result::EMPTY)) { return this->end(); } + + ++probing_iter; + } + } +}; + } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index 
0d35cf49d..6d24cd832 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -25,6 +25,8 @@ #include #include +#include +#include #include namespace cuco { @@ -114,6 +116,105 @@ class aow_storage_ref : public aow_storage_base { { } + /** + * @brief Custom un-incrementable input iterator for the convenience of `find` operations. + * + * @note This iterator is for read only and NOT incrementable. + */ + struct iterator { + public: + using iterator_category = std::input_iterator_tag; ///< iterator category + using reference = value_type&; ///< iterator reference type + + /** + * @brief Constructs a device side input iterator of the given slot. + * + * @param current The slot pointer + */ + __device__ constexpr explicit iterator(value_type* current) noexcept : current_{current} {} + + /** + * @brief Prefix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator& operator++() noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Postfix increment operator + * + * @throw This code path should never be chosen. 
+ * + * @return Current iterator + */ + __device__ constexpr iterator operator++(int32_t) noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Dereference operator + * + * @return Reference to the current slot + */ + __device__ constexpr reference operator*() const { return *current_; } + + /** + * Equality operator + * + * @return True if two iterators are identical + */ + friend __device__ constexpr bool operator==(iterator const& lhs, iterator const& rhs) noexcept + { + return lhs.current_ == rhs.current_; + } + + /** + * Inequality operator + * + * @return True if two iterators are not identical + */ + friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept + { + return not lhs == rhs; + } + + private: + value_type* current_{}; ///< Pointer to the current slot + }; + using const_iterator = iterator const; ///< Const forward iterator type + + /** + * @brief Returns an iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr iterator end() noexcept + { + return iterator{reinterpret_cast(this->data() + this->capacity())}; + } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr const_iterator end() const noexcept + { + return const_iterator{reinterpret_cast(this->data() + this->capacity())}; + } + /** * @brief Gets windows array. 
* diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index abf66e0bd..1f466e87e 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -33,6 +33,12 @@ struct insert_tag { struct contains_tag { } inline constexpr contains; +/** + * @brief `find` operator tag + */ +struct find_tag { +} inline constexpr find; + } // namespace op } // namespace experimental } // namespace cuco diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 256fd1131..90d59a2e1 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -292,6 +292,46 @@ class static_set { OutputIt output_begin, cuda_stream_ref stream = {}) const; + /** + * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the + * query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchonously finds an element with key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + /** * @brief Retrieves all keys contained in the set. * diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 6ea4a03e3..6a18b3378 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -38,36 +39,40 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - auto key_begin = d_keys.begin(); + auto keys_begin = d_keys.begin(); thrust::device_vector d_contained(num_keys); SECTION("Non-inserted keys should not be contained.") { REQUIRE(set.size() == 0); - set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); } - SECTION("All inserted key/value pairs should be contained.") + SECTION("Non-inserted keys have no matches") { - set.insert(key_begin, key_begin + num_keys); - REQUIRE(set.size() == num_keys); + thrust::device_vector d_results(num_keys); - set.contains(key_begin, key_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + 
d_results.begin(), thrust::constant_iterator{set.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + })); } SECTION("All conditionally inserted keys should be contained") { - auto const inserted = set.insert_if(key_begin, - key_begin + num_keys, + auto const inserted = set.insert_if(keys_begin, + keys_begin + num_keys, thrust::counting_iterator(0), [] __device__(auto const& key) { return (key % 2) == 0; }); REQUIRE(inserted == num_keys / 2); REQUIRE(set.size() == num_keys / 2); - set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); REQUIRE(cuco::test::equal(d_contained.begin(), d_contained.end(), thrust::counting_iterator(0), @@ -75,6 +80,27 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) return ((idx % 2) == 0) == idx_contained; })); } + + set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) { + return thrust::get<0>(p) == thrust::get<1>(p); + })); + } } TEMPLATE_TEST_CASE_SIG( From 1a841121d1a32b83c6de0ccccbcdf037790d61ad Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 26 Apr 2023 10:07:10 -0700 Subject: [PATCH 105/152] Add set conditional contains APIs (#295) This PR adds conditional `contains` APIs and unifies `contains` 
implementations with the `contains_if_n` kernel. --- include/cuco/detail/static_set/kernels.cuh | 131 ++++++++---------- include/cuco/detail/static_set/static_set.inl | 69 +++++++-- include/cuco/static_set.cuh | 77 +++++++++- tests/static_set/unique_sequence_test.cu | 30 ++-- 4 files changed, 203 insertions(+), 104 deletions(-) diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index 5304ebd4a..3cca8d2b4 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -29,10 +29,11 @@ namespace detail { /** * @brief Inserts all elements in the range `[first, first + n)` and returns the number of - * successful insertions. + * successful insertions if `pred` of the corresponding stencil returns true. * - * If multiple elements in `[first, first + n)` compare equal, it is unspecified which - * element is inserted. + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block @@ -48,7 +49,7 @@ namespace detail { * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[s, s + n)` + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` * @param num_successes Number of successful inserted elements * @param ref Non-owning set device ref used to access the slot storage */ @@ -96,10 +97,12 @@ __global__ void insert_if_n(InputIterator first, } /** - * @brief Inserts all elements in the range `[first, first + n)`. + * @brief Inserts all elements in the range `[first, first + n)` if `pred` of the corresponding + * stencil returns true. 
* - * If multiple elements in `[first, first + n)` compare equal, it is unspecified which - * element is inserted. + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block @@ -114,7 +117,7 @@ __global__ void insert_if_n(InputIterator first, * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[s, s + n)` + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` * @param ref Non-owning set device ref used to access the slot storage */ template -__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) -{ - namespace cg = cooperative_groups; - - auto const block = cg::this_thread_block(); - auto const thread_idx = block.thread_rank(); - - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; - cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; - __shared__ bool output_buffer[BlockSize]; - - while (idx - thread_idx < n) { // the whole thread block falls into the same iteration - if (idx < n) { - auto const key = *(first + idx); - /* - * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased sector - * stores from L2 to global memory. By writing results to shared memory and then synchronizing - * before writing back to global, we no longer rely on L1, preventing the increase in sector - * stores from L2 to global and improving performance. 
- */ - output_buffer[thread_idx] = ref.contains(key); - } - - block.sync(); - if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } - idx += loop_stride; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data - * structure. - * - * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data - * structure. + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is false, + * stores false to `(output_begin + i)`. * * @tparam CGSize Number of threads in each CG * @tparam BlockSize The size of the thread block * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` * @tparam OutputIt Device accessible output iterator assignable from `bool` * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of keys * @param n Number of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param ref Non-owning set device ref used to access the slot storage */ -template -__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +template +__global__ void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + Ref ref) { namespace cg = cooperative_groups; - auto block = cg::this_thread_block(); + auto const block = 
cg::this_thread_block(); auto const thread_idx = block.thread_rank(); - auto tile = cg::tiled_partition(cg::this_thread_block()); cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; __shared__ bool output_buffer[BlockSize / CGSize]; - auto const tile_idx = thread_idx / CGSize; while (idx - thread_idx < n) { // the whole thread block falls into the same iteration - if (idx < n) { - auto const key = *(first + idx); - auto const found = ref.contains(tile, key); - /* - * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased sector - * stores from L2 to global memory. By writing results to shared memory and then synchronizing - * before writing back to global, we no longer rely on L1, preventing the increase in sector - * stores from L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { output_buffer[tile_idx] = found; } + if constexpr (CGSize == 1) { + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false; + } + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + auto const key = *(first + idx); + auto const found = pred(*(stencil + idx)) ? 
ref.contains(tile, key) : false; + if (tile.thread_rank() == 0) { *(output_begin + idx) = found; } + } } - - block.sync(); - if (idx < n and tile.thread_rank() == 0) { *(output_begin + idx) = output_buffer[tile_idx]; } idx += loop_stride; } } diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 1800c8910..fdd4bfaf4 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -100,7 +100,7 @@ template template void static_set::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda_stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -153,7 +153,7 @@ template template void static_set::insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -191,7 +191,56 @@ template template void static_set::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + auto const always_true = thrust::constant_iterator{true}; + detail::contains_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, output_begin, ref(op::contains)); +} + +template +template +void static_set::contains_if( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + 
cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::contains_if_async( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -200,15 +249,9 @@ void static_set (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - if constexpr (cg_size == 1) { - detail::contains - <<>>( - first, num_keys, output_begin, ref(op::contains)); - } else { - detail::contains - <<>>( - first, num_keys, output_begin, ref(op::contains)); - } + detail::contains_if_n + <<>>( + first, num_keys, stencil, pred, output_begin, ref(op::contains)); } template static_set::size_type static_set::size( - cuda_stream_ref stream) const + cuda_stream_ref stream) const noexcept { auto counter = detail::counter_storage{allocator_}; counter.reset(stream); diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 90d59a2e1..5ba161bfa 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -200,7 +200,7 @@ class static_set { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -251,8 +251,11 @@ class static_set { * @param stream CUDA stream used for the operation */ template - void insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream 
= {}) noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. @@ -290,7 +293,71 @@ class static_set { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. + * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the set if `pred` of the corresponding stencil 
returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the @@ -360,7 +427,7 @@ class static_set { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; /** * @brief Gets the maximum number of elements the hash map can hold. 
diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 6a18b3378..cd4869fde 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -42,6 +42,9 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) auto keys_begin = d_keys.begin(); thrust::device_vector d_contained(num_keys); + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + SECTION("Non-inserted keys should not be contained.") { REQUIRE(set.size() == 0); @@ -58,17 +61,13 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) auto zip = thrust::make_zip_iterator(thrust::make_tuple( d_results.begin(), thrust::constant_iterator{set.empty_key_sentinel()})); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - })); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); } SECTION("All conditionally inserted keys should be contained") { - auto const inserted = set.insert_if(keys_begin, - keys_begin + num_keys, - thrust::counting_iterator(0), - [] __device__(auto const& key) { return (key % 2) == 0; }); + auto const inserted = set.insert_if( + keys_begin, keys_begin + num_keys, thrust::counting_iterator(0), is_even); REQUIRE(inserted == num_keys / 2); REQUIRE(set.size() == num_keys / 2); @@ -90,6 +89,19 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); } + SECTION("Conditional contains should return true on even inputs.") + { + set.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = 
thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + SECTION("All inserted keys should be correctly recovered during find") { thrust::device_vector d_results(num_keys); @@ -97,9 +109,7 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) set.find(keys_begin, keys_begin + num_keys, d_results.begin()); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, [] __device__(auto const& p) { - return thrust::get<0>(p) == thrust::get<1>(p); - })); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); } } From d2767deb7d6d37746c988eb3e7a3731cf1362bfe Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 2 May 2023 11:14:23 -0700 Subject: [PATCH 106/152] Add set `insert_and_find` device ref APIs (#298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds device side `insert_and_find` APIs for `static_set`. It also includes small cleanups of the current refactoring code. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- .../cuco/detail/static_set/static_set_ref.inl | 272 +++++++++++++----- include/cuco/operator.hpp | 6 + include/cuco/static_set_ref.cuh | 21 +- tests/CMakeLists.txt | 1 + tests/static_set/insert_and_find_test.cu | 112 ++++++++ tests/static_set/retrieve_all_test.cu | 69 ++--- tests/static_set/unique_sequence_test.cu | 54 ++-- 7 files changed, 382 insertions(+), 153 deletions(-) create mode 100644 tests/static_set/insert_and_find_test.cu diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index 7973f7b92..ac79623a5 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -22,6 +22,7 @@ #include #include +#include #include @@ -82,6 +83,61 @@ static_set_ref::e return empty_key_sentinel_; } +template +__device__ + static_set_ref::insert_result + static_set_ref::attempt_insert( + value_type* slot, value_type const& value) +{ + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + value_type const old = [&]() { + value_type expected = this->empty_key_sentinel(); + value_type val = value; + if constexpr (sizeof(value_type) == sizeof(unsigned int)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + if constexpr (sizeof(value_type) == sizeof(unsigned long long int)) { + auto* 
expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + }(); + if (*slot == old) { + // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare + return predicate_.equal_to(old, value) == detail::equal_result::EQUAL ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } else { + return insert_result::SUCCESS; + } +} + namespace detail { template (*this); auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); @@ -124,7 +181,7 @@ class operator_impldata() + intra_window_index, value)) { case insert_result::CONTINUE: continue; case insert_result::SUCCESS: return true; @@ -146,6 +203,8 @@ class operator_impl group, value_type const& value) noexcept { + using insert_result = typename ref_type::insert_result; + auto& ref_ = static_cast(*this); auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); @@ -173,7 +232,7 @@ class operator_impldata() + intra_window_index, value) : insert_result::CONTINUE; @@ -187,68 +246,136 @@ class operator_impl +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = 
base_type::window_size; + public: /** - * @brief Attempts to insert an element into a slot. + * @brief Inserts the given element into the set. * - * @note Dispatches the correct implementation depending on the container - * type and presence of other operator mixins. + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. * - * @param slot Pointer to the slot in memory - * @param value Element to insert + * @param value The element to insert * - * @return Result of this operation, i.e., success/continue/duplicate + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. */ - [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value) + __device__ thrust::pair insert_and_find(value_type const& value) noexcept { - auto& ref_ = static_cast(*this); - - // temporary workaround due to performance regression - // https://github.com/NVIDIA/libcudacxx/issues/366 - value_type const old = [&]() { - value_type expected = ref_.empty_key_sentinel_.value; - value_type val = value; - if constexpr (sizeof(value_type) == sizeof(uint32_t)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (thread_scope == cuda::thread_scope_system) { - return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (thread_scope == cuda::thread_scope_device) { - return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (thread_scope == cuda::thread_scope_block) { - return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); + using insert_result = typename ref_type::insert_result; + + ref_type& ref_ = static_cast(*this); + auto 
probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + auto const eq_res = ref_.predicate_(window_slots[i], value); + auto* window_ptr = (ref_.storage_ref_.data() + *probing_iter)->data(); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } + if (eq_res == detail::equal_result::EMPTY) { + switch (ref_.attempt_insert(window_ptr + i, value)) { + case insert_result::SUCCESS: { + return {iterator{&window_ptr[i]}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{&window_ptr[i]}, false}; + } + default: continue; + } + } + } + ++probing_iter; + }; + } + + /** + * @brief Inserts the given element into the set. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. 
+ */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + using insert_result = typename ref_type::insert_result; + + ref_type& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], value)) { + case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; + default: continue; + } } + // returns dummy index `-1` for UNEQUAL + return cuco::pair{detail::equal_result::UNEQUAL, -1}; + }(); + + auto* slot_ptr = (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index; + + // If the key is already in the container, return false + auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_equal) { + auto const src_lane = __ffs(group_finds_equal) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + return {iterator{reinterpret_cast(res)}, false}; } - if constexpr (sizeof(value_type) == sizeof(uint64_t)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (thread_scope == cuda::thread_scope_system) { - return atomicCAS_system( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (thread_scope == cuda::thread_scope_device) { - return atomicCAS( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (thread_scope == cuda::thread_scope_block) { - return atomicCAS_block( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); + + auto const 
group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + auto const status = (group.thread_rank() == src_lane) ? ref_.attempt_insert(slot_ptr, value) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: { + return {iterator{reinterpret_cast(res)}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{reinterpret_cast(res)}, false}; + } + default: continue; } + } else { + ++probing_iter; } - }(); - if (*slot == old) { - // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare - return ref_.predicate_.equal_to(old, value) == detail::equal_result::EQUAL - ? insert_result::DUPLICATE - : insert_result::CONTINUE; - } else { - return insert_result::SUCCESS; } } }; @@ -312,17 +439,17 @@ class operator_impl - [[nodiscard]] __device__ bool contains(cooperative_groups::thread_block_tile const& g, - ProbeKey const& key) const noexcept + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); - auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + auto probing_iter = ref_.probing_scheme_(group, key, ref_.storage_ref_.num_windows()); while (true) { auto const window_slots = ref_.storage_ref_[*probing_iter]; @@ -338,8 +465,8 @@ class operator_impl; using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; - using iterator = typename StorageRef::iterator; - using const_iterator = typename StorageRef::const_iterator; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; static constexpr auto cg_size = base_type::cg_size; static constexpr auto window_size = 
base_type::window_size; @@ -438,18 +565,18 @@ class operator_impl - [[nodiscard]] __device__ const_iterator - find(cooperative_groups::thread_block_tile const& g, ProbeKey const& key) const noexcept + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); - auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + auto probing_iter = ref_.probing_scheme_(group, key, ref_.storage_ref_.num_windows()); while (true) { auto const window_slots = ref_.storage_ref_[*probing_iter]; @@ -467,17 +594,18 @@ class operator_impl(&( - *(ref_.storage_ref_.data() + *probing_iter))[intra_window_index]), - src_lane); + auto const res = + group.shfl(reinterpret_cast( + &(*(ref_.storage_ref_.data() + *probing_iter))[intra_window_index]), + src_lane); return const_iterator{reinterpret_cast(res)}; } // Find an empty slot, meaning that the probe key isn't present in the set - if (g.any(state == detail::equal_result::EMPTY)) { return this->end(); } + if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } ++probing_iter; } diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 1f466e87e..b7629ae4c 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -27,6 +27,12 @@ inline namespace op { struct insert_tag { } inline constexpr insert; +/** + * @brief `insert_and_find` operator tag + */ +struct insert_and_find_tag { +} inline constexpr insert_and_find; + /** * @brief `contains` operator tag */ diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index 7c5ae13dc..acaa980bf 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -80,11 +80,12 @@ class static_set_ref using extent_type = typename storage_ref_type::extent_type; ///< Extent type using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type using 
key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size static constexpr auto window_size = - storage_ref_type::window_size; ///< Number of elements handled per window - static constexpr auto thread_scope = Scope; ///< Thread scope + storage_ref_type::window_size; ///< Number of elements handled per window /** * @brief Constructs static_set_ref. @@ -115,6 +116,22 @@ class static_set_ref [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; private: + // TODO: this should be a common enum for all data structures + enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. 
+ * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value); + cuco::empty_key empty_key_sentinel_; ///< Empty key sentinel detail::equal_wrapper predicate_; ///< Key equality binary callable probing_scheme_type probing_scheme_; ///< Probing scheme diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4cbd43d22..9a1da8772 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -58,6 +58,7 @@ ConfigureTest(UTILITY_TEST ConfigureTest(STATIC_SET_TEST static_set/capacity_test.cu static_set/heterogeneous_lookup_test.cu + static_set/insert_and_find_test.cu static_set/large_input_test.cu static_set/retrieve_all_test.cu static_set/size_test.cu diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu new file mode 100644 index 000000000..7c4ff08fa --- /dev/null +++ b/tests/static_set/insert_and_find_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__inline__ void test_insert_and_find(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + static auto constexpr cg_size = Set::cg_size; + + auto const keys_begin = [&]() { + if constexpr (cg_size == 1) { + return thrust::counting_iterator(0); + } else { + return thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i / cg_size; }); + } + }(); + auto const keys_end = [&]() { + if constexpr (cg_size == 1) { + return keys_begin + num_keys; + } else { + return keys_begin + num_keys * cg_size; + } + }(); + + auto ref = set.ref(cuco::experimental::op::insert_and_find); + + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == true; + })); + + SECTION("Inserting elements for the second time will always fail.") + { + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == false and key == *iter; + })); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Insert and find", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::aow_storage<2>>{ + num_keys, cuco::empty_key{-1}}; + test_insert_and_find(set, num_keys); +} diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu index 45fa5e56b..5f7b0ff9d 100644 --- a/tests/static_set/retrieve_all_test.cu +++ b/tests/static_set/retrieve_all_test.cu @@ -20,12 +20,10 @@ #include #include -#include #include #include #include #include -#include #include @@ -35,27 +33,25 @@ __inline__ void test_unique_sequence(Set& set, std::size_t num_keys) using Key = typename Set::key_type; thrust::device_vector d_keys(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - - auto key_begin = d_keys.begin(); + thrust::sequence(d_keys.begin(), d_keys.end()); + auto keys_begin = d_keys.begin(); SECTION("Non-inserted keys should not be contained.") { REQUIRE(set.size() == 0); - auto key_end = set.retrieve_all(key_begin); - REQUIRE(std::distance(key_begin, key_end) == 0); + auto keys_end = set.retrieve_all(keys_begin); + REQUIRE(std::distance(keys_begin, keys_end) == 0); } - set.insert(key_begin, key_begin + num_keys); + set.insert(keys_begin, keys_begin + num_keys); REQUIRE(set.size() == num_keys); SECTION("All inserted key/value pairs should be contained.") { thrust::device_vector d_res(num_keys); auto d_res_end = set.retrieve_all(d_res.begin()); - thrust::sort(thrust::device, d_res.begin(), d_res_end); + thrust::sort(d_res.begin(), d_res_end); REQUIRE(cuco::test::equal( d_res.begin(), 
d_res_end, thrust::counting_iterator(0), thrust::equal_to{})); } @@ -79,38 +75,23 @@ TEMPLATE_TEST_CASE_SIG( : 422 // 211 x 2 x 1 ; - using extent_type = cuco::experimental::extent; - using allocator_type = cuco::cuda_allocator; - using storage_type = cuco::experimental::aow_storage<1>; - - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - using probe = cuco::experimental::linear_probing>; - auto set = cuco::experimental::static_set, - probe, - allocator_type, - storage_type>{num_keys, cuco::empty_key{-1}}; - - REQUIRE(set.capacity() == gold_capacity); - - test_unique_sequence(set, num_keys); - } - - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - using probe = cuco::experimental:: - double_hashing, cuco::murmurhash3_32>; - auto set = cuco::experimental::static_set, - probe, - allocator_type, - storage_type>{num_keys, cuco::empty_key{-1}}; - - REQUIRE(set.capacity() == gold_capacity); - - test_unique_sequence(set, num_keys); - } + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::aow_storage<1>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); } diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index cd4869fde..7468e90f3 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -131,38 +130,23 @@ TEMPLATE_TEST_CASE_SIG( : 412 // 103 x 2 x 2 ; - using extent_type = cuco::experimental::extent; - using allocator_type = cuco::cuda_allocator; - using storage_type = cuco::experimental::aow_storage<2>; - - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - using probe = 
cuco::experimental::linear_probing>; - auto set = cuco::experimental::static_set, - probe, - allocator_type, - storage_type>{num_keys, cuco::empty_key{-1}}; - - REQUIRE(set.capacity() == gold_capacity); - - test_unique_sequence(set, num_keys); - } - - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - using probe = cuco::experimental:: - double_hashing, cuco::murmurhash3_32>; - auto set = cuco::experimental::static_set, - probe, - allocator_type, - storage_type>{num_keys, cuco::empty_key{-1}}; - - REQUIRE(set.capacity() == gold_capacity); - - test_unique_sequence(set, num_keys); - } + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::aow_storage<2>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); } From d1ae9094b7e25afa20520301b494ef41f3afd62f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 2 May 2023 17:51:13 -0700 Subject: [PATCH 107/152] Clean up test code (#299) Some minor cleanups of the test code: - Use `conditional_t` instead of `if constexpr` to minimize redundancy - Use `std::distance` instead of raw `-` operator to determine the iterator scope - `const` when possible --- .../custom_pair_retrieve_test.cu | 22 ++++++---------- tests/static_multimap/custom_type_test.cu | 26 ++++++++----------- tests/static_multimap/insert_if_test.cu | 22 ++++++---------- tests/static_multimap/multiplicity_test.cu | 22 ++++++---------- tests/static_multimap/non_match_test.cu | 26 +++++++++---------- tests/static_multimap/pair_function_test.cu | 22 ++++++---------- tests/utils.hpp | 12 +++++---- 7 files changed, 62 insertions(+), 90 deletions(-) diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 
1b0a346ee..563abd835 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -196,18 +196,12 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_pairs{200}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> - map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_non_shmem_pair_retrieve(map, num_pairs); } diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index f3cee280f..d76404b18 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -39,7 +39,10 @@ struct key_pair { }; struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) const { return k.a; }; + __host__ __device__ hash_key_pair() : hash_key_pair{0} {} + __host__ __device__ hash_key_pair(uint32_t offset) : offset_(offset) {} + __device__ uint32_t operator()(key_pair k) const { return k.a + offset_; }; + uint32_t offset_; }; struct key_pair_equals { @@ -228,18 +231,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num_pairs = 100; constexpr std::size_t capacity = num_pairs * 2; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, hash_key_pair>> - 
map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } + using probe = std::conditional_t, + cuco::double_hashing<8, hash_key_pair, hash_key_pair>>; + + cuco::static_multimap, probe> + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; + test_custom_key_value_type(map, num_pairs); } diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 734a93505..0d560ff6e 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -67,18 +67,12 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair_type{i, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> - map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + + cuco::static_multimap, probe> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index f1255aaca..f21d52c3d 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ 
b/tests/static_multimap/multiplicity_test.cu @@ -161,18 +161,12 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_items{4}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> - map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - 5, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + + cuco::static_multimap, probe> + map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_multiplicity_two(map, num_items); } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index 3fdb60c14..be76a38ce 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -139,18 +139,16 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair_type{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> - map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + + cuco::static_multimap, + cuco::linear_probing<1, 
cuco::murmurhash3_32>> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index 26cc5bbd2..8edecd6f1 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -132,18 +132,12 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair_type{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> - map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_pair_functions(map, d_pairs.begin(), num_pairs); } diff --git a/tests/utils.hpp b/tests/utils.hpp index a94b04a57..3325027a9 100644 --- a/tests/utils.hpp +++ b/tests/utils.hpp @@ -24,6 +24,8 @@ #include +#include + namespace cuco { namespace test { @@ -37,7 +39,7 @@ enum class probe_sequence { linear_probing, double_hashing }; template int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const grid_size = (size + block_size - 1) / block_size; int* count; @@ -51,7 +53,7 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) detail::count_if<<>>(begin, end, 
count, p); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; CUCO_CUDA_TRY(cudaFree(count)); @@ -61,7 +63,7 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool all_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const count = count_if(begin, end, p, stream); return size == count; @@ -83,7 +85,7 @@ bool none_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cudaStream_t stream = 0) { - auto const size = end1 - begin1; + auto const size = std::distance(begin1, end1); auto const grid_size = (size + block_size - 1) / block_size; int* count; @@ -97,7 +99,7 @@ bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cuda detail::count_if<<>>(begin1, end1, begin2, count, p); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; CUCO_CUDA_TRY(cudaFree(count)); From a371904f50a0553346407f163a9ad14275d2aee9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 May 2023 13:46:16 -0700 Subject: [PATCH 108/152] Bump rapids-cmake to 23.06 (#302) Bumps `rapids-cmake` version to 23.06 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f7c21a8d..12c5228e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) From 
580ab2ba5c4c4241854ac939e9f91e628d3a5c65 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 8 May 2023 18:48:01 -0700 Subject: [PATCH 109/152] Update `static_map` size computation (#301) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #300 Several changes involved in this PR: - Using a custom kernel to compute the number of filled slots in map and multimap - Fixing a bug where erased slots are not counted as empty slots - Allowing size computation for more than 2^31 elements and to be replaced by cub reduce sum once cub-v2.1.0 is available - Changing the `static_map` public APIs like `insert`, `insert_if` and `erase` to return the number of successful insertions/erasures - Adding `erased_key_sentinel` into multimap Note: `get_size` is **expensive**, use the return value of bulk insert/erase when possible. --------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- include/cuco/detail/common_kernels.cuh | 65 +++++++++++++++++++ include/cuco/detail/static_map.inl | 61 ++++++++++++----- .../static_multimap/device_view_impl.inl | 11 +++- .../static_multimap/static_multimap.inl | 24 +++++-- include/cuco/static_map.cuh | 55 ++++++++++------ include/cuco/static_multimap.cuh | 25 +++++-- tests/static_map/erase_test.cu | 3 +- 7 files changed, 196 insertions(+), 48 deletions(-) create mode 100644 include/cuco/detail/common_kernels.cuh diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh new file mode 100644 index 000000000..62a259bd5 --- /dev/null +++ b/include/cuco/detail/common_kernels.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Calculates the number of filled slots for the given view. + * + * @tparam BlockSize Number of threads in each block + * @tparam View Type of non-owning view allowing access to map storage + * @tparam AtomicT Atomic counter type + * + * @param view Non-owning device view used to access to map storage + * @param count Number of filled slots + */ +template +__global__ void size(View view, AtomicT* count) +{ + using size_type = std::size_t; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + size_type thread_count = 0; + auto const n = view.get_capacity(); + + auto* slots = view.get_slots(); + + while (idx < n) { + auto const key = (slots + idx)->first.load(cuda::std::memory_order_relaxed); + thread_count += not(cuco::detail::bitwise_compare(key, view.get_empty_key_sentinel()) or + cuco::detail::bitwise_compare(key, view.get_erased_key_sentinel())); + idx += loop_stride; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + size_type const block_count = BlockReduce(temp_storage).Sum(thread_count); + if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index cd7f8c079..e6cdd27d8 100644 --- 
a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -15,7 +15,10 @@ */ #include +#include #include +#include +#include #include #include @@ -89,11 +92,11 @@ static_map::~static_map() template template -void static_map::insert( +std::size_t static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + if (num_keys == 0) { return 0; } auto const block_size = 128; auto const stride = 1; @@ -113,7 +116,7 @@ void static_map::insert( CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - size_ += h_num_successes; + return h_num_successes; } template @@ -122,16 +125,16 @@ template -void static_map::insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) +std::size_t static_map::insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + if (num_keys == 0) { return 0; } auto constexpr block_size = 128; auto constexpr stride = 1; @@ -150,12 +153,12 @@ void static_map::insert_if(InputIt first, &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - size_ += h_num_successes; + return h_num_successes; } template template -void static_map::erase( +std::size_t static_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), @@ -163,7 +166,7 @@ void static_map::erase( std::runtime_error); auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + if (num_keys == 0) { return 0; } auto constexpr 
block_size = 128; auto constexpr stride = 1; @@ -183,7 +186,7 @@ void static_map::erase( CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - size_ -= h_num_successes; + return h_num_successes; } template @@ -281,6 +284,34 @@ void static_map::contains(InputIt first, <<>>(first, num_keys, output_begin, view, hash, key_equal); } +template +std::size_t static_map::get_size(cudaStream_t stream) const noexcept +{ + auto view = get_device_view(); + auto counter = + experimental::detail::counter_storage{slot_allocator_}; + counter.reset(stream); + + auto const grid_size = + (this->get_capacity() + + experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE - + 1) / + (experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE); + + // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to + // v2.1.0 + detail::size + <<>>(view, counter.data()); + + return counter.load_to_host(stream); +} + +template +float static_map::get_load_factor(cudaStream_t stream) const noexcept +{ + return static_cast(this->get_size(stream)) / capacity_; +} + template template __device__ static_map::device_mutable_view::insert_result diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 9e328898d..7bb7e7ea9 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -63,7 +63,8 @@ class static_multimap::device_view_ Value empty_value_sentinel) noexcept : probe_sequence_{slots, capacity}, empty_key_sentinel_{empty_key_sentinel}, - empty_value_sentinel_{empty_value_sentinel} + empty_value_sentinel_{empty_value_sentinel}, + erased_key_sentinel_{empty_key_sentinel} { } @@ -172,6 +173,13 @@ class static_multimap::device_view_ return empty_value_sentinel_; } + /** + * @brief Gets the sentinel value used to 
represent an erased slot. + * + * @return The sentinel value used to represent an erased slot + */ + __host__ __device__ Key get_erased_key_sentinel() const noexcept { return erased_key_sentinel_; } + /** * @brief Gets slots array. * @@ -206,6 +214,7 @@ class static_multimap::device_view_ probe_sequence_type probe_sequence_; ///< Probe sequence used to probe the hash map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial Value of empty slot + Key erased_key_sentinel_{}; ///< Key value that represents an erased slot }; // class device_view_impl_base template +#include +#include #include #include @@ -906,10 +909,23 @@ template ::get_size( cudaStream_t stream) const noexcept { - auto begin = thrust::make_transform_iterator(raw_slots(), detail::slot_to_tuple{}); - auto filled = cuco::detail::slot_is_filled{get_empty_key_sentinel()}; - - return thrust::count_if(thrust::cuda::par.on(stream), begin, begin + get_capacity(), filled); + auto view = get_device_view(); + auto counter = + experimental::detail::counter_storage{slot_allocator_}; + counter.reset(stream); + + auto const grid_size = + (this->get_capacity() + + experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE - + 1) / + (experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE); + + // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to + // v2.1.0 + detail::size + <<>>(view, counter.data()); + + return counter.load_to_host(stream); } template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + std::size_t insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Inserts key/value pairs in the range `[first, last)` if `pred` @@ -271,6 +274,7 @@ 
class static_map { * argument type is convertible from std::iterator_traits::value_type * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param stencil Beginning of the stencil sequence @@ -279,19 +283,21 @@ class static_map { * @param hash The unary function to hash each key * @param key_equal The binary function to compare two keys for equality * @param stream CUDA stream used for insert + * + * @return Number of successful insertions */ template , typename KeyEqual = thrust::equal_to> - void insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + std::size_t insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Erases keys in the range `[first, last)`. @@ -307,27 +313,30 @@ class static_map { * * This function synchronizes `stream`. 
* + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + * * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels * - * @throw std::runtime_error if a unique erased key sentinel value was not - * provided at construction + * @return Number of successful erasures */ template , typename KeyEqual = thrust::equal_to> - void erase(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + std::size_t erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -1350,16 +1359,20 @@ class static_map { /** * @brief Gets the number of elements in the hash map. * + * @param stream Stream used for size computation + * * @return The number of elements in the map */ - std::size_t get_size() const noexcept { return size_; } + [[nodiscard]] std::size_t get_size(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the load factor of the hash map. * + * @param stream Stream used for load factor computation + * * @return The load factor of the hash map */ - float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } + [[nodiscard]] float get_load_factor(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the sentinel value used to represent an empty key slot. 
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index fe68da32b..be087a572 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -664,6 +664,16 @@ class static_multimap { return impl_.get_empty_value_sentinel(); } + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + __host__ __device__ __forceinline__ Key get_erased_key_sentinel() const noexcept + { + return impl_.get_erased_key_sentinel(); + } + protected: ViewImpl impl_; }; // class device_view_base @@ -1286,10 +1296,11 @@ class static_multimap { /** * @brief Gets the number of elements in the hash map. * - * @param stream CUDA stream used to get the number of inserted elements + * @param stream CUDA stream used for size computation + * * @return The number of elements in the map */ - std::size_t get_size(cudaStream_t stream = 0) const noexcept; + [[nodiscard]] std::size_t get_size(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the load factor of the hash map. @@ -1297,7 +1308,7 @@ class static_multimap { * @param stream CUDA stream used to get the load factor * @return The load factor of the hash map */ - float get_load_factor(cudaStream_t stream = 0) const noexcept; + [[nodiscard]] float get_load_factor(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the sentinel value used to represent an empty key slot. 
@@ -1342,9 +1353,11 @@ class static_multimap { } private: - std::size_t capacity_{}; ///< Total number of slots - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot + // TODO multimap erase + Key erased_key_sentinel_{}; ///< Key value that represents an erased slot slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters counter_deleter delete_counter_; ///< Custom counter deleter diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 1315a5cba..e4e316613 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -53,8 +53,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) REQUIRE(map.get_size() == num_keys); - map.erase(d_keys.begin(), d_keys.end()); + auto const size = map.erase(d_keys.begin(), d_keys.end()); + REQUIRE(size == num_keys); REQUIRE(map.get_size() == 0); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); From 8ced4b09e9849b7b793c95f6783e3ecda476b898 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 11 May 2023 13:56:59 -0700 Subject: [PATCH 110/152] Revert "Update `static_map` size computation" (#305) Reverts NVIDIA/cuCollections#301 Temporarily revert the changes for `size` computation since it breaks cudf. 
--- include/cuco/detail/common_kernels.cuh | 65 ------------------- include/cuco/detail/static_map.inl | 61 +++++------------ .../static_multimap/device_view_impl.inl | 11 +--- .../static_multimap/static_multimap.inl | 24 ++----- include/cuco/static_map.cuh | 55 ++++++---------- include/cuco/static_multimap.cuh | 25 ++----- tests/static_map/erase_test.cu | 3 +- 7 files changed, 48 insertions(+), 196 deletions(-) delete mode 100644 include/cuco/detail/common_kernels.cuh diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh deleted file mode 100644 index 62a259bd5..000000000 --- a/include/cuco/detail/common_kernels.cuh +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#include - -namespace cuco { -namespace detail { - -/** - * @brief Calculates the number of filled slots for the given view. 
- * - * @tparam BlockSize Number of threads in each block - * @tparam View Type of non-owning view allowing access to map storage - * @tparam AtomicT Atomic counter type - * - * @param view Non-owning device view used to access to map storage - * @param count Number of filled slots - */ -template -__global__ void size(View view, AtomicT* count) -{ - using size_type = std::size_t; - - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; - cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; - - size_type thread_count = 0; - auto const n = view.get_capacity(); - - auto* slots = view.get_slots(); - - while (idx < n) { - auto const key = (slots + idx)->first.load(cuda::std::memory_order_relaxed); - thread_count += not(cuco::detail::bitwise_compare(key, view.get_empty_key_sentinel()) or - cuco::detail::bitwise_compare(key, view.get_erased_key_sentinel())); - idx += loop_stride; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - size_type const block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } -} - -} // namespace detail -} // namespace cuco diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index e6cdd27d8..cd7f8c079 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -15,10 +15,7 @@ */ #include -#include #include -#include -#include #include #include @@ -92,11 +89,11 @@ static_map::~static_map() template template -std::size_t static_map::insert( +void static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } + if (num_keys == 0) { return; } auto const block_size = 128; auto const stride = 1; @@ -116,7 +113,7 @@ std::size_t static_map::insert( 
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - return h_num_successes; + size_ += h_num_successes; } template @@ -125,16 +122,16 @@ template -std::size_t static_map::insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash, - KeyEqual key_equal, - cudaStream_t stream) +void static_map::insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } + if (num_keys == 0) { return; } auto constexpr block_size = 128; auto constexpr stride = 1; @@ -153,12 +150,12 @@ std::size_t static_map::insert_if(InputIt first, &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - return h_num_successes; + size_ += h_num_successes; } template template -std::size_t static_map::erase( +void static_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), @@ -166,7 +163,7 @@ std::size_t static_map::erase( std::runtime_error); auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } + if (num_keys == 0) { return; } auto constexpr block_size = 128; auto constexpr stride = 1; @@ -186,7 +183,7 @@ std::size_t static_map::erase( CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated - return h_num_successes; + size_ -= h_num_successes; } template @@ -284,34 +281,6 @@ void static_map::contains(InputIt first, <<>>(first, num_keys, output_begin, view, hash, key_equal); } -template -std::size_t static_map::get_size(cudaStream_t stream) const noexcept -{ - auto view = get_device_view(); - auto counter = - experimental::detail::counter_storage{slot_allocator_}; - 
counter.reset(stream); - - auto const grid_size = - (this->get_capacity() + - experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE - - 1) / - (experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE); - - // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to - // v2.1.0 - detail::size - <<>>(view, counter.data()); - - return counter.load_to_host(stream); -} - -template -float static_map::get_load_factor(cudaStream_t stream) const noexcept -{ - return static_cast(this->get_size(stream)) / capacity_; -} - template template __device__ static_map::device_mutable_view::insert_result diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 7bb7e7ea9..9e328898d 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -63,8 +63,7 @@ class static_multimap::device_view_ Value empty_value_sentinel) noexcept : probe_sequence_{slots, capacity}, empty_key_sentinel_{empty_key_sentinel}, - empty_value_sentinel_{empty_value_sentinel}, - erased_key_sentinel_{empty_key_sentinel} + empty_value_sentinel_{empty_value_sentinel} { } @@ -173,13 +172,6 @@ class static_multimap::device_view_ return empty_value_sentinel_; } - /** - * @brief Gets the sentinel value used to represent an erased slot. - * - * @return The sentinel value used to represent an erased slot - */ - __host__ __device__ Key get_erased_key_sentinel() const noexcept { return erased_key_sentinel_; } - /** * @brief Gets slots array. 
* @@ -214,7 +206,6 @@ class static_multimap::device_view_ probe_sequence_type probe_sequence_; ///< Probe sequence used to probe the hash map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot Value empty_value_sentinel_{}; ///< Initial Value of empty slot - Key erased_key_sentinel_{}; ///< Key value that represents an erased slot }; // class device_view_impl_base template -#include -#include #include #include @@ -909,23 +906,10 @@ template ::get_size( cudaStream_t stream) const noexcept { - auto view = get_device_view(); - auto counter = - experimental::detail::counter_storage{slot_allocator_}; - counter.reset(stream); - - auto const grid_size = - (this->get_capacity() + - experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE - - 1) / - (experimental::detail::CUCO_DEFAULT_STRIDE * experimental::detail::CUCO_DEFAULT_BLOCK_SIZE); - - // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to - // v2.1.0 - detail::size - <<>>(view, counter.data()); - - return counter.load_to_host(stream); + auto begin = thrust::make_transform_iterator(raw_slots(), detail::slot_to_tuple{}); + auto filled = cuco::detail::slot_is_filled{get_empty_key_sentinel()}; + + return thrust::count_if(thrust::cuda::par.on(stream), begin, begin + get_capacity(), filled); } template , typename KeyEqual = thrust::equal_to> - std::size_t insert(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Inserts key/value pairs in the range `[first, last)` if `pred` @@ -274,7 +271,6 @@ class static_map { * argument type is convertible from std::iterator_traits::value_type * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type - * * @param first Beginning of the sequence of key/value pairs * @param 
last End of the sequence of key/value pairs * @param stencil Beginning of the stencil sequence @@ -283,21 +279,19 @@ class static_map { * @param hash The unary function to hash each key * @param key_equal The binary function to compare two keys for equality * @param stream CUDA stream used for insert - * - * @return Number of successful insertions */ template , typename KeyEqual = thrust::equal_to> - std::size_t insert_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Erases keys in the range `[first, last)`. @@ -313,30 +307,27 @@ class static_map { * * This function synchronizes `stream`. * - * @throw std::runtime_error if a unique erased key sentinel value was not - * provided at construction - * * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the map's `value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type - * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality * @param stream Stream used for executing the kernels * - * @return Number of successful erasures + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction */ template , typename KeyEqual = thrust::equal_to> - std::size_t erase(InputIt first, - InputIt last, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0); + void erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = 0); /** * @brief Finds the values corresponding to all keys in the 
range `[first, last)`. @@ -1359,20 +1350,16 @@ class static_map { /** * @brief Gets the number of elements in the hash map. * - * @param stream Stream used for size computation - * * @return The number of elements in the map */ - [[nodiscard]] std::size_t get_size(cudaStream_t stream = 0) const noexcept; + std::size_t get_size() const noexcept { return size_; } /** * @brief Gets the load factor of the hash map. * - * @param stream Stream used for load factor computation - * * @return The load factor of the hash map */ - [[nodiscard]] float get_load_factor(cudaStream_t stream = 0) const noexcept; + float get_load_factor() const noexcept { return static_cast(size_) / capacity_; } /** * @brief Gets the sentinel value used to represent an empty key slot. diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index be087a572..fe68da32b 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -664,16 +664,6 @@ class static_multimap { return impl_.get_empty_value_sentinel(); } - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_erased_key_sentinel() const noexcept - { - return impl_.get_erased_key_sentinel(); - } - protected: ViewImpl impl_; }; // class device_view_base @@ -1296,11 +1286,10 @@ class static_multimap { /** * @brief Gets the number of elements in the hash map. * - * @param stream CUDA stream used for size computation - * + * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the map */ - [[nodiscard]] std::size_t get_size(cudaStream_t stream = 0) const noexcept; + std::size_t get_size(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the load factor of the hash map. 
@@ -1308,7 +1297,7 @@ class static_multimap { * @param stream CUDA stream used to get the load factor * @return The load factor of the hash map */ - [[nodiscard]] float get_load_factor(cudaStream_t stream = 0) const noexcept; + float get_load_factor(cudaStream_t stream = 0) const noexcept; /** * @brief Gets the sentinel value used to represent an empty key slot. @@ -1353,11 +1342,9 @@ class static_multimap { } private: - std::size_t capacity_{}; ///< Total number of slots - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot - // TODO multimap erase - Key erased_key_sentinel_{}; ///< Key value that represents an erased slot + std::size_t capacity_{}; ///< Total number of slots + Key empty_key_sentinel_{}; ///< Key value that represents an empty slot + Value empty_value_sentinel_{}; ///< Initial value of empty slot slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters counter_deleter delete_counter_; ///< Custom counter deleter diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index e4e316613..1315a5cba 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -53,9 +53,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) REQUIRE(map.get_size() == num_keys); - auto const size = map.erase(d_keys.begin(), d_keys.end()); + map.erase(d_keys.begin(), d_keys.end()); - REQUIRE(size == num_keys); REQUIRE(map.get_size() == 0); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); From f01e6e247fd8215bcf6c2f43d28f529226c59b21 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 11 May 2023 13:57:32 -0700 Subject: [PATCH 111/152] Make size type a template parameter to get rid of narrow conversions (#306) This PR fixes a narrow conversion bug unveiled by cudf CI. 
--- include/cuco/detail/storage/aow_storage.cuh | 3 ++- include/cuco/detail/storage/counter_storage.cuh | 3 ++- include/cuco/detail/storage/storage_base.cuh | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index 6d24cd832..e2c031096 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -272,7 +272,8 @@ class aow_storage : public aow_storage_base { typename std::allocator_traits::rebind_alloc; ///< Type of the ///< allocator to ///< (de)allocate windows - using window_deleter_type = custom_deleter; ///< Type of window deleter + using window_deleter_type = + custom_deleter; ///< Type of window deleter using ref_type = aow_storage_ref; ///< Storage ref type /** diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index 021e530d9..12c963530 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -44,7 +44,8 @@ class counter_storage : public storage_base; ///< Type of the counter using allocator_type = typename std::allocator_traits::rebind_alloc< value_type>; ///< Type of the allocator to (de)allocate counter - using counter_deleter_type = custom_deleter; ///< Type of counter deleter + using counter_deleter_type = + custom_deleter; ///< Type of counter deleter /** * @brief Constructor of counter storage. diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh index dec443dce..ada0726db 100644 --- a/include/cuco/detail/storage/storage_base.cuh +++ b/include/cuco/detail/storage/storage_base.cuh @@ -24,9 +24,10 @@ namespace detail { /** * @brief Custom deleter for unique pointer. 
* + * @tparam SizeType Type of device storage size * @tparam Allocator Type of allocator used for device storage */ -template +template struct custom_deleter { using pointer = typename Allocator::value_type*; ///< Value pointer type @@ -36,7 +37,7 @@ struct custom_deleter { * @param size Number of values to deallocate * @param allocator Allocator used for deallocating device storage */ - explicit constexpr custom_deleter(std::size_t size, Allocator& allocator) + explicit constexpr custom_deleter(SizeType size, Allocator& allocator) : size_{size}, allocator_{allocator} { } @@ -48,7 +49,7 @@ struct custom_deleter { */ void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } - std::size_t size_; ///< Number of values to delete + SizeType size_; ///< Number of values to delete Allocator& allocator_; ///< Allocator used deallocating values }; From 546ca606a17f480fa9d58d1752cce2aad6575bc4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 12 May 2023 09:19:52 -0700 Subject: [PATCH 112/152] Fix narrow conversion issues in probing iterator (#307) Similar to #306 This PR fixes narrow conversions inside the probing iterator. Corresponding tests are added to exercise this issue. 
--- include/cuco/detail/probing_scheme_impl.inl | 19 +++++++++++++------ tests/static_set/unique_sequence_test.cu | 12 +++++++----- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl index 5a5c5ae23..4b617a133 100644 --- a/include/cuco/detail/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -96,7 +96,8 @@ template __host__ __device__ constexpr auto linear_probing::operator()( ProbeKey const& probe_key, Extent upper_bound) const noexcept { - return detail::probing_iterator{hash_(probe_key) % upper_bound, + using size_type = typename Extent::value_type; + return detail::probing_iterator{static_cast(hash_(probe_key) % upper_bound), 1, // step size is 1 upper_bound}; } @@ -108,8 +109,11 @@ __host__ __device__ constexpr auto linear_probing::operator()( ProbeKey const& probe_key, Extent upper_bound) const noexcept { + using size_type = typename Extent::value_type; return detail::probing_iterator{ - (hash_(probe_key) + g.thread_rank()) % upper_bound, cg_size, upper_bound}; + static_cast((hash_(probe_key) + g.thread_rank()) % upper_bound), + cg_size, + upper_bound}; } template @@ -124,9 +128,11 @@ template __host__ __device__ constexpr auto double_hashing::operator()( ProbeKey const& probe_key, Extent upper_bound) const noexcept { + using size_type = typename Extent::value_type; return detail::probing_iterator{ - hash1_(probe_key) % upper_bound, - hash2_(probe_key) % (upper_bound - 1) + 1, // step size in range [1, prime - 1] + static_cast(hash1_(probe_key) % upper_bound), + static_cast(hash2_(probe_key) % (upper_bound - 1) + + 1), // step size in range [1, prime - 1] upper_bound}; } @@ -137,9 +143,10 @@ __host__ __device__ constexpr auto double_hashing::operato ProbeKey const& probe_key, Extent upper_bound) const noexcept { + using size_type = typename Extent::value_type; return detail::probing_iterator{ - (hash1_(probe_key) + g.thread_rank()) 
% upper_bound, - (hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size, + static_cast((hash1_(probe_key) + g.thread_rank()) % upper_bound), + static_cast((hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size), upper_bound}; } } // namespace experimental diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 7468e90f3..7285577bf 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -29,8 +29,10 @@ #include +using size_type = int32_t; + template -__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +__inline__ void test_unique_sequence(Set& set, size_type num_keys) { using Key = typename Set::key_type; @@ -125,9 +127,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, cuco::test::probe_sequence::linear_probing, 2)) { - constexpr std::size_t num_keys{400}; - auto constexpr gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 - : 412 // 103 x 2 x 2 + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 
422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 ; using probe = @@ -138,7 +140,7 @@ TEMPLATE_TEST_CASE_SIG( cuco::murmurhash3_32>>; auto set = cuco::experimental::static_set, + cuco::experimental::extent, cuda::thread_scope_device, thrust::equal_to, probe, From fba86cda23d6a34d00ffebabb40f2949ec3a2cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 16 May 2023 21:41:31 +0200 Subject: [PATCH 113/152] Extend macros for error checking and int128 support (#308) --- README.md | 20 ++++++++++---------- include/cuco/detail/__config | 24 +++++++++++++++++++++--- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ef294f838..9bf9ae777 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ Doxygen Documentation (TODO) -`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. +`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. -Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. +Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. ## Development Status -`cuCollections` is still under heavy development. 
Users should expect breaking changes and refactoring to be common. +`cuCollections` is still under heavy development. Users should expect breaking changes and refactoring to be common. ## Getting cuCollections @@ -21,7 +21,7 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith `cuCollections` is designed to make it easy to include within another CMake project. The `CMakeLists.txt` exports a `cuco` target that can be linked[1](#link-footnote) - into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. + into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. We recommend using [CMake Package Manager (CPM)](https://github.com/TheLartians/CPM.cmake) to fetch `cuCollections` into your project. @@ -47,12 +47,12 @@ target_link_libraries(my_library cuco) This will take care of downloading `cuCollections` from GitHub and making the headers available in a location that can be found by CMake. Linking against the `cuco` target will provide everything needed for `cuco` to be used by the `my_library` target. -1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. +1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. ## Requirements -- `nvcc 11+` +- `nvcc 11.5+` - C++17 -- Volta+ +- Volta+ - Pascal is partially supported. Any data structures that require blocking algorithms are not supported. See [libcu++](https://nvidia.github.io/libcudacxx/setup/requirements.html#device-architectures) documentation for more details. 
## Dependencies @@ -67,7 +67,7 @@ No action is required from the user to satisfy these dependencies. `cuCollection ## Building cuCollections -Since `cuCollections` is header-only, there is nothing to build to use it. +Since `cuCollections` is header-only, there is nothing to build to use it. To build the tests, benchmarks, and examples: @@ -75,7 +75,7 @@ To build the tests, benchmarks, and examples: cd $CUCO_ROOT mkdir -p build cd build -cmake .. +cmake .. make ``` Binaries will be built into: @@ -179,7 +179,7 @@ class example_class { ## Data Structures -We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. ### `static_set` diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index c76a1bbef..07dec5e50 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,23 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include +#include + +#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) +#error "NVCC version not found" +#elif __CUDACC_VER_MAJOR__ < 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 5) +#error "NVCC version 11.5 or later is required" +#endif + +#if !defined(__CUDACC_RELAXED_CONSTEXPR__) +#error "Support for relaxed constexpr is required" +#endif + +#if !defined(__CUDACC_EXTENDED_LAMBDA__) +#error "Support for extended device lambdas is required" +#endif // WAR for libcudacxx/296 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__) @@ -32,3 +46,7 @@ #if (CUCO_CUDA_MINIMUM_ARCH >= 700) #define CUCO_HAS_INDEPENDENT_THREADS #endif + +#if defined(__SIZEOF_INT128__) +#define CUCO_HAS_INT128 +#endif \ No newline at end of file From 64b61712b665e30a60dfc9d2039af23cd26e0ab7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 23 May 2023 01:17:10 +0200 Subject: [PATCH 114/152] Utility class for fast integer division/modulo (#309) This PR introduces a new utility class `fast_int` which provides optimized integer division/modulo by precomputing a set of magic numbers. --- include/cuco/utility/fast_int.cuh | 150 ++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 3 +- tests/utility/fast_int_test.cu | 62 ++++++++++++ 3 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 include/cuco/utility/fast_int.cuh create mode 100644 tests/utility/fast_int_test.cu diff --git a/include/cuco/utility/fast_int.cuh b/include/cuco/utility/fast_int.cuh new file mode 100644 index 000000000..f9852ff17 --- /dev/null +++ b/include/cuco/utility/fast_int.cuh @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cuco::utility { + +/** + * @brief Integer type with optimized division and modulo operators. + * + * @tparam T Underlying integer type + */ +template +struct fast_int { + static_assert(cuda::std::is_same_v or cuda::std::is_same_v +#if defined(CUCO_HAS_INT128) + or cuda::std::is_same_v or cuda::std::is_same_v +#endif + , + "Unsupported integer type"); + + using value_type = T; ///< Underlying integer type + + /** + * @brief Constructs a fast_int from an integer value. + * + * @param value Integer value + */ + __host__ __device__ explicit constexpr fast_int(T value) noexcept : value_{value} + { + evaluate_magic_numbers(); + } + + /** + * @brief Explicit conversion operator to the underlying value type. + * + * @return Underlying value + */ + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value_; } + + private: + using intermediate_type = + cuda::std::conditional_t; ///< Intermediate type for multiplication + using unsigned_value_type = cuda::std::make_unsigned_t; ///< Unsigned value type + using signed_value_type = cuda::std::make_signed_t; ///< Signed value type + + static constexpr value_type value_bits = + CHAR_BIT * sizeof(value_type); ///< Number of bits required to represent the value + + /** + * @brief Computes the high bits of the multiplication of two unsigned integers. 
+ * + * @param lhs Left-hand side of the multiplication + * @param rhs Right-hand side of the multiplication + * + * @return High bits of the multiplication + */ + __host__ __device__ constexpr value_type mulhi(unsigned_value_type lhs, + unsigned_value_type rhs) const noexcept + { +#if defined(__CUDA_ARCH__) + if constexpr (sizeof(value_type) == 4) { + return __umulhi(lhs, rhs); + } else { + return __umul64hi(lhs, rhs); + } +#else + return (intermediate_type(lhs) * intermediate_type(rhs)) >> value_bits; +#endif + } + + /** + * @brief Computes the log2 of an unsigned integer. + * + * @param v Unsigned integer + * + * @return Log2 of the unsigned integer + */ + __host__ __device__ constexpr value_type log2(value_type v) const noexcept + { + return cuda::std::bit_width(unsigned_value_type(v)) - 1; + } + + /** + * @brief Computes the magic numbers for the fast division. + */ + __host__ __device__ constexpr void evaluate_magic_numbers() noexcept + { + // TODO assert(value_ > 0); + auto const val_log2 = this->log2(value_); + + // if value_ is a power of 2, we can use a simple shift + if (cuda::std::has_single_bit(unsigned_value_type(value_))) { + magic_ = 0; + shift_ = val_log2; + } else { + auto upper = intermediate_type(1) << value_bits; + auto lower = intermediate_type(1); + auto const lval = intermediate_type(value_); + + // compute the magic number and shift; see "Hacker's Delight" by Henry S. 
Warren, Jr., 10-2 + for (shift_ = 0; shift_ < val_log2; ++shift_, upper <<= 1, lower <<= 1) { + if ((upper % lval) <= lower) { break; } + } + magic_ = upper / lval; + } + } + + value_type value_; ///< Underlying integer value + value_type magic_; ///< Magic number for fast division + value_type shift_; ///< Shift for fast division + + friend __host__ __device__ constexpr value_type operator/(value_type lhs, + fast_int const& rhs) noexcept + { + if (rhs.value_ == 1) { return lhs; } // edge case for value_ == 1 + if (rhs.magic_ == 0) { return lhs >> rhs.shift_; } // edge case for value_ == pow2 + auto const mul = (lhs == cuda::std::numeric_limits::max()) ? lhs : lhs + 1; + return rhs.mulhi(rhs.magic_, mul) >> rhs.shift_; + } + + friend __host__ __device__ constexpr value_type operator%(value_type lhs, + fast_int const& rhs) noexcept + { + return lhs - (lhs / rhs) * rhs.value_; + } +}; +} // namespace cuco::utility \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9a1da8772..335885ddf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -51,7 +51,8 @@ endfunction(ConfigureTest) # - utility tests --------------------------------------------------------------------------------- ConfigureTest(UTILITY_TEST utility/extent_test.cu - utility/storage_test.cu) + utility/storage_test.cu + utility/fast_int_test.cu) ################################################################################################### # - static_set tests ------------------------------------------------------------------------------ diff --git a/tests/utility/fast_int_test.cu b/tests/utility/fast_int_test.cu new file mode 100644 index 000000000..c780293f9 --- /dev/null +++ b/tests/utility/fast_int_test.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include +#include + +TEMPLATE_TEST_CASE( + "utility::fast_int tests", "", std::int32_t, std::uint32_t, std::int64_t, std::uint64_t) +{ + TestType value = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + TestType lhs = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + constexpr auto max_value = std::numeric_limits::max(); + + cuco::utility::fast_int fast_value{value}; + + SECTION("Should be explicitly convertible to the underlying integer type.") + { + REQUIRE(static_cast(fast_value) == value); + } + + SECTION("Fast div/mod should produce correct result.") + { + INFO(lhs << " /% " << value); + REQUIRE(lhs / fast_value == lhs / value); + REQUIRE(lhs % fast_value == lhs % value); + } + + SECTION("Fast div/mod with maximum rhs value should produce correct result.") + { + INFO(lhs << " /% " << max_value); + cuco::utility::fast_int fast_max{max_value}; + REQUIRE(lhs / fast_max == lhs / max_value); + REQUIRE(lhs % fast_max == lhs % max_value); + } + + SECTION("Fast div/mod with maximum lhs value should produce correct result.") + { + INFO(max_value << " /% " << value); + REQUIRE(max_value / fast_value == max_value / value); + REQUIRE(max_value % fast_value == max_value % value); + } +} From f4e2b2574434b8d0d82596249cefb41575b99dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 25 May 2023 19:41:38 +0200 Subject: [PATCH 115/152] Create ops-bot.yaml (#311) This file enables the NV organization features required for the new Github 
Actions workflow. --- .github/ops-bot.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .github/ops-bot.yaml diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml new file mode 100644 index 000000000..84bbe71f4 --- /dev/null +++ b/.github/ops-bot.yaml @@ -0,0 +1,4 @@ +# This file controls which features from the `ops-bot` repository below are enabled. +# - https://github.com/rapidsai/ops-bot + +copy_prs: true From 8e4902295e225a721b3352ffdf01db543dd52aa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 26 May 2023 00:03:39 +0200 Subject: [PATCH 116/152] New hash function alternatives (#310) This PR introduces a set of new hash function alternatives, namely - `xxhash_32` and `xxhash_64`: 32- and 64-bit versions of the famous [xxHash](https://github.com/Cyan4973/xxHash) hash function. - `fmix_32` and `fmix_64`: 32- and 64-bit versions of the Murmur3 integer/avalanche finalizer. Closes #290 --------- Co-authored-by: Yunsong Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- benchmarks/CMakeLists.txt | 3 + benchmarks/defaults.hpp | 4 +- benchmarks/hash_bench.cu | 105 ++++++ .../hash_table/static_set/contains_bench.cu | 7 + .../hash_table/static_set/find_bench.cu | 7 + .../murmurhash3.cuh} | 111 +++++- include/cuco/detail/hash_functions/xxhash.cuh | 354 ++++++++++++++++++ include/cuco/detail/probe_sequence_impl.cuh | 1 - include/cuco/hash_functions.cuh | 43 ++- tests/CMakeLists.txt | 3 +- tests/utility/hash_test.cu | 157 ++++++++ 11 files changed, 772 insertions(+), 23 deletions(-) create mode 100644 benchmarks/hash_bench.cu rename include/cuco/detail/{hash_functions.cuh => hash_functions/murmurhash3.cuh} (55%) create mode 100644 include/cuco/detail/hash_functions/xxhash.cuh create mode 100644 tests/utility/hash_test.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index d75052d3e..b094b84b1 100644 --- 
a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -78,3 +78,6 @@ ConfigureBench(DYNAMIC_MAP_BENCH hash_table/dynamic_map/find_bench.cu hash_table/dynamic_map/contains_bench.cu hash_table/dynamic_map/erase_bench.cu) + +ConfigureBench(HASH_BENCH + hash_bench.cu) diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp index d9d35b9a9..22e4f5338 100644 --- a/benchmarks/defaults.hpp +++ b/benchmarks/defaults.hpp @@ -35,7 +35,9 @@ auto constexpr SKEW = 0.5; auto constexpr BATCH_SIZE = 1'000'000; auto constexpr INITIAL_SIZE = 50'000'000; -auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); +auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); +auto const N_RANGE_CACHE = + std::vector{8'000, 80'000, 800'000, 8'000'000, 80'000'000}; auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu new file mode 100644 index 000000000..2d8f13e09 --- /dev/null +++ b/benchmarks/hash_bench.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t seed) noexcept + { +#pragma unroll Words + for (int32_t i = 0; i < Words; ++i) { + data_[i] = seed; + } + } + + private: + int32_t data_[Words]; +}; + +template +__global__ void hash_bench_kernel(Hasher hash, + cuco::detail::index_type n, + OutputIt out, + bool materialize_result) +{ + cuco::detail::index_type const gid = BlockSize * blockIdx.x + threadIdx.x; + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = gid; + typename Hasher::result_type agg = 0; + + while (idx < n) { + typename Hasher::argument_type key(idx); + for (int32_t i = 0; i < 100; ++i) { // execute hash func 100 times + agg += hash(key); + } + idx += loop_stride; + } + + if (materialize_result) { out[gid] = agg; } +} + +/** + * @brief A benchmark evaluating performance of various hash functions + */ +template +void hash_eval(nvbench::state& state, nvbench::type_list) +{ + bool const materialize_result = false; + constexpr auto block_size = 128; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N * 10); + auto const grid_size = SDIV(num_keys, block_size * 16); + + thrust::device_vector hash_values((materialize_result) ? 
num_keys + : 1); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + hash_bench_kernel<<>>( + Hash{}, num_keys, hash_values.begin(), materialize_result); + }); +} + +NVBENCH_BENCH_TYPES( + hash_eval, + NVBENCH_TYPE_AXES(nvbench::type_list, + cuco::murmurhash3_32, + cuco::murmurhash3_32>, // 32*4bytes + cuco::xxhash_32, + cuco::xxhash_32, + cuco::xxhash_32>, + cuco::xxhash_64, + cuco::xxhash_64, + cuco::xxhash_64>, + cuco::murmurhash3_fmix_32, + cuco::murmurhash3_fmix_64>)) + .set_name("hash_function_eval") + .set_type_axes_names({"Hash"}) + .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu index 697b98574..35362ed9e 100644 --- a/benchmarks/hash_table/static_set/contains_bench.cu +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -73,3 +73,10 @@ NVBENCH_BENCH_TYPES(static_set_contains, .set_type_axes_names({"Key", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_constains_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/benchmarks/hash_table/static_set/find_bench.cu b/benchmarks/hash_table/static_set/find_bench.cu index 53450f771..e0ab9111c 100644 --- a/benchmarks/hash_table/static_set/find_bench.cu +++ b/benchmarks/hash_table/static_set/find_bench.cu @@ -75,3 +75,10 @@ NVBENCH_BENCH_TYPES(static_set_find, .set_type_axes_names({"Key", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_set_find_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/include/cuco/detail/hash_functions.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh similarity index 55% rename from include/cuco/detail/hash_functions.cuh rename to include/cuco/detail/hash_functions/murmurhash3.cuh index 3c3f7403a..ce5ab9d56 100644 --- a/include/cuco/detail/hash_functions.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,94 @@ #pragma once +#include + namespace cuco::detail { +/** + * @brief The 32bit integer finalizer hash function of `MurmurHash3`. + * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix32 { + static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix32(uint32_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. 
+ * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + uint32_t h = static_cast(key) ^ seed_; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; + } + + private: + uint32_t seed_; +}; + +/** + * @brief The 64bit integer finalizer hash function of `MurmurHash3`. + * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix64 { + static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = uint64_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix64(uint64_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + uint64_t h = static_cast(key) ^ seed_; + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + return h; + } + + private: + uint64_t seed_; +}; + /** * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. * @@ -38,15 +124,12 @@ struct MurmurHash3_32 { using argument_type = Key; ///< The type of the values taken as argument using result_type = uint32_t; ///< The type of the hash values produced - /// Default constructor - __host__ __device__ constexpr MurmurHash3_32() : MurmurHash3_32{0} {} - /** * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. 
* * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} + __host__ __device__ constexpr MurmurHash3_32(uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -60,7 +143,7 @@ struct MurmurHash3_32 { const uint8_t* const data = (const uint8_t*)&key; constexpr int nblocks = len / 4; - uint32_t h1 = m_seed; + uint32_t h1 = seed_; constexpr uint32_t c1 = 0xcc9e2d51; constexpr uint32_t c2 = 0x1b873593; //---------- @@ -92,7 +175,7 @@ struct MurmurHash3_32 { //---------- // finalization h1 ^= len; - h1 = fmix32(h1); + h1 = fmix32_(h1); return h1; } @@ -102,16 +185,8 @@ struct MurmurHash3_32 { return (x << r) | (x >> (32 - r)); } - constexpr __host__ __device__ uint32_t fmix32(uint32_t h) const noexcept - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - uint32_t m_seed; + MurmurHash3_fmix32 fmix32_; + uint32_t seed_; }; -} // namespace cuco::detail +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh new file mode 100644 index 000000000..d98116997 --- /dev/null +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuco::detail { + +/** + * @brief A `XXHash_32` hash function to hash the given argument on host and device. + * + * XXHash_32 implementation from + * https://github.com/Cyan4973/xxHash + * ----------------------------------------------------------------------------- + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * @tparam Key The type of the values to hash + */ +template +struct XXHash_32 { + private: + static constexpr uint32_t prime1 = 0x9E3779B1U; + static constexpr uint32_t prime2 = 0x85EBCA77U; + static constexpr uint32_t prime3 = 0xC2B2AE3DU; + static constexpr uint32_t prime4 = 0x27D4EB2FU; + static constexpr uint32_t prime5 = 0x165667B1U; + + public: + using argument_type = Key; ///< The type of the values taken as argument + using result_type = uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a XXH32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr XXHash_32(uint32_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + // TODO do we need to add checks/hints for alignment? 
+ constexpr auto nbytes = sizeof(Key); + char const* const bytes = (char const*)&key; ///< per-byte access + uint32_t const* const blocks = (uint32_t const*)&key; ///< 4-byte word access + + uint32_t offset = 0; + uint32_t h32; + + // data can be processed in 16-byte chunks + if constexpr (nbytes >= 16) { + constexpr auto limit = nbytes - 16; + uint32_t v1 = seed_ + prime1 + prime2; + uint32_t v2 = seed_ + prime2; + uint32_t v3 = seed_; + uint32_t v4 = seed_ - prime1; + + do { + // pipeline 4*4byte computations + auto const pipeline_offset = offset / 4; + v1 += blocks[pipeline_offset] * prime2; + v1 = rotl(v1, 13); + v1 *= prime1; + v2 += blocks[pipeline_offset + 1] * prime2; + v2 = rotl(v2, 13); + v2 *= prime1; + v3 += blocks[pipeline_offset + 2] * prime2; + v3 = rotl(v3, 13); + v3 *= prime1; + v4 += blocks[pipeline_offset + 3] * prime2; + v4 = rotl(v4, 13); + v4 *= prime1; + offset += 16; + } while (offset <= limit); + + h32 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18); + } else { + h32 = seed_ + prime5; + } + + h32 += nbytes; + + // remaining data can be processed in 4-byte chunks + if constexpr ((nbytes % 16) >= 4) { + for (; offset <= nbytes - 4; offset += 4) { + h32 += blocks[offset / 4] * prime3; + h32 = rotl(h32, 17) * prime4; + } + } + + // the following loop is only needed if the size of the key is no multiple of the block size + if constexpr (nbytes % 4) { + while (offset < nbytes) { + h32 += (bytes[offset] & 255) * prime5; + h32 = rotl(h32, 11) * prime1; + ++offset; + } + } + + return finalize(h32); + } + + private: + constexpr __host__ __device__ uint32_t rotl(uint32_t h, int8_t r) const noexcept + { + return ((h << r) | (h >> (32 - r))); + } + + // avalanche helper + constexpr __host__ __device__ uint32_t finalize(uint32_t h) const noexcept + { + h ^= h >> 15; + h *= prime2; + h ^= h >> 13; + h *= prime3; + h ^= h >> 16; + return h; + } + + uint32_t seed_; +}; + +/** + * @brief A `XXHash_64` hash function to hash the given argument on 
host and device. + * + * XXHash_64 implementation from + * https://github.com/Cyan4973/xxHash + * ----------------------------------------------------------------------------- + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + * + * @tparam Key The type of the values to hash + */ +template +struct XXHash_64 { + private: + static constexpr uint64_t prime1 = 11400714785074694791ULL; + static constexpr uint64_t prime2 = 14029467366897019727ULL; + static constexpr uint64_t prime3 = 1609587929392839161ULL; + static constexpr uint64_t prime4 = 9650029242287828579ULL; + static constexpr uint64_t prime5 = 2870177450012600261ULL; + + public: + using argument_type = Key; ///< The type of the values taken as argument + using result_type = uint64_t; ///< The type of the hash values produced + + /** + * @brief Constructs a XXH64 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr XXHash_64(uint64_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + // TODO do we need to add checks/hints for alignment? 
+ constexpr auto nbytes = sizeof(Key); + char const* const bytes = (char const*)&key; ///< per-byte access + uint32_t const* const blocks4 = (uint32_t const*)&key; ///< 4-byte word access + uint64_t const* const blocks8 = (uint64_t const*)&key; ///< 8-byte word access + + uint64_t offset = 0; + uint64_t h64; + + // data can be processed in 32-byte chunks + if constexpr (nbytes >= 32) { + constexpr auto limit = nbytes - 32; + uint64_t v1 = seed_ + prime1 + prime2; + uint64_t v2 = seed_ + prime2; + uint64_t v3 = seed_; + uint64_t v4 = seed_ - prime1; + + do { + // pipeline 4*8byte computations + auto const pipeline_offset = offset / 8; + v1 += blocks8[pipeline_offset] * prime2; + v1 = rotl(v1, 31); + v1 *= prime1; + v2 += blocks8[pipeline_offset + 1] * prime2; + v2 = rotl(v2, 31); + v2 *= prime1; + v3 += blocks8[pipeline_offset + 2] * prime2; + v3 = rotl(v3, 31); + v3 *= prime1; + v4 += blocks8[pipeline_offset + 3] * prime2; + v4 = rotl(v4, 31); + v4 *= prime1; + offset += 32; + } while (offset <= limit); + + h64 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18); + + v1 *= prime2; + v1 = rotl(v1, 31); + v1 *= prime1; + h64 ^= v1; + h64 = h64 * prime1 + prime4; + + v2 *= prime2; + v2 = rotl(v2, 31); + v2 *= prime1; + h64 ^= v2; + h64 = h64 * prime1 + prime4; + + v3 *= prime2; + v3 = rotl(v3, 31); + v3 *= prime1; + h64 ^= v3; + h64 = h64 * prime1 + prime4; + + v4 *= prime2; + v4 = rotl(v4, 31); + v4 *= prime1; + h64 ^= v4; + h64 = h64 * prime1 + prime4; + } else { + h64 = seed_ + prime5; + } + + h64 += nbytes; + + // remaining data can be processed in 8-byte chunks + if constexpr ((nbytes % 32) >= 8) { + for (; offset <= nbytes - 8; offset += 8) { + uint64_t k1 = blocks8[offset / 8] * prime2; + k1 = rotl(k1, 31) * prime1; + h64 ^= k1; + h64 = rotl(h64, 27) * prime1 + prime4; + } + } + + // remaining data can be processed in 4-byte chunks + if constexpr (((nbytes % 32) % 8) >= 4) { + for (; offset <= nbytes - 4; offset += 4) { + h64 ^= (blocks4[offset / 4] & 
0xFFFFFFFFULL) * prime1; + h64 = rotl(h64, 23) * prime2 + prime3; + } + } + + // the following loop is only needed if the size of the key is no multiple of a previous block + // size + if constexpr (nbytes % 4) { + while (offset < nbytes) { + h64 += (bytes[offset] & 0xFF) * prime5; + h64 = rotl(h64, 11) * prime1; + ++offset; + } + } + return finalize(h64); + } + + private: + constexpr __host__ __device__ uint64_t rotl(uint64_t h, int8_t r) const noexcept + { + return ((h << r) | (h >> (64 - r))); + } + + // avalanche helper + constexpr __host__ __device__ uint64_t finalize(uint64_t h) const noexcept + { + h ^= h >> 33; + h *= prime2; + h ^= h >> 29; + h *= prime3; + h ^= h >> 32; + return h; + } + + uint64_t seed_; +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 688b2f28f..14124b639 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh index 365958d64..21b78d675 100644 --- a/include/cuco/hash_functions.cuh +++ b/include/cuco/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,33 @@ #pragma once -#include +#include +#include namespace cuco { +/** + * @brief The 32-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. 
+ * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_32 = detail::MurmurHash3_fmix32; + +/** + * @brief The 64-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. + * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64; + /** * @brief A `murmurhash3_32` hash function to hash the given argument on host and device. * @@ -28,4 +51,20 @@ namespace cuco { template using murmurhash3_32 = detail::MurmurHash3_32; +/** + * @brief A `XXH32` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_32 = detail::XXHash_32; + +/** + * @brief A `XXH64` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_64 = detail::XXHash_64; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 335885ddf..ebc37e39b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -52,7 +52,8 @@ endfunction(ConfigureTest) ConfigureTest(UTILITY_TEST utility/extent_test.cu utility/storage_test.cu - utility/fast_int_test.cu) + utility/fast_int_test.cu + utility/hash_test.cu) ################################################################################################### # - static_set tests ------------------------------------------------------------------------------ diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu new file mode 100644 index 000000000..aa16af607 --- /dev/null +++ b/tests/utility/hash_test.cu @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t value) noexcept + { + for (int32_t i = 0; i < Words; ++i) { + data_[i] = value; + } + } + + private: + int32_t data_[Words]; +}; + +TEST_CASE("Test cuco::xxhash_64", "") +{ + // Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + int32_t k1 = 0; // key + uint64_t s1 = 0; // seed + cuco::xxhash_64 h1(s1); // hasher + CHECK(h1(k1) == 4246796580750024372ULL); + + int32_t k2 = 0; // key + uint64_t s2 = 42; // seed + cuco::xxhash_64 h2(s2); // hasher + CHECK(h2(k2) == 3614696996920510707ULL); + + int32_t k3 = 42; // key + uint64_t s3 = 0; // seed + cuco::xxhash_64 h3(s3); // hasher + CHECK(h3(k3) == 15516826743637085169ULL); + + int32_t k4 = 123456789; // key + uint64_t s4 = 0; // seed + cuco::xxhash_64 h4(s4); // hasher + CHECK(h4(k4) == 9462334144942111946ULL); + + int64_t k5 = 0; // key + uint64_t s5 = 0; // seed + cuco::xxhash_64 h5(s5); // hasher + CHECK(h5(k5) == 3803688792395291579ULL); + + int64_t k6 = 0; // key + uint64_t s6 = 42; // seed + cuco::xxhash_64 h6(s6); // hasher + CHECK(h6(k6) == 13194218611613725804ULL); + + int64_t k7 = 42; // key + uint64_t s7 = 0; // seed + cuco::xxhash_64 h7(s7); // hasher + CHECK(h7(k7) == 13066772586158965587ULL); + + int64_t k8 = 123456789; // key + uint64_t s8 = 0; // seed + cuco::xxhash_64 h8(s8); // hasher + CHECK(h8(k8) == 14662639848940634189ULL); + +#if 
defined(CUCO_HAS_INT128) + __int128 k9 = 123456789; // key + uint64_t s9 = 0; // seed + cuco::xxhash_64<__int128> h9(s9); // hasher + CHECK(h9(k9) == 7986913354431084250ULL); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + large_key<32> k10(123456789); // key + uint64_t s10 = 0; // seed + cuco::xxhash_64> h10(s10); // hasher + CHECK(h10(k10) == 2031761887105658523ULL); + } + + // TODO SECTION("Check if device-generated hash values match the reference implementation.") +} + +TEST_CASE("Test cuco::xxhash_32", "") +{ + // Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + int32_t k1 = 0; // key + uint32_t s1 = 0; // seed + cuco::xxhash_32 h1(s1); // hasher + CHECK(h1(k1) == 148298089); + + int32_t k2 = 0; // key + uint32_t s2 = 42; // seed + cuco::xxhash_32 h2(s2); // hasher + CHECK(h2(k2) == 2132181312); + + int32_t k3 = 42; // key + uint32_t s3 = 0; // seed + cuco::xxhash_32 h3(s3); // hasher + CHECK(h3(k3) == 1161967057); + + int32_t k4 = 123456789; // key + uint32_t s4 = 0; // seed + cuco::xxhash_32 h4(s4); // hasher + CHECK(h4(k4) == 2987034094); + + int64_t k5 = 0; // key + uint32_t s5 = 0; // seed + cuco::xxhash_32 h5(s5); // hasher + CHECK(h5(k5) == 3736311059); + + int64_t k6 = 0; // key + uint32_t s6 = 42; // seed + cuco::xxhash_32 h6(s6); // hasher + CHECK(h6(k6) == 1076387279); + + int64_t k7 = 42; // key + uint32_t s7 = 0; // seed + cuco::xxhash_32 h7(s7); // hasher + CHECK(h7(k7) == 2332451213); + + int64_t k8 = 123456789; // key + uint32_t s8 = 0; // seed + cuco::xxhash_32 h8(s8); // hasher + CHECK(h8(k8) == 1561711919); + +#if defined(CUCO_HAS_INT128) + __int128 k9 = 123456789; // key + uint32_t s9 = 0; // seed + cuco::xxhash_32<__int128> h9(s9); // hasher + CHECK(h9(k9) == 1846633701); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + large_key<32> k10(123456789); // key + 
uint64_t s10 = 0; // seed + cuco::xxhash_32> h10(s10); // hasher + CHECK(h10(k10) == 3715432378); + } +} \ No newline at end of file From 33f2c2b77cc67ec527d82c9b6723b9880198e29e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 5 Jun 2023 19:32:01 +0200 Subject: [PATCH 117/152] xxhash cleanups (#313) --- benchmarks/CMakeLists.txt | 2 + benchmarks/hash_bench.cu | 10 +- include/cuco/detail/hash_functions/xxhash.cuh | 99 +++++----- tests/utility/hash_test.cu | 185 +++++++++--------- 4 files changed, 146 insertions(+), 150 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index b094b84b1..3635336e8 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -79,5 +79,7 @@ ConfigureBench(DYNAMIC_MAP_BENCH hash_table/dynamic_map/contains_bench.cu hash_table/dynamic_map/erase_bench.cu) +################################################################################################### +# - hash function benchmarks ---------------------------------------------------------------------- ConfigureBench(HASH_BENCH hash_bench.cu) diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 2d8f13e09..58c6ee770 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -26,9 +25,6 @@ #include -using namespace cuco::benchmark; -using namespace cuco::utility; - template struct large_key { constexpr __host__ __device__ large_key(int32_t seed) noexcept @@ -73,8 +69,8 @@ void hash_eval(nvbench::state& state, nvbench::type_list) { bool const materialize_result = false; constexpr auto block_size = 128; - auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N * 10); - auto const grid_size = SDIV(num_keys, block_size * 16); + auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10); + auto const grid_size = SDIV(num_keys, block_size * 
16); thrust::device_vector hash_values((materialize_result) ? num_keys : 1); @@ -102,4 +98,4 @@ NVBENCH_BENCH_TYPES( cuco::murmurhash3_fmix_64>)) .set_name("hash_function_eval") .set_type_axes_names({"Hash"}) - .set_max_noise(defaults::MAX_NOISE); \ No newline at end of file + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index d98116997..10360be9b 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -60,22 +60,22 @@ namespace cuco::detail { template struct XXHash_32 { private: - static constexpr uint32_t prime1 = 0x9E3779B1U; - static constexpr uint32_t prime2 = 0x85EBCA77U; - static constexpr uint32_t prime3 = 0xC2B2AE3DU; - static constexpr uint32_t prime4 = 0x27D4EB2FU; - static constexpr uint32_t prime5 = 0x165667B1U; + static constexpr std::uint32_t prime1 = 0x9e3779b1u; + static constexpr std::uint32_t prime2 = 0x85ebca77u; + static constexpr std::uint32_t prime3 = 0xc2b2ae3du; + static constexpr std::uint32_t prime4 = 0x27d4eb2fu; + static constexpr std::uint32_t prime5 = 0x165667b1u; public: - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a XXH32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr XXHash_32(uint32_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr XXHash_32(std::uint32_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. 
@@ -86,20 +86,21 @@ struct XXHash_32 { constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); - char const* const bytes = (char const*)&key; ///< per-byte access - uint32_t const* const blocks = (uint32_t const*)&key; ///< 4-byte word access + constexpr auto nbytes = sizeof(Key); + [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access + [[maybe_unused]] auto const blocks = + reinterpret_cast(&key); ///< 4-byte word access - uint32_t offset = 0; - uint32_t h32; + std::size_t offset = 0; + std::uint32_t h32; // data can be processed in 16-byte chunks if constexpr (nbytes >= 16) { constexpr auto limit = nbytes - 16; - uint32_t v1 = seed_ + prime1 + prime2; - uint32_t v2 = seed_ + prime2; - uint32_t v3 = seed_; - uint32_t v4 = seed_ - prime1; + std::uint32_t v1 = seed_ + prime1 + prime2; + std::uint32_t v2 = seed_ + prime2; + std::uint32_t v3 = seed_; + std::uint32_t v4 = seed_ - prime1; do { // pipeline 4*4byte computations @@ -134,7 +135,7 @@ struct XXHash_32 { } } - // the following loop is only needed if the size of the key is no multiple of the block size + // the following loop is only needed if the size of the key is not a multiple of the block size if constexpr (nbytes % 4) { while (offset < nbytes) { h32 += (bytes[offset] & 255) * prime5; @@ -147,13 +148,13 @@ struct XXHash_32 { } private: - constexpr __host__ __device__ uint32_t rotl(uint32_t h, int8_t r) const noexcept + constexpr __host__ __device__ std::uint32_t rotl(std::uint32_t h, std::int8_t r) const noexcept { return ((h << r) | (h >> (32 - r))); } // avalanche helper - constexpr __host__ __device__ uint32_t finalize(uint32_t h) const noexcept + constexpr __host__ __device__ std::uint32_t finalize(std::uint32_t h) const noexcept { h ^= h >> 15; h *= prime2; @@ -163,7 +164,7 @@ struct XXHash_32 { return h; } - uint32_t seed_; + std::uint32_t seed_; }; 
/** @@ -210,22 +211,22 @@ struct XXHash_32 { template struct XXHash_64 { private: - static constexpr uint64_t prime1 = 11400714785074694791ULL; - static constexpr uint64_t prime2 = 14029467366897019727ULL; - static constexpr uint64_t prime3 = 1609587929392839161ULL; - static constexpr uint64_t prime4 = 9650029242287828579ULL; - static constexpr uint64_t prime5 = 2870177450012600261ULL; + static constexpr std::uint64_t prime1 = 11400714785074694791ull; + static constexpr std::uint64_t prime2 = 14029467366897019727ull; + static constexpr std::uint64_t prime3 = 1609587929392839161ull; + static constexpr std::uint64_t prime4 = 9650029242287828579ull; + static constexpr std::uint64_t prime5 = 2870177450012600261ull; public: - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint64_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced /** * @brief Constructs a XXH64 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr XXHash_64(uint64_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr XXHash_64(std::uint64_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -236,21 +237,23 @@ struct XXHash_64 { constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { // TODO do we need to add checks/hints for alignment? 
- constexpr auto nbytes = sizeof(Key); - char const* const bytes = (char const*)&key; ///< per-byte access - uint32_t const* const blocks4 = (uint32_t const*)&key; ///< 4-byte word access - uint64_t const* const blocks8 = (uint64_t const*)&key; ///< 8-byte word access + constexpr auto nbytes = sizeof(Key); + [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access + [[maybe_unused]] auto const blocks4 = + reinterpret_cast(&key); ///< 4-byte word access + [[maybe_unused]] auto const blocks8 = + reinterpret_cast(&key); ///< 8-byte word access - uint64_t offset = 0; - uint64_t h64; + std::size_t offset = 0; + std::uint64_t h64; // data can be processed in 32-byte chunks if constexpr (nbytes >= 32) { constexpr auto limit = nbytes - 32; - uint64_t v1 = seed_ + prime1 + prime2; - uint64_t v2 = seed_ + prime2; - uint64_t v3 = seed_; - uint64_t v4 = seed_ - prime1; + std::uint64_t v1 = seed_ + prime1 + prime2; + std::uint64_t v2 = seed_ + prime2; + std::uint64_t v3 = seed_; + std::uint64_t v4 = seed_ - prime1; do { // pipeline 4*8byte computations @@ -304,8 +307,8 @@ struct XXHash_64 { // remaining data can be processed in 8-byte chunks if constexpr ((nbytes % 32) >= 8) { for (; offset <= nbytes - 8; offset += 8) { - uint64_t k1 = blocks8[offset / 8] * prime2; - k1 = rotl(k1, 31) * prime1; + std::uint64_t k1 = blocks8[offset / 8] * prime2; + k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; } @@ -314,16 +317,16 @@ struct XXHash_64 { // remaining data can be processed in 4-byte chunks if constexpr (((nbytes % 32) % 8) >= 4) { for (; offset <= nbytes - 4; offset += 4) { - h64 ^= (blocks4[offset / 4] & 0xFFFFFFFFULL) * prime1; + h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } - // the following loop is only needed if the size of the key is no multiple of a previous block - // size + // the following loop is only needed if the size of the key is not a multiple of a previous + 
// block size if constexpr (nbytes % 4) { while (offset < nbytes) { - h64 += (bytes[offset] & 0xFF) * prime5; + h64 += (bytes[offset] & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } @@ -332,13 +335,13 @@ struct XXHash_64 { } private: - constexpr __host__ __device__ uint64_t rotl(uint64_t h, int8_t r) const noexcept + constexpr __host__ __device__ std::uint64_t rotl(std::uint64_t h, std::int8_t r) const noexcept { return ((h << r) | (h >> (64 - r))); } // avalanche helper - constexpr __host__ __device__ uint64_t finalize(uint64_t h) const noexcept + constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept { h ^= h >> 33; h *= prime2; @@ -348,7 +351,7 @@ struct XXHash_64 { return h; } - uint64_t seed_; + std::uint64_t seed_; }; } // namespace cuco::detail \ No newline at end of file diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index aa16af607..6dca70aea 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -19,6 +19,8 @@ #include #include +#include + #include template @@ -34,66 +36,86 @@ struct large_key { int32_t data_[Words]; }; +template +__host__ __device__ bool check_hash_result(typename Hash::argument_type const& key, + typename Hash::result_type seed, + typename Hash::result_type expected) noexcept +{ + Hash h(seed); + return (h(key) == expected); +} + +template +__global__ void check_hash_result_kernel_64(OutputIter result) +{ + result[0] = check_hash_result>(0, 0, 4246796580750024372); + result[1] = check_hash_result>(0, 42, 3614696996920510707); + result[2] = check_hash_result>(42, 0, 15516826743637085169); + result[3] = check_hash_result>(123456789, 0, 9462334144942111946); + + result[4] = check_hash_result>(0, 0, 3803688792395291579); + result[5] = check_hash_result>(0, 42, 13194218611613725804); + result[6] = check_hash_result>(42, 0, 13066772586158965587); + result[7] = check_hash_result>(123456789, 0, 14662639848940634189); + +#if defined(CUCO_HAS_INT128) + result[8] 
= check_hash_result>(123456789, 0, 7986913354431084250); +#endif + + result[9] = check_hash_result>>(123456789, 0, 2031761887105658523); +} + TEST_CASE("Test cuco::xxhash_64", "") { // Reference hash values were computed using https://github.com/Cyan4973/xxHash SECTION("Check if host-generated hash values match the reference implementation.") { - int32_t k1 = 0; // key - uint64_t s1 = 0; // seed - cuco::xxhash_64 h1(s1); // hasher - CHECK(h1(k1) == 4246796580750024372ULL); - - int32_t k2 = 0; // key - uint64_t s2 = 42; // seed - cuco::xxhash_64 h2(s2); // hasher - CHECK(h2(k2) == 3614696996920510707ULL); - - int32_t k3 = 42; // key - uint64_t s3 = 0; // seed - cuco::xxhash_64 h3(s3); // hasher - CHECK(h3(k3) == 15516826743637085169ULL); - - int32_t k4 = 123456789; // key - uint64_t s4 = 0; // seed - cuco::xxhash_64 h4(s4); // hasher - CHECK(h4(k4) == 9462334144942111946ULL); - - int64_t k5 = 0; // key - uint64_t s5 = 0; // seed - cuco::xxhash_64 h5(s5); // hasher - CHECK(h5(k5) == 3803688792395291579ULL); - - int64_t k6 = 0; // key - uint64_t s6 = 42; // seed - cuco::xxhash_64 h6(s6); // hasher - CHECK(h6(k6) == 13194218611613725804ULL); - - int64_t k7 = 42; // key - uint64_t s7 = 0; // seed - cuco::xxhash_64 h7(s7); // hasher - CHECK(h7(k7) == 13066772586158965587ULL); - - int64_t k8 = 123456789; // key - uint64_t s8 = 0; // seed - cuco::xxhash_64 h8(s8); // hasher - CHECK(h8(k8) == 14662639848940634189ULL); + CHECK(check_hash_result>(0, 0, 4246796580750024372)); + CHECK(check_hash_result>(0, 42, 3614696996920510707)); + CHECK(check_hash_result>(42, 0, 15516826743637085169)); + CHECK(check_hash_result>(123456789, 0, 9462334144942111946)); + + CHECK(check_hash_result>(0, 0, 3803688792395291579)); + CHECK(check_hash_result>(0, 42, 13194218611613725804)); + CHECK(check_hash_result>(42, 0, 13066772586158965587)); + CHECK(check_hash_result>(123456789, 0, 14662639848940634189)); #if defined(CUCO_HAS_INT128) - __int128 k9 = 123456789; // key - uint64_t s9 = 0; // seed - 
cuco::xxhash_64<__int128> h9(s9); // hasher - CHECK(h9(k9) == 7986913354431084250ULL); + CHECK(check_hash_result>(123456789, 0, 7986913354431084250)); #endif // 32*4=128-byte key to test the pipelined outermost hashing loop - large_key<32> k10(123456789); // key - uint64_t s10 = 0; // seed - cuco::xxhash_64> h10(s10); // hasher - CHECK(h10(k10) == 2031761887105658523ULL); + CHECK(check_hash_result>>(123456789, 0, 2031761887105658523)); } - // TODO SECTION("Check if device-generated hash values match the reference implementation.") + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(10); + + check_hash_result_kernel_64<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } +} + +template +__global__ void check_hash_result_kernel_32(OutputIter result) +{ + result[0] = check_hash_result>(0, 0, 148298089); + result[1] = check_hash_result>(0, 42, 2132181312); + result[2] = check_hash_result>(42, 0, 1161967057); + result[3] = check_hash_result>(123456789, 0, 2987034094); + + result[4] = check_hash_result>(0, 0, 3736311059); + result[5] = check_hash_result>(0, 42, 1076387279); + result[6] = check_hash_result>(42, 0, 2332451213); + result[7] = check_hash_result>(123456789, 0, 1561711919); + +#if defined(CUCO_HAS_INT128) + result[8] = check_hash_result>(123456789, 0, 1846633701); +#endif + + result[9] = check_hash_result>>(123456789, 0, 3715432378); } TEST_CASE("Test cuco::xxhash_32", "") @@ -101,57 +123,30 @@ TEST_CASE("Test cuco::xxhash_32", "") // Reference hash values were computed using https://github.com/Cyan4973/xxHash SECTION("Check if host-generated hash values match the reference implementation.") { - int32_t k1 = 0; // key - uint32_t s1 = 0; // seed - cuco::xxhash_32 h1(s1); // hasher - CHECK(h1(k1) == 148298089); - - int32_t k2 = 0; // key - uint32_t s2 = 42; // seed - cuco::xxhash_32 h2(s2); // hasher - CHECK(h2(k2) == 
2132181312); - - int32_t k3 = 42; // key - uint32_t s3 = 0; // seed - cuco::xxhash_32 h3(s3); // hasher - CHECK(h3(k3) == 1161967057); - - int32_t k4 = 123456789; // key - uint32_t s4 = 0; // seed - cuco::xxhash_32 h4(s4); // hasher - CHECK(h4(k4) == 2987034094); - - int64_t k5 = 0; // key - uint32_t s5 = 0; // seed - cuco::xxhash_32 h5(s5); // hasher - CHECK(h5(k5) == 3736311059); - - int64_t k6 = 0; // key - uint32_t s6 = 42; // seed - cuco::xxhash_32 h6(s6); // hasher - CHECK(h6(k6) == 1076387279); - - int64_t k7 = 42; // key - uint32_t s7 = 0; // seed - cuco::xxhash_32 h7(s7); // hasher - CHECK(h7(k7) == 2332451213); - - int64_t k8 = 123456789; // key - uint32_t s8 = 0; // seed - cuco::xxhash_32 h8(s8); // hasher - CHECK(h8(k8) == 1561711919); + CHECK(check_hash_result>(0, 0, 148298089)); + CHECK(check_hash_result>(0, 42, 2132181312)); + CHECK(check_hash_result>(42, 0, 1161967057)); + CHECK(check_hash_result>(123456789, 0, 2987034094)); + + CHECK(check_hash_result>(0, 0, 3736311059)); + CHECK(check_hash_result>(0, 42, 1076387279)); + CHECK(check_hash_result>(42, 0, 2332451213)); + CHECK(check_hash_result>(123456789, 0, 1561711919)); #if defined(CUCO_HAS_INT128) - __int128 k9 = 123456789; // key - uint32_t s9 = 0; // seed - cuco::xxhash_32<__int128> h9(s9); // hasher - CHECK(h9(k9) == 1846633701); + CHECK(check_hash_result>(123456789, 0, 1846633701)); #endif // 32*4=128-byte key to test the pipelined outermost hashing loop - large_key<32> k10(123456789); // key - uint64_t s10 = 0; // seed - cuco::xxhash_32> h10(s10); // hasher - CHECK(h10(k10) == 3715432378); + CHECK(check_hash_result>>(123456789, 0, 3715432378)); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(10); + + check_hash_result_kernel_32<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); } } \ No newline at end of file From 
1374552398ff830bf87c8bdeea22bff3513c9024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 13 Jun 2023 14:05:25 +0200 Subject: [PATCH 118/152] Fix problem with implicit conversion of lhs operands in fast_int operators (#316) The optimized operator/ and operator% in `fast_int` require the left-hand side operands to be of the same type as the underlying `value_type` of the `fast_int`. In order to meet this constraint, the compiler converts the lhs to the `value_type` implicitly, which results in unexpected behavior when mixing signed and unsigned input types (see reproducer https://godbolt.org/z/j8z4a9rGK). This PR enforces the lhs operand to be of `value_type` and throws a compile-time error otherwise. --- include/cuco/utility/fast_int.cuh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/cuco/utility/fast_int.cuh b/include/cuco/utility/fast_int.cuh index f9852ff17..5cd2998f6 100644 --- a/include/cuco/utility/fast_int.cuh +++ b/include/cuco/utility/fast_int.cuh @@ -132,17 +132,19 @@ struct fast_int { value_type magic_; ///< Magic number for fast division value_type shift_; ///< Shift for fast division - friend __host__ __device__ constexpr value_type operator/(value_type lhs, - fast_int const& rhs) noexcept + template + friend __host__ __device__ constexpr value_type operator/(Lhs lhs, fast_int const& rhs) noexcept { + static_assert(cuda::std::is_same_v, + "Left-hand side operand must be of type value_type."); if (rhs.value_ == 1) { return lhs; } // edge case for value_ == 1 if (rhs.magic_ == 0) { return lhs >> rhs.shift_; } // edge case for value_ == pow2 auto const mul = (lhs == cuda::std::numeric_limits::max()) ? 
lhs : lhs + 1; return rhs.mulhi(rhs.magic_, mul) >> rhs.shift_; } - friend __host__ __device__ constexpr value_type operator%(value_type lhs, - fast_int const& rhs) noexcept + template + friend __host__ __device__ constexpr value_type operator%(Lhs lhs, fast_int const& rhs) noexcept { return lhs - (lhs / rhs) * rhs.value_; } From c0cce866368396ced18a37a0d30287729d6af6c4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Jun 2023 18:02:53 -0700 Subject: [PATCH 119/152] Add experimental `static_map` (#314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributes to #110 This PR adds `experimental::static_map` and involves several changes to the existing code: - Extracts common `open_addressing_impl` and `open_addressing_ref_impl` classes to minimize duplicates between map and set implementations - Updates the existing code and fixes bugs: invalid type conversion in `attemp_insert`, narrow conversions inside probing scheme, doc improvement, etc. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- examples/static_set/device_ref_example.cu | 5 +- include/cuco/detail/common_functors.cuh | 54 ++ include/cuco/detail/common_kernels.cuh | 265 ++++++++ include/cuco/detail/equal_wrapper.cuh | 30 +- include/cuco/detail/open_addressing_impl.cuh | 534 +++++++++++++++ .../cuco/detail/open_addressing_ref_impl.cuh | 615 ++++++++++++++++++ include/cuco/detail/pair.cuh | 20 + include/cuco/detail/static_map/functors.cuh | 57 ++ include/cuco/detail/static_map/kernels.cuh | 93 +++ include/cuco/detail/static_map/static_map.inl | 331 ++++++++++ .../cuco/detail/static_map/static_map_ref.inl | 455 +++++++++++++ include/cuco/detail/static_set/functors.cuh | 35 +- include/cuco/detail/static_set/kernels.cuh | 248 +------ include/cuco/detail/static_set/static_set.inl | 161 +---- .../cuco/detail/static_set/static_set_ref.inl | 375 ++--------- include/cuco/detail/storage/aow_storage.cuh | 13 +- include/cuco/detail/storage/storage.cuh | 8 +- include/cuco/static_map.cuh | 443 +++++++++++++ include/cuco/static_map_ref.cuh | 148 +++++ include/cuco/static_set.cuh | 65 +- include/cuco/static_set_ref.cuh | 37 +- tests/static_map/unique_sequence_test.cu | 126 ++++ 22 files changed, 3290 insertions(+), 828 deletions(-) create mode 100644 include/cuco/detail/common_functors.cuh create mode 100644 include/cuco/detail/common_kernels.cuh create mode 100644 include/cuco/detail/open_addressing_impl.cuh create mode 100644 include/cuco/detail/open_addressing_ref_impl.cuh create mode 100644 include/cuco/detail/static_map/functors.cuh create mode 100644 include/cuco/detail/static_map/kernels.cuh create mode 100644 include/cuco/detail/static_map/static_map.inl create mode 100644 include/cuco/detail/static_map/static_map_ref.inl create mode 100644 include/cuco/static_map_ref.cuh diff --git 
a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu index 0179baa83..136292f6b 100644 --- a/examples/static_set/device_ref_example.cu +++ b/examples/static_set/device_ref_example.cu @@ -51,8 +51,11 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O int64_t const loop_stride = gridDim.x * blockDim.x; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + while (idx < n) { - found[idx] = set.contains(*(keys + idx)); + found[idx] = set.contains(tile, *(keys + idx)); idx += loop_stride; } } diff --git a/include/cuco/detail/common_functors.cuh b/include/cuco/detail/common_functors.cuh new file mode 100644 index 000000000..12fe14e0a --- /dev/null +++ b/include/cuco/detail/common_functors.cuh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Device functor returning the content of the slot indexed by `idx`. + * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. 
+ * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. + * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + return storage_[window_idx][intra_idx]; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh new file mode 100644 index 000000000..896ec753b --- /dev/null +++ b/include/cuco/detail/common_kernels.cuh @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions if `pred` of the corresponding stencil returns true. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param num_successes Number of successful inserted elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n(InputIterator first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + if (ref.insert(insert_pair)) { thread_num_successes++; }; + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + } + } + idx += loop_stride; + } + + // 
compute number of successfully inserted elements for each block + // and atomically add to the grand total + auto const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` if `pred` of the corresponding + * stencil returns true. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n( + InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) +{ + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const 
insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert(insert_pair); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert(tile, insert_pair); + } + } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + 
cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if constexpr (CGSize == 1) { + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false; + } + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + auto const key = *(first + idx); + auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; + if (tile.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + idx += loop_stride; + } +} + +/** + * @brief Calculates the number of filled slots for the given window storage. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam StorageRef Type of non-owning ref allowing access to storage + * @tparam Predicate Type of predicate indicating if the given slot is filled + * @tparam AtomicT Atomic counter type + * + * @param storage Non-owning device ref used to access the slot storage + * @param is_filled Predicate indicating if the given slot is filled + * @param count Number of filled slots + */ +template +__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) +{ + using size_type = typename StorageRef::size_type; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + size_type thread_count = 0; + auto const n = storage.num_windows(); + + while (idx < n) { + auto const window = storage[idx]; +#pragma unroll + for (auto const& it : window) { + thread_count += static_cast(is_filled(it)); + } + idx += loop_stride; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto const block_count = BlockReduce(temp_storage).Sum(thread_count); + if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh index 1774e0bf3..d2ded4a33 100644 --- a/include/cuco/detail/equal_wrapper.cuh +++ b/include/cuco/detail/equal_wrapper.cuh @@ -29,7 +29,7 @@ namespace detail { enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; /** - * @brief Equality wrapper. + * @brief Key equality wrapper. * * User-provided equality binary callable cannot be used to compare against sentinel value. 
* @@ -38,8 +38,8 @@ enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; */ template struct equal_wrapper { - T sentinel_; ///< Sentinel value - Equal equal_; ///< Custom equality callable + T empty_sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable /** * @brief Equality wrapper ctor. @@ -47,23 +47,23 @@ struct equal_wrapper { * @param sentinel Sentinel value * @param equal Equality binary callable */ - __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) - : sentinel_{sentinel}, equal_{equal} + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) noexcept + : empty_sentinel_{sentinel}, equal_{equal} { } /** * @brief Equality check with the given equality callable. * - * @tparam LHS Left-hand side Element type - * @tparam RHS Right-hand side Element type + * @tparam U Right-hand side Element type * * @param lhs Left-hand side element to check equality * @param rhs Right-hand side element to check equality - * @return Three way equality comparison result + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. */ - template - __device__ constexpr equal_result equal_to(LHS const& lhs, RHS const& rhs) const noexcept + template + __device__ constexpr equal_result equal_to(T const& lhs, U const& rhs) const noexcept { return equal_(lhs, rhs) ? equal_result::EQUAL : equal_result::UNEQUAL; } @@ -71,22 +71,22 @@ struct equal_wrapper { /** * @brief Order-sensitive equality operator. * - * This function always compares the left-hand side element against `sentinel_` value first - * then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. - * + * @note This function always compares the left-hand side element against `empty_sentinel_` value + * first then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. * @note Container (like set or map) keys MUST be always on the left-hand side. 
* * @tparam U Right-hand side Element type * * @param lhs Left-hand side element to check equality * @param rhs Right-hand side element to check equality + * * @return Three way equality comparison result */ template __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept { - return cuco::detail::bitwise_compare(lhs, sentinel_) ? equal_result::EMPTY - : this->equal_to(lhs, rhs); + return cuco::detail::bitwise_compare(lhs, empty_sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); } }; diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh new file mode 100644 index 000000000..6c7fa7965 --- /dev/null +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief An open addressing impl class. + * + * @note This class should NOT be used directly. 
+ * + * @throw If the size of the given slot type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Value Type used for storage values. + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ +template +class open_addressing_impl { + static_assert(sizeof(Value) <= 8, "Container does not support slot types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used for probing + static constexpr auto window_size = Storage::window_size; ///< Window size used for probing + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + + using key_type = Key; ///< Key type + using value_type = Value; ///< The storage value type, NOT payload type + /// Extent type + using extent_type = decltype(make_valid_extent(std::declval())); + using size_type = typename extent_type::value_type; ///< Size type + using key_equal = KeyEqual; ///< Key equality comparator type + using storage_type = + detail::storage; ///< Storage type + 
using allocator_type = typename storage_type::allocator_type; ///< Allocator type + + using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type + using probing_scheme_type = ProbingScheme; ///< Probe scheme type + + /** + * @brief Constructs a statically-sized open addressing data structure with the specified initial + * capacity, sentinel values and CUDA stream. + * + * @note The actual capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * automatically grow the container. Attempting to insert more unique keys than the capacity of + * the container results in undefined behavior. + * @note The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert + * this sentinel value. + * + * @param capacity The requested lower-bound size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_slot_sentinel The reserved slot value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the data structure + */ + constexpr open_addressing_impl(Extent capacity, + key_type empty_key_sentinel, + value_type empty_slot_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) noexcept + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{pred}, + probing_scheme_{probing_scheme}, + storage_{make_valid_extent(capacity), alloc} + { + storage_.initialize(empty_slot_sentinel, stream); + } + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * open_addressing_impl::value_type> is `true` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = + detail::counter_storage{this->allocator()}; + counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + auto const always_true = thrust::constant_iterator{true}; + detail::insert_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref); + + return counter.load_to_host(stream); + } + + /** + * @brief Asynchonously inserts all keys in the range `[first, last)`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * open_addressing_impl::value_type> is `true` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + auto const always_true = thrust::constant_iterator{true}; + detail::insert_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, container_ref); + } + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for the operation + * + * @return Number of successfully inserted keys + */ + template + size_type insert_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Ref container_ref, + cuda_stream_ref stream) + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = + detail::counter_storage{this->allocator()}; + counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::insert_if_n + <<>>( + first, num_keys, stencil, pred, counter.data(), container_ref); + + return counter.load_to_host(stream); + } + + /** + * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + Ref container_ref, + cuda_stream_ref stream) noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::insert_if_n + <<>>( + first, num_keys, stencil, pred, container_ref); + } + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the container. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + Ref container_ref, + cuda_stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + auto const always_true = thrust::constant_iterator{true}; + detail::contains_if_n + <<>>( + first, num_keys, always_true, thrust::identity{}, output_begin, container_ref); + } + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the container if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present int the container. If `pred( *(stencil + i) )` + * is false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + Ref container_ref, + cuda_stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::contains_if_n + <<>>( + first, num_keys, stencil, pred, output_begin, container_ref); + } + + /** + * @brief Retrieves all keys contained in the container. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. 
+ * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return + * value of `size()`. + * + * @tparam OutputIt Device accessible random access output iterator whose `value_type` is + * convertible from the container's `key_type` + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param output_begin Beginning output iterator for keys + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used for this operation + * + * @return Iterator indicating the end of the output + */ + template + [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, + Predicate const& is_filled, + cuda_stream_ref stream) const + { + auto begin = + thrust::make_transform_iterator(thrust::counting_iterator(0), + detail::get_slot(this->storage_ref())); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator()}; + auto d_num_out = reinterpret_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); + CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr, + temp_storage_bytes, + begin, + output_begin, + d_num_out, + this->capacity(), + is_filled, + stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + begin, + output_begin, + d_num_out, + this->capacity(), + is_filled, + stream)); + + size_type h_num_out; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + return output_begin + h_num_out; + } + + /** + * @brief Gets the number of elements in the container. 
+ * + * @note This function synchronizes the given stream. + * + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used to get the number of inserted elements + * + * @return The number of elements in the container + */ + template + [[nodiscard]] size_type size(Predicate const& is_filled, cuda_stream_ref stream) const noexcept + { + auto counter = + detail::counter_storage{this->allocator()}; + counter.reset(stream); + + auto const grid_size = + (storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to + // v2.1.0 + detail::size + <<>>( + storage_.ref(), is_filled, counter.data()); + + return counter.load_to_host(stream); + } + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept { return storage_.capacity(); } + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept + { + return empty_key_sentinel_; + } + + /** + * @brief Gets the key comparator. + * + * @return The comparator used to compare keys + */ + [[nodiscard]] constexpr key_equal key_eq() const noexcept { return predicate_; } + + /** + * @brief Gets the probing scheme. + * + * @return The probing scheme used for the container + */ + [[nodiscard]] constexpr probing_scheme_type const& probing_scheme() const noexcept + { + return probing_scheme_; + } + + /** + * @brief Gets the container allocator. 
+ * + * @return The container allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept { return storage_.allocator(); } + + /** + * @brief Gets the non-owning storage ref. + * + * @return The non-owning storage ref of the container + */ + [[nodiscard]] constexpr storage_ref_type storage_ref() const noexcept { return storage_.ref(); } + + protected: + key_type empty_key_sentinel_; ///< Key value that represents an empty slot + key_equal predicate_; ///< Key equality binary predicate + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_type storage_; ///< Slot window storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh new file mode 100644 index 000000000..dd3a84434 --- /dev/null +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Common device non-owning "ref" implementation class. + * + * @note This class should NOT be used directly. 
+ * + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + */ +template +class open_addressing_ref_impl { + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + using key_type = Key; ///< Key type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs open_addressing_ref_impl. 
+ * + * @param empty_slot_sentinel Sentinel indicating an empty slot + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr open_addressing_ref_impl( + value_type empty_slot_sentinel, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept + : empty_slot_sentinel_{empty_slot_sentinel}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} + { + } + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_ref_.capacity(); + } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + return storage_ref_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept { return storage_ref_.end(); } + + /** + * @brief Inserts an element. 
+ * + * @tparam Predicate Predicate type + * + * @param key Key of the element to insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(key_type const& key, + value_type const& value, + Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = predicate(slot_content, key); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, value, predicate)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert + * @param key Key of the element to insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + key_type const& key, + value_type const& value, + Predicate const& predicate) noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return window_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert((storage_ref_.data() + *probing_iter)->data() + intra_window_index, + value, + predicate) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Inserts the given element into the container. 
+ * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam Predicate Predicate type + * + * @param key Key of the element to insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ thrust::pair insert_and_find(key_type const& key, + value_type const& value, + Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + auto const eq_res = predicate(window_slots[i], key); + auto* window_ptr = (storage_ref_.data() + *probing_iter)->data(); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } + if (eq_res == detail::equal_result::EMPTY) { + switch (attempt_insert(window_ptr + i, value, predicate)) { + case insert_result::SUCCESS: { + return {iterator{&window_ptr[i]}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{&window_ptr[i]}, false}; + } + default: continue; + } + } + } + ++probing_iter; + }; + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. 
+ * + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param key Key of the element to insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, + key_type const& key, + value_type const& value, + Predicate const& predicate) noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return window_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto* slot_ptr = (storage_ref_.data() + *probing_iter)->data() + intra_window_index; + + // If the key is already in the container, return false + auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_equal) { + auto const src_lane = __ffs(group_finds_equal) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + return {iterator{reinterpret_cast(res)}, false}; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + auto const status = (group.thread_rank() == src_lane) + ? 
attempt_insert(slot_ptr, value, predicate) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: { + return {iterator{reinterpret_cast(res)}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{reinterpret_cast(res)}, false}; + } + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (predicate(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (predicate(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (group.any(state == detail::equal_result::EQUAL)) { return true; } + if (group.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: { + return this->end(); + } + case detail::equal_result::EQUAL: { + return const_iterator{&(*(storage_ref_.data() + *probing_iter))[i]}; + } + default: continue; + } + } + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return window_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // Find a match for the probe key, thus return an iterator to the entry + auto const group_finds_match = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_match) { + auto const src_lane = __ffs(group_finds_match) - 1; + auto const res = group.shfl( + reinterpret_cast(&(*(storage_ref_.data() + *probing_iter))[intra_window_index]), + src_lane); + return const_iterator{reinterpret_cast(res)}; + } + + // Find an empty slot, meaning that the probe key isn't present in the container + if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + + ++probing_iter; + } + } + + private: + /// Three-way insert result enum + enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + + /** + * @brief Helper struct to store intermediate window probing results. 
+ */ + struct window_results { + detail::equal_result state_; ///< Equal result + int32_t intra_window_index_; ///< Intra-window index + + /** + * @brief Constructs window_results. + * + * @param state The three way equality result + *@param Intra-window index + */ + __device__ explicit constexpr window_results(detail::equal_result state, int32_t index) noexcept + : state_{state}, intra_window_index_{index} + { + } + }; + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, + value_type const& value, + Predicate const& predicate) + { + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + auto old = [&]() { + value_type expected = this->empty_slot_sentinel_; + value_type val = value; + if constexpr (sizeof(value_type) == sizeof(unsigned int)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + if constexpr (sizeof(value_type) == sizeof(unsigned long long int)) { + auto* expected_ptr = 
reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + }(); + auto* old_ptr = reinterpret_cast(&old); + if (*slot == *old_ptr) { + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + return predicate.equal_to(*old_ptr, value) == detail::equal_result::EQUAL + ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } else { + return insert_result::SUCCESS; + } + } + + value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_ref_type storage_ref_; ///< Slot storage ref +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh index 7ea39889c..94ad5090f 100644 --- a/include/cuco/detail/pair.cuh +++ b/include/cuco/detail/pair.cuh @@ -272,4 +272,24 @@ __host__ __device__ pair_type make_pair(F&& f, S&& s) noexcept return pair_type{std::forward(f), std::forward(s)}; } +/** + * @brief Tests if both elements of lhs and rhs are equal + * + * @tparam T1 Type of the first element of the left-hand side pair + * @tparam T2 Type of the second element of the left-hand side pair + * @tparam U1 Type of the first element of the right-hand side pair + * @tparam U2 Type of the second element of the right-hand side pair + * + * @param lhs Left-hand side pair + * @param rhs Right-hand side pair + * + * @return True if two pairs are equal. 
False otherwise + */ +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept +{ + return lhs.first == rhs.first and lhs.second == rhs.second; +} + } // namespace cuco diff --git a/include/cuco/detail/static_map/functors.cuh b/include/cuco/detail/static_map/functors.cuh new file mode 100644 index 000000000..c807ed5f1 --- /dev/null +++ b/include/cuco/detail/static_map/functors.cuh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot content type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. 
+ * + * @tparam U Slot content type + * + * @param slot The slot + * @return `true` if slot is filled + */ + template + __device__ constexpr bool operator()(U const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot.first); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh new file mode 100644 index 000000000..c27c2eac8 --- /dev/null +++ b/include/cuco/detail/static_map/kernels.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Finds the equivalent map elements of all keys in the range `[first, last)`. + * + * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched + * element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative + * Groups API to leverage groups of multiple threads to find each key. This provides a significant + * boost in throughput compared to the non Cooperative Group `find` at moderate to high load + * factors. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched payloads retrieved for each key + * @param ref Non-owning map device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = + found == ref.end() ? ref.empty_value_sentinel() : (*found).second; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? 
ref.empty_value_sentinel() : (*found).second; + } + } + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl new file mode 100644 index 000000000..e4f414313 --- /dev/null +++ b/include/cuco/detail/static_map/static_map.inl @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_map:: + static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique(capacity, + empty_key_sentinel, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +template +static_map::size_type +static_map::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_map::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +static_map::size_type +static_map::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map:: + insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + contains_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::contains_if( + InputIt first, + InputIt last, + 
StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map:: + contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept +{ + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + +template +template +OutputIt +static_map::retrieve_all( + OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->retrieve_all(output_begin, is_filled, stream); +} + +template +static_map::size_type +static_map::size( + cuda_stream_ref stream) const noexcept +{ + auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); +} + +template +constexpr auto +static_map::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_map::key_type +static_map::empty_key_sentinel() + const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +constexpr static_map:: + mapped_type + static_map:: + 
empty_value_sentinel() const noexcept +{ + return this->empty_value_sentinel_; +} + +template +template +auto static_map::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::empty_value(this->empty_value_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl new file mode 100644 index 000000000..fce7c941b --- /dev/null +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_map_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_map_ref(cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : impl_{cuco::pair{empty_key_sentinel, empty_value_sentinel}, probing_scheme, storage_ref}, + empty_value_sentinel_{empty_value_sentinel}, + predicate_{empty_key_sentinel, predicate} +{ +} + +template +__host__ __device__ constexpr auto +static_map_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_map_ref:: + empty_key_sentinel() const noexcept +{ + return predicate_.empty_sentinel_; +} + +template +__host__ __device__ constexpr T +static_map_ref:: + empty_value_sentinel() const noexcept +{ + return empty_value_sentinel_; +} + +template +struct static_map_ref:: + predicate_wrapper { + detail::equal_wrapper predicate_; + + /** + * @brief Map predicate wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr predicate_wrapper(key_type empty_key_sentinel, + key_equal const& equal) noexcept + : predicate_{empty_key_sentinel, equal} + { + } + + /** + * @brief Equality check with the given equality callable. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + template + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs); + } + + /** + * @brief Equality check with the given equality callable. 
+ * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + value_type const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs.first); + } + + /** + * @brief Order-sensitive equality operator. + * + * @note Container keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return Three way equality comparison result + */ + template + __device__ constexpr detail::equal_result operator()(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_(lhs.first, rhs); + } +}; + +namespace detail { + +template +class operator_impl< + op::insert_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert(value.first, value, ref_.predicate_); + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + return ref_.impl_.insert(group, value.first, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::insert_and_find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. 
+ * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert_and_find(value.first, value, ref_.predicate_); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert_and_find(group, value.first, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::contains_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key, ref_.predicate_); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key, ref_.predicate_); + } +}; + +template +class operator_impl< + op::find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. 
+ * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(key, ref_.predicate_); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(group, key, ref_.predicate_); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/functors.cuh b/include/cuco/detail/static_set/functors.cuh index 52375f225..ce3183b8d 100644 --- a/include/cuco/detail/static_set/functors.cuh +++ b/include/cuco/detail/static_set/functors.cuh @@ -21,36 +21,6 @@ namespace cuco { namespace experimental { namespace detail { -/** - * @brief Device functor returning the content of the slot indexed by `idx`. - * - * @tparam StorageRef Storage ref type - */ -template -struct get_slot { - StorageRef storage_; ///< Storage ref - - /** - * @brief Constructs `get_slot` functor with the given storage ref. - * - * @param s Input storage ref - */ - get_slot(StorageRef s) : storage_{s} {} - - /** - * @brief Accesses the slot content with the given index. - * - * @param idx The slot index - * @return The slot content - */ - __device__ typename StorageRef::value_type operator()(typename StorageRef::size_type idx) const - { - auto const window_idx = idx / StorageRef::window_size; - auto const intra_idx = idx % StorageRef::window_size; - return storage_[window_idx][intra_idx]; - } -}; - /** * @brief Device functor returning whether the input slot indexed by `idx` is filled. 
* @@ -65,7 +35,7 @@ struct slot_is_filled { * * @param s Sentinel indicating empty slot */ - slot_is_filled(T s) : empty_sentinel_{s} {} + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} /** * @brief Indicates if the target slot `slot` is filled. @@ -73,9 +43,10 @@ struct slot_is_filled { * @tparam T Slot content type * * @param slot The slot + * * @return `true` if slot is filled */ - __device__ bool operator()(T slot) const + __device__ constexpr bool operator()(T const& slot) const noexcept { return not cuco::detail::bitwise_compare(empty_sentinel_, slot); } diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index 3cca8d2b4..4023dc16e 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -27,216 +28,20 @@ namespace cuco { namespace experimental { namespace detail { -/** - * @brief Inserts all elements in the range `[first, first + n)` and returns the number of - * successful insertions if `pred` of the corresponding stencil returns true. - * - * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element - * is inserted. - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
- * - * @tparam CGSize Number of threads in each CG - * @tparam BlockSize Number of threads in each block - * @tparam InputIterator Device accessible input iterator whose `value_type` is - * convertible to the `value_type` of the data structure - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` - * and argument type is convertible from `std::iterator_traits::value_type` - * @tparam AtomicT Atomic counter type - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of input elements - * @param n Number of input elements - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` - * @param num_successes Number of successful inserted elements - * @param ref Non-owning set device ref used to access the slot storage - */ -template -__global__ void insert_if_n(InputIterator first, - cuco::detail::index_type n, - StencilIt stencil, - Predicate pred, - AtomicT* num_successes, - Ref ref) -{ - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - typename Ref::size_type thread_num_successes = 0; - - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename Ref::value_type const insert_pair{*(first + idx)}; - if constexpr (CGSize == 1) { - if (ref.insert(insert_pair)) { thread_num_successes++; }; - } else { - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; - } - } - idx += loop_stride; - } - - // compute number of 
successfully inserted elements for each block - // and atomically add to the grand total - typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { - num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Inserts all elements in the range `[first, first + n)` if `pred` of the corresponding - * stencil returns true. - * - * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element - * is inserted. - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * @tparam CGSize Number of threads in each CG - * @tparam BlockSize Number of threads in each block - * @tparam InputIterator Device accessible input iterator whose `value_type` is - * convertible to the `value_type` of the data structure - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` - * and argument type is convertible from `std::iterator_traits::value_type` - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of input elements - * @param n Number of input elements - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` - * @param ref Non-owning set device ref used to access the slot storage - */ -template -__global__ void insert_if_n( - InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) -{ - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename Ref::value_type const insert_pair{*(first + 
idx)}; - if constexpr (CGSize == 1) { - ref.insert(insert_pair); - } else { - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - ref.insert(tile, insert_pair); - } - } - idx += loop_stride; - } -} - -/** - * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data - * structure if `pred` of the corresponding stencil returns true. - * - * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` - * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is false, - * stores false to `(output_begin + i)`. - * - * @tparam CGSize Number of threads in each CG - * @tparam BlockSize The size of the thread block - * @tparam InputIt Device accessible input iterator - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` - * and argument type is convertible from `std::iterator_traits::value_type` - * @tparam OutputIt Device accessible output iterator assignable from `bool` - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of keys - * @param n Number of keys - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` - * @param output_begin Beginning of the sequence of booleans for the presence of each key - * @param ref Non-owning set device ref used to access the slot storage - */ -template -__global__ void contains_if_n(InputIt first, - cuco::detail::index_type n, - StencilIt stencil, - Predicate pred, - OutputIt output_begin, - Ref ref) -{ - namespace cg = cooperative_groups; - - auto const block = cg::this_thread_block(); - auto const thread_idx = block.thread_rank(); - - cuco::detail::index_type const 
loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; - - __shared__ bool output_buffer[BlockSize / CGSize]; - - while (idx - thread_idx < n) { // the whole thread block falls into the same iteration - if constexpr (CGSize == 1) { - if (idx < n) { - auto const key = *(first + idx); - /* - * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased - * sector stores from L2 to global memory. By writing results to shared memory and then - * synchronizing before writing back to global, we no longer rely on L1, preventing the - * increase in sector stores from L2 to global and improving performance. - */ - output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false; - } - block.sync(); - if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } - } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); - if (idx < n) { - auto const key = *(first + idx); - auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; - if (tile.thread_rank() == 0) { *(output_begin + idx) = found; } - } - } - idx += loop_stride; - } -} - /** * @brief Finds the equivalent set elements of all keys in the range `[first, last)`. * * If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin + - * i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative Groups API to leverage - * groups of multiple threads to find each key. This provides a significant boost in throughput - * compared to the non Cooperative Group `find` at moderate to high load factors. + * i)`. Else, copies the empty key sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared to + * the non Cooperative Group `find` at moderate to high load factors. 
* * @tparam CGSize Number of threads in each CG * @tparam BlockSize The size of the thread block * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` * @tparam Ref Type of non-owning device ref allowing access to storage * - * * @param first Beginning of the sequence of keys * @param n Number of keys to query * @param output_begin Beginning of the sequence of matched elements retrieved for each key @@ -252,7 +57,7 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; - __shared__ typename Ref::value_type output_buffer[BlockSize / CGSize]; + __shared__ typename Ref::key_type output_buffer[BlockSize / CGSize]; while (idx - thread_idx < n) { // the whole thread block falls into the same iteration if (idx < n) { @@ -281,45 +86,6 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ } } -/** - * @brief Calculates the number of filled slots for the given window storage. 
- * - * @tparam BlockSize Number of threads in each block - * @tparam StorageRef Type of non-owning ref allowing access to storage - * @tparam AtomicT Atomic counter type - * - * @param storage Non-owning device ref used to access the slot storage - * @param empty_sentinel Sentinel indicating empty slots - * @param count Number of filled slots - */ -template -__global__ void size(StorageRef storage, - typename StorageRef::value_type empty_sentinel, - AtomicT* count) -{ - using size_type = typename StorageRef::size_type; - - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; - cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; - - size_type thread_count = 0; - auto const n = storage.num_windows(); - - while (idx < n) { - auto const window = storage[idx]; -#pragma unroll - for (auto const& it : window) { - thread_count += static_cast(not cuco::detail::bitwise_compare(it, empty_sentinel)); - } - idx += loop_stride; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - size_type const block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } -} - } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index fdd4bfaf4..769a1131f 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -15,24 +15,13 @@ */ #include -#include -#include #include #include -#include #include #include #include #include -#include -#include -#include -#include - -#include -#include - #include namespace cuco { @@ -48,17 +37,13 @@ template ::static_set( Extent capacity, empty_key empty_key_sentinel, - KeyEqual pred, + KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, cuda_stream_ref stream) - : 
empty_key_sentinel_{empty_key_sentinel}, - predicate_{pred}, - probing_scheme_{probing_scheme}, - allocator_{alloc}, - storage_{make_valid_extent(capacity), allocator_} + : impl_{std::make_unique( + capacity, empty_key_sentinel, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { - storage_.initialize(empty_key_sentinel_, stream); } template ::siz static_set::insert( InputIt first, InputIt last, cuda_stream_ref stream) { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } - - auto counter = detail::counter_storage{allocator_}; - counter.reset(stream); - - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - - auto const always_true = thrust::constant_iterator{true}; - detail::insert_if_n - <<>>( - first, num_keys, always_true, thrust::identity{}, counter.data(), ref(op::insert)); - - return counter.load_to_host(stream); + return impl_->insert(first, last, ref(op::insert), stream); } template void static_set::insert_async( InputIt first, InputIt last, cuda_stream_ref stream) noexcept { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - - auto const always_true = thrust::constant_iterator{true}; - detail::insert_if_n - <<>>( - first, num_keys, always_true, thrust::identity{}, ref(op::insert)); + impl_->insert_async(first, last, ref(op::insert), stream); } template ::siz static_set::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } - - auto counter = detail::counter_storage{allocator_}; - counter.reset(stream); - - 
auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - - detail::insert_if_n - <<>>( - first, num_keys, stencil, pred, counter.data(), ref(op::insert)); - - return counter.load_to_host(stream); + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } template void static_set::insert_if_async( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - - detail::insert_if_n - <<>>( - first, num_keys, stencil, pred, ref(op::insert)); + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } template void static_set::contains_async( InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - - auto const always_true = thrust::constant_iterator{true}; - detail::contains_if_n - <<>>( - first, num_keys, always_true, thrust::identity{}, output_begin, ref(op::contains)); + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); } template OutputIt output_begin, cuda_stream_ref stream) const noexcept { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * 
detail::CUCO_DEFAULT_BLOCK_SIZE); - - detail::contains_if_n - <<>>( - first, num_keys, stencil, pred, output_begin, ref(op::contains)); + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } template OutputIt static_set::retrieve_all( OutputIt output_begin, cuda_stream_ref stream) const { - auto begin = thrust::make_transform_iterator(thrust::counting_iterator(0), - detail::get_slot(storage_.ref())); - auto filled = detail::slot_is_filled(empty_key_sentinel_); - - std::size_t temp_storage_bytes = 0; - using temp_allocator_type = typename std::allocator_traits::rebind_alloc; - auto temp_allocator = temp_allocator_type{allocator_}; - auto d_num_out = reinterpret_cast( - std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); - CUCO_CUDA_TRY(cub::DeviceSelect::If( - nullptr, temp_storage_bytes, begin, output_begin, d_num_out, capacity(), filled, stream)); - - // Allocate temporary storage - auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); - - CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, - temp_storage_bytes, - begin, - output_begin, - d_num_out, - capacity(), - filled, - stream)); - - size_type h_num_out; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); - std::allocator_traits::deallocate( - temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); - temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); - - return output_begin + h_num_out; + auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->retrieve_all(output_begin, is_filled, stream); } template ::siz static_set::size( cuda_stream_ref stream) const noexcept { - auto counter = detail::counter_storage{allocator_}; - counter.reset(stream); - - auto const grid_size = - (storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * 
detail::CUCO_DEFAULT_BLOCK_SIZE); - - // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to - // v2.1.0 - detail::size - <<>>( - storage_.ref(), this->empty_key_sentinel(), counter.data()); - - return counter.load_to_host(stream); + auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); } template ::capacity() const noexcept { - return storage_.capacity(); + return impl_->capacity(); } template ::empty_key_sentinel() const noexcept { - return empty_key_sentinel_; + return impl_->empty_key_sentinel(); } template Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{ - cuco::empty_key(empty_key_sentinel_), predicate_, probing_scheme_, storage_.ref()}; + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; } } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index ac79623a5..3482738cc 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -16,21 +16,12 @@ #pragma once -#include -#include #include -#include -#include -#include - -#include +#include #include -#include -#include - namespace cuco { namespace experimental { @@ -50,10 +41,8 @@ __host__ __device__ constexpr static_set_ref< KeyEqual const& predicate, ProbingScheme const& probing_scheme, StorageRef storage_ref) noexcept - : empty_key_sentinel_{empty_key_sentinel}, - predicate_{empty_key_sentinel, predicate}, - probing_scheme_{probing_scheme}, - storage_ref_{storage_ref} + : impl_{empty_key_sentinel, probing_scheme, storage_ref}, + predicate_{empty_key_sentinel, predicate} { } @@ -67,7 +56,7 @@ __host__ __device__ constexpr auto static_set_ref::capacity() const noexcept { - return 
storage_ref_.capacity(); + return impl_.capacity(); } template ::empty_key_sentinel() const noexcept { - return empty_key_sentinel_; -} - -template -__device__ - static_set_ref::insert_result - static_set_ref::attempt_insert( - value_type* slot, value_type const& value) -{ - // temporary workaround due to performance regression - // https://github.com/NVIDIA/libcudacxx/issues/366 - value_type const old = [&]() { - value_type expected = this->empty_key_sentinel(); - value_type val = value; - if constexpr (sizeof(value_type) == sizeof(unsigned int)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (Scope == cuda::thread_scope_system) { - return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_device) { - return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_block) { - return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - if constexpr (sizeof(value_type) == sizeof(unsigned long long int)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (Scope == cuda::thread_scope_system) { - return atomicCAS_system( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_device) { - return atomicCAS( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_block) { - return atomicCAS_block( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - }(); - if (*slot == old) { - // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare - return predicate_.equal_to(old, value) == 
detail::equal_result::EQUAL ? insert_result::DUPLICATE - : insert_result::CONTINUE; - } else { - return insert_result::SUCCESS; - } + return predicate_.empty_sentinel_; } namespace detail { @@ -161,36 +95,13 @@ class operator_impl(*this); - auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - // TODO: perf gain with #pragma unroll since num_windows is build time constant - for (auto& slot_content : window_slots) { - auto const eq_res = ref_.predicate_(slot_content, value); - - // If the key is already in the container, return false - if (eq_res == detail::equal_result::EQUAL) { return false; } - if (eq_res == detail::equal_result::EMPTY) { - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - switch (ref_.attempt_insert( - (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value)) { - case insert_result::CONTINUE: continue; - case insert_result::SUCCESS: return true; - case insert_result::DUPLICATE: return false; - } - } - } - ++probing_iter; - } + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert(value, value, ref_.predicate_); } /** @@ -198,53 +109,14 @@ class operator_impl group, + __device__ bool insert(cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { - using insert_result = typename ref_type::insert_result; - - auto& ref_ = static_cast(*this); - auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - auto const [state, intra_window_index] = [&]() { - for (auto i = 0; i < window_size; ++i) { - switch (ref_.predicate_(window_slots[i], value)) { - case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; - 
default: continue; - } - } - // returns dummy index `-1` for UNEQUAL - return cuco::pair{detail::equal_result::UNEQUAL, -1}; - }(); - - // If the key is already in the container, return false - if (group.any(state == detail::equal_result::EQUAL)) { return false; } - - auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); - - if (group_contains_empty) { - auto const src_lane = __ffs(group_contains_empty) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? ref_.attempt_insert( - (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value) - : insert_result::CONTINUE; - - switch (group.shfl(status, src_lane)) { - case insert_result::SUCCESS: return true; - case insert_result::DUPLICATE: return false; - default: continue; - } - } else { - ++probing_iter; - } - } + auto& ref_ = static_cast(*this); + return ref_.impl_.insert(group, value, value, ref_.predicate_); } }; @@ -267,6 +139,32 @@ class operator_impl(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + /** * @brief Inserts the given element into the set. 
* @@ -281,34 +179,8 @@ class operator_impl insert_and_find(value_type const& value) noexcept { - using insert_result = typename ref_type::insert_result; - - ref_type& ref_ = static_cast(*this); - auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - for (auto i = 0; i < window_size; ++i) { - auto const eq_res = ref_.predicate_(window_slots[i], value); - auto* window_ptr = (ref_.storage_ref_.data() + *probing_iter)->data(); - - // If the key is already in the container, return false - if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } - if (eq_res == detail::equal_result::EMPTY) { - switch (ref_.attempt_insert(window_ptr + i, value)) { - case insert_result::SUCCESS: { - return {iterator{&window_ptr[i]}, true}; - } - case insert_result::DUPLICATE: { - return {iterator{&window_ptr[i]}, false}; - } - default: continue; - } - } - } - ++probing_iter; - }; + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert_and_find(value, value, ref_.predicate_); } /** @@ -327,56 +199,8 @@ class operator_impl insert_and_find( cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { - using insert_result = typename ref_type::insert_result; - - ref_type& ref_ = static_cast(*this); - auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - auto const [state, intra_window_index] = [&]() { - for (auto i = 0; i < window_size; ++i) { - switch (ref_.predicate_(window_slots[i], value)) { - case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; - default: continue; - } - } - // returns dummy index `-1` for UNEQUAL - return cuco::pair{detail::equal_result::UNEQUAL, -1}; - }(); 
- - auto* slot_ptr = (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index; - - // If the key is already in the container, return false - auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); - if (group_finds_equal) { - auto const src_lane = __ffs(group_finds_equal) - 1; - auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); - return {iterator{reinterpret_cast(res)}, false}; - } - - auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); - if (group_contains_empty) { - auto const src_lane = __ffs(group_contains_empty) - 1; - auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); - auto const status = (group.thread_rank() == src_lane) ? ref_.attempt_insert(slot_ptr, value) - : insert_result::CONTINUE; - - switch (group.shfl(status, src_lane)) { - case insert_result::SUCCESS: { - return {iterator{reinterpret_cast(res)}, true}; - } - case insert_result::DUPLICATE: { - return {iterator{reinterpret_cast(res)}, false}; - } - default: continue; - } - } else { - ++probing_iter; - } - } + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert_and_find(group, value, value, ref_.predicate_); } }; @@ -400,47 +224,33 @@ class operator_impl [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept { - // CRTP: cast `this` to the actual ref type auto const& ref_ = static_cast(*this); - - auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); - - while (true) { - // TODO atomic_ref::load if insert operator is present - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - for (auto& slot_content : window_slots) { - switch (ref_.predicate_(slot_content, key)) { - case detail::equal_result::UNEQUAL: continue; - case detail::equal_result::EMPTY: return false; - case detail::equal_result::EQUAL: return true; - } - } - ++probing_iter; - } + return ref_.impl_.contains(key, ref_.predicate_); } /** * @brief Indicates whether 
the probe key `key` was inserted into the container. * - * If the probe key `key` was inserted into the container, returns - * true. Otherwise, returns false. + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. * * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for + * * @return A boolean indicating whether the probe key is present */ template @@ -448,28 +258,7 @@ class operator_impl const& group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); - - auto probing_iter = ref_.probing_scheme_(group, key, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - auto const state = [&]() { - for (auto& slot : window_slots) { - switch (ref_.predicate_(slot, key)) { - case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; - case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; - default: continue; - } - } - return detail::equal_result::UNEQUAL; - }(); - - if (group.any(state == detail::equal_result::EQUAL)) { return true; } - if (group.any(state == detail::equal_result::EMPTY)) { return false; } - - ++probing_iter; - } + return ref_.impl_.contains(group, key, ref_.predicate_); } }; @@ -495,27 +284,27 @@ class operator_impl(*this); - return ref_.storage_ref_.end(); + return ref_.impl_.end(); } /** * @brief Returns an iterator to one past the last slot. * - * @note This API is available only when `find_tag` is present. + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. 
* * @return An iterator to one past the last slot */ [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept { auto const& ref_ = static_cast(*this); - return ref_.storage_ref_.end(); + return ref_.impl_.end(); } /** @@ -535,26 +324,7 @@ class operator_impl(*this); - - auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); - - while (true) { - // TODO atomic_ref::load if insert operator is present - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - for (auto i = 0; i < window_size; ++i) { - switch (ref_.predicate_(window_slots[i], key)) { - case detail::equal_result::EMPTY: { - return this->end(); - } - case detail::equal_result::EQUAL: { - return const_iterator{&(*(ref_.storage_ref_.data() + *probing_iter))[i]}; - } - default: continue; - } - } - ++probing_iter; - } + return ref_.impl_.find(key, ref_.predicate_); } /** @@ -575,40 +345,7 @@ class operator_impl const& group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); - - auto probing_iter = ref_.probing_scheme_(group, key, ref_.storage_ref_.num_windows()); - - while (true) { - auto const window_slots = ref_.storage_ref_[*probing_iter]; - - auto const [state, intra_window_index] = [&]() { - for (auto i = 0; i < window_size; ++i) { - switch (ref_.predicate_(window_slots[i], key)) { - case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; - default: continue; - } - } - // returns dummy index `-1` for UNEQUAL - return cuco::pair{detail::equal_result::UNEQUAL, -1}; - }(); - - // Find a match for the probe key, thus return an iterator to the entry - auto const group_finds_match = group.ballot(state == detail::equal_result::EQUAL); - if (group_finds_match) { - auto const src_lane = __ffs(group_finds_match) - 1; - auto const res = - group.shfl(reinterpret_cast( - &(*(ref_.storage_ref_.data() + 
*probing_iter))[intra_window_index]), - src_lane); - return const_iterator{reinterpret_cast(res)}; - } - - // Find an empty slot, meaning that the probe key isn't present in the set - if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } - - ++probing_iter; - } + return ref_.impl_.find(group, key, ref_.predicate_); } }; diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index e2c031096..ac86508de 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -268,10 +268,8 @@ class aow_storage : public aow_storage_base { using base_type::capacity; using base_type::num_windows; - using allocator_type = - typename std::allocator_traits::rebind_alloc; ///< Type of the - ///< allocator to - ///< (de)allocate windows + /// Type of the allocator to (de)allocate windows + using allocator_type = typename std::allocator_traits::rebind_alloc; using window_deleter_type = custom_deleter; ///< Type of window deleter using ref_type = aow_storage_ref; ///< Storage ref type @@ -313,6 +311,13 @@ class aow_storage : public aow_storage_base { */ [[nodiscard]] constexpr window_type* data() const noexcept { return windows_.get(); } + /** + * @brief Gets the storage allocator. + * + * @return The storage allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept { return allocator_; } + /** * @brief Gets window storage reference. 
* diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh index b4fc86890..108aa7f84 100644 --- a/include/cuco/detail/storage/storage.cuh +++ b/include/cuco/detail/storage/storage.cuh @@ -33,13 +33,15 @@ template class storage : StorageImpl::template impl { public: /// Storage implementation type - using impl_type = typename StorageImpl::template impl; - using ref_type = typename impl_type::ref_type; ///< Storage ref type - using value_type = typename impl_type::value_type; ///< Storage value type + using impl_type = typename StorageImpl::template impl; + using ref_type = typename impl_type::ref_type; ///< Storage ref type + using value_type = typename impl_type::value_type; ///< Storage value type + using allocator_type = typename impl_type::allocator_type; ///< Storage value type /// Number of elements per window static constexpr int window_size = impl_type::window_size; + using impl_type::allocator; using impl_type::capacity; using impl_type::data; using impl_type::initialize; diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index ad4b1ae78..c1aeb6965 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -17,10 +17,12 @@ #pragma once #include +#include #include #include #include #include +#include #include #include @@ -37,6 +39,446 @@ #include namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of key-value pairs with unique keys. + * + * The `static_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the map. 
+ * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking + * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` + * should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 4 bytes + * @throw If the size of the given slot type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given mapped type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam T Type of the mapped values + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. 
+ * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = cuco::experimental::double_hashing<4, // CG size + cuco::murmurhash3_32, + cuco::murmurhash3_32>, + class Allocator = cuco::cuda_allocator>, + class Storage = cuco::experimental::aow_storage<1>> +class static_map { + static_assert(sizeof(Key) <= 4, "Container does not support key types larger than 4 bytes."); + + static_assert(cuco::is_bitwise_comparable_v, + "Mapped type must have unique object representations or have been explicitly " + "declared as safe for bitwise comparison via specialization of " + "cuco::is_bitwise_comparable_v."); + + using impl_type = detail::open_addressing_impl, + Extent, + Scope, + KeyEqual, + ProbingScheme, + Allocator, + Storage>; + + public: + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key-value pair type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< 
Probing scheme type + + using mapped_type = T; ///< Payload type + template + using ref_type = + cuco::experimental::static_map_ref; ///< Non-owning container ref type + + static_map(static_map const&) = delete; + static_map& operator=(static_map const&) = delete; + + static_map(static_map&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_map& operator=(static_map&&) = default; + ~static_map() = default; + + /** + * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * automatically grow the map. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert + * this sentinel value. + * + * @param capacity The requested lower-bound map size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. 
For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successful insertions + */ + template + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchonously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the map. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the map if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to + * the query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchonously finds a payload with its key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Retrieves all keys contained in the map. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return + * value of `size()`. + * + * @tparam OutputIt Device accessible random access output iterator whose `value_type` is + * convertible from the container's `key_type`. 
+ * + * @param output_begin Beginning output iterator for keys + * @param stream CUDA stream used for this operation + * + * @return Iterator indicating the end of the output + */ + template + [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_map` object + */ + template + [[nodiscard]] auto ref(Operators... 
ops) const noexcept; + + private: + std::unique_ptr impl_; ///< Static map implementation + mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload +}; +} // namespace experimental template class dynamic_map; @@ -1424,3 +1866,4 @@ class static_map { } // namespace cuco #include +#include diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh new file mode 100644 index 000000000..3be18abc0 --- /dev/null +++ b/include/cuco/static_map_ref.cuh @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data stuctures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. 
+ * + * @throw If the size of the given key type is larger than 4 bytes + * @throw If the size of the given slot type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given payload type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam T Type used for mapped values. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_map_ref + : public detail::operator_impl< + Operators, + static_map_ref>... 
{ + using impl_type = detail::open_addressing_ref_impl; + + static_assert(sizeof(Key) <= 4, "Container does not support key types larger than 4 bytes."); + + static_assert(sizeof(cuco::pair) <= 8, + "Container does not support slot types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + public: + using key_type = Key; ///< Key type + using mapped_type = T; ///< Mapped type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs static_map_ref. 
+ * + * @param empty_key_sentinel Sentinel indicating empty key + * @param empty_value_sentinel Sentinel indicating empty payload + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_map_ref( + cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. 
+ * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + + private: + struct predicate_wrapper; + + impl_type impl_; ///< Static map ref implementation + predicate_wrapper predicate_; ///< Key equality binary callable + mapped_type empty_value_sentinel_; ///< Empty value sentinel + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 5ba161bfa..a37f2a461 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -17,8 +17,7 @@ #pragma once #include -#include -#include +#include #include #include #include @@ -88,38 +87,27 @@ template , cuco::murmurhash3_32>, - class Allocator = cuco::cuda_allocator, + class Allocator = cuco::cuda_allocator, class Storage = cuco::experimental::aow_storage<1>> class static_set { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - - static_assert( - std::is_base_of_v, - ProbingScheme>, - "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + using impl_type = detail:: + open_addressing_impl; public: - static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used to for probing - static constexpr auto window_size = Storage::window_size; ///< Window size used to for probing - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - - using key_type = Key; ///< Key type - using value_type = Key; ///< Key type - /// Extent type - using extent_type = 
decltype(make_valid_extent(std::declval())); - using size_type = typename extent_type::value_type; ///< Size type - using key_equal = KeyEqual; ///< Key equality comparator type - using allocator_type = Allocator; ///< Allocator type - using storage_type = - detail::storage; ///< Storage type - - using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type - using probing_scheme_type = ProbingScheme; ///< Probe scheme type + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + template using ref_type = cuco::experimental::static_set_ref empty_key_sentinel, - KeyEqual pred = {}, + KeyEqual const& pred = {}, ProbingScheme const& probing_scheme = {}, Allocator const& alloc = {}, cuda_stream_ref stream = {}); @@ -368,7 +356,7 @@ class static_set { * `(output_begin + i)`. Else, copies the empty key sentinel. 
* * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys @@ -386,7 +374,7 @@ class static_set { * `(output_begin + i)`. Else, copies the empty key sentinel. * * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from the set's `value_type` + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys @@ -405,7 +393,7 @@ class static_set { * @note This API synchronizes the given stream. * @note The order in which keys are returned is implementation defined and not guaranteed to be * consistent between subsequent calls to `retrieve_all`. - * @note Behavior is undefined if the range beginning at `keys_out` is smaller than the return + * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return * value of `size()`. * * @tparam OutputIt Device accessible random access output iterator whose `value_type` is @@ -456,13 +444,8 @@ class static_set { [[nodiscard]] auto ref(Operators... 
ops) const noexcept; private: - key_type empty_key_sentinel_; ///< Key value that represents an empty slot - key_equal predicate_; ///< Key equality binary predicate - probing_scheme_type probing_scheme_; ///< Probing scheme - allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - storage_type storage_; ///< Slot window storage + std::unique_ptr impl_; }; - } // namespace experimental } // namespace cuco diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index acaa980bf..49cfe116e 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -17,11 +17,14 @@ #pragma once #include +#include #include #include #include +#include + namespace cuco { namespace experimental { @@ -59,17 +62,9 @@ class static_set_ref : public detail::operator_impl< Operators, static_set_ref>... { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + using impl_type = detail::open_addressing_ref_impl; - static_assert( - std::is_base_of_v, - ProbingScheme>, - "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); public: using key_type = Key; ///< Key Type @@ -116,26 +111,8 @@ class static_set_ref [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; private: - // TODO: this should be a common enum for all data structures - enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; - - /** - * @brief Attempts to insert an element into a slot. - * - * @note Dispatches the correct implementation depending on the container - * type and presence of other operator mixins. 
- * - * @param slot Pointer to the slot in memory - * @param value Element to insert - * - * @return Result of this operation, i.e., success/continue/duplicate - */ - [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value); - - cuco::empty_key empty_key_sentinel_; ///< Empty key sentinel - detail::equal_wrapper predicate_; ///< Key equality binary callable - probing_scheme_type probing_scheme_; ///< Probing scheme - storage_ref_type storage_ref_; ///< Slot storage ref + impl_type impl_; + detail::equal_wrapper predicate_; ///< Key equality binary callable // Mixins need to be friends with this class in order to access private members template diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 678fe4098..54ba4e6f1 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -152,3 +152,129 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", } } } + +using size_type = int32_t; + +template +__inline__ void test_unique_sequence(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair_type(i, i); }); + thrust::device_vector d_contained(num_keys); + + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(map.size() == 0); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys 
have no matches") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + d_results.begin(), thrust::constant_iterator{map.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = map.insert_if( + pairs_begin, pairs_begin + num_keys, thrust::counting_iterator(0), is_even); + REQUIRE(inserted == num_keys / 2); + REQUIRE(map.size() == num_keys / 2); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Conditional contains should return true on even inputs.") + { + map.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, 
zip_equal)); + } +} + +TEMPLATE_TEST_CASE_SIG("Unique sequence", + "", + ((cuco::test::probe_sequence Probe, int CGSize), Probe, CGSize), + (cuco::test::probe_sequence::double_hashing, 1), + (cuco::test::probe_sequence::double_hashing, 2), + (cuco::test::probe_sequence::linear_probing, 1), + (cuco::test::probe_sequence::linear_probing, 2)) +{ + using Key = int32_t; + using Value = int32_t; + + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412; // 103 x 2 x 2 + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::aow_storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + REQUIRE(map.capacity() == gold_capacity); + + test_unique_sequence(map, num_keys); +} From 88ff1e4eb7bbdd5a0d1c9025548e1451d8128a93 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 27 Jun 2023 11:58:21 -0700 Subject: [PATCH 120/152] Clean up `cuco::pair` (#319) Contributes to #110 Depends on #314 This PR: - deprecates `cuco::pair_type` alias - fixes issues with `cuco::make_pair` - separates `pair` declarations and implementation details --- .../hash_table/dynamic_map/contains_bench.cu | 2 +- .../hash_table/dynamic_map/erase_bench.cu | 2 +- .../hash_table/dynamic_map/find_bench.cu | 2 +- .../hash_table/dynamic_map/insert_bench.cu | 2 +- .../hash_table/static_map/contains_bench.cu | 2 +- .../hash_table/static_map/erase_bench.cu | 2 +- .../hash_table/static_map/find_bench.cu | 2 +- .../hash_table/static_map/insert_bench.cu | 2 +- .../hash_table/static_multimap/count_bench.cu | 2 +- .../static_multimap/insert_bench.cu | 2 +- .../hash_table/static_multimap/query_bench.cu | 4 +- .../static_multimap/retrieve_bench.cu | 4 +- examples/static_multimap/host_bulk_example.cu | 2 +- include/cuco/detail/dynamic_map.inl | 2 +- 
.../cuco/detail/open_addressing_ref_impl.cuh | 2 +- include/cuco/detail/pair.inl | 51 ++++++ include/cuco/detail/probe_sequence_impl.cuh | 9 +- .../cuco/detail/static_multimap/kernels.cuh | 4 +- include/cuco/detail/traits.hpp | 59 +++++++ include/cuco/detail/utils.cuh | 92 +++++++++++ include/cuco/dynamic_map.cuh | 2 +- include/cuco/pair.cuh | 146 ++++++++++++++++++ include/cuco/static_map.cuh | 8 +- include/cuco/static_multimap.cuh | 6 +- include/cuco/utility/traits.hpp | 3 + tests/dynamic_map/unique_sequence_test.cu | 6 +- tests/static_map/custom_type_test.cu | 25 ++- tests/static_map/duplicate_keys_test.cu | 2 +- tests/static_map/heterogeneous_lookup_test.cu | 6 +- tests/static_map/key_sentinel_test.cu | 32 ++-- tests/static_map/stream_test.cu | 6 +- tests/static_map/unique_sequence_test.cu | 78 +++++----- .../custom_pair_retrieve_test.cu | 10 +- tests/static_multimap/custom_type_test.cu | 46 +++--- .../heterogeneous_lookup_test.cu | 6 +- tests/static_multimap/insert_if_test.cu | 4 +- tests/static_multimap/multiplicity_test.cu | 70 ++++----- tests/static_multimap/non_match_test.cu | 74 ++++----- tests/static_multimap/pair_function_test.cu | 10 +- tests/utility/storage_test.cu | 2 +- 40 files changed, 567 insertions(+), 224 deletions(-) create mode 100644 include/cuco/detail/pair.inl create mode 100644 include/cuco/detail/traits.hpp create mode 100644 include/cuco/pair.cuh diff --git a/benchmarks/hash_table/dynamic_map/contains_bench.cu b/benchmarks/hash_table/dynamic_map/contains_bench.cu index 8e41b8e2d..ff349bc53 100644 --- a/benchmarks/hash_table/dynamic_map/contains_bench.cu +++ b/benchmarks/hash_table/dynamic_map/contains_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const initial_size = 
state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); diff --git a/benchmarks/hash_table/dynamic_map/erase_bench.cu b/benchmarks/hash_table/dynamic_map/erase_bench.cu index b815515e8..96f5ec7ec 100644 --- a/benchmarks/hash_table/dynamic_map/erase_bench.cu +++ b/benchmarks/hash_table/dynamic_map/erase_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); diff --git a/benchmarks/hash_table/dynamic_map/find_bench.cu b/benchmarks/hash_table/dynamic_map/find_bench.cu index 12576ccc1..b06cfab4e 100644 --- a/benchmarks/hash_table/dynamic_map/find_bench.cu +++ b/benchmarks/hash_table/dynamic_map/find_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); diff --git a/benchmarks/hash_table/dynamic_map/insert_bench.cu b/benchmarks/hash_table/dynamic_map/insert_bench.cu index de2fa8a4a..8e8cc8a84 100644 --- a/benchmarks/hash_table/dynamic_map/insert_bench.cu +++ b/benchmarks/hash_table/dynamic_map/insert_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); diff --git 
a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu index 09737a136..0b5d482a1 100644 --- a/benchmarks/hash_table/static_map/contains_bench.cu +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu index 3f26504a7..c6e56eb07 100644 --- a/benchmarks/hash_table/static_map/erase_bench.cu +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu index 4a1ccca11..276a35e0b 100644 --- a/benchmarks/hash_table/static_map/find_bench.cu +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_map/insert_bench.cu 
b/benchmarks/hash_table/static_map/insert_bench.cu index b6fadc057..ef997bef8 100644 --- a/benchmarks/hash_table/static_map/insert_bench.cu +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index e087e3243..fa71c8d0c 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index c045f3a91..aa41044bb 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu 
b/benchmarks/hash_table/static_multimap/query_bench.cu index 783c83556..7d6202297 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); @@ -99,4 +99,4 @@ NVBENCH_BENCH_TYPES(static_multimap_query, .set_name("static_multimap_query_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); \ No newline at end of file + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index 432bd3485..e30fbe547 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -35,7 +35,7 @@ template std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( nvbench::state& state, nvbench::type_list) { - using pair_type = cuco::pair_type; + using pair_type = cuco::pair; auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); @@ -98,4 +98,4 @@ NVBENCH_BENCH_TYPES(static_multimap_retrieve, .set_name("static_multimap_retrieve_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); \ No newline at end of file + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); 
diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index a7d5a95a7..d1fe5589a 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -60,7 +60,7 @@ int main(void) // The `_outer` suffix indicates that the occurrence of a non-match is 1. auto const output_size = map.count_outer(keys_to_find.begin(), keys_to_find.end()); - thrust::device_vector> d_results(output_size); + thrust::device_vector> d_results(output_size); // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results` // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel` diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index e329eefeb..7b5145190 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -146,7 +146,7 @@ void dynamic_map::insert( auto const n = std::min(capacity_remaining, num_to_insert); auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); - detail::insert> + detail::insert> <<>>(first, first + n, submap_views_.data().get(), diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index dd3a84434..56b699104 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include diff --git a/include/cuco/detail/pair.inl b/include/cuco/detail/pair.inl new file mode 100644 index 000000000..56d16e4fb --- /dev/null +++ b/include/cuco/detail/pair.inl @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { + +template +__host__ __device__ constexpr pair::pair(First const& f, Second const& s) + : first{f}, second{s} +{ +} + +template +template +__host__ __device__ constexpr pair::pair(pair const& p) + : first{p.first}, second{p.second} +{ +} + +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept +{ + return pair, std::decay_t>(std::forward(f), std::forward(s)); +} + +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept +{ + return lhs.first == rhs.first and lhs.second == rhs.second; +} + +} // namespace cuco diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 14124b639..c108840b2 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +16,8 @@ #pragma once -#include +#include +#include #include @@ -71,13 +72,13 @@ template class probe_sequence_impl_base { protected: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values /// Pair type of atomic key and atomic mapped value - using pair_atomic_type = cuco::pair_type; + using pair_atomic_type = cuco::pair; /// Type of the forward iterator to `pair_atomic_type` using iterator = pair_atomic_type*; /// Type of the forward iterator to `const pair_atomic_type` diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index c010fa8f3..ca5f898a5 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp new file mode 100644 index 000000000..602a93251 --- /dev/null +++ b/include/cuco/detail/traits.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cuco::detail { + +template +struct is_std_pair_like : cuda::std::false_type { +}; + +template +struct is_std_pair_like(cuda::std::declval())), + decltype(cuda::std::get<1>(cuda::std::declval()))>> + : cuda::std::conditional_t::value == 2, + cuda::std::true_type, + cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like_impl : cuda::std::false_type { +}; + +template +struct is_thrust_pair_like_impl< + T, + cuda::std::void_t(cuda::std::declval())), + decltype(thrust::get<1>(cuda::std::declval()))>> + : cuda::std::conditional_t::value == 2, + cuda::std::true_type, + cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like + : is_thrust_pair_like_impl()))>> { +}; + +} // namespace cuco::detail diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 0a337d4f5..fdded70f5 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -19,6 +19,8 @@ #include +#include + namespace cuco { namespace detail { @@ -102,5 +104,95 @@ struct strong_type { T value; ///< Underlying value }; +/** + * @brief Gives value to use as alignment for a pair type that is at least the + * size of the sum of the size of the first type and second type, or 16, + * whichever is smaller. + */ +template +constexpr std::size_t pair_alignment() +{ + return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second))); +} + +/** + * @brief Denotes the equivalent packed type based on the size of the object. 
+ * + * @tparam N The size of the object + */ +template +struct packed { + using type = void; ///< `void` type by default +}; +/** + * @brief Denotes the packed type when the size of the object is 8. + */ +template <> +struct packed { + using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 +}; +/** + * @brief Denotes the packed type when the size of the object is 4. + */ +template <> +struct packed { + using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 +}; +template +using packed_t = typename packed::type; + +/** + * @brief Indicates if a pair type can be packed. + * + * When the size of the key,value pair being inserted into the hash table is + * equal in size to a type where atomicCAS is natively supported, it is more + * efficient to "pack" the pair and insert it with a single atomicCAS. + * + * Pair types whose key and value have the same object representation may be + * packed. Also, the `Pair` must not contain any padding bits otherwise + * accessing the packed value would be undefined. + * + * @tparam Pair The pair type that will be packed + * + * @return true If the pair type can be packed + * @return false If the pair type cannot be packed + */ +template +constexpr bool is_packable() +{ + return not std::is_void>::value and std::has_unique_object_representations_v; +} +/** + * @brief Allows viewing a pair in a packed representation. 
+ * + * Used as an optimization for inserting when a pair can be inserted with a + * single atomicCAS + */ +template +union pair_converter { + using packed_type = packed_t; ///< The packed pair type + packed_type packed; ///< The pair in the packed representation + Pair pair; ///< The pair in the pair representation + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @tparam T Type that is convertible to `Pair` + * + * @param p The pair to copy from + */ + template + __device__ pair_converter(T&& p) : pair{p} + { + } + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @param p The packed data to copy from + */ + __device__ pair_converter(packed_type p) : packed{p} {} +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index b9dcf9f22..8aa87163f 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -101,7 +101,7 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_ctr_type = cuda::atomic; ///< Atomic counter type diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh new file mode 100644 index 000000000..0a804cc04 --- /dev/null +++ b/include/cuco/pair.cuh @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace cuco { + +/** + * @brief Custom pair type + * + * @note This is necessary because `thrust::pair` is under aligned. + * + * @tparam First Type of the first value in the pair + * @tparam Second Type of the second value in the pair + */ +template +struct alignas(detail::pair_alignment()) pair { + using first_type = First; ///< Type of the first value in the pair + using second_type = Second; ///< Type of the second value in the pair + + pair() = default; + ~pair() = default; + pair(pair const&) = default; ///< Copy constructor + pair(pair&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair const&) = default; + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair&&) = default; + + /** + * @brief Constructs a pair from objects `f` and `s`. + * + * @param f The object to copy into `first` + * @param s The object to copy into `second` + */ + __host__ __device__ constexpr pair(First const& f, Second const& s); + + /** + * @brief Constructs a pair by copying from the given pair `p`. 
+ * + * @tparam F Type of the first value of `p` + * @tparam S Type of the second value of `p` + * + * @param p The pair to copy from + */ + template + __host__ __device__ constexpr pair(pair const& p); + + /** + * @brief Constructs a pair from the given std::pair-like `p`. + * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} + { + } + + /** + * @brief Constructs a pair from the given thrust::pair-like `p`. + * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{thrust::get<0>(thrust::raw_reference_cast(p)), + thrust::get<1>(thrust::raw_reference_cast(p))} + { + } + + First first; ///< The first value in the pair + Second second; ///< The second value in the pair +}; + +/** + * @brief Creates a pair with the given first and second elements + * + * @tparam F Type of first element + * @tparam S Type of second element + * + * @param f First element + * @param s Second element + * + * @return A pair with first element `f` and second element `s`. + */ +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept; + +/** + * @brief Tests if both elements of lhs and rhs are equal + * + * @tparam T1 Type of the first element of the left-hand side pair + * @tparam T2 Type of the second element of the left-hand side pair + * @tparam U1 Type of the first element of the right-hand side pair + * @tparam U2 Type of the second element of the right-hand side pair + * + * @param lhs Left-hand side pair + * @param rhs Right-hand side pair + * + * @return True if two pairs are equal. 
False otherwise + */ +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept; + +} // namespace cuco + +#include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index c1aeb6965..f296e9ed1 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -577,14 +577,14 @@ class static_map { friend class dynamic_map; ///< Dynamic map as friend class public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; ///< Pair type of atomic key and atomic mapped value using slot_type = pair_atomic_type; ///< Type of hash map slots using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index fe68da32b..075848dd2 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -149,14 +149,14 @@ class static_multimap { "cuco::linear_probing."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; 
///< Pair type of atomic key and atomic mapped value using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type using slot_allocator_type = typename std::allocator_traits::rebind_alloc< diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp index 78e8dabcb..1a6252dcb 100644 --- a/include/cuco/utility/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include namespace cuco { diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index 820fb95f8..aa01ca51a 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -48,9 +48,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index ac743037a..e23216ca3 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -131,9 +131,9 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", insert_values.begin(), [] __device__(auto i) { return Value{i}; }); - auto insert_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION("All inserted 
keys-value pairs should be correctly recovered during find") { @@ -212,7 +212,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); auto view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, insert_pairs + num, [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.contains(pair.first, hash_custom_key{}, custom_key_equals{}); })); } @@ -220,12 +220,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("Inserting unique keys should return insert success.") { auto m_view = map.get_device_mutable_view(); - REQUIRE( - cuco::test::all_of(insert_pairs, - insert_pairs + num, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); - })); + REQUIRE(cuco::test::all_of(insert_pairs, + insert_pairs + num, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); + })); } SECTION("Cannot find any key in an empty hash map") @@ -236,7 +235,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", REQUIRE(cuco::test::all_of( insert_pairs, insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) mutable { + [view] __device__(cuco::pair const& pair) mutable { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } @@ -245,9 +244,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto const view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } diff --git a/tests/static_map/duplicate_keys_test.cu 
b/tests/static_map/duplicate_keys_test.cu index 54d1c42f1..5620fa4e9 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i / 2, i / 2); }); + [] __device__(auto i) { return cuco::pair(i / 2, i / 2); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index 17b7d5662..e842612b1 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -99,9 +99,9 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", cuco::static_map map{ capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index dcf88c99c..d8441e9cf 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -51,19 +51,19 @@ TEMPLATE_TEST_CASE_SIG( } CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int))); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION( "Tests of non-CG 
insert: The custom `key_equal` can never be used to compare against sentinel") { - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, cuco::murmurhash3_32{}, custom_equals{}); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert( + pair, cuco::murmurhash3_32{}, custom_equals{}); + })); } SECTION( @@ -72,13 +72,11 @@ TEMPLATE_TEST_CASE_SIG( map.insert( pairs_begin, pairs_begin + num_keys, cuco::murmurhash3_32{}, custom_equals{}); // All keys inserted via custom `key_equal` should be found - REQUIRE(cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and - found->second.load() == pair.second); - })); + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); } } diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 35cd7e821..2bf71e2e6 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -53,9 +53,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto hash_fn = 
cuco::murmurhash3_32{}; auto equal_fn = thrust::equal_to{}; diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 54ba4e6f1..3fa4ef219 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -51,9 +51,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); @@ -87,69 +87,63 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("Inserting unique keys should return insert success.") { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair); + })); } SECTION("Cannot find any key in an empty hash map with non-const view") { SECTION("non-const view") - { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first) == view.end(); - })); - } - SECTION("const view") { REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] __device__(cuco::pair const& pair) mutable { return view.find(pair.first) == view.end(); })); } + SECTION("const view") + { + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair 
const& pair) { + return view.find(pair.first) == view.end(); + })); + } } SECTION("Keys are all found after inserting many keys.") { // Bulk insert keys - thrust::for_each(thrust::device, - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - m_view.insert(pair); - }); + thrust::for_each( + thrust::device, + pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { m_view.insert(pair); }); SECTION("non-const view") - { - // All keys should be found - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } - SECTION("const view") { // All keys should be found REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] __device__(cuco::pair const& pair) mutable { auto const found = view.find(pair.first); return (found != view.end()) and (found->first.load() == pair.first and found->second.load() == pair.second); })); } + SECTION("const view") + { + // All keys should be found + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); + } } } @@ -165,10 +159,10 @@ __inline__ void test_unique_sequence(Map& map, size_type num_keys) thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - auto keys_begin = d_keys.begin(); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto keys_begin = d_keys.begin(); + auto pairs_begin = + 
thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_contained(num_keys); auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 563abd835..b9aba7e76 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -35,8 +35,8 @@ // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -86,7 +86,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) using Key = typename Map::key_type; using Value = typename Map::mapped_type; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -94,7 +94,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); auto pair_begin = d_pairs.begin(); @@ -107,7 +107,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); // create an array of prefix sum diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index d76404b18..f53719205 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -98,7 +98,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count 
= map.count(key_begin, key_begin + num_pairs, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( key_begin, key_begin + num_pairs, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -110,16 +110,17 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Non-matches are not included in the output") @@ -141,7 +142,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count = map.count(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -153,15 +154,16 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - 
REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Outer functions include non-matches in the output") @@ -183,7 +185,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) map.count_outer(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count_outer == num); - thrust::device_vector> found_pairs(num); + thrust::device_vector> found_pairs(num); auto output_end = map.retrieve_outer( query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size_outer = std::distance(found_pairs.begin(), output_end); diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index 9b724d43c..5a5b8b242 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -103,9 +103,9 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", cuco::linear_probing<1, custom_hasher>> map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_multimap/insert_if_test.cu 
b/tests/static_multimap/insert_if_test.cu index 0d560ff6e..33f0b2ce3 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -55,7 +55,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 1 @@ -64,7 +64,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); using probe = std::conditional_t< diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index f21d52c3d..650145a41 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -36,7 +36,7 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) using Value = typename Map::mapped_type; thrust::device_vector d_keys(num_items / 2); - thrust::device_vector> d_pairs(num_items); + thrust::device_vector> d_pairs(num_items); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -45,10 +45,10 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) thrust::counting_iterator(num_items), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - thrust::device_vector> d_results(num_items); + thrust::device_vector> d_results(num_items); auto key_begin = d_keys.begin(); auto pair_begin = d_pairs.begin(); @@ -91,22 +91,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == num_items); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return 
lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } SECTION("count and count_outer should return the same value.") @@ -129,22 +129,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == size_outer); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index be76a38ce..afa6a938c 100644 --- 
a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -39,7 +39,7 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s SECTION("Output of count and retrieve should be coherent.") { auto num = map.count(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == num_keys); @@ -50,28 +50,28 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s REQUIRE(size == num_keys); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_keys, + thrust::sort( + thrust::device, output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_keys, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } SECTION("Output of count_outer and retrieve_outer should be coherent.") { auto num = map.count_outer(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == (num_keys + num_keys / 2)); @@ -82,34 +82,34 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s REQUIRE(size == (num_keys + num_keys / 2)); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& 
rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); + thrust::sort( + thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); // create gold reference - thrust::device_vector> gold(size); + thrust::device_vector> gold(size); auto gold_begin = gold.begin(); thrust::transform(thrust::device, thrust::counting_iterator(0), thrust::counting_iterator(size), gold_begin, [num_keys] __device__(auto i) { - if (i < num_keys) { return cuco::pair_type{i / 2, i}; } - return cuco::pair_type{i - num_keys / 2, -1}; + if (i < num_keys) { return cuco::pair{i / 2, i}; } + return cuco::pair{i - num_keys / 2, -1}; }); - REQUIRE(cuco::test::equal( - gold_begin, - gold_begin + size, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + REQUIRE( + cuco::test::equal(gold_begin, + gold_begin + size, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } @@ -127,7 +127,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -136,7 +136,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); using probe = std::conditional_t< diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index 8edecd6f1..2eae61a66 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ 
-32,8 +32,8 @@ // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -54,7 +54,7 @@ __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); SECTION("pair_contains returns true for all inserted pairs and false for non-inserted ones.") @@ -121,7 +121,7 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::double_hashing)) { constexpr std::size_t num_pairs{4}; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -129,7 +129,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); using probe = std::conditional_t< diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index c82b5ab44..afb9848d3 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -16,9 +16,9 @@ #include -#include #include #include +#include #include #include From af9cffa7405401128dab983b86c6edd48c8e8957 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 27 Jun 2023 14:55:46 -0700 Subject: [PATCH 121/152] Update docs for the new static_map::retrieve_all (#323) Fixes inaccurate docs for `experimental::static_map::retrieve_all` --- include/cuco/static_map.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index f296e9ed1..c89fd4d01 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -412,7 +412,7 @@ class static_map { cuda_stream_ref stream = 
{}) const; /** - * @brief Retrieves all keys contained in the map. + * @brief Retrieves all of the keys and their associated values. * * @note This API synchronizes the given stream. * @note The order in which keys are returned is implementation defined and not guaranteed to be @@ -421,9 +421,9 @@ class static_map { * value of `size()`. * * @tparam OutputIt Device accessible random access output iterator whose `value_type` is - * convertible from the container's `key_type`. + * convertible from the container's `value_type`. * - * @param output_begin Beginning output iterator for keys + * @param output_begin Beginning output iterator for key-value pairs * @param stream CUDA stream used for this operation * * @return Iterator indicating the end of the output From 5e27a54673a2b155054d5482fd307c4a201825da Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 27 Jun 2023 17:57:26 -0700 Subject: [PATCH 122/152] Add container-specific detail namespace to resolve naming conflicts (#324) This PRs adds container-specific namespace, e.g. `static_map_ns` and `static_set_ns`, to resolve naming conflict. 
--- include/cuco/detail/static_map/functors.cuh | 2 ++ include/cuco/detail/static_map/kernels.cuh | 2 ++ include/cuco/detail/static_map/static_map.inl | 6 +++--- include/cuco/detail/static_set/functors.cuh | 2 ++ include/cuco/detail/static_set/kernels.cuh | 2 ++ include/cuco/detail/static_set/static_set.inl | 6 +++--- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/static_map/functors.cuh b/include/cuco/detail/static_map/functors.cuh index c807ed5f1..0f9a7352b 100644 --- a/include/cuco/detail/static_map/functors.cuh +++ b/include/cuco/detail/static_map/functors.cuh @@ -19,6 +19,7 @@ namespace cuco { namespace experimental { +namespace static_map_ns { namespace detail { /** @@ -53,5 +54,6 @@ struct slot_is_filled { }; } // namespace detail +} // namespace static_map_ns } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index c27c2eac8..29906d061 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -26,6 +26,7 @@ namespace cuco { namespace experimental { +namespace static_map_ns { namespace detail { /** @@ -89,5 +90,6 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ } } // namespace detail +} // namespace static_map_ns } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index e4f414313..0757c8c51 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -224,7 +224,7 @@ void static_map + static_map_ns::detail::find <<>>( first, num_keys, output_begin, ref(op::find)); } @@ -242,7 +242,7 @@ OutputIt static_map::retrieve_all( OutputIt output_begin, cuda_stream_ref stream) const { - auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + auto const is_filled = 
static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); return impl_->retrieve_all(output_begin, is_filled, stream); } @@ -258,7 +258,7 @@ static_map:: static_map::size( cuda_stream_ref stream) const noexcept { - auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); return impl_->size(is_filled, stream); } diff --git a/include/cuco/detail/static_set/functors.cuh b/include/cuco/detail/static_set/functors.cuh index ce3183b8d..3ee7be4be 100644 --- a/include/cuco/detail/static_set/functors.cuh +++ b/include/cuco/detail/static_set/functors.cuh @@ -19,6 +19,7 @@ namespace cuco { namespace experimental { +namespace static_set_ns { namespace detail { /** @@ -53,5 +54,6 @@ struct slot_is_filled { }; } // namespace detail +} // namespace static_set_ns } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index 4023dc16e..004a50b58 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -26,6 +26,7 @@ namespace cuco { namespace experimental { +namespace static_set_ns { namespace detail { /** @@ -87,5 +88,6 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ } } // namespace detail +} // namespace static_set_ns } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 769a1131f..c625a2042 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -205,7 +205,7 @@ void static_set (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); - detail::find + static_set_ns::detail::find <<>>( first, num_keys, output_begin, 
ref(op::find)); } @@ -221,7 +221,7 @@ template OutputIt static_set::retrieve_all( OutputIt output_begin, cuda_stream_ref stream) const { - auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); return impl_->retrieve_all(output_begin, is_filled, stream); } @@ -236,7 +236,7 @@ static_set::siz static_set::size( cuda_stream_ref stream) const noexcept { - auto const is_filled = detail::slot_is_filled(this->empty_key_sentinel()); + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); return impl_->size(is_filled, stream); } From 6bc62c6fc9d2edbd1dacbe1b242b173d4cffad99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 29 Jun 2023 16:59:26 +0200 Subject: [PATCH 123/152] Make XXHash the default hash function (#318) --- include/cuco/dynamic_map.cuh | 8 ++-- include/cuco/hash_functions.cuh | 14 +++++-- include/cuco/probe_sequences.cuh | 2 +- include/cuco/probing_scheme.cuh | 2 +- include/cuco/static_map.cuh | 42 +++++++++---------- include/cuco/static_multimap.cuh | 3 +- include/cuco/static_set.cuh | 3 +- tests/static_map/key_sentinel_test.cu | 8 ++-- tests/static_map/stream_test.cu | 2 +- .../custom_pair_retrieve_test.cu | 7 ++-- tests/static_multimap/insert_if_test.cu | 7 ++-- tests/static_multimap/multiplicity_test.cu | 7 ++-- tests/static_multimap/non_match_test.cu | 9 ++-- tests/static_multimap/pair_function_test.cu | 7 ++-- tests/static_set/capacity_test.cu | 26 ++++-------- tests/static_set/insert_and_find_test.cu | 10 ++--- tests/static_set/large_input_test.cu | 8 +--- tests/static_set/retrieve_all_test.cu | 10 ++--- tests/static_set/unique_sequence_test.cu | 10 ++--- 19 files changed, 84 insertions(+), 101 deletions(-) diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index 8aa87163f..998ff3647 100644 --- 
a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -208,7 +208,7 @@ class dynamic_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -247,7 +247,7 @@ class dynamic_map { * provided at construction */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -277,7 +277,7 @@ class dynamic_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -307,7 +307,7 @@ class dynamic_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh index 21b78d675..000f46fef 100644 --- a/include/cuco/hash_functions.cuh +++ b/include/cuco/hash_functions.cuh @@ -44,7 +44,7 @@ template using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64; /** - * @brief A `murmurhash3_32` hash function to hash the given argument on host and device. + * @brief A 32-bit `MurmurHash3` hash function to hash the given argument on host and device. * * @tparam Key The type of the values to hash */ @@ -52,7 +52,7 @@ template using murmurhash3_32 = detail::MurmurHash3_32; /** - * @brief A `XXH32` hash function to hash the given argument on host and device. + * @brief A 32-bit `XXH32` hash function to hash the given argument on host and device. * * @tparam Key The type of the values to hash */ @@ -60,11 +60,19 @@ template using xxhash_32 = detail::XXHash_32; /** - * @brief A `XXH64` hash function to hash the given argument on host and device. + * @brief A 64-bit `XXH64` hash function to hash the given argument on host and device. 
* * @tparam Key The type of the values to hash */ template using xxhash_64 = detail::XXHash_64; +/** + * @brief Default hash function. + * + * @tparam Key The type of the values to hash + */ +template +using default_hash_function = xxhash_32; + } // namespace cuco diff --git a/include/cuco/probe_sequences.cuh b/include/cuco/probe_sequences.cuh index 071b0921e..7921b6629 100644 --- a/include/cuco/probe_sequences.cuh +++ b/include/cuco/probe_sequences.cuh @@ -60,7 +60,7 @@ class linear_probing : public detail::probe_sequence_base { * @tparam Hash1 Unary callable type * @tparam Hash2 Unary callable type */ -template +template class double_hashing : public detail::probe_sequence_base { public: using probe_sequence_base_type = diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index 0880ee97b..039433cef 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -96,7 +96,7 @@ class linear_probing : private detail::probing_scheme_base { * @tparam Hash1 Unary callable type * @tparam Hash2 Unary callable type */ -template +template class double_hashing : private detail::probing_scheme_base { public: using probing_scheme_base_type = diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index c89fd4d01..e740160c4 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -89,11 +89,11 @@ template , cuda::thread_scope Scope = cuda::thread_scope_device, class KeyEqual = thrust::equal_to, - class ProbingScheme = cuco::experimental::double_hashing<4, // CG size - cuco::murmurhash3_32, - cuco::murmurhash3_32>, - class Allocator = cuco::cuda_allocator>, - class Storage = cuco::experimental::aow_storage<1>> + class ProbingScheme = + cuco::experimental::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator>, + class Storage = cuco::experimental::aow_storage<1>> class static_map { static_assert(sizeof(Key) <= 4, "Container does not support key 
types larger than 4 bytes."); @@ -691,7 +691,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -725,7 +725,7 @@ class static_map { template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert_if(InputIt first, InputIt last, @@ -763,7 +763,7 @@ class static_map { * provided at construction */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -792,7 +792,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -847,7 +847,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, @@ -1369,7 +1369,7 @@ class static_map { * equality * @return `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool insert(value_type const& insert_pair, Hash hash = Hash{}, @@ -1400,7 +1400,7 @@ class static_map { * @return a pair consisting of an iterator to the element and a bool, * either `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ thrust::pair insert_and_find( value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1425,7 +1425,7 @@ class static_map { * @return `true` if the insert was successful, `false` otherwise. 
*/ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool insert(CG const& g, value_type const& insert_pair, @@ -1446,7 +1446,7 @@ class static_map { * equality * @return `true` if the erasure was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool erase(key_type const& k, Hash hash = Hash{}, @@ -1469,7 +1469,7 @@ class static_map { * @return `true` if the erasure was successful, `false` otherwise. */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool erase(CG const& g, key_type const& k, @@ -1636,7 +1636,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ iterator find(Key const& k, Hash hash = Hash{}, @@ -1656,7 +1656,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ const_iterator find(Key const& k, Hash hash = Hash{}, @@ -1683,7 +1683,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1709,7 +1709,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ const_iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1738,7 +1738,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool contains(ProbeKey const& k, Hash hash = 
Hash{}, @@ -1773,7 +1773,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ std::enable_if_t, bool> contains( CG const& g, diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 075848dd2..9e2a2e280 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -130,8 +130,7 @@ template , - class ProbeSequence = - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>> + class ProbeSequence = cuco::double_hashing<8, cuco::default_hash_function>> class static_multimap { static_assert( cuco::is_bitwise_comparable_v, diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index a37f2a461..058db0b89 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -85,8 +85,7 @@ template , class ProbingScheme = experimental::double_hashing<4, // CG size - cuco::murmurhash3_32, - cuco::murmurhash3_32>, + cuco::default_hash_function>, class Allocator = cuco::cuda_allocator, class Storage = cuco::experimental::aow_storage<1>> class static_set { diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index d8441e9cf..74a1badd1 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -62,15 +62,17 @@ TEMPLATE_TEST_CASE_SIG( pairs_begin + num_keys, [m_view] __device__(cuco::pair const& pair) mutable { return m_view.insert( - pair, cuco::murmurhash3_32{}, custom_equals{}); + pair, cuco::default_hash_function{}, custom_equals{}); })); } SECTION( "Tests of CG insert: The custom `key_equal` can never be used to compare against sentinel") { - map.insert( - pairs_begin, pairs_begin + num_keys, cuco::murmurhash3_32{}, custom_equals{}); + map.insert(pairs_begin, + pairs_begin + num_keys, + cuco::default_hash_function{}, + custom_equals{}); // All keys inserted via custom `key_equal` should be found REQUIRE(cuco::test::all_of( 
pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 2bf71e2e6..6121cbd62 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -57,7 +57,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", thrust::make_transform_iterator(thrust::make_counting_iterator(0), [] __device__(auto i) { return cuco::pair(i, i); }); - auto hash_fn = cuco::murmurhash3_32{}; + auto hash_fn = cuco::default_hash_function{}; auto equal_fn = thrust::equal_to{}; // bulk function test cases diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index b9aba7e76..7856b9e20 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -196,10 +196,9 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_pairs{200}; - using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; cuco::static_multimap, probe> map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 33f0b2ce3..5d5648e71 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -67,10 +67,9 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair{i, i}; }); - using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; cuco::static_multimap, 
probe> map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 650145a41..5de83a042 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -161,10 +161,9 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_items{4}; - using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; cuco::static_multimap, probe> map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index afa6a938c..94023af56 100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -139,16 +139,15 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair{i / 2, i}; }); - using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; cuco::static_multimap, - cuco::linear_probing<1, cuco::murmurhash3_32>> + cuco::linear_probing<1, cuco::default_hash_function>> map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index 2eae61a66..3ef49377d 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -132,10 +132,9 @@ TEMPLATE_TEST_CASE_SIG( return cuco::pair{i / 2, i}; }); - using probe = std::conditional_t< - Probe 
== cuco::test::probe_sequence::linear_probing, - cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::double_hashing<8, cuco::murmurhash3_32, cuco::murmurhash3_32>>; + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; cuco::static_multimap, probe> map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu index e4d3e146a..e144325d5 100644 --- a/tests/static_set/capacity_test.cu +++ b/tests/static_set/capacity_test.cu @@ -21,9 +21,8 @@ TEST_CASE("Static set capacity", "") { constexpr std::size_t num_keys{400}; - using Key = int32_t; - using ProbeT = - cuco::experimental::double_hashing<1, cuco::murmurhash3_32, cuco::murmurhash3_32>; + using Key = int32_t; + using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; using Equal = thrust::equal_to; using AllocatorT = cuco::cuda_allocator; using StorageT = cuco::experimental::aow_storage<2>; @@ -35,11 +34,7 @@ TEST_CASE("Static set capacity", "") using extent_type = cuco::experimental::extent; cuco::experimental:: static_set - set{extent_type{}, - cuco::empty_key{-1}, - {}, - ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, - {}}; + set{extent_type{}, cuco::empty_key{-1}}; auto const capacity = set.capacity(); STATIC_REQUIRE(capacity == gold_capacity); @@ -55,11 +50,7 @@ TEST_CASE("Static set capacity", "") using extent_type = cuco::experimental::extent; cuco::experimental:: static_set - set{num_keys, - cuco::empty_key{-1}, - {}, - ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, - {}}; + set{num_keys, cuco::empty_key{-1}}; auto const capacity = set.capacity(); REQUIRE(capacity == gold_capacity); @@ -73,10 +64,10 @@ TEST_CASE("Static set capacity", "") auto constexpr gold_capacity = 412; // 103 x 2 x 2 using extent_type = cuco::experimental::extent; - using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + using probe = 
cuco::experimental::linear_probing<2, cuco::default_hash_function>; auto set = cuco::experimental:: static_set{ - extent_type{}, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + extent_type{}, cuco::empty_key{-1}}; REQUIRE(set.capacity() == gold_capacity); @@ -92,15 +83,14 @@ TEST_CASE("Static set capacity", "") { auto constexpr gold_capacity = 412; // 103 x 2 x 2 - using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; auto set = cuco::experimental::static_set, cuda::thread_scope_device, Equal, probe, AllocatorT, - StorageT>{ - num_keys, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + StorageT>{num_keys, cuco::empty_key{-1}}; auto const capacity = set.capacity(); REQUIRE(capacity == gold_capacity); diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu index 7c4ff08fa..9d0cc057a 100644 --- a/tests/static_set/insert_and_find_test.cu +++ b/tests/static_set/insert_and_find_test.cu @@ -93,12 +93,10 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_keys{400}; - using probe = - std::conditional_t>, - cuco::experimental::double_hashing, - cuco::murmurhash3_32>>; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; auto set = cuco::experimental::static_set, diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu index 6f4e5803b..5015ca750 100644 --- a/tests/static_set/large_input_test.cu +++ b/tests/static_set/large_input_test.cu @@ -67,16 +67,12 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{1'200'000'000}; using extent_type = cuco::experimental::extent; - using probe = cuco::experimental:: - double_hashing, cuco::murmurhash3_32>; + using probe = cuco::experimental::double_hashing>; try { auto set = cuco::experimental:: static_set, 
probe>{ - num_keys * 2, - cuco::empty_key{-1}, - thrust::equal_to{}, - probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + num_keys * 2, cuco::empty_key{-1}}; thrust::device_vector d_contained(num_keys); test_unique_sequence(set, d_contained.data().get(), num_keys); diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu index 5f7b0ff9d..97a489455 100644 --- a/tests/static_set/retrieve_all_test.cu +++ b/tests/static_set/retrieve_all_test.cu @@ -75,12 +75,10 @@ TEMPLATE_TEST_CASE_SIG( : 422 // 211 x 2 x 1 ; - using probe = - std::conditional_t>, - cuco::experimental::double_hashing, - cuco::murmurhash3_32>>; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; auto set = cuco::experimental::static_set, diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 7285577bf..4c037463a 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -132,12 +132,10 @@ TEMPLATE_TEST_CASE_SIG( : 412 // 103 x 2 x 2 ; - using probe = - std::conditional_t>, - cuco::experimental::double_hashing, - cuco::murmurhash3_32>>; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; auto set = cuco::experimental::static_set, From 806aa8051ba933c758636586e2c34487282465f0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Jun 2023 16:21:37 -0700 Subject: [PATCH 124/152] Modify `experimental::static_map::retrieve_all` API (#325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR modifies the public `experimental::static_map::retrieve_all` API to take a key iterator and a value iterator separately. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- include/cuco/detail/open_addressing_impl.cuh | 13 ++--- include/cuco/detail/static_map/functors.cuh | 55 +++++++++++++++++-- include/cuco/detail/static_map/static_map.inl | 19 +++++-- include/cuco/detail/static_set/static_set.inl | 6 +- include/cuco/static_map.cuh | 21 ++++--- tests/static_map/unique_sequence_test.cu | 16 ++++++ 6 files changed, 104 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index 6c7fa7965..a13088091 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -386,25 +386,24 @@ class open_addressing_impl { * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return * value of `size()`. * + * @tparam InputIt Device accessible container slot iterator * @tparam OutputIt Device accessible random access output iterator whose `value_type` is - * convertible from the container's `key_type` + * convertible from the container's `value_type` * @tparam Predicate Type of predicate indicating if the given slot is filled * + * @param begin Beginning of the container slot iterator * @param output_begin Beginning output iterator for keys * @param is_filled Predicate indicating if the given slot is filled * @param stream CUDA stream used for this operation * * @return Iterator indicating the end of the output */ - template - [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, + template + [[nodiscard]] OutputIt retrieve_all(InputIt begin, + OutputIt output_begin, Predicate const& is_filled, cuda_stream_ref stream) const { - auto begin = - thrust::make_transform_iterator(thrust::counting_iterator(0), - detail::get_slot(this->storage_ref())); - std::size_t temp_storage_bytes = 0; using temp_allocator_type = typename std::allocator_traits::rebind_alloc; auto temp_allocator = 
temp_allocator_type{this->allocator()}; diff --git a/include/cuco/detail/static_map/functors.cuh b/include/cuco/detail/static_map/functors.cuh index 0f9a7352b..f508206f0 100644 --- a/include/cuco/detail/static_map/functors.cuh +++ b/include/cuco/detail/static_map/functors.cuh @@ -17,17 +17,51 @@ #include +#include + namespace cuco { namespace experimental { namespace static_map_ns { namespace detail { +/** + * @brief Device functor returning the content of the slot indexed by `idx`. + * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. + * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + auto const [first, second] = storage_[window_idx][intra_idx]; + return thrust::make_tuple(first, second); + } +}; + /** * @brief Device functor returning whether the input slot indexed by `idx` is filled. * - * @tparam T The slot content type + * @tparam T The slot key type + * @tparam U The slot value type */ -template +template struct slot_is_filled { T empty_sentinel_; ///< The value of the empty key sentinel @@ -44,10 +78,23 @@ struct slot_is_filled { * @tparam U Slot content type * * @param slot The slot + * + * @return `true` if slot is filled + */ + template + __device__ constexpr bool operator()(Slot const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, thrust::get<0>(slot)); + } + + /** + * @brief Indicates if the target slot `slot` is filled. 
+ * + * @param slot The slot + * * @return `true` if slot is filled */ - template - __device__ constexpr bool operator()(U const& slot) const noexcept + __device__ constexpr bool operator()(cuco::pair const& slot) const noexcept { return not cuco::detail::bitwise_compare(empty_sentinel_, slot.first); } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 0757c8c51..7df6b69c0 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -237,13 +237,20 @@ template -template -OutputIt +template +std::pair static_map::retrieve_all( - OutputIt output_begin, cuda_stream_ref stream) const + KeyOut keys_out, ValueOut values_out, cuda_stream_ref stream) const { - auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); - return impl_->retrieve_all(output_begin, is_filled, stream); + auto const begin = thrust::make_transform_iterator( + thrust::counting_iterator{0}, + static_map_ns::detail::get_slot(impl_->storage_ref())); + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + auto zipped_out_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + auto const zipped_out_end = impl_->retrieve_all(begin, zipped_out_begin, is_filled, stream); + auto const num = std::distance(zipped_out_begin, zipped_out_end); + + return std::make_pair(keys_out + num, values_out + num); } template :: static_map::size( cuda_stream_ref stream) const noexcept { - auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); return impl_->size(is_filled, stream); } diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index c625a2042..aa2030e31 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ 
b/include/cuco/detail/static_set/static_set.inl @@ -221,8 +221,12 @@ template OutputIt static_set::retrieve_all( OutputIt output_begin, cuda_stream_ref stream) const { + auto const begin = + thrust::make_transform_iterator(thrust::counting_iterator{0}, + detail::get_slot(impl_->storage_ref())); auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); - return impl_->retrieve_all(output_begin, is_filled, stream); + + return impl_->retrieve_all(begin, output_begin, is_filled, stream); } template - [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + template + [[nodiscard]] std::pair retrieve_all(KeyOut keys_out, + ValueOut values_out, + cuda_stream_ref stream = {}) const; /** * @brief Gets the number of elements in the container. diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 3fa4ef219..43013b0c8 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -234,6 +235,21 @@ __inline__ void test_unique_sequence(Map& map, size_type num_keys) REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); } + + SECTION("All inserted key-values should be properly retrieved") + { + thrust::device_vector d_values(num_keys); + + auto const [keys_end, values_end] = map.retrieve_all(keys_begin, d_values.begin()); + REQUIRE(std::distance(keys_begin, keys_end) == num_keys); + REQUIRE(std::distance(d_values.begin(), values_end) == num_keys); + + thrust::sort(thrust::device, d_values.begin(), values_end); + REQUIRE(cuco::test::equal(d_values.begin(), + values_end, + thrust::make_counting_iterator(0), + thrust::equal_to{})); + } } TEMPLATE_TEST_CASE_SIG("Unique sequence", From 1ea86e270ffcd9148c978cae33fdf10c3c853448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 
30 Jun 2023 19:14:03 +0200 Subject: [PATCH 125/152] Fix xxhash64 implementation (#326) There is a typo in the current `xxhash_64` implementation which results in incorrect hash values if the key type is not a multiple of 4 bytes. This PR fixes this typo and adds tests for <4byte key types. --- include/cuco/detail/hash_functions/xxhash.cuh | 2 +- tests/utility/hash_test.cu | 63 ++++++++++++------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 10360be9b..2ea67fd8a 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -326,7 +326,7 @@ struct XXHash_64 { // block size if constexpr (nbytes % 4) { while (offset < nbytes) { - h64 += (bytes[offset] & 0xff) * prime5; + h64 ^= (bytes[offset] & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 6dca70aea..5e518669e 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -48,21 +48,28 @@ __host__ __device__ bool check_hash_result(typename Hash::argument_type const& k template __global__ void check_hash_result_kernel_64(OutputIter result) { - result[0] = check_hash_result>(0, 0, 4246796580750024372); - result[1] = check_hash_result>(0, 42, 3614696996920510707); - result[2] = check_hash_result>(42, 0, 15516826743637085169); - result[3] = check_hash_result>(123456789, 0, 9462334144942111946); + int i = 0; - result[4] = check_hash_result>(0, 0, 3803688792395291579); - result[5] = check_hash_result>(0, 42, 13194218611613725804); - result[6] = check_hash_result>(42, 0, 13066772586158965587); - result[7] = check_hash_result>(123456789, 0, 14662639848940634189); + result[i++] = check_hash_result>(0, 0, 16804241149081757544); + result[i++] = check_hash_result>(42, 0, 765293966243412708); + result[i++] = check_hash_result>(0, 42, 9486749600008296231); + + 
result[i++] = check_hash_result>(0, 0, 4246796580750024372); + result[i++] = check_hash_result>(0, 42, 3614696996920510707); + result[i++] = check_hash_result>(42, 0, 15516826743637085169); + result[i++] = check_hash_result>(123456789, 0, 9462334144942111946); + + result[i++] = check_hash_result>(0, 0, 3803688792395291579); + result[i++] = check_hash_result>(0, 42, 13194218611613725804); + result[i++] = check_hash_result>(42, 0, 13066772586158965587); + result[i++] = check_hash_result>(123456789, 0, 14662639848940634189); #if defined(CUCO_HAS_INT128) - result[8] = check_hash_result>(123456789, 0, 7986913354431084250); + result[i++] = check_hash_result>(123456789, 0, 7986913354431084250); #endif - result[9] = check_hash_result>>(123456789, 0, 2031761887105658523); + result[i++] = + check_hash_result>>(123456789, 0, 2031761887105658523); } TEST_CASE("Test cuco::xxhash_64", "") @@ -70,6 +77,10 @@ TEST_CASE("Test cuco::xxhash_64", "") // Reference hash values were computed using https://github.com/Cyan4973/xxHash SECTION("Check if host-generated hash values match the reference implementation.") { + CHECK(check_hash_result>(0, 0, 16804241149081757544)); + CHECK(check_hash_result>(42, 0, 765293966243412708)); + CHECK(check_hash_result>(0, 42, 9486749600008296231)); + CHECK(check_hash_result>(0, 0, 4246796580750024372)); CHECK(check_hash_result>(0, 42, 3614696996920510707)); CHECK(check_hash_result>(42, 0, 15516826743637085169)); @@ -101,21 +112,27 @@ TEST_CASE("Test cuco::xxhash_64", "") template __global__ void check_hash_result_kernel_32(OutputIter result) { - result[0] = check_hash_result>(0, 0, 148298089); - result[1] = check_hash_result>(0, 42, 2132181312); - result[2] = check_hash_result>(42, 0, 1161967057); - result[3] = check_hash_result>(123456789, 0, 2987034094); + int i = 0; + + result[i++] = check_hash_result>(0, 0, 3479547966); + result[i++] = check_hash_result>(42, 0, 3774771295); + result[i++] = check_hash_result>(0, 42, 2099223482); - result[4] = 
check_hash_result>(0, 0, 3736311059); - result[5] = check_hash_result>(0, 42, 1076387279); - result[6] = check_hash_result>(42, 0, 2332451213); - result[7] = check_hash_result>(123456789, 0, 1561711919); + result[i++] = check_hash_result>(0, 0, 148298089); + result[i++] = check_hash_result>(0, 42, 2132181312); + result[i++] = check_hash_result>(42, 0, 1161967057); + result[i++] = check_hash_result>(123456789, 0, 2987034094); + + result[i++] = check_hash_result>(0, 0, 3736311059); + result[i++] = check_hash_result>(0, 42, 1076387279); + result[i++] = check_hash_result>(42, 0, 2332451213); + result[i++] = check_hash_result>(123456789, 0, 1561711919); #if defined(CUCO_HAS_INT128) - result[8] = check_hash_result>(123456789, 0, 1846633701); + result[i++] = check_hash_result>(123456789, 0, 1846633701); #endif - result[9] = check_hash_result>>(123456789, 0, 3715432378); + result[i++] = check_hash_result>>(123456789, 0, 3715432378); } TEST_CASE("Test cuco::xxhash_32", "") @@ -123,6 +140,10 @@ TEST_CASE("Test cuco::xxhash_32", "") // Reference hash values were computed using https://github.com/Cyan4973/xxHash SECTION("Check if host-generated hash values match the reference implementation.") { + CHECK(check_hash_result>(0, 0, 3479547966)); + CHECK(check_hash_result>(42, 0, 3774771295)); + CHECK(check_hash_result>(0, 42, 2099223482)); + CHECK(check_hash_result>(0, 0, 148298089)); CHECK(check_hash_result>(0, 42, 2132181312)); CHECK(check_hash_result>(42, 0, 1161967057)); @@ -143,7 +164,7 @@ TEST_CASE("Test cuco::xxhash_32", "") SECTION("Check if device-generated hash values match the reference implementation.") { - thrust::device_vector result(10); + thrust::device_vector result(20, true); check_hash_result_kernel_32<<<1, 1>>>(result.begin()); From 7c5c29db87534a4b27acbd733439f3db0b0b44c4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 6 Jul 2023 13:58:41 -0700 Subject: [PATCH 126/152] Fix pair implementation issues (#330) - Fixe a bug of std pair traits bug unveiled by 
cudf [CI](https://github.com/rapidsai/cudf/actions/runs/5469754572/jobs/9959101572?pr=13665) - Removes the redundant file --- include/cuco/detail/pair.cuh | 295 --------------------------------- include/cuco/detail/traits.hpp | 12 +- 2 files changed, 6 insertions(+), 301 deletions(-) delete mode 100644 include/cuco/detail/pair.cuh diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh deleted file mode 100644 index 94ad5090f..000000000 --- a/include/cuco/detail/pair.cuh +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -namespace cuco { -namespace detail { - -/** - * @brief Rounds `v` to the nearest power of 2 greater than or equal to `v`. - * - * @param v - * @return The nearest power of 2 greater than or equal to `v`. - */ -constexpr std::size_t next_pow2(std::size_t v) noexcept -{ - --v; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return ++v; -} - -/** - * @brief Gives value to use as alignment for a pair type that is at least the - * size of the sum of the size of the first type and second type, or 16, - * whichever is smaller. 
- */ -template -constexpr std::size_t pair_alignment() -{ - return std::min(std::size_t{16}, next_pow2(sizeof(First) + sizeof(Second))); -} - -template -struct is_std_pair_like : std::false_type { -}; - -template -struct is_std_pair_like< - T, - std::void_t(std::declval())), decltype(std::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like_impl : std::false_type { -}; - -template -struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like - : is_thrust_pair_like_impl< - std::remove_reference_t()))>> { -}; - -/** - * @brief Denotes the equivalent packed type based on the size of the object. - * - * @tparam N The size of the object - */ -template -struct packed { - using type = void; ///< `void` type by default -}; -/** - * @brief Denotes the packed type when the size of the object is 8. - */ -template <> -struct packed { - using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 -}; -/** - * @brief Denotes the packed type when the size of the object is 4. - */ -template <> -struct packed { - using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 -}; -template -using packed_t = typename packed::type; - -/** - * @brief Indicates if a pair type can be packed. - * - * When the size of the key,value pair being inserted into the hash table is - * equal in size to a type where atomicCAS is natively supported, it is more - * efficient to "pack" the pair and insert it with a single atomicCAS. - * - * Pair types whose key and value have the same object representation may be - * packed. Also, the `pair_type` must not contain any padding bits otherwise - * accessing the packed value would be undefined. 
- * - * @tparam pair_type The pair type that will be packed - * - * @return true If the pair type can be packed - * @return false If the pair type cannot be packed - */ -template -constexpr bool is_packable() -{ - return not std::is_void>::value and - std::has_unique_object_representations_v; -} - -/** - * @brief Allows viewing a pair in a packed representation. - * - * Used as an optimization for inserting when a pair can be inserted with a - * single atomicCAS - */ -template -union pair_converter { - using packed_type = packed_t; ///< The packed pair type - packed_type packed; ///< The pair in the packed representation - pair_type pair; ///< The pair in the pair representation - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @tparam T Type that is convertible to `pair_type` - * - * @param p The pair to copy from - */ - template - __device__ pair_converter(T&& p) : pair{p} - { - } - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @param p The packed data to copy from - */ - __device__ pair_converter(packed_type p) : packed{p} {} -}; - -} // namespace detail - -/** - * @brief Custom pair type - * - * This is necessary because `thrust::pair` is under aligned. - * - * @tparam First Type of the first value in the pair - * @tparam Second Type of the second value in the pair - */ -template -struct alignas(detail::pair_alignment()) pair { - using first_type = First; ///< Type of the first value in the pair - using second_type = Second; ///< Type of the second value in the pair - - pair() = default; - ~pair() = default; - pair(pair const&) = default; ///< Copy constructor - pair(pair&&) = default; ///< Move constructor - - /** - * @brief Replaces the contents of the pair with another pair. - * - * @return Reference of the current pair object - */ - pair& operator=(pair const&) = default; - - /** - * @brief Replaces the contents of the pair with another pair. 
- * - * @return Reference of the current pair object - */ - pair& operator=(pair&&) = default; - - /** - * @brief Constructs a pair from objects `f` and `s`. - * - * @param f The object to copy into `first` - * @param s The object to copy into `second` - */ - __host__ __device__ constexpr pair(First const& f, Second const& s) : first{f}, second{s} {} - - /** - * @brief Constructs a pair by copying from the given pair `p`. - * - * @tparam F Type of the first value of `p` - * @tparam S Type of the second value of `p` - * - * @param p The pair to copy from - */ - template - __host__ __device__ constexpr pair(pair const& p) : first{p.first}, second{p.second} - { - } - - /** - * @brief Constructs a pair from the given std::pair-like `p`. - * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} - { - } - - /** - * @brief Constructs a pair from the given thrust::pair-like `p`. - * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{thrust::get<0>(thrust::raw_reference_cast(p)), - thrust::get<1>(thrust::raw_reference_cast(p))} - { - } - - First first; ///< The first value in the pair - Second second; ///< The second value in the pair -}; - -template -using pair_type = cuco::pair; - -/** - * @brief Creates a pair of type `pair_type` - * - * @tparam F - * @tparam S - * - * @param f - * @param s - * @return pair_type with first element `f` and second element `s`. 
- */ -template -__host__ __device__ pair_type make_pair(F&& f, S&& s) noexcept -{ - return pair_type{std::forward(f), std::forward(s)}; -} - -/** - * @brief Tests if both elements of lhs and rhs are equal - * - * @tparam T1 Type of the first element of the left-hand side pair - * @tparam T2 Type of the second element of the left-hand side pair - * @tparam U1 Type of the first element of the right-hand side pair - * @tparam U2 Type of the second element of the right-hand side pair - * - * @param lhs Left-hand side pair - * @param rhs Right-hand side pair - * - * @return True if two pairs are equal. False otherwise - */ -template -__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, - cuco::pair const& rhs) noexcept -{ - return lhs.first == rhs.first and lhs.second == rhs.second; -} - -} // namespace cuco diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp index 602a93251..313f95430 100644 --- a/include/cuco/detail/traits.hpp +++ b/include/cuco/detail/traits.hpp @@ -18,9 +18,10 @@ #include #include -#include #include +#include + namespace cuco::detail { template @@ -29,11 +30,10 @@ struct is_std_pair_like : cuda::std::false_type { template struct is_std_pair_like(cuda::std::declval())), - decltype(cuda::std::get<1>(cuda::std::declval()))>> - : cuda::std::conditional_t::value == 2, - cuda::std::true_type, - cuda::std::false_type> { + cuda::std::void_t(cuda::std::declval())), + decltype(std::get<1>(cuda::std::declval()))>> + : cuda::std:: + conditional_t::value == 2, cuda::std::true_type, cuda::std::false_type> { }; template From 303f134573afa315cf14fca3f7a0b730438497c3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 6 Jul 2023 15:48:52 -0700 Subject: [PATCH 127/152] Add 16-byte key-value support to `experimental::static_map` (#321) This PR adds `back_to_back_cas` and `cas_dependent_write` to the base `open_addressing_ref_impl` class. It allows the support of 8-byte keys and payloads in the new experimental map. 
--- include/cuco/detail/open_addressing_impl.cuh | 7 +- .../cuco/detail/open_addressing_ref_impl.cuh | 266 ++++++++++++++---- .../cuco/detail/static_map/static_map_ref.inl | 14 + include/cuco/static_map.cuh | 9 +- include/cuco/static_map_ref.cuh | 9 +- include/cuco/static_set_ref.cuh | 2 - tests/static_map/unique_sequence_test.cu | 36 ++- 7 files changed, 272 insertions(+), 71 deletions(-) diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index a13088091..d586fe5c3 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -42,7 +42,8 @@ namespace detail { * * @note This class should NOT be used directly. * - * @throw If the size of the given slot type is larger than 8 bytes + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given slot type is larger than 16 bytes * @throw If the given key type doesn't have unique object representations, i.e., * `cuco::bitwise_comparable_v == false` * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` @@ -65,7 +66,9 @@ template class open_addressing_impl { - static_assert(sizeof(Value) <= 8, "Container does not support slot types larger than 8 bytes."); + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert(sizeof(Value) <= 16, "Container does not support slot types larger than 16 bytes."); static_assert( cuco::is_bitwise_comparable_v, diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 56b699104..0e9cfdbe4 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -38,6 +38,7 @@ namespace detail { * * @note This class should NOT be used directly. 
* + * @throw If the size of the given key type is larger than 8 bytes * @throw If the given key type doesn't have unique object representations, i.e., * `cuco::bitwise_comparable_v == false` * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` @@ -49,6 +50,8 @@ namespace detail { */ template class open_addressing_ref_impl { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert( cuco::is_bitwise_comparable_v, "Key type must have unique object representations or have been explicitly declared as safe for " @@ -253,7 +256,13 @@ class open_addressing_ref_impl { // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } if (eq_res == detail::equal_result::EMPTY) { - switch (attempt_insert(window_ptr + i, value, predicate)) { + switch ([&]() { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(window_ptr + i, value, predicate); + } else { + return cas_dependent_write(window_ptr + i, value, predicate); + } + }()) { case insert_result::SUCCESS: { return {iterator{&window_ptr[i]}, true}; } @@ -323,9 +332,14 @@ class open_addressing_ref_impl { if (group_contains_empty) { auto const src_lane = __ffs(group_contains_empty) - 1; auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); - auto const status = (group.thread_rank() == src_lane) - ? attempt_insert(slot_ptr, value, predicate) - : insert_result::CONTINUE; + auto const status = [&]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(slot_ptr, value, predicate); + } else { + return cas_dependent_write(slot_ptr, value, predicate); + } + }(); switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { @@ -541,10 +555,91 @@ class open_addressing_ref_impl { }; /** - * @brief Attempts to insert an element into a slot. 
+ * @brief Compares the content of the address `address` (old value) with the `expected` value and, + * only if they are the same, sets the content of `address` to `desired`. * - * @note Dispatches the correct implementation depending on the container - * type and presence of other operator mixins. + * @tparam T Address content type + * + * @param address The target address + * @param expected The value expected to be found at the target address + * @param desired The value to store at the target address if it is as expected + * + * @return The old value located at address `address` + */ + template + __device__ constexpr auto compare_and_swap(T* address, T expected, T desired) + { + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == 
cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Atomically stores `value` at the given `address`. + * + * @tparam T Address content type + * + * @param address The target address + * @param value The value to store + */ + template + __device__ constexpr void atomic_store(T* address, T value) + { + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Inserts the specified element with one single CAS operation. 
* * @tparam Predicate Predicate type * @@ -555,53 +650,130 @@ class open_addressing_ref_impl { * @return Result of this operation, i.e., success/continue/duplicate */ template - [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, - value_type const& value, - Predicate const& predicate) + [[nodiscard]] __device__ constexpr insert_result packed_cas(value_type* slot, + value_type const& value, + Predicate const& predicate) noexcept { - // temporary workaround due to performance regression - // https://github.com/NVIDIA/libcudacxx/issues/366 - auto old = [&]() { - value_type expected = this->empty_slot_sentinel_; - value_type val = value; - if constexpr (sizeof(value_type) == sizeof(unsigned int)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (Scope == cuda::thread_scope_system) { - return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_device) { - return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_block) { - return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - if constexpr (sizeof(value_type) == sizeof(unsigned long long int)) { - auto* expected_ptr = reinterpret_cast(&expected); - auto* value_ptr = reinterpret_cast(&val); - if constexpr (Scope == cuda::thread_scope_system) { - return atomicCAS_system( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_device) { - return atomicCAS( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else if constexpr (Scope == cuda::thread_scope_block) { - return atomicCAS_block( - reinterpret_cast(slot), *expected_ptr, *value_ptr); - } else { - static_assert(cuco::dependent_false, "Unsupported thread scope"); - } - } - }(); + auto old = 
compare_and_swap(slot, this->empty_slot_sentinel_, value); auto* old_ptr = reinterpret_cast(&old); - if (*slot == *old_ptr) { + if (cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_)) { + return insert_result::SUCCESS; + } else { // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare return predicate.equal_to(*old_ptr, value) == detail::equal_result::EQUAL ? insert_result::DUPLICATE : insert_result::CONTINUE; - } else { + } + } + + /** + * @brief Inserts the specified element with two back-to-back CAS operations. + * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result back_to_back_cas( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + auto const expected_payload = this->empty_slot_sentinel_.second; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + auto old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + + using mapped_type = decltype(expected_payload); + + auto* old_key_ptr = reinterpret_cast(&old_key); + auto* old_payload_ptr = reinterpret_cast(&old_payload); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + while (not cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + } return insert_result::SUCCESS; + } else if (cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + atomic_store(&slot->second, expected_payload); + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use 
`predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Inserts the specified element with CAS-dependent write operations. + * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result cas_dependent_write( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + atomic_store(&slot->second, value.second); + return insert_result::SUCCESS; + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. 
+ * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, + value_type const& value, + Predicate const& predicate) noexcept + { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(slot, value, predicate); + } else { +#if (__CUDA_ARCH__ < 700) + return cas_dependent_write(slot, value, predicate); +#else + return back_to_back_cas(slot, value, predicate); +#endif } } diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index fce7c941b..f3c412924 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -146,6 +146,20 @@ struct static_map_ref == false` * @throw If the given mapped type doesn't have unique object representations, i.e., @@ -95,7 +96,9 @@ template >, class Storage = cuco::experimental::aow_storage<1>> class static_map { - static_assert(sizeof(Key) <= 4, "Container does not support key types larger than 4 bytes."); + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 3be18abc0..b278ee453 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -37,8 +37,8 @@ namespace experimental { * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent * device operation.
`cg_size == 1` uses the scalar (or non-CG) code paths. * - * @throw If the size of the given key type is larger than 4 bytes - * @throw If the size of the given slot type is larger than 8 bytes + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes * @throw If the given key type doesn't have unique object representations, i.e., * `cuco::bitwise_comparable_v == false` * @throw If the given payload type doesn't have unique object representations, i.e., @@ -66,10 +66,7 @@ class static_map_ref static_map_ref>... { using impl_type = detail::open_addressing_ref_impl; - static_assert(sizeof(Key) <= 4, "Container does not support key types larger than 4 bytes."); - - static_assert(sizeof(cuco::pair) <= 8, - "Container does not support slot types larger than 8 bytes."); + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); static_assert( cuco::is_bitwise_comparable_v, diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index 49cfe116e..941829256 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -64,8 +64,6 @@ class static_set_ref static_set_ref>... 
{ using impl_type = detail::open_addressing_ref_impl; - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - public: using key_type = Key; ///< Key Type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 43013b0c8..d0581afdb 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -252,17 +252,31 @@ __inline__ void test_unique_sequence(Map& map, size_type num_keys) } } -TEMPLATE_TEST_CASE_SIG("Unique sequence", - "", - ((cuco::test::probe_sequence Probe, int CGSize), Probe, CGSize), - (cuco::test::probe_sequence::double_hashing, 1), - (cuco::test::probe_sequence::double_hashing, 2), - (cuco::test::probe_sequence::linear_probing, 1), - (cuco::test::probe_sequence::linear_probing, 2)) +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) { - using Key = int32_t; - using Value = int32_t; - constexpr size_type num_keys{400}; constexpr size_type gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 : 412; // 103 x 2 x 2 @@ -282,7 +296,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence", probe, cuco::cuda_allocator, cuco::experimental::aow_storage<2>>{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; REQUIRE(map.capacity() == gold_capacity); From c00debe54bf46d7d6af0f654cb87c1fa43007d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 11 Jul 2023 02:50:20 +0200 Subject: [PATCH 128/152] Add function to re-initialize/clear a container (#333) This PR adds `clear(stream)` and `clear_async(stream)` for both `cuco::experimental::static_set` `cuco::experimental::static_map`. --- include/cuco/detail/open_addressing_impl.cuh | 31 +++++++++++++++++-- include/cuco/detail/static_map/static_map.inl | 28 +++++++++++++++++ include/cuco/detail/static_set/static_set.inl | 26 ++++++++++++++++ include/cuco/static_map.cuh | 20 +++++++++++- include/cuco/static_set.cuh | 20 +++++++++++- tests/static_set/size_test.cu | 8 +++-- 6 files changed, 126 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index d586fe5c3..2c8688869 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -106,8 +106,10 @@ class open_addressing_impl { * window size and it's computed via `make_valid_extent` factory. Insert operations will not * automatically grow the container. Attempting to insert more unique keys than the capacity of * the container results in undefined behavior. 
- * @note The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. * * @param capacity The requested lower-bound size * @param empty_key_sentinel The reserved key value for empty slots @@ -125,11 +127,35 @@ class open_addressing_impl { Allocator const& alloc, cuda_stream_ref stream) noexcept : empty_key_sentinel_{empty_key_sentinel}, + empty_slot_sentinel_{empty_slot_sentinel}, predicate_{pred}, probing_scheme_{probing_scheme}, storage_{make_valid_extent(capacity), alloc} { - storage_.initialize(empty_slot_sentinel, stream); + this->clear_async(stream); + } + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream) noexcept + { + this->clear_async(stream); + stream.synchronize(); + } + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. 
+ * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream) noexcept + { + storage_.initialize(empty_slot_sentinel_, stream); } /** @@ -526,6 +552,7 @@ class open_addressing_impl { protected: key_type empty_key_sentinel_; ///< Key value that represents an empty slot + value_type empty_slot_sentinel_; ///< Slot value that represents an empty slot key_equal predicate_; ///< Key equality binary predicate probing_scheme_type probing_scheme_; ///< Probing scheme storage_type storage_; ///< Slot window storage diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 7df6b69c0..76b1df79b 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -54,6 +54,34 @@ constexpr static_map +void static_map::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_map::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + template +void static_set::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_set::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + template Date: Wed, 12 Jul 2023 04:37:55 +0200 Subject: [PATCH 129/152] Enable hash computation from variable length keys (#327) --- benchmarks/hash_bench.cu | 2 +- .../detail/hash_functions/murmurhash3.cuh | 85 +++++++----- include/cuco/detail/hash_functions/utils.cuh | 28 ++++ include/cuco/detail/hash_functions/xxhash.cuh | 126 +++++++++++------- tests/utility/hash_test.cu | 25 ++++ 5 files changed, 180 insertions(+), 86 deletions(-) create mode 100644 include/cuco/detail/hash_functions/utils.cuh diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 58c6ee770..973f6976d 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -77,7 +77,7 @@ void hash_eval(nvbench::state& state, 
nvbench::type_list) state.add_element_count(num_keys); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + state.exec([&](nvbench::launch& launch) { hash_bench_kernel<<>>( Hash{}, num_keys, hash_values.begin(), materialize_result); }); diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index ce5ab9d56..a12143523 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,7 +16,12 @@ #pragma once +#include +#include + +#include #include +#include namespace cuco::detail { @@ -31,15 +36,15 @@ template struct MurmurHash3_fmix32 { static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix32(uint32_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. 
@@ -49,7 +54,7 @@ struct MurmurHash3_fmix32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint32_t h = static_cast(key) ^ seed_; + std::uint32_t h = static_cast(key) ^ seed_; h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; @@ -59,7 +64,7 @@ struct MurmurHash3_fmix32 { } private: - uint32_t seed_; + std::uint32_t seed_; }; /** @@ -73,15 +78,15 @@ template struct MurmurHash3_fmix64 { static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint64_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix64(uint64_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. 
@@ -91,7 +96,7 @@ struct MurmurHash3_fmix64 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint64_t h = static_cast(key) ^ seed_; + std::uint64_t h = static_cast(key) ^ seed_; h ^= h >> 33; h *= 0xff51afd7ed558ccd; h ^= h >> 33; @@ -101,7 +106,7 @@ struct MurmurHash3_fmix64 { } private: - uint64_t seed_; + std::uint64_t seed_; }; /** @@ -121,36 +126,50 @@ struct MurmurHash3_fmix64 { */ template struct MurmurHash3_32 { - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - constexpr int len = sizeof(argument_type); - const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } - uint32_t h1 = seed_; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. 
+ * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { + auto const nblocks = size / 4; + + std::uint32_t h1 = seed_; + constexpr std::uint32_t c1 = 0xcc9e2d51; + constexpr std::uint32_t c2 = 0x1b873593; //---------- // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); + for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { + std::uint32_t k1 = load_chunk(bytes, i); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -160,13 +179,12 @@ struct MurmurHash3_32 { } //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; + std::uint32_t k1 = 0; + switch (size & 3) { + case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; case 1: - k1 ^= tail[0]; + k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -174,19 +192,18 @@ struct MurmurHash3_32 { }; //---------- // finalization - h1 ^= len; + h1 ^= size; h1 = fmix32_(h1); return h1; } private: - constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept + constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept { return (x << r) | (x >> (32 - r)); } - MurmurHash3_fmix32 fmix32_; - uint32_t seed_; + MurmurHash3_fmix32 fmix32_; + std::uint32_t seed_; }; - } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh new file mode 100644 index 
000000000..a50779f23 --- /dev/null +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco::detail { + +template +constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept +{ + auto const chunks = reinterpret_cast(data); + return chunks[index]; +} + +}; // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 2ea67fd8a..c686f3b82 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -16,6 +16,10 @@ #pragma once +#include +#include + +#include #include namespace cuco::detail { @@ -81,40 +85,51 @@ struct XXHash_32 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - // TODO do we need to add checks/hints for alignment? 
- constexpr auto nbytes = sizeof(Key); - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks = - reinterpret_cast(&key); ///< 4-byte word access + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { std::size_t offset = 0; std::uint32_t h32; // data can be processed in 16-byte chunks - if constexpr (nbytes >= 16) { - constexpr auto limit = nbytes - 16; - std::uint32_t v1 = seed_ + prime1 + prime2; - std::uint32_t v2 = seed_ + prime2; - std::uint32_t v3 = seed_; - std::uint32_t v4 = seed_ - prime1; + if (size >= 16) { + auto const limit = size - 16; + std::uint32_t v1 = seed_ + prime1 + prime2; + std::uint32_t v2 = seed_ + prime2; + std::uint32_t v3 = seed_; + std::uint32_t v4 = seed_ - prime1; do { // pipeline 4*4byte computations auto const pipeline_offset = offset / 4; - v1 += blocks[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 13); v1 *= prime1; - v2 += blocks[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 13); v2 *= prime1; - v3 += blocks[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 13); v3 *= prime1; - v4 += blocks[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 13); v4 *= prime1; offset += 16; @@ -125,20 +140,20 @@ struct XXHash_32 { h32 = seed_ + prime5; } - h32 += nbytes; + h32 += size; // remaining data can be processed in 4-byte chunks - if constexpr ((nbytes % 
16) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h32 += blocks[offset / 4] * prime3; + if ((size % 16) >= 4) { + for (; offset <= size - 4; offset += 4) { + h32 += load_chunk(bytes, offset / 4) * prime3; h32 = rotl(h32, 17) * prime4; } } // the following loop is only needed if the size of the key is not a multiple of the block size - if constexpr (nbytes % 4) { - while (offset < nbytes) { - h32 += (bytes[offset] & 255) * prime5; + if (size % 4) { + while (offset < size) { + h32 += (std::to_integer(bytes[offset]) & 255) * prime5; h32 = rotl(h32, 11) * prime1; ++offset; } @@ -232,42 +247,51 @@ struct XXHash_64 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks4 = - reinterpret_cast(&key); ///< 4-byte word access - [[maybe_unused]] auto const blocks8 = - reinterpret_cast(&key); ///< 8-byte word access + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. 
+ * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { std::size_t offset = 0; std::uint64_t h64; // data can be processed in 32-byte chunks - if constexpr (nbytes >= 32) { - constexpr auto limit = nbytes - 32; - std::uint64_t v1 = seed_ + prime1 + prime2; - std::uint64_t v2 = seed_ + prime2; - std::uint64_t v3 = seed_; - std::uint64_t v4 = seed_ - prime1; + if (size >= 32) { + auto const limit = size - 32; + std::uint64_t v1 = seed_ + prime1 + prime2; + std::uint64_t v2 = seed_ + prime2; + std::uint64_t v3 = seed_; + std::uint64_t v4 = seed_ - prime1; do { // pipeline 4*8byte computations auto const pipeline_offset = offset / 8; - v1 += blocks8[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 31); v1 *= prime1; - v2 += blocks8[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 31); v2 *= prime1; - v3 += blocks8[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 31); v3 *= prime1; - v4 += blocks8[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 31); v4 *= prime1; offset += 32; @@ -302,12 +326,12 @@ struct XXHash_64 { h64 = seed_ + prime5; } - h64 += nbytes; + h64 += size; // remaining data can be processed in 8-byte chunks - if constexpr ((nbytes % 32) >= 8) { - for (; offset <= nbytes - 8; offset += 8) { - std::uint64_t k1 = blocks8[offset / 8] * prime2; + if ((size % 32) >= 8) { + for (; offset <= size - 8; offset += 8) { + std::uint64_t k1 = load_chunk(bytes, offset / 8) * prime2; k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; @@ -315,18 +339,18 @@ struct XXHash_64 { } // remaining data 
can be processed in 4-byte chunks - if constexpr (((nbytes % 32) % 8) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1; + if (((size % 32) % 8) >= 4) { + for (; offset <= size - 4; offset += 4) { + h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } // the following loop is only needed if the size of the key is not a multiple of a previous // block size - if constexpr (nbytes % 4) { - while (offset < nbytes) { - h64 ^= (bytes[offset] & 0xff) * prime5; + if (size % 4) { + while (offset < size) { + h64 ^= (std::to_integer(bytes[offset]) & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 5e518669e..3e8880860 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -21,8 +21,11 @@ #include +#include #include +#include + template struct large_key { constexpr __host__ __device__ large_key(int32_t value) noexcept @@ -170,4 +173,26 @@ TEST_CASE("Test cuco::xxhash_32", "") CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); } +} + +TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", + "", + ((typename Hash), Hash), + (cuco::murmurhash3_32), + (cuco::murmurhash3_32), + (cuco::xxhash_32), + (cuco::xxhash_32), + (cuco::xxhash_64), + (cuco::xxhash_64)) +{ + using key_type = typename Hash::argument_type; + + Hash hash; + key_type key = 42; + + SECTION("Identical keys with static and dynamic key size should have the same hash value.") + { + CHECK(hash(key) == + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); + } } \ No newline at end of file From a2833dbfb1b7d4915d530bd6adb45092a87a3b07 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 18 Jul 2023 08:42:30 -0500 Subject: [PATCH 130/152] Remove extra mod (#334) This PR removes an extra modulus operation. 
8 is a factor of 32, so `(x % 32) % 8 == x % 8`. Reproducer: https://godbolt.org/z/vcGsqaPd1 --- include/cuco/detail/hash_functions/xxhash.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index c686f3b82..a36f74bca 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -339,7 +339,7 @@ struct XXHash_64 { } // remaining data can be processed in 4-byte chunks - if (((size % 32) % 8) >= 4) { + if ((size % 8) >= 4) { for (; offset <= size - 4; offset += 4) { h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; @@ -378,4 +378,4 @@ struct XXHash_64 { std::uint64_t seed_; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::detail From ff1620114ab5fff5fa9d407e5e3322c49f38a2f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Sat, 29 Jul 2023 03:51:11 +0200 Subject: [PATCH 131/152] Fix memory alignment issues in hash computation (#338) When hashing large keys, e.g., strings, we traverse the input key iteratively in chunks of 4/8 bytes. The current implementation of the `load_chunk` function falsely assumes that the start of the key is always aligned to the chunk size, which is not always the case (see [discussion](https://github.com/rapidsai/cudf/pull/13612#discussion_r1267949464)). Additionally, this PR fixes some uncaught `[-Wmaybe-uninitialized]` warnings when compiling the unit tests. 
--- include/cuco/detail/hash_functions/utils.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh index a50779f23..37e279ba7 100644 --- a/include/cuco/detail/hash_functions/utils.cuh +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -21,8 +21,10 @@ namespace cuco::detail { template constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept { - auto const chunks = reinterpret_cast(data); - return chunks[index]; + auto const bytes = reinterpret_cast(data); + T chunk; + memcpy(&chunk, bytes + index * sizeof(T), sizeof(T)); + return chunk; } }; // namespace cuco::detail \ No newline at end of file From fd7263cc7b171693bbf5f06a9b177c44c0c0f188 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 31 Jul 2023 19:10:17 +0100 Subject: [PATCH 132/152] Add block synchronisation after reset of CG-level counters (#337) This is needed because if we go round loops again, we might read before things have been reset. - Tentatively closes #336. --- .../detail/static_multimap/device_view_impl.inl | 16 ++++++++++++++++ include/cuco/detail/static_multimap/kernels.cuh | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 9e328898d..98c08e720 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -1000,8 +1000,12 @@ class static_multimap::device_view_ if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) { flush_output_buffer( flushing_cg, *flushing_cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. 
+ flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1092,8 +1096,12 @@ class static_multimap::device_view_ // Flush if the next iteration won't fit into buffer if ((*cg_counter + g.size()) > buffer_size) { flush_output_buffer(g, *cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running @@ -1428,8 +1436,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1539,8 +1551,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. 
+ g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index ca5f898a5..67fb36045 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -387,6 +387,8 @@ __global__ void retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } + flushing_cg.sync(); + while (flushing_cg.any(idx < n)) { bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); @@ -416,6 +418,7 @@ __global__ void retrieve(InputIt first, idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { view.flush_output_buffer(flushing_cg, @@ -499,6 +502,8 @@ __global__ void pair_retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } + flushing_cg.sync(); + while (flushing_cg.any(idx < n)) { bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); @@ -532,6 +537,7 @@ __global__ void pair_retrieve(InputIt first, idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { view.flush_output_buffer(flushing_cg, From 3fe5704b26a877f9321f07128b5764463e536012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 2 Aug 2023 00:35:07 +0200 Subject: [PATCH 133/152] Enable fast_int functionality for dynamic extent (#315) Adds `fast_int` functionality to `cuco::extent`. 
Related to #284 --- include/cuco/detail/extent/extent.inl | 126 ++++++++++++++++ include/cuco/detail/open_addressing_impl.cuh | 6 +- .../cuco/detail/open_addressing_ref_impl.cuh | 24 ++-- include/cuco/detail/probing_scheme_impl.inl | 27 ++-- include/cuco/detail/storage/aow_storage.cuh | 20 ++- .../cuco/detail/storage/counter_storage.cuh | 6 +- include/cuco/detail/storage/storage_base.cuh | 18 ++- include/cuco/detail/utils.cuh | 20 +++ include/cuco/extent.cuh | 135 +++++++++++------- include/cuco/static_map.cuh | 2 +- include/cuco/static_set.cuh | 2 +- include/cuco/utility/fast_int.cuh | 7 + tests/CMakeLists.txt | 1 - tests/static_set/capacity_test.cu | 102 ------------- tests/utility/extent_test.cu | 8 +- 15 files changed, 309 insertions(+), 195 deletions(-) create mode 100644 include/cuco/detail/extent/extent.inl delete mode 100644 tests/static_set/capacity_test.cu diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl new file mode 100644 index 000000000..9d79bc907 --- /dev/null +++ b/include/cuco/detail/extent/extent.inl @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include // TODO move to detail/extent/ +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +struct window_extent { + using value_type = SizeType; ///< Extent value type + + static auto constexpr cg_size = CGSize; + static auto constexpr window_size = WindowSize; + + __host__ __device__ constexpr value_type value() const noexcept { return N; } + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); } + + private: + __host__ __device__ explicit constexpr window_extent() noexcept {} + __host__ __device__ explicit constexpr window_extent(SizeType) noexcept {} + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +struct window_extent + : cuco::utility::fast_int { + using value_type = + typename cuco::utility::fast_int::fast_int::value_type; ///< Extent value type + + static auto constexpr cg_size = CGSize; + static auto constexpr window_size = WindowSize; + + private: + using cuco::utility::fast_int::fast_int; + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + return make_window_extent(ext); +} + +template +[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +{ + return make_window_extent(size); +} + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = SDIV(ext, CGSize * WindowSize); + if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return window_extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return window_extent( + *cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +template +[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +{ + return static_cast(make_window_extent(extent{size})); +} + +namespace detail { + +template +struct is_window_extent : std::false_type { +}; + +template +struct is_window_extent> : std::true_type { +}; + +template +inline constexpr bool is_window_extent_v = is_window_extent::value; + +} // namespace detail + +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index 2c8688869..dbf169dca 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -88,7 +88,7 @@ class open_addressing_impl { using key_type = Key; ///< Key type using value_type = Value; ///< The storage value type, NOT payload type /// Extent type - using extent_type = decltype(make_valid_extent(std::declval())); + using extent_type = decltype(make_window_extent(std::declval())); using size_type = typename extent_type::value_type; ///< Size type using key_equal = KeyEqual; ///< Key equality comparator type using storage_type = @@ -103,7 +103,7 @@ class open_addressing_impl { * capacity, sentinel values and CUDA stream. 
* * @note The actual capacity depends on the given `capacity`, the probing scheme, CG size, and the - * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * window size and it is computed via the `make_window_extent` factory. Insert operations will not * automatically grow the container. Attempting to insert more unique keys than the capacity of * the container results in undefined behavior. * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert @@ -130,7 +130,7 @@ class open_addressing_impl { empty_slot_sentinel_{empty_slot_sentinel}, predicate_{pred}, probing_scheme_{probing_scheme}, - storage_{make_valid_extent(capacity), alloc} + storage_{make_window_extent(capacity), alloc} { this->clear_async(stream); } diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 0e9cfdbe4..99187cc51 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -62,6 +63,13 @@ class open_addressing_ref_impl { ProbingScheme>, "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + static_assert(is_window_extent_v, + "Extent is not a valid cuco::window_extent"); + static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size, + "Extent has incompatible CG size"); + static_assert(StorageRef::window_size == StorageRef::extent_type::window_size, + "Extent has incompatible window size"); + public: using key_type = Key; ///< Key type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme @@ -138,7 +146,7 @@ class open_addressing_ref_impl { Predicate const& predicate) noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + auto probing_iter = 
probing_scheme_(key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; @@ -180,7 +188,7 @@ class open_addressing_ref_impl { value_type const& value, Predicate const& predicate) noexcept { - auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; @@ -244,7 +252,7 @@ class open_addressing_ref_impl { Predicate const& predicate) noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; @@ -301,7 +309,7 @@ class open_addressing_ref_impl { value_type const& value, Predicate const& predicate) noexcept { - auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; @@ -375,7 +383,7 @@ class open_addressing_ref_impl { Predicate const& predicate) const noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); while (true) { // TODO atomic_ref::load if insert operator is present @@ -413,7 +421,7 @@ class open_addressing_ref_impl { ProbeKey const& key, Predicate const& predicate) const noexcept { - auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; @@ -455,7 +463,7 
@@ class open_addressing_ref_impl { Predicate const& predicate) const noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto probing_iter = probing_scheme_(key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); while (true) { // TODO atomic_ref::load if insert operator is present @@ -497,7 +505,7 @@ class open_addressing_ref_impl { ProbeKey const& key, Predicate const& predicate) const noexcept { - auto probing_iter = probing_scheme_(group, key, storage_ref_.num_windows()); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { auto const window_slots = storage_ref_[*probing_iter]; diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl index 4b617a133..3090d026e 100644 --- a/include/cuco/detail/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -16,6 +16,8 @@ #pragma once +#include + namespace cuco { namespace experimental { namespace detail { @@ -97,9 +99,10 @@ __host__ __device__ constexpr auto linear_probing::operator()( ProbeKey const& probe_key, Extent upper_bound) const noexcept { using size_type = typename Extent::value_type; - return detail::probing_iterator{static_cast(hash_(probe_key) % upper_bound), - 1, // step size is 1 - upper_bound}; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash_(probe_key)) % upper_bound, + 1, // step size is 1 + upper_bound}; } template @@ -111,7 +114,7 @@ __host__ __device__ constexpr auto linear_probing::operator()( { using size_type = typename Extent::value_type; return detail::probing_iterator{ - static_cast((hash_(probe_key) + g.thread_rank()) % upper_bound), + cuco::detail::sanitize_hash(hash_(probe_key) + g.thread_rank()) % upper_bound, cg_size, upper_bound}; } @@ -130,9 +133,10 @@ __host__ __device__ constexpr auto double_hashing::operato { using size_type = typename 
Extent::value_type; return detail::probing_iterator{ - static_cast(hash1_(probe_key) % upper_bound), - static_cast(hash2_(probe_key) % (upper_bound - 1) + - 1), // step size in range [1, prime - 1] + cuco::detail::sanitize_hash(hash1_(probe_key)) % upper_bound, + max(size_type{1}, + cuco::detail::sanitize_hash(hash2_(probe_key)) % + upper_bound), // step size in range [1, prime - 1] upper_bound}; } @@ -145,9 +149,12 @@ __host__ __device__ constexpr auto double_hashing::operato { using size_type = typename Extent::value_type; return detail::probing_iterator{ - static_cast((hash1_(probe_key) + g.thread_rank()) % upper_bound), - static_cast((hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size), - upper_bound}; + cuco::detail::sanitize_hash(hash1_(probe_key) + g.thread_rank()) % upper_bound, + static_cast((cuco::detail::sanitize_hash(hash2_(probe_key)) % + (upper_bound.value() / cg_size - 1) + + 1) * + cg_size), + upper_bound}; // TODO use fast_int operator } } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index ac86508de..1f01dad37 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -67,7 +67,7 @@ class aow_storage_base : public storage_base { * * @return The total number of slot windows */ - [[nodiscard]] __host__ __device__ constexpr extent_type num_windows() const noexcept + [[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept { return storage_base::capacity(); } @@ -77,9 +77,19 @@ class aow_storage_base : public storage_base { * * @return The total number of slots */ - [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept { - return storage_base::capacity().template multiply(); + return storage_base::capacity() * window_size; + } + + /** + * @brief Gets the window 
extent of the current storage. + * + * @return The window extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept + { + return storage_base::extent(); } }; @@ -278,7 +288,7 @@ class aow_storage : public aow_storage_base { * @brief Constructor of AoW storage. * * @note The input `size` should be exclusively determined by the return value of - * `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and + * `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and * the storage. * * @param size Number of windows to (de)allocate @@ -325,7 +335,7 @@ class aow_storage : public aow_storage_base { */ [[nodiscard]] constexpr ref_type ref() const noexcept { - return ref_type{this->num_windows(), this->data()}; + return ref_type{this->window_extent(), this->data()}; } /** diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index 12c963530..bb36b15e2 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -38,7 +38,7 @@ namespace detail { template class counter_storage : public storage_base> { public: - using storage_base>::capacity_; ///< Storage size + using storage_base>::capacity; ///< Storage size using size_type = SizeType; ///< Size type using value_type = cuda::atomic; ///< Type of the counter @@ -56,8 +56,8 @@ class counter_storage : public storage_base>{cuco::experimental::extent{}}, allocator_{allocator}, - counter_deleter_{capacity_, allocator_}, - counter_{allocator_.allocate(capacity_), counter_deleter_} + counter_deleter_{this->capacity(), allocator_}, + counter_{allocator_.allocate(this->capacity()), counter_deleter_} { } diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh index ada0726db..15ec30472 100644 --- a/include/cuco/detail/storage/storage_base.cuh +++ 
b/include/cuco/detail/storage/storage_base.cuh @@ -71,20 +71,30 @@ class storage_base { * * @param size Number of elements to (de)allocate */ - explicit constexpr storage_base(Extent size) : capacity_{size} {} + explicit constexpr storage_base(Extent size) : extent_{size} {} /** * @brief Gets the total number of elements in the current storage. * * @return The total number of elements */ - [[nodiscard]] __host__ __device__ constexpr extent_type capacity() const noexcept + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept { - return capacity_; + return static_cast(extent_); + } + + /** + * @brief Gets the extent of the current storage. + * + * @return The extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept + { + return extent_; } protected: - extent_type capacity_; ///< Total number of elements + extent_type extent_; ///< Total number of elements }; } // namespace detail diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index fdded70f5..793854694 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -20,6 +20,8 @@ #include #include +#include +#include namespace cuco { namespace detail { @@ -104,6 +106,24 @@ struct strong_type { T value; ///< Underlying value }; +/** + * @brief Converts a given hash value into a valid (positive) size type. 
+ * + * @tparam SizeType The target type + * @tparam HashType The input type + * + * @return Converted hash value + */ +template +__host__ __device__ constexpr SizeType sanitize_hash(HashType hash) noexcept +{ + if constexpr (cuda::std::is_signed_v) { + return cuda::std::abs(static_cast(hash)); + } else { + return static_cast(hash); + } +} + /** * @brief Gives value to use as alignment for a pair type that is at least the * size of the sum of the size of the first type and second type, or 16, diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh index b825188ed..a08f77a39 100644 --- a/include/cuco/extent.cuh +++ b/include/cuco/extent.cuh @@ -16,8 +16,6 @@ #pragma once -#include - #include #include @@ -46,19 +44,6 @@ struct extent { * @return Extent size */ __host__ __device__ constexpr operator value_type() const noexcept { return N; } - - /** - * @brief Multiplies the current extent with the given `Value`. - * - * @tparam Value The input value to multiply with - * - * @return Resulting static extent - */ - template - __host__ __device__ constexpr auto multiply() const noexcept - { - return extent{}; - } }; /** @@ -84,25 +69,67 @@ struct extent { */ __host__ __device__ constexpr operator value_type() const noexcept { return value_; } - /** - * @brief Multiplies the current extent with the given `Value`. - * - * @tparam Value The input value to multiply with - * - * @return Resulting extent - */ - template - __host__ __device__ constexpr auto multiply() const noexcept - { - return extent{Value * value_}; - } - private: value_type value_; ///< Extent value }; /** - * @brief Computes valid extent based on given parameters. + * @brief Window extent strong type. + * + * @note This type is used internally and can only be constructed using the `make_window_extent' + * factory method. 
+ * + * @tparam SizeType Size type + * @tparam N Extent + * + */ +template +struct window_extent; + +/** + * @brief Computes a valid window extent/capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type + * @tparam N Extent + * + * @param ext The input extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid `window extent` + */ +template +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes a valid capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent as `std::size_t` + */ +template +[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); + +/** + * @brief Computes valid window extent based on given parameters. 
* * @note The actual capacity of a container (map/set) should be exclusively determined by the return * value of this utility since the output depends on the requested low-bound size, the probing @@ -115,35 +142,37 @@ struct extent { * @tparam SizeType Size type * @tparam N Extent * + * @param ext The input extent + * * @throw If the input extent is invalid * * @return Resulting valid extent */ template -[[nodiscard]] auto constexpr make_valid_extent(extent ext) -{ - auto constexpr max_prime = cuco::detail::primes.back(); - auto constexpr max_value = - (static_cast(std::numeric_limits::max()) < max_prime) - ? std::numeric_limits::max() - : static_cast(max_prime); - auto const size = SDIV(ext, CGSize * WindowSize); - if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } - - if constexpr (N == dynamic_extent) { - return extent{static_cast( - *cuco::detail::lower_bound( - cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * - CGSize)}; - } - if constexpr (N != dynamic_extent) { - return extent(*cuco::detail::lower_bound(cuco::detail::primes.begin(), - cuco::detail::primes.end(), - static_cast(size)) * - CGSize)>{}; - } -} +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes valid window extent/capacity based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. 
+ * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent as `std::size_t` + */ +template +[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); } // namespace experimental } // namespace cuco + +#include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index a14325c84..38f3b92c9 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -158,7 +158,7 @@ class static_map { * and CUDA stream. * * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the - * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * window size and it is computed via the `make_window_extent` factory. Insert operations will not * automatically grow the map. Attempting to insert more unique keys than the capacity of the map * results in undefined behavior. * diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 9aa67a498..0cb558a65 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -134,7 +134,7 @@ class static_set { * and CUDA stream. * * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the - * window size and it's computed via `make_valid_extent` factory. Insert operations will not + * window size and it is computed via the `make_window_extent` factory. Insert operations will not * automatically grow the set. Attempting to insert more unique keys than the capacity of the map * results in undefined behavior. 
* diff --git a/include/cuco/utility/fast_int.cuh b/include/cuco/utility/fast_int.cuh index 5cd2998f6..6616e2c5c 100644 --- a/include/cuco/utility/fast_int.cuh +++ b/include/cuco/utility/fast_int.cuh @@ -51,6 +51,13 @@ struct fast_int { evaluate_magic_numbers(); } + /** + * @brief Get the underlying integer value. + * + * @return Underlying value + */ + __host__ __device__ constexpr value_type value() const noexcept { return value_; } + /** * @brief Explicit conversion operator to the underlying value type. * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ebc37e39b..a72ff52d5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -58,7 +58,6 @@ ConfigureTest(UTILITY_TEST ################################################################################################### # - static_set tests ------------------------------------------------------------------------------ ConfigureTest(STATIC_SET_TEST - static_set/capacity_test.cu static_set/heterogeneous_lookup_test.cu static_set/insert_and_find_test.cu static_set/large_input_test.cu diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu deleted file mode 100644 index e144325d5..000000000 --- a/tests/static_set/capacity_test.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -TEST_CASE("Static set capacity", "") -{ - constexpr std::size_t num_keys{400}; - using Key = int32_t; - using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; - using Equal = thrust::equal_to; - using AllocatorT = cuco::cuda_allocator; - using StorageT = cuco::experimental::aow_storage<2>; - - SECTION("Static extent must be evaluated at compile time.") - { - auto constexpr gold_capacity = 422; // 211 x 2 - - using extent_type = cuco::experimental::extent; - cuco::experimental:: - static_set - set{extent_type{}, cuco::empty_key{-1}}; - auto const capacity = set.capacity(); - STATIC_REQUIRE(capacity == gold_capacity); - - auto ref = set.ref(cuco::experimental::insert); - auto const ref_capacity = ref.capacity(); - STATIC_REQUIRE(ref_capacity == gold_capacity); - } - - SECTION("Dynamic extent is evaluated at run time.") - { - auto constexpr gold_capacity = 422; // 211 x 2 - - using extent_type = cuco::experimental::extent; - cuco::experimental:: - static_set - set{num_keys, cuco::empty_key{-1}}; - auto const capacity = set.capacity(); - REQUIRE(capacity == gold_capacity); - - auto ref = set.ref(cuco::experimental::insert); - auto const ref_capacity = ref.capacity(); - REQUIRE(ref_capacity == gold_capacity); - } - - SECTION("Static extent must be evaluated at compile time.") - { - auto constexpr gold_capacity = 412; // 103 x 2 x 2 - - using extent_type = cuco::experimental::extent; - using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; - auto set = cuco::experimental:: - static_set{ - extent_type{}, cuco::empty_key{-1}}; - - REQUIRE(set.capacity() == gold_capacity); - - auto const capacity = set.capacity(); - STATIC_REQUIRE(capacity == gold_capacity); - - auto ref = set.ref(cuco::experimental::insert); - auto const ref_capacity = ref.capacity(); - STATIC_REQUIRE(ref_capacity == gold_capacity); - } - - SECTION("Dynamic extent is evaluated at run time.") - { - auto constexpr 
gold_capacity = 412; // 103 x 2 x 2 - - using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; - auto set = cuco::experimental::static_set, - cuda::thread_scope_device, - Equal, - probe, - AllocatorT, - StorageT>{num_keys, cuco::empty_key{-1}}; - - auto const capacity = set.capacity(); - REQUIRE(capacity == gold_capacity); - - auto ref = set.ref(cuco::experimental::insert); - auto const ref_capacity = ref.capacity(); - REQUIRE(ref_capacity == gold_capacity); - } -} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu index 2623a8ae5..d44e20368 100644 --- a/tests/utility/extent_test.cu +++ b/tests/utility/extent_test.cu @@ -43,14 +43,14 @@ TEMPLATE_TEST_CASE_SIG( SECTION("Compute static valid extent at compile time.") { auto constexpr size = cuco::experimental::extent{}; - auto constexpr res = cuco::experimental::make_valid_extent(size); - STATIC_REQUIRE(gold_reference == res); + auto constexpr res = cuco::experimental::make_window_extent(size); + STATIC_REQUIRE(gold_reference == res.value()); } SECTION("Compute dynamic valid extent at run time.") { auto const size = cuco::experimental::extent{num}; - auto const res = cuco::experimental::make_valid_extent(size); - REQUIRE(gold_reference == res); + auto const res = cuco::experimental::make_window_extent(size); + REQUIRE(gold_reference == res.value()); } } From 3d577bf802f952951decc42aeaa44af15aab8e43 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 2 Aug 2023 14:01:13 -0500 Subject: [PATCH 134/152] Fix doxygen check with unsupported version. (#344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While running `pre-commit`, I got this error: ```bash ./ci/checks/doxygen.sh: line 19: : command not found ./ci/checks/doxygen.sh: line 20: : command not found ./ci/checks/doxygen.sh: line 21: : command not found ``` I have doxygen 1.9.7 installed, which isn't a supported version in this script. 
The Doxygen check is supposed to pass silently when run locally with an unsupported version (it runs properly in CI, instead). It seems like there's an unrecognized symbol in the bash script from PR #177: https://github.com/NVIDIA/cuCollections/commit/05fb1dbe9fed4f446cecb0b024232ede1dea6926 This PR removes that symbol, fixing the error. --- ci/checks/doxygen.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh index b9a243cd1..515558c4a 100755 --- a/ci/checks/doxygen.sh +++ b/ci/checks/doxygen.sh @@ -16,9 +16,9 @@ function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4 # Doxygen supported version 1.8.20 to 1.9.1 DOXYGEN_VERSION=$(doxygen --version) if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then - echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" - echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" - exit 0 + echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" + echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" + exit 0 fi # Run doxygen, ignore missing tag files error From e29b25b7d0e2caebce40444d727b60b410864d74 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 2 Aug 2023 15:54:55 -0500 Subject: [PATCH 135/152] Use rapids-cmake 23.10. (#342) This PR updates cuCollections to rapids-cmake 23.10. This helps prepare for CCCL 2.1.0 updates (https://github.com/rapidsai/rapids-cmake/pull/399). 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12c5228e4..f3ca85a8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) From 6548a1ed0e59387f4411ba08e0e869811bc38690 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 3 Aug 2023 14:23:40 -0700 Subject: [PATCH 136/152] Fix bitwise compare alignment issues (#341) This PR forces data alignment in the `bitwise_compare` implementation. Closes #340 --- include/cuco/detail/bitwise_compare.cuh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 979dab829..2ed7ad072 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -75,8 +75,11 @@ __host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs) cuco::is_bitwise_comparable_v, "Bitwise compared objects must have unique object representations or be explicitly declared as " "safe for bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - return detail::bitwise_compare_impl::compare(reinterpret_cast(&lhs), - reinterpret_cast(&rhs)); + + alignas(sizeof(T)) T __lhs{lhs}; + alignas(sizeof(T)) T __rhs{rhs}; + return detail::bitwise_compare_impl::compare(reinterpret_cast(&__lhs), + reinterpret_cast(&__rhs)); } } // namespace detail From 5186b39522e13a3681c0eb591db4eaacbf969485 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 6 Aug 2023 15:56:12 -0700 Subject: [PATCH 137/152] Improve 
implememtation details in experimental data structures (#345) This PR fixes issues and adds new features requested by https://github.com/rapidsai/cudf/pull/13807. It: - removes the requirement of the second hasher from double hashing must be constructible from an integer - fixes an issue in map iterator `!=` operator - overloads map iterator access operator - allows zero capacity container - adds `capacity_test` back since several corner cases need to be exercised --- include/cuco/detail/extent/extent.inl | 7 +- include/cuco/detail/storage/aow_storage.cuh | 9 +- include/cuco/extent.cuh | 2 +- tests/CMakeLists.txt | 1 + tests/static_set/capacity_test.cu | 99 +++++++++++++++++++++ 5 files changed, 113 insertions(+), 5 deletions(-) create mode 100644 tests/static_set/capacity_test.cu diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl index 9d79bc907..c89f7325b 100644 --- a/include/cuco/detail/extent/extent.inl +++ b/include/cuco/detail/extent/extent.inl @@ -80,8 +80,9 @@ template (static_cast(std::numeric_limits::max()) < max_prime) ? 
std::numeric_limits::max() : static_cast(max_prime); - auto const size = SDIV(ext, CGSize * WindowSize); - if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } + auto const size = + SDIV(std::max(static_cast(ext), static_cast(1)), CGSize * WindowSize); + if (size > max_value) { CUCO_FAIL("Invalid input extent"); } if constexpr (N == dynamic_extent) { return window_extent{static_cast( @@ -123,4 +124,4 @@ inline constexpr bool is_window_extent_v = is_window_extent::value; } // namespace detail } // namespace experimental -} // namespace cuco \ No newline at end of file +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh index 1f01dad37..e817e3293 100644 --- a/include/cuco/detail/storage/aow_storage.cuh +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -174,6 +174,13 @@ class aow_storage_ref : public aow_storage_base { */ __device__ constexpr reference operator*() const { return *current_; } + /** + * @brief Access operator + * + * @return Pointer to the current slot + */ + __device__ constexpr value_type* operator->() const { return current_; } + /** * Equality operator * @@ -191,7 +198,7 @@ class aow_storage_ref : public aow_storage_base { */ friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept { - return not lhs == rhs; + return not(lhs == rhs); } private: diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh index a08f77a39..e45068d9e 100644 --- a/include/cuco/extent.cuh +++ b/include/cuco/extent.cuh @@ -36,7 +36,7 @@ struct extent { constexpr extent() = default; /// Constructs from `SizeType` - __host__ __device__ constexpr explicit extent(SizeType) noexcept {} + __host__ __device__ constexpr extent(SizeType) noexcept {} /** * @brief Conversion to value_type. 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a72ff52d5..ebc37e39b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -58,6 +58,7 @@ ConfigureTest(UTILITY_TEST ################################################################################################### # - static_set tests ------------------------------------------------------------------------------ ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu static_set/heterogeneous_lookup_test.cu static_set/insert_and_find_test.cu static_set/large_input_test.cu diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..3b7681e0a --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + using Key = int32_t; + using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::aow_storage<2>; + + SECTION("zero capacity is allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("negative capacity (ikr -_-||) is also allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{-10}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + constexpr std::size_t num_keys{400}; + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + 
AllocatorT, + StorageT>{num_keys, cuco::empty_key{-1}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} From cf958d0607d42c679687959393b1c13aa2938dcb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 10 Aug 2023 10:57:08 -0700 Subject: [PATCH 138/152] Expose `aow_storage` to the public (#349) Closes #348 This PR exposes `cuco::experimental::window` type and `aow_storage` class to the public. --- include/cuco/aow_storage.cuh | 218 ++++++++++ include/cuco/detail/storage/aow_storage.cuh | 372 ------------------ include/cuco/detail/storage/aow_storage.inl | 197 ++++++++++ .../cuco/detail/storage/aow_storage_base.cuh | 106 +++++ include/cuco/detail/storage/storage.cuh | 2 +- include/cuco/detail/storage/storage_base.cuh | 2 +- include/cuco/static_map.cuh | 2 +- include/cuco/static_set.cuh | 2 +- include/cuco/storage.cuh | 22 +- tests/static_map/unique_sequence_test.cu | 2 +- tests/static_set/capacity_test.cu | 2 +- tests/static_set/insert_and_find_test.cu | 2 +- tests/static_set/retrieve_all_test.cu | 2 +- tests/static_set/unique_sequence_test.cu | 2 +- tests/utility/storage_test.cu | 22 +- 15 files changed, 553 insertions(+), 402 deletions(-) create mode 100644 include/cuco/aow_storage.cuh delete mode 100644 include/cuco/detail/storage/aow_storage.cuh create mode 100644 include/cuco/detail/storage/aow_storage.inl create mode 100644 include/cuco/detail/storage/aow_storage_base.cuh diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh new file mode 100644 index 000000000..fdd970cf4 --- /dev/null +++ b/include/cuco/aow_storage.cuh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +/// Window type alias +template +using window = detail::window; + +/// forward declaration +template +class aow_storage_ref; + +/** + * @brief Array of Window open addressing storage class. + * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template +class aow_storage : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /// Type of the allocator to (de)allocate windows + using allocator_type = typename std::allocator_traits::rebind_alloc; + using window_deleter_type = + detail::custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. 
+ * + * @note The input `size` should be exclusively determined by the return value of + * `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept; + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. + * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept; + + /** + * @brief Gets the storage allocator. + * + * @return The storage allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept; + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cuda_stream_ref stream) noexcept; + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ * + * @tparam T Storage element type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting storage capacity + */ +template +class aow_storage_ref : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /** + * @brief Constructor of AoS storage ref. + * + * @param size Number of windows + * @param windows Pointer to the windows array + */ + __host__ __device__ explicit constexpr aow_storage_ref(Extent size, + window_type* windows) noexcept; + + /** + * @brief Custom un-incrementable input iterator for the convenience of `find` operations. + * + * @note This iterator is for read only and NOT incrementable. + */ + struct iterator; + using const_iterator = iterator const; ///< Const forward iterator type + + /** + * @brief Returns an iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr iterator end() noexcept; + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr const_iterator end() const noexcept; + + /** + * @brief Gets windows array. 
+ * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() noexcept; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() const noexcept; + + /** + * @brief Returns an array of slots (or a window) for a given index. + * + * @param index Index of the window + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept; + + private: + window_type* windows_; ///< Pointer to the windows array +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh deleted file mode 100644 index e817e3293..000000000 --- a/include/cuco/detail/storage/aow_storage.cuh +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -namespace cuco { -namespace experimental { -namespace detail { -/** - * @brief Base class of array of slot windows open addressing storage. - * - * This should NOT be used directly. 
- * - * @tparam WindowSize Number of elements in each window - * @tparam T Element type - * @tparam Extent Type of extent denoting the number of windows - */ -template -class aow_storage_base : public storage_base { - public: - /** - * @brief The number of elements (slots) processed per window. - */ - static constexpr int32_t window_size = WindowSize; - - using extent_type = typename storage_base::extent_type; ///< Storage extent type - using size_type = typename storage_base::size_type; ///< Storage size type - - using value_type = T; ///< Slot type - using window_type = cuda::std::array; ///< Slot window type - - /** - * @brief Constructor of AoW base storage. - * - * @param size Number of windows to store - */ - explicit constexpr aow_storage_base(Extent size) : storage_base{size} {} - - /** - * @brief Gets the total number of slot windows in the current storage. - * - * @return The total number of slot windows - */ - [[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept - { - return storage_base::capacity(); - } - - /** - * @brief Gets the total number of slots in the current storage. - * - * @return The total number of slots - */ - [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept - { - return storage_base::capacity() * window_size; - } - - /** - * @brief Gets the window extent of the current storage. - * - * @return The window extent. - */ - [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept - { - return storage_base::extent(); - } -}; - -/** - * @brief Non-owning AoW storage reference type. 
- * - * @tparam WindowSize Number of slots in each window - * @tparam T Storage element type - * @tparam Extent Type of extent denoting storage capacity - */ -template -class aow_storage_ref : public aow_storage_base { - public: - using base_type = aow_storage_base; ///< AoW base class type - - using base_type::window_size; ///< Number of elements processed per window - - using extent_type = typename base_type::extent_type; ///< Storage extent type - using size_type = typename base_type::size_type; ///< Storage size type - using value_type = typename base_type::value_type; ///< Slot type - using window_type = typename base_type::window_type; ///< Slot window type - - using base_type::capacity; - using base_type::num_windows; - - /** - * @brief Constructor of AoS storage ref. - * - * @param windows Pointer to the windows array - * @param num_windows Number of windows - */ - explicit constexpr aow_storage_ref(Extent num_windows, window_type* windows) noexcept - : aow_storage_base{num_windows}, windows_{windows} - { - } - - /** - * @brief Custom un-incrementable input iterator for the convenience of `find` operations. - * - * @note This iterator is for read only and NOT incrementable. - */ - struct iterator { - public: - using iterator_category = std::input_iterator_tag; ///< iterator category - using reference = value_type&; ///< iterator reference type - - /** - * @brief Constructs a device side input iterator of the given slot. - * - * @param current The slot pointer - */ - __device__ constexpr explicit iterator(value_type* current) noexcept : current_{current} {} - - /** - * @brief Prefix increment operator - * - * @throw This code path should never be chosen. - * - * @return Current iterator - */ - __device__ constexpr iterator& operator++() noexcept - { - static_assert("Un-incrementable input iterator"); - } - - /** - * @brief Postfix increment operator - * - * @throw This code path should never be chosen. 
- * - * @return Current iterator - */ - __device__ constexpr iterator operator++(int32_t) noexcept - { - static_assert("Un-incrementable input iterator"); - } - - /** - * @brief Dereference operator - * - * @return Reference to the current slot - */ - __device__ constexpr reference operator*() const { return *current_; } - - /** - * @brief Access operator - * - * @return Pointer to the current slot - */ - __device__ constexpr value_type* operator->() const { return current_; } - - /** - * Equality operator - * - * @return True if two iterators are identical - */ - friend __device__ constexpr bool operator==(iterator const& lhs, iterator const& rhs) noexcept - { - return lhs.current_ == rhs.current_; - } - - /** - * Inequality operator - * - * @return True if two iterators are not identical - */ - friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept - { - return not(lhs == rhs); - } - - private: - value_type* current_{}; ///< Pointer to the current slot - }; - using const_iterator = iterator const; ///< Const forward iterator type - - /** - * @brief Returns an iterator to one past the last slot. - * - * This is provided for convenience for those familiar with checking - * an iterator returned from `find()` against the `end()` iterator. - * - * @return An iterator to one past the last slot - */ - [[nodiscard]] __device__ constexpr iterator end() noexcept - { - return iterator{reinterpret_cast(this->data() + this->capacity())}; - } - - /** - * @brief Returns a const_iterator to one past the last slot. - * - * This is provided for convenience for those familiar with checking - * an iterator returned from `find()` against the `end()` iterator. - * - * @return A const_iterator to one past the last slot - */ - [[nodiscard]] __device__ constexpr const_iterator end() const noexcept - { - return const_iterator{reinterpret_cast(this->data() + this->capacity())}; - } - - /** - * @brief Gets windows array. 
- * - * @return Pointer to the first window - */ - [[nodiscard]] __device__ constexpr window_type* data() noexcept { return windows_; } - - /** - * @brief Gets windows array. - * - * @return Pointer to the first window - */ - [[nodiscard]] __device__ constexpr window_type* data() const noexcept { return windows_; } - - /** - * @brief Returns an array of slots (or a window) for a given index. - * - * @param index Index of the window - * @return An array of slots - */ - [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept - { - return *reinterpret_cast( - __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); - } - - private: - window_type* windows_; ///< Pointer to the windows array -}; - -/** - * @brief Array of slot Window open addressing storage class. - * - * @tparam WindowSize Number of slots in each window - * @tparam T Slot type - * @tparam Extent Type of extent denoting number of windows - * @tparam Allocator Type of allocator used for device storage (de)allocation - */ -template -class aow_storage : public aow_storage_base { - public: - using base_type = aow_storage_base; ///< AoW base class type - - using base_type::window_size; ///< Number of elements processed per window - - using extent_type = typename base_type::extent_type; ///< Storage extent type - using size_type = typename base_type::size_type; ///< Storage size type - using value_type = typename base_type::value_type; ///< Slot type - using window_type = typename base_type::window_type; ///< Slot window type - - using base_type::capacity; - using base_type::num_windows; - - /// Type of the allocator to (de)allocate windows - using allocator_type = typename std::allocator_traits::rebind_alloc; - using window_deleter_type = - custom_deleter; ///< Type of window deleter - using ref_type = aow_storage_ref; ///< Storage ref type - - /** - * @brief Constructor of AoW storage. 
- * - * @note The input `size` should be exclusively determined by the return value of - * `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and - * the storage. - * - * @param size Number of windows to (de)allocate - * @param allocator Allocator used for (de)allocating device storage - */ - explicit constexpr aow_storage(Extent size, Allocator const& allocator) - : aow_storage_base{size}, - allocator_{allocator}, - window_deleter_{capacity(), allocator_}, - windows_{allocator_.allocate(capacity()), window_deleter_} - { - } - - aow_storage(aow_storage&&) = default; ///< Move constructor - /** - * @brief Replaces the contents of the storage with another storage. - * - * @return Reference of the current storage object - */ - aow_storage& operator=(aow_storage&&) = default; - ~aow_storage() = default; ///< Destructor - - aow_storage(aow_storage const&) = delete; - aow_storage& operator=(aow_storage const&) = delete; - - /** - * @brief Gets windows array. - * - * @return Pointer to the first window - */ - [[nodiscard]] constexpr window_type* data() const noexcept { return windows_.get(); } - - /** - * @brief Gets the storage allocator. - * - * @return The storage allocator - */ - [[nodiscard]] constexpr allocator_type allocator() const noexcept { return allocator_; } - - /** - * @brief Gets window storage reference. - * - * @return Reference of window storage - */ - [[nodiscard]] constexpr ref_type ref() const noexcept - { - return ref_type{this->window_extent(), this->data()}; - } - - /** - * @brief Initializes each slot in the AoW storage to contain `key`. 
- * - * @param key Key to which all keys in `slots` are initialized - * @param stream Stream used for executing the kernel - */ - void initialize(value_type key, cuda_stream_ref stream) noexcept - { - auto constexpr stride = 4; - auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); - - detail::initialize<<>>( - this->data(), this->num_windows(), key); - } - - private: - allocator_type allocator_; ///< Allocator used to (de)allocate windows - window_deleter_type window_deleter_; ///< Custom windows deleter - std::unique_ptr windows_; ///< Pointer to AoW storage -}; - -} // namespace detail -} // namespace experimental -} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl new file mode 100644 index 000000000..b4052b2a0 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.inl @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +constexpr aow_storage::aow_storage( + Extent size, Allocator const& allocator) noexcept + : detail::aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} +{ +} + +template +constexpr aow_storage::window_type* +aow_storage::data() const noexcept +{ + return windows_.get(); +} + +template +constexpr aow_storage::allocator_type +aow_storage::allocator() const noexcept +{ + return allocator_; +} + +template +constexpr aow_storage::ref_type +aow_storage::ref() const noexcept +{ + return ref_type{this->window_extent(), this->data()}; +} + +template +void aow_storage::initialize(value_type key, + cuda_stream_ref stream) noexcept +{ + auto constexpr stride = 4; + auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); +} + +template +__host__ __device__ constexpr aow_storage_ref::aow_storage_ref( + Extent size, window_type* windows) noexcept + : detail::aow_storage_base{size}, windows_{windows} +{ +} + +template +struct aow_storage_ref::iterator { + public: + using iterator_category = std::input_iterator_tag; ///< iterator category + using reference = value_type&; ///< iterator reference type + + /** + * @brief Constructs a device side input iterator of the given slot. + * + * @param current The slot pointer + */ + __device__ constexpr explicit iterator(value_type* current) noexcept : current_{current} {} + + /** + * @brief Prefix increment operator + * + * @throw This code path should never be chosen. 
+ * + * @return Current iterator + */ + __device__ constexpr iterator& operator++() noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Postfix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator operator++(int32_t) noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Dereference operator + * + * @return Reference to the current slot + */ + __device__ constexpr reference operator*() const { return *current_; } + + /** + * @brief Access operator + * + * @return Pointer to the current slot + */ + __device__ constexpr value_type* operator->() const { return current_; } + + /** + * Equality operator + * + * @return True if two iterators are identical + */ + friend __device__ constexpr bool operator==(iterator const& lhs, iterator const& rhs) noexcept + { + return lhs.current_ == rhs.current_; + } + + /** + * Inequality operator + * + * @return True if two iterators are not identical + */ + friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept + { + return not(lhs == rhs); + } + + private: + value_type* current_{}; ///< Pointer to the current slot +}; + +template +__device__ constexpr aow_storage_ref::iterator +aow_storage_ref::end() noexcept +{ + return iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::const_iterator +aow_storage_ref::end() const noexcept +{ + return const_iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() noexcept +{ + return windows_; +} + +template +__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() const noexcept +{ + return windows_; +} + +template +__device__ constexpr aow_storage_ref::window_type +aow_storage_ref::operator[](size_type index) const noexcept 
+{ + return *reinterpret_cast( + __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage_base.cuh b/include/cuco/detail/storage/aow_storage_base.cuh new file mode 100644 index 000000000..5f3d84df4 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage_base.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Window data structure type + * + * @tparam T Window slot type + * @tparam WindowSize Number of elements per window + */ +template +struct window : public cuda::std::array { + public: + static int32_t constexpr window_size = WindowSize; ///< Number of slots per window +}; + +/** + * @brief Base class of array of slot windows open addressing storage. + * + * @note This should NOT be used directly. + * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. 
+ */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = window; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + __host__ __device__ explicit constexpr aow_storage_base(Extent size) : storage_base{size} + { + } + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept + { + return storage_base::capacity() * window_size; + } + + /** + * @brief Gets the window extent of the current storage. + * + * @return The window extent. 
+ */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept + { + return storage_base::extent(); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh index 108aa7f84..b9a00baa2 100644 --- a/include/cuco/detail/storage/storage.cuh +++ b/include/cuco/detail/storage/storage.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace cuco { namespace experimental { diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh index 15ec30472..98eed6c13 100644 --- a/include/cuco/detail/storage/storage_base.cuh +++ b/include/cuco/detail/storage/storage_base.cuh @@ -71,7 +71,7 @@ class storage_base { * * @param size Number of elements to (de)allocate */ - explicit constexpr storage_base(Extent size) : extent_{size} {} + __host__ __device__ explicit constexpr storage_base(Extent size) : extent_{size} {} /** * @brief Gets the total number of elements in the current storage. 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 38f3b92c9..2df5b2a10 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -94,7 +94,7 @@ template >, class Allocator = cuco::cuda_allocator>, - class Storage = cuco::experimental::aow_storage<1>> + class Storage = cuco::experimental::storage<1>> class static_map { static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 0cb558a65..a7eee42ac 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -87,7 +87,7 @@ template >, class Allocator = cuco::cuda_allocator, - class Storage = cuco::experimental::aow_storage<1>> + class Storage = cuco::experimental::storage<1>> class static_set { using impl_type = detail:: open_addressing_impl; diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh index 969b49f37..e34e59c96 100644 --- a/include/cuco/storage.cuh +++ b/include/cuco/storage.cuh @@ -21,26 +21,28 @@ namespace cuco { namespace experimental { /** - * @brief Public Array of slot Windows storage class. + * @brief Public storage class. * - * The window size defines the workload granularity for each CUDA thread, i.e., how many slots a - * thread would concurrently operate on when performing modify or lookup operations. cuCollections - * uses the AoW storage to supersede the raw flat slot storage due to its superior granularity - * control: When window size equals one, AoW performs the same as the flat storage. If the - * underlying operation is more memory bandwidth bound, e.g., high occupancy multimap operations, a - * larger window size can reduce the length of probing sequences thus improve runtime performance. + * @note This is a public interface used to control storage window size. A window consists of a + * number of contiguous slots. 
The window size defines the workload granularity for each CUDA + * thread, i.e., how many slots a thread would concurrently operate on when performing modify or + * lookup operations. cuCollections uses the AoW storage to supersede the raw flat slot storage due + * to its superior granularity control: When window size equals one, AoW performs the same as the + * flat storage. If the underlying operation is more memory bandwidth bound, e.g., high occupancy + * multimap operations, a larger window size can reduce the length of probing sequences thus improve + * runtime performance. * * @tparam WindowSize Number of elements per window storage */ template -class aow_storage { +class storage { public: - /// Number of elements per window storage + /// Number of slots per window storage static constexpr int32_t window_size = WindowSize; /// Type of implementation details template - using impl = detail::aow_storage; + using impl = aow_storage; }; } // namespace experimental diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index d0581afdb..6a0165cc2 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -295,7 +295,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::equal_to, probe, cuco::cuda_allocator, - cuco::experimental::aow_storage<2>>{ + cuco::experimental::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; REQUIRE(map.capacity() == gold_capacity); diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu index 3b7681e0a..4c66a7ccc 100644 --- a/tests/static_set/capacity_test.cu +++ b/tests/static_set/capacity_test.cu @@ -24,7 +24,7 @@ TEST_CASE("Static set capacity", "") using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; using Equal = thrust::equal_to; using AllocatorT = cuco::cuda_allocator; - using StorageT = cuco::experimental::aow_storage<2>; + using StorageT = cuco::experimental::storage<2>; SECTION("zero 
capacity is allowed.") { diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu index 9d0cc057a..278510e08 100644 --- a/tests/static_set/insert_and_find_test.cu +++ b/tests/static_set/insert_and_find_test.cu @@ -104,7 +104,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::equal_to, probe, cuco::cuda_allocator, - cuco::experimental::aow_storage<2>>{ + cuco::experimental::storage<2>>{ num_keys, cuco::empty_key{-1}}; test_insert_and_find(set, num_keys); } diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu index 97a489455..616e35138 100644 --- a/tests/static_set/retrieve_all_test.cu +++ b/tests/static_set/retrieve_all_test.cu @@ -86,7 +86,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::equal_to, probe, cuco::cuda_allocator, - cuco::experimental::aow_storage<1>>{ + cuco::experimental::storage<1>>{ num_keys, cuco::empty_key{-1}}; REQUIRE(set.capacity() == gold_capacity); diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 4c037463a..53ede7524 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -143,7 +143,7 @@ TEMPLATE_TEST_CASE_SIG( thrust::equal_to, probe, cuco::cuda_allocator, - cuco::experimental::aow_storage<2>>{ + cuco::experimental::storage<2>>{ num_keys, cuco::empty_key{-1}}; REQUIRE(set.capacity() == gold_capacity); diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index afb9848d3..b776f628c 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include #include @@ -39,11 +39,11 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", SECTION("Allocate array of pairs with AoS storage.") { - auto s = cuco::experimental::detail::aow_storage, - cuco::experimental::extent, - allocator_type>( - cuco::experimental::extent{size}, allocator); + auto s = + cuco::experimental::aow_storage, + window_size, + 
cuco::experimental::extent, + allocator_type>(cuco::experimental::extent{size}, allocator); auto const num_windows = s.num_windows(); auto const capacity = s.capacity(); @@ -54,8 +54,8 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", SECTION("Allocate array of pairs with AoS storage with static extent.") { using extent_type = cuco::experimental::extent; - auto s = cuco::experimental::detail:: - aow_storage, extent_type, allocator_type>(extent_type{}, + auto s = cuco::experimental:: + aow_storage, window_size, extent_type, allocator_type>(extent_type{}, allocator); auto const num_windows = s.num_windows(); auto const capacity = s.capacity(); @@ -66,8 +66,8 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", SECTION("Allocate array of keys with AoS storage.") { - auto s = cuco::experimental::detail:: - aow_storage, allocator_type>( + auto s = cuco::experimental:: + aow_storage, allocator_type>( cuco::experimental::extent{size}, allocator); auto const num_windows = s.num_windows(); auto const capacity = s.capacity(); @@ -79,7 +79,7 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", SECTION("Allocate array of keys with AoS storage with static extent.") { using extent_type = cuco::experimental::extent; - auto s = cuco::experimental::detail::aow_storage( + auto s = cuco::experimental::aow_storage( extent_type{}, allocator); auto const num_windows = s.num_windows(); auto const capacity = s.capacity(); From c63d490017de9efc0ee9dfa61133ec8274ce21d4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 15 Aug 2023 14:57:33 -0700 Subject: [PATCH 139/152] Fix alignment issue for data type larger than 16B (#351) Closes #347 This fixes a bug where data is misaligned if it's larger than 16 bytes. The problem is not unveiled by CI since we are building for the full CUDA arch matrix thus custom types larger than 16 bytes are not exercised. It also includes minor cleanups by adding blank lines between different functions. 
--- include/cuco/detail/bitwise_compare.cuh | 16 ++++++++++++++-- include/cuco/detail/utils.cuh | 4 ++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 2ed7ad072..a8a5a69d1 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -60,6 +62,16 @@ struct bitwise_compare_impl<8> { } }; +/** + * @brief Gives value to use as alignment for a type that is at least the + * size of type, or 16, whichever is smaller. + */ +template +constexpr std::size_t alignment() +{ + return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T))); +} + /** * @brief Performs a bitwise equality comparison between the two specified objects * @@ -76,8 +88,8 @@ __host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs) "Bitwise compared objects must have unique object representations or be explicitly declared as " "safe for bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - alignas(sizeof(T)) T __lhs{lhs}; - alignas(sizeof(T)) T __rhs{rhs}; + alignas(detail::alignment()) T __lhs{lhs}; + alignas(detail::alignment()) T __rhs{rhs}; return detail::bitwise_compare_impl::compare(reinterpret_cast(&__lhs), reinterpret_cast(&__rhs)); } diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 793854694..22675d496 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -144,6 +144,7 @@ template struct packed { using type = void; ///< `void` type by default }; + /** * @brief Denotes the packed type when the size of the object is 8. */ @@ -151,6 +152,7 @@ template <> struct packed { using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 }; + /** * @brief Denotes the packed type when the size of the object is 4. 
*/ @@ -158,6 +160,7 @@ template <> struct packed { using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 }; + template using packed_t = typename packed::type; @@ -182,6 +185,7 @@ constexpr bool is_packable() { return not std::is_void>::value and std::has_unique_object_representations_v; } + /** * @brief Allows viewing a pair in a packed representation. * From d101b4c427773c19f0c4f5c552787c5fabf29cca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 22 Aug 2023 02:08:05 +0200 Subject: [PATCH 140/152] Migrate CI to Github Actions (#312) This PR migrates CI workflows from the legacy gpuCI to Github Actions. Most of the scripts are 1:1 copies from https://github.com/NVIDIA/cccl with some minor adjustments here and there. --- .../cuda12.2-gcc12/devcontainer.json | 39 ++++++ .devcontainer/devcontainer.json | 37 ++++++ .devcontainer/launch.sh | 58 +++++++++ .devcontainer/make_devcontainers.sh | 60 +++++++++ .github/actions/compute-matrix/action.yml | 39 ++++++ .../actions/compute-matrix/compute-matrix.sh | 37 ++++++ .../actions/configure_cccl_sccache/action.yml | 34 +++++ .github/ops-bot.yaml | 15 +++ .github/workflows/add_to_project.yml | 29 ----- .github/workflows/build-and-test.yml | 86 +++++++++++++ .github/workflows/dispatch-build-and-test.yml | 49 +++++++ .github/workflows/pr.yml | 121 ++++++++++++++++++ .github/workflows/run-as-coder.yml | 66 ++++++++++ .gitignore | 1 - .pre-commit-config.yaml | 2 +- ci/build.sh | 121 ++++++++++++++++++ ci/checks/style.sh | 33 ----- ci/gpu/build.sh | 76 ----------- ci/matrix.yml | 46 +++++++ ci/{checks => pre-commit}/doxygen.sh | 18 ++- ci/sccache_hit_rate.sh | 55 ++++++++ ci/sccache_stats.sh | 60 +++++++++ ci/test.sh | 24 ++++ 23 files changed, 962 insertions(+), 144 deletions(-) create mode 100644 .devcontainer/cuda12.2-gcc12/devcontainer.json create mode 100644 .devcontainer/devcontainer.json create mode 100755 
.devcontainer/launch.sh create mode 100755 .devcontainer/make_devcontainers.sh create mode 100644 .github/actions/compute-matrix/action.yml create mode 100755 .github/actions/compute-matrix/compute-matrix.sh create mode 100644 .github/actions/configure_cccl_sccache/action.yml delete mode 100644 .github/workflows/add_to_project.yml create mode 100644 .github/workflows/build-and-test.yml create mode 100644 .github/workflows/dispatch-build-and-test.yml create mode 100644 .github/workflows/pr.yml create mode 100644 .github/workflows/run-as-coder.yml create mode 100755 ci/build.sh delete mode 100755 ci/checks/style.sh delete mode 100644 ci/gpu/build.sh create mode 100644 ci/matrix.yml rename ci/{checks => pre-commit}/doxygen.sh (65%) create mode 100755 ci/sccache_hit_rate.sh create mode 100755 ci/sccache_stats.sh create mode 100755 ci/test.sh diff --git a/.devcontainer/cuda12.2-gcc12/devcontainer.json b/.devcontainer/cuda12.2-gcc12/devcontainer.json new file mode 100644 index 000000000..199ce44f4 --- /dev/null +++ b/.devcontainer/cuda12.2-gcc12/devcontainer.json @@ -0,0 +1,39 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.2-gcc12" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + 
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + }, + "name": "cuda12.2-gcc12" +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..84cfa82cc --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,37 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + } +} \ No newline at end of file diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh new file mode 
100755 index 000000000..157a49bef --- /dev/null +++ b/.devcontainer/launch.sh @@ -0,0 +1,58 @@ +#! /usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +launch_devcontainer() { + + # Ensure we're in the repo root + cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )/.."; + + if [[ -z $1 ]] || [[ -z $2 ]]; then + echo "Usage: $0 [CUDA version] [Host compiler]" + echo "Example: $0 12.1 gcc12" + return 1 + fi + + local cuda_version="$1" + local host_compiler="$2" + local workspace="$(basename "$(pwd)")"; + local tmpdir="$(mktemp -d)/${workspace}"; + local path="$(pwd)/.devcontainer/cuda${cuda_version}-${host_compiler}"; + + mkdir -p "${tmpdir}"; + mkdir -p "${tmpdir}/.devcontainer"; + cp -arL "$path/devcontainer.json" "${tmpdir}/.devcontainer"; + sed -i "s@\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"; + path="${tmpdir}"; + + local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"; + local url="vscode://vscode-remote/dev-container+${hash}/home/coder/cuCollections"; + + echo "devcontainer URL: ${url}"; + + local launch=""; + if type open >/dev/null 2>&1; then + launch="open"; + elif type xdg-open >/dev/null 2>&1; then + launch="xdg-open"; + fi + + if [ -n "${launch}" ]; then + code --new-window "${tmpdir}"; + exec "${launch}" "${url}" >/dev/null 2>&1; + fi +} + 
+launch_devcontainer "$@"; \ No newline at end of file diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh new file mode 100755 index 000000000..700dc3713 --- /dev/null +++ b/.devcontainer/make_devcontainers.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of +# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the +# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version. 
+# GitHub docs on using multiple devcontainer.json files: +# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# The root devcontainer.json file is used as a template for all other devcontainer.json files +# by replacing the `image:` field with the appropriate image name +base_devcontainer_file="./devcontainer.json" + + +# Read matrix.yaml and convert it to json +matrix_json=$(yq -o json ../ci/matrix.yml) + + +# Get the devcontainer image version and define image tag root +DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version') +IMAGE_ROOT="rapidsai/devcontainers:${DEVCONTAINER_VERSION}-cpp-" + +# Get unique combinations of cuda version, compiler name/version, and Ubuntu version +combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_version: .compiler.version, os: .os}] | unique | .[]') + +# For each unique combination +for combination in $combinations; do + cuda_version=$(echo "$combination" | jq -r '.cuda') + compiler_name=$(echo "$combination" | jq -r '.compiler_name') + compiler_version=$(echo "$combination" | jq -r '.compiler_version') + os=$(echo "$combination" | jq -r '.os') + + name="cuda$cuda_version-$compiler_name$compiler_version" + mkdir -p "$name" + devcontainer_file="$name/devcontainer.json" + image="$IMAGE_ROOT$compiler_name$compiler_version-cuda$cuda_version-$os" + + # Use the base_devcontainer.json as a template, plug in the CUDA, compiler names, versions, and Ubuntu version, + # and write the output to the new devcontainer.json file + #jq --arg image "$image" --arg name "$name" '. 
+ {image: $image, name: $name}' $base_devcontainer_file > "$devcontainer_file" + jq --arg image "$image" --arg name "$name" '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name' $base_devcontainer_file > "$devcontainer_file" + + echo "Created $devcontainer_file" +done \ No newline at end of file diff --git a/.github/actions/compute-matrix/action.yml b/.github/actions/compute-matrix/action.yml new file mode 100644 index 000000000..fbbe49b54 --- /dev/null +++ b/.github/actions/compute-matrix/action.yml @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Compute Matrix +description: "Compute the matrix for a given matrix type from the specified matrix file" + +inputs: + matrix_query: + description: "The jq query used to specify the desired matrix. 
e.g., .pull_request.nvcc" + required: true + matrix_file: + description: 'The file containing the matrix' + required: true +outputs: + matrix: + description: 'The requested matrix' + value: ${{ steps.compute-matrix.outputs.MATRIX }} + +runs: + using: "composite" + steps: + - name: Compute matrix + id: compute-matrix + run: | + MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}} ${{inputs.matrix_query}} ) + echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} \ No newline at end of file diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh new file mode 100755 index 000000000..64a6f5642 --- /dev/null +++ b/.github/actions/compute-matrix/compute-matrix.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Check for the correct number of arguments +if [ $# -ne 2 ]; then + echo "Usage: $0 MATRIX_FILE MATRIX_QUERY" + echo "MATRIX_FILE: The path to the matrix file." + echo "MATRIX_QUERY: The jq query used to specify the desired matrix. 
e.g., '.pull-request.nvcc'" + exit 1 +fi + +# Get realpath before changing directory +MATRIX_FILE=$(realpath "$1") +MATRIX_QUERY="$2" + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +echo "Input matrix file:" >&2 +cat "$MATRIX_FILE" >&2 +echo "Query: $MATRIX_QUERY" >&2 +echo $(yq -o=json "$MATRIX_FILE" | jq -c -r "$MATRIX_QUERY | map(. as \$o | {std: .std[]} + del(\$o.std))") \ No newline at end of file diff --git a/.github/actions/configure_cccl_sccache/action.yml b/.github/actions/configure_cccl_sccache/action.yml new file mode 100644 index 000000000..458669688 --- /dev/null +++ b/.github/actions/configure_cccl_sccache/action.yml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Set up AWS credentials and environment variables for sccache +description: "Set up AWS credentials and environment variables for sccache" +runs: + using: "composite" + steps: + - name: Get AWS credentials for sccache bucket + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA + aws-region: us-east-2 + role-duration-seconds: 43200 # 12 hours + - name: Set environment variables + run: | + echo "SCCACHE_BUCKET=rapids-sccache-east" >> $GITHUB_ENV + echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV + echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV + echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV + echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV + shell: bash \ No newline at end of file diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 84bbe71f4..217ccebf9 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # This file controls which features from the `ops-bot` repository below are enabled. 
# - https://github.com/rapidsai/ops-bot diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index 72dd4acd2..000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - name: Generate token - id: generate_token - uses: tibdex/github-app-token@36464acb844fc53b9b8b2401da68844f6b05ebb0 - with: - app_id: ${{ secrets.CCCL_AUTH_APP_ID }} - private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - - name: Add to Project - env: - TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/NVIDIA/projects/6 - github-token: ${{ env.TOKEN }} diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..6599e9dcb --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: build and test + +defaults: + run: + shell: bash -eo pipefail {0} + +on: + workflow_call: + inputs: + devcontainer_version: {type: string, required: true} + cuda_version: {type: string, required: true} + compiler: {type: string, required: true} + compiler_exe: {type: string, required: true} + compiler_version: {type: string, required: true} + std: {type: string, required: true} + gpu_build_archs: {type: string, required: true} + cpu: {type: string, required: true} + os: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + run_tests: {type: boolean, required: false, default: true} + +jobs: + devcontainer_image: + name: Devcontainer ${{ inputs.os }}/${{ inputs.compiler }}${{ inputs.compiler_version }} + runs-on: ubuntu-latest + outputs: + image_name: ${{ steps.compute-devcontainer-image-name.outputs.name }} + steps: + - name: Compute devcontainer image name + id: compute-devcontainer-image-name + run: | + COMPILER_SEGMENT="" + if [ "${{ inputs.compiler }}" != "cc" ] && [ "${{ inputs.compiler_exe }}" != "c++" ]; then + COMPILER_SEGMENT="${{ inputs.compiler }}${{ inputs.compiler_version }}-" + fi + DEVCONTAINER_IMAGE="rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${COMPILER_SEGMENT}cuda${{inputs.cuda_version}}-${{inputs.os}}" + echo "DEVCONTAINER_IMAGE=$DEVCONTAINER_IMAGE" >> $GITHUB_ENV + echo "name=$DEVCONTAINER_IMAGE" >> $GITHUB_OUTPUT + - name: Check if devcontainer image exists + run: | + docker buildx imagetools inspect $DEVCONTAINER_IMAGE > /dev/null + if [ $? -ne 0 ]; then + echo "Error: Docker image $DEVCONTAINER_IMAGE does not exist." 
+ exit 1 + fi + + build: + needs: devcontainer_image + if: inputs.build_script != '' && needs.devcontainer_image.outputs.image_name != '' + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-cpu16 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + ${{ inputs.build_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" + + test: + needs: [devcontainer_image, build] + if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.test_script != '' && needs.devcontainer_image.outputs.image_name != '' && inputs.run_tests}} + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-gpu-v100-latest-1 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + nvidia-smi + ${{ inputs.test_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" \ No newline at end of file diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml new file mode 100644 index 000000000..dea71e00e --- /dev/null +++ b/.github/workflows/dispatch-build-and-test.yml @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Dispatch build and test + +on: + workflow_call: + inputs: + per_cuda_compiler_matrix: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + devcontainer_version: {type: string, required: true} + +jobs: + # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration + # ensures that the build/test steps can overlap across different configurations. For example, + # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11. 
+ build_and_test: + name: ${{matrix.cpu}} + uses: ./.github/workflows/build-and-test.yml + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }} + with: + devcontainer_version: ${{ inputs.devcontainer_version }} + cuda_version: ${{ matrix.cuda }} + compiler: ${{ matrix.compiler.name }} + compiler_exe: ${{ matrix.compiler.exe }} + compiler_version: ${{ matrix.compiler.version }} + std: ${{ matrix.std }} + gpu_build_archs: ${{ matrix.gpu_build_archs }} + cpu: ${{ matrix.cpu }} + os: ${{ matrix.os }} + build_script: ${{ inputs.build_script }} + test_script: ${{ inputs.test_script }} + run_tests: ${{ contains(matrix.jobs, 'test') && !contains(github.event.head_commit.message, 'skip-tests') }} diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 000000000..061b30a99 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is the main workflow that runs on every PR and push to main +name: pr + +defaults: + run: + shell: bash -euo pipefail {0} + +on: + push: + branches: + - main + - dev + - "pull-request/[0-9]+" + +# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts. 
+concurrency: + group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + doxygen-check: + name: Doxygen check + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Install Doxygen + run: | + sudo apt-get update -q + sudo apt-get install -y doxygen + - name: Check Doxygen docs + run: | + ./ci/pre-commit/doxygen.sh + if [ $? -ne 0 ]; then + echo "Doxygen check failed" + exit 1 + fi + shell: bash -euxo pipefail {0} + + get-devcontainer-version: + name: Get devcontainer version + runs-on: ubuntu-latest + outputs: + DEVCONTAINER_VERSION: ${{ steps.set-outputs.outputs.DEVCONTAINER_VERSION }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get devcontainer version + id: set-outputs + run: | + DEVCONTAINER_VERSION=$(yq -o json ci/matrix.yml | jq -r '.devcontainer_version') + echo "DEVCONTAINER_VERSION=$DEVCONTAINER_VERSION" | tee -a "$GITHUB_OUTPUT" + + compute-nvcc-matrix: + name: Compute NVCC matrix + runs-on: ubuntu-latest + outputs: + FULL_MATRIX: ${{ steps.set-outputs.outputs.FULL_MATRIX }} + CUDA_VERSIONS: ${{ steps.set-outputs.outputs.CUDA_VERSIONS }} + HOST_COMPILERS: ${{ steps.set-outputs.outputs.HOST_COMPILERS }} + PER_CUDA_COMPILER_MATRIX: ${{ steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get full nvcc matrix + id: compute-nvcc-matrix + uses: ./.github/actions/compute-matrix + with: + matrix_file: './ci/matrix.yml' + matrix_query: '.pull_request.nvcc' + - name: Set outputs + id: set-outputs + run: | + FULL_MATRIX='${{steps.compute-nvcc-matrix.outputs.matrix}}' + echo "FULL_MATRIX=$FULL_MATRIX" | tee -a "$GITHUB_OUTPUT" + CUDA_VERSIONS=$(echo $FULL_MATRIX | jq -c '[.[] | .cuda] | unique') + echo "CUDA_VERSIONS=$CUDA_VERSIONS" | tee -a "$GITHUB_OUTPUT" + HOST_COMPILERS=$(echo $FULL_MATRIX | jq -c '[.[] | .compiler.name] | unique') + echo 
"HOST_COMPILERS=$HOST_COMPILERS" | tee -a "$GITHUB_OUTPUT" + PER_CUDA_COMPILER_MATRIX=$(echo $FULL_MATRIX | jq -c ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add') + echo "PER_CUDA_COMPILER_MATRIX=$PER_CUDA_COMPILER_MATRIX" | tee -a "$GITHUB_OUTPUT" + + ci: + name: CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + needs: [compute-nvcc-matrix, get-devcontainer-version] + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_version: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.CUDA_VERSIONS) }} + compiler: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.HOST_COMPILERS) }} + with: + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-nvcc-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }} + build_script: "./ci/build.sh" + test_script: "./ci/test.sh" + devcontainer_version: ${{ needs.get-devcontainer-version.outputs.DEVCONTAINER_VERSION }} + + # This job is the final job that runs after all other jobs and is used for branch protection status checks. + # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks + ci-success: + runs-on: ubuntu-latest + name: CI success + needs: + - ci + steps: + - run: echo "CI success" \ No newline at end of file diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml new file mode 100644 index 000000000..573ef134a --- /dev/null +++ b/.github/workflows/run-as-coder.yml @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Run as coder user + +defaults: + run: + shell: bash -exo pipefail {0} + + +on: + workflow_call: + inputs: + name: {type: string, required: true} + image: {type: string, required: true} + runner: {type: string, required: true} + command: {type: string, required: true} + env: { type: string, required: false, default: "" } + +jobs: + run-as-coder: + name: ${{inputs.name}} + runs-on: ${{inputs.runner}} + container: + options: -u root + image: ${{inputs.image}} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + permissions: + id-token: write + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + path: cuCollections + persist-credentials: false + - name: Move files to coder user home directory + run: | + cp -R cuCollections /home/coder/cuCollections + chown -R coder:coder /home/coder/ + - name: Configure credentials and environment variables for sccache + uses: ./cuCollections/.github/actions/configure_cccl_sccache + - name: Run command + shell: su coder {0} + run: | + set -exo pipefail + cd ~/cuCollections + eval "${{inputs.command}}" || exit_code=$? + if [ ! -z "$exit_code" ]; then + echo "::error::Error! 
To checkout the corresponding code and reproduce locally, run the following commands:" + echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" + echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" + exit $exit_code + fi diff --git a/.gitignore b/.gitignore index e57f3b30c..6ccf378c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ __pycache__ *.dylib .cache .vscode -.devcontainer *.code-workspace *.swp *.pytest_cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2fe04169..5679bf67f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: hooks: - id: doxygen-check name: doxygen-check - entry: ./ci/checks/doxygen.sh + entry: ./ci/pre-commit/doxygen.sh files: ^include/ types_or: [file] language: system diff --git a/ci/build.sh b/ci/build.sh new file mode 100755 index 000000000..0baeaa68c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -eo pipefail + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# Script defaults +CUDA_COMPILER=nvcc + +# Check if the correct number of arguments has been provided +function usage { + echo "Usage: $0 [OPTIONS] " + echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." + echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" " + echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" " + echo "Possible options: " + echo " -nvcc: path/to/nvcc" + echo " -v/--verbose: enable shell echo for debugging" + exit 1 +} + +# Check for extra options +# While there are more than 3 arguments, parse switches/options +while [ "$#" -gt 3 ] +do + case "${1}" in + -h) usage ;; + -help) usage ;; + --help) usage ;; + --verbose) VERBOSE=1; shift ;; + -v) VERBOSE=1; shift ;; + -nvcc) CUDA_COMPILER="${2}"; shift 2;; + *) usage ;; + esac +done + +if [ $VERBOSE ]; then + set -x +fi + +if [ "$#" -ne 3 ]; then + echo "Invalid number of arguments" + usage +fi + +# Begin processing unsets after option parsing +set -u + +# Assign command line arguments to variables +readonly HOST_COMPILER=$(which $1) +readonly CXX_STANDARD=$2 + +# Replace spaces, commas and semicolons with semicolons for CMake list +readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';') + +readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)} +readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-) + +if [ -z ${DEVCONTAINER_NAME+x} ]; then + BUILD_DIR=../build/local +else + BUILD_DIR=../build/${DEVCONTAINER_NAME} +fi + +# The most recent build will always be symlinked to cuCollections/build/latest +mkdir -p $BUILD_DIR +rm -f ../build/latest +ln -sf $BUILD_DIR ../build/latest +export BUILD_DIR +echo $BUILD_DIR + +CMAKE_OPTIONS=" + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=${CXX_STANDARD} \ + -DCMAKE_CUDA_STANDARD=${CXX_STANDARD} \ + 
-DCMAKE_CXX_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} \ + -DCMAKE_CUDA_HOST_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_ARCHITECTURES=${GPU_ARCHS} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ +" + +echo "========================================" +echo "Begin build" +echo "pwd=$(pwd)" +echo "NVCC_VERSION=$NVCC_VERSION" +echo "HOST_COMPILER=$HOST_COMPILER" +echo "CXX_STANDARD=$CXX_STANDARD" +echo "GPU_ARCHS=$GPU_ARCHS" +echo "PARALLEL_LEVEL=$PARALLEL_LEVEL" +echo "BUILD_DIR=$BUILD_DIR" +echo "========================================" + +function configure(){ + cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS +} + +function build(){ + source "./sccache_stats.sh" start + cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL + echo "Build complete" + source "./sccache_stats.sh" end +} + +configure +build \ No newline at end of file diff --git a/ci/checks/style.sh b/ci/checks/style.sh deleted file mode 100755 index fbbe1d120..000000000 --- a/ci/checks/style.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. -############################## -# cuCollections Style Tester # -############################## - -# Ignore errors and set path -set +e -PATH=/conda/bin:$PATH -# LC_ALL=C.UTF-8 -# LANG=C.UTF-8 - -# Activate common conda env -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -# Run clang-format and check for a consistent code format -CLANG_FORMAT=`pre-commit run clang-format --all-files 2>&1` -CLANG_FORMAT_RETVAL=$? - -# Run doxygen check -DOXYGEN_CHECK=`ci/checks/doxygen.sh` -DOXYGEN_CHECK_RETVAL=$? - -echo -e "$DOXYGEN_CHECK" - -RETVALS=( - $CLANG_FORMAT_RETVAL -) -IFS=$'\n' -RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` - -exit $RETVAL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh deleted file mode 100644 index be4c72f92..000000000 --- a/ci/gpu/build.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. 
-##############################################i### -# cuCollections GPU build and test script for CI # -################################################## -set -e -NUMARGS=$# -ARGS=$* - -# Arg parsing function -function hasArg { - (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") -} - -# Set path and build parallel level -export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH -export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} -export CUDA_REL=${CUDA_VERSION%.*} - -# Set home to the job's workspace -export HOME=$WORKSPACE - -################################################################################ -# SETUP - Check environment -################################################################################ - -gpuci_logger "Check environment" -env - -gpuci_logger "Check GPU usage" -nvidia-smi - -gpuci_logger "Install Dependencies" -. /opt/conda/etc/profile.d/conda.sh -conda create -y -n cuda -c nvidia -c conda-forge "cudatoolkit=${CUDA_VER}" "cmake>=3.23.1" -conda activate cuda - -gpuci_logger "Check versions" -python --version - -gpuci_logger "Check conda environment" -conda info -conda config --show-sources -conda list --show-channel-urls - -################################################################################ -# BUILD - Build from Source -################################################################################ - -gpuci_logger "Build Tests/Examples" -cd ${WORKSPACE} -mkdir -p build -cd build -cmake .. -make - -################################################################################ -# TEST - Run Tests -################################################################################ - -if hasArg --skip-tests; then - gpuci_logger "Skipping Tests" -else - gpuci_logger "Check GPU usage" - nvidia-smi - cd ${WORKSPACE}/build/tests - ctest . 
- - # This block may provide more verbose testing output since each test is ran individually - #cd ${WORKSPACE}/build/tests - #for gt in "$WORKSPACE/build/tests"* ; do - # test_name=$(basename ${gt}) - # echo "Running $test_name" - # ${gt} - #done -fi diff --git a/ci/matrix.yml b/ci/matrix.yml new file mode 100644 index 000000000..5916dd113 --- /dev/null +++ b/ci/matrix.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cuda_oldest: &cuda_oldest '11.8' +cuda_newest: &cuda_newest '12.2' + +# The GPUs to test on +# Note: This assumes that the appropriate gpu_build_archs are set to include building for the GPUs listed here +gpus: + - 'v100' + +# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers +devcontainer_version: '23.08' + +# Each environment below will generate a unique build/test job +# See the "compute-matrix" job in the workflow for how this is parsed and used +# cuda: The CUDA Toolkit version +# os: The operating system used +# cpu: The CPU architecture +# compiler: The compiler to use +# name: The compiler name +# version: The compiler version +# exe: The unverionsed compiler binary name +# To use the system's default compiler set "exe: 'c++'" or "name: 'cc'" +# gpu_build_archs: The GPU architectures to build for (comma-separated list) +# std: The C++ standards to build for +# This field is unique as it will generate an independent build/test job for each value + +# Configurations that will run for every PR +pull_request: + nvcc: + # There is currently only one CUDA 11.8 image available which comes with the system's default C++ compiler. For ubuntu22.04, we know that the default CC is gcc11.3 + - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'c++'}, gpu_build_archs: '60', std: [17], jobs: ['build', 'test']} + - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '12', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']} \ No newline at end of file diff --git a/ci/checks/doxygen.sh b/ci/pre-commit/doxygen.sh similarity index 65% rename from ci/checks/doxygen.sh rename to ci/pre-commit/doxygen.sh index 515558c4a..8f387c6ea 100755 --- a/ci/checks/doxygen.sh +++ b/ci/pre-commit/doxygen.sh @@ -1,8 +1,18 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. 
-######################################## -# cuCollections doxygen warnings check # -######################################## +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # skip if doxygen is not installed if ! [ -x "$(command -v doxygen)" ]; then diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh new file mode 100755 index 000000000..8b6d2d3f5 --- /dev/null +++ b/ci/sccache_hit_rate.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +# Ensure two arguments are provided +if [ $# -ne 2 ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Print the contents of the before file +echo "=== Contents of $1 ===" >&2 +cat $1 >&2 +echo "=== End of $1 ===" >&2 + +# Print the contents of the after file +echo "=== Contents of $2 ===" >&2 +cat $2 >&2 +echo "=== End of $2 ===" >&2 + +# Extract compile requests and cache hits from the before and after files +requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1") +hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1") +requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2") +hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2") + +# Calculate the differences to find out how many new requests and hits +requests_diff=$((requests_after - requests_before)) +hits_diff=$((hits_after - hits_before)) + +echo "New Compile Requests: $requests_diff" >&2 +echo "New Hits: $hits_diff" >&2 + +# Calculate and print the hit rate +if [ $requests_diff -eq 0 ]; then + echo "No new compile requests, hit rate is not applicable" +else + hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}') + echo "sccache hit rate: $hit_rate%" >&2 + echo "$hit_rate" +fi \ No newline at end of file diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh new file mode 100755 index 000000000..a834347cb --- /dev/null +++ b/ci/sccache_stats.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script prints the sccache hit rate between two calls to sccache --show-stats. +# It should be sourced in your script before and after the operations you want to profile, +# with the 'start' or 'end' argument respectively. + +mode=$1 + +if [[ "$mode" != "start" && "$mode" != "end" ]]; then + echo "Invalid mode: $mode" + echo "Usage: $0 {start|end}" + exit 1 +fi + +case $mode in + start) + export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + ;; + end) + if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then + echo "Error: start stats not collected. Did you call this script with 'start' before your operations?" 
+ exit 1 + fi + + final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + hits=$((final_hits - SCCACHE_START_HITS)) + misses=$((final_misses - SCCACHE_START_MISSES)) + total=$((hits + misses)) + + prefix="" + if [ ${GITHUB_ACTIONS:-false} = "true" ]; then + prefix="::notice::" + fi + + if (( total > 0 )); then + hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }') + echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%" + else + echo ${prefix}"sccache stats: N/A No new compilation requests" + fi + unset SCCACHE_START_HITS + unset SCCACHE_START_MISSES + ;; +esac \ No newline at end of file diff --git a/ci/test.sh b/ci/test.sh new file mode 100755 index 000000000..cfcce2acd --- /dev/null +++ b/ci/test.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +source ./build.sh "$@" + +ctest --test-dir ${BUILD_DIR}/tests --output-on-failure --timeout 60 + +echo "Test complete" \ No newline at end of file From 70a21da78f9f2ab13c95087c164b29423459a9df Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 22 Aug 2023 10:45:54 -0500 Subject: [PATCH 141/152] Use thrust::identity. (#357) This PR adapts a few device lambdas to use `thrust::identity`. This helps lift out a bit of the diff from #343. --- tests/dynamic_map/erase_test.cu | 33 ++++++++++---------------- tests/static_map/erase_test.cu | 19 ++++++--------- tests/static_map/shared_memory_test.cu | 4 +--- 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu index 44e9e8fb5..1a60b49b6 100644 --- a/tests/dynamic_map/erase_test.cu +++ b/tests/dynamic_map/erase_test.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -62,9 +63,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); // keys were actaully deleted - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); // ensures that map is reusing deleted slots map.insert(pairs_begin, pairs_begin + num_keys); @@ -73,21 +72,17 @@ TEMPLATE_TEST_CASE_SIG("erase key", map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::all_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); // erase can act selectively map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), 
d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + num_keys / 2, - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); // clear map map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); @@ -115,13 +110,11 @@ TEMPLATE_TEST_CASE_SIG("erase key", map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + 2 * num_keys, - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + 2 * num_keys, thrust::identity{})); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() + 2 * num_keys, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + 2 * num_keys, d_keys_exist.end(), thrust::identity{})); REQUIRE(map.get_size() == 2 * num_keys); // check that keys can be successfully deleted from all submaps (some will be unsuccessful @@ -130,9 +123,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); REQUIRE(map.get_size() == 0); } diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 1315a5cba..26cbd3fd3 100644 --- a/tests/static_map/erase_test.cu 
+++ b/tests/static_map/erase_test.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -59,9 +60,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.insert(pairs_begin, pairs_begin + num_keys); @@ -69,20 +68,16 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::all_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + num_keys / 2, - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); REQUIRE(map.get_size() == 0); diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 8b9d35390..444f1c7e7 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -148,9 +148,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", 
d_keys_exist.data().get(), d_keys_and_values_correct.data().get()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); } } From 9f4b27f45fbd1217a2a0b1aef1706bfcde795657 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Aug 2023 15:45:23 -0700 Subject: [PATCH 142/152] Distinct code path for key equality checks in `packed_cas` (#356) This PR fixes a bug where we previously used the whole slot content for map key comparisons in `packed_cas`. --- .../cuco/detail/open_addressing_ref_impl.cuh | 62 +++++++++++++------ .../cuco/detail/static_map/static_map_ref.inl | 20 +++--- .../cuco/detail/static_set/static_set_ref.inl | 20 +++--- 3 files changed, 67 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 99187cc51..4aa701759 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -132,6 +132,7 @@ class open_addressing_ref_impl { /** * @brief Inserts an element. 
* + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param key Key of the element to insert @@ -140,7 +141,7 @@ class open_addressing_ref_impl { * * @return True if the given element is successfully inserted */ - template + template __device__ bool insert(key_type const& key, value_type const& value, Predicate const& predicate) noexcept @@ -158,7 +159,7 @@ class open_addressing_ref_impl { if (eq_res == detail::equal_result::EQUAL) { return false; } if (eq_res == detail::equal_result::EMPTY) { auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - switch (attempt_insert( + switch (attempt_insert( (storage_ref_.data() + *probing_iter)->data() + intra_window_index, value, predicate)) { case insert_result::CONTINUE: continue; case insert_result::SUCCESS: return true; @@ -173,6 +174,7 @@ class open_addressing_ref_impl { /** * @brief Inserts an element. * + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param group The Cooperative Group used to perform group insert @@ -182,7 +184,7 @@ class open_addressing_ref_impl { * * @return True if the given element is successfully inserted */ - template + template __device__ bool insert(cooperative_groups::thread_block_tile const& group, key_type const& key, value_type const& value, @@ -214,9 +216,10 @@ class open_addressing_ref_impl { auto const src_lane = __ffs(group_contains_empty) - 1; auto const status = (group.thread_rank() == src_lane) - ? attempt_insert((storage_ref_.data() + *probing_iter)->data() + intra_window_index, - value, - predicate) + ? 
attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, + value, + predicate) : insert_result::CONTINUE; switch (group.shfl(status, src_lane)) { @@ -237,6 +240,7 @@ class open_addressing_ref_impl { * element that prevented the insertion) and a `bool` denoting whether the insertion took place or * not. * + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param key Key of the element to insert @@ -246,7 +250,7 @@ class open_addressing_ref_impl { * @return a pair consisting of an iterator to the element and a bool indicating whether the * insertion is successful or not. */ - template + template __device__ thrust::pair insert_and_find(key_type const& key, value_type const& value, Predicate const& predicate) noexcept @@ -266,7 +270,7 @@ class open_addressing_ref_impl { if (eq_res == detail::equal_result::EMPTY) { switch ([&]() { if constexpr (sizeof(value_type) <= 8) { - return packed_cas(window_ptr + i, value, predicate); + return packed_cas(window_ptr + i, value, predicate); } else { return cas_dependent_write(window_ptr + i, value, predicate); } @@ -292,6 +296,7 @@ class open_addressing_ref_impl { * element that prevented the insertion) and a `bool` denoting whether the insertion took place or * not. * + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param group The Cooperative Group used to perform group insert_and_find @@ -302,7 +307,7 @@ class open_addressing_ref_impl { * @return a pair consisting of an iterator to the element and a bool indicating whether the * insertion is successful or not. 
*/ - template + template __device__ thrust::pair insert_and_find( cooperative_groups::thread_block_tile const& group, key_type const& key, @@ -343,7 +348,7 @@ class open_addressing_ref_impl { auto const status = [&]() { if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } if constexpr (sizeof(value_type) <= 8) { - return packed_cas(slot_ptr, value, predicate); + return packed_cas(slot_ptr, value, predicate); } else { return cas_dependent_write(slot_ptr, value, predicate); } @@ -649,6 +654,7 @@ class open_addressing_ref_impl { /** * @brief Inserts the specified element with one single CAS operation. * + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param slot Pointer to the slot in memory @@ -657,20 +663,37 @@ class open_addressing_ref_impl { * * @return Result of this operation, i.e., success/continue/duplicate */ - template + template [[nodiscard]] __device__ constexpr insert_result packed_cas(value_type* slot, value_type const& value, Predicate const& predicate) noexcept { - auto old = compare_and_swap(slot, this->empty_slot_sentinel_, value); - auto* old_ptr = reinterpret_cast(&old); - if (cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_)) { + auto old = compare_and_swap(slot, this->empty_slot_sentinel_, value); + auto* old_ptr = reinterpret_cast(&old); + auto const inserted = [&]() { + if constexpr (HasPayload) { + // If it's a set implementation, compare the whole slot content + return cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_); + } else { + // If it's a map implementation, compare keys only + return cuco::detail::bitwise_compare(old_ptr->first, this->empty_slot_sentinel_.first); + } + }(); + if (inserted) { return insert_result::SUCCESS; } else { // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare - return predicate.equal_to(*old_ptr, value) == detail::equal_result::EQUAL - ? 
insert_result::DUPLICATE - : insert_result::CONTINUE; + auto const res = [&]() { + if constexpr (HasPayload) { + // If it's a set implementation, compare the whole slot content + return predicate.equal_to(*old_ptr, value); + } else { + // If it's a map implementation, compare keys only + return predicate.equal_to(old_ptr->first, value.first); + } + }(); + return res == detail::equal_result::EQUAL ? insert_result::DUPLICATE + : insert_result::CONTINUE; } } @@ -761,6 +784,7 @@ class open_addressing_ref_impl { * @note Dispatches the correct implementation depending on the container * type and presence of other operator mixins. * + * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * * @param slot Pointer to the slot in memory @@ -769,13 +793,13 @@ class open_addressing_ref_impl { * * @return Result of this operation, i.e., success/continue/duplicate */ - template + template [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value, Predicate const& predicate) noexcept { if constexpr (sizeof(value_type) <= 8) { - return packed_cas(slot, value, predicate); + return packed_cas(slot, value, predicate); } else { #if (_CUDA_ARCH__ < 700) return cas_dependent_write(slot, value, predicate); diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index f3c412924..536973b20 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -209,8 +209,9 @@ class operator_impl< */ __device__ bool insert(value_type const& value) noexcept { - ref_type& ref_ = static_cast(*this); - return ref_.impl_.insert(value.first, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(value.first, value, ref_.predicate_); } /** @@ -223,8 +224,9 @@ class operator_impl< __device__ bool 
insert(cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { - auto& ref_ = static_cast(*this); - return ref_.impl_.insert(group, value.first, value, ref_.predicate_); + auto& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(group, value.first, value, ref_.predicate_); } }; @@ -289,8 +291,9 @@ class operator_impl< */ __device__ thrust::pair insert_and_find(value_type const& value) noexcept { - ref_type& ref_ = static_cast(*this); - return ref_.impl_.insert_and_find(value.first, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(value.first, value, ref_.predicate_); } /** @@ -309,8 +312,9 @@ class operator_impl< __device__ thrust::pair insert_and_find( cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { - ref_type& ref_ = static_cast(*this); - return ref_.impl_.insert_and_find(group, value.first, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(group, value.first, value, ref_.predicate_); } }; diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index 3482738cc..3131f3764 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -100,8 +100,9 @@ class operator_impl(*this); - return ref_.impl_.insert(value, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert(value, value, ref_.predicate_); } /** @@ -115,8 +116,9 @@ class operator_impl const& group, value_type const& value) noexcept { - auto& ref_ = static_cast(*this); - return ref_.impl_.insert(group, value, value, ref_.predicate_); + auto& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return 
ref_.impl_.insert(group, value, value, ref_.predicate_); } }; @@ -179,8 +181,9 @@ class operator_impl insert_and_find(value_type const& value) noexcept { - ref_type& ref_ = static_cast(*this); - return ref_.impl_.insert_and_find(value, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(value, value, ref_.predicate_); } /** @@ -199,8 +202,9 @@ class operator_impl insert_and_find( cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { - ref_type& ref_ = static_cast(*this); - return ref_.impl_.insert_and_find(group, value, value, ref_.predicate_); + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(group, value, value, ref_.predicate_); } }; From 4fcced4cfb840c67a2949a9027308dca7afca4cf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Aug 2023 16:36:31 -0700 Subject: [PATCH 143/152] Fix broken links for static_set live examples (#358) This PR fixes the broken links for set live examples. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9bf9ae777..93ac04027 100644 --- a/README.md +++ b/README.md @@ -186,8 +186,8 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c)) -- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Pzf6vabz1)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/sfG3qKqGv)) ### `static_map` From 0ba07922059bb84cdb32e1a783a6e3a5da559bff Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 29 Aug 2023 16:57:08 -0400 Subject: [PATCH 144/152] Use `copy-pr-bot` (#360) This PR replaces the `copy_prs` functionality from the `ops-bot` with the new dedicated `copy-pr-bot` GitHub application. Thorough documentation for the new `copy-pr-bot` application can be viewed below. - https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ **Important**: `copy-pr-bot` enforces signed commits. If an organization member opens a PR that contains unsigned commits, it will be deemed untrusted and therefore require an `/ok to test` comment. See the GitHub docs [here](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) for information on how to set up commit signing. Any time a PR is deemed untrusted, it will receive a comment that looks like this: https://github.com/rapidsai/ci-imgs/pull/63#issuecomment-1688973208. Every subsequent commit on an untrusted PR will require an additional `/ok to test` comment. 
Any existing PRs that have unsigned commits after this change is merged will require an `/ok to test` comment for each subsequent commit _or_ the PR can be rebased to include signed commits as mentioned in the docs below: https://docs.gha-runners.nvidia.com/cpr/contributors. This information is all included on the documentation page linked above. _I've skipped CI on this PR since it's not a change that is tested._ [skip ci] --- .github/copy-pr-bot.yaml | 4 ++++ .github/ops-bot.yaml | 19 ------------------- 2 files changed, 4 insertions(+), 19 deletions(-) create mode 100644 .github/copy-pr-bot.yaml delete mode 100644 .github/ops-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml deleted file mode 100644 index 217ccebf9..000000000 --- a/.github/ops-bot.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file controls which features from the `ops-bot` repository below are enabled. 
-# - https://github.com/rapidsai/ops-bot - -copy_prs: true From 8b07be33079b2cdeb22ed14765b5a4850606407e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 17:13:34 -0700 Subject: [PATCH 145/152] Add missing headers (#361) This PR adds missing headers --- include/cuco/detail/open_addressing_ref_impl.cuh | 1 + include/cuco/static_map.cuh | 1 + 2 files changed, 2 insertions(+) diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 4aa701759..667789eca 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 2df5b2a10..65644ccff 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include From e15ee27ad3e5f6bcaf2f6717d370e91f5e5c5ae8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 1 Sep 2023 11:26:13 -0700 Subject: [PATCH 146/152] Add `insert_or_assign` for map (#353) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #317 This PR adds `static_map::insert_or_assign`. It also removes the `[[nodiscard]]` qualifier for `retrieve_all` since they shouldn't be necessary. 
--------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- .../cuco/detail/open_addressing_ref_impl.cuh | 101 +++++++---- include/cuco/detail/static_map/kernels.cuh | 39 ++++- include/cuco/detail/static_map/static_map.inl | 40 +++++ .../cuco/detail/static_map/static_map_ref.inl | 162 ++++++++++++++++++ include/cuco/operator.hpp | 6 + include/cuco/static_map.cuh | 46 ++++- include/cuco/static_set.cuh | 2 +- tests/CMakeLists.txt | 1 + tests/static_map/insert_or_assign_test.cu | 114 ++++++++++++ 9 files changed, 474 insertions(+), 37 deletions(-) create mode 100644 tests/static_map/insert_or_assign_test.cu diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 667789eca..46ef2bfd7 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -35,6 +35,29 @@ namespace cuco { namespace experimental { namespace detail { +/// Three-way insert result enum +enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + +/** + * @brief Helper struct to store intermediate window probing results. + */ +struct window_probing_results { + detail::equal_result state_; ///< Equal result + int32_t intra_window_index_; ///< Intra-window index + + /** + * @brief Constructs window_probing_results. + * + * @param state The three way equality result + * @param index Intra-window index + */ + __device__ explicit constexpr window_probing_results(detail::equal_result state, + int32_t index) noexcept + : state_{state}, intra_window_index_{index} + { + } +}; + /** * @brief Common device non-owning "ref" implementation class. 
* @@ -199,13 +222,15 @@ class open_addressing_ref_impl { auto const [state, intra_window_index] = [&]() { for (auto i = 0; i < window_size; ++i) { switch (predicate(window_slots[i], key)) { - case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: return window_results{detail::equal_result::EQUAL, i}; + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; default: continue; } } // returns dummy index `-1` for UNEQUAL - return window_results{detail::equal_result::UNEQUAL, -1}; + return window_probing_results{detail::equal_result::UNEQUAL, -1}; }(); // If the key is already in the container, return false @@ -323,13 +348,15 @@ class open_addressing_ref_impl { auto const [state, intra_window_index] = [&]() { for (auto i = 0; i < window_size; ++i) { switch (predicate(window_slots[i], key)) { - case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: return window_results{detail::equal_result::EQUAL, i}; + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; default: continue; } } // returns dummy index `-1` for UNEQUAL - return window_results{detail::equal_result::UNEQUAL, -1}; + return window_probing_results{detail::equal_result::UNEQUAL, -1}; }(); auto* slot_ptr = (storage_ref_.data() + *probing_iter)->data() + intra_window_index; @@ -519,13 +546,15 @@ class open_addressing_ref_impl { auto const [state, intra_window_index] = [&]() { for (auto i = 0; i < window_size; ++i) { switch (predicate(window_slots[i], key)) { - case detail::equal_result::EMPTY: return window_results{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: 
return window_results{detail::equal_result::EQUAL, i}; + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; default: continue; } } // returns dummy index `-1` for UNEQUAL - return window_results{detail::equal_result::UNEQUAL, -1}; + return window_probing_results{detail::equal_result::UNEQUAL, -1}; }(); // Find a match for the probe key, thus return an iterator to the entry @@ -545,29 +574,6 @@ class open_addressing_ref_impl { } } - private: - /// Three-way insert result enum - enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; - - /** - * @brief Helper struct to store intermediate window probing results. - */ - struct window_results { - detail::equal_result state_; ///< Equal result - int32_t intra_window_index_; ///< Intra-window index - - /** - * @brief Constructs window_results. - * - * @param state The three way equality result - *@param Intra-window index - */ - __device__ explicit constexpr window_results(detail::equal_result state, int32_t index) noexcept - : state_{state}, intra_window_index_{index} - { - } - }; - /** * @brief Compares the content of the address `address` (old value) with the `expected` value and, * only if they are the same, sets the content of `address` to `desired`. @@ -652,6 +658,37 @@ class open_addressing_ref_impl { } } + /** + * @brief Gets the sentinel used to represent an empty slot. + * + * @return The sentinel value used to represent an empty slot + */ + [[nodiscard]] __device__ constexpr value_type empty_slot_sentinel() const noexcept + { + return empty_slot_sentinel_; + } + + /** + * @brief Gets the probing scheme. 
+ * + * @return The probing scheme used for the container + */ + [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept + { + return probing_scheme_; + } + + /** + * @brief Gets the non-owning storage ref. + * + * @return The non-owning storage ref of the container + */ + [[nodiscard]] __device__ constexpr storage_ref_type storage_ref() const noexcept + { + return storage_ref_; + } + + private: /** * @brief Inserts the specified element with one single CAS operation. * diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 29906d061..73c87eb61 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -30,7 +30,44 @@ namespace static_map_ns { namespace detail { /** - * @brief Finds the equivalent map elements of all keys in the range `[first, last)`. + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to + * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_or_assign(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert_or_assign(insert_pair); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert_or_assign(tile, insert_pair); + } + idx += loop_stride; + } +} + +/** + * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`. * * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched * element to `(output_begin + i)`. Else, copies the empty value sentinel. 
Uses the CUDA Cooperative diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 76b1df79b..15f6cc5ab 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -145,6 +145,46 @@ void static_mapinsert_if_async(first, last, stencil, pred, ref(op::insert), stream); } +template +template +void static_map:: + insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + return this->insert_or_assign_async(first, last, stream); + stream.synchronize(); +} + +template +template +void static_map:: + insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + auto const num = cuco::detail::distance(first, last); + if (num == 0) { return; } + + auto const grid_size = + (cg_size * num + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + static_map_ns::detail::insert_or_assign + <<>>( + first, num, ref(op::insert_or_assign)); +} + template +class operator_impl< + op::insert_or_assign_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + + public: + /** + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param value The element to insert + */ + __device__ void insert_or_assign(value_type const& value) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + ref_type& ref_ = static_cast(*this); + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, key); + + // If the key is already in the container, update the payload and return + if (eq_res == detail::equal_result::EQUAL) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + return; + } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + if (attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value)) { + return; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + */ + __device__ void insert_or_assign(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return detail::window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return detail::window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return detail::window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_contains_equal) { + auto const src_lane = __ffs(group_contains_equal) - 1; + if (group.thread_rank() == src_lane) { + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + } + group.sync(); + return; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? 
attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value) + : false; + + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } else { + ++probing_iter; + } + } + } + + private: + /** + * @brief Attempts to insert an element into a slot or update the matching payload with the given + * element + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. + */ + __device__ constexpr bool attempt_insert_or_assign(value_type* slot, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto const expected_key = ref_.impl_.empty_slot_sentinel().first; + + auto old_key = ref_.impl_.compare_and_swap(&slot->first, expected_key, value.first); + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success or key was already present in the map + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or + (ref_.predicate_.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL)) { + // Update payload + ref_.impl_.atomic_store(&slot->second, value.second); + return true; + } + return false; + } +}; + template std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. 
+ * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. * @@ -454,9 +494,9 @@ class static_map { * @return Pair of iterators indicating the last elements in the output */ template - [[nodiscard]] std::pair retrieve_all(KeyOut keys_out, - ValueOut values_out, - cuda_stream_ref stream = {}) const; + std::pair retrieve_all(KeyOut keys_out, + ValueOut values_out, + cuda_stream_ref stream = {}) const; /** * @brief Gets the number of elements in the container. diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index a7eee42ac..773b6ed1f 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -422,7 +422,7 @@ class static_set { * @return Iterator indicating the end of the output */ template - [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; /** * @brief Gets the number of elements in the container. 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ebc37e39b..d78ec7f49 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -74,6 +74,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/erase_test.cu static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu + static_map/insert_or_assign_test.cu static_map/key_sentinel_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu new file mode 100644 index 000000000..90c6553ce --- /dev/null +++ b/tests/static_map/insert_or_assign_test.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +__inline__ void test_insert_or_assign(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + // Insert pairs + auto pairs_begin = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); + + auto const initial_size = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(initial_size == num_keys); // all keys should be inserted + + // Query pairs have the same keys but different payloads + auto query_pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i * 2); }); + + map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + + auto const updated_size = map.size(); + // all keys are present in the map so the size shouldn't change + REQUIRE(updated_size == initial_size); + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto gold_values_begin = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i * 2; }); + + thrust::sort(thrust::device, d_values.begin(), d_values.end()); + REQUIRE(cuco::test::equal( + d_values.begin(), d_values.end(), gold_values_begin, thrust::equal_to{})); +} + +TEMPLATE_TEST_CASE_SIG( + "Insert or assign", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, 
cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_or_assign(map, num_keys); +} From 9b4ebaf0cce60e485436ba3b556d46280e089e4c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 4 Sep 2023 19:20:36 -0700 Subject: [PATCH 147/152] Fix typos in the documentation (#364) This PR fixes several typos in the new containers' documentation. 
--- include/cuco/detail/open_addressing_impl.cuh | 8 ++++---- include/cuco/static_map.cuh | 16 ++++++++-------- include/cuco/static_map_ref.cuh | 2 +- include/cuco/static_set.cuh | 16 ++++++++-------- include/cuco/static_set_ref.cuh | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index dbf169dca..80b08dc42 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -200,7 +200,7 @@ class open_addressing_impl { } /** - * @brief Asynchonously inserts all keys in the range `[first, last)`. + * @brief Asynchronously inserts all keys in the range `[first, last)`. * * @tparam InputIt Device accessible random access input iterator where * std::is_convertible::value_type, @@ -281,7 +281,7 @@ class open_addressing_impl { } /** - * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding * stencil returns true. * * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. @@ -323,7 +323,7 @@ class open_addressing_impl { } /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the container. * * @tparam InputIt Device accessible input iterator @@ -357,7 +357,7 @@ class open_addressing_impl { } /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the container if `pred` of the corresponding stencil returns true. 
* * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index f1e931bc8..f23fb5045 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -61,9 +61,9 @@ namespace experimental { * construction. * * @note Allows constant time concurrent modify or lookup operations from threads in device code. - * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking - * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` - * should be used with caution. + * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. * @@ -221,7 +221,7 @@ class static_map { size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); /** - * @brief Asynchonously inserts all keys in the range `[first, last)`. + * @brief Asynchronously inserts all keys in the range `[first, last)`. * * @tparam InputIt Device accessible random access input iterator where * std::is_convertible::value_type, @@ -263,7 +263,7 @@ class static_map { InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); /** - * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding * stencil returns true. * * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
@@ -350,7 +350,7 @@ class static_map { cuda_stream_ref stream = {}) const; /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the map. * * @tparam InputIt Device accessible input iterator @@ -401,7 +401,7 @@ class static_map { cuda_stream_ref stream = {}) const; /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the map if `pred` of the corresponding stencil returns true. * * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` @@ -452,7 +452,7 @@ class static_map { void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; /** - * @brief For all keys in the range `[first, last)`, asynchonously finds a payload with its key + * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key * equivalent to the query key. * * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index b278ee453..2460f1f10 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -31,7 +31,7 @@ namespace experimental { * * @note Concurrent modify and lookup will be supported if both kinds of operators are specified * during the ref construction. - * @note cuCollections data stuctures always place the slot keys on the left-hand + * @note cuCollections data structures always place the slot keys on the left-hand * side when invoking the key comparison predicate. * @note Ref types are trivially-copyable and are intended to be passed by value. 
* @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 773b6ed1f..613a99bd4 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -60,9 +60,9 @@ namespace experimental { * construction. * * @note Allows constant time concurrent modify or lookup operations from threads in device code. - * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking - * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` - * should be used with caution. + * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. * @@ -194,7 +194,7 @@ class static_set { size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); /** - * @brief Asynchonously inserts all keys in the range `[first, last)`. + * @brief Asynchronously inserts all keys in the range `[first, last)`. * * @tparam InputIt Device accessible random access input iterator where * std::is_convertible::value_type, @@ -236,7 +236,7 @@ class static_set { InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); /** - * @brief Asynchonously inserts keys in the range `[first, last)` if `pred` of the corresponding + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding * stencil returns true. * * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
@@ -283,7 +283,7 @@ class static_set { cuda_stream_ref stream = {}) const; /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the set. * * @tparam InputIt Device accessible input iterator @@ -334,7 +334,7 @@ class static_set { cuda_stream_ref stream = {}) const; /** - * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in * the set if `pred` of the corresponding stencil returns true. * * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` @@ -384,7 +384,7 @@ class static_set { void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; /** - * @brief For all keys in the range `[first, last)`, asynchonously finds an element with key + * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key * equivalent to the query key. * * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index 941829256..cf9c00ee0 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -34,7 +34,7 @@ namespace experimental { * * @note Concurrent modify and lookup will be supported if both kinds of operators are specified * during the ref construction. - * @note cuCollections data stuctures always place the slot keys on the left-hand + * @note cuCollections data structures always place the slot keys on the left-hand * side when invoking the key comparison predicate. * @note Ref types are trivially-copyable and are intended to be passed by value. 
* @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent From 7c76a124df0c2cd3fd66e3e080b9470a3b4707c6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 5 Sep 2023 16:59:00 -0700 Subject: [PATCH 148/152] Clean up detail utility functions (#359) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR - adds `grid_stride` and `global_thread_id` utilities in `detail/utility/cuda.cuh` (inspired by similar work in libcudf) - adds `compute_grid_size` helper function in `detail/utility/cuda.hpp` - renames `SDIV` as `ceiling_div` and moves it to `detail/utility/math.hpp` --------- Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com> --- benchmarks/hash_bench.cu | 5 +- include/cuco/detail/common_kernels.cuh | 23 ++++---- include/cuco/detail/extent/extent.inl | 5 +- include/cuco/detail/open_addressing_impl.cuh | 58 +++++++------------ include/cuco/detail/prime.hpp | 4 +- include/cuco/detail/static_map/kernels.cuh | 14 ++--- include/cuco/detail/static_map/static_map.inl | 18 +++--- .../static_multimap/static_multimap.inl | 36 +++--------- include/cuco/detail/static_set/kernels.cuh | 10 ++-- include/cuco/detail/static_set/static_set.inl | 10 ++-- include/cuco/detail/storage/aow_storage.inl | 10 ++-- include/cuco/detail/storage/kernels.cuh | 6 +- .../detail/{tuning.cuh => utility/cuda.cuh} | 29 ++++++++-- include/cuco/detail/utility/cuda.hpp | 49 ++++++++++++++++ include/cuco/detail/utility/math.hpp | 46 +++++++++++++++ include/cuco/detail/utils.hpp | 35 +---------- 16 files changed, 197 insertions(+), 161 deletions(-) rename include/cuco/detail/{tuning.cuh => utility/cuda.cuh} (52%) create mode 100644 include/cuco/detail/utility/cuda.hpp create mode 100644 include/cuco/detail/utility/math.hpp diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 973f6976d..ec35c186e 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -16,7 +16,6 @@ 
#include -#include #include #include @@ -70,7 +69,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list) bool const materialize_result = false; constexpr auto block_size = 128; auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10); - auto const grid_size = SDIV(num_keys, block_size * 16); + auto const grid_size = (num_keys + block_size * 16 - 1) / block_size * 16; thrust::device_vector hash_values((materialize_result) ? num_keys : 1); @@ -98,4 +97,4 @@ NVBENCH_BENCH_TYPES( cuco::murmurhash3_fmix_64>)) .set_name("hash_function_eval") .set_type_axes_names({"Hash"}) - .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); \ No newline at end of file + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh index 896ec753b..73dba3cf3 100644 --- a/include/cuco/detail/common_kernels.cuh +++ b/include/cuco/detail/common_kernels.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include @@ -71,8 +71,8 @@ __global__ void insert_if_n(InputIterator first, __shared__ typename BlockReduce::TempStorage temp_storage; typename Ref::size_type thread_num_successes = 0; - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; while (idx < n) { if (pred(*(stencil + idx))) { @@ -129,8 +129,8 @@ template #include // TODO move to detail/extent/ +#include #include #include @@ -80,8 +81,8 @@ template (static_cast(std::numeric_limits::max()) < max_prime) ? 
std::numeric_limits::max() : static_cast(max_prime); - auto const size = - SDIV(std::max(static_cast(ext), static_cast(1)), CGSize * WindowSize); + auto const size = cuco::detail::int_div_ceil( + std::max(static_cast(ext), static_cast(1)), CGSize * WindowSize); if (size > max_value) { CUCO_FAIL("Invalid input extent"); } if constexpr (N == dynamic_extent) { diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index 80b08dc42..ef4821b40 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -187,13 +187,11 @@ class open_addressing_impl { detail::counter_storage{this->allocator()}; counter.reset(stream); - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); auto const always_true = thrust::constant_iterator{true}; - detail::insert_if_n - <<>>( + detail::insert_if_n + <<>>( first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref); return counter.load_to_host(stream); @@ -218,13 +216,11 @@ class open_addressing_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); auto const always_true = thrust::constant_iterator{true}; - detail::insert_if_n - <<>>( + detail::insert_if_n + <<>>( first, num_keys, always_true, thrust::identity{}, container_ref); } @@ -269,12 +265,10 @@ class open_addressing_impl { detail::counter_storage{this->allocator()}; 
counter.reset(stream); - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::insert_if_n - <<>>( + detail::insert_if_n + <<>>( first, num_keys, stencil, pred, counter.data(), container_ref); return counter.load_to_host(stream); @@ -313,12 +307,10 @@ class open_addressing_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::insert_if_n - <<>>( + detail::insert_if_n + <<>>( first, num_keys, stencil, pred, container_ref); } @@ -346,13 +338,11 @@ class open_addressing_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); auto const always_true = thrust::constant_iterator{true}; - detail::contains_if_n - <<>>( + detail::contains_if_n + <<>>( first, num_keys, always_true, thrust::identity{}, output_begin, container_ref); } @@ -397,12 +387,10 @@ class open_addressing_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::contains_if_n - <<>>( + detail::contains_if_n + 
<<>>( first, num_keys, stencil, pred, output_begin, container_ref); } @@ -489,14 +477,12 @@ class open_addressing_impl { detail::counter_storage{this->allocator()}; counter.reset(stream); - auto const grid_size = - (storage_.num_windows() + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(storage_.num_windows()); // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to // v2.1.0 - detail::size - <<>>( + detail::size + <<>>( storage_.ref(), is_filled, counter.data()); return counter.load_to_host(stream); diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 186a29257..c788fa245 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -20154,7 +20154,7 @@ constexpr T get_valid_capacity(T capacity) noexcept if constexpr (not uses_vector_load) { return cg_size; } }(); - auto const c = SDIV(capacity, stride); + auto const c = int_div_ceil(capacity, stride); auto const min_prime = std::lower_bound(primes.begin(), primes.end(), c); return *min_prime * stride; } diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 73c87eb61..a36095462 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include @@ -50,8 +50,8 @@ namespace detail { template __global__ void insert_or_assign(InputIterator first, cuco::detail::index_type n, Ref ref) { - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; while (idx < 
n) { typename Ref::value_type const insert_pair{*(first + idx)}; @@ -91,11 +91,11 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ { namespace cg = cooperative_groups; - auto const block = cg::this_thread_block(); - auto const thread_idx = block.thread_rank(); + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize]; while (idx - thread_idx < n) { // the whole thread block falls into the same iteration diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 15f6cc5ab..d7274245e 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -176,12 +176,10 @@ void static_map - <<>>( + static_map_ns::detail::insert_or_assign + <<>>( first, num, ref(op::insert_or_assign)); } @@ -288,12 +286,10 @@ void static_map - <<>>( + static_map_ns::detail::find + <<>>( first, num_keys, output_begin, ref(op::find)); } diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 969765e07..4e9570bce 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -286,7 +286,6 @@ OutputIt static_multimap::retrieve( // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = false; auto view = get_device_view(); @@ -295,23 +294,13 @@ OutputIt static_multimap::retrieve( return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); + auto const grid_size = detail::grid_size(num_keys, cg_size()); CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( + detail::retrieve + <<>>( first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( @@ -336,7 +325,6 @@ OutputIt static_multimap::retrieve_ // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? 
(warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = true; auto view = get_device_view(); @@ -345,23 +333,13 @@ OutputIt static_multimap::retrieve_ return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); + auto const grid_size = detail::grid_size(num_keys, cg_size()); CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( + detail::retrieve + <<>>( first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index 004a50b58..72744f2b4 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include @@ -53,11 +53,11 @@ __global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_ { namespace cg = cooperative_groups; - auto const block = cg::this_thread_block(); - auto const thread_idx = block.thread_rank(); + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; - cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; - cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; __shared__ typename Ref::key_type output_buffer[BlockSize / CGSize]; while (idx - thread_idx < n) { // the whole thread block falls into the same iteration diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index bd5239f15..4898f3055 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -17,7 +17,7 @@ #include #include #include -#include +#include 
#include #include #include @@ -227,12 +227,10 @@ void static_set auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto const grid_size = - (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - static_set_ns::detail::find - <<>>( + static_set_ns::detail::find + <<>>( first, num_keys, output_begin, ref(op::find)); } diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl index b4052b2a0..59877b283 100644 --- a/include/cuco/detail/storage/aow_storage.inl +++ b/include/cuco/detail/storage/aow_storage.inl @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -67,11 +67,11 @@ template void aow_storage::initialize(value_type key, cuda_stream_ref stream) noexcept { - auto constexpr stride = 4; - auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto constexpr cg_size = 1; + auto constexpr stride = 4; + auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); - detail::initialize<<>>( + detail::initialize<<>>( this->data(), this->num_windows(), key); } diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh index 546c58daa..2a5868f61 100644 --- a/include/cuco/detail/storage/kernels.cuh +++ b/include/cuco/detail/storage/kernels.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include @@ -37,8 +37,8 @@ __global__ void initialize(WindowT* windows, cuco::detail::index_type n, typename WindowT::value_type value) { - cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; - cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; + auto const loop_stride = cuco::detail::grid_stride(); + auto idx 
= cuco::detail::global_thread_id(); while (idx < n) { auto& window_slots = *(windows + idx); diff --git a/include/cuco/detail/tuning.cuh b/include/cuco/detail/utility/cuda.cuh similarity index 52% rename from include/cuco/detail/tuning.cuh rename to include/cuco/detail/utility/cuda.cuh index 035b60cc5..6e5f13ff7 100644 --- a/include/cuco/detail/tuning.cuh +++ b/include/cuco/detail/utility/cuda.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,30 @@ #pragma once +#include + namespace cuco { -namespace experimental { namespace detail { -static constexpr int CUCO_DEFAULT_BLOCK_SIZE = 128; -static constexpr int CUCO_DEFAULT_STRIDE = 1; +/** + * @brief Returns the global thread index in a 1D scalar grid + * + * @return The global thread index + */ +__device__ static index_type global_thread_id() noexcept +{ + return index_type{threadIdx.x} + index_type{blockDim.x} * index_type{blockIdx.x}; +} + +/** + * @brief Returns the grid stride of a 1D grid + * + * @return The grid stride + */ +__device__ static index_type grid_stride() noexcept +{ + return index_type{gridDim.x} * index_type{blockDim.x}; +} } // namespace detail -} // namespace experimental -} // namespace cuco \ No newline at end of file +} // namespace cuco diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp new file mode 100644 index 000000000..f6a84df98 --- /dev/null +++ b/include/cuco/detail/utility/cuda.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +using index_type = int64_t; ///< CUDA thread index type + +/// Default block size +constexpr int32_t default_block_size() noexcept { return 128; } +/// Default stride +constexpr int32_t default_stride() noexcept { return 1; } + +/** + * @brief Computes the desired 1D grid size with the given parameters + * + * @param num Number of elements to handle in the kernel + * @param cg_size Number of threads per CUDA Cooperative Group + * @param stride Number of elements to be handled by each thread + * @param block_size Number of threads in each thread block + * + * @return The resulting grid size + */ +constexpr auto grid_size(index_type num, + int32_t cg_size = 1, + int32_t stride = default_stride(), + int32_t block_size = default_block_size()) noexcept +{ + return int_div_ceil(cg_size * num, stride * block_size); +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utility/math.hpp b/include/cuco/detail/utility/math.hpp new file mode 100644 index 000000000..47484d6ad --- /dev/null +++ b/include/cuco/detail/utility/math.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Ceiling of an integer division + * + * @tparam T Type of dividend + * @tparam U Type of divisor + * + * @throw If `T` is not an integral type + * @throw If `U` is not an integral type + * + * @param dividend Numerator + * @param divisor Denominator + * + * @return Ceiling of the integer division + */ +template +constexpr T int_div_ceil(T dividend, U divisor) noexcept +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + return (dividend + divisor - 1) / divisor; +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 513ccd559..86c045e3b 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -23,40 +24,6 @@ namespace cuco { namespace detail { -using index_type = int64_t; ///< index type for internal use - -/** - * @brief Compute the number of bits of a simple type. 
- * - * @tparam T The type we want to infer its size in bits - * - * @return Size of type T in bits - */ -template -static constexpr std::size_t type_bits() noexcept -{ - return sizeof(T) * CHAR_BIT; -} - -// safe division -#ifndef SDIV -#define SDIV(x, y) (((x) + (y)-1) / (y)) -#endif - -template -auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0) -{ - int grid_size{-1}; - CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &grid_size, kernel, block_size, dynamic_smem_bytes)); - int dev_id{-1}; - CUCO_CUDA_TRY(cudaGetDevice(&dev_id)); - int num_sms{-1}; - CUCO_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - grid_size *= num_sms; - return grid_size; -} - template constexpr inline index_type distance(Iterator begin, Iterator end) { From 641c8683ec9dcd9d69d82d7018713b67b5d97a8f Mon Sep 17 00:00:00 2001 From: amukkara <134339030+amukkara@users.noreply.github.com> Date: Fri, 8 Sep 2023 16:48:13 -0700 Subject: [PATCH 149/152] Add `dynamic_bitset` (#352) This PR adds `dynamic_bitset` code that will be used in Trie data structure. Trie will be integrated in a separate PR #350. 
Since `dynamic_bitset` is not intended to be part of public-facing API, all files (.cuh and .inl) are located in include/cuco/detail/trie/dynamic_bitset Tests are added in tests/dynamic_bitset --------- Co-authored-by: Yunsong Wang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 375 ++++++++++++++++ .../trie/dynamic_bitset/dynamic_bitset.inl | 404 ++++++++++++++++++ .../detail/trie/dynamic_bitset/kernels.cuh | 240 +++++++++++ tests/CMakeLists.txt | 9 + tests/dynamic_bitset/find_next_test.cu | 73 ++++ tests/dynamic_bitset/get_test.cu | 69 +++ tests/dynamic_bitset/rank_test.cu | 56 +++ tests/dynamic_bitset/select_test.cu | 96 +++++ tests/dynamic_bitset/size_test.cu | 33 ++ 9 files changed, 1355 insertions(+) create mode 100644 include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh create mode 100644 include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl create mode 100644 include/cuco/detail/trie/dynamic_bitset/kernels.cuh create mode 100644 tests/dynamic_bitset/find_next_test.cu create mode 100644 tests/dynamic_bitset/get_test.cu create mode 100644 tests/dynamic_bitset/rank_test.cu create mode 100644 tests/dynamic_bitset/select_test.cu create mode 100644 tests/dynamic_bitset/size_test.cu diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh new file mode 100644 index 000000000..8383669fc --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -0,0 +1,375 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Struct to store ranks of bits at 256-bit intervals (or blocks) + * + * This struct encodes a list of four rank values using base + offset format + * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] + * base uses 40 bits, split between one uint32_t and one uint8_t + * each offset uses 8 bits + */ +struct rank { + uint32_t base_hi_; ///< Upper 32 bits of base + uint8_t base_lo_; ///< Lower 8 bits of base + cuda::std::array offsets_; ///< Offsets for 64-bit sub-intervals, relative to base + + /** + * @brief Gets base rank of current 256-bit interval + * + * @return The base rank + */ + __host__ __device__ constexpr uint64_t base() const noexcept + { + return (static_cast(base_hi_) << CHAR_BIT) | base_lo_; + } + + /** + * @brief Sets base rank of current 256-bit interval + * + * @param base Base rank + */ + __host__ __device__ constexpr void set_base(uint64_t base) noexcept + { + base_hi_ = static_cast(base >> CHAR_BIT); + base_lo_ = static_cast(base); + } +}; + +/** + * @brief Bitset class with rank and select index structures + * + * In addition to standard bitset set/test operations, this class provides + * rank and select operation API. It maintains index structures to make both these + * new operations close to constant time. 
+ * + * Current limitations: + * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage + * - Device ref doesn't support modifiers like `set`, `reset`, etc. + * + * @tparam Allocator Type of allocator used for device storage + */ +// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow +template > +class dynamic_bitset { + public: + using size_type = std::size_t; ///< size type to specify bit index + using word_type = uint64_t; ///< word type + /// Type of the allocator to (de)allocate words + using allocator_type = typename std::allocator_traits::rebind_alloc; + + /// Number of bits per block. Note this is a tradeoff between space efficiency and perf. + static constexpr size_type words_per_block = 4; + /// Number of bits in a word + static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT; + /// Number of bits in a block + static constexpr size_type bits_per_block = words_per_block * bits_per_word; + + /** + * @brief Constructs an empty bitset + * + * @param allocator Allocator used for allocating device storage + */ + constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); + + /** + * @brief Appends the given element `value` to the end of the bitset + * + * This API may involve data reallocation if the current storage is exhausted. + * + * @param value Boolean value of the new bit to be added + */ + constexpr void push_back(bool value) noexcept; + + /** + * @brief Sets the target bit indexed by `index` to a specified `value`. 
+ * + * @param index Position of bit to be modified + * @param value New value of the target bit + */ + constexpr void set(size_type index, bool value) noexcept; + + /** + * @brief Sets the last bit to a specified value + * + * @param value New value of the last bit + */ + constexpr void set_last(bool value) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * boolean value at position `keys_begin[i]` to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param keys_begin Begin iterator to keys list whose values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs of test operation + * @param stream Stream to execute test kernel + */ + template + constexpr void test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total + * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`. 
+ * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose ranks are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs ranks list + * @param stream Stream to execute ranks kernel + */ + template + constexpr void rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * position of `keys_begin[i]`th `1` bit to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose select values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs selects list + * @param stream Stream to execute selects kernel + */ + template + constexpr void select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + using rank_type = cuco::experimental::detail::rank; ///< Rank type + + /** + *@brief Struct to hold all storage refs needed by reference + */ + // TODO: this is not a real ref type, to be changed + struct storage_ref_type { + const word_type* words_ref_; ///< Words ref + + const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits + const size_type* selects_true_ref_; ///< Selects ref for 1 bits + + const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits + const size_type* selects_false_ref_; ///< Selects ref 0 bits + }; + + /** + * @brief Device non-owning reference type of dynamic_bitset + 
*/ + class reference { + public: + /** + * @brief Constructs a reference + * + * @param storage Struct with non-owning refs to bitset storage arrays + */ + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; + + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept; + + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept; + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept; + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; + + /** + * @brief Find position of Nth set (1) bit counting from start + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; + + /** + * @brief Find position of Nth not-set (0) bit counting from start + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept; + + private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which 
corresponds to highest rank less than count (least upper bound) + */ + template + [[nodiscard]] __device__ constexpr size_type initial_rank_estimate( + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @tparam Rank type + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + template + [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept; + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, + word_type word) const noexcept; + + storage_ref_type storage_; ///< Non-owning storage + }; + + using ref_type = reference; ///< Non-owning container ref type + + /** + * @brief Gets non-owning device ref of the current object + * + * @return Device ref of the current `dynamic_bitset` object + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Gets the number of bits dynamic_bitset holds + * + * @return Number of bits dynamic_bitset holds + */ + [[nodiscard]] constexpr size_type size() const noexcept; + + private: + /// Type of the allocator to (de)allocate ranks + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + /// Type of the allocator to (de)allocate indices + using size_allocator_type = typename std::allocator_traits::rebind_alloc; + + allocator_type allocator_; ///< Words allocator + size_type n_bits_; ///< Number of bits dynamic_bitset currently holds + bool is_built_; ///< Flag indicating whether the rank and select indices are built or not + + /// Words vector that represents all bits + thrust::device_vector words_; + /// Rank values for every 256-th bit 
(4-th word) + thrust::device_vector ranks_true_; + /// Same as ranks_ but for `0` bits + thrust::device_vector ranks_false_; + /// Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects_true_; + /// Same as selects_, but for `0` bits + thrust::device_vector selects_false_; + + /** + * @brief Builds indexes for rank and select + * + * @param stream Stream to execute kernels + */ + constexpr void build(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Populates rank and select indexes for true or false bits + * + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for false bits + * @param stream Stream to execute kernels + */ + constexpr void build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream = {}); +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl new file mode 100644 index 000000000..d56ef9d7c --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -0,0 +1,404 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +template +constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) + : allocator_{allocator}, + n_bits_{0}, + is_built_{false}, + words_{allocator}, + ranks_true_{allocator}, + ranks_false_{allocator}, + selects_true_{allocator}, + selects_false_{allocator} +{ +} + +template +constexpr void dynamic_bitset::push_back(bool bit) noexcept +{ + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); +} + +template +constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept +{ + is_built_ = false; + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } +} + +template +constexpr void dynamic_bitset::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} + +template +template +constexpr void dynamic_bitset::test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_test_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_rank_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::select(KeyIt keys_begin, + KeyIt keys_end, + 
OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_select_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +constexpr void dynamic_bitset::build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream) +{ + if (n_bits_ == 0) { return; } + + // Step 1. Compute prefix sum of per-word bit counts + // Population counts for each word + size_type const num_words = words_.size(); + // Sized to have one extra entry for subsequent prefix sum + auto const bit_counts_size = num_words + 1; + + thrust::device_vector bit_counts(num_words + 1, this->allocator_); + auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); + + auto grid_size = cuco::detail::grid_size(num_words); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator_}; + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + // Step 2. 
Compute ranks + auto const num_blocks = (num_words - 1) / words_per_block + 2; + ranks.resize(num_blocks); + + grid_size = cuco::detail::grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( + bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); + + // Step 3. Compute selects + thrust::device_vector select_markers(num_blocks, + this->allocator_); + auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); + + mark_blocks_with_select_entries<<>>( + bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); + + auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + select_markers_begin, + d_sum, + num_blocks, + stream)); + + size_type num_selects{}; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + selects.resize(num_selects); + + auto const select_begin = thrust::raw_pointer_cast(selects.data()); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + 
select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); +} + +template +constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +{ + if (not is_built_) { + build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true, stream); // 0 bits + is_built_ = true; + } +} + +template +constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept +{ + return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), + thrust::raw_pointer_cast(ranks_true_.data()), + thrust::raw_pointer_cast(selects_true_.data()), + thrust::raw_pointer_cast(ranks_false_.data()), + thrust::raw_pointer_cast(selects_false_.data())}}; +} + +template +constexpr dynamic_bitset::size_type dynamic_bitset::size() const noexcept +{ + return n_bits_; +} + +// Device reference implementations + +template +__host__ __device__ constexpr dynamic_bitset::reference::reference( + storage_ref_type storage) noexcept + : storage_{storage} +{ +} + +template +__device__ constexpr bool dynamic_bitset::reference::test(size_type key) const noexcept +{ + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; +} + +template +__device__ constexpr typename dynamic_bitset::word_type +dynamic_bitset::reference::word(size_type word_id) const noexcept +{ + return storage_.words_ref_[word_id]; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::find_next(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + word_type word = storage_.words_ref_[word_id]; + word &= ~(0UL) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic +} + +template +__device__ constexpr typename 
dynamic_bitset::size_type +dynamic_bitset::reference::rank(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type offset_id = word_id % words_per_block; + + auto rank = storage_.ranks_true_ref_[rank_id]; + size_type n = rank.base(); + + if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); + + return n; +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select(size_type count) const noexcept +{ + auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select_false(size_type count) const noexcept +{ + auto rank_id = + initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); + auto rank = storage_.ranks_false_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept +{ + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].base()) { + ++begin; + } + } else { 
// Binary search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].base()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept +{ + count -= rank.base(); + + bool a0 = count >= rank.offsets_[0]; + bool a1 = count >= rank.offsets_[1]; + bool a2 = count >= rank.offsets_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.offsets_[inc - (inc > 0)]; + + return inc; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select_bit_in_word(size_type N, word_type word) const noexcept +{ + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic +} +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh new file mode 100644 index 000000000..c92ab60b2 --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -0,0 +1,240 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/* + * @brief Test bits for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.test(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt 
Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam WordType Word type + * @tparam SizeType Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) +{ + auto word_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } +} + +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. + * Since prefix sum is available, there are no dependencies across blocks. 
+ + * @tparam SizeType Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param ranks Output array of ranks + * @param num_words Length of input array + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + */ +template +__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, + rank* ranks, + SizeType num_words, + SizeType num_blocks, + SizeType words_per_block) +{ + auto rank_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (rank_id < num_blocks) { + SizeType word_id = rank_id * words_per_block; + + // Set base value of rank + auto& rank = ranks[rank_id]; + rank.set_base(prefix_bit_counts[word_id]); + + if (rank_id < num_blocks - 1) { + // For each subsequent word in this block, compute deltas from base + for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) { + auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; + rank.offsets_[block_offset] = delta; + } + } + rank_id += stride; + } +} + +/* + * @brief Compute select values at block size intervals. + * + * selects[i] = Position of (i+ 1)th set bit + * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. 
+ * Such blocks are marked in the output boolean array + * + * @tparam SizeType Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param selects_markers Ouput array indicating whether a block has selects entry or not + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + * @param bits_per_block Number of bits in each block + */ +template +__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, + SizeType* select_markers, + SizeType num_blocks, + SizeType words_per_block, + SizeType bits_per_block) +{ + auto block_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (block_id < num_blocks) { + if (block_id == 0) { // Block 0 always has a selects entry + select_markers[block_id] = 1; + block_id += stride; + continue; + } + + select_markers[block_id] = 0; // Always clear marker first + SizeType word_id = block_id * words_per_block; + SizeType prev_count = prefix_bit_counts[word_id]; + + for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { + SizeType count = prefix_bit_counts[word_id + block_offset]; + + // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block + if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { + select_markers[block_id] = 1; + break; + } + prev_count = count; + } + + block_id += stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d78ec7f49..3deeeddf1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -96,3 +96,12 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - dynamic_bitset tests 
-------------------------------------------------------------------------- +ConfigureTest(DYNAMIC_BITSET_TEST + dynamic_bitset/find_next_test.cu + dynamic_bitset/get_test.cu + dynamic_bitset/rank_test.cu + dynamic_bitset/select_test.cu + dynamic_bitset/size_test.cu) diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu new file mode 100644 index 000000000..97ba366ea --- /dev/null +++ b/tests/dynamic_bitset/find_next_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.find_next(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Find next set test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector device_result(num_elements); + auto ref = bv.ref(); + find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); + + thrust::host_vector host_result = device_result; + size_type num_matches = 0; + + size_type next_set_pos = -1lu; + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + + for (size_type key = 0; key < num_elements; key++) { + num_matches += host_result[key] == next_set_pos; + + if (key == next_set_pos) { + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu new file mode 100644 index 000000000..10f81a116 --- /dev/null +++ b/tests/dynamic_bitset/get_test.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +template +__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.test(index); + index += stride; + } +} + +bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } + +TEST_CASE("Get test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + size_type num_set_ref = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set_ref += modulo_bitgen(i); + } + + // Host-bulk test + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector test_result(num_elements); + thrust::fill(test_result.begin(), test_result.end(), 0); + + bv.test(keys.begin(), keys.end(), test_result.begin()); + + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); + + // Device-ref test + auto ref = bv.ref(); + thrust::fill(test_result.begin(), test_result.end(), 0); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); + + num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); +} diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu new file 
mode 100644 index 000000000..3b4d17cca --- /dev/null +++ b/tests/dynamic_bitset/rank_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Rank test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_ranks(num_elements); + + bv.rank(keys.begin(), keys.end(), d_ranks.begin()); + + thrust::host_vector h_ranks = d_ranks; + + size_type cur_rank = 0; + size_type num_matches = 0; + for (size_type i = 0; i < num_elements; i++) { + num_matches += cur_rank == h_ranks[i]; + if (modulo_bitgen(i)) { cur_rank++; } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu new file mode 100644 index 000000000..3dc0d74da --- /dev/null +++ b/tests/dynamic_bitset/select_test.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.select_false(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Select test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + size_type num_set = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set += modulo_bitgen(i); + } + + // Check select + { + thrust::device_vector keys(num_set); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_selects(num_set); + + bv.select(keys.begin(), keys.end(), d_selects.begin()); + + thrust::host_vector h_selects = d_selects; + + size_type num_matches = 0; + size_type cur_set_pos = -1lu; + for (size_type i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == h_selects[i]; + } + REQUIRE(num_matches == num_set); + } + + // Check select_false + { + size_type num_not_set = num_elements - num_set; + + auto ref = bv.ref(); + thrust::device_vector device_result(num_not_set); + select_false_kernel<<<1, 1024>>>(ref, num_not_set, 
device_result.data()); + thrust::host_vector host_result = device_result; + + size_type num_matches = 0; + size_type cur_not_set_pos = -1lu; + for (size_type i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); + } +} diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu new file mode 100644 index 000000000..611159dc3 --- /dev/null +++ b/tests/dynamic_bitset/size_test.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Size computation", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern + } + + auto size = bv.size(); + REQUIRE(size == num_elements); +} From dcd5a99da5e5b98b0af13476428e5a69013c9d93 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 14 Sep 2023 13:32:56 -0700 Subject: [PATCH 150/152] Rename `insert_pair` as `insert_element` in common kernels (#367) A minor cleanup to rename variables in detail implementations. 
--- include/cuco/detail/common_kernels.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh index 73dba3cf3..759041bad 100644 --- a/include/cuco/detail/common_kernels.cuh +++ b/include/cuco/detail/common_kernels.cuh @@ -76,13 +76,13 @@ __global__ void insert_if_n(InputIterator first, while (idx < n) { if (pred(*(stencil + idx))) { - typename Ref::value_type const insert_pair{*(first + idx)}; + typename Ref::value_type const insert_element{*(first + idx)}; if constexpr (CGSize == 1) { - if (ref.insert(insert_pair)) { thread_num_successes++; }; + if (ref.insert(insert_element)) { thread_num_successes++; }; } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + if (ref.insert(tile, insert_element) && tile.thread_rank() == 0) { thread_num_successes++; } } } idx += loop_stride; @@ -134,13 +134,13 @@ __global__ void insert_if_n( while (idx < n) { if (pred(*(stencil + idx))) { - typename Ref::value_type const insert_pair{*(first + idx)}; + typename Ref::value_type const insert_element{*(first + idx)}; if constexpr (CGSize == 1) { - ref.insert(insert_pair); + ref.insert(insert_element); } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - ref.insert(tile, insert_pair); + ref.insert(tile, insert_element); } } idx += loop_stride; From 0cd4da08be0289b20306ec44a68044668730c0a9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 15 Sep 2023 10:37:40 -0700 Subject: [PATCH 151/152] Clean up ref implementations with `has_payload` flag (#368) #356 introduces the `HasPayload` template boolean to distinguish code paths between map and set implementations thus the key input for base ref insert functions becomes redundant. 
This PR cleans up the base ref implementations by removing the key input and fixes a logical issue in #356: set doesn't have payload while map has. --- .../cuco/detail/open_addressing_ref_impl.cuh | 55 +++++++++++++------ .../cuco/detail/static_map/static_map_ref.inl | 16 +++--- .../cuco/detail/static_set/static_set_ref.inl | 16 +++--- 3 files changed, 54 insertions(+), 33 deletions(-) diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 46ef2bfd7..213d35af1 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -159,18 +159,23 @@ class open_addressing_ref_impl { * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * - * @param key Key of the element to insert * @param value The element to insert * @param predicate Predicate used to compare slot content against `key` * * @return True if the given element is successfully inserted */ template - __device__ bool insert(key_type const& key, - value_type const& value, - Predicate const& predicate) noexcept + __device__ bool insert(value_type const& value, Predicate const& predicate) noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); while (true) { @@ -202,7 +207,6 @@ class open_addressing_ref_impl { * @tparam Predicate Predicate type * * @param group The Cooperative Group used to perform group insert - * @param key Key of the element to insert * @param value The element to insert * @param predicate Predicate used to compare slot content against `key` * @@ -210,10 +214,16 @@ class open_addressing_ref_impl { */ template __device__ bool insert(cooperative_groups::thread_block_tile const& group, - 
key_type const& key, value_type const& value, Predicate const& predicate) noexcept { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { @@ -269,7 +279,6 @@ class open_addressing_ref_impl { * @tparam HasPayload Boolean indicating it's a set or map implementation * @tparam Predicate Predicate type * - * @param key Key of the element to insert * @param value The element to insert * @param predicate Predicate used to compare slot content against `key` * @@ -277,11 +286,18 @@ class open_addressing_ref_impl { * insertion is successful or not. */ template - __device__ thrust::pair insert_and_find(key_type const& key, - value_type const& value, + __device__ thrust::pair insert_and_find(value_type const& value, Predicate const& predicate) noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); while (true) { @@ -326,7 +342,6 @@ class open_addressing_ref_impl { * @tparam Predicate Predicate type * * @param group The Cooperative Group used to perform group insert_and_find - * @param key Key of the element to insert * @param value The element to insert * @param predicate Predicate used to compare slot content against `key` * @@ -336,10 +351,16 @@ class open_addressing_ref_impl { template __device__ thrust::pair insert_and_find( cooperative_groups::thread_block_tile const& group, - key_type const& key, value_type const& value, Predicate const& predicate) noexcept { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); while (true) { @@ -710,11 
+731,11 @@ class open_addressing_ref_impl { auto* old_ptr = reinterpret_cast(&old); auto const inserted = [&]() { if constexpr (HasPayload) { - // If it's a set implementation, compare the whole slot content - return cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_); - } else { // If it's a map implementation, compare keys only return cuco::detail::bitwise_compare(old_ptr->first, this->empty_slot_sentinel_.first); + } else { + // If it's a set implementation, compare the whole slot content + return cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_); } }(); if (inserted) { @@ -723,11 +744,11 @@ class open_addressing_ref_impl { // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare auto const res = [&]() { if constexpr (HasPayload) { - // If it's a set implementation, compare the whole slot content - return predicate.equal_to(*old_ptr, value); - } else { // If it's a map implementation, compare keys only return predicate.equal_to(old_ptr->first, value.first); + } else { + // If it's a set implementation, compare the whole slot content + return predicate.equal_to(*old_ptr, value); } }(); return res == detail::equal_result::EQUAL ? 
insert_result::DUPLICATE diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 13fc2ce47..250c84feb 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -210,8 +210,8 @@ class operator_impl< __device__ bool insert(value_type const& value) noexcept { ref_type& ref_ = static_cast(*this); - auto constexpr has_payload = false; - return ref_.impl_.insert(value.first, value, ref_.predicate_); + auto constexpr has_payload = true; + return ref_.impl_.insert(value, ref_.predicate_); } /** @@ -225,8 +225,8 @@ class operator_impl< value_type const& value) noexcept { auto& ref_ = static_cast(*this); - auto constexpr has_payload = false; - return ref_.impl_.insert(group, value.first, value, ref_.predicate_); + auto constexpr has_payload = true; + return ref_.impl_.insert(group, value, ref_.predicate_); } }; @@ -454,8 +454,8 @@ class operator_impl< __device__ thrust::pair insert_and_find(value_type const& value) noexcept { ref_type& ref_ = static_cast(*this); - auto constexpr has_payload = false; - return ref_.impl_.insert_and_find(value.first, value, ref_.predicate_); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(value, ref_.predicate_); } /** @@ -475,8 +475,8 @@ class operator_impl< cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept { ref_type& ref_ = static_cast(*this); - auto constexpr has_payload = false; - return ref_.impl_.insert_and_find(group, value.first, value, ref_.predicate_); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); } }; diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index 3131f3764..2bb7f0c6f 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -101,8 +101,8 @@ class 
operator_impl(*this); - auto constexpr has_payload = true; - return ref_.impl_.insert(value, value, ref_.predicate_); + auto constexpr has_payload = false; + return ref_.impl_.insert(value, ref_.predicate_); } /** @@ -117,8 +117,8 @@ class operator_impl(*this); - auto constexpr has_payload = true; - return ref_.impl_.insert(group, value, value, ref_.predicate_); + auto constexpr has_payload = false; + return ref_.impl_.insert(group, value, ref_.predicate_); } }; @@ -182,8 +182,8 @@ class operator_impl insert_and_find(value_type const& value) noexcept { ref_type& ref_ = static_cast(*this); - auto constexpr has_payload = true; - return ref_.impl_.insert_and_find(value, value, ref_.predicate_); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(value, ref_.predicate_); } /** @@ -203,8 +203,8 @@ class operator_impl const& group, value_type const& value) noexcept { ref_type& ref_ = static_cast(*this); - auto constexpr has_payload = true; - return ref_.impl_.insert_and_find(group, value, value, ref_.predicate_); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); } }; From 359f5ae67e93b69a8df35ebd1d12f746aac8916e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 26 Sep 2023 13:13:44 -0700 Subject: [PATCH 152/152] Add device subsets example (#346) Depends on #349 This PR adds an example demonstrating how to create multiple subsets with one single storage. It includes necessary changes and cleanups that will unblock orc/parquet dictionary encoding (https://github.com/rapidsai/cudf/issues/12261) to use the new map/set data structures. 
--------- Co-authored-by: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> --- examples/CMakeLists.txt | 1 + examples/static_set/device_ref_example.cu | 16 +- examples/static_set/device_subsets_example.cu | 183 ++++++++++++++++++ include/cuco/aow_storage.cuh | 23 ++- include/cuco/detail/extent/extent.inl | 35 ++-- include/cuco/detail/open_addressing_impl.cuh | 8 +- .../cuco/detail/open_addressing_ref_impl.cuh | 10 +- .../cuco/detail/static_map/static_map_ref.inl | 39 ++++ .../cuco/detail/static_set/static_set_ref.inl | 34 ++++ include/cuco/detail/storage/aow_storage.inl | 8 + include/cuco/detail/storage/storage.cuh | 1 + include/cuco/extent.cuh | 16 +- include/cuco/static_map_ref.cuh | 42 ++++ include/cuco/static_set_ref.cuh | 41 ++++ include/cuco/storage.cuh | 1 + 15 files changed, 403 insertions(+), 55 deletions(-) create mode 100644 examples/static_set/device_subsets_example.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d78627eee..91e1417aa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -35,6 +35,7 @@ endfunction(ConfigureExample) ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu index 136292f6b..52e41cf45 100644 --- a/examples/static_set/device_ref_example.cu +++ 
b/examples/static_set/device_ref_example.cu @@ -26,6 +26,14 @@ #include #include +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. + */ + // insert a set of keys into a hash set using one cooperative group for each task template __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O } } -/** - * @file device_reference_example.cu - * @brief Demonstrates usage of the static_set device-side APIs. - * - * static_set provides a non-owning reference which can be used to interact with - * the container from within device code. - * - */ int main(void) { using Key = int; diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu new file mode 100644 index 000000000..827342f95 --- /dev/null +++ b/examples/static_set/device_subsets_example.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +/** + * @file device_subsets_example.cu + * @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform + * individual operations via device-side ref APIs. + * + * To optimize memory usage, especially when dealing with expensive data allocation and multiple + * hashsets, a practical solution involves employing a single bulk storage for generating subsets. + * This eliminates the need for separate memory allocation and deallocation for each container. This + * can be achieved by using the lightweight non-owning ref type. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + */ + +auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr N = 10; ///< Number of elements to insert and query + +using key_type = int; ///< Key type +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cuco::default_hash_function>; ///< Type controls CG granularity and probing scheme + ///< (linear probing v.s. double hashing) +/// Type of bulk allocation storage +using storage_type = cuco::experimental::aow_storage; +/// Lightweight non-owning storage ref type +using storage_ref_type = typename storage_type::ref_type; +using ref_type = cuco::experimental::static_set_ref, + probing_scheme_type, + storage_ref_type>; ///< Set ref type + +/// Sample data to insert and query +__device__ constexpr std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +/// Empty slots are represented by reserved "sentinel" values. These values should be selected such +/// that they never occur in your input data. 
+key_type constexpr empty_key_sentinel = -1; + +/** + * @brief Inserts sample data into subsets by using cooperative group + * + * Each Cooperative Group creates its own subset and inserts `N` sample data. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void insert(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Get subset (or CG) index + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert); + + // Insert `N` elemtns into the set with CG insert + for (int i = 0; i < N; i++) { + insert_set_ref.insert(tile, data[i]); + } +} + +/** + * @brief All inserted data can be found + * + * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and + * verifies all inserted data can be found. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void find(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find); + + // Result denoting if any of the inserted data is not found + __shared__ int result; + if (threadIdx.x == 0) { result = 0; } + __syncthreads(); + + for (int i = 0; i < N; i++) { + // Query the set with inserted data + auto const found = find_set_ref.find(tile, data[i]); + // Record if the inserted data has been found + atomicOr(&result, *found != data[i]); + } + __syncthreads(); + + if (threadIdx.x == 0) { + // If the result is still 0, all inserted data are found. + if (result == 0) { printf("Success! 
Found all inserted elements.\n"); } + } +} + +int main() +{ + // Number of subsets to be created + auto constexpr num = 16; + // Each subset may have a different requested size + auto constexpr subset_sizes = + std::array{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50}; + + auto valid_sizes = std::vector(); + valid_sizes.reserve(num); + + for (size_t i = 0; i < num; ++i) { + valid_sizes.emplace_back( + static_cast(cuco::experimental::make_window_extent(subset_sizes[i]))); + } + + std::vector offsets(num + 1, 0); + + // prefix sum to compute offsets and total number of windows + std::size_t current_sum = 0; + for (std::size_t i = 0; i < valid_sizes.size(); ++i) { + current_sum += valid_sizes[i]; + offsets[i + 1] = current_sum; + } + + // total number of windows is located at the back of the offsets array + auto const total_num_windows = offsets.back(); + + // Create a single bulk storage used by all subsets + auto set_storage = storage_type{total_num_windows}; + // Initializes the storage with the given sentinel + set_storage.initialize(empty_key_sentinel); + + std::vector set_refs; + + // create subsets + for (std::size_t i = 0; i < num; ++i) { + storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]}; + set_refs.emplace_back( + ref_type{cuco::empty_key{empty_key_sentinel}, {}, {}, storage_ref}); + } + + thrust::device_vector d_set_refs(set_refs); + + // Insert sample data + insert<<<1, 128>>>(d_set_refs.data().get()); + // Find all inserted data + find<<<1, 128>>>(d_set_refs.data().get()); + + return 0; +} diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh index fdd970cf4..479246fac 100644 --- a/include/cuco/aow_storage.cuh +++ b/include/cuco/aow_storage.cuh @@ -16,10 +16,10 @@ #pragma once -#include - #include +#include #include +#include #include @@ -47,7 +47,10 @@ class aow_storage_ref; * @tparam Extent Type of extent denoting number of windows * @tparam Allocator Type of allocator used for device 
storage (de)allocation */ -template +template , + typename Allocator = cuco::cuda_allocator>> class aow_storage : public detail::aow_storage_base { public: using base_type = detail::aow_storage_base; ///< AoW base class type @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { * @param size Number of windows to (de)allocate * @param allocator Allocator used for (de)allocating device storage */ - explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept; + explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; aow_storage(aow_storage&&) = default; ///< Move constructor /** @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize(value_type key, cuda_stream_ref stream) noexcept; + void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. 
+ * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; private: allocator_type allocator_; ///< Allocator used to (de)allocate windows @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base { * @tparam WindowSize Number of slots in each window * @tparam Extent Type of extent denoting storage capacity */ -template +template > class aow_storage_ref : public detail::aow_storage_base { public: using base_type = detail::aow_storage_base; ///< AoW base class type diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl index 911bda9b1..a7cd83dcd 100644 --- a/include/cuco/detail/extent/extent.inl +++ b/include/cuco/detail/extent/extent.inl @@ -27,13 +27,10 @@ namespace cuco { namespace experimental { -template +template struct window_extent { using value_type = SizeType; ///< Extent value type - static auto constexpr cg_size = CGSize; - static auto constexpr window_size = WindowSize; - __host__ __device__ constexpr value_type value() const noexcept { return N; } __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); } @@ -45,15 +42,11 @@ struct window_extent { friend auto constexpr make_window_extent(extent ext); }; -template -struct window_extent - : cuco::utility::fast_int { +template +struct window_extent : cuco::utility::fast_int { using value_type = typename cuco::utility::fast_int::fast_int::value_type; ///< Extent value type - static auto constexpr cg_size = CGSize; - static auto constexpr window_size = WindowSize; - private: using cuco::utility::fast_int::fast_int; @@ -67,10 +60,10 @@ template return make_window_extent(ext); } -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) { - return make_window_extent(size); + return 
make_window_extent(extent{size}); } template @@ -86,15 +79,13 @@ template if (size > max_value) { CUCO_FAIL("Invalid input extent"); } if constexpr (N == dynamic_extent) { - return window_extent{static_cast( + return window_extent{static_cast( *cuco::detail::lower_bound( cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * CGSize)}; } if constexpr (N != dynamic_extent) { - return window_extent( *cuco::detail::lower_bound(cuco::detail::primes.begin(), cuco::detail::primes.end(), @@ -103,10 +94,10 @@ template } } -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) { - return static_cast(make_window_extent(extent{size})); + return make_window_extent(extent{size}); } namespace detail { @@ -115,8 +106,8 @@ template struct is_window_extent : std::false_type { }; -template -struct is_window_extent> : std::true_type { +template +struct is_window_extent> : std::true_type { }; template diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index ef4821b40..2bc3a7225 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -141,11 +141,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream) noexcept - { - this->clear_async(stream); - stream.synchronize(); - } + void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } /** * @brief Asynchronously erases all elements from the container. 
After this call, `size()` returns @@ -155,7 +151,7 @@ class open_addressing_impl { */ void clear_async(cuda_stream_ref stream) noexcept { - storage_.initialize(empty_slot_sentinel_, stream); + storage_.initialize_async(empty_slot_sentinel_, stream); } /** diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 213d35af1..cce691c21 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -87,12 +88,9 @@ class open_addressing_ref_impl { ProbingScheme>, "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); - static_assert(is_window_extent_v, - "Extent is not a valid cuco::window_extent"); - static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size, - "Extent has incompatible CG size"); - static_assert(StorageRef::window_size == StorageRef::extent_type::window_size, - "Extent has incompatible window size"); + // TODO: how to re-enable this check? + // static_assert(is_window_extent_v, + // "Extent is not a valid cuco::window_extent"); public: using key_type = Key; ///< Key type diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 250c84feb..28b3ffaf2 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -50,6 +50,30 @@ __host__ __device__ constexpr static_map_ref< { } +template +template +__host__ __device__ constexpr static_map_ref:: + static_map_ref( + static_map_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, + predicate_{std::move(other.predicate_)}, + empty_value_sentinel_{std::move(other.empty_value_sentinel_)} +{ +} + template return empty_value_sentinel_; } +template +template +auto static_map_ref::with( + NewOperators...) 
&& noexcept +{ + return static_map_ref( + std::move(*this)); +} + template +template +__host__ __device__ constexpr static_set_ref:: + static_set_ref( + static_set_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, predicate_{std::move(other.predicate_)} +{ +} + template ::e return predicate_.empty_sentinel_; } +template +template +auto static_set_ref::with( + NewOperators...) && noexcept +{ + return static_set_ref( + std::move(*this)); +} + namespace detail { template ::ref() const noexcept template void aow_storage::initialize(value_type key, cuda_stream_ref stream) noexcept +{ + this->initialize_async(key, stream); + stream.synchronize(); +} + +template +void aow_storage::initialize_async( + value_type key, cuda_stream_ref stream) noexcept { auto constexpr cg_size = 1; auto constexpr stride = 4; diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh index b9a00baa2..4dda179c9 100644 --- a/include/cuco/detail/storage/storage.cuh +++ b/include/cuco/detail/storage/storage.cuh @@ -45,6 +45,7 @@ class storage : StorageImpl::template impl { using impl_type::capacity; using impl_type::data; using impl_type::initialize; + using impl_type::initialize_async; using impl_type::num_windows; using impl_type::ref; diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh index e45068d9e..50e7ae4aa 100644 --- a/include/cuco/extent.cuh +++ b/include/cuco/extent.cuh @@ -83,7 +83,7 @@ struct extent { * @tparam N Extent * */ -template +template struct window_extent; /** @@ -118,15 +118,16 @@ template * the capacity ctor argument for the container. 
* * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type * * @param size The input size * * @throw If the input size is invalid * - * @return Resulting valid extent as `std::size_t` + * @return Resulting valid extent */ -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); /** * @brief Computes valid window extent based on given parameters. @@ -162,15 +163,16 @@ template * * @tparam CGSize Number of elements handled per CG * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type * * @param size The input size * * @throw If the input size is invalid * - * @return Resulting valid extent as `std::size_t` + * @return Resulting valid extent */ -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); } // namespace experimental } // namespace cuco diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 2460f1f10..c41ed88f3 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -17,8 +17,11 @@ #pragma once #include +#include #include +#include #include +#include #include @@ -106,6 +109,18 @@ class static_map_ref probing_scheme_type const& probing_scheme, storage_ref_type storage_ref) noexcept; + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_map_ref( + static_map_ref&& + other) noexcept; + /** * @brief Gets the maximum number of elements the container can hold. 
* @@ -127,6 +142,23 @@ class static_map_ref */ [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + private: struct predicate_wrapper; @@ -137,6 +169,16 @@ class static_map_ref // Mixins need to be friends with this class in order to access private members template friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_map_ref; }; } // namespace experimental diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index cf9c00ee0..b2c8158e7 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -18,8 +18,11 @@ #include #include +#include #include +#include #include +#include #include @@ -94,6 +97,18 @@ class static_set_ref probing_scheme_type const& probing_scheme, storage_ref_type storage_ref) noexcept; + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_set_ref( + static_set_ref&& + other) noexcept; + /** * @brief Gets the maximum number of elements the container can hold. 
* @@ -108,6 +123,23 @@ class static_set_ref */ [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + private: impl_type impl_; detail::equal_wrapper predicate_; ///< Key equality binary callable @@ -115,6 +147,15 @@ class static_set_ref // Mixins need to be friends with this class in order to access private members template friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_set_ref; }; } // namespace experimental diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh index e34e59c96..e2e0c6f46 100644 --- a/include/cuco/storage.cuh +++ b/include/cuco/storage.cuh @@ -20,6 +20,7 @@ namespace cuco { namespace experimental { + /** * @brief Public storage class. *