From 42ab5a2fcee6f563c25b39f298e0b5296f72fb1e Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 16 Jul 2024 22:46:21 +0000 Subject: [PATCH 01/10] Add insert_or_apply shared_memory implementation --- include/cuco/detail/static_map/kernels.cuh | 116 ++++++++++++++++++ include/cuco/detail/static_map/static_map.inl | 58 ++++++++- tests/static_map/insert_or_apply_test.cu | 15 ++- 3 files changed, 178 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index f47c86cdf..9500d3087 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -108,4 +108,120 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op } } +template +CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( + InputIt first, + cuco::detail::index_type n, + Op op, + Ref ref, + typename SharedMapRefType::extent_type window_extent) +{ + namespace cg = cooperative_groups; + using Key = typename Ref::key_type; + using Value = typename Ref::mapped_type; + using value_type = typename std::iterator_traits::value_type; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + // Shared map initialization + __shared__ typename SharedMapRefType::window_type windows[window_extent.value()]; + auto storage = SharedMapRefType::storage_ref_type(window_extent, windows); + auto const num_windows = storage.num_windows(); + + // BlockReduce to find cardinality + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ int32_t block_cardinality; + + if (thread_idx == 0) block_cardinality = 0; + block.sync(); + + auto shared_map = SharedMapRefType(cuco::empty_key(ref.empty_key_sentinel()), + cuco::empty_value(ref.empty_value_sentinel()), + {}, + {}, + {}, + storage); + auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply); + shared_map_ref.initialize(block); + block.sync(); + + int32_t num_loop = 0; + while (idx - thread_idx < n) { + if constexpr (CGSize == 1) { + // insert-or-apply into the shared map first + if (idx < n) { + value_type const& insert_pair = *(first + idx); + shared_map_ref.insert_or_apply(insert_pair, op); + } + block.sync(); + + // for first pass block_cardinality will be < threshold + // we can skip block_reduction to find cardinality + if (num_loop != 0) { + // find if cardinality exceeds threshold + int32_t thread_count = 0; + if (idx < n) { + auto window_idx = thread_idx; + while (window_idx < num_windows) { + auto const slot = storage[window_idx][0]; + if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) { + thread_count += 1; + } + window_idx += BlockSize; + } + } + + int32_t local_cardinality = BlockReduce(temp_storage).Sum(thread_count); + if (thread_idx == 0) block_cardinality = local_cardinality; + block.sync(); + + if (idx < n) { + if (block_cardinality > BlockSize) { break; } + } + } + } else { + auto const tile = cg::tiled_partition(block); + if (idx < n) { + value_type const& insert_pair = *(first + idx); + ref.insert_or_apply(tile, insert_pair, op); + } + } + idx += loop_stride; + num_loop += 1; + } + + if constexpr (CGSize == 1) { + // write from shared_map to global_map + auto window_idx = thread_idx; + while (window_idx < num_windows) { + auto const slot = storage[window_idx][0]; + if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) { + // TODO use insert ? + ref.insert_or_apply(slot, op); + } + window_idx += BlockSize; + } + + // insert-or-apply into global map for the remaining elements whose block_cardinality + // exceeds the cardinality threshold. + if (block_cardinality > BlockSize) { + idx += loop_stride; + while (idx < n) { + value_type const& insert_pair = *(first + idx); + ref.insert_or_apply(insert_pair, op); + idx += loop_stride; + } + } + } +} + } // namespace cuco::static_map_ns::detail \ No newline at end of file diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index e2f7aae71..2f5daa89d 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -17,7 +17,9 @@ #include #include #include +#include #include +#include #include #include @@ -318,11 +320,57 @@ void static_map - <<>>( - first, num, op, ref(op::insert_or_apply)); + auto cardinality_estimator = cuco::distinct_count_estimator{}; + auto keys_begin = thrust::make_transform_iterator( + first, cuda::proclaim_return_type([] __device__(auto const& input_pair) { + return input_pair.first; + })); + + auto constexpr cardinality_threshold = cuco::detail::default_block_size(); + + cardinality_estimator.add(keys_begin, keys_begin + num); + auto const cardinality = cardinality_estimator.estimate(); + + int32_t const grid_size = cuco::detail::grid_size(num, cg_size); + + if (cardinality > cardinality_threshold) { + static_map_ns::detail::insert_or_apply + <<>>( + first, num, op, ref(op::insert_or_apply)); + } else { + constexpr int32_t block_size = cuco::detail::default_block_size(); + constexpr int32_t cardinality_threshold = 128; + + constexpr int32_t shared_map_num_elements = cardinality_threshold + block_size; + constexpr float load_factor = 0.7; + constexpr int32_t shared_map_size = (1.0 / load_factor) * shared_map_num_elements; + + using extent_type = cuco::extent; + using shared_map_type = cuco::static_map>; + using shared_map_ref_type = typename shared_map_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + using ref_type = decltype(ref(op::insert_or_apply)); + + auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail:: + insert_or_apply_shmem; + + int32_t const max_op_grid_size = + cuco::detail::max_occupancy_grid_size(block_size, insert_or_apply_shmem_fn_ptr, 0); + + auto const shmem_grid_size = std::min(grid_size, max_op_grid_size); + + static_map_ns::detail::insert_or_apply_shmem + <<>>( + first, num, op, ref(op::insert_or_apply), window_extent); + } } template + __device__ void operator()(cuda::atomic_ref lhs, T rhs) + { + lhs.fetch_add(rhs, cuda::memory_order_relaxed); + } +}; + template void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) { @@ -48,12 +56,7 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key return cuco::pair{i % num_unique_keys, 1}; })); - map.insert_or_apply( - pairs_begin, - pairs_begin + num_keys, - [] __device__(cuda::atomic_ref lhs, const Value& rhs) { - lhs.fetch_add(rhs, cuda::memory_order_relaxed); - }); + map.insert_or_apply(pairs_begin, pairs_begin + num_keys, binary_plus_op{}); REQUIRE(map.size() == num_unique_keys); From b7f0ac792c2fe999c8573b599b48c533fcf72567 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 17 Jul 2024 03:21:33 +0000 Subject: [PATCH 02/10] add stream parameter to cardinality estimator --- include/cuco/detail/static_map/static_map.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 2f5daa89d..9e26a91b2 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -328,8 +328,8 @@ void static_map Date: Wed, 17 Jul 2024 20:42:49 +0000 Subject: [PATCH 03/10] update insert_or_apply device API to return bool --- .../cuco/detail/static_map/static_map_ref.inl | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 75abc38e4..1441ecc9f 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -577,7 +577,7 @@ class operator_impl< */ template - __device__ void insert_or_apply(Value const& value, Op op) + __device__ bool insert_or_apply(Value const& value, Op op) { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); @@ -609,17 +609,17 @@ class operator_impl< ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); } op(cuda::atomic_ref{slot_ptr->second}, val.second); - return; + return false; } if (eq_res == detail::equal_result::AVAILABLE) { switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) { - case insert_result::SUCCESS: return; + case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { if constexpr (sizeof(value_type) > 8) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); } op(cuda::atomic_ref{slot_ptr->second}, val.second); - return; + return false; } default: continue; } @@ -630,12 +630,13 @@ class operator_impl< } template - __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) + __device__ bool insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(value, [](cuda::atomic_ref payload_ref, T const& payload) { - payload_ref.fetch_add(payload, cuda::memory_order_relaxed); - }); + return ref_.insert_or_apply(value, + [](cuda::atomic_ref payload_ref, T const& payload) { + payload_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); } /** @@ -654,7 +655,7 @@ class operator_impl< */ template - __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, + __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, Op op) { @@ -697,7 +698,7 @@ class operator_impl< } op(cuda::atomic_ref{slot_ptr->second}, val.second); } - return; + return false; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); @@ -709,7 +710,7 @@ class operator_impl< }(); switch (group.shfl(status, src_lane)) { - case insert_result::SUCCESS: return; + case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { if constexpr (sizeof(value_type) > 8) { @@ -717,7 +718,7 @@ class operator_impl< } op(cuda::atomic_ref{slot_ptr->second}, val.second); } - return; + return false; } default: continue; } @@ -728,12 +729,12 @@ class operator_impl< } template - __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, + __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply( + return ref_.insert_or_apply( group, value, [](cuda::atomic_ref payload_ref, T const& payload) { payload_ref.fetch_add(payload, cuda::memory_order_relaxed); }); From 5d475955c0eed31542539efa7b095bc4614cde0b Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 17 Jul 2024 20:43:26 +0000 Subject: [PATCH 04/10] use warp shuffles to find block_cardinality --- include/cuco/detail/static_map/kernels.cuh | 45 ++++++++-------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 9500d3087..2920ac56f 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -131,19 +131,20 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; + auto warp = cg::tiled_partition<32>(block); + auto const warp_thread_idx = warp.thread_rank(); + // Shared map initialization __shared__ typename SharedMapRefType::window_type windows[window_extent.value()]; auto storage = SharedMapRefType::storage_ref_type(window_extent, windows); auto const num_windows = storage.num_windows(); - // BlockReduce to find cardinality - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ int32_t block_cardinality; - if (thread_idx == 0) block_cardinality = 0; block.sync(); + cuda::atomic_ref cardinality_counter(block_cardinality); + auto shared_map = SharedMapRefType(cuco::empty_key(ref.empty_key_sentinel()), cuco::empty_value(ref.empty_value_sentinel()), {}, @@ -154,40 +155,25 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( shared_map_ref.initialize(block); block.sync(); - int32_t num_loop = 0; while (idx - thread_idx < n) { if constexpr (CGSize == 1) { // insert-or-apply into the shared map first + int32_t inserted = 0; if (idx < n) { value_type const& insert_pair = *(first + idx); - shared_map_ref.insert_or_apply(insert_pair, op); + inserted = shared_map_ref.insert_or_apply(insert_pair, op); } - block.sync(); - // for first pass block_cardinality will be < threshold - // we can skip block_reduction to find cardinality - if (num_loop != 0) { - // find if cardinality exceeds threshold - int32_t thread_count = 0; - if (idx < n) { - auto window_idx = thread_idx; - while (window_idx < num_windows) { - auto const slot = storage[window_idx][0]; - if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) { - thread_count += 1; - } - window_idx += BlockSize; - } - } - - int32_t local_cardinality = BlockReduce(temp_storage).Sum(thread_count); - if (thread_idx == 0) block_cardinality = local_cardinality; - block.sync(); - - if (idx < n) { - if (block_cardinality > BlockSize) { break; } + if (idx - warp_thread_idx < n) // all threads in warp particpate + { + warp.sync(); // sync for inserted to materalize + for (int32_t i = warp.size() / 2; i > 0; i /= 2) { + inserted += warp.shfl_down(inserted, i); } } + if (warp_thread_idx == 0) cardinality_counter.fetch_add(inserted, cuda::memory_order_relaxed); + block.sync(); + if (block_cardinality > BlockSize) break; } else { auto const tile = cg::tiled_partition(block); if (idx < n) { @@ -196,7 +182,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( } } idx += loop_stride; - num_loop += 1; } if constexpr (CGSize == 1) { From 303975392f2b2b77f31ecd299c7df7580153225d Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 19 Jul 2024 14:25:51 +0000 Subject: [PATCH 05/10] optimizations in shmem kernel --- include/cuco/detail/static_map/kernels.cuh | 30 ++++++++++------------ 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 2920ac56f..2429e3000 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -22,6 +22,7 @@ #include #include +#include #include @@ -110,10 +111,10 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op template + class SharedMapRefType, + class InputIt, + class Op, + class Ref> CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( InputIt first, cuco::detail::index_type n, @@ -157,23 +158,21 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( while (idx - thread_idx < n) { if constexpr (CGSize == 1) { + int32_t inserted = 0; + int32_t local_cardinality = 0; // insert-or-apply into the shared map first - int32_t inserted = 0; if (idx < n) { value_type const& insert_pair = *(first + idx); inserted = shared_map_ref.insert_or_apply(insert_pair, op); } - - if (idx - warp_thread_idx < n) // all threads in warp particpate - { - warp.sync(); // sync for inserted to materalize - for (int32_t i = warp.size() / 2; i > 0; i /= 2) { - inserted += warp.shfl_down(inserted, i); - } + if (idx - warp_thread_idx < n) { // all threads in warp particpate + local_cardinality = cg::reduce(warp, inserted, cg::plus()); + } + if (warp_thread_idx == 0) { + cardinality_counter.fetch_add(local_cardinality, cuda::memory_order_relaxed); } - if (warp_thread_idx == 0) cardinality_counter.fetch_add(inserted, cuda::memory_order_relaxed); block.sync(); - if (block_cardinality > BlockSize) break; + if (block_cardinality > BlockSize) { break; } } else { auto const tile = cg::tiled_partition(block); if (idx < n) { @@ -185,12 +184,11 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( } if constexpr (CGSize == 1) { - // write from shared_map to global_map + // insert-or-apply from shared map to global map auto window_idx = thread_idx; while (window_idx < num_windows) { auto const slot = storage[window_idx][0]; if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) { - // TODO use insert ? ref.insert_or_apply(slot, op); } window_idx += BlockSize; From 89689d2d342f5c1503f3b22bdf78274549eb6b69 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 19 Jul 2024 14:26:56 +0000 Subject: [PATCH 06/10] dispatch to shmem based on input size --- include/cuco/detail/static_map/static_map.inl | 86 ++++++++----------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 9e26a91b2..056ee70ae 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -320,56 +320,46 @@ void static_map{}; - auto keys_begin = thrust::make_transform_iterator( - first, cuda::proclaim_return_type([] __device__(auto const& input_pair) { - return input_pair.first; - })); - - auto constexpr cardinality_threshold = cuco::detail::default_block_size(); - - cardinality_estimator.add(keys_begin, keys_begin + num, stream); - auto const cardinality = cardinality_estimator.estimate(stream); - - int32_t const grid_size = cuco::detail::grid_size(num, cg_size); - - if (cardinality > cardinality_threshold) { + int32_t constexpr shmem_block_size = 1024; + int32_t const default_grid_size = cuco::detail::grid_size(num, cg_size); + + int32_t constexpr cardinality_threshold = shmem_block_size; + int32_t constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size; + float constexpr load_factor = 0.7; + int32_t constexpr shared_map_size = (1.0 / load_factor) * shared_map_num_elements; + + using extent_type = cuco::extent; + using shared_map_type = cuco::static_map>; + using shared_map_ref_type = typename shared_map_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + using ref_type = decltype(ref(op::insert_or_apply)); + + auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail:: + insert_or_apply_shmem; + + int32_t const max_op_grid_size = + cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr, 0); + + auto const shmem_grid_size = std::min(default_grid_size, max_op_grid_size); + auto const num_loops_per_thread = num / (shmem_grid_size * shmem_block_size); + + // use shared_memory only if each thread has atleast 2 elements to process + if (num_loops_per_thread > 2) { + static_map_ns::detail::insert_or_apply_shmem + <<>>( + first, num, op, ref(op::insert_or_apply), window_extent); + } else { static_map_ns::detail::insert_or_apply - <<>>( + <<>>( first, num, op, ref(op::insert_or_apply)); - } else { - constexpr int32_t block_size = cuco::detail::default_block_size(); - constexpr int32_t cardinality_threshold = 128; - - constexpr int32_t shared_map_num_elements = cardinality_threshold + block_size; - constexpr float load_factor = 0.7; - constexpr int32_t shared_map_size = (1.0 / load_factor) * shared_map_num_elements; - - using extent_type = cuco::extent; - using shared_map_type = cuco::static_map>; - using shared_map_ref_type = typename shared_map_type::ref_type<>; - auto constexpr window_extent = cuco::make_window_extent(extent_type{}); - - using ref_type = decltype(ref(op::insert_or_apply)); - - auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail:: - insert_or_apply_shmem; - - int32_t const max_op_grid_size = - cuco::detail::max_occupancy_grid_size(block_size, insert_or_apply_shmem_fn_ptr, 0); - - auto const shmem_grid_size = std::min(grid_size, max_op_grid_size); - - static_map_ns::detail::insert_or_apply_shmem - <<>>( - first, num, op, ref(op::insert_or_apply), window_extent); } } From 2819862e5ed6deda28ac7ad9851485318055cc75 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 19 Jul 2024 14:28:05 +0000 Subject: [PATCH 07/10] Use atomic instead of atomic_ref --- include/cuco/detail/static_map/kernels.cuh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 2429e3000..4661de031 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -140,12 +140,11 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( auto storage = SharedMapRefType::storage_ref_type(window_extent, windows); auto const num_windows = storage.num_windows(); - __shared__ int32_t block_cardinality; - if (thread_idx == 0) block_cardinality = 0; + using atomic_type = cuda::atomic; + __shared__ atomic_type block_cardinality; + if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; } block.sync(); - cuda::atomic_ref cardinality_counter(block_cardinality); - auto shared_map = SharedMapRefType(cuco::empty_key(ref.empty_key_sentinel()), cuco::empty_value(ref.empty_value_sentinel()), {}, @@ -169,7 +168,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( local_cardinality = cg::reduce(warp, inserted, cg::plus()); } if (warp_thread_idx == 0) { - cardinality_counter.fetch_add(local_cardinality, cuda::memory_order_relaxed); + block_cardinality.fetch_add(local_cardinality, cuda::memory_order_relaxed); } block.sync(); if (block_cardinality > BlockSize) { break; } From 78e203da4e7b56e8541d9248460ae68d256f4e18 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 26 Jul 2024 18:50:55 +0000 Subject: [PATCH 08/10] minor cleanup --- include/cuco/detail/static_map/kernels.cuh | 4 ++-- include/cuco/detail/static_map/static_map.inl | 5 ++--- include/cuco/detail/static_map/static_map_ref.inl | 4 ++++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 4661de031..25a4809f2 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -145,12 +145,12 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; } block.sync(); - auto shared_map = SharedMapRefType(cuco::empty_key(ref.empty_key_sentinel()), + auto shared_map = SharedMapRefType{cuco::empty_key(ref.empty_key_sentinel()), cuco::empty_value(ref.empty_value_sentinel()), {}, {}, {}, - storage); + storage}; auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply); shared_map_ref.initialize(block); block.sync(); diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 056ee70ae..cfa919ff2 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -17,14 +17,13 @@ #include #include #include -#include #include -#include #include #include #include +#include #include namespace cuco { @@ -346,7 +345,7 @@ void static_map; int32_t const max_op_grid_size = - cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr, 0); + cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr); auto const shmem_grid_size = std::min(default_grid_size, max_op_grid_size); auto const num_loops_per_thread = num / (shmem_grid_size * shmem_block_size); diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 1441ecc9f..a9a8a3e1c 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -574,6 +574,8 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. + * + * @return Returns `true` if the given `value` is inserted successfully. */ template @@ -652,6 +654,8 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. + * + * @return Returns `true` if the given `value` is inserted successfully. */ template From a023537109a2a68cfbc25d05c0b13bf06220c179 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 26 Jul 2024 21:29:32 +0000 Subject: [PATCH 09/10] minor nits --- include/cuco/detail/static_map/kernels.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 25a4809f2..a58e9e273 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -145,8 +145,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; } block.sync(); - auto shared_map = SharedMapRefType{cuco::empty_key(ref.empty_key_sentinel()), - cuco::empty_value(ref.empty_value_sentinel()), + auto shared_map = SharedMapRefType{cuco::empty_key{ref.empty_key_sentinel()}, + cuco::empty_value{ref.empty_value_sentinel()}, {}, {}, {}, @@ -155,7 +155,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( shared_map_ref.initialize(block); block.sync(); - while (idx - thread_idx < n) { + while ((idx - thread_idx / CGSize) < n) { if constexpr (CGSize == 1) { int32_t inserted = 0; int32_t local_cardinality = 0; From d2ccc9fc8fa7c762de29b575677079da183dc459 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 30 Jul 2024 01:30:20 +0000 Subject: [PATCH 10/10] add tests that use shared memory map kernel --- tests/static_map/insert_or_apply_test.cu | 95 +++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 0c54b2730..d3bceedcb 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -70,6 +70,66 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key thrust::equal_to{})); } +template +void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_unique_keys) +{ + REQUIRE((num_keys % num_unique_keys) == 0); + + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + using KeyEqual = typename Map::key_equal; + using ProbingScheme = typename Map::probing_scheme_type; + using Allocator = typename Map::allocator_type; + auto constexpr cg_size = Map::cg_size; + + int32_t constexpr shmem_block_size = 1024; + int32_t constexpr cardinality_threshold = shmem_block_size; + int32_t constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size; + float constexpr load_factor = 0.7; + int32_t constexpr shared_map_size = (1.0 / load_factor) * shared_map_num_elements; + + using extent_type = cuco::extent; + using shared_map_type = cuco::static_map>; + + using shared_map_ref_type = typename shared_map_type::ref_type<>; + auto constexpr window_extent = cuco::make_window_extent(extent_type{}); + + // Insert pairs + auto pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { + return cuco::pair{i % num_unique_keys, 1}; + })); + + auto const shmem_grid_size = cuco::detail::grid_size(num_keys, cg_size, 1, shmem_block_size); + + cuda::stream_ref stream{}; + + // launch the shmem kernel + cuco::static_map_ns::detail::insert_or_apply_shmem + <<>>( + pairs_begin, num_keys, binary_plus_op{}, map.ref(cuco::op::insert_or_apply), window_extent); + + REQUIRE(map.size() == num_unique_keys); + + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + REQUIRE(cuco::test::equal(d_values.begin(), + d_values.end(), + thrust::make_constant_iterator(num_keys / num_unique_keys), + thrust::equal_to{})); +} + TEMPLATE_TEST_CASE_SIG( "static_map insert_or_apply tests", "", @@ -145,4 +205,37 @@ TEMPLATE_TEST_CASE_SIG( num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; test_insert_or_apply(map, num_keys, num_keys); -} \ No newline at end of file +} + +TEMPLATE_TEST_CASE_SIG( + "static_map insert_or_apply shared memory", "", ((typename Key)), (int32_t), (int64_t)) +{ + using Value = Key; + + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::cuda_allocator, + cuco::storage<2>>; + + SECTION("duplicate keys") + { + constexpr size_type num_keys = 10'000; + constexpr size_type num_unique_keys = 100; + + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply_shmem(map, num_keys, num_unique_keys); + } + + SECTION("unique keys") + { + constexpr size_type num_keys = 10'000; + constexpr size_type num_unique_keys = num_keys; + + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply_shmem(map, num_keys, num_unique_keys); + } +}