From 42ab5a2fcee6f563c25b39f298e0b5296f72fb1e Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 16 Jul 2024 22:46:21 +0000
Subject: [PATCH 01/10] Add insert_or_apply shared_memory implementation

---
 include/cuco/detail/static_map/kernels.cuh    | 116 ++++++++++++++++++
 include/cuco/detail/static_map/static_map.inl |  58 ++++++++-
 tests/static_map/insert_or_apply_test.cu      |  15 ++-
 3 files changed, 178 insertions(+), 11 deletions(-)
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index f47c86cdf..9500d3087 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -108,4 +108,120 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op
   }
 }
 
+template <int32_t CGSize,
+          int32_t BlockSize,
+          typename SharedMapRefType,
+          typename InputIt,
+          typename Op,
+          typename Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
+  InputIt first,
+  cuco::detail::index_type n,
+  Op op,
+  Ref ref,
+  typename SharedMapRefType::extent_type window_extent)
+{
+  namespace cg     = cooperative_groups;
+  using Key        = typename Ref::key_type;
+  using Value      = typename Ref::mapped_type;
+  using value_type = typename std::iterator_traits<InputIt>::value_type;
+
+  auto const block       = cg::this_thread_block();
+  auto const thread_idx  = block.thread_rank();
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  // Shared map initialization
+  __shared__ typename SharedMapRefType::window_type windows[window_extent.value()];
+  auto storage           = SharedMapRefType::storage_ref_type(window_extent, windows);
+  auto const num_windows = storage.num_windows();
+
+  // BlockReduce to find cardinality
+  using BlockReduce = cub::BlockReduce<int32_t, BlockSize>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ int32_t block_cardinality;
+
+  if (thread_idx == 0) block_cardinality = 0;
+  block.sync();
+
+  auto shared_map     = SharedMapRefType(cuco::empty_key<Key>(ref.empty_key_sentinel()),
+                                     cuco::empty_value<Value>(ref.empty_value_sentinel()),
+                                         {},
+                                         {},
+                                         {},
+                                     storage);
+  auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply);
+  shared_map_ref.initialize(block);
+  block.sync();
+
+  int32_t num_loop = 0;
+  while (idx - thread_idx < n) {
+    if constexpr (CGSize == 1) {
+      // insert-or-apply into the shared map first
+      if (idx < n) {
+        value_type const& insert_pair = *(first + idx);
+        shared_map_ref.insert_or_apply(insert_pair, op);
+      }
+      block.sync();
+
+      // for first pass block_cardinality will be < threshold
+      // we can skip block_reduction to find cardinality
+      if (num_loop != 0) {
+        // find if cardinality exceeds threshold
+        int32_t thread_count = 0;
+        if (idx < n) {
+          auto window_idx = thread_idx;
+          while (window_idx < num_windows) {
+            auto const slot = storage[window_idx][0];
+            if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) {
+              thread_count += 1;
+            }
+            window_idx += BlockSize;
+          }
+        }
+
+        int32_t local_cardinality = BlockReduce(temp_storage).Sum(thread_count);
+        if (thread_idx == 0) block_cardinality = local_cardinality;
+        block.sync();
+
+        if (idx < n) {
+          if (block_cardinality > BlockSize) { break; }
+        }
+      }
+    } else {
+      auto const tile = cg::tiled_partition<CGSize>(block);
+      if (idx < n) {
+        value_type const& insert_pair = *(first + idx);
+        ref.insert_or_apply(tile, insert_pair, op);
+      }
+    }
+    idx += loop_stride;
+    num_loop += 1;
+  }
+
+  if constexpr (CGSize == 1) {
+    // write from shared_map to global_map
+    auto window_idx = thread_idx;
+    while (window_idx < num_windows) {
+      auto const slot = storage[window_idx][0];
+      if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) {
+        // TODO use insert ?
+        ref.insert_or_apply(slot, op);
+      }
+      window_idx += BlockSize;
+    }
+
+    // insert-or-apply into global map for the remaining elements whose block_cardinality
+    // exceeds the cardinality threshold.
+    if (block_cardinality > BlockSize) {
+      idx += loop_stride;
+      while (idx < n) {
+        value_type const& insert_pair = *(first + idx);
+        ref.insert_or_apply(insert_pair, op);
+        idx += loop_stride;
+      }
+    }
+  }
+}
+
 }  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index e2f7aae71..2f5daa89d 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -17,7 +17,9 @@
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/static_map/kernels.cuh>
 #include <cuco/detail/utility/cuda.hpp>
+#include <cuco/detail/utility/math.hpp>
 #include <cuco/detail/utils.hpp>
+#include <cuco/distinct_count_estimator.cuh>
 #include <cuco/operator.hpp>
 #include <cuco/static_map_ref.cuh>
 
@@ -318,11 +320,57 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
 
-  auto const grid_size = cuco::detail::grid_size(num, cg_size);
-
-  static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
-    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
-      first, num, op, ref(op::insert_or_apply));
+  auto cardinality_estimator = cuco::distinct_count_estimator<Key>{};
+  auto keys_begin            = thrust::make_transform_iterator(
+    first, cuda::proclaim_return_type<Key>([] __device__(auto const& input_pair) {
+      return input_pair.first;
+    }));
+
+  auto constexpr cardinality_threshold = cuco::detail::default_block_size();
+
+  cardinality_estimator.add(keys_begin, keys_begin + num);
+  auto const cardinality = cardinality_estimator.estimate();
+
+  int32_t const grid_size = cuco::detail::grid_size(num, cg_size);
+
+  if (cardinality > cardinality_threshold) {
+    static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+        first, num, op, ref(op::insert_or_apply));
+  } else {
+    constexpr int32_t block_size            = cuco::detail::default_block_size();
+    constexpr int32_t cardinality_threshold = 128;
+
+    constexpr int32_t shared_map_num_elements = cardinality_threshold + block_size;
+    constexpr float load_factor               = 0.7;
+    constexpr int32_t shared_map_size         = (1.0 / load_factor) * shared_map_num_elements;
+
+    using extent_type            = cuco::extent<int32_t, shared_map_size>;
+    using shared_map_type        = cuco::static_map<Key,
+                                                    T,
+                                                    extent_type,
+                                                    cuda::thread_scope_block,
+                                                    KeyEqual,
+                                                    ProbingScheme,
+                                                    Allocator,
+                                                    cuco::storage<1>>;
+    using shared_map_ref_type    = typename shared_map_type::ref_type<>;
+    auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
+
+    using ref_type = decltype(ref(op::insert_or_apply));
+
+    auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail::
+      insert_or_apply_shmem<cg_size, block_size, shared_map_ref_type, InputIt, Op, ref_type>;
+
+    int32_t const max_op_grid_size =
+      cuco::detail::max_occupancy_grid_size(block_size, insert_or_apply_shmem_fn_ptr, 0);
+
+    auto const shmem_grid_size = std::min(grid_size, max_op_grid_size);
+
+    static_map_ns::detail::insert_or_apply_shmem<cg_size, block_size, shared_map_ref_type>
+      <<<shmem_grid_size, block_size, 0, stream.get()>>>(
+        first, num, op, ref(op::insert_or_apply), window_extent);
+  }
 }
 
 template <class Key,
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 75b15a88c..0c54b2730 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -33,6 +33,14 @@
 
 using size_type = std::size_t;
 
+struct binary_plus_op {
+  template <typename T, cuda::thread_scope Scope>
+  __device__ void operator()(cuda::atomic_ref<T, Scope> lhs, T rhs)
+  {
+    lhs.fetch_add(rhs, cuda::memory_order_relaxed);
+  }
+};
+
 template <typename Map>
 void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
 {
@@ -48,12 +56,7 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key
       return cuco::pair<Key, Value>{i % num_unique_keys, 1};
     }));
 
-  map.insert_or_apply(
-    pairs_begin,
-    pairs_begin + num_keys,
-    [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
-      lhs.fetch_add(rhs, cuda::memory_order_relaxed);
-    });
+  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, binary_plus_op{});
 
   REQUIRE(map.size() == num_unique_keys);
 

From b7f0ac792c2fe999c8573b599b48c533fcf72567 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Wed, 17 Jul 2024 03:21:33 +0000
Subject: [PATCH 02/10] add stream parameter to cardinality estimator

---
 include/cuco/detail/static_map/static_map.inl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 2f5daa89d..9e26a91b2 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -328,8 +328,8 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
 
   auto constexpr cardinality_threshold = cuco::detail::default_block_size();
 
-  cardinality_estimator.add(keys_begin, keys_begin + num);
-  auto const cardinality = cardinality_estimator.estimate();
+  cardinality_estimator.add(keys_begin, keys_begin + num, stream);
+  auto const cardinality = cardinality_estimator.estimate(stream);
 
   int32_t const grid_size = cuco::detail::grid_size(num, cg_size);
 

From dc8d29e6e1f03df5ffd58d5d48e5a9fdc59c9271 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Wed, 17 Jul 2024 20:42:49 +0000
Subject: [PATCH 03/10] update insert_or_apply device API to return bool

---
 .../cuco/detail/static_map/static_map_ref.inl | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 75abc38e4..1441ecc9f 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -577,7 +577,7 @@ class operator_impl<
    */
 
   template <typename Value, typename Op>
-  __device__ void insert_or_apply(Value const& value, Op op)
+  __device__ bool insert_or_apply(Value const& value, Op op)
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -609,17 +609,17 @@ class operator_impl<
             ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
           }
           op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
-          return;
+          return false;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
           switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
-            case insert_result::SUCCESS: return;
+            case insert_result::SUCCESS: return true;
             case insert_result::DUPLICATE: {
               if constexpr (sizeof(value_type) > 8) {
                 ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
               }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
-              return;
+              return false;
             }
             default: continue;
           }
@@ -630,12 +630,13 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
+  __device__ bool insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
-      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
-    });
+    return ref_.insert_or_apply(value,
+                                [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+                                  payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+                                });
   }
 
   /**
@@ -654,7 +655,7 @@ class operator_impl<
    */
 
   template <typename Value, typename Op>
-  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
                                   Op op)
   {
@@ -697,7 +698,7 @@ class operator_impl<
           }
           op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
         }
-        return;
+        return false;
       }
 
       auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
@@ -709,7 +710,7 @@ class operator_impl<
         }();
 
         switch (group.shfl(status, src_lane)) {
-          case insert_result::SUCCESS: return;
+          case insert_result::SUCCESS: return true;
           case insert_result::DUPLICATE: {
             if (group.thread_rank() == src_lane) {
               if constexpr (sizeof(value_type) > 8) {
@@ -717,7 +718,7 @@ class operator_impl<
               }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
             }
-            return;
+            return false;
           }
           default: continue;
         }
@@ -728,12 +729,12 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
                                   cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(
+    return ref_.insert_or_apply(
       group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
         payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
       });

From 5d475955c0eed31542539efa7b095bc4614cde0b Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Wed, 17 Jul 2024 20:43:26 +0000
Subject: [PATCH 04/10] use warp shuffles to find block_cardinality

---
 include/cuco/detail/static_map/kernels.cuh | 45 ++++++++--------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 9500d3087..2920ac56f 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -131,19 +131,20 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
+  auto warp                  = cg::tiled_partition<32>(block);
+  auto const warp_thread_idx = warp.thread_rank();
+
   // Shared map initialization
   __shared__ typename SharedMapRefType::window_type windows[window_extent.value()];
   auto storage           = SharedMapRefType::storage_ref_type(window_extent, windows);
   auto const num_windows = storage.num_windows();
 
-  // BlockReduce to find cardinality
-  using BlockReduce = cub::BlockReduce<int32_t, BlockSize>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ int32_t block_cardinality;
-
   if (thread_idx == 0) block_cardinality = 0;
   block.sync();
 
+  cuda::atomic_ref<int32_t, cuda::thread_scope_block> cardinality_counter(block_cardinality);
+
   auto shared_map     = SharedMapRefType(cuco::empty_key<Key>(ref.empty_key_sentinel()),
                                      cuco::empty_value<Value>(ref.empty_value_sentinel()),
                                          {},
@@ -154,40 +155,25 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   shared_map_ref.initialize(block);
   block.sync();
 
-  int32_t num_loop = 0;
   while (idx - thread_idx < n) {
     if constexpr (CGSize == 1) {
       // insert-or-apply into the shared map first
+      int32_t inserted = 0;
       if (idx < n) {
         value_type const& insert_pair = *(first + idx);
-        shared_map_ref.insert_or_apply(insert_pair, op);
+        inserted                      = shared_map_ref.insert_or_apply(insert_pair, op);
       }
-      block.sync();
 
-      // for first pass block_cardinality will be < threshold
-      // we can skip block_reduction to find cardinality
-      if (num_loop != 0) {
-        // find if cardinality exceeds threshold
-        int32_t thread_count = 0;
-        if (idx < n) {
-          auto window_idx = thread_idx;
-          while (window_idx < num_windows) {
-            auto const slot = storage[window_idx][0];
-            if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) {
-              thread_count += 1;
-            }
-            window_idx += BlockSize;
-          }
-        }
-
-        int32_t local_cardinality = BlockReduce(temp_storage).Sum(thread_count);
-        if (thread_idx == 0) block_cardinality = local_cardinality;
-        block.sync();
-
-        if (idx < n) {
-          if (block_cardinality > BlockSize) { break; }
+      if (idx - warp_thread_idx < n)  // all threads in warp particpate
+      {
+        warp.sync();  // sync for inserted to materalize
+        for (int32_t i = warp.size() / 2; i > 0; i /= 2) {
+          inserted += warp.shfl_down(inserted, i);
         }
       }
+      if (warp_thread_idx == 0) cardinality_counter.fetch_add(inserted, cuda::memory_order_relaxed);
+      block.sync();
+      if (block_cardinality > BlockSize) break;
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
@@ -196,7 +182,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
       }
     }
     idx += loop_stride;
-    num_loop += 1;
   }
 
   if constexpr (CGSize == 1) {

From 303975392f2b2b77f31ecd299c7df7580153225d Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 19 Jul 2024 14:25:51 +0000
Subject: [PATCH 05/10] optimizations in shmem kernel

---
 include/cuco/detail/static_map/kernels.cuh | 30 ++++++++++------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 2920ac56f..2429e3000 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -22,6 +22,7 @@
 #include <cuda/atomic>
 
 #include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 
 #include <iterator>
 
@@ -110,10 +111,10 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op
 
 template <int32_t CGSize,
           int32_t BlockSize,
-          typename SharedMapRefType,
-          typename InputIt,
-          typename Op,
-          typename Ref>
+          class SharedMapRefType,
+          class InputIt,
+          class Op,
+          class Ref>
 CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   InputIt first,
   cuco::detail::index_type n,
@@ -157,23 +158,21 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
 
   while (idx - thread_idx < n) {
     if constexpr (CGSize == 1) {
+      int32_t inserted          = 0;
+      int32_t local_cardinality = 0;
       // insert-or-apply into the shared map first
-      int32_t inserted = 0;
       if (idx < n) {
         value_type const& insert_pair = *(first + idx);
         inserted                      = shared_map_ref.insert_or_apply(insert_pair, op);
       }
-
-      if (idx - warp_thread_idx < n)  // all threads in warp particpate
-      {
-        warp.sync();  // sync for inserted to materalize
-        for (int32_t i = warp.size() / 2; i > 0; i /= 2) {
-          inserted += warp.shfl_down(inserted, i);
-        }
+      if (idx - warp_thread_idx < n) {  // all threads in warp particpate
+        local_cardinality = cg::reduce(warp, inserted, cg::plus<int32_t>());
+      }
+      if (warp_thread_idx == 0) {
+        cardinality_counter.fetch_add(local_cardinality, cuda::memory_order_relaxed);
       }
-      if (warp_thread_idx == 0) cardinality_counter.fetch_add(inserted, cuda::memory_order_relaxed);
       block.sync();
-      if (block_cardinality > BlockSize) break;
+      if (block_cardinality > BlockSize) { break; }
     } else {
       auto const tile = cg::tiled_partition<CGSize>(block);
       if (idx < n) {
@@ -185,12 +184,11 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   }
 
   if constexpr (CGSize == 1) {
-    // write from shared_map to global_map
+    // insert-or-apply from shared map to global map
     auto window_idx = thread_idx;
     while (window_idx < num_windows) {
       auto const slot = storage[window_idx][0];
       if (not cuco::detail::bitwise_compare(slot.first, ref.empty_key_sentinel())) {
-        // TODO use insert ?
         ref.insert_or_apply(slot, op);
       }
       window_idx += BlockSize;

From 89689d2d342f5c1503f3b22bdf78274549eb6b69 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 19 Jul 2024 14:26:56 +0000
Subject: [PATCH 06/10] dispatch to shmem based on input size

---
 include/cuco/detail/static_map/static_map.inl | 86 ++++++++-----------
 1 file changed, 38 insertions(+), 48 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 9e26a91b2..056ee70ae 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -320,56 +320,46 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
 
-  auto cardinality_estimator = cuco::distinct_count_estimator<Key>{};
-  auto keys_begin            = thrust::make_transform_iterator(
-    first, cuda::proclaim_return_type<Key>([] __device__(auto const& input_pair) {
-      return input_pair.first;
-    }));
-
-  auto constexpr cardinality_threshold = cuco::detail::default_block_size();
-
-  cardinality_estimator.add(keys_begin, keys_begin + num, stream);
-  auto const cardinality = cardinality_estimator.estimate(stream);
-
-  int32_t const grid_size = cuco::detail::grid_size(num, cg_size);
-
-  if (cardinality > cardinality_threshold) {
+  int32_t constexpr shmem_block_size = 1024;
+  int32_t const default_grid_size    = cuco::detail::grid_size(num, cg_size);
+
+  int32_t constexpr cardinality_threshold   = shmem_block_size;
+  int32_t constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size;
+  float constexpr load_factor               = 0.7;
+  int32_t constexpr shared_map_size         = (1.0 / load_factor) * shared_map_num_elements;
+
+  using extent_type            = cuco::extent<int32_t, shared_map_size>;
+  using shared_map_type        = cuco::static_map<Key,
+                                                  T,
+                                                  extent_type,
+                                                  cuda::thread_scope_block,
+                                                  KeyEqual,
+                                                  ProbingScheme,
+                                                  Allocator,
+                                                  cuco::storage<1>>;
+  using shared_map_ref_type    = typename shared_map_type::ref_type<>;
+  auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
+
+  using ref_type = decltype(ref(op::insert_or_apply));
+
+  auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail::
+    insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type, InputIt, Op, ref_type>;
+
+  int32_t const max_op_grid_size =
+    cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr, 0);
+
+  auto const shmem_grid_size      = std::min(default_grid_size, max_op_grid_size);
+  auto const num_loops_per_thread = num / (shmem_grid_size * shmem_block_size);
+
+  // use shared_memory only if each thread has atleast 2 elements to process
+  if (num_loops_per_thread > 2) {
+    static_map_ns::detail::insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type>
+      <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(
+        first, num, op, ref(op::insert_or_apply), window_extent);
+  } else {
     static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+      <<<default_grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num, op, ref(op::insert_or_apply));
-  } else {
-    constexpr int32_t block_size            = cuco::detail::default_block_size();
-    constexpr int32_t cardinality_threshold = 128;
-
-    constexpr int32_t shared_map_num_elements = cardinality_threshold + block_size;
-    constexpr float load_factor               = 0.7;
-    constexpr int32_t shared_map_size         = (1.0 / load_factor) * shared_map_num_elements;
-
-    using extent_type            = cuco::extent<int32_t, shared_map_size>;
-    using shared_map_type        = cuco::static_map<Key,
-                                                    T,
-                                                    extent_type,
-                                                    cuda::thread_scope_block,
-                                                    KeyEqual,
-                                                    ProbingScheme,
-                                                    Allocator,
-                                                    cuco::storage<1>>;
-    using shared_map_ref_type    = typename shared_map_type::ref_type<>;
-    auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
-
-    using ref_type = decltype(ref(op::insert_or_apply));
-
-    auto insert_or_apply_shmem_fn_ptr = static_map_ns::detail::
-      insert_or_apply_shmem<cg_size, block_size, shared_map_ref_type, InputIt, Op, ref_type>;
-
-    int32_t const max_op_grid_size =
-      cuco::detail::max_occupancy_grid_size(block_size, insert_or_apply_shmem_fn_ptr, 0);
-
-    auto const shmem_grid_size = std::min(grid_size, max_op_grid_size);
-
-    static_map_ns::detail::insert_or_apply_shmem<cg_size, block_size, shared_map_ref_type>
-      <<<shmem_grid_size, block_size, 0, stream.get()>>>(
-        first, num, op, ref(op::insert_or_apply), window_extent);
   }
 }
 

From 2819862e5ed6deda28ac7ad9851485318055cc75 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 19 Jul 2024 14:28:05 +0000
Subject: [PATCH 07/10] Use atomic<int> instead of atomic_ref

---
 include/cuco/detail/static_map/kernels.cuh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 2429e3000..4661de031 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -140,12 +140,11 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   auto storage           = SharedMapRefType::storage_ref_type(window_extent, windows);
   auto const num_windows = storage.num_windows();
 
-  __shared__ int32_t block_cardinality;
-  if (thread_idx == 0) block_cardinality = 0;
+  using atomic_type = cuda::atomic<int32_t, cuda::thread_scope_block>;
+  __shared__ atomic_type block_cardinality;
+  if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; }
   block.sync();
 
-  cuda::atomic_ref<int32_t, cuda::thread_scope_block> cardinality_counter(block_cardinality);
-
   auto shared_map     = SharedMapRefType(cuco::empty_key<Key>(ref.empty_key_sentinel()),
                                      cuco::empty_value<Value>(ref.empty_value_sentinel()),
                                          {},
@@ -169,7 +168,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
         local_cardinality = cg::reduce(warp, inserted, cg::plus<int32_t>());
       }
       if (warp_thread_idx == 0) {
-        cardinality_counter.fetch_add(local_cardinality, cuda::memory_order_relaxed);
+        block_cardinality.fetch_add(local_cardinality, cuda::memory_order_relaxed);
       }
       block.sync();
       if (block_cardinality > BlockSize) { break; }

From 78e203da4e7b56e8541d9248460ae68d256f4e18 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 26 Jul 2024 18:50:55 +0000
Subject: [PATCH 08/10] minor cleanup

---
 include/cuco/detail/static_map/kernels.cuh        | 4 ++--
 include/cuco/detail/static_map/static_map.inl     | 5 ++---
 include/cuco/detail/static_map/static_map_ref.inl | 4 ++++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 4661de031..25a4809f2 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -145,12 +145,12 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; }
   block.sync();
 
-  auto shared_map     = SharedMapRefType(cuco::empty_key<Key>(ref.empty_key_sentinel()),
+  auto shared_map     = SharedMapRefType{cuco::empty_key<Key>(ref.empty_key_sentinel()),
                                      cuco::empty_value<Value>(ref.empty_value_sentinel()),
                                          {},
                                          {},
                                          {},
-                                     storage);
+                                     storage};
   auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply);
   shared_map_ref.initialize(block);
   block.sync();
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 056ee70ae..cfa919ff2 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -17,14 +17,13 @@
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/static_map/kernels.cuh>
 #include <cuco/detail/utility/cuda.hpp>
-#include <cuco/detail/utility/math.hpp>
 #include <cuco/detail/utils.hpp>
-#include <cuco/distinct_count_estimator.cuh>
 #include <cuco/operator.hpp>
 #include <cuco/static_map_ref.cuh>
 
 #include <cuda/stream_ref>
 
+#include <algorithm>
 #include <cstddef>
 
 namespace cuco {
@@ -346,7 +345,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
     insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type, InputIt, Op, ref_type>;
 
   int32_t const max_op_grid_size =
-    cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr, 0);
+    cuco::detail::max_occupancy_grid_size(shmem_block_size, insert_or_apply_shmem_fn_ptr);
 
   auto const shmem_grid_size      = std::min(default_grid_size, max_op_grid_size);
   auto const num_loops_per_thread = num / (shmem_grid_size * shmem_block_size);
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 1441ecc9f..a9a8a3e1c 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -574,6 +574,8 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
    */
 
   template <typename Value, typename Op>
@@ -652,6 +654,8 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
+   *
+   * @return Returns `true` if the given `value` is inserted successfully.
    */
 
   template <typename Value, typename Op>

From a023537109a2a68cfbc25d05c0b13bf06220c179 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 26 Jul 2024 21:29:32 +0000
Subject: [PATCH 09/10] minor nits

---
 include/cuco/detail/static_map/kernels.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 25a4809f2..a58e9e273 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -145,8 +145,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   if (thread_idx == 0) { new (&block_cardinality) atomic_type{}; }
   block.sync();
 
-  auto shared_map     = SharedMapRefType{cuco::empty_key<Key>(ref.empty_key_sentinel()),
-                                     cuco::empty_value<Value>(ref.empty_value_sentinel()),
+  auto shared_map     = SharedMapRefType{cuco::empty_key<Key>{ref.empty_key_sentinel()},
+                                     cuco::empty_value<Value>{ref.empty_value_sentinel()},
                                          {},
                                          {},
                                          {},
@@ -155,7 +155,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   shared_map_ref.initialize(block);
   block.sync();
 
-  while (idx - thread_idx < n) {
+  while ((idx - thread_idx / CGSize) < n) {
     if constexpr (CGSize == 1) {
       int32_t inserted          = 0;
       int32_t local_cardinality = 0;

From d2ccc9fc8fa7c762de29b575677079da183dc459 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 30 Jul 2024 01:30:20 +0000
Subject: [PATCH 10/10] add tests that use shared memory map kernel

---
 tests/static_map/insert_or_apply_test.cu | 95 +++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 1 deletion(-)

diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 0c54b2730..d3bceedcb 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -70,6 +70,66 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key
                             thrust::equal_to<Value>{}));
 }
 
+template <typename Map>
+void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_unique_keys)
+{
+  REQUIRE((num_keys % num_unique_keys) == 0);
+
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
+  using KeyEqual         = typename Map::key_equal;
+  using ProbingScheme    = typename Map::probing_scheme_type;
+  using Allocator        = typename Map::allocator_type;
+  auto constexpr cg_size = Map::cg_size;
+
+  int32_t constexpr shmem_block_size        = 1024;
+  int32_t constexpr cardinality_threshold   = shmem_block_size;
+  int32_t constexpr shared_map_num_elements = cardinality_threshold + shmem_block_size;
+  float constexpr load_factor               = 0.7;
+  int32_t constexpr shared_map_size         = (1.0 / load_factor) * shared_map_num_elements;
+
+  using extent_type     = cuco::extent<int32_t, shared_map_size>;
+  using shared_map_type = cuco::static_map<Key,
+                                           Value,
+                                           extent_type,
+                                           cuda::thread_scope_block,
+                                           KeyEqual,
+                                           ProbingScheme,
+                                           Allocator,
+                                           cuco::storage<1>>;
+
+  using shared_map_ref_type    = typename shared_map_type::ref_type<>;
+  auto constexpr window_extent = cuco::make_window_extent<shared_map_ref_type>(extent_type{});
+
+  // Insert pairs
+  auto pairs_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<cuco::pair<Key, Value>>([num_unique_keys] __device__(auto i) {
+      return cuco::pair<Key, Value>{i % num_unique_keys, 1};
+    }));
+
+  auto const shmem_grid_size = cuco::detail::grid_size(num_keys, cg_size, 1, shmem_block_size);
+
+  cuda::stream_ref stream{};
+
+  // launch the shmem kernel
+  cuco::static_map_ns::detail::insert_or_apply_shmem<cg_size, shmem_block_size, shared_map_ref_type>
+    <<<shmem_grid_size, shmem_block_size, 0, stream.get()>>>(
+      pairs_begin, num_keys, binary_plus_op{}, map.ref(cuco::op::insert_or_apply), window_extent);
+
+  REQUIRE(map.size() == num_unique_keys);
+
+  thrust::device_vector<Key> d_keys(num_unique_keys);
+  thrust::device_vector<Value> d_values(num_unique_keys);
+  map.retrieve_all(d_keys.begin(), d_values.begin());
+
+  REQUIRE(cuco::test::equal(d_values.begin(),
+                            d_values.end(),
+                            thrust::make_constant_iterator<Value>(num_keys / num_unique_keys),
+                            thrust::equal_to<Value>{}));
+}
+
 TEMPLATE_TEST_CASE_SIG(
   "static_map insert_or_apply tests",
   "",
@@ -145,4 +205,37 @@ TEMPLATE_TEST_CASE_SIG(
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
   test_insert_or_apply(map, num_keys, num_keys);
-}
\ No newline at end of file
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map insert_or_apply shared memory", "", ((typename Key)), (int32_t), (int64_t))
+{
+  using Value = Key;
+
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  SECTION("duplicate keys")
+  {
+    constexpr size_type num_keys        = 10'000;
+    constexpr size_type num_unique_keys = 100;
+
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply_shmem(map, num_keys, num_unique_keys);
+  }
+
+  SECTION("unique keys")
+  {
+    constexpr size_type num_keys        = 10'000;
+    constexpr size_type num_unique_keys = num_keys;
+
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply_shmem(map, num_keys, num_unique_keys);
+  }
+}