diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json
index 332439947..606759c73 100644
--- a/.devcontainer/cuda11.8-gcc11/devcontainer.json
+++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8-ubuntu22.04",
+  "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8-ubuntu22.04",
   "hostRequirements": {
     "gpu": true
   },
diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json
index bf3e725af..3f562f865 100644
--- a/.devcontainer/cuda12.5-gcc12/devcontainer.json
+++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5-ubuntu22.04",
+  "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5-ubuntu22.04",
   "hostRequirements": {
     "gpu": true
   },
diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json
index f1f303243..01bbe927b 100644
--- a/.devcontainer/cuda12.5-gcc13/devcontainer.json
+++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
+  "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
   "hostRequirements": {
     "gpu": true
   },
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index f1f303243..01bbe927b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
+  "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
   "hostRequirements": {
     "gpu": true
   },
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b9e97e83..8de18db73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
-    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake
+    file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake
          ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
 endif()
 include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
diff --git a/ci/matrix.yml b/ci/matrix.yml
index b4dd33ca2..471a4fb0d 100644
--- a/ci/matrix.yml
+++ b/ci/matrix.yml
@@ -23,7 +23,7 @@ gpus:
   - 'v100'
 
 # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
-devcontainer_version: '24.08'
+devcontainer_version: '24.10'
 
 # Each environment below will generate a unique build/test job
 # See the "compute-matrix" job in the workflow for how this is parsed and used
diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index 6d4bf7339..24c7758ab 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <nv/target>
+#include <cuda/std/version>
 
 #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__)
 #error "NVCC version not found"
@@ -32,6 +33,10 @@
 #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
 #endif
 
+#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000)
+#error "CCCL version 2.5.0 or later is required"
+#endif
+
 // WAR for libcudacxx/296
 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__)
 
diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh
index a8a5a69d1..d828bbf86 100644
--- a/include/cuco/detail/bitwise_compare.cuh
+++ b/include/cuco/detail/bitwise_compare.cuh
@@ -18,6 +18,7 @@
 
 #include <cuco/utility/traits.hpp>
 
+#include <cuda/functional>
 #include <cuda/std/bit>
 
 #include <cstdint>
@@ -67,9 +68,10 @@ struct bitwise_compare_impl<8> {
  * size of type, or 16, whichever is smaller.
  */
 template <typename T>
-constexpr std::size_t alignment()
+__host__ __device__ constexpr std::size_t alignment()
 {
-  return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T)));
+  constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(T));
+  return cuda::std::min(std::size_t{16}, alignment);
 }
 
 /**
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
index 266335a50..24fce230c 100644
--- a/include/cuco/detail/open_addressing/kernels.cuh
+++ b/include/cuco/detail/open_addressing/kernels.cuh
@@ -182,6 +182,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
   }
 }
 
+/**
+ * @brief For each key in the range [first, first + n), applies the function object `callback_op` to
+ * the copy of all corresponding matches found in the container.
+ *
+ * @note The return value of `callback_op`, if any, is ignored.
+ *
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `key_type` of the data structure
+ * @tparam CallbackOp Type of unary callback function object
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param callback_op Function to call on every matched slot found in the container
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename CallbackOp, typename Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
+                                                         cuco::detail::index_type n,
+                                                         CallbackOp callback_op,
+                                                         Ref ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
+    if constexpr (CGSize == 1) {
+      ref.for_each(key, callback_op);
+    } else {
+      auto const tile =
+        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+      ref.for_each(tile, key, callback_op);
+    }
+    idx += loop_stride;
+  }
+}
+
 /**
  * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data
  * structure if `pred` of the corresponding stencil returns true.
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index 9dabff990..a8eff9036 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -27,6 +27,7 @@
 #include <cuco/storage.cuh>
 #include <cuco/utility/traits.hpp>
 
+#include <cub/device/device_for.cuh>
 #include <cub/device/device_select.cuh>
 #include <cuda/atomic>
 #include <thrust/iterator/constant_iterator.h>
@@ -99,6 +100,7 @@ class open_addressing_impl {
 
   using storage_ref_type = typename storage_type::ref_type;  ///< Non-owning window storage ref type
   using probing_scheme_type = ProbingScheme;                 ///< Probe scheme type
+  using hasher              = typename probing_scheme_type::hasher;  ///< Hash function type
 
   /**
    * @brief Constructs a statically-sized open addressing data structure with the specified initial
@@ -588,7 +590,7 @@ class open_addressing_impl {
   [[nodiscard]] size_type count(InputIt first,
                                 InputIt last,
                                 Ref container_ref,
-                                cuda::stream_ref stream) const noexcept
+                                cuda::stream_ref stream) const
   {
     auto constexpr is_outer = false;
     return this->count<is_outer>(first, last, container_ref, stream);
@@ -638,49 +640,126 @@ class open_addressing_impl {
   template <typename OutputIt>
   [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const
   {
-    std::size_t temp_storage_bytes = 0;
     using temp_allocator_type =
       typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
+
+    cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();
+
+    cuco::detail::index_type h_num_out{0};
     auto temp_allocator = temp_allocator_type{this->allocator()};
     auto d_num_out      = reinterpret_cast<size_type*>(
       std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
-    auto const begin = thrust::make_transform_iterator(
-      thrust::counting_iterator<size_type>{0},
-      open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
-    auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
-      this->empty_key_sentinel(), this->erased_key_sentinel()};
-    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
-                                        temp_storage_bytes,
-                                        begin,
-                                        output_begin,
-                                        d_num_out,
-                                        this->capacity(),
-                                        is_filled,
-                                        stream.get()));
-
-    // Allocate temporary storage
-    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
-
-    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
-                                        temp_storage_bytes,
-                                        begin,
-                                        output_begin,
-                                        d_num_out,
-                                        this->capacity(),
-                                        is_filled,
-                                        stream.get()));
-
-    size_type h_num_out;
-    CUCO_CUDA_TRY(cudaMemcpyAsync(
-      &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
-    stream.wait();
+
+    // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
+    for (cuco::detail::index_type offset = 0;
+         offset < static_cast<cuco::detail::index_type>(this->capacity());
+         offset += stride) {
+      auto const num_items =
+        std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
+      auto const begin = thrust::make_transform_iterator(
+        thrust::counting_iterator{static_cast<size_type>(offset)},
+        open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
+      auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
+        this->empty_key_sentinel(), this->erased_key_sentinel()};
+
+      std::size_t temp_storage_bytes = 0;
+
+      CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
+                                          temp_storage_bytes,
+                                          begin,
+                                          output_begin + h_num_out,
+                                          d_num_out,
+                                          static_cast<int32_t>(num_items),
+                                          is_filled,
+                                          stream.get()));
+
+      // Allocate temporary storage
+      auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
+
+      CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
+                                          temp_storage_bytes,
+                                          begin,
+                                          output_begin + h_num_out,
+                                          d_num_out,
+                                          static_cast<int32_t>(num_items),
+                                          is_filled,
+                                          stream.get()));
+
+      size_type temp_count;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(
+        &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+      stream.wait();
+      h_num_out += temp_count;
+      temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
+    }
+
     std::allocator_traits<temp_allocator_type>::deallocate(
       temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
-    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
 
     return output_begin + h_num_out;
   }
 
+  /**
+   * @brief Asynchronously applies the given function object `callback_op` to the copy of every
+   * filled slot in the container
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param callback_op Function to call on every filled slot in the container
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename CallbackOp>
+  void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const
+  {
+    auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
+      this->empty_key_sentinel(), this->erased_key_sentinel()};
+
+    auto storage_ref = this->storage_ref();
+    auto const op    = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) {
+      for (auto const slot : window_slots) {
+        if (is_filled(slot)) { callback_op(slot); }
+      }
+    };
+
+    CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN(
+      storage_ref.data(), storage_ref.num_windows(), op, stream.get()));
+  }
+
+  /**
+   * @brief For each key in the range [first, last), asynchronously applies the function object
+   * `callback_op` to the copy of all corresponding matches found in the container.
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam InputIt Device accessible random access input iterator
+   * @tparam CallbackOp Type of unary callback function object
+   * @tparam Ref Type of non-owning device container ref allowing access to storage
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param callback_op Function to call on every match found in the container
+   * @param container_ref Non-owning device container ref used to access the slot storage
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename InputIt, typename CallbackOp, typename Ref>
+  void for_each_async(InputIt first,
+                      InputIt last,
+                      CallbackOp&& callback_op,
+                      Ref container_ref,
+                      cuda::stream_ref stream) const noexcept
+  {
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
+
+    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
+
+    detail::for_each_n<cg_size, cuco::detail::default_block_size()>
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+        first, num_keys, std::forward<CallbackOp>(callback_op), container_ref);
+  }
+
   /**
    * @brief Gets the number of elements in the container
    *
@@ -855,6 +934,16 @@ class open_addressing_impl {
     return probing_scheme_;
   }
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept
+  {
+    return this->probing_scheme().hash_function();
+  }
+
   /**
    * @brief Gets the container allocator.
    *
diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
index 5396f318a..f4c20f829 100644
--- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
@@ -109,6 +109,7 @@ class open_addressing_ref_impl {
  public:
   using key_type            = Key;                                     ///< Key type
   using probing_scheme_type = ProbingScheme;                           ///< Type of probing scheme
+  using hasher              = typename probing_scheme_type::hasher;    ///< Hash function type
   using storage_ref_type    = StorageRef;                              ///< Type of storage ref
   using window_type         = typename storage_ref_type::window_type;  ///< Window type
   using value_type          = typename storage_ref_type::value_type;   ///< Storage element type
@@ -233,11 +234,22 @@ class open_addressing_ref_impl {
    *
    * @return The probing scheme used for the container
    */
-  [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr probing_scheme_type const& probing_scheme()
+    const noexcept
   {
     return probing_scheme_;
   }
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept
+  {
+    return this->probing_scheme().hash_function();
+  }
+
   /**
    * @brief Gets the non-owning storage ref.
    *
@@ -743,7 +755,7 @@ class open_addressing_ref_impl {
    * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns
    * false.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -776,7 +788,7 @@ class open_addressing_ref_impl {
    * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns
    * false.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform group contains
    * @param key The key to search for
@@ -884,7 +896,7 @@ class open_addressing_ref_impl {
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -922,7 +934,7 @@ class open_addressing_ref_impl {
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
@@ -966,17 +978,16 @@ class open_addressing_ref_impl {
   }
 
   /**
-   * @brief Executes a callback on every element in the container with key equivalent to the probe
-   * key.
+   * @brief For a given key, applies the function object `callback_op` to the copy of all
+   * corresponding matches found in the container.
    *
-   * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
-   * `key` to the callback.
+   * @note The return value of `callback_op`, if any, is ignored.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
-   * @tparam CallbackOp Unary callback functor or device lambda
+   * @tparam ProbeKey Probe key type
+   * @tparam CallbackOp Type of unary callback function object
    *
    * @param key The key to search for
-   * @param callback_op Function to call on every element found
+   * @param callback_op Function to apply to every matched slot
    */
   template <class ProbeKey, class CallbackOp>
   __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
@@ -995,7 +1006,7 @@ class open_addressing_ref_impl {
             return;
           }
           case detail::equal_result::EQUAL: {
-            callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]});
+            callback_op(window_slots[i]);
             continue;
           }
           default: continue;
@@ -1006,24 +1017,23 @@ class open_addressing_ref_impl {
   }
 
   /**
-   * @brief Executes a callback on every element in the container with key equivalent to the probe
-   * key.
-   *
-   * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
-   * `key` to the callback.
+   * @brief For a given key, applies the function object `callback_op` to the copy of all
+   * corresponding matches found in the container.
    *
    * @note This function uses cooperative group semantics, meaning that any thread may call the
    * callback if it finds a matching element. If multiple elements are found within the same group,
    * each thread with a match will call the callback with its associated element.
    *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
    * @note Synchronizing `group` within `callback_op` is undefined behavior.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
-   * @tparam CallbackOp Unary callback functor or device lambda
+   * @tparam ProbeKey Probe key type
+   * @tparam CallbackOp Type of unary callback function object
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
-   * @param callback_op Function to call on every element found
+   * @param callback_op Function to apply to every matched slot
    */
   template <class ProbeKey, class CallbackOp>
   __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
@@ -1045,7 +1055,7 @@ class open_addressing_ref_impl {
             continue;
           }
           case detail::equal_result::EQUAL: {
-            callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]});
+            callback_op(window_slots[i]);
             continue;
           }
           default: {
@@ -1060,12 +1070,9 @@ class open_addressing_ref_impl {
   }
 
   /**
-   * @brief Executes a callback on every element in the container with key equivalent to the probe
-   * key and can additionally perform work that requires synchronizing the Cooperative Group
-   * performing this operation.
-   *
-   * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
-   * `key` to the callback.
+   * @brief Applies the function object `callback_op` to the copy of every slot in the container
+   * with key equivalent to the probe key and can additionally perform work that requires
+   * synchronizing the Cooperative Group performing this operation.
    *
    * @note This function uses cooperative group semantics, meaning that any thread may call the
    * callback if it finds a matching element. If multiple elements are found within the same group,
@@ -1073,18 +1080,20 @@ class open_addressing_ref_impl {
    *
    * @note Synchronizing `group` within `callback_op` is undefined behavior.
    *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
    * @note The `sync_op` function can be used to perform work that requires synchronizing threads in
    * `group` inbetween probing steps, where the number of probing steps performed between
    * synchronization points is capped by `window_size * cg_size`. The functor will be called right
    * after the current probing window has been traversed.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
-   * @tparam CallbackOp Unary callback functor or device lambda
-   * @tparam SyncOp Functor or device lambda which accepts the current `group` object
+   * @tparam ProbeKey Probe key type
+   * @tparam CallbackOp Type of unary callback function object
+   * @tparam SyncOp Type of function object which accepts the current `group` object
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
-   * @param callback_op Function to call on every element found
+   * @param callback_op Function to apply to every matched slot
    * @param sync_op Function that is allowed to synchronize `group` inbetween probing windows
    */
   template <class ProbeKey, class CallbackOp, class SyncOp>
@@ -1108,7 +1117,7 @@ class open_addressing_ref_impl {
             continue;
           }
           case detail::equal_result::EQUAL: {
-            callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]});
+            callback_op(window_slots[i]);
             continue;
           }
           default: {
diff --git a/include/cuco/detail/pair/helpers.cuh b/include/cuco/detail/pair/helpers.cuh
index 3cc343687..aa040ab94 100644
--- a/include/cuco/detail/pair/helpers.cuh
+++ b/include/cuco/detail/pair/helpers.cuh
@@ -19,6 +19,8 @@
 #include <cuda/std/bit>
 #include <cuda/std/type_traits>
 
+#include <cstdint>
+
 namespace cuco::detail {
 
 /**
@@ -29,7 +31,8 @@ namespace cuco::detail {
 template <typename First, typename Second>
 __host__ __device__ constexpr std::size_t pair_alignment()
 {
-  return cuda::std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second)));
+  constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(First) + sizeof(Second));
+  return cuda::std::min(std::size_t{16}, alignment);
 }
 
 /**
diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
index 61670d7be..047ec7987 100644
--- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
+++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
@@ -127,6 +127,13 @@ __host__ __device__ constexpr auto linear_probing<CGSize, Hash>::operator()(
     upper_bound};
 }
 
+template <int32_t CGSize, typename Hash>
+__host__ __device__ constexpr linear_probing<CGSize, Hash>::hasher
+linear_probing<CGSize, Hash>::hash_function() const noexcept
+{
+  return hash_;
+}
+
 template <int32_t CGSize, typename Hash1, typename Hash2>
 __host__ __device__ constexpr double_hashing<CGSize, Hash1, Hash2>::double_hashing(
   Hash1 const& hash1, Hash2 const& hash2)
@@ -192,4 +199,12 @@ __host__ __device__ constexpr auto double_hashing<CGSize, Hash1, Hash2>::operato
       cg_size),
     upper_bound};  // TODO use fast_int operator
 }
+
+template <int32_t CGSize, typename Hash1, typename Hash2>
+__host__ __device__ constexpr double_hashing<CGSize, Hash1, Hash2>::hasher
+double_hashing<CGSize, Hash1, Hash2>::hash_function() const noexcept
+{
+  return {hash1_, hash2_};
+}
+
 }  // namespace cuco
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index c05d0b28b..bf2aced70 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -202,8 +202,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
 
   auto shared_map     = SharedMapRefType{cuco::empty_key<Key>{ref.empty_key_sentinel()},
                                      cuco::empty_value<Value>{ref.empty_value_sentinel()},
-                                         {},
-                                         {},
+                                     ref.key_eq(),
+                                     ref.probing_scheme(),
                                          {},
                                      storage};
   auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply);
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 86b75507d..e2915e1fd 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -347,19 +347,11 @@ template <class Key,
           class ProbingScheme,
           class Allocator,
           class Storage>
-template <typename InputIt, typename Init, typename Op>
+template <typename InputIt, typename Init, typename Op, typename>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
   insert_or_apply_async(
     InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream) noexcept
 {
-  using shared_map_type   = cuco::static_map<Key,
-                                             T,
-                                             int32_t,
-                                             cuda::thread_scope_block,
-                                             KeyEqual,
-                                             ProbingScheme,
-                                             Allocator,
-                                             cuco::storage<1>>;
   auto constexpr has_init = true;
   static_map_ns::detail::dispatch_insert_or_apply<has_init, cg_size, Allocator>(
     first, last, init, op, ref(op::insert_or_apply), stream);
@@ -499,6 +491,70 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   impl_->find_async(first, last, output_begin, ref(op::find), stream);
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename CallbackOp>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::for_each(
+  CallbackOp&& callback_op, cuda::stream_ref stream) const
+{
+  impl_->for_each_async(std::forward<CallbackOp>(callback_op), stream);
+  stream.wait();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename CallbackOp>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::for_each_async(
+  CallbackOp&& callback_op, cuda::stream_ref stream) const
+{
+  impl_->for_each_async(std::forward<CallbackOp>(callback_op), stream);
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename CallbackOp>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::for_each(
+  InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const
+{
+  impl_->for_each_async(
+    first, last, std::forward<CallbackOp>(callback_op), ref(op::for_each), stream);
+  stream.wait();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename CallbackOp>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::for_each_async(
+  InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const noexcept
+{
+  impl_->for_each_async(
+    first, last, std::forward<CallbackOp>(callback_op), ref(op::for_each), stream);
+}
+
 template <class Key,
           class T,
           class Extent,
@@ -653,6 +709,36 @@ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
   return impl_->erased_key_sentinel();
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_equal
+static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_eq()
+  const noexcept
+{
+  return impl_->key_eq();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hasher
+static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hash_function()
+  const noexcept
+{
+  return impl_->hash_function();
+}
+
 template <class Key,
           class T,
           class Extent,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index b38562027..3756a641b 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -125,6 +125,26 @@ static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>
   return this->impl_.key_eq();
 }
 
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+__host__ __device__ constexpr static_map_ref<Key,
+                                             T,
+                                             Scope,
+                                             KeyEqual,
+                                             ProbingScheme,
+                                             StorageRef,
+                                             Operators...>::hasher
+static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::hash_function()
+  const noexcept
+{
+  return impl_.hash_function();
+}
+
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
@@ -1150,7 +1170,7 @@ class operator_impl<
    * @note If the probe key `key` was inserted into the container, returns
    * true. Otherwise, returns false.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -1170,7 +1190,7 @@ class operator_impl<
    * @note If the probe key `key` was inserted into the container, returns
    * true. Otherwise, returns false.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform group contains
    * @param key The key to search for
@@ -1213,7 +1233,7 @@ class operator_impl<
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -1233,7 +1253,7 @@ class operator_impl<
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
@@ -1249,5 +1269,75 @@ class operator_impl<
   }
 };
 
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::for_each_tag,
+  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type = typename base_type::key_type;
+  using value_type     = typename base_type::value_type;
+  using iterator       = typename base_type::iterator;
+  using const_iterator = typename base_type::const_iterator;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+ public:
+  /**
+   * @brief For a given key, applies the function object `callback_op` to its match found in the
+   * container.
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam ProbeKey Probe key type
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param key The key to search for
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   */
+  template <class ProbeKey, class CallbackOp>
+  __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
+  {
+    // CRTP: cast `this` to the actual ref type
+    auto const& ref_ = static_cast<ref_type const&>(*this);
+    ref_.impl_.for_each(key, std::forward<CallbackOp>(callback_op));
+  }
+
+  /**
+   * @brief For a given key, applies the function object `callback_op` to its match found in the
+   * container.
+   *
+   * @note This function uses cooperative group semantics, meaning that any thread may call the
+   * callback if it finds a matching key-value pair.
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @note Synchronizing `group` within `callback_op` is undefined behavior.
+   *
+   * @tparam ProbeKey Probe key type
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param group The Cooperative Group used to perform this operation
+   * @param key The key to search for
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   */
+  template <class ProbeKey, class CallbackOp>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+                           ProbeKey const& key,
+                           CallbackOp&& callback_op) const noexcept
+  {
+    // CRTP: cast `this` to the actual ref type
+    auto const& ref_ = static_cast<ref_type const&>(*this);
+    ref_.impl_.for_each(group, key, std::forward<CallbackOp>(callback_op));
+  }
+};
+
 }  // namespace detail
 }  // namespace cuco
diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl
index 3d97a2734..9d0bdbcff 100644
--- a/include/cuco/detail/static_multimap/static_multimap.inl
+++ b/include/cuco/detail/static_multimap/static_multimap.inl
@@ -152,11 +152,11 @@ template <class Key,
           class Allocator,
           class Storage>
 template <typename InputIt>
-static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
-static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
+void static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
   InputIt first, InputIt last, cuda::stream_ref stream)
 {
-  return impl_->insert(first, last, ref(op::insert), stream);
+  this->insert_async(first, last, stream);
+  stream.wait();
 }
 
 template <class Key,
@@ -174,6 +174,41 @@ void static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
   impl_->insert_async(first, last, ref(op::insert), stream);
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename StencilIt, typename Predicate>
+static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
+{
+  return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream);
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename StencilIt, typename Predicate>
+void static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_if_async(InputIt first,
+                  InputIt last,
+                  StencilIt stencil,
+                  Predicate pred,
+                  cuda::stream_ref stream) noexcept
+{
+  impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream);
+}
+
 template <class Key,
           class T,
           class Extent,
@@ -249,6 +284,22 @@ void static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
   impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream);
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt>
+static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::count(
+  InputIt first, InputIt last, cuda::stream_ref stream) const
+{
+  return impl_->count(first, last, ref(op::count), stream);
+}
+
 template <class Key,
           class T,
           class Extent,
@@ -312,6 +363,38 @@ constexpr static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Alloca
   return impl_->erased_key_sentinel();
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  key_equal
+  static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_eq()
+    const noexcept
+{
+  return impl_->key_eq();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  hasher
+  static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+    hash_function() const noexcept
+{
+  return impl_->hash_function();
+}
+
 template <class Key,
           class T,
           class Extent,
diff --git a/include/cuco/detail/static_multimap/static_multimap_ref.inl b/include/cuco/detail/static_multimap/static_multimap_ref.inl
index 01f9171d4..3bbf90e3a 100644
--- a/include/cuco/detail/static_multimap/static_multimap_ref.inl
+++ b/include/cuco/detail/static_multimap/static_multimap_ref.inl
@@ -122,6 +122,26 @@ static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operator
   return impl_.key_eq();
 }
 
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+__host__ __device__ constexpr static_multimap_ref<Key,
+                                                  T,
+                                                  Scope,
+                                                  KeyEqual,
+                                                  ProbingScheme,
+                                                  StorageRef,
+                                                  Operators...>::hasher
+static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::
+  hash_function() const noexcept
+{
+  return impl_.hash_function();
+}
+
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
@@ -431,7 +451,7 @@ class operator_impl<
    * @note If the probe key `key` was inserted into the container, returns
    * true. Otherwise, returns false.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -451,7 +471,7 @@ class operator_impl<
    * @note If the probe key `key` was inserted into the container, returns
    * true. Otherwise, returns false.
    *
-   * @tparam ProbeKey Input key type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform group contains
    * @param key The key to search for
@@ -466,5 +486,62 @@ class operator_impl<
     return ref_.impl_.contains(group, key);
   }
 };
+
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::count_tag,
+  static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type = static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type =
+    static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type   = typename base_type::key_type;
+  using value_type = typename base_type::value_type;
+  using size_type  = typename base_type::size_type;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+ public:
+  /**
+   * @brief Counts the occurrence of a given key contained in multimap
+   *
+   * @tparam ProbeKey Input type
+   *
+   * @param key The key to count for
+   *
+   * @return Number of occurrences found by the current thread
+   */
+  template <typename ProbeKey>
+  __device__ size_type count(ProbeKey const& key) const noexcept
+  {
+    auto const& ref_ = static_cast<ref_type const&>(*this);
+    return ref_.impl_.count(key);
+  }
+
+  /**
+   * @brief Counts the occurrence of a given key contained in multimap
+   *
+   * @tparam ProbeKey Probe key type
+   *
+   * @param group The Cooperative Group used to perform group count
+   * @param key The key to count for
+   *
+   * @return Number of occurrences found by the current thread
+   */
+  template <typename ProbeKey>
+  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+                             ProbeKey const& key) const noexcept
+  {
+    auto const& ref_ = static_cast<ref_type const&>(*this);
+    return ref_.impl_.count(group, key);
+  }
+};
+
 }  // namespace detail
 }  // namespace cuco
diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl
index b102540a3..cf31f2571 100644
--- a/include/cuco/detail/static_multiset/static_multiset.inl
+++ b/include/cuco/detail/static_multiset/static_multiset.inl
@@ -152,11 +152,11 @@ template <class Key,
           class Allocator,
           class Storage>
 template <typename InputIt, typename StencilIt, typename Predicate>
-void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
+static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
   InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
 {
-  this->insert_if_async(first, last, stencil, pred, stream);
-  stream.wait();
+  return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream);
 }
 
 template <class Key,
@@ -287,7 +287,7 @@ template <class Key,
 template <typename InputIt>
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::count(
-  InputIt first, InputIt last, cuda::stream_ref stream) const noexcept
+  InputIt first, InputIt last, cuda::stream_ref stream) const
 {
   return impl_->count(first, last, ref(op::count), stream);
 }
@@ -393,6 +393,35 @@ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
   return impl_->erased_key_sentinel();
 }
 
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  key_equal
+  static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_eq()
+    const noexcept
+{
+  return impl_->key_eq();
+}
+
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hasher
+static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hash_function()
+  const noexcept
+{
+  return impl_->hash_function();
+}
+
 template <class Key,
           class Extent,
           cuda::thread_scope Scope,
diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl
index fc1f3db9d..aa25cdf70 100644
--- a/include/cuco/detail/static_multiset/static_multiset_ref.inl
+++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl
@@ -108,6 +108,24 @@ static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators..
   return this->impl_.key_eq();
 }
 
+template <typename Key,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+__host__ __device__ constexpr static_multiset_ref<Key,
+                                                  Scope,
+                                                  KeyEqual,
+                                                  ProbingScheme,
+                                                  StorageRef,
+                                                  Operators...>::hasher
+static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::hash_function()
+  const noexcept
+{
+  return impl_.hash_function();
+}
+
 template <typename Key,
           cuda::thread_scope Scope,
           typename KeyEqual,
@@ -380,7 +398,7 @@ class operator_impl<
   /**
    * @brief Indicates whether the probe key `key` was inserted into the container.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -396,7 +414,7 @@ class operator_impl<
   /**
    * @brief Indicates whether the probe key `key` was inserted into the container.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform group contains
    * @param key The key to search for
@@ -439,7 +457,7 @@ class operator_impl<
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -459,7 +477,7 @@ class operator_impl<
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
@@ -498,7 +516,7 @@ class operator_impl<
    * @note Passes an un-incrementable input iterator to the element whose key is equivalent to
    * `key` to the callback.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    * @tparam CallbackOp Unary callback functor or device lambda
    *
    * @param key The key to search for
@@ -525,7 +543,7 @@ class operator_impl<
    *
    * @note Synchronizing `group` within `callback_op` is undefined behavior.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    * @tparam CallbackOp Unary callback functor or device lambda
    *
    * @param group The Cooperative Group used to perform this operation
@@ -561,7 +579,7 @@ class operator_impl<
    * synchronization points is capped by `window_size * cg_size`. The functor will be called right
    * after the current probing window has been traversed.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    * @tparam CallbackOp Unary callback functor or device lambda
    * @tparam SyncOp Functor or device lambda which accepts the current `group` object
    *
@@ -606,7 +624,7 @@ class operator_impl<
   /**
    * @brief Counts the occurrence of a given key contained in multiset
    *
-   * @tparam ProbeKey Input type
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to count for
    *
diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl
index bff86ab54..90af70707 100644
--- a/include/cuco/detail/static_set/static_set.inl
+++ b/include/cuco/detail/static_set/static_set.inl
@@ -514,6 +514,33 @@ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::era
   return impl_->erased_key_sentinel();
 }
 
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_equal
+static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::key_eq() const noexcept
+{
+  return impl_->key_eq();
+}
+
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+constexpr static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hasher
+static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::hash_function()
+  const noexcept
+{
+  return impl_->hash_function();
+}
+
 template <class Key,
           class Extent,
           cuda::thread_scope Scope,
diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl
index dc25c2372..7e2882a0a 100644
--- a/include/cuco/detail/static_set/static_set_ref.inl
+++ b/include/cuco/detail/static_set/static_set_ref.inl
@@ -106,6 +106,24 @@ static_set_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::k
   return this->impl_.key_eq();
 }
 
+template <typename Key,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+__host__ __device__ constexpr static_set_ref<Key,
+                                             Scope,
+                                             KeyEqual,
+                                             ProbingScheme,
+                                             StorageRef,
+                                             Operators...>::hasher
+static_set_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::hash_function()
+  const noexcept
+{
+  return impl_.hash_function();
+}
+
 template <typename Key,
           cuda::thread_scope Scope,
           typename KeyEqual,
@@ -529,7 +547,7 @@ class operator_impl<op::contains_tag,
    * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns
    * false.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -548,7 +566,7 @@ class operator_impl<op::contains_tag,
    * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns
    * false.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform group contains
    * @param key The key to search for
@@ -589,7 +607,7 @@ class operator_impl<op::find_tag,
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param key The key to search for
    *
@@ -609,7 +627,7 @@ class operator_impl<op::find_tag,
    * @note Returns a un-incrementable input iterator to the element whose key is equivalent to
    * `key`. If no such element exists, returns `end()`.
    *
-   * @tparam ProbeKey Input type which is convertible to 'key_type'
+   * @tparam ProbeKey Probe key type
    *
    * @param group The Cooperative Group used to perform this operation
    * @param key The key to search for
diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl
index dfbb90327..5d976e658 100644
--- a/include/cuco/detail/storage/aow_storage.inl
+++ b/include/cuco/detail/storage/aow_storage.inl
@@ -74,6 +74,8 @@ template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
 void aow_storage<T, WindowSize, Extent, Allocator>::initialize_async(
   value_type key, cuda::stream_ref stream) noexcept
 {
+  if (this->num_windows() == 0) { return; }
+
   auto constexpr cg_size = 1;
   auto constexpr stride  = 4;
   auto const grid_size   = cuco::detail::grid_size(this->num_windows(), cg_size, stride);
diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh
index 21e4df759..c7b8a3f8b 100644
--- a/include/cuco/detail/utils.cuh
+++ b/include/cuco/detail/utils.cuh
@@ -15,6 +15,7 @@
 
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/detail/bitwise_compare.cuh>
 
 #include <cuda/std/array>
diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh
index 5591cb74a..4885ad63d 100644
--- a/include/cuco/probing_scheme.cuh
+++ b/include/cuco/probing_scheme.cuh
@@ -19,6 +19,9 @@
 #include <cuco/detail/probing_scheme/probing_scheme_base.cuh>
 #include <cuco/pair.cuh>
 
+#include <cuda/std/tuple>
+#include <cuda/std/type_traits>
+
 #include <cooperative_groups.h>
 
 namespace cuco {
@@ -35,10 +38,12 @@ namespace cuco {
  */
 template <int32_t CGSize, typename Hash>
 class linear_probing : private detail::probing_scheme_base<CGSize> {
- public:
   using probing_scheme_base_type =
     detail::probing_scheme_base<CGSize>;  ///< The base probe scheme type
+
+ public:
   using probing_scheme_base_type::cg_size;
+  using hasher = Hash;  ///< Hash function type
 
   /**
    *@brief Constructs linear probing scheme with the hasher callable.
@@ -91,6 +96,13 @@ class linear_probing : private detail::probing_scheme_base<CGSize> {
     ProbeKey const& probe_key,
     Extent upper_bound) const noexcept;
 
+  /**
+   * @brief Gets the function used to hash keys
+   *
+   * @return The function used to hash keys
+   */
+  __host__ __device__ constexpr hasher hash_function() const noexcept;
+
  private:
   Hash hash_;
 };
@@ -111,10 +123,12 @@ class linear_probing : private detail::probing_scheme_base<CGSize> {
  */
 template <int32_t CGSize, typename Hash1, typename Hash2 = Hash1>
 class double_hashing : private detail::probing_scheme_base<CGSize> {
- public:
   using probing_scheme_base_type =
     detail::probing_scheme_base<CGSize>;  ///< The base probe scheme type
+
+ public:
   using probing_scheme_base_type::cg_size;
+  using hasher = cuda::std::tuple<Hash1, Hash2>;  ///< Hash function type
 
   /**
    *@brief Constructs double hashing probing scheme with the two hasher callables.
@@ -193,11 +207,36 @@ class double_hashing : private detail::probing_scheme_base<CGSize> {
     ProbeKey const& probe_key,
     Extent upper_bound) const noexcept;
 
+  /**
+   * @brief Gets the functions used to hash keys
+   *
+   * @return The functions used to hash keys
+   */
+  __host__ __device__ constexpr hasher hash_function() const noexcept;
+
  private:
   Hash1 hash1_;
   Hash2 hash2_;
 };
 
+/**
+ * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not
+ *
+ * @tparam T Input probing scheme type
+ */
+template <typename T>
+struct is_double_hashing : cuda::std::false_type {};
+
+/**
+ * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not
+ *
+ * @tparam CGSize Size of CUDA Cooperative Groups
+ * @tparam Hash1 Unary callable type
+ * @tparam Hash2 Unary callable type
+ */
+template <int32_t CGSize, typename Hash1, typename Hash2>
+struct is_double_hashing<cuco::double_hashing<CGSize, Hash1, Hash2>> : cuda::std::true_type {};
+
 }  // namespace cuco
 
 #include <cuco/detail/probing_scheme/probing_scheme_impl.inl>
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 9c87e45a9..fc7dc088d 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -37,6 +37,7 @@
 
 #include <cstddef>
 #include <memory>
+#include <type_traits>
 #include <utility>
 
 namespace cuco {
@@ -126,6 +127,7 @@ class static_map {
   /// Non-owning window storage ref type
   using storage_ref_type    = typename impl_type::storage_ref_type;
   using probing_scheme_type = typename impl_type::probing_scheme_type;  ///< Probing scheme type
+  using hasher              = typename probing_scheme_type::hasher;     ///< Hash function type
 
   using mapped_type = T;  ///< Payload type
   template <typename... Operators>
@@ -313,6 +315,61 @@ class static_map {
   template <typename InputIt>
   void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
+   * true.
+   *
+   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
+   * @note This function synchronizes the given stream and returns the number of successful
+   * insertions. For asynchronous execution use `insert_if_async`.
+   *
+   * @tparam InputIt Device accessible random access iterator whose `value_type` is
+   * convertible to the container's `value_type`
+   * @tparam StencilIt Device accessible random access iterator whose value_type is
+   * convertible to Predicate's argument type
+   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
+   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
+   *
+   * @param first Beginning of the sequence of key/value pairs
+   * @param last End of the sequence of key/value pairs
+   * @param stencil Beginning of the stencil sequence
+   * @param pred Predicate to test on every element in the range `[stencil, stencil +
+   * std::distance(first, last))`
+   * @param stream CUDA stream used for the operation
+   *
+   * @return Number of successful insertions
+   */
+  template <typename InputIt, typename StencilIt, typename Predicate>
+  size_type insert_if(
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
+
+  /**
+   * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
+   * stencil returns true.
+   *
+   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
+   *
+   * @tparam InputIt Device accessible random access iterator whose `value_type` is
+   * convertible to the container's `value_type`
+   * @tparam StencilIt Device accessible random access iterator whose value_type is
+   * convertible to Predicate's argument type
+   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
+   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
+   *
+   * @param first Beginning of the sequence of key/value pairs
+   * @param last End of the sequence of key/value pairs
+   * @param stencil Beginning of the stencil sequence
+   * @param pred Predicate to test on every element in the range `[stencil, stencil +
+   * std::distance(first, last))`
+   * @param stream CUDA stream used for the operation
+   */
+  template <typename InputIt, typename StencilIt, typename Predicate>
+  void insert_if_async(InputIt first,
+                       InputIt last,
+                       StencilIt stencil,
+                       Predicate pred,
+                       cuda::stream_ref stream = {}) noexcept;
+
   /**
    * @brief Asynchronously inserts all elements in the range `[first, last)`.
    *
@@ -369,61 +426,6 @@ class static_map {
                        InsertedIt inserted_begin,
                        cuda::stream_ref stream = {});
 
-  /**
-   * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
-   * true.
-   *
-   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
-   * @note This function synchronizes the given stream and returns the number of successful
-   * insertions. For asynchronous execution use `insert_if_async`.
-   *
-   * @tparam InputIt Device accessible random access iterator whose `value_type` is
-   * convertible to the container's `value_type`
-   * @tparam StencilIt Device accessible random access iterator whose value_type is
-   * convertible to Predicate's argument type
-   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
-   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
-   *
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stencil Beginning of the stencil sequence
-   * @param pred Predicate to test on every element in the range `[stencil, stencil +
-   * std::distance(first, last))`
-   * @param stream CUDA stream used for the operation
-   *
-   * @return Number of successful insertions
-   */
-  template <typename InputIt, typename StencilIt, typename Predicate>
-  size_type insert_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
-
-  /**
-   * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
-   * stencil returns true.
-   *
-   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
-   *
-   * @tparam InputIt Device accessible random access iterator whose `value_type` is
-   * convertible to the container's `value_type`
-   * @tparam StencilIt Device accessible random access iterator whose value_type is
-   * convertible to Predicate's argument type
-   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
-   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
-   *
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stencil Beginning of the stencil sequence
-   * @param pred Predicate to test on every element in the range `[stencil, stencil +
-   * std::distance(first, last))`
-   * @param stream CUDA stream used for the operation
-   */
-  template <typename InputIt, typename StencilIt, typename Predicate>
-  void insert_if_async(InputIt first,
-                       InputIt last,
-                       StencilIt stencil,
-                       Predicate pred,
-                       cuda::stream_ref stream = {}) noexcept;
-
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
    * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
@@ -564,7 +566,10 @@ class static_map {
    * @param op Callable object to perform apply operation.
    * @param stream CUDA stream used for insert
    */
-  template <typename InputIt, typename Init, typename Op>
+  template <typename InputIt,
+            typename Init,
+            typename Op,
+            typename = std::enable_if_t<std::is_convertible_v<Init, T>>>
   void insert_or_apply_async(
     InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream = {}) noexcept;
 
@@ -762,6 +767,74 @@ class static_map {
                   OutputIt output_begin,
                   cuda::stream_ref stream = {}) const;
 
+  /**
+   * @brief Applies the given function object `callback_op` to the copy of every filled slot in the
+   * container
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename CallbackOp>
+  void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const;
+
+  /**
+   * @brief Asynchronously applies the given function object `callback_op` to the copy of every
+   * filled slot in the container
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename CallbackOp>
+  void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const;
+
+  /**
+   * @brief For each key in the range [first, last), applies the function object `callback_op` to
+   * the copy of all corresponding matches found in the container.
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam InputIt Device accessible random access input iterator
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename InputIt, typename CallbackOp>
+  void for_each(InputIt first,
+                InputIt last,
+                CallbackOp&& callback_op,
+                cuda::stream_ref stream = {}) const;
+
+  /**
+   * @brief For each key in the range [first, last), asynchronously applies the function object
+   * `callback_op` to the copy of all corresponding matches found in the container.
+   *
+   * @note The return value of `callback_op`, if any, is ignored.
+   *
+   * @tparam InputIt Device accessible random access input iterator
+   * @tparam CallbackOp Type of unary callback function object
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param callback_op Function to apply to the copy of the matched key-value pair
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename InputIt, typename CallbackOp>
+  void for_each_async(InputIt first,
+                      InputIt last,
+                      CallbackOp&& callback_op,
+                      cuda::stream_ref stream = {}) const noexcept;
+
   /**
    * @brief Retrieves all of the keys and their associated values.
    *
@@ -880,6 +953,20 @@ class static_map {
    */
   [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept;
 
+  /**
+   * @brief Gets the function used to compare keys for equality
+   *
+   * @return The function used to compare keys for equality
+   */
+  [[nodiscard]] constexpr key_equal key_eq() const noexcept;
+
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Get device ref with operators.
    *
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index e12bdb6f2..1da1e501a 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -86,6 +86,7 @@ class static_map_ref
   using key_type            = Key;                                     ///< Key type
   using mapped_type         = T;                                       ///< Mapped type
   using probing_scheme_type = ProbingScheme;                           ///< Type of probing scheme
+  using hasher              = typename probing_scheme_type::hasher;    ///< Hash function type
   using storage_ref_type    = StorageRef;                              ///< Type of storage ref
   using window_type         = typename storage_ref_type::window_type;  ///< Window type
   using value_type          = typename storage_ref_type::value_type;   ///< Storage element type
@@ -190,6 +191,13 @@ class static_map_ref
    */
   [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept;
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Returns a const_iterator to one past the last slot.
    *
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
index 31bac4b4f..ebf17edba 100644
--- a/include/cuco/static_multimap.cuh
+++ b/include/cuco/static_multimap.cuh
@@ -130,6 +130,7 @@ class static_multimap {
   /// Non-owning window storage ref type
   using storage_ref_type    = typename impl_type::storage_ref_type;
   using probing_scheme_type = typename impl_type::probing_scheme_type;  ///< Probing scheme type
+  using hasher              = typename probing_scheme_type::hasher;     ///< Hash function type
 
   using mapped_type = T;  ///< Payload type
   template <typename... Operators>
@@ -284,8 +285,7 @@ class static_multimap {
   void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
-   * @brief Inserts all keys in the range `[first, last)` and returns the number of successful
-   * insertions.
+   * @brief Inserts all keys in the range `[first, last)`
    *
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `insert_async`.
@@ -297,11 +297,9 @@ class static_multimap {
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param stream CUDA stream used for insert
-   *
-   * @return Number of successful insertions
    */
   template <typename InputIt>
-  size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {});
+  void insert(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts all keys in the range `[first, last)`.
@@ -317,6 +315,61 @@ class static_multimap {
   template <typename InputIt>
   void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
+   * true.
+   *
+   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
+   * @note This function synchronizes the given stream and returns the number of successful
+   * insertions. For asynchronous execution use `insert_if_async`.
+   *
+   * @tparam InputIt Device accessible random access iterator whose `value_type` is
+   * convertible to the container's `value_type`
+   * @tparam StencilIt Device accessible random access iterator whose value_type is
+   * convertible to Predicate's argument type
+   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
+   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
+   *
+   * @param first Beginning of the sequence of key/value pairs
+   * @param last End of the sequence of key/value pairs
+   * @param stencil Beginning of the stencil sequence
+   * @param pred Predicate to test on every element in the range `[stencil, stencil +
+   * std::distance(first, last))`
+   * @param stream CUDA stream used for the operation
+   *
+   * @return Number of successful insertions
+   */
+  template <typename InputIt, typename StencilIt, typename Predicate>
+  size_type insert_if(
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
+
+  /**
+   * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
+   * stencil returns true.
+   *
+   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
+   *
+   * @tparam InputIt Device accessible random access iterator whose `value_type` is
+   * convertible to the container's `value_type`
+   * @tparam StencilIt Device accessible random access iterator whose value_type is
+   * convertible to Predicate's argument type
+   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
+   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>
+   *
+   * @param first Beginning of the sequence of key/value pairs
+   * @param last End of the sequence of key/value pairs
+   * @param stencil Beginning of the stencil sequence
+   * @param pred Predicate to test on every element in the range `[stencil, stencil +
+   * std::distance(first, last))`
+   * @param stream CUDA stream used for the operation
+   */
+  template <typename InputIt, typename StencilIt, typename Predicate>
+  void insert_if_async(InputIt first,
+                       InputIt last,
+                       StencilIt stencil,
+                       Predicate pred,
+                       cuda::stream_ref stream = {}) noexcept;
+
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
    *
@@ -419,6 +472,22 @@ class static_multimap {
                          OutputIt output_begin,
                          cuda::stream_ref stream = {}) const noexcept;
 
+  /**
+   * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap
+   *
+   * @note This function synchronizes the given stream.
+   *
+   * @tparam Input Device accessible input iterator
+   *
+   * @param first Beginning of the sequence of keys to count
+   * @param last End of the sequence of keys to count
+   * @param stream CUDA stream used for count
+   *
+   * @return The sum of total occurrences of all keys in `[first, last)`
+   */
+  template <typename InputIt>
+  size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const;
+
   /**
    * @brief Gets the maximum number of elements the hash map can hold.
    *
@@ -447,6 +516,20 @@ class static_multimap {
    */
   [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept;
 
+  /**
+   * @brief Gets the function used to compare keys for equality
+   *
+   * @return The function used to compare keys for equality
+   */
+  [[nodiscard]] constexpr key_equal key_eq() const noexcept;
+
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Get device ref with operators.
    *
diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh
index 559392db1..b23925b86 100644
--- a/include/cuco/static_multimap_ref.cuh
+++ b/include/cuco/static_multimap_ref.cuh
@@ -74,9 +74,6 @@ class static_multimap_ref
   using impl_type = detail::
     open_addressing_ref_impl<Key, Scope, KeyEqual, ProbingScheme, StorageRef, allows_duplicates>;
 
-  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
-                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
-
   static_assert(
     cuco::is_bitwise_comparable_v<Key>,
     "Key type must have unique object representations or have been explicitly declared as safe for "
@@ -86,6 +83,7 @@ class static_multimap_ref
   using key_type            = Key;                                     ///< Key type
   using mapped_type         = T;                                       ///< Mapped type
   using probing_scheme_type = ProbingScheme;                           ///< Type of probing scheme
+  using hasher              = typename probing_scheme_type::hasher;    ///< Hash function type
   using storage_ref_type    = StorageRef;                              ///< Type of storage ref
   using window_type         = typename storage_ref_type::window_type;  ///< Window type
   using value_type          = typename storage_ref_type::value_type;   ///< Storage element type
@@ -192,6 +190,13 @@ class static_multimap_ref
    */
   [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept;
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Returns a const_iterator to one past the last slot.
    *
diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
index 2158b31c4..22cda307f 100644
--- a/include/cuco/static_multiset.cuh
+++ b/include/cuco/static_multiset.cuh
@@ -100,6 +100,7 @@ class static_multiset {
   /// Non-owning window storage ref type
   using storage_ref_type    = typename impl_type::storage_ref_type;
   using probing_scheme_type = typename impl_type::probing_scheme_type;  ///< Probing scheme type
+  using hasher              = typename probing_scheme_type::hasher;     ///< Hash function type
 
   template <typename... Operators>
   using ref_type = cuco::static_multiset_ref<key_type,
@@ -299,9 +300,11 @@ class static_multiset {
    * @param pred Predicate to test on every element in the range `[stencil, stencil +
    * std::distance(first, last))`
    * @param stream CUDA stream used for the operation
+   *
+   * @return Number of successful insertions
    */
   template <typename InputIt, typename StencilIt, typename Predicate>
-  void insert_if(
+  size_type insert_if(
     InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
 
   /**
@@ -487,7 +490,7 @@ class static_multiset {
    * @return The sum of total occurrences of all keys in `[first, last)`
    */
   template <typename InputIt>
-  size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const noexcept;
+  size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
@@ -570,6 +573,20 @@ class static_multiset {
    */
   [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept;
 
+  /**
+   * @brief Gets the function used to compare keys for equality
+   *
+   * @return The function used to compare keys for equality
+   */
+  [[nodiscard]] constexpr key_equal key_eq() const noexcept;
+
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Get device ref with operators.
    *
diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh
index 9d9ed5307..bf0588f2f 100644
--- a/include/cuco/static_multiset_ref.cuh
+++ b/include/cuco/static_multiset_ref.cuh
@@ -74,6 +74,7 @@ class static_multiset_ref
  public:
   using key_type            = Key;                                     ///< Key Type
   using probing_scheme_type = ProbingScheme;                           ///< Type of probing scheme
+  using hasher              = typename probing_scheme_type::hasher;    ///< Hash function type
   using storage_ref_type    = StorageRef;                              ///< Type of storage ref
   using window_type         = typename storage_ref_type::window_type;  ///< Window type
   using value_type          = typename storage_ref_type::value_type;   ///< Storage element type
@@ -169,6 +170,13 @@ class static_multiset_ref
    */
   [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept;
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Returns a const_iterator to one past the last slot.
    *
diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
index 2ae20ed74..28ade4efb 100644
--- a/include/cuco/static_set.cuh
+++ b/include/cuco/static_set.cuh
@@ -105,6 +105,7 @@ class static_set {
   /// Non-owning window storage ref type
   using storage_ref_type    = typename impl_type::storage_ref_type;
   using probing_scheme_type = typename impl_type::probing_scheme_type;  ///< Probing scheme type
+  using hasher              = typename probing_scheme_type::hasher;     ///< Hash function type
 
   template <typename... Operators>
   using ref_type = cuco::static_set_ref<key_type,
@@ -766,6 +767,20 @@ class static_set {
    */
   [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept;
 
+  /**
+   * @brief Gets the function used to compare keys for equality
+   *
+   * @return The function used to compare keys for equality
+   */
+  [[nodiscard]] constexpr key_equal key_eq() const noexcept;
+
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Get device ref with operators.
    *
diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh
index a7c691b0e..1271cb756 100644
--- a/include/cuco/static_set_ref.cuh
+++ b/include/cuco/static_set_ref.cuh
@@ -74,6 +74,7 @@ class static_set_ref
  public:
   using key_type            = Key;                                     ///< Key Type
   using probing_scheme_type = ProbingScheme;                           ///< Type of probing scheme
+  using hasher              = typename probing_scheme_type::hasher;    ///< Hash function type
   using storage_ref_type    = StorageRef;                              ///< Type of storage ref
   using window_type         = typename storage_ref_type::window_type;  ///< Window type
   using value_type          = typename storage_ref_type::value_type;   ///< Storage element type
@@ -167,6 +168,13 @@ class static_set_ref
    */
   [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept;
 
+  /**
+   * @brief Gets the function(s) used to hash keys
+   *
+   * @return The function(s) used to hash keys
+   */
+  [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept;
+
   /**
    * @brief Returns a const_iterator to one past the last slot.
    *
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 32a6a63a1..d976c007d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST
     static_map/custom_type_test.cu
     static_map/duplicate_keys_test.cu
     static_map/erase_test.cu
+    static_map/for_each_test.cu
     static_map/hash_test.cu
     static_map/heterogeneous_lookup_test.cu
     static_map/insert_and_find_test.cu
@@ -108,6 +109,7 @@ ConfigureTest(STATIC_MULTISET_TEST
 ###################################################################################################
 # - static_multimap tests -------------------------------------------------------------------------
 ConfigureTest(STATIC_MULTIMAP_TEST
+    static_multimap/count_test.cu
     static_multimap/custom_pair_retrieve_test.cu
     static_multimap/custom_type_test.cu
     static_multimap/heterogeneous_lookup_test.cu
diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu
new file mode 100644
index 000000000..1c72a2e58
--- /dev/null
+++ b/tests/static_map/for_each_test.cu
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <test_utils.hpp>
+
+#include <cuco/static_map.cuh>
+
+#include <cuda/atomic>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/tuple.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+using size_type = std::size_t;
+
+template <typename Map>
+void test_for_each(Map& map, size_type num_keys)
+{
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
+  REQUIRE(num_keys % 2 == 0);
+
+  // Insert pairs
+  auto pairs_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<cuco::pair<Key, Value>>([] __device__(auto i) {
+      // use payload as 1 for even keys and 2 for odd keys
+      return cuco::pair<Key, Value>{i, i % 2 + 1};
+    }));
+
+  cuda::stream_ref stream{};
+
+  map.insert(pairs_begin, pairs_begin + num_keys, stream);
+
+  using Allocator = cuco::cuda_allocator<cuda::atomic<size_type, cuda::thread_scope_device>>;
+  cuco::detail::counter_storage<size_type, cuda::thread_scope_device, Allocator> counter_storage(
+    Allocator{});
+  counter_storage.reset(stream);
+
+  // count all the keys which are even and whose payload has value 1
+  map.for_each(
+    [counter = counter_storage.data()] __device__(auto const slot) {
+      auto const& [key, value] = slot;
+      if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); }
+    },
+    stream);
+
+  auto const res = counter_storage.load_to_host(stream);
+  REQUIRE(res == num_keys / 2);
+
+  counter_storage.reset(stream);
+
+  map.for_each(
+    thrust::counting_iterator<size_type>(0),
+    thrust::counting_iterator<size_type>(2 * num_keys),  // test for false-positives
+    [counter = counter_storage.data()] __device__(auto const slot) {
+      auto const& [key, value] = slot;
+      if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); }
+    },
+    stream);
+  REQUIRE(res == num_keys / 2);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map for_each tests",
+  "",
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_type num_keys{100};
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+    cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
+
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    probe,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+  test_for_each(map, num_keys);
+}
diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu
new file mode 100644
index 000000000..8eacfa729
--- /dev/null
+++ b/tests/static_multimap/count_test.cu
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <test_utils.hpp>
+
+#include <cuco/static_multimap.cuh>
+
+#include <cuda/functional>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/generate.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+using size_type = int32_t;
+
+static size_type constexpr multiplicity = 3;
+
+template <typename Map>
+void test_multiplicity_count(Map& map, size_type num_keys)
+{
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
+  auto const keys_begin = thrust::counting_iterator<Key>{0};
+
+  SECTION("Count of empty map should be zero.")
+  {
+    auto const count = map.count(keys_begin, keys_begin + num_keys);
+    REQUIRE(count == 0);
+  }
+
+  SECTION("Count of n unique keys should be n.")
+  {
+    auto const pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<size_type>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [] __device__(auto i) { return cuco::pair<Key, Value>{i, i}; }));
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    auto const count = map.count(keys_begin, keys_begin + num_keys);
+    REQUIRE(count == num_keys);
+  }
+
+  SECTION("Count of n unique keys should be n x multiplicity.")
+  {
+    auto const pairs_begin = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<size_type>(0),
+      cuda::proclaim_return_type<cuco::pair<Key, Value>>(
+        [] __device__(auto i) { return cuco::pair<Key, Value>{i / multiplicity, i}; }));
+    map.insert(pairs_begin, pairs_begin + num_keys * multiplicity);
+
+    auto const count = map.count(keys_begin, keys_begin + num_keys);
+    REQUIRE(count == num_keys * multiplicity);
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_multimap count tests",
+  "",
+  ((typename T, cuco::test::probe_sequence Probe, int CGSize), T, Probe, CGSize),
+  (int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_type num_keys{1'000};
+
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::linear_probing<CGSize, cuco::default_hash_function<T>>,
+    cuco::double_hashing<CGSize, cuco::default_hash_function<T>, cuco::default_hash_function<T>>>;
+
+  auto map = cuco::experimental::static_multimap<T,
+                                                 T,
+                                                 cuco::extent<size_type>,
+                                                 cuda::thread_scope_device,
+                                                 thrust::equal_to<T>,
+                                                 probe,
+                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::storage<2>>{
+    num_keys * multiplicity, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
+
+  test_multiplicity_count(map, num_keys);
+}
diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu
index f2959ac6a..40697f40a 100644
--- a/tests/static_multimap/insert_if_test.cu
+++ b/tests/static_multimap/insert_if_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,51 +26,84 @@
 
 #include <catch2/catch_template_test_macros.hpp>
 
-template <typename Key, typename Map, typename PairIt, typename KeyIt>
-void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size)
+template <typename Map>
+void test_insert_if(Map& map, std::size_t size)
 {
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
   // 50% insertion
-  auto pred_lambda = [] __device__(Key k) { return k % 2 == 0; };
+  auto const pred       = [] __device__(Key k) { return k % 2 == 0; };
+  auto const keys_begin = thrust::counting_iterator<Key>{0};
+
+  SECTION("Count of n / 2 insertions should be n / 2.")
+  {
+    auto const pairs_begin = thrust::make_transform_iterator(
+      keys_begin, cuda::proclaim_return_type<cuco::pair<Key, Value>>([] __device__(auto i) {
+        return cuco::pair<Key, Value>{i, i};
+      }));
+
+    auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred);
+    REQUIRE(num * 2 == size);
 
-  map.insert_if(pair_begin, pair_begin + size, key_begin, pred_lambda);
+    auto const count = map.count(keys_begin, keys_begin + size);
+    REQUIRE(count * 2 == size);
+  }
 
-  auto res = map.get_size();
-  REQUIRE(res * 2 == size);
+  SECTION("Inserting the same element n / 2 times should return n / 2.")
+  {
+    auto const pairs_begin = thrust::constant_iterator<cuco::pair<Key, Value>>{{1, 1}};
 
-  auto num = map.count(key_begin, key_begin + size);
-  REQUIRE(num * 2 == size);
+    auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred);
+    REQUIRE(num * 2 == size);
+
+    auto const count = map.count(keys_begin, keys_begin + size);
+    REQUIRE(count * 2 == size);
+  }
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "Tests of insert_if",
+  "static_multimap insert_if",
   "",
-  ((typename Key, typename Value, cuco::test::probe_sequence Probe), Key, Value, Probe),
-  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing),
-  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing),
-  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing),
-  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing),
-  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing),
-  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing))
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
 {
   constexpr std::size_t num_keys{1'000};
 
-  thrust::device_vector<Key> d_keys(num_keys);
-  thrust::device_vector<cuco::pair<Key, Value>> d_pairs(num_keys);
-
-  thrust::sequence(thrust::device, d_keys.begin(), d_keys.end());
-  // multiplicity = 1
-  thrust::transform(thrust::device,
-                    thrust::counting_iterator<int>(0),
-                    thrust::counting_iterator<int>(num_keys),
-                    d_pairs.begin(),
-                    [] __device__(auto i) { return cuco::pair<Key, Value>{i, i}; });
-
-  using probe =
-    std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
-                       cuco::legacy::linear_probing<1, cuco::default_hash_function<Key>>,
-                       cuco::legacy::double_hashing<8, cuco::default_hash_function<Key>>>;
-
-  cuco::static_multimap<Key, Value, cuda::thread_scope_device, cuco::cuda_allocator<char>, probe>
-    map{num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
-  test_insert_if<Key>(map, d_pairs.begin(), d_keys.begin(), num_keys);
+  using extent_type = cuco::extent<std::size_t>;
+  using probe       = std::conditional_t<
+          Probe == cuco::test::probe_sequence::linear_probing,
+          cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+          cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
+
+  auto map = cuco::experimental::static_multimap<Key,
+                                                 Value,
+                                                 extent_type,
+                                                 cuda::thread_scope_device,
+                                                 thrust::equal_to<Key>,
+                                                 probe,
+                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::storage<2>>{
+    num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+  test_insert_if(map, num_keys);
 }
diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu
index 1872586b7..b987ba660 100644
--- a/tests/static_multiset/for_each_test.cu
+++ b/tests/static_multiset/for_each_test.cu
@@ -45,8 +45,8 @@ CUCO_KERNEL void for_each_check_scalar(Ref ref,
   while (idx < n) {
     auto const& key     = *(first + idx);
     std::size_t matches = 0;
-    ref.for_each(key, [&] __device__(auto const it) {
-      if (ref.key_eq()(key, *it)) { matches++; }
+    ref.for_each(key, [&] __device__(auto const slot) {
+      if (ref.key_eq()(key, slot)) { matches++; }
     });
     if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); }
     idx += loop_stride;
@@ -73,13 +73,13 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref,
       ref.for_each(
         tile,
         key,
-        [&] __device__(auto const it) {
-          if (ref.key_eq()(key, *it)) { thread_matches++; }
+        [&] __device__(auto const slot) {
+          if (ref.key_eq()(key, slot)) { thread_matches++; }
         },
         [] __device__(auto const& group) { group.sync(); });
     } else {
-      ref.for_each(tile, key, [&] __device__(auto const it) {
-        if (ref.key_eq()(key, *it)) { thread_matches++; }
+      ref.for_each(tile, key, [&] __device__(auto const slot) {
+        if (ref.key_eq()(key, slot)) { thread_matches++; }
       });
     }
     auto const tile_matches =
diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu
index 481762e5f..d4cef0201 100644
--- a/tests/static_set/large_input_test.cu
+++ b/tests/static_set/large_input_test.cu
@@ -53,14 +53,27 @@ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys)
     set.contains(keys_begin, keys_end, res_begin);
     REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{}));
   }
+
+  SECTION("All inserted key/value pairs can be retrieved.")
+  {
+    auto output_keys = thrust::device_vector<Key>(num_keys);
+
+    auto const keys_end = set.retrieve_all(output_keys.begin());
+    REQUIRE(static_cast<std::size_t>(std::distance(output_keys.begin(), keys_end)) == num_keys);
+
+    thrust::sort(output_keys.begin(), keys_end);
+
+    REQUIRE(cuco::test::equal(output_keys.begin(),
+                              output_keys.end(),
+                              thrust::counting_iterator<Key>(0),
+                              thrust::equal_to<Key>{}));
+  }
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "Large input",
+  "cuco::static_set large input test",
   "",
   ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize),
-  (int32_t, cuco::test::probe_sequence::double_hashing, 1),
-  (int32_t, cuco::test::probe_sequence::double_hashing, 2),
   (int64_t, cuco::test::probe_sequence::double_hashing, 1),
   (int64_t, cuco::test::probe_sequence::double_hashing, 2))
 {
diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu
index 8cbd8c08d..8fb923fb8 100644
--- a/tests/utility/storage_test.cu
+++ b/tests/utility/storage_test.cu
@@ -37,6 +37,15 @@ TEMPLATE_TEST_CASE_SIG("Storage tests",
   using allocator_type = cuco::cuda_allocator<char>;
   auto allocator       = allocator_type{};
 
+  SECTION("Initialize empty storage is allowed.")
+  {
+    auto s = cuco::
+      aow_storage<cuco::pair<Key, Value>, window_size, cuco::extent<std::size_t>, allocator_type>{
+        cuco::extent<std::size_t>{0}, allocator};
+
+    s.initialize(cuco::pair<Key, Value>{1, 1});
+  }
+
   SECTION("Allocate array of pairs with AoS storage.")
   {
     auto s = cuco::