Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into add-erase
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel committed Oct 4, 2023
2 parents 4b86ede + b4657fd commit f239a99
Show file tree
Hide file tree
Showing 26 changed files with 943 additions and 176 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endfunction(ConfigureExample)

ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Expand Down
16 changes: 8 additions & 8 deletions examples/static_set/device_ref_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
#include <cstddef>
#include <iostream>

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*/

// insert a set of keys into a hash set using one cooperative group for each task
template <typename SetRef, typename InputIterator>
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
Expand Down Expand Up @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
}
}

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*
*/
int main(void)
{
using Key = int;
Expand Down
183 changes: 183 additions & 0 deletions examples/static_set/device_subsets_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/static_set_ref.cuh>
#include <cuco/storage.cuh>

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

#include <cooperative_groups.h>

#include <cuda/std/array>

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>

/**
* @file device_subsets_example.cu
* @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform
* individual operations via device-side ref APIs.
*
* To optimize memory usage, especially when dealing with expensive data allocation and multiple
* hashsets, a practical solution involves employing a single bulk storage for generating subsets.
* This eliminates the need for separate memory allocation and deallocation for each container. This
* can be achieved by using the lightweight non-owning ref type.
*
* @note This example is for demonstration purposes only. It is not intended to show the most
* performant way to do the example algorithm.
*/

// --- Example-wide compile-time configuration -------------------------------

auto constexpr cg_size     = 8;   ///< A CUDA Cooperative Group of 8 threads to handle each subset
auto constexpr window_size = 1;   ///< Number of concurrent slots handled by each thread
auto constexpr N           = 10;  ///< Number of elements to insert and query

using key_type = int;  ///< Key type
using probing_scheme_type = cuco::experimental::linear_probing<
  cg_size,
  cuco::default_hash_function<key_type>>;  ///< Type controls CG granularity and probing scheme
                                           ///< (linear probing vs. double hashing)
/// Type of bulk allocation storage (array-of-windows layout, shared by all subsets)
using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
/// Lightweight non-owning storage ref type; views a slice of the bulk storage
using storage_ref_type = typename storage_type::ref_type;
using ref_type = cuco::experimental::static_set_ref<key_type,
                                                    cuda::thread_scope_device,
                                                    thrust::equal_to<key_type>,
                                                    probing_scheme_type,
                                                    storage_ref_type>;  ///< Set ref type

/// Sample data to insert and query (lives in device memory so kernels can read it directly)
__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;

/**
 * @brief Inserts the sample data into the subsets using cooperative groups
 *
 * Each cooperative group of `cg_size` threads owns one subset and inserts all `N`
 * sample elements into it.
 *
 * @param set_refs Pointer to the array of subset ref objects
 */
__global__ void insert(ref_type* set_refs)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // Map the flat thread id onto a subset: one subset per cooperative group
  auto const subset_idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Copy the raw ref, then augment it with the `insert` operator before use
  auto base_ref   = set_refs[subset_idx];
  auto insert_ref = std::move(base_ref).with(cuco::experimental::insert);

  // Cooperatively insert all `N` sample elements into this group's subset
  for (auto i = 0; i < N; ++i) {
    insert_ref.insert(tile, data[i]);
  }
}

/**
 * @brief Verifies that all inserted data can be found
 *
 * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and
 * verifies all inserted data can be found. Prints a success message from thread 0 of each
 * block when every queried key was present.
 *
 * @param set_refs Pointer to the array of subset objects
 */
__global__ void find(ref_type* set_refs)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // Subset (or CG) index: one subset per cooperative group
  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Copy the raw ref and augment it with the `find` operator before use
  auto raw_set_ref  = *(set_refs + idx);
  auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find);

  // Result denoting if any of the inserted data is not found; shared across the block so
  // one flag aggregates the outcome of every group's queries
  __shared__ int result;
  if (threadIdx.x == 0) { result = 0; }
  __syncthreads();  // all threads must observe result == 0 before querying

  for (int i = 0; i < N; i++) {
    // Query the set with inserted data
    auto const found = find_set_ref.find(tile, data[i]);
    // Record if the inserted data has been found; atomicOr avoids races between groups.
    // NOTE(review): `*found` assumes every key is present (inserted by the prior kernel);
    // dereferencing a not-found iterator would be invalid.
    atomicOr(&result, *found != data[i]);
  }
  __syncthreads();  // ensure all queries finished before reading `result`

  if (threadIdx.x == 0) {
    // If the result is still 0, all inserted data are found.
    if (result == 0) { printf("Success! Found all inserted elements.\n"); }
  }
}

/**
 * @brief Creates a single bulk storage, carves it into subsets, and verifies that
 * insert/find round-trips succeed on every subset.
 *
 * @return 0 on success, 1 if a CUDA error occurred
 */
int main()
{
  // Number of subsets to be created
  auto constexpr num = 16;
  // Each subset may have a different requested size
  auto constexpr subset_sizes =
    std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};

  // Map each requested size to a valid window extent for the chosen probing scheme
  auto valid_sizes = std::vector<std::size_t>();
  valid_sizes.reserve(num);

  for (std::size_t i = 0; i < num; ++i) {
    valid_sizes.emplace_back(
      static_cast<std::size_t>(cuco::experimental::make_window_extent<ref_type>(subset_sizes[i])));
  }

  // Prefix sum of the valid sizes: offsets[i] is the first window of subset i and
  // offsets[num] is the total number of windows
  std::vector<std::size_t> offsets(num + 1, 0);
  std::partial_sum(valid_sizes.begin(), valid_sizes.end(), offsets.begin() + 1);

  // total number of windows is located at the back of the offsets array
  auto const total_num_windows = offsets.back();

  // Create a single bulk storage used by all subsets
  auto set_storage = storage_type{total_num_windows};
  // Initializes the storage with the given sentinel
  set_storage.initialize(empty_key_sentinel);

  std::vector<ref_type> set_refs;

  // Create one lightweight non-owning ref per subset, each viewing its own slice of
  // the bulk storage
  for (std::size_t i = 0; i < num; ++i) {
    storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]};
    set_refs.emplace_back(
      ref_type{cuco::empty_key<key_type>{empty_key_sentinel}, {}, {}, storage_ref});
  }

  thrust::device_vector<ref_type> d_set_refs(set_refs);

  // Insert sample data
  insert<<<1, 128>>>(d_set_refs.data().get());
  // Find all inserted data
  find<<<1, 128>>>(d_set_refs.data().get());

  // Kernel launches are asynchronous: synchronize so that execution errors are surfaced
  // and the device-side printf buffer is guaranteed to be flushed before exiting
  auto const status = cudaDeviceSynchronize();
  if (status != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << '\n';
    return 1;
  }

  return 0;
}
23 changes: 17 additions & 6 deletions include/cuco/aow_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#pragma once

#include <cuco/detail/storage/aow_storage_base.cuh>

#include <cuco/cuda_stream_ref.hpp>
#include <cuco/detail/storage/aow_storage_base.cuh>
#include <cuco/extent.cuh>
#include <cuco/utility/allocator.hpp>

#include <cuda/std/array>

Expand Down Expand Up @@ -47,7 +47,10 @@ class aow_storage_ref;
* @tparam Extent Type of extent denoting number of windows
* @tparam Allocator Type of allocator used for device storage (de)allocation
*/
template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
template <typename T,
int32_t WindowSize,
typename Extent = cuco::experimental::extent<std::size_t>,
typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down Expand Up @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param size Number of windows to (de)allocate
* @param allocator Allocator used for (de)allocating device storage
*/
explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;

aow_storage(aow_storage&&) = default; ///< Move constructor
/**
Expand Down Expand Up @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize(value_type key, cuda_stream_ref stream) noexcept;
void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;

/**
* @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
*
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;

private:
allocator_type allocator_; ///< Allocator used to (de)allocate windows
Expand All @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @tparam WindowSize Number of slots in each window
* @tparam Extent Type of extent denoting storage capacity
*/
template <typename T, int32_t WindowSize, typename Extent>
template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down
22 changes: 12 additions & 10 deletions include/cuco/detail/common_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace experimental {
namespace detail {
Expand All @@ -37,7 +39,7 @@ namespace detail {
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
Expand All @@ -55,12 +57,12 @@ namespace detail {
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename InputIt,
typename StencilIt,
typename Predicate,
typename AtomicT,
typename Ref>
__global__ void insert_if_n(InputIterator first,
__global__ void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
Expand All @@ -76,7 +78,7 @@ __global__ void insert_if_n(InputIterator first,

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_element{*(first + idx)};
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
if constexpr (CGSize == 1) {
if (ref.insert(insert_element)) { thread_num_successes++; };
} else {
Expand Down Expand Up @@ -106,7 +108,7 @@ __global__ void insert_if_n(InputIterator first,
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
Expand All @@ -122,19 +124,19 @@ __global__ void insert_if_n(InputIterator first,
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename InputIt,
typename StencilIt,
typename Predicate,
typename Ref>
__global__ void insert_if_n(
InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_element{*(first + idx)};
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
if constexpr (CGSize == 1) {
ref.insert(insert_element);
} else {
Expand Down Expand Up @@ -230,7 +232,7 @@ __global__ void contains_if_n(InputIt first,
while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if constexpr (CGSize == 1) {
if (idx < n) {
auto const key = *(first + idx);
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
Expand All @@ -244,7 +246,7 @@ __global__ void contains_if_n(InputIt first,
} else {
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
if (idx < n) {
auto const key = *(first + idx);
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
}
Expand Down
Loading

0 comments on commit f239a99

Please sign in to comment.