Add device subsets example (#346)

Depends on #349 This PR adds an example demonstrating how to create multiple subsets with one single storage. It includes necessary changes and cleanups that will unblock orc/parquet dictionary encoding (rapidsai/cudf#12261) to use the new map/set data structures. --------- Co-authored-by: Daniel Juenger <[email protected]>
NVIDIA · Sep 26, 2023 · 359f5ae · 359f5ae
1 parent 0cd4da0
commit 359f5ae
Show file tree

Hide file tree

Showing 15 changed files with 403 additions and 55 deletions.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -35,6 +35,7 @@ endfunction(ConfigureExample)
 
 ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
 ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
+ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
 ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
 ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")

diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu
@@ -26,6 +26,14 @@
 #include <cstddef>
 #include <iostream>
 
+/**
+ * @file device_reference_example.cu
+ * @brief Demonstrates usage of the static_set device-side APIs.
+ *
+ * static_set provides a non-owning reference which can be used to interact with
+ * the container from within device code.
+ */
+
 // insert a set of keys into a hash set using one cooperative group for each task
 template <typename SetRef, typename InputIterator>
 __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
@@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
   }
 }
 
-/**
- * @file device_reference_example.cu
- * @brief Demonstrates usage of the static_set device-side APIs.
- *
- * static_set provides a non-owning reference which can be used to interact with
- * the container from within device code.
- *
- */
 int main(void)
 {
   using Key = int;

diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/static_set_ref.cuh>
+#include <cuco/storage.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+
+#include <cooperative_groups.h>
+
+#include <cuda/std/array>
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <numeric>
+
+/**
+ * @file device_subsets_example.cu
+ * @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform
+ * individual operations via device-side ref APIs.
+ *
+ * To optimize memory usage, especially when dealing with expensive data allocation and multiple
+ * hashsets, a practical solution involves employing a single bulk storage for generating subsets.
+ * This eliminates the need for separate memory allocation and deallocation for each container. This
+ * can be achieved by using the lightweight non-owning ref type.
+ *
+ * @note This example is for demonstration purposes only. It is not intended to show the most
+ * performant way to do the example algorithm.
+ */
+
+auto constexpr cg_size     = 8;   ///< A CUDA Cooperative Group of 8 threads to handle each subset
+auto constexpr window_size = 1;   ///< Number of concurrent slots handled by each thread
+auto constexpr N           = 10;  ///< Number of elements to insert and query
+
+using key_type            = int;  ///< Key type
+using probing_scheme_type = cuco::experimental::linear_probing<
+  cg_size,
+  cuco::default_hash_function<key_type>>;  ///< Type controls CG granularity and probing scheme
+                                           ///< (linear probing v.s. double hashing)
+/// Type of bulk allocation storage
+using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
+/// Lightweight non-owning storage ref type
+using storage_ref_type = typename storage_type::ref_type;
+using ref_type         = cuco::experimental::static_set_ref<key_type,
+                                                    cuda::thread_scope_device,
+                                                    thrust::equal_to<key_type>,
+                                                    probing_scheme_type,
+                                                    storage_ref_type>;  ///< Set ref type
+
+/// Sample data to insert and query
+__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
+/// that they never occur in your input data.
+key_type constexpr empty_key_sentinel = -1;
+
+/**
+ * @brief Inserts sample data into subsets by using cooperative group
+ *
+ * Each Cooperative Group creates its own subset and inserts `N` sample data.
+ *
+ * @param set_refs Pointer to the array of subset objects
+ */
+__global__ void insert(ref_type* set_refs)
+{
+  namespace cg = cooperative_groups;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  // Get subset (or CG) index
+  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
+
+  auto raw_set_ref    = *(set_refs + idx);
+  auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert);
+
+  // Insert `N` elemtns into the set with CG insert
+  for (int i = 0; i < N; i++) {
+    insert_set_ref.insert(tile, data[i]);
+  }
+}
+
+/**
+ * @brief All inserted data can be found
+ *
+ * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and
+ * verifies all inserted data can be found.
+ *
+ * @param set_refs Pointer to the array of subset objects
+ */
+__global__ void find(ref_type* set_refs)
+{
+  namespace cg = cooperative_groups;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const idx  = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
+
+  auto raw_set_ref  = *(set_refs + idx);
+  auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find);
+
+  // Result denoting if any of the inserted data is not found
+  __shared__ int result;
+  if (threadIdx.x == 0) { result = 0; }
+  __syncthreads();
+
+  for (int i = 0; i < N; i++) {
+    // Query the set with inserted data
+    auto const found = find_set_ref.find(tile, data[i]);
+    // Record if the inserted data has been found
+    atomicOr(&result, *found != data[i]);
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    // If the result is still 0, all inserted data are found.
+    if (result == 0) { printf("Success! Found all inserted elements.\n"); }
+  }
+}
+
+int main()
+{
+  // Number of subsets to be created
+  auto constexpr num = 16;
+  // Each subset may have a different requested size
+  auto constexpr subset_sizes =
+    std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};
+
+  auto valid_sizes = std::vector<std::size_t>();
+  valid_sizes.reserve(num);
+
+  for (size_t i = 0; i < num; ++i) {
+    valid_sizes.emplace_back(
+      static_cast<std::size_t>(cuco::experimental::make_window_extent<ref_type>(subset_sizes[i])));
+  }
+
+  std::vector<std::size_t> offsets(num + 1, 0);
+
+  // prefix sum to compute offsets and total number of windows
+  std::size_t current_sum = 0;
+  for (std::size_t i = 0; i < valid_sizes.size(); ++i) {
+    current_sum += valid_sizes[i];
+    offsets[i + 1] = current_sum;
+  }
+
+  // total number of windows is located at the back of the offsets array
+  auto const total_num_windows = offsets.back();
+
+  // Create a single bulk storage used by all subsets
+  auto set_storage = storage_type{total_num_windows};
+  // Initializes the storage with the given sentinel
+  set_storage.initialize(empty_key_sentinel);
+
+  std::vector<ref_type> set_refs;
+
+  // create subsets
+  for (std::size_t i = 0; i < num; ++i) {
+    storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]};
+    set_refs.emplace_back(
+      ref_type{cuco::empty_key<key_type>{empty_key_sentinel}, {}, {}, storage_ref});
+  }
+
+  thrust::device_vector<ref_type> d_set_refs(set_refs);
+
+  // Insert sample data
+  insert<<<1, 128>>>(d_set_refs.data().get());
+  // Find all inserted data
+  find<<<1, 128>>>(d_set_refs.data().get());
+
+  return 0;
+}
diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include <cuco/detail/storage/aow_storage_base.cuh>
-
 #include <cuco/cuda_stream_ref.hpp>
+#include <cuco/detail/storage/aow_storage_base.cuh>
 #include <cuco/extent.cuh>
+#include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/array>
 
@@ -47,7 +47,10 @@ class aow_storage_ref;
  * @tparam Extent Type of extent denoting number of windows
  * @tparam Allocator Type of allocator used for device storage (de)allocation
  */
-template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+template <typename T,
+          int32_t WindowSize,
+          typename Extent    = cuco::experimental::extent<std::size_t>,
+          typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
 class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
  public:
   using base_type = detail::aow_storage_base<T, WindowSize, Extent>;  ///< AoW base class type
@@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param size Number of windows to (de)allocate
    * @param allocator Allocator used for (de)allocating device storage
    */
-  explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
+  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;
 
   aow_storage(aow_storage&&) = default;  ///< Move constructor
   /**
@@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize(value_type key, cuda_stream_ref stream) noexcept;
+  void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
+   *
+   * @param key Key to which all keys in `slots` are initialized
+   * @param stream Stream used for executing the kernel
+   */
+  void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;
 
  private:
   allocator_type allocator_;            ///< Allocator used to (de)allocate windows
@@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
  * @tparam WindowSize Number of slots in each window
  * @tparam Extent Type of extent denoting storage capacity
  */
-template <typename T, int32_t WindowSize, typename Extent>
+template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
 class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
  public:
   using base_type = detail::aow_storage_base<T, WindowSize, Extent>;  ///< AoW base class type

diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl
@@ -27,13 +27,10 @@
 namespace cuco {
 namespace experimental {
 
-template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N = dynamic_extent>
+template <typename SizeType, std::size_t N>
 struct window_extent {
   using value_type = SizeType;  ///< Extent value type
 
-  static auto constexpr cg_size     = CGSize;
-  static auto constexpr window_size = WindowSize;
-
   __host__ __device__ constexpr value_type value() const noexcept { return N; }
   __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); }
 
@@ -45,15 +42,11 @@ struct window_extent {
   friend auto constexpr make_window_extent(extent<SizeType_, N_> ext);
 };
 
-template <int32_t CGSize, int32_t WindowSize, typename SizeType>
-struct window_extent<CGSize, WindowSize, SizeType, dynamic_extent>
-  : cuco::utility::fast_int<SizeType> {
+template <typename SizeType>
+struct window_extent<SizeType, dynamic_extent> : cuco::utility::fast_int<SizeType> {
   using value_type =
     typename cuco::utility::fast_int<SizeType>::fast_int::value_type;  ///< Extent value type
 
-  static auto constexpr cg_size     = CGSize;
-  static auto constexpr window_size = WindowSize;
-
  private:
   using cuco::utility::fast_int<SizeType>::fast_int;
 
@@ -67,10 +60,10 @@ template <typename Container, typename SizeType, std::size_t N>
   return make_window_extent<Container::cg_size, Container::window_size>(ext);
 }
 
-template <typename Container>
-[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
+template <typename Container, typename SizeType>
+[[nodiscard]] auto constexpr make_window_extent(SizeType size)
 {
-  return make_window_extent<Container::cg_size, Container::window_size>(size);
+  return make_window_extent<Container::cg_size, Container::window_size>(extent<SizeType>{size});
 }
 
 template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
@@ -86,15 +79,13 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
   if (size > max_value) { CUCO_FAIL("Invalid input extent"); }
 
   if constexpr (N == dynamic_extent) {
-    return window_extent<CGSize, WindowSize, SizeType>{static_cast<SizeType>(
+    return window_extent<SizeType>{static_cast<SizeType>(
       *cuco::detail::lower_bound(
         cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast<uint64_t>(size)) *
       CGSize)};
   }
   if constexpr (N != dynamic_extent) {
-    return window_extent<CGSize,
-                         WindowSize,
-                         SizeType,
+    return window_extent<SizeType,
                          static_cast<std::size_t>(
                            *cuco::detail::lower_bound(cuco::detail::primes.begin(),
                                                       cuco::detail::primes.end(),
@@ -103,10 +94,10 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
   }
 }
 
-template <int32_t CGSize, int32_t WindowSize>
-[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
+template <int32_t CGSize, int32_t WindowSize, typename SizeType>
+[[nodiscard]] auto constexpr make_window_extent(SizeType size)
 {
-  return static_cast<std::size_t>(make_window_extent<CGSize, WindowSize>(extent{size}));
+  return make_window_extent<CGSize, WindowSize>(extent<SizeType>{size});
 }
 
 namespace detail {
@@ -115,8 +106,8 @@ template <typename...>
 struct is_window_extent : std::false_type {
 };
 
-template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
-struct is_window_extent<window_extent<CGSize, WindowSize, SizeType, N>> : std::true_type {
+template <typename SizeType, std::size_t N>
+struct is_window_extent<window_extent<SizeType, N>> : std::true_type {
 };
 
 template <typename T>

diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh
@@ -141,11 +141,7 @@ class open_addressing_impl {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream) noexcept
-  {
-    this->clear_async(stream);
-    stream.synchronize();
-  }
+  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -155,7 +151,7 @@ class open_addressing_impl {
    */
   void clear_async(cuda_stream_ref stream) noexcept
   {
-    storage_.initialize(empty_slot_sentinel_, stream);
+    storage_.initialize_async(empty_slot_sentinel_, stream);
   }
 
   /**