
Commit

Merge remote-tracking branch 'upstream/dev' into bloom-filter-new
sleeepyjack committed Aug 29, 2024
2 parents 2713345 + b55e38d commit a87cbd2
Showing 40 changed files with 1,371 additions and 224 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/cuda11.8-gcc11/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.5-gcc12/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.5-gcc13/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -16,7 +16,7 @@
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)

if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake
${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
endif()
include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
2 changes: 1 addition & 1 deletion ci/matrix.yml
@@ -23,7 +23,7 @@ gpus:
- 'v100'

# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
devcontainer_version: '24.08'
devcontainer_version: '24.10'

# Each environment below will generate a unique build/test job
# See the "compute-matrix" job in the workflow for how this is parsed and used
5 changes: 5 additions & 0 deletions include/cuco/detail/__config
@@ -17,6 +17,7 @@
#pragma once

#include <nv/target>
#include <cuda/std/version>

#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__)
#error "NVCC version not found"
@@ -32,6 +33,10 @@
#error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
#endif

#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000)
#error "CCCL version 2.5.0 or later is required"
#endif

// WAR for libcudacxx/296
#define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__)

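The guard added here compares CCCL_VERSION against 2005000, which corresponds to CCCL 2.5.0 assuming CCCL's MAJOR * 1'000'000 + MINOR * 1'000 + PATCH version encoding. A minimal standalone sketch of the same check, under that assumption:

// Minimal sketch of the version guard added in include/cuco/detail/__config,
// assuming CCCL encodes its version as MAJOR * 1'000'000 + MINOR * 1'000 + PATCH,
// so 2005000 corresponds to CCCL 2.5.0.
#include <cuda/std/version>  // defines CCCL_VERSION

#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000)
#error "CCCL version 2.5.0 or later is required"
#endif

// Examples under that encoding: 2.4.2 -> 2004002 (rejected),
// 2.5.0 -> 2005000 (accepted), 2.6.1 -> 2006001 (accepted).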
6 changes: 4 additions & 2 deletions include/cuco/detail/bitwise_compare.cuh
@@ -18,6 +18,7 @@

#include <cuco/utility/traits.hpp>

#include <cuda/functional>
#include <cuda/std/bit>

#include <cstdint>
@@ -67,9 +68,10 @@ struct bitwise_compare_impl<8> {
* size of type, or 16, whichever is smaller.
*/
template <typename T>
constexpr std::size_t alignment()
__host__ __device__ constexpr std::size_t alignment()
{
return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T)));
constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(T));
return cuda::std::min(std::size_t{16}, alignment);
}

/**
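The updated alignment() helper is now marked __host__ __device__ and uses cuda::std::min together with cuda::std::bit_ceil. A host-only sketch of the value it computes, using C++20's std::bit_ceil in place of the cuda::std facilities used in the diff:

// Host-only sketch of the value computed by the updated alignment() helper:
// sizeof(T) rounded up to the next power of two, capped at 16 bytes.
#include <algorithm>
#include <bit>
#include <cstddef>

template <typename T>
constexpr std::size_t alignment_sketch()
{
  constexpr std::size_t padded = std::bit_ceil(sizeof(T));  // next power of two >= sizeof(T)
  return std::min(std::size_t{16}, padded);                 // never exceed 16-byte alignment
}

static_assert(alignment_sketch<char>() == 1);
static_assert(alignment_sketch<int>() == 4);
static_assert(alignment_sketch<double[3]>() == 16);  // 24 bytes -> 32, capped at 16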
40 changes: 40 additions & 0 deletions include/cuco/detail/open_addressing/kernels.cuh
@@ -182,6 +182,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
}
}

/**
* @brief For each key in the range [first, first + n), applies the function object `callback_op` to
* the copy of all corresponding matches found in the container.
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `key_type` of the data structure
* @tparam CallbackOp Type of unary callback function object
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of input elements
* @param n Number of input elements
* @param callback_op Function to call on every matched slot found in the container
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename CallbackOp, typename Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
cuco::detail::index_type n,
CallbackOp callback_op,
Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
if constexpr (CGSize == 1) {
ref.for_each(key, callback_op);
} else {
auto const tile =
cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
ref.for_each(tile, key, callback_op);
}
idx += loop_stride;
}
}

/**
* @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data
* structure if `pred` of the corresponding stencil returns true.
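The new for_each_n kernel expects CallbackOp to be a unary, device-callable function object that is invoked with a copy of each matched slot; its return value is ignored. An illustrative callback of that shape, counting matches into a device-side atomic (the struct and its member are made up for this sketch):

#include <cuda/atomic>

// Illustrative CallbackOp for the for_each_n kernel above. The slot type is
// kept generic because it depends on the container; `counter` must point to
// device-accessible memory.
struct count_matches {
  cuda::atomic<unsigned long long, cuda::thread_scope_device>* counter;

  template <typename Slot>
  __device__ void operator()(Slot const&) const
  {
    counter->fetch_add(1);  // any return value would be ignored by the kernel
  }
};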
155 changes: 122 additions & 33 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -27,6 +27,7 @@
#include <cuco/storage.cuh>
#include <cuco/utility/traits.hpp>

#include <cub/device/device_for.cuh>
#include <cub/device/device_select.cuh>
#include <cuda/atomic>
#include <thrust/iterator/constant_iterator.h>
@@ -99,6 +100,7 @@ class open_addressing_impl {

using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type
using probing_scheme_type = ProbingScheme; ///< Probe scheme type
using hasher = typename probing_scheme_type::hasher; ///< Hash function type

/**
* @brief Constructs a statically-sized open addressing data structure with the specified initial
@@ -588,7 +590,7 @@ class open_addressing_impl {
[[nodiscard]] size_type count(InputIt first,
InputIt last,
Ref container_ref,
cuda::stream_ref stream) const noexcept
cuda::stream_ref stream) const
{
auto constexpr is_outer = false;
return this->count<is_outer>(first, last, container_ref, stream);
@@ -638,49 +640,126 @@ class open_addressing_impl {
template <typename OutputIt>
[[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const
{
std::size_t temp_storage_bytes = 0;
using temp_allocator_type =
typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;

cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();

cuco::detail::index_type h_num_out{0};
auto temp_allocator = temp_allocator_type{this->allocator()};
auto d_num_out = reinterpret_cast<size_type*>(
std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
auto const begin = thrust::make_transform_iterator(
thrust::counting_iterator<size_type>{0},
open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};
CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
temp_storage_bytes,
begin,
output_begin,
d_num_out,
this->capacity(),
is_filled,
stream.get()));

// Allocate temporary storage
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
temp_storage_bytes,
begin,
output_begin,
d_num_out,
this->capacity(),
is_filled,
stream.get()));

size_type h_num_out;
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
stream.wait();

// TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
for (cuco::detail::index_type offset = 0;
offset < static_cast<cuco::detail::index_type>(this->capacity());
offset += stride) {
auto const num_items =
std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
auto const begin = thrust::make_transform_iterator(
thrust::counting_iterator{static_cast<size_type>(offset)},
open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

std::size_t temp_storage_bytes = 0;

CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
temp_storage_bytes,
begin,
output_begin + h_num_out,
d_num_out,
static_cast<int32_t>(num_items),
is_filled,
stream.get()));

// Allocate temporary storage
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
temp_storage_bytes,
begin,
output_begin + h_num_out,
d_num_out,
static_cast<int32_t>(num_items),
is_filled,
stream.get()));

size_type temp_count;
CUCO_CUDA_TRY(cudaMemcpyAsync(
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
stream.wait();
h_num_out += temp_count;
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
}

std::allocator_traits<temp_allocator_type>::deallocate(
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);

return output_begin + h_num_out;
}

/**
* @brief Asynchronously applies the given function object `callback_op` to the copy of every
* filled slot in the container
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam CallbackOp Type of unary callback function object
*
* @param callback_op Function to call on every filled slot in the container
* @param stream CUDA stream used for this operation
*/
template <typename CallbackOp>
void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const
{
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

auto storage_ref = this->storage_ref();
auto const op = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) {
for (auto const slot : window_slots) {
if (is_filled(slot)) { callback_op(slot); }
}
};

CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN(
storage_ref.data(), storage_ref.num_windows(), op, stream.get()));
}

/**
* @brief For each key in the range [first, last), asynchronously applies the function object
* `callback_op` to the copy of all corresponding matches found in the container.
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam InputIt Device accessible random access input iterator
* @tparam CallbackOp Type of unary callback function object
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param callback_op Function to call on every match found in the container
* @param container_ref Non-owning device container ref used to access the slot storage
* @param stream CUDA stream used for this operation
*/
template <typename InputIt, typename CallbackOp, typename Ref>
void for_each_async(InputIt first,
InputIt last,
CallbackOp&& callback_op,
Ref container_ref,
cuda::stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::for_each_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, std::forward<CallbackOp>(callback_op), container_ref);
}

/**
* @brief Gets the number of elements in the container
*
@@ -855,6 +934,16 @@ class open_addressing_impl {
return probing_scheme_;
}

/**
* @brief Gets the function(s) used to hash keys
*
* @return The function(s) used to hash keys
*/
[[nodiscard]] constexpr hasher hash_function() const noexcept
{
return this->probing_scheme().hash_function();
}

/**
* @brief Gets the container allocator.
*
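The rewritten retrieve_all works around cub::DeviceSelect::If's 32-bit num_items parameter (see the linked cccl issue 1422) by selecting filled slots in chunks of at most INT32_MAX slots and accumulating the output count across chunks. A standalone sketch of the chunk arithmetic, with a hypothetical capacity:

// Standalone sketch of the chunking used by the rewritten retrieve_all():
// a capacity larger than INT32_MAX slots is processed in stride-sized pieces.
// The capacity below is hypothetical.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main()
{
  std::int64_t constexpr stride   = std::numeric_limits<std::int32_t>::max();  // 2'147'483'647
  std::int64_t constexpr capacity = 5'000'000'000;                             // hypothetical
  std::int64_t total = 0;

  for (std::int64_t offset = 0; offset < capacity; offset += stride) {
    std::int64_t const num_items = std::min(capacity - offset, stride);
    // chunk sizes: 2'147'483'647, 2'147'483'647, 705'032'706
    total += num_items;
  }
  assert(total == capacity);
  return 0;
}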
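The two new for_each_async overloads are the host-side entry points: the first visits a copy of every filled slot via cub::DeviceFor::ForEachCopyN, the second launches the for_each_n kernel shown earlier for a range of query keys. A sketch of how a container built on this implementation might be driven; `Set`, the method names, and the exact signatures are assumptions for illustration only:

#include <cuda/stream_ref>

// Hypothetical caller of the new for_each_async overloads; `Set` stands in for
// a cuco container built on open_addressing_impl. Method names/signatures are
// assumptions, shown only to illustrate the callback shape.
template <typename Set, typename KeyIt>
void visit_matches(Set& set, KeyIt first, KeyIt last, cuda::stream_ref stream)
{
  // Visit a copy of every filled slot in the container.
  set.for_each_async([] __device__(auto const& slot) { /* inspect slot */ }, stream);

  // Visit a copy of every match of each query key in [first, last).
  set.for_each_async(
    first, last, [] __device__(auto const& slot) { /* inspect slot */ }, stream);
}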