
Commit

Merge remote-tracking branch 'upstream/dev' into bloom-filter-new
sleeepyjack committed Aug 29, 2024
2 parents 2713345 + b55e38d commit a87cbd2
Showing 40 changed files with 1,371 additions and 224 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/cuda11.8-gcc11/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.5-gcc12/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/cuda12.5-gcc13/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04",
"image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04",
"hostRequirements": {
"gpu": true
},
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -16,7 +16,7 @@
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)

if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake
${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
endif()
include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
2 changes: 1 addition & 1 deletion ci/matrix.yml
@@ -23,7 +23,7 @@ gpus:
- 'v100'

# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
devcontainer_version: '24.08'
devcontainer_version: '24.10'

# Each environment below will generate a unique build/test job
# See the "compute-matrix" job in the workflow for how this is parsed and used
5 changes: 5 additions & 0 deletions include/cuco/detail/__config
@@ -17,6 +17,7 @@
#pragma once

#include <nv/target>
#include <cuda/std/version>

#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__)
#error "NVCC version not found"
@@ -32,6 +33,10 @@
#error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
#endif

#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000)
#error "CCCL version 2.5.0 or later is required"
#endif

// WAR for libcudacxx/296
#define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__)

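The guard added here compares CCCL_VERSION against 2005000, which corresponds to CCCL 2.5.0 assuming CCCL's MAJOR * 1'000'000 + MINOR * 1'000 + PATCH version encoding. A minimal standalone sketch of the same check, under that assumption:

// Minimal sketch of the version guard added in include/cuco/detail/__config,
// assuming CCCL encodes its version as MAJOR * 1'000'000 + MINOR * 1'000 + PATCH,
// so 2005000 corresponds to CCCL 2.5.0.
#include <cuda/std/version>  // defines CCCL_VERSION

#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000)
#error "CCCL version 2.5.0 or later is required"
#endif

// Examples under that encoding: 2.4.2 -> 2004002 (rejected),
// 2.5.0 -> 2005000 (accepted), 2.6.1 -> 2006001 (accepted).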
6 changes: 4 additions & 2 deletions include/cuco/detail/bitwise_compare.cuh
@@ -18,6 +18,7 @@

#include <cuco/utility/traits.hpp>

#include <cuda/functional>
#include <cuda/std/bit>

#include <cstdint>
@@ -67,9 +68,10 @@ struct bitwise_compare_impl<8> {
* size of type, or 16, whichever is smaller.
*/
template <typename T>
constexpr std::size_t alignment()
__host__ __device__ constexpr std::size_t alignment()
{
return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T)));
constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(T));
return cuda::std::min(std::size_t{16}, alignment);
}

/**
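The updated alignment() helper is now marked __host__ __device__ and uses cuda::std::min together with cuda::std::bit_ceil. A host-only sketch of the value it computes, using C++20's std::bit_ceil in place of the cuda::std facilities used in the diff:

// Host-only sketch of the value computed by the updated alignment() helper:
// sizeof(T) rounded up to the next power of two, capped at 16 bytes.
#include <algorithm>
#include <bit>
#include <cstddef>

template <typename T>
constexpr std::size_t alignment_sketch()
{
  constexpr std::size_t padded = std::bit_ceil(sizeof(T));  // next power of two >= sizeof(T)
  return std::min(std::size_t{16}, padded);                 // never exceed 16-byte alignment
}

static_assert(alignment_sketch<char>() == 1);
static_assert(alignment_sketch<int>() == 4);
static_assert(alignment_sketch<double[3]>() == 16);  // 24 bytes -> 32, capped at 16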
40 changes: 40 additions & 0 deletions include/cuco/detail/open_addressing/kernels.cuh
@@ -182,6 +182,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
}
}

/**
* @brief For each key in the range [first, first + n), applies the function object `callback_op` to
* the copy of all corresponding matches found in the container.
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `key_type` of the data structure
* @tparam CallbackOp Type of unary callback function object
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of input elements
* @param n Number of input elements
* @param callback_op Function to call on every matched slot found in the container
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename CallbackOp, typename Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
cuco::detail::index_type n,
CallbackOp callback_op,
Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key{*(first + idx)};
if constexpr (CGSize == 1) {
ref.for_each(key, callback_op);
} else {
auto const tile =
cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
ref.for_each(tile, key, callback_op);
}
idx += loop_stride;
}
}

/**
* @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data
* structure if `pred` of the corresponding stencil returns true.
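The new for_each_n kernel expects CallbackOp to be a unary, device-callable function object that is invoked with a copy of each matched slot; its return value is ignored. An illustrative callback of that shape, counting matches into a device-side atomic (the struct and its member are made up for this sketch):

#include <cuda/atomic>

// Illustrative CallbackOp for the for_each_n kernel above. The slot type is
// kept generic because it depends on the container; `counter` must point to
// device-accessible memory.
struct count_matches {
  cuda::atomic<unsigned long long, cuda::thread_scope_device>* counter;

  template <typename Slot>
  __device__ void operator()(Slot const&) const
  {
    counter->fetch_add(1);  // any return value would be ignored by the kernel
  }
};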
155 changes: 122 additions & 33 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -27,6 +27,7 @@
#include <cuco/storage.cuh>
#include <cuco/utility/traits.hpp>

#include <cub/device/device_for.cuh>
#include <cub/device/device_select.cuh>
#include <cuda/atomic>
#include <thrust/iterator/constant_iterator.h>
@@ -99,6 +100,7 @@ class open_addressing_impl {

using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type
using probing_scheme_type = ProbingScheme; ///< Probe scheme type
using hasher = typename probing_scheme_type::hasher; ///< Hash function type

/**
* @brief Constructs a statically-sized open addressing data structure with the specified initial
@@ -588,7 +590,7 @@ class open_addressing_impl {
[[nodiscard]] size_type count(InputIt first,
InputIt last,
Ref container_ref,
cuda::stream_ref stream) const noexcept
cuda::stream_ref stream) const
{
auto constexpr is_outer = false;
return this->count<is_outer>(first, last, container_ref, stream);
@@ -638,49 +640,126 @@ class open_addressing_impl {
template <typename OutputIt>
[[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const
{
std::size_t temp_storage_bytes = 0;
using temp_allocator_type =
typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;

cuco::detail::index_type constexpr stride = std::numeric_limits<int32_t>::max();

cuco::detail::index_type h_num_out{0};
auto temp_allocator = temp_allocator_type{this->allocator()};
auto d_num_out = reinterpret_cast<size_type*>(
std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
auto const begin = thrust::make_transform_iterator(
thrust::counting_iterator<size_type>{0},
open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};
CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
temp_storage_bytes,
begin,
output_begin,
d_num_out,
this->capacity(),
is_filled,
stream.get()));

// Allocate temporary storage
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
temp_storage_bytes,
begin,
output_begin,
d_num_out,
this->capacity(),
is_filled,
stream.get()));

size_type h_num_out;
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
stream.wait();

// TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved
for (cuco::detail::index_type offset = 0;
offset < static_cast<cuco::detail::index_type>(this->capacity());
offset += stride) {
auto const num_items =
std::min(static_cast<cuco::detail::index_type>(this->capacity()) - offset, stride);
auto const begin = thrust::make_transform_iterator(
thrust::counting_iterator{static_cast<size_type>(offset)},
open_addressing_ns::detail::get_slot<has_payload, storage_ref_type>(this->storage_ref()));
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

std::size_t temp_storage_bytes = 0;

CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
temp_storage_bytes,
begin,
output_begin + h_num_out,
d_num_out,
static_cast<int32_t>(num_items),
is_filled,
stream.get()));

// Allocate temporary storage
auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
temp_storage_bytes,
begin,
output_begin + h_num_out,
d_num_out,
static_cast<int32_t>(num_items),
is_filled,
stream.get()));

size_type temp_count;
CUCO_CUDA_TRY(cudaMemcpyAsync(
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
stream.wait();
h_num_out += temp_count;
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
}

std::allocator_traits<temp_allocator_type>::deallocate(
temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);

return output_begin + h_num_out;
}

/**
* @brief Asynchronously applies the given function object `callback_op` to the copy of every
* filled slot in the container
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam CallbackOp Type of unary callback function object
*
* @param callback_op Function to call on every filled slot in the container
* @param stream CUDA stream used for this operation
*/
template <typename CallbackOp>
void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const
{
auto const is_filled = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
this->empty_key_sentinel(), this->erased_key_sentinel()};

auto storage_ref = this->storage_ref();
auto const op = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) {
for (auto const slot : window_slots) {
if (is_filled(slot)) { callback_op(slot); }
}
};

CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN(
storage_ref.data(), storage_ref.num_windows(), op, stream.get()));
}

/**
* @brief For each key in the range [first, last), asynchronously applies the function object
* `callback_op` to the copy of all corresponding matches found in the container.
*
* @note The return value of `callback_op`, if any, is ignored.
*
* @tparam InputIt Device accessible random access input iterator
* @tparam CallbackOp Type of unary callback function object
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param callback_op Function to call on every match found in the container
* @param container_ref Non-owning device container ref used to access the slot storage
* @param stream CUDA stream used for this operation
*/
template <typename InputIt, typename CallbackOp, typename Ref>
void for_each_async(InputIt first,
InputIt last,
CallbackOp&& callback_op,
Ref container_ref,
cuda::stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::for_each_n<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
first, num_keys, std::forward<CallbackOp>(callback_op), container_ref);
}

/**
* @brief Gets the number of elements in the container
*
@@ -855,6 +934,16 @@ class open_addressing_impl {
return probing_scheme_;
}

/**
* @brief Gets the function(s) used to hash keys
*
* @return The function(s) used to hash keys
*/
[[nodiscard]] constexpr hasher hash_function() const noexcept
{
return this->probing_scheme().hash_function();
}

/**
* @brief Gets the container allocator.
*
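The rewritten retrieve_all works around cub::DeviceSelect::If's 32-bit num_items parameter (see the linked cccl issue 1422) by selecting filled slots in chunks of at most INT32_MAX slots and accumulating the output count across chunks. A standalone sketch of the chunk arithmetic, with a hypothetical capacity:

// Standalone sketch of the chunking used by the rewritten retrieve_all():
// a capacity larger than INT32_MAX slots is processed in stride-sized pieces.
// The capacity below is hypothetical.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main()
{
  std::int64_t constexpr stride   = std::numeric_limits<std::int32_t>::max();  // 2'147'483'647
  std::int64_t constexpr capacity = 5'000'000'000;                             // hypothetical
  std::int64_t total = 0;

  for (std::int64_t offset = 0; offset < capacity; offset += stride) {
    std::int64_t const num_items = std::min(capacity - offset, stride);
    // chunk sizes: 2'147'483'647, 2'147'483'647, 705'032'706
    total += num_items;
  }
  assert(total == capacity);
  return 0;
}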
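The two new for_each_async overloads are the host-side entry points: the first visits a copy of every filled slot via cub::DeviceFor::ForEachCopyN, the second launches the for_each_n kernel shown earlier for a range of query keys. A sketch of how a container built on this implementation might be driven; `Set`, the method names, and the exact signatures are assumptions for illustration only:

#include <cuda/stream_ref>

// Hypothetical caller of the new for_each_async overloads; `Set` stands in for
// a cuco container built on open_addressing_impl. Method names/signatures are
// assumptions, shown only to illustrate the callback shape.
template <typename Set, typename KeyIt>
void visit_matches(Set& set, KeyIt first, KeyIt last, cuda::stream_ref stream)
{
  // Visit a copy of every filled slot in the container.
  set.for_each_async([] __device__(auto const& slot) { /* inspect slot */ }, stream);

  // Visit a copy of every match of each query key in [first, last).
  set.for_each_async(
    first, last, [] __device__(auto const& slot) { /* inspect slot */ }, stream);
}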