diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 332439947..606759c73 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index bf3e725af..3f562f865 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index f1f303243..01bbe927b 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f1f303243..01bbe927b 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b9e97e83..8de18db73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) diff --git a/ci/matrix.yml b/ci/matrix.yml index b4dd33ca2..471a4fb0d 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -23,7 +23,7 @@ gpus: - 'v100' # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers -devcontainer_version: '24.08' +devcontainer_version: '24.10' # Each environment below will generate a unique build/test job # See the "compute-matrix" job in the workflow for how this is parsed and used diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 6d4bf7339..24c7758ab 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -17,6 +17,7 @@ #pragma once #include +#include #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) #error "NVCC version not found" @@ -32,6 +33,10 @@ #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)" #endif +#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000) +#error "CCCL version 2.5.0 or later is required" +#endif + // WAR for libcudacxx/296 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index a8a5a69d1..d828bbf86 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -18,6 +18,7 @@ #include +#include #include #include @@ -67,9 +68,10 @@ struct bitwise_compare_impl<8> { * size of type, or 16, whichever is smaller. */ template -constexpr std::size_t alignment() +__host__ __device__ constexpr std::size_t alignment() { - return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T))); + constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(T)); + return cuda::std::min(std::size_t{16}, alignment); } /** diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 266335a50..24fce230c 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -182,6 +182,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, } } +/** + * @brief For each key in the range [first, first + n), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the `key_type` of the data structure + * @tparam CallbackOp Type of unary callback function object + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param callback_op Function to call on every matched slot found in the container + * @param ref Non-owning container device ref used to access the slot storage + */ +template +CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first, + cuco::detail::index_type n, + CallbackOp callback_op, + Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename std::iterator_traits::value_type const& key{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.for_each(key, callback_op); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.for_each(tile, key, callback_op); + } + idx += loop_stride; + } +} + /** * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data * structure if `pred` of the corresponding stencil returns true. diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 9dabff990..a8eff9036 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -99,6 +100,7 @@ class open_addressing_impl { using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type using probing_scheme_type = ProbingScheme; ///< Probe scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type /** * @brief Constructs a statically-sized open addressing data structure with the specified initial @@ -588,7 +590,7 @@ class open_addressing_impl { [[nodiscard]] size_type count(InputIt first, InputIt last, Ref container_ref, - cuda::stream_ref stream) const noexcept + cuda::stream_ref stream) const { auto constexpr is_outer = false; return this->count(first, last, container_ref, stream); @@ -638,49 +640,126 @@ class open_addressing_impl { template [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const { - std::size_t temp_storage_bytes = 0; using temp_allocator_type = typename std::allocator_traits::template rebind_alloc; + + cuco::detail::index_type constexpr stride = std::numeric_limits::max(); + + cuco::detail::index_type h_num_out{0}; auto temp_allocator = temp_allocator_type{this->allocator()}; auto d_num_out = reinterpret_cast( std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); - auto const begin = thrust::make_transform_iterator( - thrust::counting_iterator{0}, - open_addressing_ns::detail::get_slot(this->storage_ref())); - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ - this->empty_key_sentinel(), this->erased_key_sentinel()}; - CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr, - temp_storage_bytes, - begin, - output_begin, - d_num_out, - this->capacity(), - is_filled, - stream.get())); - - // Allocate temporary storage - auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); - - CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, - temp_storage_bytes, - begin, - output_begin, - d_num_out, - this->capacity(), - is_filled, - stream.get())); - - size_type h_num_out; - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); - stream.wait(); + + // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved + for (cuco::detail::index_type offset = 0; + offset < static_cast(this->capacity()); + offset += stride) { + auto const num_items = + std::min(static_cast(this->capacity()) - offset, stride); + auto const begin = thrust::make_transform_iterator( + thrust::counting_iterator{static_cast(offset)}, + open_addressing_ns::detail::get_slot(this->storage_ref())); + auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + this->empty_key_sentinel(), this->erased_key_sentinel()}; + + std::size_t temp_storage_bytes = 0; + + CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr, + temp_storage_bytes, + begin, + output_begin + h_num_out, + d_num_out, + static_cast(num_items), + is_filled, + stream.get())); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + begin, + output_begin + h_num_out, + d_num_out, + static_cast(num_items), + is_filled, + stream.get())); + + size_type temp_count; + CUCO_CUDA_TRY(cudaMemcpyAsync( + &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); + h_num_out += temp_count; + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + } + std::allocator_traits::deallocate( temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); - temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); return output_begin + h_num_out; } + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to call on every filled slot in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const + { + auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + this->empty_key_sentinel(), this->erased_key_sentinel()}; + + auto storage_ref = this->storage_ref(); + auto const op = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) { + for (auto const slot : window_slots) { + if (is_filled(slot)) { callback_op(slot); } + } + }; + + CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( + storage_ref.data(), storage_ref.num_windows(), op, stream.get())); + } + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every match found in the container + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + Ref container_ref, + cuda::stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::for_each_n + <<>>( + first, num_keys, std::forward(callback_op), container_ref); + } + /** * @brief Gets the number of elements in the container * @@ -855,6 +934,16 @@ class open_addressing_impl { return probing_scheme_; } + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept + { + return this->probing_scheme().hash_function(); + } + /** * @brief Gets the container allocator. * diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 5396f318a..f4c20f829 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -109,6 +109,7 @@ class open_addressing_ref_impl { public: using key_type = Key; ///< Key type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -233,11 +234,22 @@ class open_addressing_ref_impl { * * @return The probing scheme used for the container */ - [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept + [[nodiscard]] __host__ __device__ constexpr probing_scheme_type const& probing_scheme() + const noexcept { return probing_scheme_; } + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept + { + return this->probing_scheme().hash_function(); + } + /** * @brief Gets the non-owning storage ref. * @@ -743,7 +755,7 @@ class open_addressing_ref_impl { * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns * false. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -776,7 +788,7 @@ class open_addressing_ref_impl { * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns * false. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for @@ -884,7 +896,7 @@ class open_addressing_ref_impl { * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -922,7 +934,7 @@ class open_addressing_ref_impl { * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform this operation * @param key The key to search for @@ -966,17 +978,16 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note The return value of `callback_op`, if any, is ignored. * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -995,7 +1006,7 @@ class open_addressing_ref_impl { return; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: continue; @@ -1006,24 +1017,23 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. - * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, * each thread with a match will call the callback with its associated element. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note Synchronizing `group` within `callback_op` is undefined behavior. * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, @@ -1045,7 +1055,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { @@ -1060,12 +1070,9 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key and can additionally perform work that requires synchronizing the Cooperative Group - * performing this operation. - * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @brief Applies the function object `callback_op` to the copy of every slot in the container + * with key equivalent to the probe key and can additionally perform work that requires + * synchronizing the Cooperative Group performing this operation. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1073,18 +1080,20 @@ class open_addressing_ref_impl { * * @note Synchronizing `group` within `callback_op` is undefined behavior. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note The `sync_op` function can be used to perform work that requires synchronizing threads in * `group` inbetween probing steps, where the number of probing steps performed between * synchronization points is capped by `window_size * cg_size`. The functor will be called right * after the current probing window has been traversed. * - * @tparam ProbeKey Input type which is convertible to 'key_type' - * @tparam CallbackOp Unary callback functor or device lambda - * @tparam SyncOp Functor or device lambda which accepts the current `group` object + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * @tparam SyncOp Type of function object which accepts the current `group` object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot * @param sync_op Function that is allowed to synchronize `group` inbetween probing windows */ template @@ -1108,7 +1117,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { diff --git a/include/cuco/detail/pair/helpers.cuh b/include/cuco/detail/pair/helpers.cuh index 3cc343687..aa040ab94 100644 --- a/include/cuco/detail/pair/helpers.cuh +++ b/include/cuco/detail/pair/helpers.cuh @@ -19,6 +19,8 @@ #include #include +#include + namespace cuco::detail { /** @@ -29,7 +31,8 @@ namespace cuco::detail { template __host__ __device__ constexpr std::size_t pair_alignment() { - return cuda::std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second))); + constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(First) + sizeof(Second)); + return cuda::std::min(std::size_t{16}, alignment); } /** diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 61670d7be..047ec7987 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -127,6 +127,13 @@ __host__ __device__ constexpr auto linear_probing::operator()( upper_bound}; } +template +__host__ __device__ constexpr linear_probing::hasher +linear_probing::hash_function() const noexcept +{ + return hash_; +} + template __host__ __device__ constexpr double_hashing::double_hashing( Hash1 const& hash1, Hash2 const& hash2) @@ -192,4 +199,12 @@ __host__ __device__ constexpr auto double_hashing::operato cg_size), upper_bound}; // TODO use fast_int operator } + +template +__host__ __device__ constexpr double_hashing::hasher +double_hashing::hash_function() const noexcept +{ + return {hash1_, hash2_}; +} + } // namespace cuco diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index c05d0b28b..bf2aced70 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -202,8 +202,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( auto shared_map = SharedMapRefType{cuco::empty_key{ref.empty_key_sentinel()}, cuco::empty_value{ref.empty_value_sentinel()}, - {}, - {}, + ref.key_eq(), + ref.probing_scheme(), {}, storage}; auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply); diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 86b75507d..e2915e1fd 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -347,19 +347,11 @@ template -template +template void static_map:: insert_or_apply_async( InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream) noexcept { - using shared_map_type = cuco::static_map>; auto constexpr has_init = true; static_map_ns::detail::dispatch_insert_or_apply( first, last, init, op, ref(op::insert_or_apply), stream); @@ -499,6 +491,70 @@ void static_mapfind_async(first, last, output_begin, ref(op::find), stream); } +template +template +void static_map::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); +} + +template +template +void static_map::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const noexcept +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); +} + template :: return impl_->erased_key_sentinel(); } +template +constexpr static_map::key_equal +static_map::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + +template +constexpr static_map::hasher +static_map::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template return this->impl_.key_eq(); } +template +__host__ __device__ constexpr static_map_ref::hasher +static_map_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template +class operator_impl< + op::for_each_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * + * @param key The key to search for + * @param callback_op Function to apply to the copy of the matched key-value pair + */ + template + __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(key, std::forward(callback_op)); + } + + /** + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. + * + * @note This function uses cooperative group semantics, meaning that any thread may call the + * callback if it finds a matching key-value pair. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @note Synchronizing `group` within `callback_op` is undefined behavior. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param callback_op Function to apply to the copy of the matched key-value pair + */ + template + __device__ void for_each(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(group, key, std::forward(callback_op)); + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 3d97a2734..9d0bdbcff 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -152,11 +152,11 @@ template template -static_multimap::size_type -static_multimap::insert( +void static_multimap::insert( InputIt first, InputIt last, cuda::stream_ref stream) { - return impl_->insert(first, last, ref(op::insert), stream); + this->insert_async(first, last, stream); + stream.wait(); } template insert_async(first, last, ref(op::insert), stream); } +template +template +static_multimap::size_type +static_multimap::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_multimap:: + insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + template contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } +template +template +static_multimap::size_type +static_multimap::count( + InputIt first, InputIt last, cuda::stream_ref stream) const +{ + return impl_->count(first, last, ref(op::count), stream); +} + template erased_key_sentinel(); } +template +constexpr static_multimap:: + key_equal + static_multimap::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + +template +constexpr static_multimap:: + hasher + static_multimap:: + hash_function() const noexcept +{ + return impl_->hash_function(); +} + template +__host__ __device__ constexpr static_multimap_ref::hasher +static_multimap_ref:: + hash_function() const noexcept +{ + return impl_.hash_function(); +} + template +class operator_impl< + op::count_tag, + static_multimap_ref> { + using base_type = static_multimap_ref; + using ref_type = + static_multimap_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Counts the occurrence of a given key contained in multimap + * + * @tparam ProbeKey Input type + * + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + __device__ size_type count(ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.count(key); + } + + /** + * @brief Counts the occurrence of a given key contained in multimap + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group count + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + __device__ size_type count(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.count(group, key); + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index b102540a3..cf31f2571 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -152,11 +152,11 @@ template template -void static_multiset::insert_if( +static_multiset::size_type +static_multiset::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { - this->insert_if_async(first, last, stencil, pred, stream); - stream.wait(); + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } template static_multiset::size_type static_multiset::count( - InputIt first, InputIt last, cuda::stream_ref stream) const noexcept + InputIt first, InputIt last, cuda::stream_ref stream) const { return impl_->count(first, last, ref(op::count), stream); } @@ -393,6 +393,35 @@ static_multiset return impl_->erased_key_sentinel(); } +template +constexpr static_multiset:: + key_equal + static_multiset::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + +template +constexpr static_multiset::hasher +static_multiset::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template impl_.key_eq(); } +template +__host__ __device__ constexpr static_multiset_ref::hasher +static_multiset_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template ::era return impl_->erased_key_sentinel(); } +template +constexpr static_set::key_equal +static_set::key_eq() const noexcept +{ + return impl_->key_eq(); +} + +template +constexpr static_set::hasher +static_set::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template ::k return this->impl_.key_eq(); } +template +__host__ __device__ constexpr static_set_ref::hasher +static_set_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template void aow_storage::initialize_async( value_type key, cuda::stream_ref stream) noexcept { + if (this->num_windows() == 0) { return; } + auto constexpr cg_size = 1; auto constexpr stride = 4; auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 21e4df759..c7b8a3f8b 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -15,6 +15,7 @@ #pragma once +#include #include #include diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index 5591cb74a..4885ad63d 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -19,6 +19,9 @@ #include #include +#include +#include + #include namespace cuco { @@ -35,10 +38,12 @@ namespace cuco { */ template class linear_probing : private detail::probing_scheme_base { - public: using probing_scheme_base_type = detail::probing_scheme_base; ///< The base probe scheme type + + public: using probing_scheme_base_type::cg_size; + using hasher = Hash; ///< Hash function type /** *@brief Constructs linear probing scheme with the hasher callable. @@ -91,6 +96,13 @@ class linear_probing : private detail::probing_scheme_base { ProbeKey const& probe_key, Extent upper_bound) const noexcept; + /** + * @brief Gets the function used to hash keys + * + * @return The function used to hash keys + */ + __host__ __device__ constexpr hasher hash_function() const noexcept; + private: Hash hash_; }; @@ -111,10 +123,12 @@ class linear_probing : private detail::probing_scheme_base { */ template class double_hashing : private detail::probing_scheme_base { - public: using probing_scheme_base_type = detail::probing_scheme_base; ///< The base probe scheme type + + public: using probing_scheme_base_type::cg_size; + using hasher = cuda::std::tuple; ///< Hash function type /** *@brief Constructs double hashing probing scheme with the two hasher callables. @@ -193,11 +207,36 @@ class double_hashing : private detail::probing_scheme_base { ProbeKey const& probe_key, Extent upper_bound) const noexcept; + /** + * @brief Gets the functions used to hash keys + * + * @return The functions used to hash keys + */ + __host__ __device__ constexpr hasher hash_function() const noexcept; + private: Hash1 hash1_; Hash2 hash2_; }; +/** + * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not + * + * @tparam T Input probing scheme type + */ +template +struct is_double_hashing : cuda::std::false_type {}; + +/** + * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not + * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +struct is_double_hashing> : cuda::std::true_type {}; + } // namespace cuco #include diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 9c87e45a9..fc7dc088d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -37,6 +37,7 @@ #include #include +#include #include namespace cuco { @@ -126,6 +127,7 @@ class static_map { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using mapped_type = T; ///< Payload type template @@ -313,6 +315,61 @@ class static_map { template void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream = {}) noexcept; + /** * @brief Asynchronously inserts all elements in the range `[first, last)`. * @@ -369,61 +426,6 @@ class static_map { InsertedIt inserted_begin, cuda::stream_ref stream = {}); - /** - * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns - * true. - * - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * @note This function synchronizes the given stream and returns the number of successful - * insertions. For asynchronous execution use `insert_if_async`. - * - * @tparam InputIt Device accessible random access iterator whose `value_type` is - * convertible to the container's `value_type` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param stream CUDA stream used for the operation - * - * @return Number of successful insertions - */ - template - size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); - - /** - * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding - * stencil returns true. - * - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * @tparam InputIt Device accessible random access iterator whose `value_type` is - * convertible to the container's `value_type` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param stream CUDA stream used for the operation - */ - template - void insert_if_async(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - cuda::stream_ref stream = {}) noexcept; - /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. @@ -564,7 +566,10 @@ class static_map { * @param op Callable object to perform apply operation. * @param stream CUDA stream used for insert */ - template + template >> void insert_or_apply_async( InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream = {}) noexcept; @@ -762,6 +767,74 @@ class static_map { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief Applies the given function object `callback_op` to the copy of every filled slot in the + * container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const noexcept; + /** * @brief Retrieves all of the keys and their associated values. * @@ -880,6 +953,20 @@ class static_map { */ [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept; + /** + * @brief Gets the function used to compare keys for equality + * + * @return The function used to compare keys for equality + */ + [[nodiscard]] constexpr key_equal key_eq() const noexcept; + + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept; + /** * @brief Get device ref with operators. * diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index e12bdb6f2..1da1e501a 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -86,6 +86,7 @@ class static_map_ref using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -190,6 +191,13 @@ class static_map_ref */ [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept; + /** * @brief Returns a const_iterator to one past the last slot. * diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 31bac4b4f..ebf17edba 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -130,6 +130,7 @@ class static_multimap { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using mapped_type = T; ///< Payload type template @@ -284,8 +285,7 @@ class static_multimap { void clear_async(cuda::stream_ref stream = {}) noexcept; /** - * @brief Inserts all keys in the range `[first, last)` and returns the number of successful - * insertions. + * @brief Inserts all keys in the range `[first, last)` * * @note This function synchronizes the given stream. For asynchronous execution use * `insert_async`. @@ -297,11 +297,9 @@ class static_multimap { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param stream CUDA stream used for insert - * - * @return Number of successful insertions */ template - size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); + void insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -317,6 +315,61 @@ class static_multimap { template void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream = {}) noexcept; + /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. * @@ -419,6 +472,22 @@ class static_multimap { OutputIt output_begin, cuda::stream_ref stream = {}) const noexcept; + /** + * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap + * + * @note This function synchronizes the given stream. + * + * @tparam Input Device accessible input iterator + * + * @param first Beginning of the sequence of keys to count + * @param last End of the sequence of keys to count + * @param stream CUDA stream used for count + * + * @return The sum of total occurrences of all keys in `[first, last)` + */ + template + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; + /** * @brief Gets the maximum number of elements the hash map can hold. * @@ -447,6 +516,20 @@ class static_multimap { */ [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept; + /** + * @brief Gets the function used to compare keys for equality + * + * @return The function used to compare keys for equality + */ + [[nodiscard]] constexpr key_equal key_eq() const noexcept; + + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept; + /** * @brief Get device ref with operators. * diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index 559392db1..b23925b86 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -74,9 +74,6 @@ class static_multimap_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert(sizeof(T) == 4 or sizeof(T) == 8, - "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); - static_assert( cuco::is_bitwise_comparable_v, "Key type must have unique object representations or have been explicitly declared as safe for " @@ -86,6 +83,7 @@ class static_multimap_ref using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -192,6 +190,13 @@ class static_multimap_ref */ [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept; + /** * @brief Returns a const_iterator to one past the last slot. * diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 2158b31c4..22cda307f 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -100,6 +100,7 @@ class static_multiset { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type template using ref_type = cuco::static_multiset_ref - void insert_if( + size_type insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); /** @@ -487,7 +490,7 @@ class static_multiset { * @return The sum of total occurrences of all keys in `[first, last)` */ template - size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const noexcept; + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset @@ -570,6 +573,20 @@ class static_multiset { */ [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept; + /** + * @brief Gets the function used to compare keys for equality + * + * @return The function used to compare keys for equality + */ + [[nodiscard]] constexpr key_equal key_eq() const noexcept; + + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept; + /** * @brief Get device ref with operators. * diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh index 9d9ed5307..bf0588f2f 100644 --- a/include/cuco/static_multiset_ref.cuh +++ b/include/cuco/static_multiset_ref.cuh @@ -74,6 +74,7 @@ class static_multiset_ref public: using key_type = Key; ///< Key Type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -169,6 +170,13 @@ class static_multiset_ref */ [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept; + /** * @brief Returns a const_iterator to one past the last slot. * diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 2ae20ed74..28ade4efb 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -105,6 +105,7 @@ class static_set { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type template using ref_type = cuco::static_set_ref + +#include + +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +void test_for_each(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + REQUIRE(num_keys % 2 == 0); + + // Insert pairs + auto pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type>([] __device__(auto i) { + // use payload as 1 for even keys and 2 for odd keys + return cuco::pair{i, i % 2 + 1}; + })); + + cuda::stream_ref stream{}; + + map.insert(pairs_begin, pairs_begin + num_keys, stream); + + using Allocator = cuco::cuda_allocator>; + cuco::detail::counter_storage counter_storage( + Allocator{}); + counter_storage.reset(stream); + + // count all the keys which are even and whose payload has value 1 + map.for_each( + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + + auto const res = counter_storage.load_to_host(stream); + REQUIRE(res == num_keys / 2); + + counter_storage.reset(stream); + + map.for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(2 * num_keys), // test for false-positives + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + REQUIRE(res == num_keys / 2); +} + +TEMPLATE_TEST_CASE_SIG( + "static_map for_each tests", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{100}; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_for_each(map, num_keys); +} diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu new file mode 100644 index 000000000..8eacfa729 --- /dev/null +++ b/tests/static_multimap/count_test.cu @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using size_type = int32_t; + +static size_type constexpr multiplicity = 3; + +template +void test_multiplicity_count(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + auto const keys_begin = thrust::counting_iterator{0}; + + SECTION("Count of empty map should be zero.") + { + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == 0); + } + + SECTION("Count of n unique keys should be n.") + { + auto const pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair{i, i}; })); + map.insert(pairs_begin, pairs_begin + num_keys); + + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == num_keys); + } + + SECTION("Count of n unique keys should be n x multiplicity.") + { + auto const pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair{i / multiplicity, i}; })); + map.insert(pairs_begin, pairs_begin + num_keys * multiplicity); + + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == num_keys * multiplicity); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multimap count tests", + "", + ((typename T, cuco::test::probe_sequence Probe, int CGSize), T, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{1'000}; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::default_hash_function>>; + + auto map = cuco::experimental::static_multimap, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys * multiplicity, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_multiplicity_count(map, num_keys); +} diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index f2959ac6a..40697f40a 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,51 +26,84 @@ #include -template -void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) +template +void test_insert_if(Map& map, std::size_t size) { + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + // 50% insertion - auto pred_lambda = [] __device__(Key k) { return k % 2 == 0; }; + auto const pred = [] __device__(Key k) { return k % 2 == 0; }; + auto const keys_begin = thrust::counting_iterator{0}; + + SECTION("Count of n / 2 insertions should be n / 2.") + { + auto const pairs_begin = thrust::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(auto i) { + return cuco::pair{i, i}; + })); + + auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred); + REQUIRE(num * 2 == size); - map.insert_if(pair_begin, pair_begin + size, key_begin, pred_lambda); + auto const count = map.count(keys_begin, keys_begin + size); + REQUIRE(count * 2 == size); + } - auto res = map.get_size(); - REQUIRE(res * 2 == size); + SECTION("Inserting the same element n / 2 times should return n / 2.") + { + auto const pairs_begin = thrust::constant_iterator>{{1, 1}}; - auto num = map.count(key_begin, key_begin + size); - REQUIRE(num * 2 == size); + auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred); + REQUIRE(num * 2 == size); + + auto const count = map.count(keys_begin, keys_begin + size); + REQUIRE(count * 2 == size); + } } TEMPLATE_TEST_CASE_SIG( - "Tests of insert_if", + "static_multimap insert_if", "", - ((typename Key, typename Value, cuco::test::probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, cuco::test::probe_sequence::linear_probing), - (int32_t, int64_t, cuco::test::probe_sequence::linear_probing), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing), - (int32_t, int32_t, cuco::test::probe_sequence::double_hashing), - (int32_t, int64_t, cuco::test::probe_sequence::double_hashing), - (int64_t, int64_t, cuco::test::probe_sequence::double_hashing)) + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) { constexpr std::size_t num_keys{1'000}; - thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - // multiplicity = 1 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto i) { return cuco::pair{i, i}; }); - - using probe = - std::conditional_t>, - cuco::legacy::double_hashing<8, cuco::default_hash_function>>; - - cuco::static_multimap, probe> - map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); + using extent_type = cuco::extent; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_multimap, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_if(map, num_keys); } diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index 1872586b7..b987ba660 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -45,8 +45,8 @@ CUCO_KERNEL void for_each_check_scalar(Ref ref, while (idx < n) { auto const& key = *(first + idx); std::size_t matches = 0; - ref.for_each(key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { matches++; } + ref.for_each(key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { matches++; } }); if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); } idx += loop_stride; @@ -73,13 +73,13 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, ref.for_each( tile, key, - [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }, [] __device__(auto const& group) { group.sync(); }); } else { - ref.for_each(tile, key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + ref.for_each(tile, key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }); } auto const tile_matches = diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu index 481762e5f..d4cef0201 100644 --- a/tests/static_set/large_input_test.cu +++ b/tests/static_set/large_input_test.cu @@ -53,14 +53,27 @@ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) set.contains(keys_begin, keys_end, res_begin); REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); } + + SECTION("All inserted key/value pairs can be retrieved.") + { + auto output_keys = thrust::device_vector(num_keys); + + auto const keys_end = set.retrieve_all(output_keys.begin()); + REQUIRE(static_cast(std::distance(output_keys.begin(), keys_end)) == num_keys); + + thrust::sort(output_keys.begin(), keys_end); + + REQUIRE(cuco::test::equal(output_keys.begin(), + output_keys.end(), + thrust::counting_iterator(0), + thrust::equal_to{})); + } } TEMPLATE_TEST_CASE_SIG( - "Large input", + "cuco::static_set large input test", "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), - (int32_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, cuco::test::probe_sequence::double_hashing, 2), (int64_t, cuco::test::probe_sequence::double_hashing, 1), (int64_t, cuco::test::probe_sequence::double_hashing, 2)) { diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index 8cbd8c08d..8fb923fb8 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -37,6 +37,15 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", using allocator_type = cuco::cuda_allocator; auto allocator = allocator_type{}; + SECTION("Initialize empty storage is allowed.") + { + auto s = cuco:: + aow_storage, window_size, cuco::extent, allocator_type>{ + cuco::extent{0}, allocator}; + + s.initialize(cuco::pair{1, 1}); + } + SECTION("Allocate array of pairs with AoS storage.") { auto s = cuco::