From 8edf6a37add572be7f52ae924f85d2f4df2327cc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 9 Aug 2024 19:11:17 -0500 Subject: [PATCH 01/12] Bump to RAPIDS 24.10 (#575) Updates RAPIDS branches and devcontainers to 24.10. --- .devcontainer/cuda11.8-gcc11/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc12/devcontainer.json | 2 +- .devcontainer/cuda12.5-gcc13/devcontainer.json | 2 +- .devcontainer/devcontainer.json | 2 +- CMakeLists.txt | 2 +- ci/matrix.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json index 332439947..606759c73 100644 --- a/.devcontainer/cuda11.8-gcc11/devcontainer.json +++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc11-cuda11.8-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc11-cuda11.8-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/cuda12.5-gcc12/devcontainer.json b/.devcontainer/cuda12.5-gcc12/devcontainer.json index bf3e725af..3f562f865 100644 --- a/.devcontainer/cuda12.5-gcc12/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc12/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc12-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc12-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/cuda12.5-gcc13/devcontainer.json b/.devcontainer/cuda12.5-gcc13/devcontainer.json index f1f303243..01bbe927b 100644 --- a/.devcontainer/cuda12.5-gcc13/devcontainer.json +++ b/.devcontainer/cuda12.5-gcc13/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f1f303243..01bbe927b 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ { "shutdownAction": "stopContainer", - "image": "rapidsai/devcontainers:24.08-cpp-gcc13-cuda12.5-ubuntu22.04", + "image": "rapidsai/devcontainers:24.10-cpp-gcc13-cuda12.5-ubuntu22.04", "hostRequirements": { "gpu": true }, diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b9e97e83..8de18db73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.08/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) diff --git a/ci/matrix.yml b/ci/matrix.yml index b4dd33ca2..471a4fb0d 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -23,7 +23,7 @@ gpus: - 'v100' # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers -devcontainer_version: '24.08' +devcontainer_version: '24.10' # Each environment below will generate a unique build/test job # See the "compute-matrix" job in the workflow for how this is parsed and used From 99282c0b4326ada5aa8d69a9061397f651a748f7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 10 Aug 2024 09:41:53 -0700 Subject: [PATCH 02/12] Fix `ProbeKey` description in the documentation (#574) --- .../open_addressing/open_addressing_ref_impl.cuh | 14 +++++++------- .../cuco/detail/static_map/static_map_ref.inl | 8 ++++---- .../static_multimap/static_multimap_ref.inl | 4 ++-- .../static_multiset/static_multiset_ref.inl | 16 ++++++++-------- .../cuco/detail/static_set/static_set_ref.inl | 8 ++++---- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 5396f318a..0be26e482 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -743,7 +743,7 @@ class open_addressing_ref_impl { * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns * false. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -776,7 +776,7 @@ class open_addressing_ref_impl { * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns * false. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for @@ -884,7 +884,7 @@ class open_addressing_ref_impl { * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -922,7 +922,7 @@ class open_addressing_ref_impl { * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform this operation * @param key The key to search for @@ -972,7 +972,7 @@ class open_addressing_ref_impl { * @note Passes an un-incrementable input iterator to the element whose key is equivalent to * `key` to the callback. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param key The key to search for @@ -1018,7 +1018,7 @@ class open_addressing_ref_impl { * * @note Synchronizing `group` within `callback_op` is undefined behavior. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param group The Cooperative Group used to perform this operation @@ -1078,7 +1078,7 @@ class open_addressing_ref_impl { * synchronization points is capped by `window_size * cg_size`. The functor will be called right * after the current probing window has been traversed. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * @tparam SyncOp Functor or device lambda which accepts the current `group` object * diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index b38562027..65ed81ebd 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -1150,7 +1150,7 @@ class operator_impl< * @note If the probe key `key` was inserted into the container, returns * true. Otherwise, returns false. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -1170,7 +1170,7 @@ class operator_impl< * @note If the probe key `key` was inserted into the container, returns * true. Otherwise, returns false. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for @@ -1213,7 +1213,7 @@ class operator_impl< * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -1233,7 +1233,7 @@ class operator_impl< * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform this operation * @param key The key to search for diff --git a/include/cuco/detail/static_multimap/static_multimap_ref.inl b/include/cuco/detail/static_multimap/static_multimap_ref.inl index 01f9171d4..59709ec41 100644 --- a/include/cuco/detail/static_multimap/static_multimap_ref.inl +++ b/include/cuco/detail/static_multimap/static_multimap_ref.inl @@ -431,7 +431,7 @@ class operator_impl< * @note If the probe key `key` was inserted into the container, returns * true. Otherwise, returns false. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -451,7 +451,7 @@ class operator_impl< * @note If the probe key `key` was inserted into the container, returns * true. Otherwise, returns false. * - * @tparam ProbeKey Input key type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl index fc1f3db9d..141659985 100644 --- a/include/cuco/detail/static_multiset/static_multiset_ref.inl +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -380,7 +380,7 @@ class operator_impl< /** * @brief Indicates whether the probe key `key` was inserted into the container. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -396,7 +396,7 @@ class operator_impl< /** * @brief Indicates whether the probe key `key` was inserted into the container. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform group contains * @param key The key to search for @@ -439,7 +439,7 @@ class operator_impl< * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param key The key to search for * @@ -459,7 +459,7 @@ class operator_impl< * @note Returns a un-incrementable input iterator to the element whose key is equivalent to * `key`. If no such element exists, returns `end()`. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * * @param group The Cooperative Group used to perform this operation * @param key The key to search for @@ -498,7 +498,7 @@ class operator_impl< * @note Passes an un-incrementable input iterator to the element whose key is equivalent to * `key` to the callback. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param key The key to search for @@ -525,7 +525,7 @@ class operator_impl< * * @note Synchronizing `group` within `callback_op` is undefined behavior. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * * @param group The Cooperative Group used to perform this operation @@ -561,7 +561,7 @@ class operator_impl< * synchronization points is capped by `window_size * cg_size`. The functor will be called right * after the current probing window has been traversed. * - * @tparam ProbeKey Input type which is convertible to 'key_type' + * @tparam ProbeKey Probe key type * @tparam CallbackOp Unary callback functor or device lambda * @tparam SyncOp Functor or device lambda which accepts the current `group` object * @@ -606,7 +606,7 @@ class operator_impl< /** * @brief Counts the occurrence of a given key contained in multiset * - * @tparam ProbeKey Input type + * @tparam ProbeKey Probe key type * * @param key The key to count for * diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index dc25c2372..d6adf68be 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -529,7 +529,7 @@ class operator_impl Date: Tue, 13 Aug 2024 03:32:58 -0700 Subject: [PATCH 03/12] Add host-bulk `for_each` for static_map (#565) --- .../cuco/detail/open_addressing/kernels.cuh | 40 ++++++ .../open_addressing/open_addressing_impl.cuh | 62 +++++++++ .../open_addressing_ref_impl.cuh | 47 ++++--- include/cuco/detail/static_map/static_map.inl | 64 +++++++++ .../cuco/detail/static_map/static_map_ref.inl | 70 ++++++++++ include/cuco/static_map.cuh | 68 ++++++++++ tests/CMakeLists.txt | 1 + tests/static_map/for_each_test.cu | 122 ++++++++++++++++++ tests/static_multiset/for_each_test.cu | 12 +- 9 files changed, 455 insertions(+), 31 deletions(-) create mode 100644 tests/static_map/for_each_test.cu diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 266335a50..24fce230c 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -182,6 +182,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, } } +/** + * @brief For each key in the range [first, first + n), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the `key_type` of the data structure + * @tparam CallbackOp Type of unary callback function object + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param callback_op Function to call on every matched slot found in the container + * @param ref Non-owning container device ref used to access the slot storage + */ +template +CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first, + cuco::detail::index_type n, + CallbackOp callback_op, + Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename std::iterator_traits::value_type const& key{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.for_each(key, callback_op); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.for_each(tile, key, callback_op); + } + idx += loop_stride; + } +} + /** * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data * structure if `pred` of the corresponding stencil returns true. diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 9dabff990..f9c35e0ff 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -681,6 +682,67 @@ class open_addressing_impl { return output_begin + h_num_out; } + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to call on every filled slot in the container + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const + { + auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + this->empty_key_sentinel(), this->erased_key_sentinel()}; + + auto storage_ref = this->storage_ref(); + auto const op = [callback_op, is_filled, storage_ref] __device__(auto const window_slots) { + for (auto const slot : window_slots) { + if (is_filled(slot)) { callback_op(slot); } + } + }; + + CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( + storage_ref.data(), storage_ref.num_windows(), op, stream.get())); + } + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to call on every match found in the container + * @param container_ref Non-owning device container ref used to access the slot storage + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + Ref container_ref, + cuda::stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + detail::for_each_n + <<>>( + first, num_keys, std::forward(callback_op), container_ref); + } + /** * @brief Gets the number of elements in the container * diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 0be26e482..12a306a71 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -966,17 +966,16 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @note The return value of `callback_op`, if any, is ignored. * * @tparam ProbeKey Probe key type - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept @@ -995,7 +994,7 @@ class open_addressing_ref_impl { return; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: continue; @@ -1006,24 +1005,23 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key. - * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @brief For a given key, applies the function object `callback_op` to the copy of all + * corresponding matches found in the container. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, * each thread with a match will call the callback with its associated element. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note Synchronizing `group` within `callback_op` is undefined behavior. * * @tparam ProbeKey Probe key type - * @tparam CallbackOp Unary callback functor or device lambda + * @tparam CallbackOp Type of unary callback function object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot */ template __device__ void for_each(cooperative_groups::thread_block_tile const& group, @@ -1045,7 +1043,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { @@ -1060,12 +1058,9 @@ class open_addressing_ref_impl { } /** - * @brief Executes a callback on every element in the container with key equivalent to the probe - * key and can additionally perform work that requires synchronizing the Cooperative Group - * performing this operation. - * - * @note Passes an un-incrementable input iterator to the element whose key is equivalent to - * `key` to the callback. + * @brief Applies the function object `callback_op` to the copy of every slot in the container + * with key equivalent to the probe key and can additionally perform work that requires + * synchronizing the Cooperative Group performing this operation. * * @note This function uses cooperative group semantics, meaning that any thread may call the * callback if it finds a matching element. If multiple elements are found within the same group, @@ -1073,18 +1068,20 @@ class open_addressing_ref_impl { * * @note Synchronizing `group` within `callback_op` is undefined behavior. * + * @note The return value of `callback_op`, if any, is ignored. + * * @note The `sync_op` function can be used to perform work that requires synchronizing threads in * `group` inbetween probing steps, where the number of probing steps performed between * synchronization points is capped by `window_size * cg_size`. The functor will be called right * after the current probing window has been traversed. * * @tparam ProbeKey Probe key type - * @tparam CallbackOp Unary callback functor or device lambda - * @tparam SyncOp Functor or device lambda which accepts the current `group` object + * @tparam CallbackOp Type of unary callback function object + * @tparam SyncOp Type of function object which accepts the current `group` object * * @param group The Cooperative Group used to perform this operation * @param key The key to search for - * @param callback_op Function to call on every element found + * @param callback_op Function to apply to every matched slot * @param sync_op Function that is allowed to synchronize `group` inbetween probing windows */ template @@ -1108,7 +1105,7 @@ class open_addressing_ref_impl { continue; } case detail::equal_result::EQUAL: { - callback_op(const_iterator{&(*(this->storage_ref_.data() + *probing_iter))[i]}); + callback_op(window_slots[i]); continue; } default: { diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 86b75507d..e575114de 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -499,6 +499,70 @@ void static_mapfind_async(first, last, output_begin, ref(op::find), stream); } +template +template +void static_map::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); +} + +template +template +void static_map::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_map::for_each_async( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const noexcept +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); +} + template +class operator_impl< + op::for_each_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * + * @param key The key to search for + * @param callback_op Function to apply to the copy of the matched key-value pair + */ + template + __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(key, std::forward(callback_op)); + } + + /** + * @brief For a given key, applies the function object `callback_op` to its match found in the + * container. + * + * @note This function uses cooperative group semantics, meaning that any thread may call the + * callback if it finds a matching key-value pair. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @note Synchronizing `group` within `callback_op` is undefined behavior. + * + * @tparam ProbeKey Probe key type + * @tparam CallbackOp Type of unary callback function object + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param callback_op Function to apply to the copy of the matched key-value pair + */ + template + __device__ void for_each(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + CallbackOp&& callback_op) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + ref_.impl_.for_each(group, key, std::forward(callback_op)); + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 9c87e45a9..01a39ad5d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -762,6 +762,74 @@ class static_map { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief Applies the given function object `callback_op` to the copy of every filled slot in the + * container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched key-value pair + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const noexcept; + /** * @brief Retrieves all of the keys and their associated values. * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dde1317b0..dc610af5b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/custom_type_test.cu static_map/duplicate_keys_test.cu static_map/erase_test.cu + static_map/for_each_test.cu static_map/hash_test.cu static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu new file mode 100644 index 000000000..1c72a2e58 --- /dev/null +++ b/tests/static_map/for_each_test.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +void test_for_each(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + REQUIRE(num_keys % 2 == 0); + + // Insert pairs + auto pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type>([] __device__(auto i) { + // use payload as 1 for even keys and 2 for odd keys + return cuco::pair{i, i % 2 + 1}; + })); + + cuda::stream_ref stream{}; + + map.insert(pairs_begin, pairs_begin + num_keys, stream); + + using Allocator = cuco::cuda_allocator>; + cuco::detail::counter_storage counter_storage( + Allocator{}); + counter_storage.reset(stream); + + // count all the keys which are even and whose payload has value 1 + map.for_each( + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + + auto const res = counter_storage.load_to_host(stream); + REQUIRE(res == num_keys / 2); + + counter_storage.reset(stream); + + map.for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(2 * num_keys), // test for false-positives + [counter = counter_storage.data()] __device__(auto const slot) { + auto const& [key, value] = slot; + if (((key % 2 == 0)) and (value == 1)) { counter->fetch_add(1, cuda::memory_order_relaxed); } + }, + stream); + REQUIRE(res == num_keys / 2); +} + +TEMPLATE_TEST_CASE_SIG( + "static_map for_each tests", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{100}; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_for_each(map, num_keys); +} diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index 1872586b7..b987ba660 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -45,8 +45,8 @@ CUCO_KERNEL void for_each_check_scalar(Ref ref, while (idx < n) { auto const& key = *(first + idx); std::size_t matches = 0; - ref.for_each(key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { matches++; } + ref.for_each(key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { matches++; } }); if (matches != multiplicity) { error_counter->fetch_add(1, cuda::memory_order_relaxed); } idx += loop_stride; @@ -73,13 +73,13 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, ref.for_each( tile, key, - [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }, [] __device__(auto const& group) { group.sync(); }); } else { - ref.for_each(tile, key, [&] __device__(auto const it) { - if (ref.key_eq()(key, *it)) { thread_matches++; } + ref.for_each(tile, key, [&] __device__(auto const slot) { + if (ref.key_eq()(key, slot)) { thread_matches++; } }); } auto const tile_matches = From d7f4a1afbfd470a99b266f73b6be24d04aed85de Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 13 Aug 2024 21:27:07 -0700 Subject: [PATCH 04/12] Enable Initialization for Empty Storage (#578) This PR adds an early return for storage initialization so launching an empty kernel won't crash. --- include/cuco/detail/storage/aow_storage.inl | 2 ++ tests/utility/storage_test.cu | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl index dfbb90327..5d976e658 100644 --- a/include/cuco/detail/storage/aow_storage.inl +++ b/include/cuco/detail/storage/aow_storage.inl @@ -74,6 +74,8 @@ template void aow_storage::initialize_async( value_type key, cuda::stream_ref stream) noexcept { + if (this->num_windows() == 0) { return; } + auto constexpr cg_size = 1; auto constexpr stride = 4; auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu index 8cbd8c08d..8fb923fb8 100644 --- a/tests/utility/storage_test.cu +++ b/tests/utility/storage_test.cu @@ -37,6 +37,15 @@ TEMPLATE_TEST_CASE_SIG("Storage tests", using allocator_type = cuco::cuda_allocator; auto allocator = allocator_type{}; + SECTION("Initialize empty storage is allowed.") + { + auto s = cuco:: + aow_storage, window_size, cuco::extent, allocator_type>{ + cuco::extent{0}, allocator}; + + s.initialize(cuco::pair{1, 1}); + } + SECTION("Allocate array of pairs with AoS storage.") { auto s = cuco:: From 6eaed1be104b5e80c7411022c570e0d6ab877505 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:13:42 -0700 Subject: [PATCH 05/12] Fix `insert_or_apply` (#579) This PR cleans up some of the issues occured during merge of #551. 1. propagate the **key_eq** and **probing_scheme** from **global** `ref` to constructor of `shared_memory_ref` in **insert_or_apply_shmem** kernel. 2. Disable **init** overload of `insert_or_apply` using **sfinae**, because `cuda::stream_ref` is default constructed, this can invoke the **init** overload even though the user calls **no-init** overload. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- include/cuco/detail/static_map/kernels.cuh | 4 ++-- include/cuco/detail/static_map/static_map.inl | 10 +--------- include/cuco/static_map.cuh | 6 +++++- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index c05d0b28b..bf2aced70 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -202,8 +202,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( auto shared_map = SharedMapRefType{cuco::empty_key{ref.empty_key_sentinel()}, cuco::empty_value{ref.empty_value_sentinel()}, - {}, - {}, + ref.key_eq(), + ref.probing_scheme(), {}, storage}; auto shared_map_ref = std::move(shared_map).with(cuco::op::insert_or_apply); diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index e575114de..08acdacaf 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -347,19 +347,11 @@ template -template +template void static_map:: insert_or_apply_async( InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream) noexcept { - using shared_map_type = cuco::static_map>; auto constexpr has_init = true; static_map_ns::detail::dispatch_insert_or_apply( first, last, init, op, ref(op::insert_or_apply), stream); diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 01a39ad5d..09e980261 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -37,6 +37,7 @@ #include #include +#include #include namespace cuco { @@ -564,7 +565,10 @@ class static_map { * @param op Callable object to perform apply operation. * @param stream CUDA stream used for insert */ - template + template >> void insert_or_apply_async( InputIt first, InputIt last, Init init, Op op, cuda::stream_ref stream = {}) noexcept; From abc5095b011feccd7d3ecb36b369741b96ae6f0e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 16 Aug 2024 12:26:17 -0700 Subject: [PATCH 06/12] Fix retrieve_all for containers with large capacity (#580) Fix #576 This PR fixes the large input retrieve_all bug with a method similar to the streaming approach mentioned in https://github.com/NVIDIA/cccl/issues/1422#issuecomment-2285307255. To be reverted once the CCCL fix is in place. --- .../open_addressing/open_addressing_impl.cuh | 80 +++++++++++-------- tests/static_set/large_input_test.cu | 19 ++++- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index f9c35e0ff..772e8a667 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -639,45 +639,61 @@ class open_addressing_impl { template [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const { - std::size_t temp_storage_bytes = 0; using temp_allocator_type = typename std::allocator_traits::template rebind_alloc; + + cuco::detail::index_type constexpr stride = std::numeric_limits::max(); + + cuco::detail::index_type h_num_out{0}; auto temp_allocator = temp_allocator_type{this->allocator()}; auto d_num_out = reinterpret_cast( std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); - auto const begin = thrust::make_transform_iterator( - thrust::counting_iterator{0}, - open_addressing_ns::detail::get_slot(this->storage_ref())); - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ - this->empty_key_sentinel(), this->erased_key_sentinel()}; - CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr, - temp_storage_bytes, - begin, - output_begin, - d_num_out, - this->capacity(), - is_filled, - stream.get())); - - // Allocate temporary storage - auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); - - CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, - temp_storage_bytes, - begin, - output_begin, - d_num_out, - this->capacity(), - is_filled, - stream.get())); - - size_type h_num_out; - CUCO_CUDA_TRY(cudaMemcpyAsync( - &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); - stream.wait(); + + // TODO: PR #580 to be reverted once https://github.com/NVIDIA/cccl/issues/1422 is resolved + for (cuco::detail::index_type offset = 0; + offset < static_cast(this->capacity()); + offset += stride) { + auto const num_items = + std::min(static_cast(this->capacity()) - offset, stride); + auto const begin = thrust::make_transform_iterator( + thrust::counting_iterator{static_cast(offset)}, + open_addressing_ns::detail::get_slot(this->storage_ref())); + auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + this->empty_key_sentinel(), this->erased_key_sentinel()}; + + std::size_t temp_storage_bytes = 0; + + CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr, + temp_storage_bytes, + begin, + output_begin + h_num_out, + d_num_out, + static_cast(num_items), + is_filled, + stream.get())); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage, + temp_storage_bytes, + begin, + output_begin + h_num_out, + d_num_out, + static_cast(num_items), + is_filled, + stream.get())); + + size_type temp_count; + CUCO_CUDA_TRY(cudaMemcpyAsync( + &temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); + h_num_out += temp_count; + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + } + std::allocator_traits::deallocate( temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); - temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); return output_begin + h_num_out; } diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu index 481762e5f..d4cef0201 100644 --- a/tests/static_set/large_input_test.cu +++ b/tests/static_set/large_input_test.cu @@ -53,14 +53,27 @@ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) set.contains(keys_begin, keys_end, res_begin); REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); } + + SECTION("All inserted key/value pairs can be retrieved.") + { + auto output_keys = thrust::device_vector(num_keys); + + auto const keys_end = set.retrieve_all(output_keys.begin()); + REQUIRE(static_cast(std::distance(output_keys.begin(), keys_end)) == num_keys); + + thrust::sort(output_keys.begin(), keys_end); + + REQUIRE(cuco::test::equal(output_keys.begin(), + output_keys.end(), + thrust::counting_iterator(0), + thrust::equal_to{})); + } } TEMPLATE_TEST_CASE_SIG( - "Large input", + "cuco::static_set large input test", "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), - (int32_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, cuco::test::probe_sequence::double_hashing, 2), (int64_t, cuco::test::probe_sequence::double_hashing, 1), (int64_t, cuco::test::probe_sequence::double_hashing, 2)) { From f6209048aa9976823fa89ab50d67e9e6ae7cee0f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 21 Aug 2024 16:00:15 -0700 Subject: [PATCH 07/12] Improve the probing scheme header (#583) This PR fixes a small bug that the current `probing_scheme.cuh` header is not self-contained due to a missing inclusion. It also adds a trait to help #581 and #582. --- include/cuco/detail/utils.cuh | 1 + include/cuco/probing_scheme.cuh | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 21e4df759..c7b8a3f8b 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -15,6 +15,7 @@ #pragma once +#include #include #include diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index 5591cb74a..e8243c2a8 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cuco { @@ -198,6 +200,24 @@ class double_hashing : private detail::probing_scheme_base { Hash2 hash2_; }; +/** + * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not + * + * @tparam T Input probing scheme type + */ +template +struct is_double_hashing : cuda::std::false_type {}; + +/** + * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not + * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +struct is_double_hashing> : cuda::std::true_type {}; + } // namespace cuco #include From d3477661d771e0d6fd22259bf6dd6f8c64a7401c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 23 Aug 2024 12:44:58 +0200 Subject: [PATCH 08/12] Work around gcc bug in pair_alignment (#585) --- include/cuco/detail/bitwise_compare.cuh | 6 ++++-- include/cuco/detail/pair/helpers.cuh | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index a8a5a69d1..d828bbf86 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -18,6 +18,7 @@ #include +#include #include #include @@ -67,9 +68,10 @@ struct bitwise_compare_impl<8> { * size of type, or 16, whichever is smaller. */ template -constexpr std::size_t alignment() +__host__ __device__ constexpr std::size_t alignment() { - return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T))); + constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(T)); + return cuda::std::min(std::size_t{16}, alignment); } /** diff --git a/include/cuco/detail/pair/helpers.cuh b/include/cuco/detail/pair/helpers.cuh index 3cc343687..aa040ab94 100644 --- a/include/cuco/detail/pair/helpers.cuh +++ b/include/cuco/detail/pair/helpers.cuh @@ -19,6 +19,8 @@ #include #include +#include + namespace cuco::detail { /** @@ -29,7 +31,8 @@ namespace cuco::detail { template __host__ __device__ constexpr std::size_t pair_alignment() { - return cuda::std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second))); + constexpr std::size_t alignment = cuda::std::bit_ceil(sizeof(First) + sizeof(Second)); + return cuda::std::min(std::size_t{16}, alignment); } /** From 9fe6c82d08b95795e351af5dd5828fbb06a556c7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 23 Aug 2024 16:22:08 -0700 Subject: [PATCH 09/12] Expose `key_eq` member accessors (#584) Currently, `key_eq` is only accessible in ref classes, not in container classes. This PR addresses the issue by exposing `key_eq` in all classes. --- include/cuco/detail/static_map/static_map.inl | 15 +++++++++++++++ .../detail/static_multimap/static_multimap.inl | 16 ++++++++++++++++ .../detail/static_multiset/static_multiset.inl | 15 +++++++++++++++ include/cuco/detail/static_set/static_set.inl | 13 +++++++++++++ include/cuco/static_map.cuh | 7 +++++++ include/cuco/static_multimap.cuh | 7 +++++++ include/cuco/static_multiset.cuh | 7 +++++++ include/cuco/static_set.cuh | 7 +++++++ 8 files changed, 87 insertions(+) diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 08acdacaf..a90315187 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -709,6 +709,21 @@ static_map:: return impl_->erased_key_sentinel(); } +template +constexpr static_map::key_equal +static_map::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + template erased_key_sentinel(); } +template +constexpr static_multimap:: + key_equal + static_multimap::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + template return impl_->erased_key_sentinel(); } +template +constexpr static_multiset:: + key_equal + static_multiset::key_eq() + const noexcept +{ + return impl_->key_eq(); +} + template ::era return impl_->erased_key_sentinel(); } +template +constexpr static_set::key_equal +static_set::key_eq() const noexcept +{ + return impl_->key_eq(); +} + template Date: Tue, 27 Aug 2024 09:34:33 -0700 Subject: [PATCH 10/12] Add multimap count and conditional insert (#571) This PR adds `count`, `insert_if` and `insert_if_async` APIs to the new multimap. --- .../open_addressing/open_addressing_impl.cuh | 2 +- .../static_multimap/static_multimap.inl | 57 ++++++++- .../static_multimap/static_multimap_ref.inl | 57 +++++++++ .../static_multiset/static_multiset.inl | 8 +- include/cuco/static_map.cuh | 110 +++++++++--------- include/cuco/static_multimap.cuh | 78 ++++++++++++- include/cuco/static_multimap_ref.cuh | 3 - include/cuco/static_multiset.cuh | 6 +- tests/CMakeLists.txt | 1 + tests/static_multimap/count_test.cu | 104 +++++++++++++++++ tests/static_multimap/insert_if_test.cu | 105 +++++++++++------ 11 files changed, 422 insertions(+), 109 deletions(-) create mode 100644 tests/static_multimap/count_test.cu diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 772e8a667..176239be2 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -589,7 +589,7 @@ class open_addressing_impl { [[nodiscard]] size_type count(InputIt first, InputIt last, Ref container_ref, - cuda::stream_ref stream) const noexcept + cuda::stream_ref stream) const { auto constexpr is_outer = false; return this->count(first, last, container_ref, stream); diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 6582acd90..2ac96fa2f 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -152,11 +152,11 @@ template template -static_multimap::size_type -static_multimap::insert( +void static_multimap::insert( InputIt first, InputIt last, cuda::stream_ref stream) { - return impl_->insert(first, last, ref(op::insert), stream); + this->insert_async(first, last, stream); + stream.wait(); } template insert_async(first, last, ref(op::insert), stream); } +template +template +static_multimap::size_type +static_multimap::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_multimap:: + insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + template contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } +template +template +static_multimap::size_type +static_multimap::count( + InputIt first, InputIt last, cuda::stream_ref stream) const +{ + return impl_->count(first, last, ref(op::count), stream); +} + template +class operator_impl< + op::count_tag, + static_multimap_ref> { + using base_type = static_multimap_ref; + using ref_type = + static_multimap_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Counts the occurrence of a given key contained in multimap + * + * @tparam ProbeKey Input type + * + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + __device__ size_type count(ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.count(key); + } + + /** + * @brief Counts the occurrence of a given key contained in multimap + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group count + * @param key The key to count for + * + * @return Number of occurrences found by the current thread + */ + template + __device__ size_type count(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.count(group, key); + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index b667ba5ae..2f68d2ec5 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -152,11 +152,11 @@ template template -void static_multiset::insert_if( +static_multiset::size_type +static_multiset::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { - this->insert_if_async(first, last, stencil, pred, stream); - stream.wait(); + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } template static_multiset::size_type static_multiset::count( - InputIt first, InputIt last, cuda::stream_ref stream) const noexcept + InputIt first, InputIt last, cuda::stream_ref stream) const { return impl_->count(first, last, ref(op::count), stream); } diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 899a0cdb4..efc202b4f 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -314,6 +314,61 @@ class static_map { template void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream = {}) noexcept; + /** * @brief Asynchronously inserts all elements in the range `[first, last)`. * @@ -370,61 +425,6 @@ class static_map { InsertedIt inserted_begin, cuda::stream_ref stream = {}); - /** - * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns - * true. - * - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * @note This function synchronizes the given stream and returns the number of successful - * insertions. For asynchronous execution use `insert_if_async`. - * - * @tparam InputIt Device accessible random access iterator whose `value_type` is - * convertible to the container's `value_type` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param stream CUDA stream used for the operation - * - * @return Number of successful insertions - */ - template - size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); - - /** - * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding - * stencil returns true. - * - * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * @tparam InputIt Device accessible random access iterator whose `value_type` is - * convertible to the container's `value_type` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type - * - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param stream CUDA stream used for the operation - */ - template - void insert_if_async(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - cuda::stream_ref stream = {}) noexcept; - /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 043e04a9d..6eb2a960c 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -284,8 +284,7 @@ class static_multimap { void clear_async(cuda::stream_ref stream = {}) noexcept; /** - * @brief Inserts all keys in the range `[first, last)` and returns the number of successful - * insertions. + * @brief Inserts all keys in the range `[first, last)` * * @note This function synchronizes the given stream. For asynchronous execution use * `insert_async`. @@ -297,11 +296,9 @@ class static_multimap { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param stream CUDA stream used for insert - * - * @return Number of successful insertions */ template - size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); + void insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -317,6 +314,61 @@ class static_multimap { template void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream = {}) noexcept; + /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. * @@ -419,6 +471,22 @@ class static_multimap { OutputIt output_begin, cuda::stream_ref stream = {}) const noexcept; + /** + * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap + * + * @note This function synchronizes the given stream. + * + * @tparam Input Device accessible input iterator + * + * @param first Beginning of the sequence of keys to count + * @param last End of the sequence of keys to count + * @param stream CUDA stream used for count + * + * @return The sum of total occurrences of all keys in `[first, last)` + */ + template + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; + /** * @brief Gets the maximum number of elements the hash map can hold. * diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index 559392db1..ac2526285 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -74,9 +74,6 @@ class static_multimap_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert(sizeof(T) == 4 or sizeof(T) == 8, - "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); - static_assert( cuco::is_bitwise_comparable_v, "Key type must have unique object representations or have been explicitly declared as safe for " diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 55da49476..90f57f2f4 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -299,9 +299,11 @@ class static_multiset { * @param pred Predicate to test on every element in the range `[stencil, stencil + * std::distance(first, last))` * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions */ template - void insert_if( + size_type insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); /** @@ -487,7 +489,7 @@ class static_multiset { * @return The sum of total occurrences of all keys in `[first, last)` */ template - size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const noexcept; + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dc610af5b..80b7c2870 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -109,6 +109,7 @@ ConfigureTest(STATIC_MULTISET_TEST ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- ConfigureTest(STATIC_MULTIMAP_TEST + static_multimap/count_test.cu static_multimap/custom_pair_retrieve_test.cu static_multimap/custom_type_test.cu static_multimap/heterogeneous_lookup_test.cu diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu new file mode 100644 index 000000000..8eacfa729 --- /dev/null +++ b/tests/static_multimap/count_test.cu @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using size_type = int32_t; + +static size_type constexpr multiplicity = 3; + +template +void test_multiplicity_count(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + auto const keys_begin = thrust::counting_iterator{0}; + + SECTION("Count of empty map should be zero.") + { + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == 0); + } + + SECTION("Count of n unique keys should be n.") + { + auto const pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair{i, i}; })); + map.insert(pairs_begin, pairs_begin + num_keys); + + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == num_keys); + } + + SECTION("Count of n unique keys should be n x multiplicity.") + { + auto const pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair{i / multiplicity, i}; })); + map.insert(pairs_begin, pairs_begin + num_keys * multiplicity); + + auto const count = map.count(keys_begin, keys_begin + num_keys); + REQUIRE(count == num_keys * multiplicity); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multimap count tests", + "", + ((typename T, cuco::test::probe_sequence Probe, int CGSize), T, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{1'000}; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::default_hash_function>>; + + auto map = cuco::experimental::static_multimap, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys * multiplicity, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_multiplicity_count(map, num_keys); +} diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index f2959ac6a..40697f40a 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,51 +26,84 @@ #include -template -void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) +template +void test_insert_if(Map& map, std::size_t size) { + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + // 50% insertion - auto pred_lambda = [] __device__(Key k) { return k % 2 == 0; }; + auto const pred = [] __device__(Key k) { return k % 2 == 0; }; + auto const keys_begin = thrust::counting_iterator{0}; + + SECTION("Count of n / 2 insertions should be n / 2.") + { + auto const pairs_begin = thrust::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(auto i) { + return cuco::pair{i, i}; + })); + + auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred); + REQUIRE(num * 2 == size); - map.insert_if(pair_begin, pair_begin + size, key_begin, pred_lambda); + auto const count = map.count(keys_begin, keys_begin + size); + REQUIRE(count * 2 == size); + } - auto res = map.get_size(); - REQUIRE(res * 2 == size); + SECTION("Inserting the same element n / 2 times should return n / 2.") + { + auto const pairs_begin = thrust::constant_iterator>{{1, 1}}; - auto num = map.count(key_begin, key_begin + size); - REQUIRE(num * 2 == size); + auto const num = map.insert_if(pairs_begin, pairs_begin + size, keys_begin, pred); + REQUIRE(num * 2 == size); + + auto const count = map.count(keys_begin, keys_begin + size); + REQUIRE(count * 2 == size); + } } TEMPLATE_TEST_CASE_SIG( - "Tests of insert_if", + "static_multimap insert_if", "", - ((typename Key, typename Value, cuco::test::probe_sequence Probe), Key, Value, Probe), - (int32_t, int32_t, cuco::test::probe_sequence::linear_probing), - (int32_t, int64_t, cuco::test::probe_sequence::linear_probing), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing), - (int32_t, int32_t, cuco::test::probe_sequence::double_hashing), - (int32_t, int64_t, cuco::test::probe_sequence::double_hashing), - (int64_t, int64_t, cuco::test::probe_sequence::double_hashing)) + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) { constexpr std::size_t num_keys{1'000}; - thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); - - thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); - // multiplicity = 1 - thrust::transform(thrust::device, - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys), - d_pairs.begin(), - [] __device__(auto i) { return cuco::pair{i, i}; }); - - using probe = - std::conditional_t>, - cuco::legacy::double_hashing<8, cuco::default_hash_function>>; - - cuco::static_multimap, probe> - map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); + using extent_type = cuco::extent; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_multimap, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_if(map, num_keys); } From a20460c160560d38e51bcf35a5ffb5848992c9c4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 27 Aug 2024 09:35:34 -0700 Subject: [PATCH 11/12] Expose hash_function member function (#587) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close #582 This PR exposes `hash_function` member function for cuco hash tables. --------- Co-authored-by: Daniel Jünger --- .../open_addressing/open_addressing_impl.cuh | 11 +++++++++ .../open_addressing_ref_impl.cuh | 14 ++++++++++- .../probing_scheme/probing_scheme_impl.inl | 15 ++++++++++++ include/cuco/detail/static_map/static_map.inl | 15 ++++++++++++ .../cuco/detail/static_map/static_map_ref.inl | 20 ++++++++++++++++ .../static_multimap/static_multimap.inl | 16 +++++++++++++ .../static_multimap/static_multimap_ref.inl | 20 ++++++++++++++++ .../static_multiset/static_multiset.inl | 14 +++++++++++ .../static_multiset/static_multiset_ref.inl | 18 +++++++++++++++ include/cuco/detail/static_set/static_set.inl | 14 +++++++++++ .../cuco/detail/static_set/static_set_ref.inl | 18 +++++++++++++++ include/cuco/probing_scheme.cuh | 23 +++++++++++++++++-- include/cuco/static_map.cuh | 8 +++++++ include/cuco/static_map_ref.cuh | 8 +++++++ include/cuco/static_multimap.cuh | 8 +++++++ include/cuco/static_multimap_ref.cuh | 8 +++++++ include/cuco/static_multiset.cuh | 8 +++++++ include/cuco/static_multiset_ref.cuh | 8 +++++++ include/cuco/static_set.cuh | 8 +++++++ include/cuco/static_set_ref.cuh | 8 +++++++ 20 files changed, 259 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 176239be2..a8eff9036 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -100,6 +100,7 @@ class open_addressing_impl { using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type using probing_scheme_type = ProbingScheme; ///< Probe scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type /** * @brief Constructs a statically-sized open addressing data structure with the specified initial @@ -933,6 +934,16 @@ class open_addressing_impl { return probing_scheme_; } + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept + { + return this->probing_scheme().hash_function(); + } + /** * @brief Gets the container allocator. * diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 12a306a71..f4c20f829 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -109,6 +109,7 @@ class open_addressing_ref_impl { public: using key_type = Key; ///< Key type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -233,11 +234,22 @@ class open_addressing_ref_impl { * * @return The probing scheme used for the container */ - [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept + [[nodiscard]] __host__ __device__ constexpr probing_scheme_type const& probing_scheme() + const noexcept { return probing_scheme_; } + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept + { + return this->probing_scheme().hash_function(); + } + /** * @brief Gets the non-owning storage ref. * diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 61670d7be..047ec7987 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -127,6 +127,13 @@ __host__ __device__ constexpr auto linear_probing::operator()( upper_bound}; } +template +__host__ __device__ constexpr linear_probing::hasher +linear_probing::hash_function() const noexcept +{ + return hash_; +} + template __host__ __device__ constexpr double_hashing::double_hashing( Hash1 const& hash1, Hash2 const& hash2) @@ -192,4 +199,12 @@ __host__ __device__ constexpr auto double_hashing::operato cg_size), upper_bound}; // TODO use fast_int operator } + +template +__host__ __device__ constexpr double_hashing::hasher +double_hashing::hash_function() const noexcept +{ + return {hash1_, hash2_}; +} + } // namespace cuco diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index a90315187..e2915e1fd 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -724,6 +724,21 @@ static_map:: return impl_->key_eq(); } +template +constexpr static_map::hasher +static_map::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template return this->impl_.key_eq(); } +template +__host__ __device__ constexpr static_map_ref::hasher +static_map_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template key_eq(); } +template +constexpr static_multimap:: + hasher + static_multimap:: + hash_function() const noexcept +{ + return impl_->hash_function(); +} + template +__host__ __device__ constexpr static_multimap_ref::hasher +static_multimap_ref:: + hash_function() const noexcept +{ + return impl_.hash_function(); +} + template key_eq(); } +template +constexpr static_multiset::hasher +static_multiset::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template impl_.key_eq(); } +template +__host__ __device__ constexpr static_multiset_ref::hasher +static_multiset_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template ::key return impl_->key_eq(); } +template +constexpr static_set::hasher +static_set::hash_function() + const noexcept +{ + return impl_->hash_function(); +} + template ::k return this->impl_.key_eq(); } +template +__host__ __device__ constexpr static_set_ref::hasher +static_set_ref::hash_function() + const noexcept +{ + return impl_.hash_function(); +} + template #include +#include #include #include @@ -37,10 +38,12 @@ namespace cuco { */ template class linear_probing : private detail::probing_scheme_base { - public: using probing_scheme_base_type = detail::probing_scheme_base; ///< The base probe scheme type + + public: using probing_scheme_base_type::cg_size; + using hasher = Hash; ///< Hash function type /** *@brief Constructs linear probing scheme with the hasher callable. @@ -93,6 +96,13 @@ class linear_probing : private detail::probing_scheme_base { ProbeKey const& probe_key, Extent upper_bound) const noexcept; + /** + * @brief Gets the function used to hash keys + * + * @return The function used to hash keys + */ + __host__ __device__ constexpr hasher hash_function() const noexcept; + private: Hash hash_; }; @@ -113,10 +123,12 @@ class linear_probing : private detail::probing_scheme_base { */ template class double_hashing : private detail::probing_scheme_base { - public: using probing_scheme_base_type = detail::probing_scheme_base; ///< The base probe scheme type + + public: using probing_scheme_base_type::cg_size; + using hasher = cuda::std::tuple; ///< Hash function type /** *@brief Constructs double hashing probing scheme with the two hasher callables. @@ -195,6 +207,13 @@ class double_hashing : private detail::probing_scheme_base { ProbeKey const& probe_key, Extent upper_bound) const noexcept; + /** + * @brief Gets the functions used to hash keys + * + * @return The functions used to hash keys + */ + __host__ __device__ constexpr hasher hash_function() const noexcept; + private: Hash1 hash1_; Hash2 hash2_; diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index efc202b4f..fc7dc088d 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -127,6 +127,7 @@ class static_map { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using mapped_type = T; ///< Payload type template @@ -959,6 +960,13 @@ class static_map { */ [[nodiscard]] constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept; + /** * @brief Get device ref with operators. * diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index e12bdb6f2..1da1e501a 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -86,6 +86,7 @@ class static_map_ref using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -190,6 +191,13 @@ class static_map_ref */ [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept; + /** * @brief Returns a const_iterator to one past the last slot. * diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 6eb2a960c..ebf17edba 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -130,6 +130,7 @@ class static_multimap { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using mapped_type = T; ///< Payload type template @@ -522,6 +523,13 @@ class static_multimap { */ [[nodiscard]] constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] constexpr hasher hash_function() const noexcept; + /** * @brief Get device ref with operators. * diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index ac2526285..b23925b86 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -83,6 +83,7 @@ class static_multimap_ref using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using hasher = typename probing_scheme_type::hasher; ///< Hash function type using storage_ref_type = StorageRef; ///< Type of storage ref using window_type = typename storage_ref_type::window_type; ///< Window type using value_type = typename storage_ref_type::value_type; ///< Storage element type @@ -189,6 +190,13 @@ class static_multimap_ref */ [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + /** + * @brief Gets the function(s) used to hash keys + * + * @return The function(s) used to hash keys + */ + [[nodiscard]] __host__ __device__ constexpr hasher hash_function() const noexcept; + /** * @brief Returns a const_iterator to one past the last slot. * diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 90f57f2f4..22cda307f 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -100,6 +100,7 @@ class static_multiset { /// Non-owning window storage ref type using storage_ref_type = typename impl_type::storage_ref_type; using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + using hasher = typename probing_scheme_type::hasher; ///< Hash function type template using ref_type = cuco::static_multiset_ref using ref_type = cuco::static_set_ref Date: Thu, 29 Aug 2024 02:23:14 +0200 Subject: [PATCH 12/12] Add diagnostic to detect unsupported or missing CCCL (#591) --- include/cuco/detail/__config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 6d4bf7339..24c7758ab 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -17,6 +17,7 @@ #pragma once #include +#include #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) #error "NVCC version not found" @@ -32,6 +33,10 @@ #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)" #endif +#if !defined(CCCL_VERSION) || (CCCL_VERSION < 2005000) +#error "CCCL version 2.5.0 or later is required" +#endif + // WAR for libcudacxx/296 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__)