From a9c69b9e085df62b6322ad1a3920ccc22b2b8cf6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 31 Oct 2024 14:38:47 -0700 Subject: [PATCH] Add host for_each APIs for multiset and multimap --- .../static_multimap/static_multimap.inl | 67 ++++++ .../static_multiset/static_multiset.inl | 151 +++++++++---- include/cuco/static_multimap.cuh | 68 ++++++ include/cuco/static_multiset.cuh | 200 ++++++++++++------ 4 files changed, 376 insertions(+), 110 deletions(-) diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 7048a5426..965e14f3d 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -315,6 +315,73 @@ void static_multimapfind_async(first, last, output_begin, ref(op::find), stream); } +template +template +void static_multimap::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); +} + +template +template +void static_multimap:: + for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async(std::forward(callback_op), stream); +} + +template +template +void static_multimap::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_multimap:: + for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream) const noexcept +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); +} + template -template -std::pair -static_multiset::retrieve( - InputProbeIt first, - InputProbeIt last, - OutputProbeIt output_probe, - OutputMatchIt output_match, - cuda::stream_ref stream) const +template +void 
static_multiset::for_each( + CallbackOp&& callback_op, cuda::stream_ref stream) const { - return this->impl_->retrieve( - first, last, output_probe, output_match, this->ref(op::retrieve), stream); + impl_->for_each_async(std::forward(callback_op), stream); + stream.wait(); } template -template -std::pair -static_multiset::retrieve( - InputProbeIt first, - InputProbeIt last, - ProbeEqual const& probe_equal, - ProbeHash const& probe_hash, - OutputProbeIt output_probe, - OutputMatchIt output_match, - cuda::stream_ref stream) const +template +void static_multiset:: + for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const { - auto const probe_ref = - this->ref(op::retrieve).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash); - return this->impl_->retrieve(first, last, output_probe, output_match, probe_ref, stream); + impl_->for_each_async(std::forward(callback_op), stream); } template -template -std::pair -static_multiset::retrieve_outer( - InputProbeIt first, - InputProbeIt last, - ProbeEqual const& probe_equal, - ProbeHash const& probe_hash, - OutputProbeIt output_probe, - OutputMatchIt output_match, - cuda::stream_ref stream) const +template +void static_multiset::for_each( + InputIt first, InputIt last, CallbackOp&& callback_op, cuda::stream_ref stream) const { - auto const probe_ref = - this->ref(op::retrieve).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash); - return this->impl_->retrieve_outer(first, last, output_probe, output_match, probe_ref, stream); + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); + stream.wait(); +} + +template +template +void static_multiset:: + for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream) const noexcept +{ + impl_->for_each_async( + first, last, std::forward(callback_op), ref(op::for_each), stream); } template stream); } +template +template +std::pair +static_multiset::retrieve( + InputProbeIt first, + 
InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream) const +{ + return this->impl_->retrieve( + first, last, output_probe, output_match, this->ref(op::retrieve), stream); +} + +template +template +std::pair +static_multiset::retrieve( + InputProbeIt first, + InputProbeIt last, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream) const +{ + auto const probe_ref = + this->ref(op::retrieve).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash); + return this->impl_->retrieve(first, last, output_probe, output_match, probe_ref, stream); +} + +template +template +std::pair +static_multiset::retrieve_outer( + InputProbeIt first, + InputProbeIt last, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream) const +{ + auto const probe_ref = + this->ref(op::retrieve).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash); + return this->impl_->retrieve_outer(first, last, output_probe, output_match, probe_ref, stream); +} + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the filled slot + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. 
+ * + * @note The return value of `callback_op`, if any, is ignored. Synchronizes the given stream. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched slot + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched slot + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const noexcept; + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap * diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 943465c51..4cd5277d5 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -482,6 +482,140 @@ class static_multiset { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief Applies the given function object `callback_op` to the copy of every filled slot in the + * container + * + * @note The return value of `callback_op`, if any, is ignored. Synchronizes the given stream. 
+ * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the filled slot + * @param stream CUDA stream used for this operation + */ + template + void for_each(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously applies the given function object `callback_op` to the copy of every + * filled slot in the container + * + * @note The return value of `callback_op`, if any, is ignored. + * + * @tparam CallbackOp Type of unary callback function object + * + * @param callback_op Function to apply to the copy of the filled slot + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), applies the function object `callback_op` to + * the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. Synchronizes the given stream. + * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched slot + * @param stream CUDA stream used for this operation + */ + template + void for_each(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const; + + /** + * @brief For each key in the range [first, last), asynchronously applies the function object + * `callback_op` to the copy of all corresponding matches found in the container. + * + * @note The return value of `callback_op`, if any, is ignored. 
+ * + * @tparam InputIt Device accessible random access input iterator + * @tparam CallbackOp Type of unary callback function object + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param callback_op Function to apply to the copy of the matched slot + * @param stream CUDA stream used for this operation + */ + template + void for_each_async(InputIt first, + InputIt last, + CallbackOp&& callback_op, + cuda::stream_ref stream = {}) const noexcept; + + /** + * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset + * + * @note This function synchronizes the given stream. + * + * @tparam InputIt Device accessible input iterator + * + * @param first Beginning of the sequence of keys to count + * @param last End of the sequence of keys to count + * @param stream CUDA stream used for count + * + * @return The sum of total occurrences of all keys in `[first, last)` + */ + template + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; + + /** + * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset + * + * @note This function synchronizes the given stream. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam ProbeKeyEqual Binary callable + * @tparam ProbeHash Unary hash callable + * + * @param first Beginning of the sequence of keys to count + * @param last End of the sequence of keys to count + * @param probe_key_equal Binary callable to compare two keys for equality + * @param probe_hash Unary callable to hash a given key + * @param stream CUDA stream used for count + * + * @return The sum of total occurrences of all keys in `[first, last)` + */ + template + size_type count(InputIt first, + InputIt last, + ProbeKeyEqual const& probe_key_equal, + ProbeHash const& probe_hash, + cuda::stream_ref stream = {}) const; + + /** + * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset + * + * @note This function synchronizes the given stream. + * @note If a given key has no matches, its occurrence is 1. + * + * @tparam InputIt Device accessible input iterator + * @tparam ProbeKeyEqual Binary callable + * @tparam ProbeHash Unary hash callable + * + * @param first Beginning of the sequence of keys to count + * @param last End of the sequence of keys to count + * @param probe_key_equal Binary callable to compare two keys for equality + * @param probe_hash Unary callable to hash a given key + * @param stream CUDA stream used for count + * + * @return The sum of total occurrences of all keys in `[first, last)` where keys with no matches + * are considered to have a single occurrence. + */ + template + size_type count_outer(InputIt first, + InputIt last, + ProbeKeyEqual const& probe_key_equal, + ProbeHash const& probe_hash, + cuda::stream_ref stream = {}) const; + /** * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. 
* @@ -604,72 +738,6 @@ class static_multiset { OutputMatchIt output_match, cuda::stream_ref stream = {}) const; - /** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset - * - * @note This function synchronizes the given stream. - * - * @tparam Input Device accessible input iterator - * - * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count - * @param stream CUDA stream used for count - * - * @return The sum of total occurrences of all keys in `[first, last)` - */ - template - size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const; - - /** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset - * - * @note This function synchronizes the given stream. - * - * @tparam Input Device accessible input iterator - * @tparam ProbeKeyEqual Binary callable - * @tparam ProbeHash Unary hash callable - * - * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count - * @param probe_key_equal Binary callable to compare two keys for equality - * @param probe_hash Unary callable to hash a given key - * @param stream CUDA stream used for count - * - * @return The sum of total occurrences of all keys in `[first, last)` - */ - template - size_type count(InputIt first, - InputIt last, - ProbeKeyEqual const& probe_key_equal, - ProbeHash const& probe_hash, - cuda::stream_ref stream = {}) const; - - /** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset - * - * @note This function synchronizes the given stream. - * @note If a given key has no matches, its occurrence is 1. 
- * - * @tparam Input Device accessible input iterator - * @tparam ProbeKeyEqual Binary callable - * @tparam ProbeHash Unary hash callable - * - * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count - * @param probe_key_equal Binary callable to compare two keys for equality - * @param probe_hash Unary callable to hash a given key - * @param stream CUDA stream used for count - * - * @return The sum of total occurrences of all keys in `[first, last)` where keys have no matches - * are considered to have a single occurrence. - */ - template - size_type count_outer(InputIt first, - InputIt last, - ProbeKeyEqual const& probe_key_equal, - ProbeHash const& probe_hash, - cuda::stream_ref stream = {}) const; - /** * @brief Gets the number of elements in the container. *