From 1bb0ae6ce9475b1902b9c66e92c172d4862ded7e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:34:22 -0800 Subject: [PATCH 1/4] Allow `cuco::arrow_filter_policy` to accept a custom implementation of `xxhash_64` (#642) --- include/cuco/bloom_filter_policies.cuh | 8 ++++++-- include/cuco/detail/bloom_filter/arrow_filter_policy.cuh | 7 ++++--- tests/bloom_filter/unique_sequence_test.cu | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policies.cuh index cf9ddb371..a433e7cca 100644 --- a/include/cuco/bloom_filter_policies.cuh +++ b/include/cuco/bloom_filter_policies.cuh @@ -18,6 +18,7 @@ #include #include +#include #include @@ -28,9 +29,12 @@ namespace cuco { * fingerprint. * * @tparam Key The type of the values to generate a fingerprint for. + * @tparam XXHash64 Custom (64 bit) XXHash hasher to generate a key's fingerprint. + * By default, cuco::xxhash_64 hasher will be used. + * */ -template -using arrow_filter_policy = detail::arrow_filter_policy; +template > +using arrow_filter_policy = detail::arrow_filter_policy; /** * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index 23b95793e..e1f1b641a 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -78,12 +78,13 @@ namespace cuco::detail { * @endcode * * @tparam Key The type of the values to generate a fingerprint for. + * @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation. */ -template +template class arrow_filter_policy { public: - using hasher = cuco::xxhash_64; ///< xxhash_64 hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy + using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy + using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy using hash_argument_type = typename hasher::argument_type; ///< Hash function input type using hash_result_type = decltype(std::declval()( std::declval())); ///< hash function output type diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index c5970b17c..b23564270 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -35,8 +35,8 @@ void test_unique_sequence(Filter& filter, size_type num_keys) { using Key = typename Filter::key_type; + // Generate keys thrust::device_vector keys(num_keys); - thrust::sequence(thrust::device, keys.begin(), keys.end()); thrust::device_vector contained(num_keys, false); @@ -119,4 +119,4 @@ TEMPLATE_TEST_CASE_SIG("bloom_filter arrow policy tests", auto filter = filter_type{1000}; test_unique_sequence(filter, num_keys); -} +} \ No newline at end of file From b8429d44314a02da9b5ebc17fa0a9284543aa825 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 26 Nov 2024 11:24:23 -0800 Subject: [PATCH 2/4] Doc update for `cuco::arrow_filter_policy` (#646) This PR updates docstring for `cuco::arrow_filter_policy` reflecting the recent change allowing taking in a custom `xxhash_64` hasher along with a couple other minor changes --- include/cuco/detail/bloom_filter/arrow_filter_policy.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index e1f1b641a..edb88e6e0 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -40,14 +40,15 @@ namespace cuco::detail { * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, * device_vector const& negative_keys) * { - * using policy_type = cuco::arrow_filter_policy; + * using xxhash_64 = cuco::xxhash_64; + * using policy_type = cuco::arrow_filter_policy; * * // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy. * static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be * in range: [1, 4194304]"); * * // Create a bloom filter with Arrow policy - * cuco::bloom_filter, + * cuco::bloom_filter, * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; * * // Add positive keys to the bloom filter From 58d79ee0c9dfc40b3ec042a518df4efc749901d3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 Dec 2024 16:06:04 -0800 Subject: [PATCH 3/4] Move kernels to their own namespaces to avoid build conflicts (#648) Replace #644. The original PR was accidentally closed since I force-pushed to my `dev` branch. Including the Bloom filter header alongside any other hash table header leads to build-time failures due to naming conflicts (see https://godbolt.org/z/P66ah8hah). This PR resolves the issue by relocating the open-addressing and Bloom filter kernels into separate namespaces to prevent such conflicts. --- .../detail/bloom_filter/bloom_filter_impl.cuh | 9 ++--- include/cuco/detail/bloom_filter/kernels.cuh | 4 +-- .../cuco/detail/open_addressing/functors.cuh | 4 +-- .../cuco/detail/open_addressing/kernels.cuh | 4 +-- .../open_addressing/open_addressing_impl.cuh | 35 ++++++++++--------- include/cuco/detail/static_map/helpers.cuh | 4 +-- include/cuco/detail/static_map/kernels.cuh | 4 +-- include/cuco/detail/static_map/static_map.inl | 6 ++-- tests/static_map/insert_or_apply_test.cu | 2 +- 9 files changed, 37 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index a67db3475..c2393b29e 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -198,7 +198,7 @@ class bloom_filter_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); - detail::add_if_n + detail::bloom_filter_ns::add_if_n <<>>(first, num_keys, stencil, pred, *this); } @@ -303,8 +303,9 @@ class bloom_filter_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); - detail::contains_if_n<<>>( - first, num_keys, stencil, pred, output_begin, *this); + detail::bloom_filter_ns::contains_if_n + <<>>( + first, num_keys, stencil, pred, output_begin, *this); } [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } @@ -365,4 +366,4 @@ class bloom_filter_impl { policy_type policy_; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::detail diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 1d514aa6b..b0ef7b684 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -22,7 +22,7 @@ #include #include -namespace cuco::detail { +namespace cuco::detail::bloom_filter_ns { CUCO_SUPPRESS_KERNEL_WARNINGS @@ -89,4 +89,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, } } -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::detail::bloom_filter_ns diff --git a/include/cuco/detail/open_addressing/functors.cuh b/include/cuco/detail/open_addressing/functors.cuh index b94f80226..14fa61f6f 100644 --- a/include/cuco/detail/open_addressing/functors.cuh +++ b/include/cuco/detail/open_addressing/functors.cuh @@ -18,7 +18,7 @@ #include #include -namespace cuco::open_addressing_ns::detail { +namespace cuco::detail::open_addressing_ns { /** * @brief Device functor returning the content of the slot indexed by `idx` @@ -107,4 +107,4 @@ struct slot_is_filled { } }; -} // namespace cuco::open_addressing_ns::detail +} // namespace cuco::detail::open_addressing_ns diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 56f329485..3842ffaa7 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -25,7 +25,7 @@ #include -namespace cuco::detail { +namespace cuco::detail::open_addressing_ns { CUCO_SUPPRESS_KERNEL_WARNINGS /** @@ -729,4 +729,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void rehash( } } -} // namespace cuco::detail +} // namespace cuco::detail::open_addressing_ns diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index f8d36b556..aece06a12 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -342,7 +342,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::insert_if_n + detail::open_addressing_ns::insert_if_n <<>>( first, num_keys, stencil, pred, counter.data(), container_ref); @@ -384,7 +384,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::insert_if_n + detail::open_addressing_ns::insert_if_n <<>>( first, num_keys, stencil, pred, container_ref); } @@ -426,7 +426,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::insert_and_find + detail::open_addressing_ns::insert_and_find <<>>( first, num_keys, found_begin, inserted_begin, container_ref); } @@ -466,7 +466,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::erase + detail::open_addressing_ns::erase <<>>( first, num_keys, container_ref); } @@ -540,7 +540,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::contains_if_n + detail::open_addressing_ns::contains_if_n <<>>( first, num_keys, stencil, pred, output_begin, container_ref); } @@ -615,7 +615,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::find_if_n + detail::open_addressing_ns::find_if_n <<>>( first, num_keys, stencil, pred, output_begin, container_ref); } @@ -789,8 +789,8 @@ class open_addressing_impl { std::min(static_cast(this->capacity()) - offset, stride); auto const begin = thrust::make_transform_iterator( thrust::counting_iterator{static_cast(offset)}, - open_addressing_ns::detail::get_slot(this->storage_ref())); - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + detail::open_addressing_ns::get_slot(this->storage_ref())); + auto const is_filled = detail::open_addressing_ns::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; std::size_t temp_storage_bytes = 0; @@ -844,7 +844,7 @@ class open_addressing_impl { template void for_each_async(CallbackOp&& callback_op, cuda::stream_ref stream) const { - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + auto const is_filled = detail::open_addressing_ns::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; auto storage_ref = this->storage_ref(); @@ -886,7 +886,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::for_each_n + detail::open_addressing_ns::for_each_n <<>>( first, num_keys, std::forward(callback_op), container_ref); } @@ -907,12 +907,12 @@ class open_addressing_impl { counter.reset(stream); auto const grid_size = cuco::detail::grid_size(storage_.num_buckets()); - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + auto const is_filled = detail::open_addressing_ns::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to // v2.1.0 - detail::size + detail::open_addressing_ns::size <<>>( storage_.ref(), is_filled, counter.data()); @@ -1014,10 +1014,10 @@ class open_addressing_impl { auto constexpr block_size = cuco::detail::default_block_size(); auto constexpr stride = cuco::detail::default_stride(); auto const grid_size = cuco::detail::grid_size(num_buckets, 1, stride, block_size); - auto const is_filled = open_addressing_ns::detail::slot_is_filled{ + auto const is_filled = detail::open_addressing_ns::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; - detail::rehash<<>>( + detail::open_addressing_ns::rehash<<>>( old_storage.ref(), container.ref(op::insert), is_filled); } @@ -1120,7 +1120,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); - detail::count + detail::open_addressing_ns::count <<>>( first, num_keys, counter.data(), container_ref); @@ -1180,8 +1180,9 @@ class open_addressing_impl { auto constexpr grid_stride = 1; auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size); - detail::retrieve<<>>( - first, n, output_probe, output_match, counter.data(), container_ref); + detail::open_addressing_ns::retrieve + <<>>( + first, n, output_probe, output_match, counter.data(), container_ref); auto const num_retrieved = counter.load_to_host(stream.get()); diff --git a/include/cuco/detail/static_map/helpers.cuh b/include/cuco/detail/static_map/helpers.cuh index 9627f4c9c..6eedae810 100644 --- a/include/cuco/detail/static_map/helpers.cuh +++ b/include/cuco/detail/static_map/helpers.cuh @@ -18,7 +18,7 @@ #include #include -namespace cuco::static_map_ns::detail { +namespace cuco::detail::static_map_ns { /** * @brief Dispatches to shared memory map kernel if `num_elements_per_thread > 2`, else @@ -112,4 +112,4 @@ void dispatch_insert_or_apply( first, num, init, op, ref); } } -} // namespace cuco::static_map_ns::detail \ No newline at end of file +} // namespace cuco::detail::static_map_ns diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 5c468ba37..dee450ff4 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -26,7 +26,7 @@ #include -namespace cuco::static_map_ns::detail { +namespace cuco::detail::static_map_ns { CUCO_SUPPRESS_KERNEL_WARNINGS // TODO user insert_or_assign internally @@ -262,4 +262,4 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( } } } -} // namespace cuco::static_map_ns::detail +} // namespace cuco::detail::static_map_ns diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 7c69263d2..67f7cfa94 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -284,7 +284,7 @@ void static_map + detail::static_map_ns::insert_or_assign <<>>( first, num, ref(op::insert_or_assign)); } @@ -335,7 +335,7 @@ void static_mapempty_value_sentinel(); // use empty_sentinel as unused init value - static_map_ns::detail::dispatch_insert_or_apply( + detail::static_map_ns::dispatch_insert_or_apply( first, last, init, op, ref(op::insert_or_apply), stream); } @@ -353,7 +353,7 @@ void static_map( + detail::static_map_ns::dispatch_insert_or_apply( first, last, init, op, ref(op::insert_or_apply), stream); } diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 9663c95a3..03d8ce5c9 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -115,7 +115,7 @@ void test_insert_or_apply_shmem(Map& map, size_type num_keys, size_type num_uniq cuda::stream_ref stream{}; // launch the shmem kernel - cuco::static_map_ns::detail:: + cuco::detail::static_map_ns:: insert_or_apply_shmem <<>>(pairs_begin, num_keys, From 759b710de719107667dfe80cf574d027d8e03649 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 Dec 2024 16:37:34 -0800 Subject: [PATCH 4/4] Add host find APIs taking custom key equality and hasher (#645) This PR introduces host find APIs for the hash set, enabling queries using a different key_eq and hash function. It can be used to improve cudf distinct join performance. --- include/cuco/detail/static_set/static_set.inl | 55 ++++++++++++++ include/cuco/static_set.cuh | 71 +++++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index dd9398aa0..340283ece 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -337,6 +337,29 @@ void static_set impl_->find_async(first, last, output_begin, ref(op::find), stream); } +template +template +void static_set::find_async( + InputIt first, + InputIt last, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputIt output_begin, + cuda::stream_ref stream) const +{ + impl_->find_async(first, + last, + output_begin, + ref(op::find).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash), + stream); +} + template impl_->find_if_async(first, last, stencil, pred, output_begin, ref(op::find), stream); } +template +template +void static_set::find_if_async( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputIt output_begin, + cuda::stream_ref stream) const +{ + impl_->find_if_async(first, + last, + stencil, + pred, + output_begin, + ref(op::find).rebind_key_eq(probe_equal).rebind_hash_function(probe_hash), + stream); +} + template + void find_async(InputIt first, + InputIt last, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputIt output_begin, + cuda::stream_ref stream = {}) const; + /** * @brief For all keys in the range `[first, last)`, finds a match with its key equivalent to the * query key. @@ -654,6 +682,49 @@ class static_set { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief For all keys in the range `[first, last)`, asynchronously finds + * a match with its key equivalent to the query key. + * + * @note If `pred( *(stencil + i) )` is true, stores the payload of the + * matched key or the `empty_value_sentienl` to `(output_begin + i)`. If `pred( *(stencil + i) )` + * is false, always stores the `empty_value_sentienl` to `(output_begin + i)`. + * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose `value_type` is convertible to + * Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam ProbeEqual Binary callable equal type + * @tparam ProbeHash Unary callable hasher type that can be constructed from + * an integer value + * @tparam OutputIt Device accessible output iterator + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param probe_equal The binary function to compare set keys and probe keys for equality + * @param probe_hash The unary function to hash probe keys + * @param output_begin Beginning of the sequence of matches retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + ProbeEqual const& probe_equal, + ProbeHash const& probe_hash, + OutputIt output_begin, + cuda::stream_ref stream = {}) const; + /** * @brief Applies the given function object `callback_op` to the copy of every filled slot in the * container