diff --git a/benchmarks/benchmark_defaults.hpp b/benchmarks/benchmark_defaults.hpp index ac80df38b..1d689d55a 100644 --- a/benchmarks/benchmark_defaults.hpp +++ b/benchmarks/benchmark_defaults.hpp @@ -34,16 +34,14 @@ using HASH_RANGE = nvbench::type_list, // cuco::murmurhash3_x86_128, // cuco::murmurhash3_x64_128>; // TODO handle tuple-like hash value -auto constexpr N = 100'000'000; -auto constexpr OCCUPANCY = 0.5; -auto constexpr MULTIPLICITY = 1; -auto constexpr MATCHING_RATE = 1.0; -auto constexpr MAX_NOISE = 3; -auto constexpr SKEW = 0.5; -auto constexpr BATCH_SIZE = 1'000'000; -auto constexpr INITIAL_SIZE = 50'000'000; -auto constexpr FILTER_SIZE_MB = 2'000; -auto constexpr PATTERN_BITS = 6; +auto constexpr N = 100'000'000; +auto constexpr OCCUPANCY = 0.5; +auto constexpr MULTIPLICITY = 1; +auto constexpr MATCHING_RATE = 1.0; +auto constexpr MAX_NOISE = 3; +auto constexpr SKEW = 0.5; +auto constexpr BATCH_SIZE = 1'000'000; +auto constexpr INITIAL_SIZE = 50'000'000; auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); auto const N_RANGE_CACHE = @@ -52,8 +50,5 @@ auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1); -auto const FILTER_SIZE_MB_RANGE_CACHE = - std::vector{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; -auto const PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8}; } // namespace cuco::benchmark::defaults diff --git a/benchmarks/bloom_filter/bloom_filter_bench.cu b/benchmarks/bloom_filter/bloom_filter_bench.cu index 11d5a3c2a..d653cadd6 100644 --- a/benchmarks/bloom_filter/bloom_filter_bench.cu +++ b/benchmarks/bloom_filter/bloom_filter_bench.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "defaults.hpp" + #include #include @@ -22,6 +24,7 @@ #include +#include #include #include #include @@ -64,19 +67,23 @@ void add_fpr_summary(nvbench::state& state, FilterType& filter) template void bloom_filter_add(nvbench::state& state, nvbench::type_list) { - using filter_type = cuco::bloom_filter, - cuda::thread_scope_device, - rebind_hasher_t>; + using filter_type = + cuco::bloom_filter, + cuda::thread_scope_device, + cuco::bloom_filter_policy, Block>>; auto const num_keys = state.get_int64("NumInputs"); auto const filter_size_mb = state.get_int64("FilterSizeMB"); auto const pattern_bits = state.get_int64("PatternBits"); + if (pattern_bits < filter_type::words_per_block and pattern_bits != defaults::BF_PATTERN_BITS) { + state.skip("pattern_bits must be at least words_per_block"); + } + std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::block_words); + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); thrust::device_vector keys(num_keys); @@ -85,9 +92,9 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list(num_keys * - filter_type::block_words); + filter_type::words_per_block); - filter_type filter{num_sub_filters, static_cast(pattern_bits)}; + filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; add_fpr_summary(state, filter); @@ -102,19 +109,23 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list void bloom_filter_contains(nvbench::state& state, nvbench::type_list) { - using filter_type = cuco::bloom_filter, - cuda::thread_scope_device, - rebind_hasher_t>; + using filter_type = + cuco::bloom_filter, + cuda::thread_scope_device, + cuco::bloom_filter_policy, Block>>; auto const num_keys = state.get_int64("NumInputs"); auto const filter_size_mb = state.get_int64("FilterSizeMB"); auto const pattern_bits = state.get_int64("PatternBits"); + if (pattern_bits < filter_type::words_per_block and pattern_bits != defaults::BF_PATTERN_BITS) { + state.skip("pattern_bits must be at least words_per_block"); + } + std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::block_words); + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); thrust::device_vector keys(num_keys); thrust::device_vector result(num_keys, false); @@ -124,9 +135,9 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list(num_keys * - filter_type::block_words); + filter_type::words_per_block); - filter_type filter{num_sub_filters, static_cast(pattern_bits)}; + filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; add_fpr_summary(state, filter); @@ -137,37 +148,34 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list; - NVBENCH_BENCH_TYPES(bloom_filter_add, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, - nvbench::type_list>, - nvbench::type_list, + nvbench::type_list, + nvbench::type_list, nvbench::type_list)) .set_name("bloom_filter_add_unique_size") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); NVBENCH_BENCH_TYPES(bloom_filter_add, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::HASH_RANGE, - nvbench::type_list, + nvbench::type_list, nvbench::type_list)) .set_name("bloom_filter_add_unique_hash") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB}) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); NVBENCH_BENCH_TYPES(bloom_filter_add, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, - nvbench::type_list>, + nvbench::type_list, nvbench::type_list, cuda::std::array, cuda::std::array, @@ -180,39 +188,39 @@ NVBENCH_BENCH_TYPES(bloom_filter_add, .set_name("bloom_filter_add_unique_block_dim") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB}) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); NVBENCH_BENCH_TYPES(bloom_filter_contains, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, - nvbench::type_list>, - nvbench::type_list, + nvbench::type_list, + nvbench::type_list, nvbench::type_list)) .set_name("bloom_filter_contains_unique_size") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); NVBENCH_BENCH_TYPES(bloom_filter_contains, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::HASH_RANGE, - nvbench::type_list, + nvbench::type_list, nvbench::type_list)) .set_name("bloom_filter_contains_unique_hash") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB}) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); NVBENCH_BENCH_TYPES(bloom_filter_contains, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, - nvbench::type_list>, + nvbench::type_list, nvbench::type_list, cuda::std::array, cuda::std::array, @@ -225,9 +233,9 @@ NVBENCH_BENCH_TYPES(bloom_filter_contains, .set_name("bloom_filter_contains_unique_block_dim") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB}) - .add_int64_axis("PatternBits", {defaults::PATTERN_BITS}); + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}) + .add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS}); /* // benchmark outer product of configuration space @@ -247,8 +255,8 @@ NVBENCH_BENCH_TYPES( .set_name("bloom_filter_add_unique_product") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE) + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE) .add_int64_axis("PatternBits", {1, 2, 4, 6, 8, 10}); NVBENCH_BENCH_TYPES( @@ -267,7 +275,7 @@ NVBENCH_BENCH_TYPES( .set_name("bloom_filter_contains_unique_product") .set_type_axes_names({"Key", "Hash", "Block", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumInputs", {BF_N}) - .add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE) + .add_int64_axis("NumInputs", {defaults::BF_N}) + .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE) .add_int64_axis("PatternBits", {1, 2, 4, 6, 8, 10}); */ diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp new file mode 100644 index 000000000..0b6fdd819 --- /dev/null +++ b/benchmarks/bloom_filter/defaults.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace cuco::benchmark::defaults { + +static constexpr auto BF_N = 400'000'000; +static constexpr auto BF_SIZE_MB = 2'000; +using BF_POLICY = cuco::default_filter_policy; +using BF_HASH = typename BF_POLICY::hasher; +using BF_BLOCK = cuda::std::array; +// This is a dummy value which will be dynamically replaced with the filter's actual default +auto constexpr BF_PATTERN_BITS = 0; +auto const BF_SIZE_MB_RANGE_CACHE = + std::vector{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; +auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; + +} // namespace cuco::benchmark::defaults diff --git a/examples/bloom_filter/host_bulk_example.cu b/examples/bloom_filter/host_bulk_example.cu index f7d762feb..14a2a5cfa 100644 --- a/examples/bloom_filter/host_bulk_example.cu +++ b/examples/bloom_filter/host_bulk_example.cu @@ -30,8 +30,8 @@ int main(void) int constexpr num_tp = num_keys * 0.5; int constexpr num_tn = num_keys - num_tp; - // Spawn a filter with 1000 sub-filters and 6-bit patterns for each key. - cuco::bloom_filter filter{1000, 6}; + // Spawn a filter with 200 sub-filters. + cuco::bloom_filter filter{200}; thrust::device_vector keys(num_keys); thrust::sequence(keys.begin(), keys.end(), 1); diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh index 3cb5f1d97..ec88e57a7 100644 --- a/include/cuco/bloom_filter.cuh +++ b/include/cuco/bloom_filter.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -50,22 +51,17 @@ namespace cuco { * independent add or lookup operations from device code. These operations are accessed through * non-owning, trivially copyable reference types (or "ref"). * - * @note `Block` is used **only** to determine `block_words` via `cuda::std::tuple_size` and - * `word_type` via `Block::value_type` and does not represent the actual storage type of the filter. - * We recommend using `cuda::std::array`. - * * @tparam Key Key type - * @tparam Block Type to determine the filter's block size and underlying word type * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads - * @tparam Hash Hash function used to generate a key's fingerprint + * @tparam Policy Type that defines how to generate and store key fingerprints (see + * `cuco/bloom_filter_policy.cuh`) * @tparam Allocator Type of allocator used for device-accessible storage */ template , class Extent = cuco::extent, cuda::thread_scope Scope = cuda::thread_scope_device, - class Hash = cuco::xxhash_64, + class Policy = cuco::default_filter_policy, class Allocator = cuco::cuda_allocator> class bloom_filter { public: @@ -75,16 +71,15 @@ class bloom_filter { * @tparam NewScope Thead scope of the to be updated ref type */ template - using ref_type = bloom_filter_ref; + using ref_type = bloom_filter_ref; static constexpr auto thread_scope = ref_type<>::thread_scope; ///< CUDA thread scope - static constexpr auto block_words = - ref_type<>::block_words; ///< Number of machine words in each filter block + static constexpr auto words_per_block = + ref_type<>::words_per_block; ///< Number of machine words in each filter block using key_type = typename ref_type<>::key_type; ///< Key Type using extent_type = typename ref_type<>::extent_type; ///< Extent type using size_type = typename extent_type::value_type; ///< Underlying type of the extent type - using hasher = typename ref_type<>::hasher; ///< Hash function type using word_type = typename ref_type<>::word_type; ///< Machine word type using allocator_type = typename std::allocator_traits::template rebind_alloc; ///< Allocator @@ -108,21 +103,20 @@ class bloom_filter { /** * @brief Constructs a statically-sized Bloom filter. * - * @note The total number of bits in the filter is determined by `block_words * num_blocks * + * @note The total number of bits in the filter is determined by `words_per_block * num_blocks * * sizeof(word_type) * CHAR_BIT`. * * @param num_blocks Number of sub-filters or blocks - * @param pattern_bits Number of bits in a key's fingerprint - * @param hash Hash function used to generate a key's fingerprint + * @param scope The scope in which operations will be performed + * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`) * @param alloc Allocator used for allocating device-accessible storage * @param stream CUDA stream used to initialize the filter */ __host__ bloom_filter(Extent num_blocks, - std::uint32_t pattern_bits, - cuda_thread_scope = {}, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuda::stream_ref stream = {}); + cuda_thread_scope scope = {}, + Policy const& policy = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); /** * @brief Erases all information from the filter. @@ -355,20 +349,6 @@ class bloom_filter { */ [[nodiscard]] __host__ extent_type block_extent() const noexcept; - /** - * @brief Gets the number of bits in a key's fingerprint - * - * @return The number of fingerprint bits - */ - [[nodiscard]] __host__ hasher hash_function() const noexcept; - - /** - * @brief Gets the function used to hash keys - * - * @return The function used to hash keys - */ - [[nodiscard]] __host__ std::uint32_t pattern_bits() const noexcept; - /** * @brief Gets the allocator. * diff --git a/include/cuco/bloom_filter_policy.cuh b/include/cuco/bloom_filter_policy.cuh new file mode 100644 index 000000000..0cbd6d29d --- /dev/null +++ b/include/cuco/bloom_filter_policy.cuh @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco { + +// TODO Policies are meant as customization points so we need to ensure a user-defined policy +// fulfills a certain concept. + +/** + * @brief A policy that defines how a Blocked Bloom Filter generates and stores a key's fingerprint. + * + * @note `Block` is used **only** to determine `block_words` via `cuda::std::tuple_size` and + * `word_type` via `Block::value_type` and does not represent the actual storage type of the filter. + * We recommend using `cuda::std::array`. + * + * @tparam Hash Hash function used to generate a key's fingerprint + * @tparam Block Type to determine the filter's block size and underlying word type + */ +template +class bloom_filter_policy { + using impl_type = cuco::detail::bloom_filter_policy_impl; + + public: + using hasher = typename impl_type::hasher; ///< Type of the hash function + using hash_argument_type = typename impl_type::hash_argument_type; ///< Hash function input type + using hash_result_type = typename impl_type::hash_result_type; ///< hash function output type + using word_type = + typename impl_type::word_type; ///< Underlying machine word type of the filter's storage + + static constexpr std::uint32_t words_per_block = + impl_type::words_per_block; ///< Number of machine words in each filter block + + public: + /** + * @brief Constructs the `bloom_filter_policy`object. + * + * @note This policy dynamically adjusts the `pattern_bits` to set at least one bit per word in + * the filter block. + * + * @param pattern_bits Number of bits in a key's fingerprint + * @param hash Hash function used to generate a key's fingerprint + */ + __host__ __device__ constexpr bloom_filter_policy(std::uint32_t pattern_bits = words_per_block, + Hash hash = {}); + + /** + * @brief Generates the hash value for a given key. + * + * @note This function is meant as a customization point and is only used in the internals of the + * `bloom_filter(_ref)` implementation. + * + * @param key The key to hash + * + * @return The hash value of the key + */ + __device__ constexpr hash_result_type hash(hash_argument_type const& key) const; + + /** + * @brief Determines the filter block a key is added into. + * + * @note This function is meant as a customization point and is only used in the internals of the + * `bloom_filter(_ref)` implementation. + * + * @tparam Extent Size type that is used to determine the number of blocks in the filter + * + * @param hash Hash value of the key + * @param num_blocks Number of block in the filter + * + * @return The block index for the given key's hash value + */ + template + __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const; + + /** + * @brief Determines the fingerprint pattern for a word within the filter block for a given key's + * hash value. + * + * @note This function is meant as a customization point and is only used in the internals of the + * `bloom_filter(_ref)` implementation. + * + * @param hash Hash value of the key + * @param word_index Target word within the filter block + * + * @return The bit pattern for the word in the filter block + */ + __device__ constexpr word_type word_pattern(hash_result_type hash, + std::uint32_t word_index) const; + + private: + impl_type impl_; ///< Policy implementation +}; + +/** + * @brief Default Bloom filter policy + * + * @tparam Key The type of the values to hash + */ +template +using default_filter_policy = + bloom_filter_policy, cuda::std::array>; + +} // namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index e3abac149..92ecd8432 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -22,38 +22,31 @@ #include #include -#include - namespace cuco { /** * @brief Non-owning "ref" type of `bloom_filter`. * * @note Ref types are trivially-copyable and are intended to be passed by value. - * @note `Block` is used **only** to determine `block_words` via `cuda::std::tuple_size` and - * `word_type` via `Block::value_type` and does not represent the actual storage type of the filter. - * We recommend using `cuda::std::array`. * * @tparam Key Key type - * @tparam Block Type to determine the filter's block size and underlying word type * @tparam Extent Size type that is used to determine the number of blocks in the filter * @tparam Scope The scope in which operations will be performed by individual threads - * @tparam Hash Hash function used to generate a key's fingerprint + * @tparam Policy Type that defines how to generate and store key fingerprints (see + * `cuco/bloom_filter_policy.cuh`) */ -template +template class bloom_filter_ref { - using impl_type = - detail::bloom_filter_impl; ///< Implementation type + using impl_type = detail::bloom_filter_impl; ///< Implementation type public: static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope - static constexpr auto block_words = - impl_type::block_words; ///< Number of machine words in each filter block + static constexpr auto words_per_block = + impl_type::words_per_block; ///< Number of machine words in each filter block using key_type = typename impl_type::key_type; ///< Key Type using extent_type = typename impl_type::extent_type; ///< Extent type using size_type = typename extent_type::value_type; ///< Underlying type of the extent type - using hasher = typename impl_type::hasher; ///< Hash function type using word_type = typename impl_type::word_type; ///< Machine word type /** @@ -61,18 +54,17 @@ class bloom_filter_ref { * * @note The storage span starting at `data` must have an extent of at least `num_blocks` * elements. - * @note `data` must be aligned to at least `sizeof(word_type) * block_words`. + * @note `data` must be aligned to at least `sizeof(word_type) * words_per_block`. * * @param data Pointer to the storage span of the filter * @param num_blocks Number of sub-filters or blocks - * @param pattern_bits Number of bits in a key's fingerprint - * @param hash Hash function used to generate a key's fingerprint + * @param scope The scope in which operations will be performed + * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policy.cuh`) */ __host__ __device__ bloom_filter_ref(word_type* data, Extent num_blocks, - std::uint32_t pattern_bits, - cuda_thread_scope, - Hash const& hash); + cuda_thread_scope scope, + Policy const& policy); /** * @brief Device function that cooperatively erases all information from the filter. @@ -114,7 +106,7 @@ class bloom_filter_ref { /** * @brief Device function that cooperatively adds a key to the filter. * - * @note Best performance is achieved if the size of the CG is equal to `block_words`. + * @note Best performance is achieved if the size of the CG is equal to `words_per_block`. * * @tparam CG Cooperative Group type * @tparam ProbeKey Input type that is implicitly convertible to `key_type` @@ -360,20 +352,6 @@ class bloom_filter_ref { */ [[nodiscard]] __host__ __device__ extent_type block_extent() const noexcept; - /** - * @brief Gets the number of sub-filter blocks. - * - * @return Number of sub-filter blocks - */ - [[nodiscard]] __host__ __device__ std::uint32_t pattern_bits() const noexcept; - - /** - * @brief Gets the number of bits in a key's fingerprint - * - * @return The number of fingerprint bits - */ - [[nodiscard]] __host__ __device__ hasher hash_function() const noexcept; - private: impl_type impl_; ///< Object containing the Blocked Bloom filter implementation }; diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl index 1545acfb0..64157cabd 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter.inl @@ -26,139 +26,86 @@ namespace cuco { -template -__host__ bloom_filter::bloom_filter( - Extent num_blocks, - uint32_t pattern_bits, - cuda_thread_scope, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) +template +__host__ bloom_filter::bloom_filter(Extent num_blocks, + cuda_thread_scope, + Policy const& policy, + Allocator const& alloc, + cuda::stream_ref stream) : allocator_{alloc}, - data_{ - allocator_.allocate(num_blocks * block_words), - detail::custom_deleter{num_blocks * block_words, allocator_}}, - ref_{data_.get(), num_blocks, pattern_bits, {}, hash} + data_{allocator_.allocate(num_blocks * words_per_block), + detail::custom_deleter{num_blocks * words_per_block, + allocator_}}, + ref_{data_.get(), num_blocks, {}, policy} { this->clear_async(stream); } -template -__host__ void bloom_filter::clear( - cuda::stream_ref stream) +template +__host__ void bloom_filter::clear(cuda::stream_ref stream) { ref_.clear(stream); } -template -__host__ void bloom_filter::clear_async( +template +__host__ void bloom_filter::clear_async( cuda::stream_ref stream) { ref_.clear_async(stream); } -template +template template -__host__ void bloom_filter::add(InputIt first, - InputIt last, - cuda::stream_ref stream) +__host__ void bloom_filter::add(InputIt first, + InputIt last, + cuda::stream_ref stream) { ref_.add(first, last, stream); } -template +template template -__host__ void bloom_filter::add_async( +__host__ void bloom_filter::add_async( InputIt first, InputIt last, cuda::stream_ref stream) { ref_.add_async(first, last, stream); } -template +template template -__host__ void bloom_filter::add_if( +__host__ void bloom_filter::add_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { ref_.add_if(first, last, stencil, pred, stream); } -template +template template -__host__ void bloom_filter::add_if_async( +__host__ void bloom_filter::add_if_async( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) noexcept { ref_.add_if_async(first, last, stencil, pred, stream); } -template +template template -__host__ void bloom_filter::contains( +__host__ void bloom_filter::contains( InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { ref_.contains(first, last, output_begin, stream); } -template +template template -__host__ void bloom_filter::contains_async( +__host__ void bloom_filter::contains_async( InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept { ref_.contains_async(first, last, output_begin, stream); } -template +template template -__host__ void bloom_filter::contains_if( +__host__ void bloom_filter::contains_if( InputIt first, InputIt last, StencilIt stencil, @@ -169,14 +116,9 @@ __host__ void bloom_filter::contains ref_.contains_if(first, last, stencil, pred, output_begin, stream); } -template +template template -__host__ void bloom_filter::contains_if_async( +__host__ void bloom_filter::contains_if_async( InputIt first, InputIt last, StencilIt stencil, @@ -187,89 +129,38 @@ __host__ void bloom_filter::contains ref_.contains_if_async(first, last, stencil, pred, output_begin, stream); } -template -[[nodiscard]] __host__ typename bloom_filter::word_type* -bloom_filter::data() noexcept +template +[[nodiscard]] __host__ typename bloom_filter::word_type* +bloom_filter::data() noexcept { return ref_.data(); } -template +template [[nodiscard]] __host__ - typename bloom_filter::word_type const* - bloom_filter::data() const noexcept + typename bloom_filter::word_type const* + bloom_filter::data() const noexcept { return ref_.data(); } -template -[[nodiscard]] __host__ - typename bloom_filter::extent_type - bloom_filter::block_extent() const noexcept +template +[[nodiscard]] __host__ typename bloom_filter::extent_type +bloom_filter::block_extent() const noexcept { return ref_.block_extent(); } -template -[[nodiscard]] __host__ uint32_t -bloom_filter::pattern_bits() const noexcept -{ - return ref_.pattern_bits(); -} - -template -[[nodiscard]] __host__ typename bloom_filter::hasher -bloom_filter::hash_function() const noexcept -{ - return ref_.hash_function(); -} - -template -[[nodiscard]] __host__ - typename bloom_filter::allocator_type - bloom_filter::allocator() const noexcept +template +[[nodiscard]] __host__ typename bloom_filter::allocator_type +bloom_filter::allocator() const noexcept { return allocator_; } -template -[[nodiscard]] __host__ typename bloom_filter::ref_type<> -bloom_filter::ref() const noexcept +template +[[nodiscard]] __host__ typename bloom_filter::ref_type<> +bloom_filter::ref() const noexcept { return ref_; } diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index 601f61436..c5d1beff4 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -38,41 +38,36 @@ namespace cuco::detail { -template +template class bloom_filter_impl { public: - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - static constexpr auto block_words = cuda::std::tuple_size_v; - using key_type = Key; using extent_type = Extent; using size_type = typename extent_type::value_type; - using hasher = Hash; - using word_type = typename Block::value_type; // TODO static_assert can use fetch_or() and load() - - static_assert(cuda::std::has_single_bit(block_words) and block_words <= 32, - "Number of words per block must be a power-of-two and less than or equal to 32"); - - __host__ __device__ bloom_filter_impl(word_type* filter, - Extent num_blocks, - uint32_t pattern_bits, - cuda_thread_scope, - Hash const& hash) - : words_{filter}, num_blocks_{num_blocks}, pattern_bits_{pattern_bits}, hash_{hash} + using policy_type = Policy; + using word_type = + typename policy_type::word_type; // TODO static_assert can use fetch_or() and load() + + static constexpr auto thread_scope = Scope; ///< CUDA thread scope + static constexpr auto words_per_block = policy_type::words_per_block; + + __host__ __device__ + bloom_filter_impl(word_type* filter, Extent num_blocks, cuda_thread_scope, Policy policy) + : words_{filter}, num_blocks_{num_blocks}, policy_{policy} { #ifndef __CUDA_ARCH__ auto const alignment = 1ull << cuda::std::countr_zero(reinterpret_cast(filter)); CUCO_EXPECTS(alignment >= required_alignment(), "Invalid memory alignment", std::runtime_error); - CUCO_EXPECTS(this->num_blocks_ > 0, "Number of blocks cannot be zero", std::runtime_error); + CUCO_EXPECTS(num_blocks_ > 0, "Number of blocks cannot be zero", std::runtime_error); #endif } template __device__ void clear(CG const& group) { - for (int i = group.thread_rank(); num_blocks_ * block_words; i += group.size()) { + for (int i = group.thread_rank(); num_blocks_ * words_per_block; i += group.size()) { words_[i] = 0; } } @@ -87,7 +82,7 @@ class bloom_filter_impl { { CUCO_CUDA_TRY(cub::DeviceFor::ForEachN( words_, - num_blocks_ * block_words, + num_blocks_ * words_per_block, [] __device__(word_type & word) { word = 0; }, stream.get())); } @@ -95,32 +90,32 @@ class bloom_filter_impl { template __device__ void add(ProbeKey const& key) { - auto const hash_value = hash_(key); - auto const idx = this->block_idx(hash_value); + auto const hash_value = policy_.hash(key); + auto const idx = policy_.block_index(hash_value, num_blocks_); -#pragma unroll block_words - for (int32_t i = 0; i < block_words; ++i) { - auto const word = this->pattern_word(hash_value, i); +#pragma unroll words_per_block + for (uint32_t i = 0; i < words_per_block; ++i) { + auto const word = policy_.word_pattern(hash_value, i); if (word != 0) { auto atom_word = - cuda::atomic_ref{*(words_ + (idx * block_words + i))}; + cuda::atomic_ref{*(words_ + (idx * words_per_block + i))}; atom_word.fetch_or(word, cuda::memory_order_relaxed); } } } template - __device__ void add(cooperative_groups::thread_block_tile const& tile, + __device__ void add(cooperative_groups::thread_block_tile const& tile, ProbeKey const& key) { - auto const hash_value = hash_(key); - auto const idx = this->block_idx(hash_value); + auto const hash_value = policy_.hash(key); + auto const idx = policy_.block_index(hash_value, num_blocks_); auto const rank = tile.thread_rank(); - auto const word = this->pattern_word(hash_value, rank); + auto const word = policy_.word_pattern(hash_value, rank); if (word != 0) { auto atom_word = - cuda::atomic_ref{*(words_ + (idx * block_words + rank))}; + cuda::atomic_ref{*(words_ + (idx * words_per_block + rank))}; atom_word.fetch_or(word, cuda::memory_order_relaxed); } } @@ -142,7 +137,7 @@ class bloom_filter_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - if constexpr (block_words == 1) { + if constexpr (words_per_block == 1) { CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( first, num_keys, @@ -173,8 +168,8 @@ class bloom_filter_impl { if (num_keys == 0) { return; } auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, block_words, cuco::detail::default_stride(), block_size); + auto const grid_size = cuco::detail::grid_size( + num_keys, words_per_block, cuco::detail::default_stride(), block_size); detail::add_if_n <<>>(first, num_keys, stencil, pred, *this); @@ -183,16 +178,15 @@ class bloom_filter_impl { template [[nodiscard]] __device__ bool contains(ProbeKey const& key) const { - auto const hash_value = hash_(key); - auto const idx = this->block_idx(hash_value); + auto const hash_value = policy_.hash(key); - auto const stored_pattern = - this->vec_load_words(idx * block_words); // vectorized load - auto const expected_pattern = this->pattern(hash_value); + auto const stored_pattern = this->vec_load_words( + policy_.block_index(hash_value, num_blocks_) * words_per_block); -#pragma unroll block_words - for (int32_t i = 0; i < block_words; ++i) { - if ((stored_pattern[i] & expected_pattern[i]) != expected_pattern[i]) { return false; } +#pragma unroll words_per_block + for (uint32_t i = 0; i < words_per_block; ++i) { + auto const expected_pattern = policy_.word_pattern(hash_value, i); + if ((stored_pattern[i] & expected_pattern) != expected_pattern) { return false; } } return true; @@ -267,10 +261,6 @@ class bloom_filter_impl { return num_blocks_; } - [[nodiscard]] __host__ __device__ uint32_t pattern_bits() const noexcept { return pattern_bits_; } - - [[nodiscard]] __host__ __device__ hasher hash_function() const noexcept { return hash_; } - // TODO // [[nodiscard]] __host__ double occupancy() const; // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const @@ -280,80 +270,23 @@ class bloom_filter_impl { // memory_to_use, cuda_thread_scope scope = {}) const noexcept; private: - template - __device__ size_type block_idx(HashValue hash_value) const - { - // TODO use fast_int modulo - return hash_value % num_blocks_; - } - - // we use the LSB bits of the hash value to determine the pattern bits for each word - template - __device__ auto pattern(HashValue hash_value) const - { - cuda::std::array pattern{}; - auto constexpr word_bits = sizeof(word_type) * CHAR_BIT; - auto constexpr bit_index_width = cuda::std::bit_width(word_bits - 1); - word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; - - auto const bits_per_word = pattern_bits_ / block_words; - auto const remainder = pattern_bits_ % block_words; - - uint32_t k = 0; -#pragma unroll block_words - for (int32_t i = 0; i < block_words; ++i) { - for (int32_t j = 0; j < bits_per_word + (i < remainder ? 1 : 0); ++j) { - if (k++ >= pattern_bits_) { return pattern; } - pattern[i] |= word_type{1} << (hash_value & bit_index_mask); - hash_value >>= bit_index_width; - } - } - - return pattern; - } - - template - __device__ word_type pattern_word(HashValue hash_value, uint32_t i) const - { - auto constexpr word_bits = sizeof(word_type) * CHAR_BIT; - auto constexpr bit_index_width = cuda::std::bit_width(word_bits - 1); - word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; - - auto const bits_per_word = pattern_bits_ / block_words; - auto const remainder = pattern_bits_ % block_words; - auto const bits_so_far = bits_per_word * i + (i < remainder ? i : remainder); - - hash_value >>= bits_so_far * bit_index_width; - - // Compute the word at index i - word_type word = 0; - int32_t j_limit = bits_per_word + (i < remainder ? 1 : 0); - - for (int32_t j = 0; j < j_limit; ++j) { - word |= word_type{1} << (hash_value & bit_index_mask); - hash_value >>= bit_index_width; - } - - return word; - } - template __device__ auto vec_load_words(size_type index) const { using vec_type = cuda::std::array; - return *reinterpret_cast( - __builtin_assume_aligned(words_ + index, sizeof(word_type) * NumWords)); + return *reinterpret_cast(__builtin_assume_aligned( + words_ + index, min(sizeof(word_type) * NumWords, required_alignment()))); } __host__ __device__ static constexpr size_t required_alignment() noexcept { - return sizeof(word_type) * block_words; + return sizeof(word_type) * words_per_block; // TODO check if a maximum of 16byte is suffiecient } word_type* words_; extent_type num_blocks_; - uint32_t pattern_bits_; - hasher hash_; + policy_type policy_; }; + } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter_policy.inl b/include/cuco/detail/bloom_filter/bloom_filter_policy.inl new file mode 100644 index 000000000..cf5ce6ecf --- /dev/null +++ b/include/cuco/detail/bloom_filter/bloom_filter_policy.inl @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { + +template +__host__ __device__ constexpr bloom_filter_policy::bloom_filter_policy( + uint32_t pattern_bits, Hash hash) + : impl_{pattern_bits, hash} +{ +} + +template +__device__ constexpr typename bloom_filter_policy::hash_result_type +bloom_filter_policy::hash( + typename bloom_filter_policy::hash_argument_type const& key) const +{ + return impl_.hash(key); +} + +template +template +__device__ constexpr auto bloom_filter_policy::block_index( + typename bloom_filter_policy::hash_result_type hash, Extent num_blocks) const +{ + return impl_.block_index(hash, num_blocks); +} + +template +__device__ constexpr typename bloom_filter_policy::word_type +bloom_filter_policy::word_pattern( + bloom_filter_policy::hash_result_type hash, std::uint32_t word_index) const +{ + return impl_.word_pattern(hash, word_index); +} + +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh new file mode 100644 index 000000000..affdb04cb --- /dev/null +++ b/include/cuco/detail/bloom_filter/bloom_filter_policy_impl.cuh @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// TODO switch to once available +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco::detail { + +template +class bloom_filter_policy_impl { + public: + using hasher = Hash; + using word_type = typename Block::value_type; + using hash_argument_type = typename hasher::argument_type; + using hash_result_type = decltype(std::declval()(std::declval())); + + static constexpr std::uint32_t words_per_block = cuda::std::tuple_size_v; + + private: + static constexpr std::uint32_t word_bits = + cuda::std::numeric_limits::digits; + static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); + + public: + static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32, + "Number of words per block must be a power-of-two and less than or equal to 32"); + + __host__ __device__ constexpr bloom_filter_policy_impl(uint32_t pattern_bits, Hash hash) + : hash_{hash} + { + constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; + + // This ensures each word in the block has at least one bit set; otherwise we would never use + // some of the words + constexpr uint32_t min_pattern_bits = words_per_block; + + // The maximum number of bits to be set for a key is capped by the number of bits in the hash + // value or the number of bits in the filter block + constexpr uint32_t max_pattern_bits = + cuda::std::min(hash_bits / bit_index_width, word_bits * words_per_block); + + // TODO do we want this to fail? + // constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; + // static_assert(min_pattern_bits <= max_pattern_bits_from_hash, "Hash value too narrow for the + // specified filter block type"); + + // TODO update this part once cuda::std::clamp is available + // Here we adjust the value dynamically; alternatively we could throw an error if the value is + // outside the expected range + pattern_bits_ = + cuda::std::min(cuda::std::max(min_pattern_bits, pattern_bits), max_pattern_bits); + + min_bits_per_word_ = pattern_bits_ / words_per_block; + remainder_bits_ = pattern_bits_ % words_per_block; + } + + __device__ constexpr hash_result_type hash(hash_argument_type const& key) const + { + return hash_(key); + } + + template + __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const + { + return hash % num_blocks; + } + + __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const + { + word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; + + auto const bits_so_far = min_bits_per_word_ * word_index + + (word_index < remainder_bits_ ? word_index : remainder_bits_); + + hash >>= bits_so_far * bit_index_width; + + word_type word = 0; + int32_t bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); + + for (int32_t bit = 0; bit < bits_per_word; ++bit) { + word |= word_type{1} << (hash & bit_index_mask); + hash >>= bit_index_width; + } + + return word; + } + + private: + uint32_t pattern_bits_; + uint32_t min_bits_per_word_; + uint32_t remainder_bits_; + hasher hash_; +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index 64ac421f5..56467a128 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -21,117 +21,112 @@ #include #include -#include - namespace cuco { -template -__host__ __device__ -bloom_filter_ref::bloom_filter_ref(word_type* data, - Extent num_blocks, - std::uint32_t pattern_bits, - cuda_thread_scope, - Hash const& hash) - : impl_{data, num_blocks, pattern_bits, {}, hash} +template +__host__ __device__ bloom_filter_ref::bloom_filter_ref( + word_type* data, Extent num_blocks, cuda_thread_scope, Policy const& policy) + : impl_{data, num_blocks, {}, policy} { } -template +template template -__device__ void bloom_filter_ref::clear(CG const& group) +__device__ void bloom_filter_ref::clear(CG const& group) { impl_.clear(group); } -template -__host__ void bloom_filter_ref::clear(cuda::stream_ref stream) +template +__host__ void bloom_filter_ref::clear(cuda::stream_ref stream) { impl_.clear(stream); } -template -__host__ void bloom_filter_ref::clear_async( - cuda::stream_ref stream) +template +__host__ void bloom_filter_ref::clear_async(cuda::stream_ref stream) { impl_.clear_async(stream); } -template +template template -__device__ void bloom_filter_ref::add(ProbeKey const& key) +__device__ void bloom_filter_ref::add(ProbeKey const& key) { impl_.add(key); } -template +template template -__device__ void bloom_filter_ref::add(CG const& group, - ProbeKey const& key) +__device__ void bloom_filter_ref::add(CG const& group, + ProbeKey const& key) { impl_.add(group, key); } -template +template template -__host__ void bloom_filter_ref::add(InputIt first, - InputIt last, - cuda::stream_ref stream) +__host__ void bloom_filter_ref::add(InputIt first, + InputIt last, + cuda::stream_ref stream) { impl_.add(first, last, stream); } -template +template template -__host__ void bloom_filter_ref::add_async(InputIt first, - InputIt last, - cuda::stream_ref stream) +__host__ void bloom_filter_ref::add_async(InputIt first, + InputIt last, + cuda::stream_ref stream) { impl_.add_async(first, last, stream); } -template +template template -__host__ void bloom_filter_ref::add_if( +__host__ void bloom_filter_ref::add_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { impl_.add_if(first, last, stencil, pred, stream); } -template +template template -__host__ void bloom_filter_ref::add_if_async( +__host__ void bloom_filter_ref::add_if_async( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) noexcept { impl_.add_if_async(first, last, stencil, pred, stream); } -template +template template -[[nodiscard]] __device__ bool bloom_filter_ref::contains( +[[nodiscard]] __device__ bool bloom_filter_ref::contains( ProbeKey const& key) const { return impl_.contains(key); } -template +template template -__host__ void bloom_filter_ref::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const +__host__ void bloom_filter_ref::contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream) const { impl_.contains(first, last, output_begin, stream); } -template +template template -__host__ void bloom_filter_ref::contains_async( +__host__ void bloom_filter_ref::contains_async( InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept { impl_.contains_async(first, last, output_begin, stream); } -template +template template -__host__ void bloom_filter_ref::contains_if( +__host__ void bloom_filter_ref::contains_if( InputIt first, InputIt last, StencilIt stencil, @@ -142,9 +137,9 @@ __host__ void bloom_filter_ref::contains_if( impl_.contains_if(first, last, stencil, pred, output_begin, stream); } -template +template template -__host__ void bloom_filter_ref::contains_if_async( +__host__ void bloom_filter_ref::contains_if_async( InputIt first, InputIt last, StencilIt stencil, @@ -155,42 +150,26 @@ __host__ void bloom_filter_ref::contains_if_asy impl_.contains_if_async(first, last, stencil, pred, output_begin, stream); } -template -[[nodiscard]] __host__ __device__ - typename bloom_filter_ref::word_type* - bloom_filter_ref::data() noexcept +template +[[nodiscard]] __host__ __device__ typename bloom_filter_ref::word_type* +bloom_filter_ref::data() noexcept { return impl_.data(); } -template +template [[nodiscard]] __host__ __device__ - typename bloom_filter_ref::word_type const* - bloom_filter_ref::data() const noexcept + typename bloom_filter_ref::word_type const* + bloom_filter_ref::data() const noexcept { return impl_.data(); } -template -[[nodiscard]] __host__ __device__ - typename bloom_filter_ref::extent_type - bloom_filter_ref::block_extent() const noexcept +template +[[nodiscard]] __host__ __device__ typename bloom_filter_ref::extent_type +bloom_filter_ref::block_extent() const noexcept { return impl_.block_extent(); } -template -[[nodiscard]] __host__ __device__ uint32_t -bloom_filter_ref::pattern_bits() const noexcept -{ - return impl_.pattern_bits(); -} - -template -[[nodiscard]] __host__ __device__ typename bloom_filter_ref::hasher -bloom_filter_ref::hash_function() const noexcept -{ - return impl_.hash_function(); -} - } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index df1848ea6..59dbc22e3 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -30,18 +30,18 @@ template (cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); while (idx < n) { if (pred(*(stencil + idx))) { typename std::iterator_traits::value_type const& insert_element{*(first + idx)}; - if constexpr (block_words == 1) { + if constexpr (words_per_block == 1) { ref.add(insert_element); } else { ref.add(tile, insert_element); diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index ac65eb284..2c2ffce48 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -91,22 +91,19 @@ void test_unique_sequence(Filter& filter, size_type num_keys) TEMPLATE_TEST_CASE_SIG( "Unique sequence", "", - ((typename Key, typename Hash, uint32_t BlockWords, typename Word), Key, Hash, BlockWords, Word), - (int32_t, cuco::default_hash_function, 1, uint32_t), - (int32_t, cuco::default_hash_function, 8, uint32_t), - (int32_t, cuco::default_hash_function, 1, uint64_t), - (int32_t, cuco::default_hash_function, 8, uint64_t)) + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::bloom_filter_policy, cuda::std::array>), + (int32_t, cuco::bloom_filter_policy, cuda::std::array>), + (int32_t, cuco::bloom_filter_policy, cuda::std::array>), + (int32_t, cuco::bloom_filter_policy, cuda::std::array>)) { - using filter_type = cuco::bloom_filter, - cuco::extent, - cuda::thread_scope_device, - Hash>; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; constexpr size_type num_keys{400}; - uint32_t pattern_bits = GENERATE(1, 2, 4, 6); + uint32_t word_bits = GENERATE(1, 2, 4, 6); - auto filter = filter_type{1000, pattern_bits}; + auto filter = filter_type{1000, {}, {word_bits * Policy::words_per_block}}; test_unique_sequence(filter, num_keys); }