diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3635336e8..6b03cb98c 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -53,7 +53,8 @@ ConfigureBench(STATIC_SET_BENCH hash_table/static_set/find_bench.cu hash_table/static_set/insert_bench.cu hash_table/static_set/retrieve_all_bench.cu - hash_table/static_set/size_bench.cu) + hash_table/static_set/size_bench.cu + hash_table/static_set/rehash_bench.cu) ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- diff --git a/benchmarks/hash_table/static_set/rehash_bench.cu b/benchmarks/hash_table/static_set/rehash_bench.cu new file mode 100644 index 000000000..266e1f887 --- /dev/null +++ b/benchmarks/hash_table/static_set/rehash_bench.cu @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +/** + * @brief A benchmark evaluating `cuco::static_set::rehash` performance + */ +template +void static_set_rehash(nvbench::state& state, nvbench::type_list) +{ + std::size_t const capacity = state.get_int64_or_default("Capacity", cuco::benchmark::defaults::N); + auto const occupancy = + state.get_float64_or_default("Occupancy", cuco::benchmark::defaults::OCCUPANCY); + + std::size_t const num_keys = capacity * occupancy; + + thrust::device_vector keys(num_keys); // slots per second + + cuco::utility::key_generator gen; + gen.generate(cuco::benchmark::dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(capacity); + + cuco::experimental::static_set set{capacity, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { set.rehash({launch.get_stream()}); }); +} + +NVBENCH_BENCH_TYPES(static_set_rehash, + NVBENCH_TYPE_AXES(cuco::benchmark::defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_rehash_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE) + .add_float64_axis("Occupancy", cuco::benchmark::defaults::OCCUPANCY_RANGE); diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh index cecd50735..51200b6e5 100644 --- a/include/cuco/detail/common_kernels.cuh +++ b/include/cuco/detail/common_kernels.cuh @@ -293,6 +293,53 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } } +template +__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref, + ContainerRef container_ref, + Predicate is_filled) +{ + namespace cg = cooperative_groups; + + __shared__ typename ContainerRef::value_type buffer[BlockSize]; + __shared__ unsigned int buffer_size; + + auto constexpr cg_size = ContainerRef::cg_size; + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + + auto const thread_rank = block.thread_rank(); + auto constexpr tiles_per_block = BlockSize / cg_size; // tile.meta_group_size() but constexpr + auto const tile_rank = tile.meta_group_rank(); + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + auto const n = storage_ref.num_windows(); + + while (idx - thread_rank < n) { + if (thread_rank == 0) { buffer_size = 0; } + block.sync(); + + // gather values in shmem buffer + if (idx < n) { + auto const window = storage_ref[idx]; + + for (auto const& slot : window) { + if (is_filled(slot)) { buffer[atomicAdd_block(&buffer_size, 1)] = slot; } + } + } + block.sync(); + + auto const local_buffer_size = buffer_size; + + // insert from shmem buffer into the container + for (auto tidx = tile_rank; tidx < local_buffer_size; tidx += tiles_per_block) { + container_ref.insert(tile, buffer[tidx]); + } + block.sync(); + + idx += loop_stride; + } +} + } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 84865583d..712ff85ee 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,120 @@ class open_addressing_impl { return counter.load_to_host(stream); } + /** + * @brief Regenerates the container + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @tparam Container The container type this function operates on + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param extent The container's new `window_extent` after this operation took place + * @param container The container to be rehashed + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used for this operation + */ + template + void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream) + { + this->rehash_async(container, is_filled, stream); + stream.synchronize(); + } + + /** + * @brief Asynchronously reserves at least the specified number of slots and regenerates the + * container + * + * @note Changes the number of windows to a value that is not less than `extent`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @note Behavior is undefined if the desired `extent` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @tparam Container The container type this function operates on + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param extent The container's new `window_extent` after this operation took place + * @param container The container to be rehashed + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used for this operation + */ + template + void rehash(extent_type extent, + Container const& container, + Predicate const& is_filled, + cuda_stream_ref stream) + { + this->rehash_async(extent, container, is_filled, stream); + stream.synchronize(); + } + + /** + * @brief Asynchronously regenerates the container + * + * @tparam Container The container type this function operates on + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param extent The container's new `window_extent` after this operation took place + * @param container The container to be rehashed + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used for this operation + */ + template + void rehash_async(Container const& container, Predicate const& is_filled, cuda_stream_ref stream) + { + this->rehash_async(this->storage_.window_extent(), container, is_filled, stream); + } + + /** + * @brief Asynchronously reserves at least the specified number of slots and regenerates the + * container + * + * @note Changes the number of windows to a value that is not less than `extent`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note Behavior is undefined if the desired `extent` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @tparam Container The container type this function operates on + * @tparam Predicate Type of predicate indicating if the given slot is filled + * + * @param extent The container's new `window_extent` after this operation took place + * @param container The container to be rehashed + * @param is_filled Predicate indicating if the given slot is filled + * @param stream CUDA stream used for this operation + */ + template + void rehash_async(extent_type extent, + Container const& container, + Predicate const& is_filled, + cuda_stream_ref stream) + { + auto const old_storage = std::move(this->storage_); + new (&storage_) storage_type{extent, this->allocator()}; + this->clear_async(stream); + + auto const num_windows = old_storage.num_windows(); + if (num_windows == 0) { return; } + + auto constexpr block_size = cuco::detail::default_block_size(); + auto constexpr stride = cuco::detail::default_stride(); + auto const grid_size = cuco::detail::grid_size(num_windows, 1, stride, block_size); + + detail::rehash<<>>( + old_storage.ref(), container.ref(op::insert), is_filled); + } + /** * @brief Gets the maximum number of elements the container can hold. * diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 36eb74f5f..9249d4fa1 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -407,6 +407,72 @@ static_map:: return std::make_pair(keys_out + num, values_out + num); } +template +void static_map::rehash( + cuda_stream_ref stream) +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), + this->erased_key_sentinel()); + this->impl_->rehash(*this, is_filled, stream); +} + +template +void static_map::rehash( + size_type capacity, cuda_stream_ref stream) +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), + this->erased_key_sentinel()); + auto const extent = make_window_extent(capacity); + this->impl_->rehash(extent, *this, is_filled, stream); +} + +template +void static_map::rehash_async( + cuda_stream_ref stream) +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), + this->erased_key_sentinel()); + this->impl_->rehash_async(*this, is_filled, stream); +} + +template +void static_map::rehash_async( + size_type capacity, cuda_stream_ref stream) +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), + this->erased_key_sentinel()); + auto const extent = make_window_extent(capacity); + this->impl_->rehash_async(extent, *this, is_filled, stream); +} + template retrieve_all(begin, output_begin, is_filled, stream); } +template +void static_set::rehash( + cuda_stream_ref stream) +{ + auto const is_filled = + static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); + this->impl_->rehash(*this, is_filled, stream); +} + +template +void static_set::rehash( + size_type capacity, cuda_stream_ref stream) +{ + auto const is_filled = + static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); + auto const extent = make_window_extent(capacity); + this->impl_->rehash(extent, *this, is_filled, stream); +} + +template +void static_set::rehash_async( + cuda_stream_ref stream) +{ + auto const is_filled = + static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); + this->impl_->rehash_async(*this, is_filled, stream); +} + +template +void static_set::rehash_async( + size_type capacity, cuda_stream_ref stream) +{ + auto const is_filled = + static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); + auto const extent = make_window_extent(capacity); + this->impl_->rehash_async(extent, *this, is_filled, stream); +} + template { using impl_type::initialize_async; using impl_type::num_windows; using impl_type::ref; + using impl_type::window_extent; /** * @brief Constructs storage. diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 488f970b3..56fbabe23 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -620,6 +620,61 @@ class static_map { ValueOut values_out, cuda_stream_ref stream = {}) const; + /** + * @brief Regenerates the container. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @param stream CUDA stream used for this operation + */ + void rehash(cuda_stream_ref stream = {}); + + /** + * @brief Reserves at least the specified number of slots and regenerates the container + * + * @note Changes the number of slots to a value that is not less than `capacity`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @note Behavior is undefined if the desired `capacity` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @param capacity New capacity of the container + * @param stream CUDA stream used for this operation + */ + void rehash(size_type capacity, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously regenerates the container. + * + * @param stream CUDA stream used for this operation + */ + void rehash_async(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously reserves at least the specified number of slots and regenerates the + * container + * + * @note Changes the number of slots to a value that is not less than `capacity`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note Behavior is undefined if the desired `capacity` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @param capacity New capacity of the container + * @param stream CUDA stream used for this operation + */ + void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + /** * @brief Gets the number of elements in the container. * diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index fdb65f5f8..594cc5f27 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -542,6 +542,61 @@ class static_set { template OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + /** + * @brief Regenerates the container. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @param stream CUDA stream used for this operation + */ + void rehash(cuda_stream_ref stream = {}); + + /** + * @brief Reserves at least the specified number of slots and regenerates the container + * + * @note Changes the number of slots to a value that is not less than `capacity`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `rehash_async`. + * + * @note Behavior is undefined if the desired `capacity` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @param capacity New capacity of the container + * @param stream CUDA stream used for this operation + */ + void rehash(size_type capacity, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously regenerates the container. + * + * @param stream CUDA stream used for this operation + */ + void rehash_async(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously reserves at least the specified number of slots and regenerates the + * container + * + * @note Changes the number of slots to a value that is not less than `capacity`, then + * rehashes the container, i.e. puts the elements into appropriate slots considering + * that the total number of slots has changed. + * + * @note Behavior is undefined if the desired `capacity` is insufficient to store all of the + * contained elements. + * + * @note This function is not available if the conatiner's `extent_type` is static. + * + * @param capacity New capacity of the container + * @param stream CUDA stream used for this operation + */ + void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + /** * @brief Gets the number of elements in the container. * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 775b5b82f..916e0ea42 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -63,6 +63,7 @@ ConfigureTest(STATIC_SET_TEST static_set/insert_and_find_test.cu static_set/large_input_test.cu static_set/retrieve_all_test.cu + static_set/rehash_test.cu static_set/size_test.cu static_set/unique_sequence_test.cu) @@ -79,7 +80,8 @@ ConfigureTest(STATIC_MAP_TEST static_map/key_sentinel_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu - static_map/unique_sequence_test.cu) + static_map/unique_sequence_test.cu + static_map/rehash_test.cu) ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- diff --git a/tests/static_map/rehash_test.cu b/tests/static_map/rehash_test.cu new file mode 100644 index 000000000..69a73c6b3 --- /dev/null +++ b/tests/static_map/rehash_test.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +TEST_CASE("Rehash", "") +{ + using key_type = int; + using mapped_type = long; + + constexpr std::size_t num_keys{400}; + constexpr std::size_t num_erased_keys{100}; + + cuco::experimental::static_map map{num_keys, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + + thrust::sequence(d_keys.begin(), d_keys.end()); + thrust::sequence(d_values.begin(), d_values.end()); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num_keys); + + map.rehash(); + REQUIRE(map.size() == num_keys); + + map.rehash(num_keys * 2); + REQUIRE(map.size() == num_keys); + + map.erase(d_keys.begin(), d_keys.begin() + num_erased_keys); + map.rehash(); + REQUIRE(map.size() == num_keys - num_erased_keys); +} diff --git a/tests/static_set/rehash_test.cu b/tests/static_set/rehash_test.cu new file mode 100644 index 000000000..98106df8a --- /dev/null +++ b/tests/static_set/rehash_test.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +TEST_CASE("Rehash", "") +{ + using key_type = int; + + constexpr std::size_t num_keys{400}; + constexpr std::size_t num_erased_keys{100}; + + cuco::experimental::static_set set{ + num_keys, cuco::empty_key{-1}, cuco::erased_key{-2}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(d_keys.begin(), d_keys.end()); + + set.insert(d_keys.begin(), d_keys.end()); + + set.rehash(); + REQUIRE(set.size() == num_keys); + + set.rehash(num_keys * 2); + REQUIRE(set.size() == num_keys); + + set.erase(d_keys.begin(), d_keys.begin() + num_erased_keys); + set.rehash(); + REQUIRE(set.size() == num_keys - num_erased_keys); +}