Skip to content

Commit

Permalink
Add rehash functionality (#380)
Browse files Browse the repository at this point in the history
This PR adds the ability to `rehash` aka resize or regenerate an open
addressing container.

Closes #21
  • Loading branch information
sleeepyjack authored Oct 11, 2023
1 parent 0f86edb commit 72ca959
Show file tree
Hide file tree
Showing 12 changed files with 574 additions and 2 deletions.
3 changes: 2 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ ConfigureBench(STATIC_SET_BENCH
hash_table/static_set/find_bench.cu
hash_table/static_set/insert_bench.cu
hash_table/static_set/retrieve_all_bench.cu
hash_table/static_set/size_bench.cu)
hash_table/static_set/size_bench.cu
hash_table/static_set/rehash_bench.cu)

###################################################################################################
# - static_map benchmarks -------------------------------------------------------------------------
Expand Down
60 changes: 60 additions & 0 deletions benchmarks/hash_table/static_set/rehash_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/static_set.cuh>
#include <cuco/utility/key_generator.hpp>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>

/**
* @brief A benchmark evaluating `cuco::static_set::rehash` performance
*/
template <typename Key, typename Dist>
void static_set_rehash(nvbench::state& state, nvbench::type_list<Key, Dist>)
{
std::size_t const capacity = state.get_int64_or_default("Capacity", cuco::benchmark::defaults::N);
auto const occupancy =
state.get_float64_or_default("Occupancy", cuco::benchmark::defaults::OCCUPANCY);

std::size_t const num_keys = capacity * occupancy;

thrust::device_vector<Key> keys(num_keys); // slots per second

cuco::utility::key_generator gen;
gen.generate(cuco::benchmark::dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(capacity);

cuco::experimental::static_set<Key> set{capacity, cuco::empty_key<Key>{-1}};

set.insert(keys.begin(), keys.end());

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { set.rehash({launch.get_stream()}); });
}

NVBENCH_BENCH_TYPES(static_set_rehash,
NVBENCH_TYPE_AXES(cuco::benchmark::defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::utility::distribution::unique>))
.set_name("static_set_rehash_unique_occupancy")
.set_type_axes_names({"Key", "Distribution"})
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
.add_float64_axis("Occupancy", cuco::benchmark::defaults::OCCUPANCY_RANGE);
47 changes: 47 additions & 0 deletions include/cuco/detail/common_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,53 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
}

template <int32_t BlockSize, typename ContainerRef, typename Predicate>
__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
{
namespace cg = cooperative_groups;

__shared__ typename ContainerRef::value_type buffer[BlockSize];
__shared__ unsigned int buffer_size;

auto constexpr cg_size = ContainerRef::cg_size;
auto const block = cg::this_thread_block();
auto const tile = cg::tiled_partition<cg_size>(block);

auto const thread_rank = block.thread_rank();
auto constexpr tiles_per_block = BlockSize / cg_size; // tile.meta_group_size() but constexpr
auto const tile_rank = tile.meta_group_rank();
auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();
auto const n = storage_ref.num_windows();

while (idx - thread_rank < n) {
if (thread_rank == 0) { buffer_size = 0; }
block.sync();

// gather values in shmem buffer
if (idx < n) {
auto const window = storage_ref[idx];

for (auto const& slot : window) {
if (is_filled(slot)) { buffer[atomicAdd_block(&buffer_size, 1)] = slot; }
}
}
block.sync();

auto const local_buffer_size = buffer_size;

// insert from shmem buffer into the container
for (auto tidx = tile_rank; tidx < local_buffer_size; tidx += tiles_per_block) {
container_ref.insert(tile, buffer[tidx]);
}
block.sync();

idx += loop_stride;
}
}

} // namespace detail
} // namespace experimental
} // namespace cuco
115 changes: 115 additions & 0 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/extent.cuh>
#include <cuco/operator.hpp>
#include <cuco/probing_scheme.cuh>
#include <cuco/storage.cuh>
#include <cuco/utility/traits.hpp>
Expand Down Expand Up @@ -624,6 +625,120 @@ class open_addressing_impl {
return counter.load_to_host(stream);
}

/**
* @brief Regenerates the container
*
* @note This function synchronizes the given stream. For asynchronous execution use
* `rehash_async`.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
{
this->rehash_async(container, is_filled, stream);
stream.synchronize();
}

/**
* @brief Asynchronously reserves at least the specified number of slots and regenerates the
* container
*
* @note Changes the number of windows to a value that is not less than `extent`, then
* rehashes the container, i.e. puts the elements into appropriate slots considering
* that the total number of slots has changed.
*
* @note This function synchronizes the given stream. For asynchronous execution use
* `rehash_async`.
*
* @note Behavior is undefined if the desired `extent` is insufficient to store all of the
* contained elements.
*
* @note This function is not available if the conatiner's `extent_type` is static.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash(extent_type extent,
Container const& container,
Predicate const& is_filled,
cuda_stream_ref stream)
{
this->rehash_async(extent, container, is_filled, stream);
stream.synchronize();
}

/**
* @brief Asynchronously regenerates the container
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash_async(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
{
this->rehash_async(this->storage_.window_extent(), container, is_filled, stream);
}

/**
* @brief Asynchronously reserves at least the specified number of slots and regenerates the
* container
*
* @note Changes the number of windows to a value that is not less than `extent`, then
* rehashes the container, i.e. puts the elements into appropriate slots considering
* that the total number of slots has changed.
*
* @note Behavior is undefined if the desired `extent` is insufficient to store all of the
* contained elements.
*
* @note This function is not available if the conatiner's `extent_type` is static.
*
* @tparam Container The container type this function operates on
* @tparam Predicate Type of predicate indicating if the given slot is filled
*
* @param extent The container's new `window_extent` after this operation took place
* @param container The container to be rehashed
* @param is_filled Predicate indicating if the given slot is filled
* @param stream CUDA stream used for this operation
*/
template <typename Container, typename Predicate>
void rehash_async(extent_type extent,
Container const& container,
Predicate const& is_filled,
cuda_stream_ref stream)
{
auto const old_storage = std::move(this->storage_);
new (&storage_) storage_type{extent, this->allocator()};
this->clear_async(stream);

auto const num_windows = old_storage.num_windows();
if (num_windows == 0) { return; }

auto constexpr block_size = cuco::detail::default_block_size();
auto constexpr stride = cuco::detail::default_stride();
auto const grid_size = cuco::detail::grid_size(num_windows, 1, stride, block_size);

detail::rehash<block_size><<<grid_size, block_size, 0, stream>>>(
old_storage.ref(), container.ref(op::insert), is_filled);
}

/**
* @brief Gets the maximum number of elements the container can hold.
*
Expand Down
66 changes: 66 additions & 0 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,72 @@ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
return std::make_pair(keys_out + num, values_out + num);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
this->impl_->rehash(*this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
auto const extent = make_window_extent<static_map>(capacity);
this->impl_->rehash(extent, *this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
this->impl_->rehash_async(*this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
this->erased_key_sentinel());
auto const extent = make_window_extent<static_map>(capacity);
this->impl_->rehash_async(extent, *this, is_filled, stream);
}

template <class Key,
class T,
class Extent,
Expand Down
62 changes: 62 additions & 0 deletions include/cuco/detail/static_set/static_set.inl
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,68 @@ OutputIt static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
return impl_->retrieve_all(begin, output_begin, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
this->impl_->rehash(*this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
auto const extent = make_window_extent<static_set>(capacity);
this->impl_->rehash(extent, *this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
this->impl_->rehash_async(*this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
size_type capacity, cuda_stream_ref stream)
{
auto const is_filled =
static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
auto const extent = make_window_extent<static_set>(capacity);
this->impl_->rehash_async(extent, *this, is_filled, stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
Expand Down
Loading

0 comments on commit 72ca959

Please sign in to comment.