Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into add-erase
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel committed Oct 4, 2023
2 parents 4b86ede + b4657fd commit f239a99
Show file tree
Hide file tree
Showing 26 changed files with 943 additions and 176 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endfunction(ConfigureExample)

ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Expand Down
16 changes: 8 additions & 8 deletions examples/static_set/device_ref_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
#include <cstddef>
#include <iostream>

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*/

// insert a set of keys into a hash set using one cooperative group for each task
template <typename SetRef, typename InputIterator>
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
Expand Down Expand Up @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
}
}

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*
*/
int main(void)
{
using Key = int;
Expand Down
183 changes: 183 additions & 0 deletions examples/static_set/device_subsets_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/static_set_ref.cuh>
#include <cuco/storage.cuh>

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

#include <cooperative_groups.h>

#include <cuda/std/array>

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>

/**
* @file device_subsets_example.cu
* @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform
* individual operations via device-side ref APIs.
*
* To optimize memory usage, especially when dealing with expensive data allocation and multiple
* hashsets, a practical solution involves employing a single bulk storage for generating subsets.
* This eliminates the need for separate memory allocation and deallocation for each container. This
* can be achieved by using the lightweight non-owning ref type.
*
* @note This example is for demonstration purposes only. It is not intended to show the most
* performant way to do the example algorithm.
*/

// --- Example-wide compile-time configuration -------------------------------

auto constexpr cg_size     = 8;   ///< A CUDA Cooperative Group of 8 threads to handle each subset
auto constexpr window_size = 1;   ///< Number of concurrent slots handled by each thread
auto constexpr N           = 10;  ///< Number of elements to insert and query

using key_type = int;  ///< Key type
using probing_scheme_type = cuco::experimental::linear_probing<
  cg_size,
  cuco::default_hash_function<key_type>>;  ///< Type controls CG granularity and probing scheme
                                           ///< (linear probing vs. double hashing)
/// Type of bulk allocation storage (array-of-windows layout, shared by all subsets)
using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
/// Lightweight non-owning storage ref type; views a slice of the bulk storage
using storage_ref_type = typename storage_type::ref_type;
using ref_type = cuco::experimental::static_set_ref<key_type,
                                                    cuda::thread_scope_device,
                                                    thrust::equal_to<key_type>,
                                                    probing_scheme_type,
                                                    storage_ref_type>;  ///< Set ref type

/// Sample data to insert and query (lives in device memory so kernels can read it directly)
__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;

/**
 * @brief Inserts the sample data into the subsets using cooperative groups
 *
 * Each cooperative group of `cg_size` threads owns one subset and inserts all `N`
 * sample elements into it.
 *
 * @param set_refs Pointer to the array of subset ref objects
 */
__global__ void insert(ref_type* set_refs)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // Map the flat thread id onto a subset: one subset per cooperative group
  auto const subset_idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Copy the raw ref, then augment it with the `insert` operator before use
  auto base_ref   = set_refs[subset_idx];
  auto insert_ref = std::move(base_ref).with(cuco::experimental::insert);

  // Cooperatively insert all `N` sample elements into this group's subset
  for (auto i = 0; i < N; ++i) {
    insert_ref.insert(tile, data[i]);
  }
}

/**
 * @brief Verifies that all inserted data can be found
 *
 * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and
 * verifies all inserted data can be found. Prints a success message from thread 0 of each
 * block when every queried key was present.
 *
 * @param set_refs Pointer to the array of subset objects
 */
__global__ void find(ref_type* set_refs)
{
  namespace cg = cooperative_groups;

  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
  // Subset (or CG) index: one subset per cooperative group
  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

  // Copy the raw ref and augment it with the `find` operator before use
  auto raw_set_ref  = *(set_refs + idx);
  auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find);

  // Result denoting if any of the inserted data is not found; shared across the block so
  // one flag aggregates the outcome of every group's queries
  __shared__ int result;
  if (threadIdx.x == 0) { result = 0; }
  __syncthreads();  // all threads must observe result == 0 before querying

  for (int i = 0; i < N; i++) {
    // Query the set with inserted data
    auto const found = find_set_ref.find(tile, data[i]);
    // Record if the inserted data has been found; atomicOr avoids races between groups.
    // NOTE(review): `*found` assumes every key is present (inserted by the prior kernel);
    // dereferencing a not-found iterator would be invalid.
    atomicOr(&result, *found != data[i]);
  }
  __syncthreads();  // ensure all queries finished before reading `result`

  if (threadIdx.x == 0) {
    // If the result is still 0, all inserted data are found.
    if (result == 0) { printf("Success! Found all inserted elements.\n"); }
  }
}

/**
 * @brief Creates a single bulk storage, carves it into subsets, and verifies that
 * insert/find round-trips succeed on every subset.
 *
 * @return 0 on success, 1 if a CUDA error occurred
 */
int main()
{
  // Number of subsets to be created
  auto constexpr num = 16;
  // Each subset may have a different requested size
  auto constexpr subset_sizes =
    std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};

  // Map each requested size to a valid window extent for the chosen probing scheme
  auto valid_sizes = std::vector<std::size_t>();
  valid_sizes.reserve(num);

  for (std::size_t i = 0; i < num; ++i) {
    valid_sizes.emplace_back(
      static_cast<std::size_t>(cuco::experimental::make_window_extent<ref_type>(subset_sizes[i])));
  }

  // Prefix sum of the valid sizes: offsets[i] is the first window of subset i and
  // offsets[num] is the total number of windows
  std::vector<std::size_t> offsets(num + 1, 0);
  std::partial_sum(valid_sizes.begin(), valid_sizes.end(), offsets.begin() + 1);

  // total number of windows is located at the back of the offsets array
  auto const total_num_windows = offsets.back();

  // Create a single bulk storage used by all subsets
  auto set_storage = storage_type{total_num_windows};
  // Initializes the storage with the given sentinel
  set_storage.initialize(empty_key_sentinel);

  std::vector<ref_type> set_refs;

  // Create one lightweight non-owning ref per subset, each viewing its own slice of
  // the bulk storage
  for (std::size_t i = 0; i < num; ++i) {
    storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]};
    set_refs.emplace_back(
      ref_type{cuco::empty_key<key_type>{empty_key_sentinel}, {}, {}, storage_ref});
  }

  thrust::device_vector<ref_type> d_set_refs(set_refs);

  // Insert sample data
  insert<<<1, 128>>>(d_set_refs.data().get());
  // Find all inserted data
  find<<<1, 128>>>(d_set_refs.data().get());

  // Kernel launches are asynchronous: synchronize so that execution errors are surfaced
  // and the device-side printf buffer is guaranteed to be flushed before exiting
  auto const status = cudaDeviceSynchronize();
  if (status != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << '\n';
    return 1;
  }

  return 0;
}
23 changes: 17 additions & 6 deletions include/cuco/aow_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#pragma once

#include <cuco/detail/storage/aow_storage_base.cuh>

#include <cuco/cuda_stream_ref.hpp>
#include <cuco/detail/storage/aow_storage_base.cuh>
#include <cuco/extent.cuh>
#include <cuco/utility/allocator.hpp>

#include <cuda/std/array>

Expand Down Expand Up @@ -47,7 +47,10 @@ class aow_storage_ref;
* @tparam Extent Type of extent denoting number of windows
* @tparam Allocator Type of allocator used for device storage (de)allocation
*/
template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
template <typename T,
int32_t WindowSize,
typename Extent = cuco::experimental::extent<std::size_t>,
typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down Expand Up @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param size Number of windows to (de)allocate
* @param allocator Allocator used for (de)allocating device storage
*/
explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;

aow_storage(aow_storage&&) = default; ///< Move constructor
/**
Expand Down Expand Up @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize(value_type key, cuda_stream_ref stream) noexcept;
void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;

/**
* @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
*
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;

private:
allocator_type allocator_; ///< Allocator used to (de)allocate windows
Expand All @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @tparam WindowSize Number of slots in each window
* @tparam Extent Type of extent denoting storage capacity
*/
template <typename T, int32_t WindowSize, typename Extent>
template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down
22 changes: 12 additions & 10 deletions include/cuco/detail/common_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace experimental {
namespace detail {
Expand All @@ -37,7 +39,7 @@ namespace detail {
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
Expand All @@ -55,12 +57,12 @@ namespace detail {
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename InputIt,
typename StencilIt,
typename Predicate,
typename AtomicT,
typename Ref>
__global__ void insert_if_n(InputIterator first,
__global__ void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
Expand All @@ -76,7 +78,7 @@ __global__ void insert_if_n(InputIterator first,

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_element{*(first + idx)};
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
if constexpr (CGSize == 1) {
if (ref.insert(insert_element)) { thread_num_successes++; };
} else {
Expand Down Expand Up @@ -106,7 +108,7 @@ __global__ void insert_if_n(InputIterator first,
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIterator Device accessible input iterator whose `value_type` is
* @tparam InputIt Device accessible input iterator whose `value_type` is
* convertible to the `value_type` of the data structure
* @tparam StencilIt Device accessible random access iterator whose value_type is
* convertible to Predicate's argument type
Expand All @@ -122,19 +124,19 @@ __global__ void insert_if_n(InputIterator first,
*/
template <int32_t CGSize,
int32_t BlockSize,
typename InputIterator,
typename InputIt,
typename StencilIt,
typename Predicate,
typename Ref>
__global__ void insert_if_n(
InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
if (pred(*(stencil + idx))) {
typename Ref::value_type const insert_element{*(first + idx)};
typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
if constexpr (CGSize == 1) {
ref.insert(insert_element);
} else {
Expand Down Expand Up @@ -230,7 +232,7 @@ __global__ void contains_if_n(InputIt first,
while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if constexpr (CGSize == 1) {
if (idx < n) {
auto const key = *(first + idx);
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
Expand All @@ -244,7 +246,7 @@ __global__ void contains_if_n(InputIt first,
} else {
auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
if (idx < n) {
auto const key = *(first + idx);
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
if (tile.thread_rank() == 0) { *(output_begin + idx) = found; }
}
Expand Down
Loading

0 comments on commit f239a99

Please sign in to comment.