Skip to content

Commit

Permalink
Add device subsets example (#346)
Browse files Browse the repository at this point in the history
Depends on #349 

This PR adds an example demonstrating how to create multiple subsets
with one single storage. It includes necessary changes and cleanups that
will unblock orc/parquet dictionary encoding
(rapidsai/cudf#12261) to use the new map/set
data structures.

---------

Co-authored-by: Daniel Juenger <[email protected]>
  • Loading branch information
PointKernel and sleeepyjack authored Sep 26, 2023
1 parent 0cd4da0 commit 359f5ae
Show file tree
Hide file tree
Showing 15 changed files with 403 additions and 55 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endfunction(ConfigureExample)

ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Expand Down
16 changes: 8 additions & 8 deletions examples/static_set/device_ref_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
#include <cstddef>
#include <iostream>

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*/

// insert a set of keys into a hash set using one cooperative group for each task
template <typename SetRef, typename InputIterator>
__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
Expand Down Expand Up @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
}
}

/**
* @file device_reference_example.cu
* @brief Demonstrates usage of the static_set device-side APIs.
*
* static_set provides a non-owning reference which can be used to interact with
* the container from within device code.
*
*/
int main(void)
{
using Key = int;
Expand Down
183 changes: 183 additions & 0 deletions examples/static_set/device_subsets_example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuco/static_set_ref.cuh>
#include <cuco/storage.cuh>

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

#include <cooperative_groups.h>

#include <cuda/std/array>

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>

/**
* @file device_subsets_example.cu
* @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform
* individual operations via device-side ref APIs.
*
* To optimize memory usage, especially when dealing with expensive data allocation and multiple
* hashsets, a practical solution involves employing a single bulk storage for generating subsets.
* This eliminates the need for separate memory allocation and deallocation for each container. This
* can be achieved by using the lightweight non-owning ref type.
*
* @note This example is for demonstration purposes only. It is not intended to show the most
* performant way to do the example algorithm.
*/

auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset
auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread
auto constexpr N = 10; ///< Number of elements to insert and query

using key_type = int; ///< Key type
using probing_scheme_type = cuco::experimental::linear_probing<
cg_size,
cuco::default_hash_function<key_type>>; ///< Type controls CG granularity and probing scheme
///< (linear probing v.s. double hashing)
/// Type of bulk allocation storage
using storage_type = cuco::experimental::aow_storage<key_type, window_size>;
/// Lightweight non-owning storage ref type
using storage_ref_type = typename storage_type::ref_type;
using ref_type = cuco::experimental::static_set_ref<key_type,
cuda::thread_scope_device,
thrust::equal_to<key_type>,
probing_scheme_type,
storage_ref_type>; ///< Set ref type

/// Sample data to insert and query
__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;

/**
* @brief Inserts sample data into subsets by using cooperative group
*
* Each Cooperative Group creates its own subset and inserts `N` sample data.
*
* @param set_refs Pointer to the array of subset objects
*/
__global__ void insert(ref_type* set_refs)
{
namespace cg = cooperative_groups;

auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
// Get subset (or CG) index
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

auto raw_set_ref = *(set_refs + idx);
auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert);

// Insert `N` elemtns into the set with CG insert
for (int i = 0; i < N; i++) {
insert_set_ref.insert(tile, data[i]);
}
}

/**
* @brief All inserted data can be found
*
* Each Cooperative Group reconstructs its own subset ref based on the storage parameters and
* verifies all inserted data can be found.
*
* @param set_refs Pointer to the array of subset objects
*/
__global__ void find(ref_type* set_refs)
{
namespace cg = cooperative_groups;

auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;

auto raw_set_ref = *(set_refs + idx);
auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find);

// Result denoting if any of the inserted data is not found
__shared__ int result;
if (threadIdx.x == 0) { result = 0; }
__syncthreads();

for (int i = 0; i < N; i++) {
// Query the set with inserted data
auto const found = find_set_ref.find(tile, data[i]);
// Record if the inserted data has been found
atomicOr(&result, *found != data[i]);
}
__syncthreads();

if (threadIdx.x == 0) {
// If the result is still 0, all inserted data are found.
if (result == 0) { printf("Success! Found all inserted elements.\n"); }
}
}

int main()
{
// Number of subsets to be created
auto constexpr num = 16;
// Each subset may have a different requested size
auto constexpr subset_sizes =
std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};

auto valid_sizes = std::vector<std::size_t>();
valid_sizes.reserve(num);

for (size_t i = 0; i < num; ++i) {
valid_sizes.emplace_back(
static_cast<std::size_t>(cuco::experimental::make_window_extent<ref_type>(subset_sizes[i])));
}

std::vector<std::size_t> offsets(num + 1, 0);

// prefix sum to compute offsets and total number of windows
std::size_t current_sum = 0;
for (std::size_t i = 0; i < valid_sizes.size(); ++i) {
current_sum += valid_sizes[i];
offsets[i + 1] = current_sum;
}

// total number of windows is located at the back of the offsets array
auto const total_num_windows = offsets.back();

// Create a single bulk storage used by all subsets
auto set_storage = storage_type{total_num_windows};
// Initializes the storage with the given sentinel
set_storage.initialize(empty_key_sentinel);

std::vector<ref_type> set_refs;

// create subsets
for (std::size_t i = 0; i < num; ++i) {
storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]};
set_refs.emplace_back(
ref_type{cuco::empty_key<key_type>{empty_key_sentinel}, {}, {}, storage_ref});
}

thrust::device_vector<ref_type> d_set_refs(set_refs);

// Insert sample data
insert<<<1, 128>>>(d_set_refs.data().get());
// Find all inserted data
find<<<1, 128>>>(d_set_refs.data().get());

return 0;
}
23 changes: 17 additions & 6 deletions include/cuco/aow_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#pragma once

#include <cuco/detail/storage/aow_storage_base.cuh>

#include <cuco/cuda_stream_ref.hpp>
#include <cuco/detail/storage/aow_storage_base.cuh>
#include <cuco/extent.cuh>
#include <cuco/utility/allocator.hpp>

#include <cuda/std/array>

Expand Down Expand Up @@ -47,7 +47,10 @@ class aow_storage_ref;
* @tparam Extent Type of extent denoting number of windows
* @tparam Allocator Type of allocator used for device storage (de)allocation
*/
template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
template <typename T,
int32_t WindowSize,
typename Extent = cuco::experimental::extent<std::size_t>,
typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down Expand Up @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param size Number of windows to (de)allocate
* @param allocator Allocator used for (de)allocating device storage
*/
explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;

aow_storage(aow_storage&&) = default; ///< Move constructor
/**
Expand Down Expand Up @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize(value_type key, cuda_stream_ref stream) noexcept;
void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;

/**
* @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
*
* @param key Key to which all keys in `slots` are initialized
* @param stream Stream used for executing the kernel
*/
void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;

private:
allocator_type allocator_; ///< Allocator used to (de)allocate windows
Expand All @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
* @tparam WindowSize Number of slots in each window
* @tparam Extent Type of extent denoting storage capacity
*/
template <typename T, int32_t WindowSize, typename Extent>
template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
public:
using base_type = detail::aow_storage_base<T, WindowSize, Extent>; ///< AoW base class type
Expand Down
35 changes: 13 additions & 22 deletions include/cuco/detail/extent/extent.inl
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,10 @@
namespace cuco {
namespace experimental {

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N = dynamic_extent>
template <typename SizeType, std::size_t N>
struct window_extent {
using value_type = SizeType; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

__host__ __device__ constexpr value_type value() const noexcept { return N; }
__host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); }

Expand All @@ -45,15 +42,11 @@ struct window_extent {
friend auto constexpr make_window_extent(extent<SizeType_, N_> ext);
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType>
struct window_extent<CGSize, WindowSize, SizeType, dynamic_extent>
: cuco::utility::fast_int<SizeType> {
template <typename SizeType>
struct window_extent<SizeType, dynamic_extent> : cuco::utility::fast_int<SizeType> {
using value_type =
typename cuco::utility::fast_int<SizeType>::fast_int::value_type; ///< Extent value type

static auto constexpr cg_size = CGSize;
static auto constexpr window_size = WindowSize;

private:
using cuco::utility::fast_int<SizeType>::fast_int;

Expand All @@ -67,10 +60,10 @@ template <typename Container, typename SizeType, std::size_t N>
return make_window_extent<Container::cg_size, Container::window_size>(ext);
}

template <typename Container>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
template <typename Container, typename SizeType>
[[nodiscard]] auto constexpr make_window_extent(SizeType size)
{
return make_window_extent<Container::cg_size, Container::window_size>(size);
return make_window_extent<Container::cg_size, Container::window_size>(extent<SizeType>{size});
}

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
Expand All @@ -86,15 +79,13 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
if (size > max_value) { CUCO_FAIL("Invalid input extent"); }

if constexpr (N == dynamic_extent) {
return window_extent<CGSize, WindowSize, SizeType>{static_cast<SizeType>(
return window_extent<SizeType>{static_cast<SizeType>(
*cuco::detail::lower_bound(
cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast<uint64_t>(size)) *
CGSize)};
}
if constexpr (N != dynamic_extent) {
return window_extent<CGSize,
WindowSize,
SizeType,
return window_extent<SizeType,
static_cast<std::size_t>(
*cuco::detail::lower_bound(cuco::detail::primes.begin(),
cuco::detail::primes.end(),
Expand All @@ -103,10 +94,10 @@ template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
}
}

template <int32_t CGSize, int32_t WindowSize>
[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size)
template <int32_t CGSize, int32_t WindowSize, typename SizeType>
[[nodiscard]] auto constexpr make_window_extent(SizeType size)
{
return static_cast<std::size_t>(make_window_extent<CGSize, WindowSize>(extent{size}));
return make_window_extent<CGSize, WindowSize>(extent<SizeType>{size});
}

namespace detail {
Expand All @@ -115,8 +106,8 @@ template <typename...>
struct is_window_extent : std::false_type {
};

template <int32_t CGSize, int32_t WindowSize, typename SizeType, std::size_t N>
struct is_window_extent<window_extent<CGSize, WindowSize, SizeType, N>> : std::true_type {
template <typename SizeType, std::size_t N>
struct is_window_extent<window_extent<SizeType, N>> : std::true_type {
};

template <typename T>
Expand Down
8 changes: 2 additions & 6 deletions include/cuco/detail/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,7 @@ class open_addressing_impl {
*
* @param stream CUDA stream this operation is executed in
*/
void clear(cuda_stream_ref stream) noexcept
{
this->clear_async(stream);
stream.synchronize();
}
void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }

/**
* @brief Asynchronously erases all elements from the container. After this call, `size()` returns
Expand All @@ -155,7 +151,7 @@ class open_addressing_impl {
*/
void clear_async(cuda_stream_ref stream) noexcept
{
storage_.initialize(empty_slot_sentinel_, stream);
storage_.initialize_async(empty_slot_sentinel_, stream);
}

/**
Expand Down
Loading

0 comments on commit 359f5ae

Please sign in to comment.