From 359f5ae67e93b69a8df35ebd1d12f746aac8916e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 26 Sep 2023 13:13:44 -0700 Subject: [PATCH] Add device subsets example (#346) Depends on #349 This PR adds an example demonstrating how to create multiple subsets with one single storage. It includes necessary changes and cleanups that will unblock orc/parquet dictionary encoding (https://github.com/rapidsai/cudf/issues/12261) to use the new map/set data structures. --------- Co-authored-by: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> --- examples/CMakeLists.txt | 1 + examples/static_set/device_ref_example.cu | 16 +- examples/static_set/device_subsets_example.cu | 183 ++++++++++++++++++ include/cuco/aow_storage.cuh | 23 ++- include/cuco/detail/extent/extent.inl | 35 ++-- include/cuco/detail/open_addressing_impl.cuh | 8 +- .../cuco/detail/open_addressing_ref_impl.cuh | 10 +- .../cuco/detail/static_map/static_map_ref.inl | 39 ++++ .../cuco/detail/static_set/static_set_ref.inl | 34 ++++ include/cuco/detail/storage/aow_storage.inl | 8 + include/cuco/detail/storage/storage.cuh | 1 + include/cuco/extent.cuh | 16 +- include/cuco/static_map_ref.cuh | 42 ++++ include/cuco/static_set_ref.cuh | 41 ++++ include/cuco/storage.cuh | 1 + 15 files changed, 403 insertions(+), 55 deletions(-) create mode 100644 examples/static_set/device_subsets_example.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d78627eee..91e1417aa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -35,6 +35,7 @@ endfunction(ConfigureExample) ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu index 136292f6b..52e41cf45 100644 --- a/examples/static_set/device_ref_example.cu +++ b/examples/static_set/device_ref_example.cu @@ -26,6 +26,14 @@ #include #include +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. + */ + // insert a set of keys into a hash set using one cooperative group for each task template __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) @@ -60,14 +68,6 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O } } -/** - * @file device_reference_example.cu - * @brief Demonstrates usage of the static_set device-side APIs. - * - * static_set provides a non-owning reference which can be used to interact with - * the container from within device code. - * - */ int main(void) { using Key = int; diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu new file mode 100644 index 000000000..827342f95 --- /dev/null +++ b/examples/static_set/device_subsets_example.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +/** + * @file device_subsets_example.cu + * @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform + * individual operations via device-side ref APIs. + * + * To optimize memory usage, especially when dealing with expensive data allocation and multiple + * hashsets, a practical solution involves employing a single bulk storage for generating subsets. + * This eliminates the need for separate memory allocation and deallocation for each container. This + * can be achieved by using the lightweight non-owning ref type. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + */ + +auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr N = 10; ///< Number of elements to insert and query + +using key_type = int; ///< Key type +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cuco::default_hash_function>; ///< Type controls CG granularity and probing scheme + ///< (linear probing v.s. double hashing) +/// Type of bulk allocation storage +using storage_type = cuco::experimental::aow_storage; +/// Lightweight non-owning storage ref type +using storage_ref_type = typename storage_type::ref_type; +using ref_type = cuco::experimental::static_set_ref, + probing_scheme_type, + storage_ref_type>; ///< Set ref type + +/// Sample data to insert and query +__device__ constexpr std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +/// Empty slots are represented by reserved "sentinel" values. These values should be selected such +/// that they never occur in your input data. +key_type constexpr empty_key_sentinel = -1; + +/** + * @brief Inserts sample data into subsets by using cooperative group + * + * Each Cooperative Group creates its own subset and inserts `N` sample data. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void insert(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Get subset (or CG) index + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert); + + // Insert `N` elemtns into the set with CG insert + for (int i = 0; i < N; i++) { + insert_set_ref.insert(tile, data[i]); + } +} + +/** + * @brief All inserted data can be found + * + * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and + * verifies all inserted data can be found. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void find(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find); + + // Result denoting if any of the inserted data is not found + __shared__ int result; + if (threadIdx.x == 0) { result = 0; } + __syncthreads(); + + for (int i = 0; i < N; i++) { + // Query the set with inserted data + auto const found = find_set_ref.find(tile, data[i]); + // Record if the inserted data has been found + atomicOr(&result, *found != data[i]); + } + __syncthreads(); + + if (threadIdx.x == 0) { + // If the result is still 0, all inserted data are found. + if (result == 0) { printf("Success! Found all inserted elements.\n"); } + } +} + +int main() +{ + // Number of subsets to be created + auto constexpr num = 16; + // Each subset may have a different requested size + auto constexpr subset_sizes = + std::array{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50}; + + auto valid_sizes = std::vector(); + valid_sizes.reserve(num); + + for (size_t i = 0; i < num; ++i) { + valid_sizes.emplace_back( + static_cast(cuco::experimental::make_window_extent(subset_sizes[i]))); + } + + std::vector offsets(num + 1, 0); + + // prefix sum to compute offsets and total number of windows + std::size_t current_sum = 0; + for (std::size_t i = 0; i < valid_sizes.size(); ++i) { + current_sum += valid_sizes[i]; + offsets[i + 1] = current_sum; + } + + // total number of windows is located at the back of the offsets array + auto const total_num_windows = offsets.back(); + + // Create a single bulk storage used by all subsets + auto set_storage = storage_type{total_num_windows}; + // Initializes the storage with the given sentinel + set_storage.initialize(empty_key_sentinel); + + std::vector set_refs; + + // create subsets + for (std::size_t i = 0; i < num; ++i) { + storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]}; + set_refs.emplace_back( + ref_type{cuco::empty_key{empty_key_sentinel}, {}, {}, storage_ref}); + } + + thrust::device_vector d_set_refs(set_refs); + + // Insert sample data + insert<<<1, 128>>>(d_set_refs.data().get()); + // Find all inserted data + find<<<1, 128>>>(d_set_refs.data().get()); + + return 0; +} diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh index fdd970cf4..479246fac 100644 --- a/include/cuco/aow_storage.cuh +++ b/include/cuco/aow_storage.cuh @@ -16,10 +16,10 @@ #pragma once -#include - #include +#include #include +#include #include @@ -47,7 +47,10 @@ class aow_storage_ref; * @tparam Extent Type of extent denoting number of windows * @tparam Allocator Type of allocator used for device storage (de)allocation */ -template +template , + typename Allocator = cuco::cuda_allocator>> class aow_storage : public detail::aow_storage_base { public: using base_type = detail::aow_storage_base; ///< AoW base class type @@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { * @param size Number of windows to (de)allocate * @param allocator Allocator used for (de)allocating device storage */ - explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept; + explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; aow_storage(aow_storage&&) = default; ///< Move constructor /** @@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize(value_type key, cuda_stream_ref stream) noexcept; + void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; private: allocator_type allocator_; ///< Allocator used to (de)allocate windows @@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base { * @tparam WindowSize Number of slots in each window * @tparam Extent Type of extent denoting storage capacity */ -template +template > class aow_storage_ref : public detail::aow_storage_base { public: using base_type = detail::aow_storage_base; ///< AoW base class type diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl index 911bda9b1..a7cd83dcd 100644 --- a/include/cuco/detail/extent/extent.inl +++ b/include/cuco/detail/extent/extent.inl @@ -27,13 +27,10 @@ namespace cuco { namespace experimental { -template +template struct window_extent { using value_type = SizeType; ///< Extent value type - static auto constexpr cg_size = CGSize; - static auto constexpr window_size = WindowSize; - __host__ __device__ constexpr value_type value() const noexcept { return N; } __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); } @@ -45,15 +42,11 @@ struct window_extent { friend auto constexpr make_window_extent(extent ext); }; -template -struct window_extent - : cuco::utility::fast_int { +template +struct window_extent : cuco::utility::fast_int { using value_type = typename cuco::utility::fast_int::fast_int::value_type; ///< Extent value type - static auto constexpr cg_size = CGSize; - static auto constexpr window_size = WindowSize; - private: using cuco::utility::fast_int::fast_int; @@ -67,10 +60,10 @@ template return make_window_extent(ext); } -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) { - return make_window_extent(size); + return make_window_extent(extent{size}); } template @@ -86,15 +79,13 @@ template if (size > max_value) { CUCO_FAIL("Invalid input extent"); } if constexpr (N == dynamic_extent) { - return window_extent{static_cast( + return window_extent{static_cast( *cuco::detail::lower_bound( cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * CGSize)}; } if constexpr (N != dynamic_extent) { - return window_extent( *cuco::detail::lower_bound(cuco::detail::primes.begin(), cuco::detail::primes.end(), @@ -103,10 +94,10 @@ template } } -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size) +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) { - return static_cast(make_window_extent(extent{size})); + return make_window_extent(extent{size}); } namespace detail { @@ -115,8 +106,8 @@ template struct is_window_extent : std::false_type { }; -template -struct is_window_extent> : std::true_type { +template +struct is_window_extent> : std::true_type { }; template diff --git a/include/cuco/detail/open_addressing_impl.cuh b/include/cuco/detail/open_addressing_impl.cuh index ef4821b40..2bc3a7225 100644 --- a/include/cuco/detail/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing_impl.cuh @@ -141,11 +141,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream) noexcept - { - this->clear_async(stream); - stream.synchronize(); - } + void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -155,7 +151,7 @@ class open_addressing_impl { */ void clear_async(cuda_stream_ref stream) noexcept { - storage_.initialize(empty_slot_sentinel_, stream); + storage_.initialize_async(empty_slot_sentinel_, stream); } /** diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 213d35af1..cce691c21 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -87,12 +88,9 @@ class open_addressing_ref_impl { ProbingScheme>, "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); - static_assert(is_window_extent_v, - "Extent is not a valid cuco::window_extent"); - static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size, - "Extent has incompatible CG size"); - static_assert(StorageRef::window_size == StorageRef::extent_type::window_size, - "Extent has incompatible window size"); + // TODO: how to re-enable this check? + // static_assert(is_window_extent_v, + // "Extent is not a valid cuco::window_extent"); public: using key_type = Key; ///< Key type diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 250c84feb..28b3ffaf2 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -50,6 +50,30 @@ __host__ __device__ constexpr static_map_ref< { } +template +template +__host__ __device__ constexpr static_map_ref:: + static_map_ref( + static_map_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, + predicate_{std::move(other.predicate_)}, + empty_value_sentinel_{std::move(other.empty_value_sentinel_)} +{ +} + template return empty_value_sentinel_; } +template +template +auto static_map_ref::with( + NewOperators...) && noexcept +{ + return static_map_ref( + std::move(*this)); +} + template +template +__host__ __device__ constexpr static_set_ref:: + static_set_ref( + static_set_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, predicate_{std::move(other.predicate_)} +{ +} + template ::e return predicate_.empty_sentinel_; } +template +template +auto static_set_ref::with( + NewOperators...) && noexcept +{ + return static_set_ref( + std::move(*this)); +} + namespace detail { template ::ref() const noexcept template void aow_storage::initialize(value_type key, cuda_stream_ref stream) noexcept +{ + this->initialize_async(key, stream); + stream.synchronize(); +} + +template +void aow_storage::initialize_async( + value_type key, cuda_stream_ref stream) noexcept { auto constexpr cg_size = 1; auto constexpr stride = 4; diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh index b9a00baa2..4dda179c9 100644 --- a/include/cuco/detail/storage/storage.cuh +++ b/include/cuco/detail/storage/storage.cuh @@ -45,6 +45,7 @@ class storage : StorageImpl::template impl { using impl_type::capacity; using impl_type::data; using impl_type::initialize; + using impl_type::initialize_async; using impl_type::num_windows; using impl_type::ref; diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh index e45068d9e..50e7ae4aa 100644 --- a/include/cuco/extent.cuh +++ b/include/cuco/extent.cuh @@ -83,7 +83,7 @@ struct extent { * @tparam N Extent * */ -template +template struct window_extent; /** @@ -118,15 +118,16 @@ template * the capacity ctor argument for the container. * * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type * * @param size The input size * * @throw If the input size is invalid * - * @return Resulting valid extent as `std::size_t` + * @return Resulting valid extent */ -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); /** * @brief Computes valid window extent based on given parameters. @@ -162,15 +163,16 @@ template * * @tparam CGSize Number of elements handled per CG * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type * * @param size The input size * * @throw If the input size is invalid * - * @return Resulting valid extent as `std::size_t` + * @return Resulting valid extent */ -template -[[nodiscard]] std::size_t constexpr make_window_extent(std::size_t size); +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); } // namespace experimental } // namespace cuco diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 2460f1f10..c41ed88f3 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -17,8 +17,11 @@ #pragma once #include +#include #include +#include #include +#include #include @@ -106,6 +109,18 @@ class static_map_ref probing_scheme_type const& probing_scheme, storage_ref_type storage_ref) noexcept; + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_map_ref( + static_map_ref&& + other) noexcept; + /** * @brief Gets the maximum number of elements the container can hold. * @@ -127,6 +142,23 @@ class static_map_ref */ [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + private: struct predicate_wrapper; @@ -137,6 +169,16 @@ class static_map_ref // Mixins need to be friends with this class in order to access private members template friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_map_ref; }; } // namespace experimental diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index cf9c00ee0..b2c8158e7 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -18,8 +18,11 @@ #include #include +#include #include +#include #include +#include #include @@ -94,6 +97,18 @@ class static_set_ref probing_scheme_type const& probing_scheme, storage_ref_type storage_ref) noexcept; + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_set_ref( + static_set_ref&& + other) noexcept; + /** * @brief Gets the maximum number of elements the container can hold. * @@ -108,6 +123,23 @@ class static_set_ref */ [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + private: impl_type impl_; detail::equal_wrapper predicate_; ///< Key equality binary callable @@ -115,6 +147,15 @@ class static_set_ref // Mixins need to be friends with this class in order to access private members template friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_set_ref; }; } // namespace experimental diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh index e34e59c96..e2e0c6f46 100644 --- a/include/cuco/storage.cuh +++ b/include/cuco/storage.cuh @@ -20,6 +20,7 @@ namespace cuco { namespace experimental { + /** * @brief Public storage class. *