diff --git a/README.md b/README.md index 163e43f2f..f9d5d885e 100644 --- a/README.md +++ b/README.md @@ -205,10 +205,10 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/T49P85Mnd)) -- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/dh8bMn3G1)) -- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/7djKevK6e)) -- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/vecGeYM48)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/7jK9od6bx)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/W338MePdW)) +- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/YYb1WE9od)) +- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/6rz7MYoMe)) ### `static_multimap` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 91e1417aa..a3d0ae247 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,7 +37,7 @@ ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/stati ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") -ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") +ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") diff --git a/examples/static_map/count_by_key_example.cu b/examples/static_map/count_by_key_example.cu index 4c8cfdb11..903da975a 100644 --- a/examples/static_map/count_by_key_example.cu +++ b/examples/static_map/count_by_key_example.cu @@ -36,7 +36,7 @@ * the context of a count-by-key operation, i.e. for a histogram over keys. * * Individual operations like a single insert or find can be performed in device code via the - * static_map "device_view" types. + * "static_map_ref" types. * * @note This example is for demonstration purposes only. It is not intended to show the most * performant way to do the example algorithm. @@ -47,17 +47,17 @@ * @brief Inserts keys and counts how often they occur in the input sequence. * * @tparam BlockSize CUDA block size - * @tparam Map Type of the map returned from static_map::get_device_mutable_view + * @tparam Map Type of the map device reference * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type * @tparam UniqueIter Output iterator whose value_type is convertible to uint64_t * - * @param[in] map_view View of the map into which inserts will be performed + * @param[in] map_ref Reference of the map into which inserts will be performed * @param[in] key_begin The beginning of the range of keys to insert * @param[in] num_keys The total number of keys and values * @param[out] num_unique_keys The total number of distinct keys inserted */ template -__global__ void count_by_key(Map map_view, +__global__ void count_by_key(Map map_ref, KeyIter keys, uint64_t num_keys, UniqueIter num_unique_keys) @@ -71,13 +71,14 @@ __global__ void count_by_key(Map map_view, uint64_t thread_unique_keys = 0; while (idx < num_keys) { // insert key into the map with a count of 1 - auto [slot, is_new_key] = map_view.insert_and_find({keys[idx], 1}); + auto [slot, is_new_key] = map_ref.insert_and_find(cuco::pair{keys[idx], 1}); if (is_new_key) { // first occurrence of the key thread_unique_keys++; } else { // key is already in the map -> increment count - slot->second.fetch_add(1, cuda::memory_order_relaxed); + auto ref = cuda::atomic_ref{slot->second}; + ref.fetch_add(1, cuda::memory_order_relaxed); } idx += loop_stride; } @@ -101,7 +102,7 @@ int main(void) // Empty slots are represented by reserved "sentinel" values. These values should be selected such // that they never occur in your input data. Key constexpr empty_key_sentinel = static_cast(-1); - Count constexpr empty_value_sentinel = static_cast(-1); + Count constexpr empty_value_sentinel = static_cast(-1); // Number of keys to be inserted auto constexpr num_keys = 50'000; @@ -125,23 +126,26 @@ int main(void) // Compute capacity based on a 50% load factor auto constexpr load_factor = 0.5; - // If the number of unique keys is known in advance, we can use it to calculate the map capacity - std::size_t const capacity = std::ceil((num_keys / key_duplicates) / load_factor); - // If we can't give an estimated upper bound on the number of unique keys - // we conservatively assume each key in the input is distinct - // std::size_t const capacity = std::ceil(num_keys / load_factor); + // If the number of elements is known in advance, we can use it to calculate the map capacity + std::size_t const num_elements = num_keys / key_duplicates; - // Constructs a map with "capacity" slots. - cuco::static_map map{ - capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + // Constructs a map with number of elements and desired load factor. + auto map = cuco::experimental::static_map{ + num_elements, + load_factor, + cuco::empty_key{empty_key_sentinel}, + cuco::empty_value{empty_value_sentinel}, + thrust::equal_to{}, + cuco::experimental::linear_probing<1, cuco::default_hash_function>{}}; - // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel - auto device_insert_view = map.get_device_mutable_view(); + // Get a non-owning, mutable reference of the map that allows `insert_and_find` operation to pass + // by value into the kernel + auto map_ref = map.ref(cuco::experimental::op::insert_and_find); auto constexpr block_size = 256; auto const grid_size = (num_keys + block_size - 1) / block_size; - count_by_key<<>>( - device_insert_view, insert_keys.begin(), num_keys, num_unique_keys.data()); + count_by_key + <<>>(map_ref, insert_keys.begin(), num_keys, num_unique_keys.data()); // Retrieve contents of all the non-empty slots in the map thrust::device_vector result_keys(num_unique_keys[0]); @@ -149,10 +153,10 @@ int main(void) map.retrieve_all(result_keys.begin(), result_counts.begin()); // Check if the number of result keys is correct - auto num_keys_check = num_unique_keys[0] == (num_keys / key_duplicates); + auto const num_keys_check = num_unique_keys[0] == (num_keys / key_duplicates); // Iterate over all result counts and verify that they are correct - auto counts_check = thrust::all_of( + auto const counts_check = thrust::all_of( result_counts.begin(), result_counts.end(), [] __host__ __device__(Count const count) { return count == key_duplicates; }); diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu index 217672646..21a9cd8f1 100644 --- a/examples/static_map/custom_type_example.cu +++ b/examples/static_map/custom_type_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,6 @@ #include // User-defined key type -#if !defined(CUCO_HAS_INDEPENDENT_THREADS) struct custom_key_type { int32_t a; int32_t b; @@ -33,24 +32,6 @@ struct custom_key_type { __host__ __device__ custom_key_type() {} __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x} {} }; -#else -// Key type larger than 8B only supported for sm_70 and up -struct custom_key_type { - int32_t a; - int32_t b; - int32_t c; - - __host__ __device__ custom_key_type() {} - __host__ __device__ custom_key_type(int32_t x) : a{x}, b{x}, c{x} {} - - // Device equality operator is mandatory due to libcudacxx bug: - // https://github.com/NVIDIA/libcudacxx/issues/223 - __device__ bool operator==(custom_key_type const& other) const - { - return a == other.a and b == other.b and c == other.c; - } -}; -#endif // User-defined value type struct custom_value_type { @@ -63,17 +44,12 @@ struct custom_value_type { // User-defined device hash callable struct custom_hash { - template - __device__ uint32_t operator()(key_type k) - { - return k.a; - }; + __device__ uint32_t operator()(custom_key_type const& k) const noexcept { return k.a; }; }; // User-defined device key equal callable -struct custom_key_equals { - template - __device__ bool operator()(key_type const& lhs, key_type const& rhs) +struct custom_key_equal { + __device__ bool operator()(custom_key_type const& lhs, custom_key_type const& rhs) const noexcept { return lhs.a == rhs.a; } @@ -91,15 +67,20 @@ int main(void) auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), cuda::proclaim_return_type>( - [] __device__(auto i) { return cuco::make_pair(custom_key_type{i}, custom_value_type{i}); })); + [] __device__(auto i) { + return cuco::pair{custom_key_type{i}, custom_value_type{i}}; + })); // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. - cuco::static_map map{ - 100'000, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + auto map = cuco::experimental::static_map{cuco::experimental::extent{}, + cuco::empty_key{empty_key_sentinel}, + cuco::empty_value{empty_value_sentinel}, + custom_key_equal{}, + cuco::experimental::linear_probing<1, custom_hash>{}}; // Inserts 80,000 pairs into the map by using the custom hasher and custom equality callable - map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); + map.insert(pairs_begin, pairs_begin + num_pairs); // Reproduce inserted keys auto insert_keys = @@ -111,14 +92,14 @@ int main(void) // Determine if all the inserted keys can be found by using the same hasher and equality // function as `insert`. If a key `insert_keys[i]` doesn't exist, `contained[i] == false`. - map.contains( - insert_keys, insert_keys + num_pairs, contained.begin(), custom_hash{}, custom_key_equals{}); + map.contains(insert_keys, insert_keys + num_pairs, contained.begin()); // This will fail due to inconsistent hash and key equal. // map.contains(insert_keys, insert_keys + num_pairs, contained.begin()); // All inserted keys are contained - assert( - thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; })); + auto const all_contained = + thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; }); + if (all_contained) { std::cout << "Success! Found all values.\n"; } return 0; } diff --git a/examples/static_map/device_view_example.cu b/examples/static_map/device_ref_example.cu similarity index 77% rename from examples/static_map/device_view_example.cu rename to examples/static_map/device_ref_example.cu index f3414e3ff..8c52610c4 100644 --- a/examples/static_map/device_view_example.cu +++ b/examples/static_map/device_ref_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,12 +29,11 @@ #include /** - * @file device_view_example.cu + * @file device_ref_example.cu * @brief Demonstrates usage of the device side APIs for individual operations like insert/find. * * Individual operations like a single insert or find can be performed in device code via the - * static_map "device_view" types. Note that concurrent insert and find are not supported, and - * therefore there are separate view types for insert and find to help prevent undefined behavior. + * "static_map_ref" types. * * @note This example is for demonstration purposes only. It is not intended to show the most * performant way to do the example algorithm. @@ -44,12 +43,12 @@ /** * @brief Inserts keys that pass the specified predicated into the map. * - * @tparam Map Type of the map returned from static_map::get_device_mutable_view + * @tparam Map Type of the map device reference * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type * @tparam ValueIter Input iterator whose value_type is convertible to Map::mapped_type * @tparam Predicate Unary predicate * - * @param[in] map_view View of the map into which inserts will be performed + * @param[in] map_ref Reference of the map into which inserts will be performed * @param[in] key_begin The beginning of the range of keys to insert * @param[in] value_begin The beginning of the range of values associated with each key to insert * @param[in] num_keys The total number of keys and values @@ -58,7 +57,7 @@ * @param[out] num_inserted The total number of keys successfully inserted */ template -__global__ void filtered_insert(Map map_view, +__global__ void filtered_insert(Map map_ref, KeyIter key_begin, ValueIter value_begin, std::size_t num_keys, @@ -71,9 +70,9 @@ __global__ void filtered_insert(Map map_view, while (tid < num_keys) { // Only insert keys that pass the predicate if (pred(key_begin[tid])) { - // device_mutable_view::insert returns `true` if it is the first time the given key was + // Map::insert returns `true` if it is the first time the given key was // inserted and `false` if the key already existed - if (map_view.insert({key_begin[tid], value_begin[tid]})) { + if (map_ref.insert(cuco::pair{key_begin[tid], value_begin[tid]})) { ++counter; // Count number of successfully inserted keys } } @@ -87,25 +86,26 @@ __global__ void filtered_insert(Map map_view, /** * @brief For keys that have a match in the map, increments their corresponding value by one. * - * @tparam Map Type of the map returned from static_map::get_device_view + * @tparam Map Type of the map device reference * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type * - * @param map_view View of the map into which queries will be performed + * @param map_ref Reference of the map into which queries will be performed * @param key_begin The beginning of the range of keys to query * @param num_keys The total number of keys */ template -__global__ void increment_values(Map map_view, KeyIter key_begin, std::size_t num_keys) +__global__ void increment_values(Map map_ref, KeyIter key_begin, std::size_t num_keys) { auto tid = threadIdx.x + blockIdx.x * blockDim.x; while (tid < num_keys) { // If the key exists in the map, find returns an iterator to the specified key. Otherwise it // returns map.end() - auto found = map_view.find(key_begin[tid]); - if (found != map_view.end()) { + auto found = map_ref.find(key_begin[tid]); + if (found != map_ref.end()) { // If the key exists, atomically increment the associated value - // The value type of the iterator is pair, cuda::atomic> - found->second.fetch_add(1, cuda::memory_order_relaxed); + auto ref = + cuda::atomic_ref{found->second}; + ref.fetch_add(1, cuda::memory_order_relaxed); } tid += gridDim.x * blockDim.x; } @@ -135,11 +135,16 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{ - capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + auto map = cuco::experimental::static_map{ + capacity, + cuco::empty_key{empty_key_sentinel}, + cuco::empty_value{empty_value_sentinel}, + thrust::equal_to{}, + cuco::experimental::linear_probing<1, cuco::default_hash_function>{}}; - // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel - auto device_insert_view = map.get_device_mutable_view(); + // Get a non-owning, mutable reference of the map that allows inserts to pass by value into the + // kernel + auto insert_ref = map.ref(cuco::experimental::op::insert); // Predicate will only insert even keys auto is_even = [] __device__(auto key) { return (key % 2) == 0; }; @@ -149,7 +154,7 @@ int main(void) auto constexpr block_size = 256; auto const grid_size = (num_keys + block_size - 1) / block_size; - filtered_insert<<>>(device_insert_view, + filtered_insert<<>>(insert_ref, insert_keys.begin(), insert_values.begin(), num_keys, @@ -158,10 +163,11 @@ int main(void) std::cout << "Number of keys inserted: " << num_inserted[0] << std::endl; - // Get a non-owning view of the map that allows find operations to pass by value into the kernel - auto device_find_view = map.get_device_view(); + // Get a non-owning reference of the map that allows find operations to pass by value into the + // kernel + auto find_ref = map.ref(cuco::experimental::op::find); - increment_values<<>>(device_find_view, insert_keys.begin(), num_keys); + increment_values<<>>(find_ref, insert_keys.begin(), num_keys); // Retrieve contents of all the non-empty slots in the map thrust::device_vector contained_keys(num_inserted[0]); diff --git a/examples/static_map/host_bulk_example.cu b/examples/static_map/host_bulk_example.cu index 746857511..f08c08235 100644 --- a/examples/static_map/host_bulk_example.cu +++ b/examples/static_map/host_bulk_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{ + auto map = cuco::experimental::static_map{ capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Create a sequence of keys and values {{0,0}, {1,1}, ... {i,i}} diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index e10c55bea..44310927e 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -570,7 +571,7 @@ auto static_mapempty_key_sentinel() == this->erased_key_sentinel() + return cuco::detail::bitwise_compare(this->empty_key_sentinel(), this->erased_key_sentinel()) ? ref_type{cuco::empty_key(this->empty_key_sentinel()), cuco::empty_value(this->empty_value_sentinel()), impl_->key_eq(), diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index dfa307064..600b4c4c9 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -472,7 +473,7 @@ auto static_set Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return this->empty_key_sentinel() == this->erased_key_sentinel() + return cuco::detail::bitwise_compare(this->empty_key_sentinel(), this->erased_key_sentinel()) ? ref_type{cuco::empty_key(this->empty_key_sentinel()), impl_->key_eq(), impl_->probing_scheme(),