From ab1cc40ea4324fb4db63c55c5047831acec86c86 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 28 Aug 2024 18:21:15 -0700 Subject: [PATCH] Remove relaxed constexpr for examples --- .../hash_function/hash_function_bench.cu | 21 ++++++++-------- examples/CMakeLists.txt | 2 +- examples/static_set/device_subsets_example.cu | 2 +- include/cuco/detail/__config | 4 ---- .../detail/hash_functions/murmurhash3.cuh | 24 +++++++++++-------- include/cuco/detail/hash_functions/utils.cuh | 6 +++-- include/cuco/detail/hash_functions/xxhash.cuh | 19 ++++++++------- .../detail/hyperloglog/hyperloglog_ref.cuh | 6 ++--- .../open_addressing_ref_impl.cuh | 9 +++---- include/cuco/detail/probe_sequence_impl.cuh | 12 +++++----- include/cuco/static_multimap.cuh | 9 ++++--- include/cuco/utility/key_generator.cuh | 16 +++++++------ tests/utility/hash_test.cu | 7 +++--- 13 files changed, 73 insertions(+), 64 deletions(-) diff --git a/benchmarks/hash_function/hash_function_bench.cu b/benchmarks/hash_function/hash_function_bench.cu index e02ba8f0a..d48b28cd4 100644 --- a/benchmarks/hash_function/hash_function_bench.cu +++ b/benchmarks/hash_function/hash_function_bench.cu @@ -21,9 +21,9 @@ #include +#include #include -#include #include #include @@ -139,8 +139,8 @@ __global__ void string_hash_bench_kernel( template void string_hash_eval(nvbench::state& state, nvbench::type_list) { - static_assert(std::is_same_v, - "Argument type must be std::byte"); + static_assert(std::is_same_v, + "Argument type must be cuda::std::byte"); bool const materialize_result = false; constexpr auto block_size = 128; @@ -164,7 +164,7 @@ void string_hash_eval(nvbench::state& state, nvbench::type_list) : 1); state.add_element_count(num_keys); - // state.add_global_memory_reads(storage.size() * n_repeats); + // state.add_global_memory_reads(storage.size() * n_repeats); state.exec([&](nvbench::launch& launch) { string_hash_bench_kernel<<>>( @@ -196,12 +196,13 @@ NVBENCH_BENCH_TYPES( .set_max_noise(cuco::benchmark::defaults::MAX_NOISE) .add_int64_axis("NumInputs", {cuco::benchmark::defaults::N * 10}); -NVBENCH_BENCH_TYPES(string_hash_eval, - NVBENCH_TYPE_AXES(nvbench::type_list, - cuco::xxhash_32, - cuco::xxhash_64, - cuco::murmurhash3_x86_128, - cuco::murmurhash3_x64_128>)) +NVBENCH_BENCH_TYPES( + string_hash_eval, + NVBENCH_TYPE_AXES(nvbench::type_list, + cuco::xxhash_32, + cuco::xxhash_64, + cuco::murmurhash3_x86_128, + cuco::murmurhash3_x64_128>)) .set_name("string_hash_function_eval") .set_type_axes_names({"Hash"}) .set_max_noise(cuco::benchmark::defaults::MAX_NOISE) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b5fafd152..30517928f 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -25,7 +25,7 @@ function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC) target_include_directories(${EXAMPLE_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${EXAMPLE_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) + --expt-extended-lambda -Xcompiler -Wno-subobject-linkage) target_link_libraries(${EXAMPLE_NAME} PRIVATE cuco CUDA::cudart) endfunction(ConfigureExample) diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu index e7276bf16..4e479f27b 100644 --- a/examples/static_set/device_subsets_example.cu +++ b/examples/static_set/device_subsets_example.cu @@ -64,7 +64,7 @@ using ref_type = cuco::static_set_ref; ///< Set ref type /// Sample data to insert and query -__device__ constexpr std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +__device__ constexpr cuda::std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; /// Empty slots are represented by reserved "sentinel" values. These values should be selected such /// that they never occur in your input data. key_type constexpr empty_key_sentinel = -1; diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 6d4bf7339..c5360fba7 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -24,10 +24,6 @@ #error "NVCC version 11.5 or later is required" #endif -#if !defined(__CUDACC_RELAXED_CONSTEXPR__) -#error "Support for relaxed constexpr is required (nvcc flag --expt-relaxed-constexpr)" -#endif - #if !defined(__CUDACC_EXTENDED_LAMBDA__) #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)" #endif diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 01aeeeead..f99c04c75 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -20,9 +20,9 @@ #include #include +#include #include -#include #include namespace cuco::detail { @@ -146,7 +146,7 @@ struct MurmurHash3_32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return compute_hash(reinterpret_cast(&key), + return compute_hash(reinterpret_cast(&key), cuco::extent{}); } @@ -160,7 +160,7 @@ struct MurmurHash3_32 { * @return The resulting hash value */ template - constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes, Extent size) const noexcept { auto const nblocks = size / 4; @@ -183,10 +183,14 @@ struct MurmurHash3_32 { // tail std::uint32_t k1 = 0; switch (size & 3) { - case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; - case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; + case 3: + k1 ^= cuda::std::to_integer(bytes[nblocks * 4 + 2]) << 16; + [[fallthrough]]; + case 2: + k1 ^= cuda::std::to_integer(bytes[nblocks * 4 + 1]) << 8; + [[fallthrough]]; case 1: - k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); + k1 ^= cuda::std::to_integer(bytes[nblocks * 4 + 0]); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -247,7 +251,7 @@ struct MurmurHash3_x64_128 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return compute_hash(reinterpret_cast(&key), + return compute_hash(reinterpret_cast(&key), cuco::extent{}); } @@ -261,7 +265,7 @@ struct MurmurHash3_x64_128 { * @return The resulting hash value */ template - constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes, Extent size) const noexcept { constexpr std::uint32_t block_size = 16; @@ -390,7 +394,7 @@ struct MurmurHash3_x86_128 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - return compute_hash(reinterpret_cast(&key), + return compute_hash(reinterpret_cast(&key), cuco::extent{}); } @@ -404,7 +408,7 @@ struct MurmurHash3_x86_128 { * @return The resulting hash value */ template - constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes, Extent size) const noexcept { constexpr std::uint32_t block_size = 16; diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh index d2a7421dd..4c79cec2e 100644 --- a/include/cuco/detail/hash_functions/utils.cuh +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -16,12 +16,14 @@ #pragma once +#include + namespace cuco::detail { template constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept { - auto const bytes = reinterpret_cast(data); + auto const bytes = reinterpret_cast(data); T chunk; memcpy(&chunk, bytes + index * sizeof(T), sizeof(T)); return chunk; @@ -37,4 +39,4 @@ constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t return (x << r) | (x >> (64 - r)); } -}; // namespace cuco::detail \ No newline at end of file +}; // namespace cuco::detail diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 709045060..4ef75c782 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -19,7 +19,8 @@ #include #include -#include +#include + #include namespace cuco::detail { @@ -91,10 +92,10 @@ struct XXHash_32 { { if constexpr (sizeof(Key) <= 16) { Key const key_copy = key; - return compute_hash(reinterpret_cast(&key_copy), + return compute_hash(reinterpret_cast(&key_copy), cuco::extent{}); } else { - return compute_hash(reinterpret_cast(&key), + return compute_hash(reinterpret_cast(&key), cuco::extent{}); } } @@ -109,7 +110,7 @@ struct XXHash_32 { * @return The resulting hash value */ template - constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes, Extent size) const noexcept { std::size_t offset = 0; @@ -159,7 +160,7 @@ struct XXHash_32 { // the following loop is only needed if the size of the key is not a multiple of the block size if (size % 4) { while (offset < size) { - h32 += (std::to_integer(bytes[offset]) & 255) * prime5; + h32 += (cuda::std::to_integer(bytes[offset]) & 255) * prime5; h32 = rotl32(h32, 11) * prime1; ++offset; } @@ -254,10 +255,10 @@ struct XXHash_64 { { if constexpr (sizeof(Key) <= 16) { Key const key_copy = key; - return compute_hash(reinterpret_cast(&key_copy), + return compute_hash(reinterpret_cast(&key_copy), cuco::extent{}); } else { - return compute_hash(reinterpret_cast(&key), + return compute_hash(reinterpret_cast(&key), cuco::extent{}); } } @@ -272,7 +273,7 @@ struct XXHash_64 { * @return The resulting hash value */ template - constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes, Extent size) const noexcept { std::size_t offset = 0; @@ -357,7 +358,7 @@ struct XXHash_64 { // block size if (size % 4) { while (offset < size) { - h64 ^= (std::to_integer(bytes[offset]) & 0xff) * prime5; + h64 ^= (cuda::std::to_integer(bytes[offset]) & 0xff) * prime5; h64 = rotl64(h64, 11) * prime1; ++offset; } diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 5597e1d6f..08db69018 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -475,8 +475,8 @@ class hyperloglog_ref { cuco::sketch_size_kb sketch_size_kb) noexcept { // minimum precision is 4 or 64 bytes - return std::max(static_cast(sizeof(register_type) * 1ull << 4), - cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); + return cuda::std::max(static_cast(sizeof(register_type) * 1ull << 4), + cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); } /** @@ -493,7 +493,7 @@ class hyperloglog_ref { // https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43 // minimum precision is 4 or 64 bytes - auto const precision = std::max( + auto const precision = cuda::std::max( static_cast(4), static_cast( cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0)))); diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index f4c20f829..c78705804 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -213,8 +213,8 @@ class open_addressing_ref_impl { * * @return The key equality predicate */ - [[nodiscard]] __device__ constexpr detail::equal_wrapper const& predicate() - const noexcept + [[nodiscard]] __host__ __device__ constexpr detail::equal_wrapper const& + predicate() const noexcept { return this->predicate_; } @@ -255,7 +255,7 @@ class open_addressing_ref_impl { * * @return The non-owning storage ref of the container */ - [[nodiscard]] __device__ constexpr storage_ref_type const& storage_ref() const noexcept + [[nodiscard]] __host__ __device__ constexpr storage_ref_type const& storage_ref() const noexcept { return storage_ref_; } @@ -1142,7 +1142,8 @@ class open_addressing_ref_impl { * @return The key */ template - [[nodiscard]] __device__ constexpr auto const& extract_key(Value const& value) const noexcept + [[nodiscard]] __host__ __device__ constexpr auto const& extract_key( + Value const& value) const noexcept { if constexpr (this->has_payload) { return thrust::raw_reference_cast(value).first; diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 46d18e419..51b1bfd68 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ class probe_sequence_base { * * @return The number of elements loaded with each vector load */ - static constexpr uint32_t vector_width() noexcept { return 2u; } + static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; } }; /** @@ -210,7 +210,7 @@ class linear_probing_impl __device__ __forceinline__ iterator initial_slot(cooperative_groups::thread_block_tile const& g, ProbeKey const& k) noexcept { - return const_cast(std::as_const(*this).initial_slot(g, k)); + return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); } /** @@ -257,7 +257,7 @@ class linear_probing_impl */ __device__ __forceinline__ iterator next_slot(iterator s) noexcept { - return const_cast(std::as_const(*this).next_slot(s)); + return const_cast(cuda::std::as_const(*this).next_slot(s)); } /** @@ -364,7 +364,7 @@ class double_hashing_impl __device__ __forceinline__ iterator initial_slot(cooperative_groups::thread_block_tile const& g, ProbeKey const& k) noexcept { - return const_cast(std::as_const(*this).initial_slot(g, k)); + return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); } /** @@ -409,7 +409,7 @@ class double_hashing_impl */ __device__ __forceinline__ iterator next_slot(iterator s) noexcept { - return const_cast(std::as_const(*this).next_slot(s)); + return const_cast(cuda::std::as_const(*this).next_slot(s)); } /** diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index ebf17edba..27c682dd6 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1052,7 +1052,7 @@ class static_multimap { * * @return Boolean indicating if vector-load is used. */ - static constexpr bool uses_vector_load() noexcept + static __host__ __device__ constexpr bool uses_vector_load() noexcept { return cuco::detail::is_packable(); } @@ -1060,12 +1060,15 @@ class static_multimap { /** * @brief Returns the number of pairs loaded with each vector-load */ - static constexpr uint32_t vector_width() noexcept { return ProbeSequence::vector_width(); } + static __host__ __device__ constexpr uint32_t vector_width() noexcept + { + return ProbeSequence::vector_width(); + } /** * @brief Returns the warp size. */ - static constexpr uint32_t warp_size() noexcept { return 32u; } + static __host__ __device__ constexpr uint32_t warp_size() noexcept { return 32u; } /** * @brief Custom deleter for unique pointer of slots. diff --git a/include/cuco/utility/key_generator.cuh b/include/cuco/utility/key_generator.cuh index ebd3d9feb..aa0917f91 100644 --- a/include/cuco/utility/key_generator.cuh +++ b/include/cuco/utility/key_generator.cuh @@ -437,7 +437,8 @@ class key_generator { * vector holding the actual data */ template -std::pair>, thrust::device_vector> +std::pair>, + thrust::device_vector> generate_random_byte_sequences(std::size_t n_sequences, std::size_t min_sequence_length, std::size_t max_sequence_length, @@ -475,20 +476,21 @@ generate_random_byte_sequences(std::size_t n_sequences, // the total number of bytes required to store the sequences auto const n_bytes = thrust::reduce(exec_pol, lengths.begin(), lengths.end()); // the byte vector holding the actual sequence data - thrust::device_vector bytes(n_bytes); + thrust::device_vector bytes(n_bytes); auto offsets_and_lengths = thrust::make_zip_iterator(thrust::make_tuple(offsets.begin(), lengths.begin())); - thrust::device_vector> sequences(n_sequences); + thrust::device_vector> sequences(n_sequences); // create the span object for each sequence thrust::transform( exec_pol, offsets_and_lengths, offsets_and_lengths + n_sequences, sequences.begin(), - cuda::proclaim_return_type>( + cuda::proclaim_return_type>( [bytes_ptr = thrust::raw_pointer_cast(bytes.data())] __device__(auto const& seq) { - return cuda::std::span{bytes_ptr + thrust::get<0>(seq), thrust::get<1>(seq)}; + return cuda::std::span{bytes_ptr + thrust::get<0>(seq), + thrust::get<1>(seq)}; })); // fill the byte buffer with random data @@ -496,11 +498,11 @@ generate_random_byte_sequences(std::size_t n_sequences, thrust::counting_iterator(0), thrust::counting_iterator(bytes.size()), bytes.begin(), - cuda::proclaim_return_type([seed] __device__(std::size_t idx) { + cuda::proclaim_return_type([seed] __device__(std::size_t idx) { RNG rng; thrust::uniform_int_distribution byte_distribution{0, 255}; rng.seed(seed + idx); - return static_cast(byte_distribution(rng)); + return static_cast(byte_distribution(rng)); })); return {std::move(sequences), std::move(bytes)}; diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 90f9c8a5b..bad467a75 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,13 +19,12 @@ #include #include +#include #include #include #include -#include - template struct large_key { constexpr __host__ __device__ large_key(int32_t value) noexcept @@ -238,7 +237,7 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", SECTION("Identical keys with static and dynamic key size should have the same hash value.") { CHECK(hash(key) == - hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); } }