Skip to content

Commit

Permalink
Remove relaxed constexpr for examples
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel committed Aug 29, 2024
1 parent a20460c commit ab1cc40
Show file tree
Hide file tree
Showing 13 changed files with 73 additions and 64 deletions.
21 changes: 11 additions & 10 deletions benchmarks/hash_function/hash_function_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@

#include <nvbench/nvbench.cuh>

#include <cuda/std/cstddef>
#include <thrust/device_vector.h>

#include <cstddef>
#include <cstdint>
#include <type_traits>

Expand Down Expand Up @@ -139,8 +139,8 @@ __global__ void string_hash_bench_kernel(
template <typename Hash>
void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
{
static_assert(std::is_same_v<typename Hash::argument_type, std::byte>,
"Argument type must be std::byte");
static_assert(std::is_same_v<typename Hash::argument_type, cuda::std::byte>,
"Argument type must be cuda::std::byte");

bool const materialize_result = false;
constexpr auto block_size = 128;
Expand All @@ -164,7 +164,7 @@ void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
: 1);

state.add_element_count(num_keys);
// state.add_global_memory_reads<std::byte>(storage.size() * n_repeats);
// state.add_global_memory_reads<cuda::std::byte>(storage.size() * n_repeats);

state.exec([&](nvbench::launch& launch) {
string_hash_bench_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
Expand Down Expand Up @@ -196,12 +196,13 @@ NVBENCH_BENCH_TYPES(
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {cuco::benchmark::defaults::N * 10});

NVBENCH_BENCH_TYPES(string_hash_eval,
NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<std::byte>,
cuco::xxhash_32<std::byte>,
cuco::xxhash_64<std::byte>,
cuco::murmurhash3_x86_128<std::byte>,
cuco::murmurhash3_x64_128<std::byte>>))
NVBENCH_BENCH_TYPES(
string_hash_eval,
NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<cuda::std::byte>,
cuco::xxhash_32<cuda::std::byte>,
cuco::xxhash_64<cuda::std::byte>,
cuco::murmurhash3_x86_128<cuda::std::byte>,
cuco::murmurhash3_x64_128<cuda::std::byte>>))
.set_name("string_hash_function_eval")
.set_type_axes_names({"Hash"})
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
Expand Down
2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC)
target_include_directories(${EXAMPLE_NAME} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}")
target_compile_options(${EXAMPLE_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
--expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage)
--expt-extended-lambda -Xcompiler -Wno-subobject-linkage)
target_link_libraries(${EXAMPLE_NAME} PRIVATE cuco CUDA::cudart)
endfunction(ConfigureExample)

Expand Down
2 changes: 1 addition & 1 deletion examples/static_set/device_subsets_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ using ref_type = cuco::static_set_ref<key_type,
storage_ref_type>; ///< Set ref type

/// Sample data to insert and query
__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
__device__ constexpr cuda::std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
/// that they never occur in your input data.
key_type constexpr empty_key_sentinel = -1;
Expand Down
4 changes: 0 additions & 4 deletions include/cuco/detail/__config
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@
#error "NVCC version 11.5 or later is required"
#endif

#if !defined(__CUDACC_RELAXED_CONSTEXPR__)
#error "Support for relaxed constexpr is required (nvcc flag --expt-relaxed-constexpr)"
#endif

#if !defined(__CUDACC_EXTENDED_LAMBDA__)
#error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
#endif
Expand Down
24 changes: 14 additions & 10 deletions include/cuco/detail/hash_functions/murmurhash3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include <cuco/extent.cuh>

#include <cuda/std/array>
#include <cuda/std/cstddef>
#include <cuda/std/type_traits>

#include <cstddef>
#include <cstdint>

namespace cuco::detail {
Expand Down Expand Up @@ -146,7 +146,7 @@ struct MurmurHash3_32 {
*/
constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
{
return compute_hash(reinterpret_cast<std::byte const*>(&key),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
cuco::extent<std::size_t, sizeof(Key)>{});
}

Expand All @@ -160,7 +160,7 @@ struct MurmurHash3_32 {
* @return The resulting hash value
*/
template <typename Extent>
constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
Extent size) const noexcept
{
auto const nblocks = size / 4;
Expand All @@ -183,10 +183,14 @@ struct MurmurHash3_32 {
// tail
std::uint32_t k1 = 0;
switch (size & 3) {
case 3: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]];
case 2: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]];
case 3:
k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16;
[[fallthrough]];
case 2:
k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8;
[[fallthrough]];
case 1:
k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
Expand Down Expand Up @@ -247,7 +251,7 @@ struct MurmurHash3_x64_128 {
*/
constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
{
return compute_hash(reinterpret_cast<std::byte const*>(&key),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
cuco::extent<std::size_t, sizeof(Key)>{});
}

Expand All @@ -261,7 +265,7 @@ struct MurmurHash3_x64_128 {
* @return The resulting hash value
*/
template <typename Extent>
constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
Extent size) const noexcept
{
constexpr std::uint32_t block_size = 16;
Expand Down Expand Up @@ -390,7 +394,7 @@ struct MurmurHash3_x86_128 {
*/
constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
{
return compute_hash(reinterpret_cast<std::byte const*>(&key),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
cuco::extent<std::size_t, sizeof(Key)>{});
}

Expand All @@ -404,7 +408,7 @@ struct MurmurHash3_x86_128 {
* @return The resulting hash value
*/
template <typename Extent>
constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
Extent size) const noexcept
{
constexpr std::uint32_t block_size = 16;
Expand Down
6 changes: 4 additions & 2 deletions include/cuco/detail/hash_functions/utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@

#pragma once

#include <cuda/std/cstddef>

namespace cuco::detail {

template <typename T, typename U, typename Extent>
constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
{
auto const bytes = reinterpret_cast<std::byte const*>(data);
auto const bytes = reinterpret_cast<cuda::std::byte const*>(data);
T chunk;
memcpy(&chunk, bytes + index * sizeof(T), sizeof(T));
return chunk;
Expand All @@ -37,4 +39,4 @@ constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t
return (x << r) | (x >> (64 - r));
}

}; // namespace cuco::detail
}; // namespace cuco::detail
19 changes: 10 additions & 9 deletions include/cuco/detail/hash_functions/xxhash.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#include <cuco/detail/hash_functions/utils.cuh>
#include <cuco/extent.cuh>

#include <cstddef>
#include <cuda/std/cstddef>

#include <cstdint>

namespace cuco::detail {
Expand Down Expand Up @@ -91,10 +92,10 @@ struct XXHash_32 {
{
if constexpr (sizeof(Key) <= 16) {
Key const key_copy = key;
return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
cuco::extent<std::size_t, sizeof(Key)>{});
} else {
return compute_hash(reinterpret_cast<std::byte const*>(&key),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
cuco::extent<std::size_t, sizeof(Key)>{});
}
}
Expand All @@ -109,7 +110,7 @@ struct XXHash_32 {
* @return The resulting hash value
*/
template <typename Extent>
constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
Extent size) const noexcept
{
std::size_t offset = 0;
Expand Down Expand Up @@ -159,7 +160,7 @@ struct XXHash_32 {
// the following loop is only needed if the size of the key is not a multiple of the block size
if (size % 4) {
while (offset < size) {
h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
h32 += (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
h32 = rotl32(h32, 11) * prime1;
++offset;
}
Expand Down Expand Up @@ -254,10 +255,10 @@ struct XXHash_64 {
{
if constexpr (sizeof(Key) <= 16) {
Key const key_copy = key;
return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
cuco::extent<std::size_t, sizeof(Key)>{});
} else {
return compute_hash(reinterpret_cast<std::byte const*>(&key),
return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
cuco::extent<std::size_t, sizeof(Key)>{});
}
}
Expand All @@ -272,7 +273,7 @@ struct XXHash_64 {
* @return The resulting hash value
*/
template <typename Extent>
constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
Extent size) const noexcept
{
std::size_t offset = 0;
Expand Down Expand Up @@ -357,7 +358,7 @@ struct XXHash_64 {
// block size
if (size % 4) {
while (offset < size) {
h64 ^= (std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
h64 ^= (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
h64 = rotl64(h64, 11) * prime1;
++offset;
}
Expand Down
6 changes: 3 additions & 3 deletions include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -475,8 +475,8 @@ class hyperloglog_ref {
cuco::sketch_size_kb sketch_size_kb) noexcept
{
// minimum precision is 4 or 64 bytes
return std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
return cuda::std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
}

/**
Expand All @@ -493,7 +493,7 @@ class hyperloglog_ref {
// https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43

// minimum precision is 4 or 64 bytes
auto const precision = std::max(
auto const precision = cuda::std::max(
static_cast<int32_t>(4),
static_cast<int32_t>(
cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0))));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ class open_addressing_ref_impl {
*
* @return The key equality predicate
*/
[[nodiscard]] __device__ constexpr detail::equal_wrapper<key_type, key_equal> const& predicate()
const noexcept
[[nodiscard]] __host__ __device__ constexpr detail::equal_wrapper<key_type, key_equal> const&
predicate() const noexcept
{
return this->predicate_;
}
Expand Down Expand Up @@ -255,7 +255,7 @@ class open_addressing_ref_impl {
*
* @return The non-owning storage ref of the container
*/
[[nodiscard]] __device__ constexpr storage_ref_type const& storage_ref() const noexcept
[[nodiscard]] __host__ __device__ constexpr storage_ref_type const& storage_ref() const noexcept
{
return storage_ref_;
}
Expand Down Expand Up @@ -1142,7 +1142,8 @@ class open_addressing_ref_impl {
* @return The key
*/
template <typename Value>
[[nodiscard]] __device__ constexpr auto const& extract_key(Value const& value) const noexcept
[[nodiscard]] __host__ __device__ constexpr auto const& extract_key(
Value const& value) const noexcept
{
if constexpr (this->has_payload) {
return thrust::raw_reference_cast(value).first;
Expand Down
12 changes: 6 additions & 6 deletions include/cuco/detail/probe_sequence_impl.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ class probe_sequence_base {
*
* @return The number of elements loaded with each vector load
*/
static constexpr uint32_t vector_width() noexcept { return 2u; }
static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; }
};

/**
Expand Down Expand Up @@ -210,7 +210,7 @@ class linear_probing_impl
__device__ __forceinline__ iterator
initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
{
return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
}

/**
Expand Down Expand Up @@ -257,7 +257,7 @@ class linear_probing_impl
*/
__device__ __forceinline__ iterator next_slot(iterator s) noexcept
{
return const_cast<iterator>(std::as_const(*this).next_slot(s));
return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
}

/**
Expand Down Expand Up @@ -364,7 +364,7 @@ class double_hashing_impl
__device__ __forceinline__ iterator
initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
{
return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
}

/**
Expand Down Expand Up @@ -409,7 +409,7 @@ class double_hashing_impl
*/
__device__ __forceinline__ iterator next_slot(iterator s) noexcept
{
return const_cast<iterator>(std::as_const(*this).next_slot(s));
return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
}

/**
Expand Down
9 changes: 6 additions & 3 deletions include/cuco/static_multimap.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1052,20 +1052,23 @@ class static_multimap {
*
* @return Boolean indicating if vector-load is used.
*/
static constexpr bool uses_vector_load() noexcept
static __host__ __device__ constexpr bool uses_vector_load() noexcept
{
return cuco::detail::is_packable<value_type>();
}

/**
* @brief Returns the number of pairs loaded with each vector-load
*/
static constexpr uint32_t vector_width() noexcept { return ProbeSequence::vector_width(); }
static __host__ __device__ constexpr uint32_t vector_width() noexcept
{
return ProbeSequence::vector_width();
}

/**
* @brief Returns the warp size.
*/
static constexpr uint32_t warp_size() noexcept { return 32u; }
static __host__ __device__ constexpr uint32_t warp_size() noexcept { return 32u; }

/**
* @brief Custom deleter for unique pointer of slots.
Expand Down
Loading

0 comments on commit ab1cc40

Please sign in to comment.