Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backport unreachable #2852

Closed
wants to merge 43 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
fdc6c36
backport and fix `unreachable`
davebayer Nov 18, 2024
0246df1
remove `_CCCL_UNREACHABLE`
davebayer Nov 18, 2024
0d83892
update tests
davebayer Nov 18, 2024
f6f2574
define cccl library `unreachable` version
davebayer Nov 18, 2024
94101f2
use `_CCCL_NORETURN` instead of standard `[[noreturn]]`
davebayer Nov 18, 2024
4f0c516
use `_CUDA_VSTD` within libcu++
davebayer Nov 18, 2024
a8e3f60
fix for cuda compilation with NVHPC
davebayer Nov 18, 2024
bed27be
fix compilation for host/device unreachable
davebayer Nov 18, 2024
86e4ce8
separate host and device implementations
davebayer Nov 18, 2024
f7affcf
fall back to the original implementation
davebayer Nov 19, 2024
3208ba5
Merge branch 'main' into backport_and_fix_unreachable
davebayer Nov 19, 2024
913ef98
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Nov 19, 2024
4dfdb61
fix tests
davebayer Nov 20, 2024
7bed5c3
Merge branch 'main' into backport_and_fix_unreachable
davebayer Nov 20, 2024
897086a
Move implementation of `_LIBCUDACXX_TEMPLATE` to CCCL (#2832)
miscco Nov 21, 2024
786d442
Try to work around issue with NVHPC in conjunction of older CTK versi…
miscco Nov 21, 2024
6b5fa22
Refactoring (#2905)
bernhardmgruber Nov 21, 2024
44f0331
add "`interface`" to `_CCCL_PUSH_MACROS` (#2919)
ericniebler Nov 21, 2024
c9a6e6a
Replace inconsistent Doxygen macros with `_CCCL_DOXYGEN_INVOKED` (#2921)
ericniebler Nov 21, 2024
a50019d
implement C++26 `std::span::at` (#2924)
davebayer Nov 22, 2024
d0f5bd2
move msvc compiler macros to new version (#2885)
davebayer Nov 22, 2024
5b57a4c
Reorganize PTX tests to match generator (#2930)
bernhardmgruber Nov 22, 2024
96e8199
Reorganize PTX docs to match generator (#2929)
bernhardmgruber Nov 22, 2024
92a22f2
Improve build instructions for libcu++ (#2881)
miscco Nov 22, 2024
14484a6
Reorganize PTX headers to match generator (#2925)
bernhardmgruber Nov 22, 2024
e897f1f
implement C++26 `std::span`'s constructor from `std::initializer_list…
davebayer Nov 22, 2024
3b23083
Add tuple protocol to `cuda::std::complex` from C++26 (#2882)
davebayer Nov 22, 2024
0a0773f
Move implementation of `_LIBCUDACXX_TEMPLATE` to CCCL (#2832)
miscco Nov 21, 2024
3911b25
Replace inconsistent Doxygen macros with `_CCCL_DOXYGEN_INVOKED` (#2921)
ericniebler Nov 21, 2024
b760c7f
move msvc compiler macros to new version (#2885)
davebayer Nov 22, 2024
8ee0b18
implement C++26 `std::span`'s constructor from `std::initializer_list…
davebayer Nov 22, 2024
ef9ae72
Add missing qualifier for cuda namespace (#2940)
bernhardmgruber Nov 23, 2024
5e4d6e0
Try to fix a clang warning: (#2941)
bernhardmgruber Nov 23, 2024
6dbeb46
minor consistency improvements in concepts macros (#2928)
ericniebler Nov 24, 2024
2a47168
Drop some of the mdspan fold implementation (#2949)
miscco Nov 25, 2024
a4477a4
[STF] Implement CUDASTF_DOT_TIMING for the ctx.cuda_kernel construct …
caugonnet Nov 25, 2024
09db35f
Avoid potential null dereference in `annotated_ptr` (#2951)
miscco Nov 25, 2024
8ca5380
make compiler version comparison utility generic (#2952)
davebayer Nov 25, 2024
3c6fcd2
Add SM100 descriptor to target (#2954)
miscco Nov 25, 2024
aededb9
Regenerate `cuda::ptx` headers/docs and run format (#2937)
bernhardmgruber Nov 25, 2024
4d53204
Regenerate PTX test (#2953)
bernhardmgruber Nov 25, 2024
bdff820
Do not include extended floating point headers if they are not needed…
miscco Nov 25, 2024
651bbcf
[CUDAX] Add copy_bytes and fill_bytes overloads for mdspan (#2932)
pciolkosz Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 2 additions & 2 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
Macros:
- _LIBCUDACXX_TEMPLATE(...)=template<...>
- _LIBCUDACXX_REQUIRES(...)=requires (...)
- _CCCL_TEMPLATE(...)=template<...>
- _CCCL_REQUIRES(...)=requires (...)
WhitespaceSensitiveMacros:
- _CCCL_HAS_INCLUDE
NamespaceIndentation: None
Expand Down
19 changes: 18 additions & 1 deletion c2h/include/c2h/generators.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,24 @@
#include <c2h/vector.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
# include <cub/util_type.cuh> // for <cuda_fp8.h>
# if defined(_CCCL_HAS_NVFP16)
# include <cuda_fp16.h>
# endif // _CCCL_HAS_NVFP16

# if defined(_CCCL_HAS_NVBF16)
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# if _CCCL_CUDACC_AT_LEAST(11, 8)
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
_CCCL_DIAG_PUSH
# include <cuda_fp8.h>
_CCCL_DIAG_POP
# endif // _CCCL_CUDACC_AT_LEAST(11, 8)
# endif // _CCCL_HAS_NVBF16

# if defined(__CUDA_FP8_TYPES_EXIST__)
namespace std
{
Expand Down
79 changes: 39 additions & 40 deletions cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -418,52 +418,51 @@ struct less_t
{
return lhs < rhs;
}
};

template <>
__host__ __device__ inline bool less_t::operator()(const complex& lhs, const complex& rhs) const
{
double magnitude_0 = cuda::std::abs(lhs);
double magnitude_1 = cuda::std::abs(rhs);

if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1))
{
// NaN's are always equal.
return false;
}
else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1))
__host__ __device__ inline bool operator()(const complex& lhs, const complex& rhs) const
{
// If the real or imaginary part of the complex number has a very large value
// (close to the maximum representable value for a double), it is possible that
// the magnitude computation can result in positive infinity:
// ```cpp
// const double large_number = std::numeric_limits<double>::max() / 2;
// std::complex<double> z(large_number, large_number);
// std::abs(z) == inf;
// ```
// Dividing both components by a constant before computing the magnitude prevents overflow.
const complex::value_type scaler = 0.5;

magnitude_0 = cuda::std::abs(lhs * scaler);
magnitude_1 = cuda::std::abs(rhs * scaler);
}
double magnitude_0 = cuda::std::abs(lhs);
double magnitude_1 = cuda::std::abs(rhs);

if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1))
{
// NaN's are always equal.
return false;
}
else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1))
{
// If the real or imaginary part of the complex number has a very large value
// (close to the maximum representable value for a double), it is possible that
// the magnitude computation can result in positive infinity:
// ```cpp
// const double large_number = std::numeric_limits<double>::max() / 2;
// std::complex<double> z(large_number, large_number);
// std::abs(z) == inf;
// ```
// Dividing both components by a constant before computing the magnitude prevents overflow.
const complex::value_type scaler = 0.5;

magnitude_0 = cuda::std::abs(lhs * scaler);
magnitude_1 = cuda::std::abs(rhs * scaler);
}

const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1);
const complex::value_type threshold = cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;
const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1);
const complex::value_type threshold = cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;

if (difference < threshold)
{
// Triangles with the same magnitude are sorted by their phase angle.
const complex::value_type phase_angle_0 = cuda::std::arg(lhs);
const complex::value_type phase_angle_1 = cuda::std::arg(rhs);
if (difference < threshold)
{
// Triangles with the same magnitude are sorted by their phase angle.
const complex::value_type phase_angle_0 = cuda::std::arg(lhs);
const complex::value_type phase_angle_1 = cuda::std::arg(rhs);

return phase_angle_0 < phase_angle_1;
}
else
{
return magnitude_0 < magnitude_1;
return phase_angle_0 < phase_angle_1;
}
else
{
return magnitude_0 < magnitude_1;
}
}
}
};

struct max_t
{
Expand Down
46 changes: 20 additions & 26 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -106,23 +106,19 @@ template <int _BLOCK_THREADS,
int _VEC_SIZE = 4>
struct AgentHistogramPolicy
{
enum
{
/// Threads per thread block
BLOCK_THREADS = _BLOCK_THREADS,

/// Pixels per thread (per tile of input)
PIXELS_PER_THREAD = _PIXELS_PER_THREAD,
/// Threads per thread block
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
/// Pixels per thread (per tile of input)
static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD;

/// Whether to perform localized RLE to compress samples before histogramming
IS_RLE_COMPRESS = _RLE_COMPRESS,
/// Whether to perform localized RLE to compress samples before histogramming
static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS;

/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
MEM_PREFERENCE = _MEM_PREFERENCE,
/// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE;

/// Whether to dequeue tiles from a global work queue
IS_WORK_STEALING = _WORK_STEALING,
};
/// Whether to dequeue tiles from a global work queue
static constexpr bool IS_WORK_STEALING = _WORK_STEALING;

/// Vector size for samples loading (1, 2, 4)
static constexpr int VEC_SIZE = _VEC_SIZE;
Expand Down Expand Up @@ -202,23 +198,21 @@ struct AgentHistogram
using VecT = typename CubVector<SampleT, VecSize>::Type;

/// Constants
enum
{
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS;

PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize,
static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD;
static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize;

TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS;
static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;

IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;

MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM,
static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
(PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;

IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
};
static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;

/// Cache load modifier for reading input elements
static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
Expand Down
6 changes: 4 additions & 2 deletions cub/cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@

#include <thrust/system/cuda/detail/core/util.h>

#include <cuda/std/utility>

#include <nv/target>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -120,7 +122,7 @@ class AgentSubWarpSort
{
return lhs < rhs;
}
_CCCL_UNREACHABLE();
::cuda::std::unreachable();
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
Expand All @@ -135,7 +137,7 @@ class AgentSubWarpSort
{
NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hlt(lhs, rhs);), (return __half2float(lhs) < __half2float(rhs);));
}
_CCCL_UNREACHABLE();
::cuda::std::unreachable();
}
#endif // __CUDA_FP16_TYPES_EXIST__
};
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ public:
//! @name Head flag operations
//! @{

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

/**
* @param[out] head_flags
Expand Down Expand Up @@ -349,7 +349,7 @@ public:
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @rst
//! Sets head flags indicating discontinuities between items partitioned across the thread
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,7 @@ public:

//! @} end member group

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

/// @param[in-out] items
/// Items to exchange, converting between **striped** and **blocked** arrangements.
Expand Down Expand Up @@ -1292,7 +1292,7 @@ public:
ScatterToStriped(items, items, ranks, is_valid);
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED
};

CUB_NAMESPACE_END
4 changes: 2 additions & 2 deletions cub/cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document

//! @brief Internal implementation for load vectorization
//!
Expand Down Expand Up @@ -225,7 +225,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
}
}

#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @rst
//! Load a linear segment of items into a blocked arrangement across the thread block.
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/block/block_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,14 @@ private:
// Whether or not there are values to be trucked along with keys
static constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
/// Shared memory type required by this thread block
union _TempStorage
{
KeyT keys_shared[ITEMS_PER_TILE + 1];
ValueT items_shared[ITEMS_PER_TILE + 1];
}; // union TempStorage
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

/// Shared storage reference
_TempStorage& temp_storage;
Expand Down
16 changes: 8 additions & 8 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ struct BlockRadixRankEmptyCallback
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&bins)[BINS_PER_THREAD]) {}
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
namespace detail
{

Expand Down Expand Up @@ -121,7 +121,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
};

} // namespace detail
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

//! @rst
//! BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
Expand Down Expand Up @@ -263,7 +263,7 @@ private:
/// BlockScan type
using BlockScan = BlockScan<PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z>;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
struct __align__(16) _TempStorage
{
union Aliasable
Expand All @@ -276,7 +276,7 @@ private:
// Storage for scanning local ranks
typename BlockScan::TempStorage block_scan;
};
#endif // !DOXYGEN_SHOULD_SKIP_THIS
#endif // !_CCCL_DOXYGEN_INVOKED

/// Shared storage reference
_TempStorage& temp_storage;
Expand Down Expand Up @@ -597,7 +597,7 @@ private:
/// BlockScan type
using BlockScanT = BlockScan<DigitCounterT, BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z>;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
struct __align__(16) _TempStorage
{
typename BlockScanT::TempStorage block_scan;
Expand All @@ -609,7 +609,7 @@ private:
}
aliasable;
};
#endif // !DOXYGEN_SHOULD_SKIP_THIS
#endif // !_CCCL_DOXYGEN_INVOKED

/// Shared storage reference
_TempStorage& temp_storage;
Expand Down Expand Up @@ -1183,7 +1183,7 @@ struct BlockRadixRankMatchEarlyCounts
}
};

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
namespace detail
{

Expand Down Expand Up @@ -1211,6 +1211,6 @@ using block_radix_rank_t = ::cuda::std::_If<
BlockRadixRankMatchEarlyCounts<BlockDimX, RadixBits, IsDescending, ScanAlgorithm, WARP_MATCH_ATOMIC_OR>>>>>;

} // namespace detail
#endif // DOXYGEN_SHOULD_SKIP_THIS
#endif // _CCCL_DOXYGEN_INVOKED

CUB_NAMESPACE_END
Loading
Loading