NVIDIA · davebayer · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024 · Nov 18, 2024
@@ -128,8 +128,8 @@ IndentWidth: 2
 KeepEmptyLinesAtTheStartOfBlocks: false
 MaxEmptyLinesToKeep: 1
 Macros:
-- _LIBCUDACXX_TEMPLATE(...)=template<...>
-- _LIBCUDACXX_REQUIRES(...)=requires (...)
+- _CCCL_TEMPLATE(...)=template<...>
+- _CCCL_REQUIRES(...)=requires (...)
 WhitespaceSensitiveMacros:
 - _CCCL_HAS_INCLUDE
 NamespaceIndentation: None

@@ -35,7 +35,24 @@
 #include <c2h/vector.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#  include <cub/util_type.cuh> // for <cuda_fp8.h>
+#  if defined(_CCCL_HAS_NVFP16)
+#    include <cuda_fp16.h>
+#  endif // _CCCL_HAS_NVFP16
+
+#  if defined(_CCCL_HAS_NVBF16)
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
+#    include <cuda_bf16.h>
+_CCCL_DIAG_POP
+
+#    if _CCCL_CUDACC_AT_LEAST(11, 8)
+// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
+_CCCL_DIAG_PUSH
+#      include <cuda_fp8.h>
+_CCCL_DIAG_POP
+#    endif // _CCCL_CUDACC_AT_LEAST(11, 8)
+#  endif // _CCCL_HAS_NVBF16
+
 #  if defined(__CUDA_FP8_TYPES_EXIST__)
 namespace std
 {

@@ -418,52 +418,51 @@ struct less_t
   {
     return lhs < rhs;
   }
-};
-
-template <>
-__host__ __device__ inline bool less_t::operator()(const complex& lhs, const complex& rhs) const
-{
-  double magnitude_0 = cuda::std::abs(lhs);
-  double magnitude_1 = cuda::std::abs(rhs);
 
-  if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1))
-  {
-    // NaN's are always equal.
-    return false;
-  }
-  else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1))
+  __host__ __device__ inline bool operator()(const complex& lhs, const complex& rhs) const
   {
-    // If the real or imaginary part of the complex number has a very large value
-    // (close to the maximum representable value for a double), it is possible that
-    // the magnitude computation can result in positive infinity:
-    // ```cpp
-    // const double large_number = std::numeric_limits<double>::max() / 2;
-    // std::complex<double> z(large_number, large_number);
-    // std::abs(z) == inf;
-    // ```
-    // Dividing both components by a constant before computing the magnitude prevents overflow.
-    const complex::value_type scaler = 0.5;
-
-    magnitude_0 = cuda::std::abs(lhs * scaler);
-    magnitude_1 = cuda::std::abs(rhs * scaler);
-  }
+    double magnitude_0 = cuda::std::abs(lhs);
+    double magnitude_1 = cuda::std::abs(rhs);
+
+    if (cuda::std::isnan(magnitude_0) || cuda::std::isnan(magnitude_1))
+    {
+      // If either magnitude is NaN, neither operand orders before the other (treated as equivalent).
+      return false;
+    }
+    else if (cuda::std::isinf(magnitude_0) || cuda::std::isinf(magnitude_1))
+    {
+      // If the real or imaginary part of the complex number has a very large value
+      // (close to the maximum representable value for a double), it is possible that
+      // the magnitude computation can result in positive infinity:
+      // ```cpp
+      // const double large_number = std::numeric_limits<double>::max() / 2;
+      // std::complex<double> z(large_number, large_number);
+      // std::abs(z) == inf;
+      // ```
+      // Dividing both components by a constant before computing the magnitude prevents overflow.
+      const complex::value_type scaler = 0.5;
+
+      magnitude_0 = cuda::std::abs(lhs * scaler);
+      magnitude_1 = cuda::std::abs(rhs * scaler);
+    }
 
-  const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1);
-  const complex::value_type threshold  = cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;
+    const complex::value_type difference = cuda::std::abs(magnitude_0 - magnitude_1);
+    const complex::value_type threshold  = cuda::std::numeric_limits<complex::value_type>::epsilon() * 2;
 
-  if (difference < threshold)
-  {
-    // Triangles with the same magnitude are sorted by their phase angle.
-    const complex::value_type phase_angle_0 = cuda::std::arg(lhs);
-    const complex::value_type phase_angle_1 = cuda::std::arg(rhs);
+    if (difference < threshold)
+    {
+      // Complex numbers with (nearly) the same magnitude are ordered by their phase angle.
+      const complex::value_type phase_angle_0 = cuda::std::arg(lhs);
+      const complex::value_type phase_angle_1 = cuda::std::arg(rhs);
 
-    return phase_angle_0 < phase_angle_1;
-  }
-  else
-  {
-    return magnitude_0 < magnitude_1;
+      return phase_angle_0 < phase_angle_1;
+    }
+    else
+    {
+      return magnitude_0 < magnitude_1;
+    }
   }
-}
+};
 
 struct max_t
 {

@@ -106,23 +106,19 @@ template <int _BLOCK_THREADS,
           int _VEC_SIZE = 4>
 struct AgentHistogramPolicy
 {
-  enum
-  {
-    /// Threads per thread block
-    BLOCK_THREADS = _BLOCK_THREADS,
-
-    /// Pixels per thread (per tile of input)
-    PIXELS_PER_THREAD = _PIXELS_PER_THREAD,
+  /// Threads per thread block
+  static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
+  /// Pixels per thread (per tile of input)
+  static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD;
 
-    /// Whether to perform localized RLE to compress samples before histogramming
-    IS_RLE_COMPRESS = _RLE_COMPRESS,
+  /// Whether to perform localized RLE to compress samples before histogramming
+  static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS;
 
-    /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-    MEM_PREFERENCE = _MEM_PREFERENCE,
+  /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+  static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE;
 
-    /// Whether to dequeue tiles from a global work queue
-    IS_WORK_STEALING = _WORK_STEALING,
-  };
+  /// Whether to dequeue tiles from a global work queue
+  static constexpr bool IS_WORK_STEALING = _WORK_STEALING;
 
   /// Vector size for samples loading (1, 2, 4)
   static constexpr int VEC_SIZE = _VEC_SIZE;
@@ -202,23 +198,21 @@ struct AgentHistogram
   using VecT                   = typename CubVector<SampleT, VecSize>::Type;
 
   /// Constants
-  enum
-  {
-    BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,
+  static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS;
 
-    PIXELS_PER_THREAD  = AgentHistogramPolicyT::PIXELS_PER_THREAD,
-    SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
-    VECS_PER_THREAD    = SAMPLES_PER_THREAD / VecSize,
+  static constexpr int PIXELS_PER_THREAD  = AgentHistogramPolicyT::PIXELS_PER_THREAD;
+  static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
+  static constexpr int VECS_PER_THREAD    = SAMPLES_PER_THREAD / VecSize;
 
-    TILE_PIXELS  = PIXELS_PER_THREAD * BLOCK_THREADS,
-    TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,
+  static constexpr int TILE_PIXELS  = PIXELS_PER_THREAD * BLOCK_THREADS;
+  static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;
 
-    IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+  static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;
 
-    MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM,
+  static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
+    (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;
 
-    IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING,
-  };
+  static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;
 
   /// Cache load modifier for reading input elements
   static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;

@@ -45,6 +45,8 @@
 
 #include <thrust/system/cuda/detail/core/util.h>
 
+#include <cuda/std/utility>
+
 #include <nv/target>
 
 CUB_NAMESPACE_BEGIN
@@ -120,7 +122,7 @@ class AgentSubWarpSort
       {
         return lhs < rhs;
       }
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
     }
 
 #if defined(__CUDA_FP16_TYPES_EXIST__)
@@ -135,7 +137,7 @@ class AgentSubWarpSort
       {
         NV_IF_TARGET(NV_PROVIDES_SM_53, (return __hlt(lhs, rhs);), (return __half2float(lhs) < __half2float(rhs);));
       }
-      _CCCL_UNREACHABLE();
+      ::cuda::std::unreachable();
     }
 #endif // __CUDA_FP16_TYPES_EXIST__
   };

@@ -270,7 +270,7 @@ public:
   //! @name Head flag operations
   //! @{
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 
   /**
    * @param[out] head_flags
@@ -349,7 +349,7 @@ public:
     Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
   }
 
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 
   //! @rst
   //! Sets head flags indicating discontinuities between items partitioned across the thread

@@ -1217,7 +1217,7 @@ public:
 
   //! @}  end member group
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 
   /// @param[in-out] items
   ///   Items to exchange, converting between **striped** and **blocked** arrangements.
@@ -1292,7 +1292,7 @@ public:
     ScatterToStriped(items, items, ranks, is_valid);
   }
 
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 };
 
 CUB_NAMESPACE_END
@@ -179,7 +179,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
   LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
 }
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 
 //! @brief Internal implementation for load vectorization
 //!
@@ -225,7 +225,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
   }
 }
 
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 
 //! @rst
 //! Load a linear segment of items into a blocked arrangement across the thread block.

@@ -175,14 +175,14 @@ private:
   // Whether or not there are values to be trucked along with keys
   static constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
   /// Shared memory type required by this thread block
   union _TempStorage
   {
     KeyT keys_shared[ITEMS_PER_TILE + 1];
     ValueT items_shared[ITEMS_PER_TILE + 1];
   }; // union TempStorage
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 
   /// Shared storage reference
   _TempStorage& temp_storage;

@@ -93,7 +93,7 @@ struct BlockRadixRankEmptyCallback
   _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(int (&bins)[BINS_PER_THREAD]) {}
 };
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 namespace detail
 {
 
@@ -121,7 +121,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
 };
 
 } // namespace detail
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 
 //! @rst
 //! BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
@@ -263,7 +263,7 @@ private:
   /// BlockScan type
   using BlockScan = BlockScan<PackedCounter, BLOCK_DIM_X, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z>;
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
   struct __align__(16) _TempStorage
   {
     union Aliasable
@@ -276,7 +276,7 @@ private:
     // Storage for scanning local ranks
     typename BlockScan::TempStorage block_scan;
   };
-#endif // !DOXYGEN_SHOULD_SKIP_THIS
+#endif // !_CCCL_DOXYGEN_INVOKED
 
   /// Shared storage reference
   _TempStorage& temp_storage;
@@ -597,7 +597,7 @@ private:
   /// BlockScan type
   using BlockScanT = BlockScan<DigitCounterT, BLOCK_THREADS, INNER_SCAN_ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z>;
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
   struct __align__(16) _TempStorage
   {
     typename BlockScanT::TempStorage block_scan;
@@ -609,7 +609,7 @@ private:
     }
     aliasable;
   };
-#endif // !DOXYGEN_SHOULD_SKIP_THIS
+#endif // !_CCCL_DOXYGEN_INVOKED
 
   /// Shared storage reference
   _TempStorage& temp_storage;
@@ -1183,7 +1183,7 @@ struct BlockRadixRankMatchEarlyCounts
   }
 };
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
 namespace detail
 {
 
@@ -1211,6 +1211,6 @@ using block_radix_rank_t = ::cuda::std::_If<
         BlockRadixRankMatchEarlyCounts<BlockDimX, RadixBits, IsDescending, ScanAlgorithm, WARP_MATCH_ATOMIC_OR>>>>>;
 
 } // namespace detail
-#endif // DOXYGEN_SHOULD_SKIP_THIS
+#endif // _CCCL_DOXYGEN_INVOKED
 
 CUB_NAMESPACE_END