Skip to content

Commit

Permalink
no unroll in Laswp
Browse files (browse the repository at this point in the history)
Signed-off-by: Yuuichi Asahi <[email protected]>
  • Loading branch information
Yuuichi Asahi committed Jan 29, 2025
1 parent 3e27b0d commit dda5d2e
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,17 +94,16 @@ struct SerialLaswpVectorBackwardInternal {
template <typename IntType, typename ValueType>
KOKKOS_INLINE_FUNCTION static int invoke(const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0,
/* */ ValueType *KOKKOS_RESTRICT A, const int as0) {
for (int i = (plen - 1); i >= 0; --i) {
const int piv = p[i * ps0];

// On H100 with Cuda 12.0.0, the compiler seems to apply
// an aggressive optimization which crashes this function
// Insert unnecessary operation to disallow optimization
// Disabling loop unrolling fixes the issue
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_HOPPER90)
#if CUDA_VERSION == 12000
if (piv < 0) return 0;
#if CUDA_VERSION >= 12000 && CUDA_VERSION < 12100
#pragma unroll 1
#endif
#endif
for (int i = (plen - 1); i >= 0; --i) {
const int piv = p[i * ps0];
if (piv != i) {
const int idx_i = i * as0, idx_p = piv * as0;
const ValueType tmp = A[idx_i];
Expand Down

0 comments on commit dda5d2e

Please sign in to comment.