From c0651cf62932af78f004b217fc1f4412a59fc517 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 19 Dec 2023 12:06:19 +0200 Subject: [PATCH] Initial ROCm 6 enablement (#572) - Use non-async alloc/free if `<= 16` bytes. - Fix `Mem.is_pinned` for arrays (ROCm 6 changed `hipMemoryType` struct definition). - Fix page fault in rocBLAS (use new out-of-place `trmm` function). - Cleanup wrapper generators. - Disable device RNG for Navi 3 (doesn't support `memtime` or `memrealtime` LLVM intrinsic). - Disable rocSPARSE 7 rocFFT tests on ROCm 6.0+. --- .buildkite/pipeline.yml | 40 +++++++++++++++++++-------------------- README.md | 1 + gen/Project.toml | 3 --- gen/generator.jl | 22 --------------------- gen/generator.toml | 7 ------- gen/miopen/generator.jl | 3 +-- gen/prologue.jl | 4 ---- gen/rocblas/generator.jl | 3 +-- src/blas/highlevel.jl | 16 ++++++++-------- src/blas/librocblas.jl | 16 ++++++++-------- src/blas/wrappers.jl | 11 ++++++----- src/device/random.jl | 1 + src/hip/libhip_common.jl | 10 ++++++++++ src/runtime/memory/hip.jl | 29 +++++++++++++++++++--------- src/sparse/rocSPARSE.jl | 1 - src/sparse/types.jl | 1 + test/core_tests.jl | 7 ++++++- test/device_tests.jl | 7 +++++-- test/hip_extra_tests.jl | 18 ++++++++++++++---- test/rocarray/blas.jl | 7 +++++-- 20 files changed, 107 insertions(+), 100 deletions(-) delete mode 100644 gen/generator.jl delete mode 100644 gen/generator.toml delete mode 100644 gen/prologue.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2d324cbb2..e034f61b7 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -2,7 +2,7 @@ steps: - label: "Documentation" plugins: - JuliaCI/julia#v1: - version: 1.9 + version: "1.10" command: | julia --project -e ' println("--- :julia: Instantiating project") @@ -22,25 +22,25 @@ steps: if: build.message !~ /\[skip docs\]/ timeout_in_minutes: 10 - - label: "Julia 1.9 - No Artifacts" - plugins: - - JuliaCI/julia#v1: - version: 1.9 - - JuliaCI/julia-test#v1: - - JuliaCI/julia-coverage#v1: - codecov: true - agents: - queue: "juliagpu" - rocm: "*" - rocmgpu: "gfx1100" - if: build.message !~ /\[skip tests\]/ - command: "julia --project -e 'using Pkg; Pkg.update()'" - timeout_in_minutes: 180 - env: - JULIA_NUM_THREADS: 4 - JULIA_AMDGPU_CORE_MUST_LOAD: "1" - JULIA_AMDGPU_HIP_MUST_LOAD: "1" - JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" + # - label: "Julia 1.9 - No Artifacts" + # plugins: + # - JuliaCI/julia#v1: + # version: 1.9 + # - JuliaCI/julia-test#v1: + # - JuliaCI/julia-coverage#v1: + # codecov: true + # agents: + # queue: "juliagpu" + # rocm: "*" + # rocmgpu: "gfx1100" + # if: build.message !~ /\[skip tests\]/ + # command: "julia --project -e 'using Pkg; Pkg.update()'" + # timeout_in_minutes: 180 + # env: + # JULIA_NUM_THREADS: 4 + # JULIA_AMDGPU_CORE_MUST_LOAD: "1" + # JULIA_AMDGPU_HIP_MUST_LOAD: "1" + # JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" - label: "Julia 1.10 - No Artifacts" plugins: diff --git a/README.md b/README.md index 3d91f71e5..bc8b7030c 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ The AMDGPU.jl package requires **Julia 1.9+**, **ROCm 5.3-5.7**, which means only GPUs that are supported by these versions will work.\ **64-bit Linux and Windows** are supported, see [requirements](https://amdgpu.juliagpu.org/dev/#Installation). +**Navi 3** GPUs require Julia 1.10 or higher. ## Quick start diff --git a/gen/Project.toml b/gen/Project.toml index e1b0a3a26..f36be7fcd 100644 --- a/gen/Project.toml +++ b/gen/Project.toml @@ -1,5 +1,2 @@ [deps] Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31" -MIOpen_jll = "2409bb75-d5ef-542a-ac68-1cfd4c37dc24" -hsa_rocr_jll = "dd59ff1a-a01a-568d-8b29-0669330f116a" -rocBLAS_jll = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab" diff --git a/gen/generator.jl b/gen/generator.jl deleted file mode 100644 index 1391fed53..000000000 --- a/gen/generator.jl +++ /dev/null @@ -1,22 +0,0 @@ -using Clang.Generators -using hsa_rocr_jll - -cd(@__DIR__) - -include_dir = normpath(hsa_rocr_jll.artifact_dir, "include") -hsa_dir = joinpath(include_dir, "hsa") - -options = load_options(joinpath(@__DIR__, "generator.toml")) - -args = get_default_args() -push!(args, "-I$include_dir") - -headers = [joinpath(hsa_dir, header) for header in ["hsa.h", "hsa_ext_amd.h"]] -# there is also an experimental `detect_headers` function for auto-detecting top-level headers in the directory -# headers = detect_headers(clang_dir, args) - -# create context -ctx = create_context(headers, args, options) - -# run generator -build!(ctx) diff --git a/gen/generator.toml b/gen/generator.toml deleted file mode 100644 index b508300ab..000000000 --- a/gen/generator.toml +++ /dev/null @@ -1,7 +0,0 @@ -[general] -library_name = "libhsaruntime" -output_file_path = "../src/hsa/LibHSARuntime.jl" -module_name = "LibHSARuntime" -jll_pkg_name = "hsa_rocr_jll" -export_symbol_prefixes = ["HSA", "hsa_"] -prologue_file_path = "prologue.jl" diff --git a/gen/miopen/generator.jl b/gen/miopen/generator.jl index 2de483eb5..b54359b1e 100644 --- a/gen/miopen/generator.jl +++ b/gen/miopen/generator.jl @@ -1,7 +1,6 @@ using Clang.Generators -using MIOpen_jll -include_dir = normpath(MIOpen_jll.artifact_dir, "include") +include_dir = normpath("/opt/rocm/include") miopen_dir = joinpath(include_dir, "miopen") options = load_options("miopen/miopen-generator.toml") diff --git a/gen/prologue.jl b/gen/prologue.jl deleted file mode 100644 index d89ef9a24..000000000 --- a/gen/prologue.jl +++ /dev/null @@ -1,4 +0,0 @@ -import ...AMDGPU: libhsaruntime - -const HSA_EXPORT_DECORATOR = identity -const HSA_CALL = nothing diff --git a/gen/rocblas/generator.jl b/gen/rocblas/generator.jl index 9122e83dd..149da1ae8 100644 --- a/gen/rocblas/generator.jl +++ b/gen/rocblas/generator.jl @@ -1,7 +1,6 @@ using Clang.Generators -using rocBLAS_jll -include_dir = normpath(rocBLAS_jll.artifact_dir, "include") +include_dir = normpath("/opt/rocm/include") rocblas_dir = joinpath(include_dir, "rocblas") options = load_options("rocblas/rocblas-generator.toml") diff --git a/src/blas/highlevel.jl b/src/blas/highlevel.jl index 1574ee7c1..2f463b713 100644 --- a/src/blas/highlevel.jl +++ b/src/blas/highlevel.jl @@ -313,9 +313,9 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr ) @eval begin LinearAlgebra.lmul!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat = - trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B) + trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B, B) LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{T, <: ROCMatrix}) where T <: ROCBLASFloat = - trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A) + trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A, A) LinearAlgebra.ldiv!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat = trsm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B) @@ -334,18 +334,18 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr @eval begin # Multiplication. LinearAlgebra.lmul!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat = - trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B) + trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B) LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat = - trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B) + trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B) LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASComplex = - trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B) + trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B, B) LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Transpose{T, <: ROCMatrix}}) where T <: ROCBLASFloat = - trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A) + trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A) LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASFloat = - trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A) + trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A) LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASComplex = - trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A) + trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A, A) # Left division. LinearAlgebra.ldiv!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat = diff --git a/src/blas/librocblas.jl b/src/blas/librocblas.jl index c551a0446..7132bdf9b 100644 --- a/src/blas/librocblas.jl +++ b/src/blas/librocblas.jl @@ -2614,24 +2614,24 @@ function rocblas_zsyrkx_strided_batched(handle, uplo, trans, n, k, alpha, A, lda ccall((:rocblas_zsyrkx_strided_batched, librocblas), rocblas_status, (rocblas_handle, rocblas_fill, rocblas_operation, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, rocblas_int), handle, uplo, trans, n, k, alpha, A, lda, stride_A, B, ldb, stride_B, beta, C, ldc, stride_C, batch_count) end -function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) +function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) AMDGPU.prepare_state() - ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) + ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) end -function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) +function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) AMDGPU.prepare_state() - ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) + ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) end -function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) +function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) AMDGPU.prepare_state() - ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) + ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) end -function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) +function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) AMDGPU.prepare_state() - ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb) + ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc) end function rocblas_strmm_batched(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, batch_count) diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 8a1b709cc..c2c1710fc 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -972,26 +972,27 @@ for (mmname, smname, elty) in @eval begin function trmm!( side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty), - A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, + A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, C::ROCMatrix{$elty}, ) m, n = size(B) mA, nA = size(A) # TODO: clean up error messages if mA != nA throw(DimensionMismatch("A must be square")) end if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end - lda = max(1,stride(A,2)) - ldb = max(1,stride(B,2)) + lda = max(1, stride(A, 2)) + ldb = max(1, stride(B, 2)) + ldc = max(1, stride(C, 2)) (; handle, stream) = lib_state() $(mmname)( handle, side, uplo, transa, diag, m, n, Ref(alpha), - A, lda, B, ldb) |> check + A, lda, B, ldb, C, ldc) |> check B end function trmm( side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty), A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, ) - trmm!(side, uplo, transa, diag, alpha, A, copy(B)) + trmm!(side, uplo, transa, diag, alpha, A, B, similar(B)) end function trsm!( side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty), diff --git a/src/device/random.jl b/src/device/random.jl index b6868eafd..72eb01862 100644 --- a/src/device/random.jl +++ b/src/device/random.jl @@ -54,6 +54,7 @@ end return ROCDeviceArray{UInt32,1,AS.Local}((32,), ptr) end +# TODO no memtime instructions on RDNA 3 @device_override Random.make_seed() = Base.unsafe_trunc(UInt32, memrealtime()) diff --git a/src/hip/libhip_common.jl b/src/hip/libhip_common.jl index 66da884a3..5ab20741b 100644 --- a/src/hip/libhip_common.jl +++ b/src/hip/libhip_common.jl @@ -18,6 +18,16 @@ end hipMemoryTypeManaged end +# TODO use this once we support ROCm 6+ only. +@cenum hipMemoryTypeV2 begin + hipMemoryTypeUnregisteredV2 = 0 + hipMemoryTypeHostV2 = 1 + hipMemoryTypeDeviceV2 = 2 + hipMemoryTypeManagedV2 = 3 + hipMemoryTypeArrayV2 = 10 + hipMemoryTypeUnifiedV2 = 11 +end + @cenum hiprtcResult::UInt32 begin HIPRTC_SUCCESS = 0 HIPRTC_ERROR_OUT_OF_MEMORY = 1 diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index 8f6b5041a..9359e886f 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -87,10 +87,13 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream) end # Try to allocate. - - # Async is ~300x slower: https://discourse.julialang.org/t/lux-tutorial-amdgpu-20x-slower-than-cpu/107053/11 - # HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check - HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check + # NOTE Async is ~300x slower for small (≤ 16 bytes) allocations: + # https://github.com/ROCm/HIP/issues/3370#issuecomment-1842938966 + if bytesize > 16 + HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check + else + HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check + end ptr = ptr_ref[] ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) return ptr @@ -127,8 +130,11 @@ function free(buf::HIPBuffer; stream::HIP.HIPStream) buf.own || return buf.ptr == C_NULL && return - # HIP.hipFreeAsync(buf, stream) |> HIP.check - HIP.hipFree(buf) |> HIP.check + if buf.bytesize > 16 + HIP.hipFreeAsync(buf, stream) |> HIP.check + else + HIP.hipFree(buf) |> HIP.check + end return end @@ -239,19 +245,24 @@ function get_device_ptr(ptr::Ptr{Cvoid}) ptr_ref[] end -function is_pinned(ptr::Ptr{Cvoid}) +function is_pinned(ptr) ptr == C_NULL && return false st, data = attributes(ptr) if st == HIP.hipErrorInvalidValue return false elseif st == HIP.hipSuccess - return data.memoryType == HIP.hipMemoryTypeHost + # TODO one we support only ROCm 6+ drop if/else. + if HIP.runtime_version() > v"6-" + return data.memoryType == HIP.hipMemoryTypeHostV2 + else + return data.memoryType == HIP.hipMemoryTypeHost + end end st |> HIP.check end -function attributes(ptr::Ptr{Cvoid}) +function attributes(ptr) data = Ref{HIP.hipPointerAttribute_t}() st = HIP.hipPointerGetAttributes(data, ptr) st, data[] diff --git a/src/sparse/rocSPARSE.jl b/src/sparse/rocSPARSE.jl index a80140268..ea38be682 100644 --- a/src/sparse/rocSPARSE.jl +++ b/src/sparse/rocSPARSE.jl @@ -16,7 +16,6 @@ import AMDGPU: librocsparse, HandleCache, HIP, library_state import AMDGPU.Device: ROCDeviceVector import .HIP: HIPContext, HIPStream, hipStream_t -# TODO replace const SparseChar = Char # core library diff --git a/src/sparse/types.jl b/src/sparse/types.jl index 20c4f19a1..66df08a4e 100644 --- a/src/sparse/types.jl +++ b/src/sparse/types.jl @@ -85,6 +85,7 @@ end ## SparseChar conversions function Base.convert(::Type{rocsparse_operation}, trans::SparseChar) + @show trans if trans == 'N' rocsparse_operation_none elseif trans == 'T' diff --git a/test/core_tests.jl b/test/core_tests.jl index a8d30dd59..de49080a5 100644 --- a/test/core_tests.jl +++ b/test/core_tests.jl @@ -85,6 +85,11 @@ include("codegen/trap.jl") include("rocarray/base.jl") include("rocarray/broadcast.jl") -include("tls.jl") +const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103") + +# TODO rework, hangs on Navi 3 +if !IS_NAVI3 + include("tls.jl") +end end diff --git a/test/device_tests.jl b/test/device_tests.jl index 0e365f546..2bed34a9b 100644 --- a/test/device_tests.jl +++ b/test/device_tests.jl @@ -16,8 +16,11 @@ include("device/synchronization.jl") include("device/execution_control.jl") include("device/exceptions.jl") -# TODO 1.9 fails with out-of-bounds error for some reason... -if VERSION ≥ v"1.10-" +const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103") + +# TODO NAVI 3 does not support `memtime` and `memrealtime` llvm intrinsic. +# TODO Julia 1.9 fails with out-of-bounds error for some reason... +if VERSION ≥ v"1.10-" && !IS_NAVI3 include("device/random.jl") end diff --git a/test/hip_extra_tests.jl b/test/hip_extra_tests.jl index 221669ad8..3a28ffc4b 100644 --- a/test/hip_extra_tests.jl +++ b/test/hip_extra_tests.jl @@ -13,11 +13,21 @@ AMDGPU.allowscalar(false) if AMDGPU.functional(:rocsolver) include("rocarray/solver.jl") end -if AMDGPU.functional(:rocsparse) - include("rocsparse/rocsparse.jl") + +# TODO rocSPARSE needs an update to work with ROCm 6.0+: +# https://github.com/JuliaGPU/AMDGPU.jl/issues/571 +if HIP.runtime_version() ≥ v"6-" + @test_skip "rocSPARSE" +else + if AMDGPU.functional(:rocsparse) + include("rocsparse/rocsparse.jl") + end end -# TODO rocFFT tests crash Windows due to access violation -if Sys.islinux() && AMDGPU.functional(:rocfft) + +# TODO rocFFT needs an update to work with ROCm 6.0+. +if HIP.runtime_version() ≥ v"6-" + @test_skip "rocFFT" +else include("rocarray/fft.jl") end diff --git a/test/rocarray/blas.jl b/test/rocarray/blas.jl index b3cb9114d..b4c9b9452 100644 --- a/test/rocarray/blas.jl +++ b/test/rocarray/blas.jl @@ -203,8 +203,11 @@ end b = rand(T, 20) dA, dB, db = ROCArray(A), ROCArray(B), ROCArray(b) - for t in (identity, transpose, adjoint), - TR in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular) + for t in ( + identity, transpose, adjoint, + ), TR in ( + UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular, + ) # Left division. dC = copy(dB)