Initial ROCm 6 enablement (#572)
- Use non-async alloc/free if `<= 16` bytes.
- Fix `Mem.is_pinned` for arrays (ROCm 6 changed `hipMemoryType` struct definition).
- Fix page fault in rocBLAS (use new out-of-place `trmm` function).
- Cleanup wrapper generators.
- Disable device RNG for Navi 3 (doesn't support `memtime` or `memrealtime` LLVM intrinsic).
- Disable rocSPARSE and rocFFT tests on ROCm 6.0+.
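
A minimal sketch of the size-threshold logic behind the first bullet (illustrative only, not AMDGPU.jl API; the real change is in `src/runtime/memory/hip.jl` in the diff below):

```julia
# Small allocations skip the stream-ordered (async) allocator, which is
# disproportionately slow for tiny buffers, and use plain hipMalloc/hipFree.
const ASYNC_THRESHOLD_BYTES = 16  # value taken from this commit

use_async_alloc(bytesize::Integer) = bytesize > ASYNC_THRESHOLD_BYTES

use_async_alloc(8)    # false -> hipMalloc / hipFree
use_async_alloc(512)  # true  -> hipMallocAsync / hipFreeAsync
```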
pxl-th authored Dec 19, 2023
1 parent 455a08c commit c0651cf
Showing 20 changed files with 107 additions and 100 deletions.
40 changes: 20 additions & 20 deletions .buildkite/pipeline.yml
@@ -2,7 +2,7 @@ steps:
- label: "Documentation"
plugins:
- JuliaCI/julia#v1:
version: 1.9
version: "1.10"
command: |
julia --project -e '
println("--- :julia: Instantiating project")
@@ -22,25 +22,25 @@ steps:
if: build.message !~ /\[skip docs\]/
timeout_in_minutes: 10

- label: "Julia 1.9 - No Artifacts"
plugins:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
- JuliaCI/julia-coverage#v1:
codecov: true
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "gfx1100"
if: build.message !~ /\[skip tests\]/
command: "julia --project -e 'using Pkg; Pkg.update()'"
timeout_in_minutes: 180
env:
JULIA_NUM_THREADS: 4
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
# - label: "Julia 1.9 - No Artifacts"
# plugins:
# - JuliaCI/julia#v1:
# version: 1.9
# - JuliaCI/julia-test#v1:
# - JuliaCI/julia-coverage#v1:
# codecov: true
# agents:
# queue: "juliagpu"
# rocm: "*"
# rocmgpu: "gfx1100"
# if: build.message !~ /\[skip tests\]/
# command: "julia --project -e 'using Pkg; Pkg.update()'"
# timeout_in_minutes: 180
# env:
# JULIA_NUM_THREADS: 4
# JULIA_AMDGPU_CORE_MUST_LOAD: "1"
# JULIA_AMDGPU_HIP_MUST_LOAD: "1"
# JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"

- label: "Julia 1.10 - No Artifacts"
plugins:
1 change: 1 addition & 0 deletions README.md
@@ -23,6 +23,7 @@

The AMDGPU.jl package requires **Julia 1.9+**, **ROCm 5.3-5.7**, which means only GPUs that are supported by these versions will work.\
**64-bit Linux and Windows** are supported, see [requirements](https://amdgpu.juliagpu.org/dev/#Installation).
**Navi 3** GPUs require Julia 1.10 or higher.

## Quick start

3 changes: 0 additions & 3 deletions gen/Project.toml
@@ -1,5 +1,2 @@
[deps]
Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31"
MIOpen_jll = "2409bb75-d5ef-542a-ac68-1cfd4c37dc24"
hsa_rocr_jll = "dd59ff1a-a01a-568d-8b29-0669330f116a"
rocBLAS_jll = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab"
22 changes: 0 additions & 22 deletions gen/generator.jl

This file was deleted.

7 changes: 0 additions & 7 deletions gen/generator.toml

This file was deleted.

3 changes: 1 addition & 2 deletions gen/miopen/generator.jl
@@ -1,7 +1,6 @@
using Clang.Generators
using MIOpen_jll

include_dir = normpath(MIOpen_jll.artifact_dir, "include")
include_dir = normpath("/opt/rocm/include")
miopen_dir = joinpath(include_dir, "miopen")
options = load_options("miopen/miopen-generator.toml")

4 changes: 0 additions & 4 deletions gen/prologue.jl

This file was deleted.

3 changes: 1 addition & 2 deletions gen/rocblas/generator.jl
@@ -1,7 +1,6 @@
using Clang.Generators
using rocBLAS_jll

include_dir = normpath(rocBLAS_jll.artifact_dir, "include")
include_dir = normpath("/opt/rocm/include")
rocblas_dir = joinpath(include_dir, "rocblas")
options = load_options("rocblas/rocblas-generator.toml")

16 changes: 8 additions & 8 deletions src/blas/highlevel.jl
@@ -313,9 +313,9 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr
)
@eval begin
LinearAlgebra.lmul!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B)
trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B, B)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{T, <: ROCMatrix}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A)
trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A, A)

LinearAlgebra.ldiv!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trsm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B)
@@ -334,18 +334,18 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr
@eval begin
# Multiplication.
LinearAlgebra.lmul!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B)
LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B)
LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASComplex =
trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B, B)

LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Transpose{T, <: ROCMatrix}}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASComplex =
trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A, A)

# Left division.
LinearAlgebra.ldiv!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
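
A hedged usage sketch of what these `lmul!`/`rmul!` methods now do: the input matrix is passed both as `B` and as the output `C` of the out-of-place rocBLAS `trmm`, so the user-facing behavior remains in place (assumes a working ROCm setup):

```julia
using AMDGPU, LinearAlgebra

A = ROCArray(rand(Float32, 4, 4))
B = ROCArray(rand(Float32, 4, 4))

# After this change, this dispatches to trmm!('L', 'U', 'N', 'N', 1f0, A, B, B):
lmul!(UpperTriangular(A), B)  # B is overwritten with UpperTriangular(A) * B
```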
16 changes: 8 additions & 8 deletions src/blas/librocblas.jl
@@ -2614,24 +2614,24 @@ function rocblas_zsyrkx_strided_batched(handle, uplo, trans, n, k, alpha, A, lda
ccall((:rocblas_zsyrkx_strided_batched, librocblas), rocblas_status, (rocblas_handle, rocblas_fill, rocblas_operation, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, rocblas_int), handle, uplo, trans, n, k, alpha, A, lda, stride_A, B, ldb, stride_B, beta, C, ldc, stride_C, batch_count)
end

function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_strmm_batched(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, batch_count)
11 changes: 6 additions & 5 deletions src/blas/wrappers.jl
@@ -972,26 +972,27 @@ for (mmname, smname, elty) in
@eval begin
function trmm!(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, C::ROCMatrix{$elty},
)
m, n = size(B)
mA, nA = size(A)
# TODO: clean up error messages
if mA != nA throw(DimensionMismatch("A must be square")) end
if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end
lda = max(1,stride(A,2))
ldb = max(1,stride(B,2))
lda = max(1, stride(A, 2))
ldb = max(1, stride(B, 2))
ldc = max(1, stride(C, 2))
(; handle, stream) = lib_state()
$(mmname)(
handle, side, uplo, transa, diag, m, n, Ref(alpha),
A, lda, B, ldb) |> check
A, lda, B, ldb, C, ldc) |> check
B
end
function trmm(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
)
trmm!(side, uplo, transa, diag, alpha, A, copy(B))
trmm!(side, uplo, transa, diag, alpha, A, B, similar(B))
end
function trsm!(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
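
A hedged sketch of calling the updated wrapper directly (the module path `AMDGPU.rocBLAS` and a working Float32 setup are assumptions here; the output array is now an explicit argument):

```julia
using AMDGPU
import AMDGPU.rocBLAS: trmm!

A = ROCArray(rand(Float32, 4, 4))  # only the referenced triangle of A is read
B = ROCArray(rand(Float32, 4, 3))
C = similar(B)                     # explicit output, new in this signature

# C = 1f0 * triu(A) * B, computed out of place by rocBLAS:
trmm!('L', 'U', 'N', 'N', 1f0, A, B, C)
```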
1 change: 1 addition & 0 deletions src/device/random.jl
@@ -54,6 +54,7 @@ end
return ROCDeviceArray{UInt32,1,AS.Local}((32,), ptr)
end

# TODO no memtime instructions on RDNA 3
@device_override Random.make_seed() = Base.unsafe_trunc(UInt32, memrealtime())


10 changes: 10 additions & 0 deletions src/hip/libhip_common.jl
@@ -18,6 +18,16 @@ end
hipMemoryTypeManaged
end

# TODO use this once we support ROCm 6+ only.
@cenum hipMemoryTypeV2 begin
hipMemoryTypeUnregisteredV2 = 0
hipMemoryTypeHostV2 = 1
hipMemoryTypeDeviceV2 = 2
hipMemoryTypeManagedV2 = 3
hipMemoryTypeArrayV2 = 10
hipMemoryTypeUnifiedV2 = 11
end

@cenum hiprtcResult::UInt32 begin
HIPRTC_SUCCESS = 0
HIPRTC_ERROR_OUT_OF_MEMORY = 1
29 changes: 20 additions & 9 deletions src/runtime/memory/hip.jl
@@ -87,10 +87,13 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream)
end

# Try to allocate.

# Async is ~300x slower: https://discourse.julialang.org/t/lux-tutorial-amdgpu-20x-slower-than-cpu/107053/11
# HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check
HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check
# NOTE Async is ~300x slower for small (≤ 16 bytes) allocations:
# https://github.com/ROCm/HIP/issues/3370#issuecomment-1842938966
if bytesize > 16
HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check
else
HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check
end
ptr = ptr_ref[]
ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
return ptr
@@ -127,8 +130,11 @@ function free(buf::HIPBuffer; stream::HIP.HIPStream)
buf.own || return

buf.ptr == C_NULL && return
# HIP.hipFreeAsync(buf, stream) |> HIP.check
HIP.hipFree(buf) |> HIP.check
if buf.bytesize > 16
HIP.hipFreeAsync(buf, stream) |> HIP.check
else
HIP.hipFree(buf) |> HIP.check
end
return
end

@@ -239,19 +245,24 @@ function get_device_ptr(ptr::Ptr{Cvoid})
ptr_ref[]
end

function is_pinned(ptr::Ptr{Cvoid})
function is_pinned(ptr)
ptr == C_NULL && return false

st, data = attributes(ptr)
if st == HIP.hipErrorInvalidValue
return false
elseif st == HIP.hipSuccess
return data.memoryType == HIP.hipMemoryTypeHost
# TODO once we support only ROCm 6+ drop if/else.
if HIP.runtime_version() > v"6-"
return data.memoryType == HIP.hipMemoryTypeHostV2
else
return data.memoryType == HIP.hipMemoryTypeHost
end
end
st |> HIP.check
end

function attributes(ptr::Ptr{Cvoid})
function attributes(ptr)
data = Ref{HIP.hipPointerAttribute_t}()
st = HIP.hipPointerGetAttributes(data, ptr)
st, data[]
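
For the `Mem.is_pinned` fix, a standalone sketch of the version gate (stand-in values, not AMDGPU.jl API: in the pre-6 HIP headers `hipMemoryTypeHost` is 0, while the ROCm 6 `hipMemoryTypeHostV2` value is 1, per the enum added in `libhip_common.jl` above):

```julia
# Pick the enum value that marks pinned host memory for the running HIP version.
expected_host_type(runtime_version::VersionNumber) =
    runtime_version ≥ v"6-" ? 1 : 0   # hipMemoryTypeHostV2 vs hipMemoryTypeHost

expected_host_type(v"5.7.1")  # 0
expected_host_type(v"6.0.0")  # 1
```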
1 change: 0 additions & 1 deletion src/sparse/rocSPARSE.jl
@@ -16,7 +16,6 @@ import AMDGPU: librocsparse, HandleCache, HIP, library_state
import AMDGPU.Device: ROCDeviceVector
import .HIP: HIPContext, HIPStream, hipStream_t

# TODO replace
const SparseChar = Char

# core library
1 change: 1 addition & 0 deletions src/sparse/types.jl
@@ -85,6 +85,7 @@ end
## SparseChar conversions

function Base.convert(::Type{rocsparse_operation}, trans::SparseChar)
@show trans
if trans == 'N'
rocsparse_operation_none
elseif trans == 'T'
7 changes: 6 additions & 1 deletion test/core_tests.jl
@@ -85,6 +85,11 @@ include("codegen/trap.jl")
include("rocarray/base.jl")
include("rocarray/broadcast.jl")

include("tls.jl")
const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103")

# TODO rework, hangs on Navi 3
if !IS_NAVI3
include("tls.jl")
end

end
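
A short sketch of the architecture check used here to detect Navi 3 (RDNA 3) GPUs; the `gcn_arch` values come straight from the test code above:

```julia
using AMDGPU

# gfx1100/1101/1102/1103 are the RDNA 3 ("Navi 3") targets gated in the tests.
is_navi3 = AMDGPU.device().gcn_arch in
    ("gfx1100", "gfx1101", "gfx1102", "gfx1103")
```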
7 changes: 5 additions & 2 deletions test/device_tests.jl
@@ -16,8 +16,11 @@ include("device/synchronization.jl")
include("device/execution_control.jl")
include("device/exceptions.jl")

# TODO 1.9 fails with out-of-bounds error for some reason...
if VERSION ≥ v"1.10-"
const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103")

# TODO NAVI 3 does not support `memtime` and `memrealtime` llvm intrinsic.
# TODO Julia 1.9 fails with out-of-bounds error for some reason...
if VERSION ≥ v"1.10-" && !IS_NAVI3
include("device/random.jl")
end

18 changes: 14 additions & 4 deletions test/hip_extra_tests.jl
@@ -13,11 +13,21 @@ AMDGPU.allowscalar(false)
if AMDGPU.functional(:rocsolver)
include("rocarray/solver.jl")
end
if AMDGPU.functional(:rocsparse)
include("rocsparse/rocsparse.jl")

# TODO rocSPARSE needs an update to work with ROCm 6.0+:
# https://github.com/JuliaGPU/AMDGPU.jl/issues/571
if HIP.runtime_version() ≥ v"6-"
@test_skip "rocSPARSE"
else
if AMDGPU.functional(:rocsparse)
include("rocsparse/rocsparse.jl")
end
end
# TODO rocFFT tests crash Windows due to access violation
if Sys.islinux() && AMDGPU.functional(:rocfft)

# TODO rocFFT needs an update to work with ROCm 6.0+.
if HIP.runtime_version() ≥ v"6-"
@test_skip "rocFFT"
else
include("rocarray/fft.jl")
end

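
And a hedged sketch of the runtime-version gate used to skip rocSPARSE and rocFFT on ROCm 6.0+ (the exact `VersionNumber` returned may differ between ROCm builds):

```julia
using AMDGPU
import AMDGPU: HIP

# v"6-" is a pre-release of 6.0.0, so any real 6.x runtime compares ≥ to it.
if HIP.runtime_version() ≥ v"6-"
    @info "ROCm 6.0+ detected; rocSPARSE/rocFFT tests are skipped for now."
end
```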
7 changes: 5 additions & 2 deletions test/rocarray/blas.jl
@@ -203,8 +203,11 @@ end
b = rand(T, 20)
dA, dB, db = ROCArray(A), ROCArray(B), ROCArray(b)

for t in (identity, transpose, adjoint),
TR in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular)
for t in (
identity, transpose, adjoint,
), TR in (
UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular,
)

# Left division.
dC = copy(dB)
