Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial ROCm 6 enablement #572

Merged
merged 5 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 20 additions & 20 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ steps:
- label: "Documentation"
plugins:
- JuliaCI/julia#v1:
version: 1.9
version: "1.10"
command: |
julia --project -e '
println("--- :julia: Instantiating project")
Expand All @@ -22,25 +22,25 @@ steps:
if: build.message !~ /\[skip docs\]/
timeout_in_minutes: 10

- label: "Julia 1.9 - No Artifacts"
plugins:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
- JuliaCI/julia-coverage#v1:
codecov: true
agents:
queue: "juliagpu"
rocm: "*"
rocmgpu: "gfx1100"
if: build.message !~ /\[skip tests\]/
command: "julia --project -e 'using Pkg; Pkg.update()'"
timeout_in_minutes: 180
env:
JULIA_NUM_THREADS: 4
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
# - label: "Julia 1.9 - No Artifacts"
# plugins:
# - JuliaCI/julia#v1:
# version: 1.9
# - JuliaCI/julia-test#v1:
# - JuliaCI/julia-coverage#v1:
# codecov: true
# agents:
# queue: "juliagpu"
# rocm: "*"
# rocmgpu: "gfx1100"
# if: build.message !~ /\[skip tests\]/
# command: "julia --project -e 'using Pkg; Pkg.update()'"
# timeout_in_minutes: 180
# env:
# JULIA_NUM_THREADS: 4
# JULIA_AMDGPU_CORE_MUST_LOAD: "1"
# JULIA_AMDGPU_HIP_MUST_LOAD: "1"
# JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"

- label: "Julia 1.10 - No Artifacts"
plugins:
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

The AMDGPU.jl package requires **Julia 1.9+**, **ROCm 5.3-5.7**, which means only GPUs that are supported by these versions will work.\
**64-bit Linux and Windows** are supported, see [requirements](https://amdgpu.juliagpu.org/dev/#Installation).
**Navi 3** GPUs require Julia 1.10 or higher.

## Quick start

Expand Down
3 changes: 0 additions & 3 deletions gen/Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
[deps]
Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31"
MIOpen_jll = "2409bb75-d5ef-542a-ac68-1cfd4c37dc24"
hsa_rocr_jll = "dd59ff1a-a01a-568d-8b29-0669330f116a"
rocBLAS_jll = "1ef8cab2-a151-54b4-a57f-5fbb4046a4ab"
22 changes: 0 additions & 22 deletions gen/generator.jl

This file was deleted.

7 changes: 0 additions & 7 deletions gen/generator.toml

This file was deleted.

3 changes: 1 addition & 2 deletions gen/miopen/generator.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
using Clang.Generators
using MIOpen_jll

include_dir = normpath(MIOpen_jll.artifact_dir, "include")
include_dir = normpath("/opt/rocm/include")
miopen_dir = joinpath(include_dir, "miopen")
options = load_options("miopen/miopen-generator.toml")

Expand Down
4 changes: 0 additions & 4 deletions gen/prologue.jl

This file was deleted.

3 changes: 1 addition & 2 deletions gen/rocblas/generator.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
using Clang.Generators
using rocBLAS_jll

include_dir = normpath(rocBLAS_jll.artifact_dir, "include")
include_dir = normpath("/opt/rocm/include")
rocblas_dir = joinpath(include_dir, "rocblas")
options = load_options("rocblas/rocblas-generator.toml")

Expand Down
16 changes: 8 additions & 8 deletions src/blas/highlevel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,9 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr
)
@eval begin
LinearAlgebra.lmul!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B)
trmm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B, B)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{T, <: ROCMatrix}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A)
trmm!('R', $uploc, 'N', $isunitc, one(T), parent(B), A, A)

LinearAlgebra.ldiv!(A::$t{T, <: ROCMatrix}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trsm!('L', $uploc, 'N', $isunitc, one(T), parent(A), B)
Expand All @@ -334,18 +334,18 @@ LinearAlgebra.mul!(C::ROCMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:ROCMatr
@eval begin
# Multiplication.
LinearAlgebra.lmul!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B)
LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'T', $isunitc, one(T), parent(parent(A)), B, B)
LinearAlgebra.lmul!(A::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASComplex =
trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B)
trmm!('L', $uploc, 'C', $isunitc, one(T), parent(parent(A)), B, B)

LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Transpose{T, <: ROCMatrix}}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASFloat =
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'T', $isunitc, one(T), parent(parent(B)), A, A)
LinearAlgebra.rmul!(A::ROCMatrix{T}, B::$t{<: Any, <: Adjoint{T, <: ROCMatrix}}) where T <: ROCBLASComplex =
trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A)
trmm!('R', $uploc, 'C', $isunitc, one(T), parent(parent(B)), A, A)

# Left division.
LinearAlgebra.ldiv!(A::$t{<: Any, <: Transpose{T, <: ROCMatrix}}, B::ROCMatrix{T}) where T <: ROCBLASFloat =
Expand Down
16 changes: 8 additions & 8 deletions src/blas/librocblas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2614,24 +2614,24 @@
ccall((:rocblas_zsyrkx_strided_batched, librocblas), rocblas_status, (rocblas_handle, rocblas_fill, rocblas_operation, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, rocblas_stride, rocblas_int), handle, uplo, trans, n, k, alpha, A, lda, stride_A, B, ldb, stride_B, beta, C, ldc, stride_C, batch_count)
end

function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_strmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_strmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_dtrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
AMDGPU.prepare_state()
ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_dtrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)
end

function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_ctrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)

Check warning on line 2627 in src/blas/librocblas.jl

View check run for this annotation

Codecov / codecov/patch

src/blas/librocblas.jl#L2627

Added line #L2627 was not covered by tests
AMDGPU.prepare_state()
ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_ctrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_float_complex}, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int, Ptr{rocblas_float_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)

Check warning on line 2629 in src/blas/librocblas.jl

View check run for this annotation

Codecov / codecov/patch

src/blas/librocblas.jl#L2629

Added line #L2629 was not covered by tests
end

function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
function rocblas_ztrmm(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)

Check warning on line 2632 in src/blas/librocblas.jl

View check run for this annotation

Codecov / codecov/patch

src/blas/librocblas.jl#L2632

Added line #L2632 was not covered by tests
AMDGPU.prepare_state()
ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb)
ccall((:rocblas_ztrmm, librocblas), rocblas_status, (rocblas_handle, rocblas_side, rocblas_fill, rocblas_operation, rocblas_diagonal, rocblas_int, rocblas_int, Ptr{rocblas_double_complex}, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int, Ptr{rocblas_double_complex}, rocblas_int), handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, C, ldc)

Check warning on line 2634 in src/blas/librocblas.jl

View check run for this annotation

Codecov / codecov/patch

src/blas/librocblas.jl#L2634

Added line #L2634 was not covered by tests
end

function rocblas_strmm_batched(handle, side, uplo, transA, diag, m, n, alpha, A, lda, B, ldb, batch_count)
Expand Down
11 changes: 6 additions & 5 deletions src/blas/wrappers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -972,26 +972,27 @@
@eval begin
function trmm!(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, C::ROCMatrix{$elty},
)
m, n = size(B)
mA, nA = size(A)
# TODO: clean up error messages
if mA != nA throw(DimensionMismatch("A must be square")) end
if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end
lda = max(1,stride(A,2))
ldb = max(1,stride(B,2))
lda = max(1, stride(A, 2))
ldb = max(1, stride(B, 2))
ldc = max(1, stride(C, 2))
(; handle, stream) = lib_state()
$(mmname)(
handle, side, uplo, transa, diag, m, n, Ref(alpha),
A, lda, B, ldb) |> check
A, lda, B, ldb, C, ldc) |> check
B
end
function trmm(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
)
trmm!(side, uplo, transa, diag, alpha, A, copy(B))
trmm!(side, uplo, transa, diag, alpha, A, B, similar(B))

Check warning on line 995 in src/blas/wrappers.jl

View check run for this annotation

Codecov / codecov/patch

src/blas/wrappers.jl#L995

Added line #L995 was not covered by tests
end
function trsm!(
side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
Expand Down
1 change: 1 addition & 0 deletions src/device/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ end
return ROCDeviceArray{UInt32,1,AS.Local}((32,), ptr)
end

# TODO no memtime instructions on RDNA 3
@device_override Random.make_seed() = Base.unsafe_trunc(UInt32, memrealtime())


Expand Down
10 changes: 10 additions & 0 deletions src/hip/libhip_common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ end
hipMemoryTypeManaged
end

# TODO use this once we support ROCm 6+ only.
@cenum hipMemoryTypeV2 begin
hipMemoryTypeUnregisteredV2 = 0
hipMemoryTypeHostV2 = 1
hipMemoryTypeDeviceV2 = 2
hipMemoryTypeManagedV2 = 3
hipMemoryTypeArrayV2 = 10
hipMemoryTypeUnifiedV2 = 11
end

@cenum hiprtcResult::UInt32 begin
HIPRTC_SUCCESS = 0
HIPRTC_ERROR_OUT_OF_MEMORY = 1
Expand Down
29 changes: 20 additions & 9 deletions src/runtime/memory/hip.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,13 @@
end

# Try to allocate.

# Async is ~300x slower: https://discourse.julialang.org/t/lux-tutorial-amdgpu-20x-slower-than-cpu/107053/11
# HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check
HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check
# NOTE Async is ~300x slower for small (≤ 16 bytes) allocations:
# https://github.com/ROCm/HIP/issues/3370#issuecomment-1842938966
if bytesize > 16
HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check
else
HIP.hipMalloc(ptr_ref, bytesize) |> HIP.check
end
ptr = ptr_ref[]
ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
return ptr
Expand Down Expand Up @@ -127,8 +130,11 @@
buf.own || return

buf.ptr == C_NULL && return
# HIP.hipFreeAsync(buf, stream) |> HIP.check
HIP.hipFree(buf) |> HIP.check
if buf.bytesize > 16
HIP.hipFreeAsync(buf, stream) |> HIP.check
else
HIP.hipFree(buf) |> HIP.check
end
return
end

Expand Down Expand Up @@ -239,19 +245,24 @@
ptr_ref[]
end

function is_pinned(ptr::Ptr{Cvoid})
function is_pinned(ptr)
ptr == C_NULL && return false

st, data = attributes(ptr)
if st == HIP.hipErrorInvalidValue
return false
elseif st == HIP.hipSuccess
return data.memoryType == HIP.hipMemoryTypeHost
# TODO one we support only ROCm 6+ drop if/else.
if HIP.runtime_version() > v"6-"
return data.memoryType == HIP.hipMemoryTypeHostV2
else
return data.memoryType == HIP.hipMemoryTypeHost

Check warning on line 259 in src/runtime/memory/hip.jl

View check run for this annotation

Codecov / codecov/patch

src/runtime/memory/hip.jl#L259

Added line #L259 was not covered by tests
end
end
st |> HIP.check
end

function attributes(ptr::Ptr{Cvoid})
function attributes(ptr)
data = Ref{HIP.hipPointerAttribute_t}()
st = HIP.hipPointerGetAttributes(data, ptr)
st, data[]
Expand Down
1 change: 0 additions & 1 deletion src/sparse/rocSPARSE.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import AMDGPU: librocsparse, HandleCache, HIP, library_state
import AMDGPU.Device: ROCDeviceVector
import .HIP: HIPContext, HIPStream, hipStream_t

# TODO replace
const SparseChar = Char

# core library
Expand Down
1 change: 1 addition & 0 deletions src/sparse/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
## SparseChar conversions

function Base.convert(::Type{rocsparse_operation}, trans::SparseChar)
@show trans

Check warning on line 88 in src/sparse/types.jl

View check run for this annotation

Codecov / codecov/patch

src/sparse/types.jl#L88

Added line #L88 was not covered by tests
if trans == 'N'
rocsparse_operation_none
elseif trans == 'T'
Expand Down
7 changes: 6 additions & 1 deletion test/core_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ include("codegen/trap.jl")
include("rocarray/base.jl")
include("rocarray/broadcast.jl")

include("tls.jl")
const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103")

# TODO rework, hangs on Navi 3
if !IS_NAVI3
include("tls.jl")
end

end
7 changes: 5 additions & 2 deletions test/device_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ include("device/synchronization.jl")
include("device/execution_control.jl")
include("device/exceptions.jl")

# TODO 1.9 fails with out-of-bounds error for some reason...
if VERSION ≥ v"1.10-"
const IS_NAVI3 = AMDGPU.device().gcn_arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103")

# TODO NAVI 3 does not support `memtime` and `memrealtime` llvm intrinsic.
# TODO Julia 1.9 fails with out-of-bounds error for some reason...
if VERSION ≥ v"1.10-" && !IS_NAVI3
include("device/random.jl")
end

Expand Down
18 changes: 14 additions & 4 deletions test/hip_extra_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,21 @@ AMDGPU.allowscalar(false)
if AMDGPU.functional(:rocsolver)
include("rocarray/solver.jl")
end
if AMDGPU.functional(:rocsparse)
include("rocsparse/rocsparse.jl")

# TODO rocSPARSE needs an update to work with ROCm 6.0+:
# https://github.com/JuliaGPU/AMDGPU.jl/issues/571
if HIP.runtime_version() ≥ v"6-"
@test_skip "rocSPARSE"
else
if AMDGPU.functional(:rocsparse)
include("rocsparse/rocsparse.jl")
end
end
# TODO rocFFT tests crash Windows due to access violation
if Sys.islinux() && AMDGPU.functional(:rocfft)

# TODO rocFFT needs an update to work with ROCm 6.0+.
if HIP.runtime_version() ≥ v"6-"
@test_skip "rocFFT"
else
include("rocarray/fft.jl")
end

Expand Down
7 changes: 5 additions & 2 deletions test/rocarray/blas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,11 @@ end
b = rand(T, 20)
dA, dB, db = ROCArray(A), ROCArray(B), ROCArray(b)

for t in (identity, transpose, adjoint),
TR in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular)
for t in (
identity, transpose, adjoint,
), TR in (
UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular,
)

# Left division.
dC = copy(dB)
Expand Down