Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to artifact device libraries if ROCm 5.5+ is detected #540

Merged
merged 3 commits into from
Nov 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ LLVM = "6"
LLVM_jll = "14, 15"
MacroTools = "0.5"
Preferences = "1"
ROCmDeviceLibs_jll = "5.4"
SpecialFunctions = "2"
UnsafeAtomicsLLVM = "0.1"
hsa_rocr_jll = "5.4"
ROCmDeviceLibs_jll = "5.6.1"
julia = "1.9"
29 changes: 2 additions & 27 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@ Pkg.test("AMDGPU")
Julia **1.9 or higher**.

Minimal supported ROCm version is **5.3**.
However, if you have ROCm 5.5+ installed, refer to
[LLVM compatibility & mixed ROCm mode](@ref) section for additional instructions.

For optimal experience, you should have full ROCm stack installed.
Refer to official ROCm stack installation instructions: <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>

Expand All @@ -56,31 +53,13 @@ Currently, AMDGPU.jl utilizes following libraries:
### ROCm artifacts

There is limited support for ROCm 5.4+ artifacts which can be enabled with
[`AMDGPU.enable_artifacts!`](@ref).
[`AMDGPU.use_artifacts!`](@ref).

Limited means not all libraries are available and some of the functionality
may be disabled.

```@docs
AMDGPU.enable_artifacts!
```

### LLVM compatibility & mixed ROCm mode

As a rule of thumb, Julia LLVM version should match ROCm LLVM version.
For example, Julia 1.10 uses LLVM 15, while ROCm 5.5+ uses LLVM 16, and the two are incompatible.

However, there is a way to run system ROCm 5.5+ with Julia:

1. Add respective version of artifact device libraries in your project:
- ROCm 5.5: `]add ROCmDeviceLibs_jll@5.5.1`;
- ROCm 5.6: `]add ROCmDeviceLibs_jll@5.6.1`.
2. Call [`AMDGPU.use_devlibs_jll!`](@ref) in your Julia session to switch
to artifact device libraries (and the rest of the libraries
will be used from system-wide installation).

```@docs
AMDGPU.use_devlibs_jll!
AMDGPU.use_artifacts!
```

### Extra Setup Details
Expand Down Expand Up @@ -147,10 +126,6 @@ Template of `LocalPreferences.toml` with all options:
# If `true` then use ROCm libraries provided by artifacts.
# However, not all ROCm libraries are available as artifacts.
use_artifacts = false
# Use mixed-mode ROCm. This will use device libraries from artifacts,
# but the rest of the ROCm libraries from system-wide installation.
# See `LLVM compatibility & mixed ROCm mode` section in the documentation.
use_devlibs_jll = false
# Use non-blocking synchronization for all `AMDGPU.synchronize()` calls.
nonblocking_synchronization = true
# Memory limit specifies maximum amount of memory in percentages
Expand Down
70 changes: 31 additions & 39 deletions src/discovery/discovery.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
include("utils.jl")

"""
enable_artifacts!(flag::Bool = true)
use_artifacts!(flag::Bool = true)

Pass `true` to switch from system-wide ROCm installation to artifacts.
When using artifacts, system-wide installation is not needed at all.
"""
function enable_artifacts!(flag::Bool = true; show_message::Bool = true)
function use_artifacts!(flag::Bool = true; show_message::Bool = true)
if flag && Base.libllvm_version >= v"16"
error("No supported artifacts for LLVM 16+. See: https://github.com/JuliaGPU/AMDGPU.jl/issues/440.")
end
Expand All @@ -30,42 +30,12 @@
import hsa_rocr_jll
end

"""
    use_devlibs_jll!(flag::Bool = true; show_message::Bool = true)

Store `flag` under the `use_devlibs_jll` preference.

With `flag = true`, device libraries are taken from artifacts while the rest of
the libraries keep coming from the system-wide ROCm installation (mixed-mode).

Mixed-mode makes it possible to use ROCm 5.5+, which internally uses LLVM 16+:
the device libraries from artifacts are built with LLVM 15, which makes them
compatible with Julia.

Unless `show_message` is `false`, an `@info` message reports the new value and
reminds that the Julia session must be restarted for the change to take effect.
"""
function use_devlibs_jll!(flag::Bool = true; show_message::Bool = true)
    @set_preferences!("use_devlibs_jll" => flag)

    # Silent mode: the preference is stored, nothing is logged.
    show_message || return nothing

    @info """
    Switched `use_devlibs_jll` to `$flag`.
    Restart Julia session for the changes to take effect.
    """
    return nothing
end

# Environment override: when `JULIA_AMDGPU_USE_DEVLIBS_JLL` is set, its value is
# parsed as a `Bool` and stored as the `use_devlibs_jll` preference (without
# printing the restart message).
if haskey(ENV, "JULIA_AMDGPU_USE_DEVLIBS_JLL")
    use_devlibs = parse(Bool, ENV["JULIA_AMDGPU_USE_DEVLIBS_JLL"])

    # Requesting artifact device libraries is rejected when Julia runs on LLVM 16+.
    (use_devlibs && Base.libllvm_version >= v"16") &&
        error("No supported artifacts for LLVM 16+. See: https://github.com/JuliaGPU/AMDGPU.jl/issues/440.")

    use_devlibs_jll!(use_devlibs; show_message=false)
end

# Read the `use_devlibs_jll` preference; `false` is used when it has not been set.
function use_devlibs_jll()::Bool
    return @load_preference("use_devlibs_jll", false)
end

if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS")
disable_artifacts = parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "true"))
if !disable_artifacts && Base.libllvm_version >= v"16"
error("No supported artifacts for LLVM 16+. See: https://github.com/JuliaGPU/AMDGPU.jl/issues/440.")
end
enable_artifacts!(!disable_artifacts; show_message=false)
use_artifacts!(!disable_artifacts; show_message=false)
end

function get_artifact_library(pkg::Symbol, libname::Symbol)::String
Expand Down Expand Up @@ -97,23 +67,36 @@
end
end

function get_device_libs(;
function get_device_libs(
from_artifact::Bool;
artifact_library::Symbol = :ROCmDeviceLibs_jll,
artifact_field::Symbol = :bitcode_path,
)
if use_artifacts() || use_devlibs_jll()
if from_artifact
get_artifact_library(artifact_library, artifact_field)
else
find_device_libs()
end
end

export use_artifacts, enable_artifacts!, use_devlibs_jll, use_devlibs_jll!
export use_artifacts, use_artifacts!
export lld_artifact, lld_path, libhsaruntime, libdevice_libs, libhip
export librocblas, librocsparse, librocsolver, librocalution
export librocrand, librocfft, libMIOpen_path
export julia_exeflags

# Query `hipRuntimeGetVersion` from `libhip` and return the result as a `VersionNumber`.
# Throws (via `error`) when the call reports a non-zero status.
function _hip_runtime_version()
    version_ref = Ref{Cint}()
    status = ccall((:hipRuntimeGetVersion, libhip), UInt32, (Ptr{Cint},), version_ref)
    status > 0 && error("Failed to get HIP runtime version.")

    # The integer is decoded as `major * 10_000_000 + minor * 100_000 + patch`.
    encoded = version_ref[]
    upper, patch = divrem(encoded, 100_000)
    major, minor = divrem(upper, 100)
    return VersionNumber(major, minor, patch)
end

function __init__()
if isdir("/sys/class/kfd/kfd/topology/nodes/")
for node_id in readdir("/sys/class/kfd/kfd/topology/nodes/")
Expand Down Expand Up @@ -146,11 +129,20 @@
global libhsaruntime = get_library("libhsa-runtime64";
rocm_paths, artifact_library=:hsa_rocr_jll,
artifact_field=:libhsa_runtime64, ext="so.1")
global libdevice_libs = get_device_libs()

# HIP.
global libhip = get_library("libamdhip64";
rocm_paths, artifact_library=:HIP_jll)
global libhip = get_library("libamdhip64"; rocm_paths, artifact_library=:HIP_jll)

from_artifact = if isempty(libhip)
use_artifacts()

Check warning on line 137 in src/discovery/discovery.jl

View check run for this annotation

Codecov / codecov/patch

src/discovery/discovery.jl#L137

Added line #L137 was not covered by tests
else
# Detect HIP version, which will influence what device libraries to use.
hip_version = Base.thisminor(_hip_runtime_version())
hip_version > v"5.4" ? true : use_artifacts()
end
# If ROCm 5.5+ - use artifact device libraries.
global libdevice_libs = get_device_libs(from_artifact)

# HIP-based libraries.
global librocblas = get_library("librocblas";
rocm_paths, artifact_library=:rocBLAS_jll)
Expand Down