Skip to content

Commit

Permalink
Improve support for unified and host memory (#2138)
Browse files Browse the repository at this point in the history
- scalar indexing is now allowed with unified and host arrays
- construction using `cu` has been generalized, now taking
  `device`, `host` and `unified` boolean kwargs
- the default memory location can be configured using the
  `default_memory` preference (now reported by `versioninfo`)
- `unsafe_wrap` has been extended to take array inputs
- HMM support, allow conversion of unmanaged CPU memory
  to `CuArray` objects
- CI coverage for all of the above
  • Loading branch information
maleadt authored Nov 1, 2023
1 parent d11ba2a commit 76d6d06
Show file tree
Hide file tree
Showing 11 changed files with 408 additions and 166 deletions.
57 changes: 37 additions & 20 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ steps:
cuda: "*"
commands: |
echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip julia\]/
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 120
matrix:
setup:
Expand All @@ -44,7 +43,7 @@ steps:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
test_args: "core base libraries"
test_args: "--quickfail core base libraries"
- JuliaCI/julia-coverage#v1:
dirs:
- src
Expand All @@ -53,9 +52,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip cuda\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
matrix:
setup:
Expand All @@ -73,6 +70,34 @@ steps:
echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml
echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml
- group: "Memory"
key: "memory"
depends_on: "julia"
steps:
- label: "CuArray with {{matrix.memory}} memory"
plugins:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
test_args: "--quickfail core base libraries"
- JuliaCI/julia-coverage#v1:
dirs:
- src
- lib
- examples
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
matrix:
setup:
memory:
- "unified"
- "host"
commands: |
echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml
- group: ":nesting_dolls: Subpackages"
depends_on: "cuda"
steps:
Expand Down Expand Up @@ -104,9 +129,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip subpackages\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
commands: |
julia --project -e '
Expand Down Expand Up @@ -165,9 +188,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip downstream\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
Expand Down Expand Up @@ -240,9 +261,7 @@ steps:
cuda: "*"
env:
JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip sanitizer\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 10

# we want to benchmark every commit on the master branch, even if it failed CI
Expand Down Expand Up @@ -274,9 +293,8 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch !~ /^master$$/ &&
!build.pull_request.draft
if: build.message !~ /\[skip benchmarks\]/ && !build.pull_request.draft &&
build.branch !~ /^master$$/
timeout_in_minutes: 30

# if we will submit results, use the benchmark queue so that we will
Expand Down Expand Up @@ -310,8 +328,7 @@ steps:
queue: "benchmark"
gpu: "rtx2070"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch =~ /^master$$/
if: build.message !~ /\[skip benchmarks\]/ && build.branch =~ /^master$$/
matrix:
setup:
julia:
Expand Down
4 changes: 4 additions & 0 deletions LocalPreferences.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
# making it possible to use cooperative multitasking.
#nonblocking_synchronization = true

# which memory type unspecified allocations should default to.
# possible values: "device", "unified", "host"
#default_memory = "device"

[CUDA_Driver_jll]
# whether to attempt to load a forwards-compatible userspace driver.
# only turn this off if you experience issues, e.g., when using a local
Expand Down
14 changes: 7 additions & 7 deletions lib/cusparse/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs)

Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) =
adapt(CuArray, xs)
Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
adapt(CuArray{Float32}, xs)

Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs)
Expand Down Expand Up @@ -546,15 +546,15 @@ end

# interop with device arrays

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector)
return CuSparseDeviceVector(
adapt(to, x.iPtr),
adapt(to, x.nzVal),
length(x), x.nnz
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR)
return CuSparseDeviceMatrixCSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
Expand All @@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC)
return CuSparseDeviceMatrixCSC(
adapt(to, x.colPtr),
adapt(to, x.rowVal),
Expand All @@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR)
return CuSparseDeviceMatrixBSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
Expand All @@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
return CuSparseDeviceMatrixCOO(
adapt(to, x.rowInd),
adapt(to, x.colInd),
Expand Down
Loading

0 comments on commit 76d6d06

Please sign in to comment.