Skip to content

Commit

Permalink
Improve support for unified and host memory (#2138)
Browse files Browse the repository at this point in the history
- scalar indexing is now allowed with unified and host arrays
- construction using `cu` has been generalized, now taking
  `device`, `host` and `unified` boolean kwargs
- the default memory location can be configured using the
  `default_memory` preference (now reported by `versioninfo`)
- `unsafe_wrap` has been extended to take array inputs
- HMM support, allow conversion of unmanaged CPU memory
  to `CuArray` objects
- CI coverage for all of the above
  • Loading branch information
maleadt authored Nov 1, 2023
1 parent d11ba2a commit 76d6d06
Show file tree
Hide file tree
Showing 11 changed files with 408 additions and 166 deletions.
57 changes: 37 additions & 20 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ steps:
cuda: "*"
commands: |
echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip julia\]/
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 120
matrix:
setup:
Expand All @@ -44,7 +43,7 @@ steps:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
test_args: "core base libraries"
test_args: "--quickfail core base libraries"
- JuliaCI/julia-coverage#v1:
dirs:
- src
Expand All @@ -53,9 +52,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip cuda\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
matrix:
setup:
Expand All @@ -73,6 +70,34 @@ steps:
echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml
echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml
- group: "Memory"
key: "memory"
depends_on: "julia"
steps:
- label: "CuArray with {{matrix.memory}} memory"
plugins:
- JuliaCI/julia#v1:
version: 1.9
- JuliaCI/julia-test#v1:
test_args: "--quickfail core base libraries"
- JuliaCI/julia-coverage#v1:
dirs:
- src
- lib
- examples
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
matrix:
setup:
memory:
- "unified"
- "host"
commands: |
echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml
- group: ":nesting_dolls: Subpackages"
depends_on: "cuda"
steps:
Expand Down Expand Up @@ -104,9 +129,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip subpackages\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
commands: |
julia --project -e '
Expand Down Expand Up @@ -165,9 +188,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip downstream\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
Expand Down Expand Up @@ -240,9 +261,7 @@ steps:
cuda: "*"
env:
JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda
if: build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip sanitizer\]/ &&
!build.pull_request.draft
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 10

# we want to benchmark every commit on the master branch, even if it failed CI
Expand Down Expand Up @@ -274,9 +293,8 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch !~ /^master$$/ &&
!build.pull_request.draft
if: build.message !~ /\[skip benchmarks\]/ && !build.pull_request.draft &&
build.branch !~ /^master$$/
timeout_in_minutes: 30

# if we will submit results, use the benchmark queue so that we will
Expand Down Expand Up @@ -310,8 +328,7 @@ steps:
queue: "benchmark"
gpu: "rtx2070"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch =~ /^master$$/
if: build.message !~ /\[skip benchmarks\]/ && build.branch =~ /^master$$/
matrix:
setup:
julia:
Expand Down
4 changes: 4 additions & 0 deletions LocalPreferences.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
# making it possible to use cooperative multitasking.
#nonblocking_synchronization = true

# which memory type unspecified allocations should default to.
# possible values: "device", "unified", "host"
#default_memory = "device"

[CUDA_Driver_jll]
# whether to attempt to load a forwards-compatible userspace driver.
# only turn this off if you experience issues, e.g., when using a local
Expand Down
14 changes: 7 additions & 7 deletions lib/cusparse/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs)
Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs)

Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) =
adapt(CuArray, xs)
Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) =
adapt(CuArray{Float32}, xs)

Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs)
Expand Down Expand Up @@ -546,15 +546,15 @@ end

# interop with device arrays

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector)
return CuSparseDeviceVector(
adapt(to, x.iPtr),
adapt(to, x.nzVal),
length(x), x.nnz
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR)
return CuSparseDeviceMatrixCSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
Expand All @@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC)
return CuSparseDeviceMatrixCSC(
adapt(to, x.colPtr),
adapt(to, x.rowVal),
Expand All @@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR)
return CuSparseDeviceMatrixBSR(
adapt(to, x.rowPtr),
adapt(to, x.colVal),
Expand All @@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR)
)
end

function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO)
function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO)
return CuSparseDeviceMatrixCOO(
adapt(to, x.rowInd),
adapt(to, x.colInd),
Expand Down
Loading

0 comments on commit 76d6d06

Please sign in to comment.