
Commit

Switch to KernelAbstractions.jl (#559)
Co-authored-by: James Schloss <[email protected]>
maleadt and leios authored Oct 17, 2024
1 parent 8dfd805 commit 2769d6b
Showing 29 changed files with 289 additions and 691 deletions.
6 changes: 6 additions & 0 deletions .buildkite/pipeline.yml
@@ -42,6 +42,8 @@ steps:
cuda: "*"
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 120
soft_fail:
- exit_status: 3

- label: "oneAPI.jl"
plugins:
@@ -87,6 +89,8 @@ steps:
intel: "*"
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 60
soft_fail:
- exit_status: 3

- label: "Metal.jl"
plugins:
@@ -132,6 +136,8 @@ steps:
arch: "aarch64"
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 60
soft_fail:
- exit_status: 3

env:
SECRET_CODECOV_TOKEN: "GrevHmzmr2Vt6UK4wbbTTB1+kcMcIlF6nCXVCk3Z0plHDimpD6BwdN9T2A+5J9k3I2em0xXUqpt+2qUSqM8Bn5mNdpjR0TvxVY3oYXc+qzvBXmcZJpuCgJeoTP1P+kVFwszUn4na3fohNq9Qffp6tXMn/j8yJQKOiiC8mkD0aPEI0zISHuDaa/7j7JYf0vTrMRRZ9BMUQHmFuVaIQN8FLGG2BiE3236rj4eHh0lj2IfekCG3wd/LUzAsMx0MC3kIR8WzOWW2rf6xUMPkjm5+NuHwhAOcZc0+LRM7GYIwoW/nHAgyIqjvLiInNFmaJk+7V/GAKtd+gSAIzmyBUHAy6A==;U2FsdGVkX1+4ZljneQoaNE295nRIx8D6+WoFIgT6Pg2BXHaTyhTL4sxEcG0jX0e7oq68uvi4bK7x7YMS4L0Kew=="
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -75,7 +75,7 @@ jobs:
TestEnv.activate()
catch err
@error "Could not install OpenCL.jl" exception=(err,catch_backtrace())
exit(3)
exit(0)
finally
Pkg.activate(package)
end
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,10 +1,11 @@
name = "GPUArrays"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "10.3.1"
version = "11.0.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -15,7 +16,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
Adapt = "4.0"
GPUArraysCore = "= 0.1.6"
GPUArraysCore = "= 0.2.0"
LLVM = "3.9, 4, 5, 6, 7, 8, 9"
LinearAlgebra = "1"
Printf = "1"
5 changes: 2 additions & 3 deletions docs/src/index.md
@@ -9,10 +9,9 @@ will get a lot of functionality for free. This will allow to have multiple GPUArr
implementation for different purposes, while maximizing the ability to share code.

**This package is not intended for end users!** Instead, you should use one of the packages
that builds on GPUArrays.jl. There is currently only a single package that actively builds
on these interfaces, namely [CuArrays.jl](https://github.com/JuliaGPU/CuArrays.jl).
that builds on GPUArrays.jl such as [CUDA](https://github.com/JuliaGPU/CUDA.jl), [AMDGPU](https://github.com/JuliaGPU/AMDGPU.jl), [OneAPI](https://github.com/JuliaGPU/oneAPI.jl), or [Metal](https://github.com/JuliaGPU/Metal.jl).

In this documentation, you will find more information on the interface that you are expected
This documentation is meant for users who might wish to implement a version of GPUArrays for another GPU backend and will cover the features you will need
to implement, the functionality you gain by doing so, and the test suite that is available
to verify your implementation. GPUArrays.jl also provides a reference implementation of
these interfaces on the CPU: The `JLArray` array type uses Julia's parallel programming
57 changes: 18 additions & 39 deletions docs/src/interface.md
@@ -1,53 +1,32 @@
# Interface

To extend the above functionality to a new array type, you should use the types and
implement the interfaces listed on this page. GPUArrays is design around having two
different array types to represent a GPU array: one that only ever lives on the host, and
implement the interfaces listed on this page. GPUArrays is designed around having two
different array types to represent a GPU array: one that exists only on the host, and
one that actually can be instantiated on the device (i.e. in kernels).
Device functionality is then handled by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl).

## Host abstractions

## Device functionality

Several types and interfaces are related to the device and execution of code on it. First of
all, you need to provide a type that represents your execution back-end and a way to call
kernels:
You should provide an array type that builds on the `AbstractGPUArray` supertype, such as:

```@docs
GPUArrays.AbstractGPUBackend
GPUArrays.AbstractKernelContext
GPUArrays.gpu_call
GPUArrays.thread_block_heuristic
```
mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
data::DataRef{Vector{UInt8}}
offset::Int
dims::Dims{N}
...
end
You then need to provide implementations of certain methods that will be executed on the
device itself:

```@docs
GPUArrays.AbstractDeviceArray
GPUArrays.LocalMemory
GPUArrays.synchronize_threads
GPUArrays.blockidx
GPUArrays.blockdim
GPUArrays.threadidx
GPUArrays.griddim
```

This will allow your defined type (in this case `JLArray`) to use the GPUArrays interface where available.
To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you need to define the backend, like so:

## Host abstractions

You should provide an array type that builds on the `AbstractGPUArray` supertype:

```@docs
AbstractGPUArray
```

First of all, you should implement operations that are expected to be defined for any
`AbstractArray` type. Refer to the Julia manual for more details, or look at the `JLArray`
reference implementation.

To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you
should provide implementations of the following interfaces:

```@docs
GPUArrays.backend
import KernelAbstractions: Backend
struct CustomBackend <: KernelAbstractions.GPU
KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
```

There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl).
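For orientation, the pieces the updated interface.md asks for can be combined into one short sketch. This is a hedged, minimal example, not code from the repository: the `Vector`-backed storage field and the `Base.size` method are illustrative assumptions added so the snippet is self-contained, while `CustomArray`, `CustomBackend`, and `KernelAbstractions.get_backend` come from the documentation excerpt above.

```julia
using GPUArrays
import KernelAbstractions

# Host-side array type: a real backend would wrap a device memory handle here;
# a plain Vector is used only to keep this sketch self-contained.
mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
    data::Vector{T}
    dims::Dims{N}
end

Base.size(a::CustomArray) = a.dims

# Execution backend: subtyping KernelAbstractions.GPU lets @kernel functions
# target this array type.
struct CustomBackend <: KernelAbstractions.GPU end

# Map arrays to their backend so generic GPUArrays/KernelAbstractions code can
# pick the right launch machinery.
KernelAbstractions.get_backend(::CustomArray) = CustomBackend()
```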
2 changes: 1 addition & 1 deletion lib/GPUArraysCore/Project.toml
@@ -1,7 +1,7 @@
name = "GPUArraysCore"
uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
authors = ["Tim Besard <[email protected]>"]
version = "0.1.6"
version = "0.2.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
15 changes: 0 additions & 15 deletions lib/GPUArraysCore/src/GPUArraysCore.jl
@@ -209,19 +209,4 @@ macro allowscalar(ex)
end
end


## other

"""
backend(x)
backend(T::Type)
Gets the GPUArrays back-end responsible for managing arrays of type `T`.
"""
backend(::Type) = error("This object is not a GPU array") # COV_EXCL_LINE
backend(x) = backend(typeof(x))

# WrappedArray from Adapt for Base wrappers.
backend(::Type{WA}) where WA<:WrappedArray = backend(unwrap_type(WA))

end # module GPUArraysCore
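The `backend` query deleted here is superseded by `KernelAbstractions.get_backend`, which each array type now implements (see the JLArrays changes below). A hedged migration sketch for downstream code; the `jl` array and the 1024-element size are arbitrary examples:

```julia
using JLArrays            # reference CPU implementation shipped with GPUArrays
import KernelAbstractions

x = jl(rand(Float32, 1024))

# Before this commit:  b = GPUArrays.backend(typeof(x))
# After this commit, the backend is queried through KernelAbstractions:
b = KernelAbstractions.get_backend(x)         # JLBackend()

# The backend object is what generic code hands to allocation and launch APIs:
y = KernelAbstractions.allocate(b, Float32, (1024,))
```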
5 changes: 3 additions & 2 deletions lib/JLArrays/Project.toml
@@ -6,10 +6,11 @@ version = "0.1.6"
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[compat]
Adapt = "2.0, 3.0, 4.0"
GPUArrays = "10"
julia = "1.8"
GPUArrays = "11"
Random = "1"
julia = "1.8"
150 changes: 57 additions & 93 deletions lib/JLArrays/src/JLArrays.jl
@@ -1,53 +1,30 @@
# reference implementation on the CPU

# note that most of the code in this file serves to define a functional array type,
# the actual implementation of GPUArrays-interfaces is much more limited.
# This acts as a wrapper around KernelAbstractions's parallel CPU
# functionality. It is useful for testing GPUArrays (and other packages)
# when no GPU is present.
# This file follows conventions from AMDGPU.jl

module JLArrays

export JLArray, JLVector, JLMatrix, jl
export JLArray, JLVector, JLMatrix, jl, JLBackend

using GPUArrays

using Adapt

import KernelAbstractions
import KernelAbstractions: Adapt, StaticArrays, Backend, Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config


#
# Device functionality
#

const MAXTHREADS = 256


## execution

struct JLBackend <: AbstractGPUBackend end

mutable struct JLKernelContext <: AbstractKernelContext
blockdim::Int
griddim::Int
blockidx::Int
threadidx::Int

localmem_counter::Int
localmems::Vector{Vector{Array}}
end

function JLKernelContext(threads::Int, blockdim::Int)
blockcount = prod(blockdim)
lmems = [Vector{Array}() for i in 1:blockcount]
JLKernelContext(threads, blockdim, 1, 1, 0, lmems)
end

function JLKernelContext(ctx::JLKernelContext, threadidx::Int)
JLKernelContext(
ctx.blockdim,
ctx.griddim,
ctx.blockidx,
threadidx,
0,
ctx.localmems
)
struct JLBackend <: KernelAbstractions.GPU
static::Bool
JLBackend(;static::Bool=false) = new(static)
end

struct Adaptor end
Expand All @@ -60,27 +37,6 @@ end
Base.getindex(r::JlRefValue) = r.x
Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[]))

function GPUArrays.gpu_call(::JLBackend, f, args, threads::Int, blocks::Int;
name::Union{String,Nothing})
ctx = JLKernelContext(threads, blocks)
device_args = jlconvert.(args)
tasks = Array{Task}(undef, threads)
for blockidx in 1:blocks
ctx.blockidx = blockidx
for threadidx in 1:threads
thread_ctx = JLKernelContext(ctx, threadidx)
tasks[threadidx] = @async f(thread_ctx, device_args...)
# TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
# (this would require a different synchronization mechanism)
end
for t in tasks
fetch(t)
end
end
return
end


## executed on-device

# array type
@@ -108,42 +64,6 @@ end
@inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(typed_data(A), x, index)


# indexing

for f in (:blockidx, :blockdim, :threadidx, :griddim)
@eval GPUArrays.$f(ctx::JLKernelContext) = ctx.$f
end

# memory

function GPUArrays.LocalMemory(ctx::JLKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
ctx.localmem_counter += 1
lmems = ctx.localmems[blockidx(ctx)]

# first invocation in block
data = if length(lmems) < ctx.localmem_counter
lmem = fill(zero(T), dims)
push!(lmems, lmem)
lmem
else
lmems[ctx.localmem_counter]
end

N = length(dims)
JLDeviceArray{T,N}(data, tuple(dims...))
end

# synchronization

@inline function GPUArrays.synchronize_threads(::JLKernelContext)
# All threads are getting started asynchronously, so a yield will yield to the next
# execution of the same function, which should call yield at the exact same point in the
# program, leading to a chain of yields effectively syncing the tasks (threads).
yield()
return
end


#
# Host abstractions
#
@@ -409,8 +329,6 @@ end

## GPUArrays interfaces

GPUArrays.backend(::Type{<:JLArray}) = JLBackend()

Adapt.adapt_storage(::Adaptor, x::JLArray{T,N}) where {T,N} =
JLDeviceArray{T,N}(x.data[], x.offset, x.dims)

@@ -423,4 +341,50 @@ function GPUArrays.mapreducedim!(f, op, R::AnyJLArray, A::Union{AbstractArray,Br
R
end

## KernelAbstractions interface

KernelAbstractions.get_backend(a::JLA) where JLA <: JLArray = JLBackend()

function KernelAbstractions.mkcontext(kernel::Kernel{JLBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
return KernelAbstractions.CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

KernelAbstractions.allocate(::JLBackend, ::Type{T}, dims::Tuple) where T = JLArray{T}(undef, dims)

@inline function launch_config(kernel::Kernel{JLBackend}, ndrange, workgroupsize)
if ndrange isa Integer
ndrange = (ndrange,)
end
if workgroupsize isa Integer
workgroupsize = (workgroupsize, )
end

if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
workgroupsize = (1024,) # Vectorization, 4x unrolling, minimal grain size
end
iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
# partition checked that the ndrange's agreed
if KernelAbstractions.ndrange(kernel) <: StaticSize
ndrange = nothing
end

return ndrange, workgroupsize, iterspace, dynamic
end

KernelAbstractions.isgpu(b::JLBackend) = false

function convert_to_cpu(obj::Kernel{JLBackend, W, N, F}) where {W, N, F}
return Kernel{typeof(KernelAbstractions.CPU(; static = obj.backend.static)), W, N, F}(KernelAbstractions.CPU(; static = obj.backend.static), obj.f)
end

function (obj::Kernel{JLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
device_args = jlconvert.(args)
new_obj = convert_to_cpu(obj)
new_obj(device_args...; ndrange, workgroupsize)
end

Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a)
Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a
Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a)

end
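For orientation, a hedged usage sketch of the new code path added above: a KernelAbstractions kernel launched on `JLArray`s is routed through `get_backend`/`JLBackend` and converted by the `Kernel{JLBackend}` method to run on the KernelAbstractions CPU backend. The kernel body, names, and sizes are arbitrary examples, not code from the repository:

```julia
using JLArrays
using KernelAbstractions

# A trivial KernelAbstractions kernel: elementwise multiply-add.
@kernel function muladd_kernel!(c, a, b, alpha)
    i = @index(Global)
    c[i] = alpha * a[i] + b[i]
end

a = jl(rand(Float32, 256))
b = jl(rand(Float32, 256))
c = similar(a)

backend = KernelAbstractions.get_backend(a)   # JLBackend()
kernel! = muladd_kernel!(backend)             # instantiate the kernel for this backend
kernel!(c, a, b, 2f0; ndrange = length(c))    # dispatches to (obj::Kernel{JLBackend})(args...) above
```

Because `convert_to_cpu` rewraps the kernel for `KernelAbstractions.CPU`, the call above runs synchronously on CPU tasks; no explicit device synchronization is needed in this sketch.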

4 comments on commit 2769d6b

@maleadt (Member Author)

@JuliaRegistrator register subdir="lib/GPUArraysCore"

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/117484

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a GPUArraysCore-v0.2.0 -m "<description of version>" 2769d6b19d0510019df930cb9e25b4858ac44dc0
git push origin GPUArraysCore-v0.2.0

@maleadt (Member Author)

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/117486


Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v11.0.0 -m "<description of version>" 2769d6b19d0510019df930cb9e25b4858ac44dc0
git push origin v11.0.0
