diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1db6e05248..9196a4ef5f 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -246,8 +246,9 @@ steps:
        build.message !~ /\[only/ && !build.pull_request.draft &&
        build.message !~ /\[skip tests\]/ &&
        build.message !~ /\[skip downstream\]/
-      timeout_in_minutes: 30
-      soft_fail: true
+      timeout_in_minutes: 60
+      soft_fail:
+        - exit_status: 3
 
 - group: ":eyes: Special"
   depends_on: "cuda"
diff --git a/.github/workflows/Container.yml b/.github/workflows/Container.yml
index 0b2ce03676..307fc8882c 100644
--- a/.github/workflows/Container.yml
+++ b/.github/workflows/Container.yml
@@ -1,4 +1,4 @@
-name: Publish Docker image
+name: Publish Docker container
 
 on:
   workflow_dispatch:
@@ -20,7 +20,7 @@ on:
 
 jobs:
   push_to_registry:
-    name: Build container - Julia ${{ matrix.julia }} - CUDA ${{ matrix.cuda }}
+    name: Container for ${{ matrix.platform }} - Julia ${{ matrix.julia }} - CUDA ${{ matrix.cuda }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -30,9 +30,15 @@ jobs:
       matrix:
         julia: ["1.10", "1.11"]
         cuda: ["11.8", "12.6"]
+        platform: ["linux/amd64", "linux/arm64"]
         include:
           - julia: "1.11"
             cuda: "12.6"
+            platform: "linux/amd64"
+            default: true
+          - julia: "1.11"
+            cuda: "12.6"
+            platform: "linux/arm64"
             default: true
 
     steps:
@@ -68,6 +74,15 @@ jobs:
           CUDA_MAJOR=$(echo ${{ matrix.cuda }} | cut -d'.' -f1)
           echo "major=${CUDA_MAJOR}" >> $GITHUB_OUTPUT
 
+      - name: Set CPU target
+        id: cpu_target
+        run: |
+          if [[ "${{ matrix.platform }}" == "linux/amd64" ]]; then
+            echo "target=generic;sandybridge,-xsaveopt,clone_all;haswell,-rdrnd,base(1)" >> $GITHUB_OUTPUT
+          elif [[ "${{ matrix.platform }}" == "linux/arm64" ]]; then
+            echo "target=generic;cortex-a57;thunderx2t99;carmel,clone_all;apple-m1,base(3);neoverse-512tvb,base(3)" >> $GITHUB_OUTPUT
+          fi
+
       - name: Log in to registry
         uses: docker/login-action@v3
         with:
@@ -97,9 +112,11 @@ jobs:
           context: .
           push: true
           provenance: false # the build fetches the repo again, so provenance tracking is not useful
+          platforms: ${{ matrix.platform }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           build-args: |
             JULIA_VERSION=${{ matrix.julia }}
             CUDA_VERSION=${{ matrix.cuda }}
             PACKAGE_SPEC=CUDA#${{ steps.pkg.outputs.ref }}
+            JULIA_CPU_TARGET=${{ steps.cpu_target.outputs.target }}
diff --git a/Dockerfile b/Dockerfile
index ddf30358f0..74dd18c4bd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,6 +10,9 @@ ARG JULIA_VERSION=1
 
 FROM julia:${JULIA_VERSION}
 
+ARG JULIA_CPU_TARGET=native
+ENV JULIA_CPU_TARGET=${JULIA_CPU_TARGET}
+
 ARG CUDA_VERSION=12.6
 ARG PACKAGE_SPEC=CUDA
 
diff --git a/ext/EnzymeCoreExt.jl b/ext/EnzymeCoreExt.jl
index fb1003eaf9..d047f83222 100644
--- a/ext/EnzymeCoreExt.jl
+++ b/ext/EnzymeCoreExt.jl
@@ -25,6 +25,9 @@ end
 function EnzymeCore.EnzymeRules.inactive_noinl(::typeof(CUDA.is_pinned), args...)
     return nothing
 end
+function EnzymeCore.EnzymeRules.inactive_noinl(::typeof(CUDA.device_synchronize), args...)
+    return nothing
+end
 function EnzymeCore.EnzymeRules.inactive(::typeof(CUDA.launch_configuration), args...; kwargs...)
     return nothing
 end
@@ -555,6 +558,59 @@ function EnzymeCore.EnzymeRules.noalias(::Type{CT}, ::UndefInitializer, args...)
     return nothing
 end
 
+@inline function EnzymeCore.make_zero(
+    x::DenseCuArray{FT},
+) where {FT<:AbstractFloat}
+    return Base.zero(x)
+end
+@inline function EnzymeCore.make_zero(
+    x::DenseCuArray{Complex{FT}},
+) where {FT<:AbstractFloat}
+    return Base.zero(x)
+end
+
+@inline function EnzymeCore.make_zero(
+    ::Type{CT},
+    seen::IdDict,
+    prev::CT,
+    ::Val{copy_if_inactive} = Val(false),
+)::CT where {copy_if_inactive, FT<:AbstractFloat, CT <: Union{DenseCuArray{FT},DenseCuArray{Complex{FT}}}}
+    if haskey(seen, prev)
+        return seen[prev]
+    end
+    newa = Base.zero(prev)
+    seen[prev] = newa
+    return newa
+end
+
+@inline function EnzymeCore.make_zero!(
+    prev::DenseCuArray{FT},
+    seen::ST,
+)::Nothing where {FT<:AbstractFloat,ST}
+    if !isnothing(seen)
+        if prev in seen
+            return nothing
+        end
+        push!(seen, prev)
+    end
+    fill!(prev, zero(FT))
+    return nothing
+end
+
+@inline function EnzymeCore.make_zero!(
+    prev::DenseCuArray{Complex{FT}},
+    seen::ST,
+)::Nothing where {FT<:AbstractFloat,ST}
+    if !isnothing(seen)
+        if prev in seen
+            return nothing
+        end
+        push!(seen, prev)
+    end
+    fill!(prev, zero(Complex{FT}))
+    return nothing
+end
+
 function EnzymeCore.EnzymeRules.forward(config, ofn::Const{typeof(GPUArrays.mapreducedim!)},
                                         ::Type{RT},
                                         f::EnzymeCore.Const{typeof(Base.identity)},
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index 11e56d1530..4ec187f242 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -1215,8 +1215,20 @@ end
 @inline function unsafe_strided_batch(strided::DenseCuArray{T}) where {T}
     batchsize = last(size(strided))
     stride = prod(size(strided)[1:end-1])
-    ptrs = [pointer(strided, (i-1)*stride + 1) for i in 1:batchsize]
-    return CuArray(ptrs)
+
+    ptrs = CuArray{CuPtr{T}}(undef, batchsize)
+    nblocks = cld(batchsize, 256)
+    @cuda threads = 256 blocks = nblocks create_ptrs_kernel!(ptrs, strided, stride)
+    return ptrs
+end
+
+function create_ptrs_kernel!(ptrs::CuDeviceArray{T}, A, batch_stride) where {T}
+    index = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+    for i in index:stride:length(ptrs)
+        ptrs[i] = reinterpret(CuPtr{T}, pointer(A, (i - 1i32) * batch_stride + 1i32))
+    end
+    return nothing
 end
 
 ## (GE) general matrix-matrix multiplication grouped batched
diff --git a/lib/utils/cache.jl b/lib/utils/cache.jl
index 0abf92a436..9a7da297f5 100644
--- a/lib/utils/cache.jl
+++ b/lib/utils/cache.jl
@@ -40,7 +40,7 @@ function Base.pop!(cache::HandleCache{K,V}, key::K) where {K,V}
     if handle === nothing && num_active_handles > cache.max_entries
         GC.gc(false)
         @lock cache.lock begin
-            if haskey(cache.idle_handles, key) && isempty(cache.idle_handles[key])
+            if haskey(cache.idle_handles, key) && !isempty(cache.idle_handles[key])
                 handle = pop!(cache.idle_handles[key])
             end
         end
diff --git a/test/extensions/enzyme.jl b/test/extensions/enzyme.jl
index 3e34af70a5..75f452d36b 100644
--- a/test/extensions/enzyme.jl
+++ b/test/extensions/enzyme.jl
@@ -7,6 +7,15 @@ using CUDA
     @test EnzymeCore.compiler_job_from_backend(CUDABackend(), typeof(()->nothing), Tuple{}) isa GPUCompiler.CompilerJob
 end
 
+@testset "Make_zero" begin
+    A = CUDA.ones(64)
+    dA = Enzyme.make_zero(A)
+    @test all(dA .≈ 0)
+    dA = CUDA.ones(64)
+    Enzyme.make_zero!(dA)
+    @test all(dA .≈ 0)
+end
+
 function square_kernel!(x)
     i = threadIdx().x
     x[i] *= x[i]
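
Usage sketch (illustrative, not part of the patch): the new make_zero/make_zero! methods give Enzyme device-resident shadow arrays that are allocated and reset with fill! on the GPU instead of round-tripping through the host. The snippet below mirrors the "Make_zero" testset above and additionally assumes Enzyme's standard Reverse/Active/Duplicated autodiff API and CUDA.rand, none of which are introduced by this patch.

    using CUDA, Enzyme

    x  = CUDA.rand(Float32, 64)
    dx = Enzyme.make_zero(x)       # zero-initialized shadow, allocated on the device

    # reverse-mode AD through a scalar reduction; the gradient of sum is 1 for every element,
    # and it accumulates into the zeroed shadow
    Enzyme.autodiff(Reverse, sum, Active, Duplicated(x, dx))
    @assert all(dx .≈ 1)

    Enzyme.make_zero!(dx)          # reset the shadow in place before the next call
    @assert all(dx .≈ 0)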