diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1db6e05248..9196a4ef5f 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -246,8 +246,9 @@ steps:
        build.message !~ /\[only/ && !build.pull_request.draft &&
        build.message !~ /\[skip tests\]/ &&
        build.message !~ /\[skip downstream\]/
-      timeout_in_minutes: 30
-      soft_fail: true
+      timeout_in_minutes: 60
+      soft_fail:
+        - exit_status: 3
 
 - group: ":eyes: Special"
   depends_on: "cuda"
diff --git a/.github/workflows/Container.yml b/.github/workflows/Container.yml
index 0b2ce03676..307fc8882c 100644
--- a/.github/workflows/Container.yml
+++ b/.github/workflows/Container.yml
@@ -1,4 +1,4 @@
-name: Publish Docker image
+name: Publish Docker container
 
 on:
   workflow_dispatch:
@@ -20,7 +20,7 @@ on:
 
 jobs:
   push_to_registry:
-    name: Build container - Julia ${{ matrix.julia }} - CUDA ${{ matrix.cuda }}
+    name: Container for ${{ matrix.platform }} - Julia ${{ matrix.julia }} - CUDA ${{ matrix.cuda }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -30,9 +30,15 @@ jobs:
       matrix:
         julia: ["1.10", "1.11"]
         cuda: ["11.8", "12.6"]
+        platform: ["linux/amd64", "linux/arm64"]
         include:
           - julia: "1.11"
             cuda: "12.6"
+            platform: "linux/amd64"
+            default: true
+          - julia: "1.11"
+            cuda: "12.6"
+            platform: "linux/arm64"
             default: true
 
     steps:
@@ -68,6 +74,15 @@ jobs:
           CUDA_MAJOR=$(echo ${{ matrix.cuda }} | cut -d'.' -f1)
           echo "major=${CUDA_MAJOR}" >> $GITHUB_OUTPUT
 
+      - name: Set CPU target
+        id: cpu_target
+        run: |
+          if [[ "${{ matrix.platform }}" == "linux/amd64" ]]; then
+            echo "target=generic;sandybridge,-xsaveopt,clone_all;haswell,-rdrnd,base(1)" >> $GITHUB_OUTPUT
+          elif [[ "${{ matrix.platform }}" == "linux/arm64" ]]; then
+            echo "target=generic;cortex-a57;thunderx2t99;carmel,clone_all;apple-m1,base(3);neoverse-512tvb,base(3)" >> $GITHUB_OUTPUT
+          fi
+
       - name: Log in to registry
         uses: docker/login-action@v3
         with:
@@ -97,9 +112,11 @@ jobs:
           context: .
           push: true
           provenance: false # the build fetches the repo again, so provenance tracking is not useful
+          platforms: ${{ matrix.platform }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           build-args: |
             JULIA_VERSION=${{ matrix.julia }}
             CUDA_VERSION=${{ matrix.cuda }}
             PACKAGE_SPEC=CUDA#${{ steps.pkg.outputs.ref }}
+            JULIA_CPU_TARGET=${{ steps.cpu_target.outputs.target }}
diff --git a/Dockerfile b/Dockerfile
index ddf30358f0..74dd18c4bd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,6 +10,9 @@ ARG JULIA_VERSION=1
 
 FROM julia:${JULIA_VERSION}
 
+ARG JULIA_CPU_TARGET=native
+ENV JULIA_CPU_TARGET=${JULIA_CPU_TARGET}
+
 ARG CUDA_VERSION=12.6
 ARG PACKAGE_SPEC=CUDA
 
diff --git a/ext/EnzymeCoreExt.jl b/ext/EnzymeCoreExt.jl
index fb1003eaf9..d047f83222 100644
--- a/ext/EnzymeCoreExt.jl
+++ b/ext/EnzymeCoreExt.jl
@@ -25,6 +25,9 @@ end
 function EnzymeCore.EnzymeRules.inactive_noinl(::typeof(CUDA.is_pinned), args...)
     return nothing
 end
+function EnzymeCore.EnzymeRules.inactive_noinl(::typeof(CUDA.device_synchronize), args...)
+    return nothing
+end
 function EnzymeCore.EnzymeRules.inactive(::typeof(CUDA.launch_configuration), args...; kwargs...)
     return nothing
 end
@@ -555,6 +558,59 @@ function EnzymeCore.EnzymeRules.noalias(::Type{CT}, ::UndefInitializer, args...)
     return nothing
 end
 
+@inline function EnzymeCore.make_zero(
+    x::DenseCuArray{FT},
+) where {FT<:AbstractFloat}
+    return Base.zero(x)
+end
+@inline function EnzymeCore.make_zero(
+    x::DenseCuArray{Complex{FT}},
+) where {FT<:AbstractFloat}
+    return Base.zero(x)
+end
+
+@inline function EnzymeCore.make_zero(
+    ::Type{CT},
+    seen::IdDict,
+    prev::CT,
+    ::Val{copy_if_inactive} = Val(false),
+)::CT where {copy_if_inactive, FT<:AbstractFloat, CT <: Union{DenseCuArray{FT},DenseCuArray{Complex{FT}}}}
+    if haskey(seen, prev)
+        return seen[prev]
+    end
+    newa = Base.zero(prev)
+    seen[prev] = newa
+    return newa
+end
+
+@inline function EnzymeCore.make_zero!(
+    prev::DenseCuArray{FT},
+    seen::ST,
+)::Nothing where {FT<:AbstractFloat,ST}
+    if !isnothing(seen)
+        if prev in seen
+            return nothing
+        end
+        push!(seen, prev)
+    end
+    fill!(prev, zero(FT))
+    return nothing
+end
+
+@inline function EnzymeCore.make_zero!(
+    prev::DenseCuArray{Complex{FT}},
+    seen::ST,
+)::Nothing where {FT<:AbstractFloat,ST}
+    if !isnothing(seen)
+        if prev in seen
+            return nothing
+        end
+        push!(seen, prev)
+    end
+    fill!(prev, zero(Complex{FT}))
+    return nothing
+end
+
 function EnzymeCore.EnzymeRules.forward(config, ofn::Const{typeof(GPUArrays.mapreducedim!)},
                                         ::Type{RT},
                                         f::EnzymeCore.Const{typeof(Base.identity)},
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index 11e56d1530..4ec187f242 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -1215,8 +1215,20 @@ end
 @inline function unsafe_strided_batch(strided::DenseCuArray{T}) where {T}
     batchsize = last(size(strided))
     stride = prod(size(strided)[1:end-1])
-    ptrs = [pointer(strided, (i-1)*stride + 1) for i in 1:batchsize]
-    return CuArray(ptrs)
+
+    ptrs = CuArray{CuPtr{T}}(undef, batchsize)
+    nblocks = cld(batchsize, 256)
+    @cuda threads = 256 blocks = nblocks create_ptrs_kernel!(ptrs, strided, stride)
+    return ptrs
+end
+
+function create_ptrs_kernel!(ptrs::CuDeviceArray{T}, A, batch_stride) where {T}
+    index = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+    for i in index:stride:length(ptrs)
+        ptrs[i] = reinterpret(CuPtr{T}, pointer(A, (i - 1i32) * batch_stride + 1i32))
+    end
+    return nothing
 end
 
 ## (GE) general matrix-matrix multiplication grouped batched
diff --git a/lib/utils/cache.jl b/lib/utils/cache.jl
index 0abf92a436..9a7da297f5 100644
--- a/lib/utils/cache.jl
+++ b/lib/utils/cache.jl
@@ -40,7 +40,7 @@ function Base.pop!(cache::HandleCache{K,V}, key::K) where {K,V}
     if handle === nothing && num_active_handles > cache.max_entries
         GC.gc(false)
         @lock cache.lock begin
-            if haskey(cache.idle_handles, key) && isempty(cache.idle_handles[key])
+            if haskey(cache.idle_handles, key) && !isempty(cache.idle_handles[key])
                 handle = pop!(cache.idle_handles[key])
             end
         end
diff --git a/test/extensions/enzyme.jl b/test/extensions/enzyme.jl
index 3e34af70a5..75f452d36b 100644
--- a/test/extensions/enzyme.jl
+++ b/test/extensions/enzyme.jl
@@ -7,6 +7,15 @@ using CUDA
     @test EnzymeCore.compiler_job_from_backend(CUDABackend(), typeof(()->nothing), Tuple{}) isa GPUCompiler.CompilerJob
 end
 
+@testset "Make_zero" begin
+    A = CUDA.ones(64)
+    dA = Enzyme.make_zero(A)
+    @test all(dA .≈ 0)
+    dA = CUDA.ones(64)
+    Enzyme.make_zero!(dA)
+    @test all(dA .≈ 0)
+end
+
 function square_kernel!(x)
     i = threadIdx().x
     x[i] *= x[i]
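
Usage sketch (illustrative, not part of the patch): the new make_zero/make_zero! methods give Enzyme device-resident shadow arrays that are allocated and reset with fill! on the GPU instead of round-tripping through the host. The snippet below mirrors the "Make_zero" testset above and additionally assumes Enzyme's standard Reverse/Active/Duplicated autodiff API and CUDA.rand, none of which are introduced by this patch.

    using CUDA, Enzyme

    x  = CUDA.rand(Float32, 64)
    dx = Enzyme.make_zero(x)       # zero-initialized shadow, allocated on the device

    # reverse-mode AD through a scalar reduction; the gradient of sum is 1 for every element,
    # and it accumulates into the zeroed shadow
    Enzyme.autodiff(Reverse, sum, Active, Duplicated(x, dx))
    @assert all(dx .≈ 1)

    Enzyme.make_zero!(dx)          # reset the shadow in place before the next call
    @assert all(dx .≈ 0)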