From ece7ba2b33564dd4705f573da9cf382f6231f09c Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Fri, 25 Oct 2024 16:56:02 -0400
Subject: [PATCH] fix: correctly handle adjoints of wrapped arrays (#90)

* fix: correctly handle adjoints of wrapped arrays

* fix: use fast paths for adapt

* fix: adapt ranges to https://github.com/JuliaGPU/Adapt.jl/pull/86

---
 Project.toml                          |  6 ++----
 ext/MLDataDevicesChainRulesCoreExt.jl | 21 ++++++++++++---------
 src/MLDataDevices.jl                  |  1 -
 src/public.jl                         | 16 +++++-----------
 test/amdgpu_tests.jl                  |  4 ++--
 test/cuda_tests.jl                    |  4 ++--
 test/metal_tests.jl                   |  4 ++--
 test/misc_tests.jl                    | 22 +++++++++++++++++-----
 test/oneapi_tests.jl                  |  4 ++--
 9 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/Project.toml b/Project.toml
index c85cb0d..68d4325 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,13 +1,12 @@
 name = "MLDataDevices"
 uuid = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
 authors = ["Avik Pal and contributors"]
-version = "1.4.1"
+version = "1.4.2"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
@@ -47,14 +46,13 @@ MLDataDevicesoneAPIExt = ["GPUArrays", "oneAPI"]
 
 [compat]
 AMDGPU = "0.9.6, 1"
-Adapt = "4"
+Adapt = "4.1"
 CUDA = "5.2"
 ChainRulesCore = "1.23"
 Compat = "4.15"
 FillArrays = "1"
 Functors = "0.4.8"
 GPUArrays = "10, 11"
-LinearAlgebra = "1.10"
 MLUtils = "0.4.4"
 Metal = "1"
 Preferences = "1.4"
diff --git a/ext/MLDataDevicesChainRulesCoreExt.jl b/ext/MLDataDevicesChainRulesCoreExt.jl
index 6a770b8..518ff20 100644
--- a/ext/MLDataDevicesChainRulesCoreExt.jl
+++ b/ext/MLDataDevicesChainRulesCoreExt.jl
@@ -1,24 +1,27 @@
 module MLDataDevicesChainRulesCoreExt
 
 using Adapt: Adapt
-using ChainRulesCore: ChainRulesCore, NoTangent, @non_differentiable
+using ChainRulesCore: ChainRulesCore, NoTangent, ProjectTo, @non_differentiable
 
 using MLDataDevices: AbstractDevice, UnknownDevice, get_device, get_device_type
 
 @non_differentiable get_device(::Any)
 @non_differentiable get_device_type(::Any)
 
-function ChainRulesCore.rrule(
-        ::typeof(Adapt.adapt_storage), to::AbstractDevice, x::AbstractArray)
-    ∇adapt_storage = let dev = get_device(x)
-        if dev === nothing || dev isa UnknownDevice
+function ChainRulesCore.rrule(::typeof(Adapt.adapt), to::AbstractDevice, x::AbstractArray)
+    dev = get_device(x)
+    y = Adapt.adapt_storage(to, x)
+    if dev === nothing || dev isa UnknownDevice
+        dev isa UnknownDevice &&
             @warn "`get_device(::$(typeof(x)))` returned `$(dev)`." maxlog=1
-            Δ -> (NoTangent(), NoTangent(), Δ)
-        else
-            Δ -> (NoTangent(), NoTangent(), dev(Δ))
+        ∇adapt_storage_unknown = Δ -> (NoTangent(), NoTangent(), Δ)
+        return y, ∇adapt_storage_unknown
+    else
+        ∇adapt_storage = let dev = dev, x = x
+            Δ -> (NoTangent(), NoTangent(), ProjectTo(x)(dev(Δ)))
         end
+        return Adapt.adapt_storage(to, x), ∇adapt_storage
     end
-    return Adapt.adapt_storage(to, x), ∇adapt_storage
 end
 
 end
diff --git a/src/MLDataDevices.jl b/src/MLDataDevices.jl
index c837887..108d8bf 100644
--- a/src/MLDataDevices.jl
+++ b/src/MLDataDevices.jl
@@ -5,7 +5,6 @@ using Functors: Functors, fleaves
 using Preferences: @delete_preferences!, @load_preference, @set_preferences!
 using Random: AbstractRNG, Random
 using Compat: @compat
-using LinearAlgebra: Transpose, Adjoint
 
 abstract type AbstractDevice <: Function end
 abstract type AbstractCPUDevice <: AbstractDevice end
diff --git a/src/public.jl b/src/public.jl
index 104a424..6440ddb 100644
--- a/src/public.jl
+++ b/src/public.jl
@@ -342,8 +342,10 @@ for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal, :oneAPI, :XLA)
     ldev = Symbol(dev, :Device)
     @eval begin
         function (D::$(ldev))(x::AbstractArray{T}) where {T}
-            return (isbitstype(T) || Internal.special_aos(x)) ? Adapt.adapt(D, x) :
-                   map(D, x)
+            if isbitstype(T) || Internal.special_aos(x) || x isa Adapt.WrappedArray
+                return Adapt.adapt(D, x)
+            end
+            return map(D, x)
         end
         (D::$(ldev))(x::Union{Tuple, NamedTuple}) = map(D, x)
         function (D::$(ldev))(x)
@@ -373,14 +375,6 @@ for T in (AMDGPUDevice, CUDADevice, MetalDevice, oneAPIDevice, XLADevice)
     end
 end
 
-Adapt.adapt_storage(::CPUDevice, x::AbstractRange) = x
-Adapt.adapt_storage(::XLADevice, x::AbstractRange) = x
-# Prevent Ambiguity
-for T in (AMDGPUDevice, AMDGPUDevice{Nothing}, CUDADevice,
-    CUDADevice{Nothing}, MetalDevice, oneAPIDevice)
-    @eval Adapt.adapt_storage(to::$(T), x::AbstractRange) = Adapt.adapt(to, collect(x))
-end
-
 """
     isleaf(x) -> Bool
 
@@ -399,4 +393,4 @@ If `MLDataDevices.isleaf(x::T)` is not defined, then it will fall back to `Functors.isleaf(x)`.
 isleaf(x) = Functors.isleaf(x)
 
 isleaf(::AbstractArray{T}) where {T} = isbitstype(T)
-isleaf(::Union{Transpose, Adjoint, PermutedDimsArray}) = false
+isleaf(::Adapt.WrappedArray) = false
diff --git a/test/amdgpu_tests.jl b/test/amdgpu_tests.jl
index 41a8797..a771ada 100644
--- a/test/amdgpu_tests.jl
+++ b/test/amdgpu_tests.jl
@@ -53,7 +53,7 @@ using FillArrays, Zygote # Extensions
     @test ps_xpu.mixed[1] isa Float32
     @test ps_xpu.mixed[2] isa Float64
     @test ps_xpu.mixed[3] isa aType
-    @test ps_xpu.range isa aType
+    @test ps_xpu.range isa AbstractRange
     @test ps_xpu.e == ps.e
     @test ps_xpu.d == ps.d
     @test ps_xpu.rng_default isa rngType
@@ -83,7 +83,7 @@ using FillArrays, Zygote # Extensions
     @test ps_cpu.mixed[1] isa Float32
     @test ps_cpu.mixed[2] isa Float64
     @test ps_cpu.mixed[3] isa Array
-    @test ps_cpu.range isa Array
+    @test ps_cpu.range isa AbstractRange
     @test ps_cpu.e == ps.e
     @test ps_cpu.d == ps.d
     @test ps_cpu.rng_default isa Random.TaskLocalRNG
diff --git a/test/cuda_tests.jl b/test/cuda_tests.jl
index 1f95831..2fce480 100644
--- a/test/cuda_tests.jl
+++ b/test/cuda_tests.jl
@@ -52,7 +52,7 @@ using FillArrays, Zygote # Extensions
     @test ps_xpu.mixed[1] isa Float32
     @test ps_xpu.mixed[2] isa Float64
     @test ps_xpu.mixed[3] isa aType
-    @test ps_xpu.range isa aType
+    @test ps_xpu.range isa AbstractRange
     @test ps_xpu.e == ps.e
     @test ps_xpu.d == ps.d
     @test ps_xpu.rng_default isa rngType
@@ -82,7 +82,7 @@ using FillArrays, Zygote # Extensions
     @test ps_cpu.mixed[1] isa Float32
     @test ps_cpu.mixed[2] isa Float64
     @test ps_cpu.mixed[3] isa Array
-    @test ps_cpu.range isa Array
+    @test ps_cpu.range isa AbstractRange
     @test ps_cpu.e == ps.e
     @test ps_cpu.d == ps.d
     @test ps_cpu.rng_default isa Random.TaskLocalRNG
diff --git a/test/metal_tests.jl b/test/metal_tests.jl
index aeb596a..2bc8845 100644
--- a/test/metal_tests.jl
+++ b/test/metal_tests.jl
@@ -51,7 +51,7 @@ using FillArrays, Zygote # Extensions
     @test ps_xpu.mixed[1] isa Float32
     @test ps_xpu.mixed[2] isa Float64
     @test ps_xpu.mixed[3] isa aType
-    @test ps_xpu.range isa aType
+    @test ps_xpu.range isa AbstractRange
     @test ps_xpu.e == ps.e
     @test ps_xpu.d == ps.d
     @test ps_xpu.rng_default isa rngType
@@ -81,7 +81,7 @@ using FillArrays, Zygote # Extensions
     @test ps_cpu.mixed[1] isa Float32
     @test ps_cpu.mixed[2] isa Float64
     @test ps_cpu.mixed[3] isa Array
-    @test ps_cpu.range isa Array
+    @test ps_cpu.range isa AbstractRange
     @test ps_cpu.e == ps.e
     @test ps_cpu.d == ps.d
     @test ps_cpu.rng_default isa Random.TaskLocalRNG
diff --git a/test/misc_tests.jl b/test/misc_tests.jl
index 9bec386..28275d3 100644
--- a/test/misc_tests.jl
+++ b/test/misc_tests.jl
@@ -50,17 +50,17 @@ end
 @testset "CRC Tests" begin
     dev = cpu_device() # Other devices don't work with FiniteDifferences.jl
-    test_rrule(Adapt.adapt_storage, dev, randn(Float64, 10); check_inferred=true)
+    test_rrule(Adapt.adapt, dev, randn(Float64, 10); check_inferred=true)
 
     gdev = gpu_device()
     if !(gdev isa MetalDevice) # On intel devices causes problems
         x = randn(10)
-        ∂dev, ∂x = Zygote.gradient(sum ∘ Adapt.adapt_storage, gdev, x)
+        ∂dev, ∂x = Zygote.gradient(sum ∘ Adapt.adapt, gdev, x)
         @test ∂dev === nothing
         @test ∂x ≈ ones(10)
 
         x = randn(10) |> gdev
-        ∂dev, ∂x = Zygote.gradient(sum ∘ Adapt.adapt_storage, cpu_device(), x)
+        ∂dev, ∂x = Zygote.gradient(sum ∘ Adapt.adapt, cpu_device(), x)
         @test ∂dev === nothing
         @test ∂x ≈ gdev(ones(10))
         @test get_device(∂x) isa parameterless_type(typeof(gdev))
 
@@ -181,7 +181,6 @@ end
 end
 
 @testset "shared parameters" begin
-    # from
     x = rand(1)
     m = (; a=x, b=x')
     count = Ref(0)
@@ -199,7 +198,7 @@ end
         y::Float64
     end
 
-    for x in [1.0, 'a', BitsType(1, 2.0)]
+    @testset for x in [1.0, 'a', BitsType(1, 2.0)]
         @test MLDataDevices.isleaf([x])
         @test !MLDataDevices.isleaf([x]')
         @test !MLDataDevices.isleaf(transpose([x]))
@@ -207,3 +206,16 @@ end
     end
 end
+
+@testset "Zygote.gradient(wrapped arrays)" begin
+    using Zygote
+
+    x = rand(4, 4)
+    cdev = cpu_device()
+
+    @test only(Zygote.gradient(x -> sum(abs2, cdev(x)), x')) isa Matrix{Float64}
+
+    gdev = gpu_device()
+
+    @test only(Zygote.gradient(x -> sum(abs2, gdev(x)), x')) isa Matrix{Float64}
+end
diff --git a/test/oneapi_tests.jl b/test/oneapi_tests.jl
index 8bb6026..2169869 100644
--- a/test/oneapi_tests.jl
+++ b/test/oneapi_tests.jl
@@ -51,7 +51,7 @@ using FillArrays, Zygote # Extensions
    @test ps_xpu.mixed[1] isa Float32
     @test ps_xpu.mixed[2] isa Float64
     @test ps_xpu.mixed[3] isa aType
-    @test ps_xpu.range isa aType
+    @test ps_xpu.range isa AbstractRange
     @test ps_xpu.e == ps.e
     @test ps_xpu.d == ps.d
     @test ps_xpu.rng_default isa rngType
@@ -81,7 +81,7 @@ using FillArrays, Zygote # Extensions
     @test ps_cpu.mixed[1] isa Float32
     @test ps_cpu.mixed[2] isa Float64
     @test ps_cpu.mixed[3] isa Array
-    @test ps_cpu.range isa Array
+    @test ps_cpu.range isa AbstractRange
     @test ps_cpu.e == ps.e
     @test ps_cpu.d == ps.d
     @test ps_cpu.rng_default isa Random.TaskLocalRNG
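Taken together, the patch routes `Adapt.WrappedArray`s (`Adjoint`, `Transpose`,
`PermutedDimsArray`, ...) through `Adapt.adapt` instead of `map`, and the new
`rrule` on `Adapt.adapt` uses `ProjectTo` to project cotangents back onto the
primal's structure and device. Below is a minimal sketch of the resulting
behavior, modeled on the `misc_tests.jl` testset added above; it is CPU-only so
it runs without an accelerator, and swapping in `gpu_device()` exercises the
same path when a functional GPU backend is loaded:

    using MLDataDevices, Zygote

    x = rand(4, 4)
    cdev = cpu_device()

    # `x'` is a `LinearAlgebra.Adjoint`, i.e. an `Adapt.WrappedArray`, so
    # `cdev(x')` now takes the `Adapt.adapt` fast path; per the new testset,
    # the pullback yields a plain `Matrix{Float64}` instead of erroring.
    dx = only(Zygote.gradient(x -> sum(abs2, cdev(x)), x'))
    @assert dx isa Matrix{Float64}

    # With Adapt 4.1 (JuliaGPU/Adapt.jl#86) ranges adapt their endpoints, so
    # the removed `adapt_storage(..., ::AbstractRange)` methods are no longer
    # needed; per the updated tests, ranges stay `AbstractRange` after a move.
    @assert cdev(1:10) isa AbstractRange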