From 016cfe63f230386469c7e20599a14d759038d98c Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Wed, 27 Nov 2024 14:45:53 -0500
Subject: [PATCH] maybe this works?

---
 ext/FluxEnzymeExt/FluxEnzymeExt.jl | 31 +++++++++++++++----------------
 test/ext_enzyme/enzyme.jl          |  4 ++--
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/ext/FluxEnzymeExt/FluxEnzymeExt.jl b/ext/FluxEnzymeExt/FluxEnzymeExt.jl
index 37556a9309..0833c3b867 100644
--- a/ext/FluxEnzymeExt/FluxEnzymeExt.jl
+++ b/ext/FluxEnzymeExt/FluxEnzymeExt.jl
@@ -69,14 +69,14 @@ function Flux._enzyme_withgradient(f, args::Union{Const, Duplicated}...; zero::B
   # _, val = Enzyme.autodiff(ReverseWithPrimal, f, Active, args...)
 
   # Take II, using split mode.
-  forward, reverse = autodiff_thunk(ReverseSplitWithPrimal, Const{typeof(f)}, Active, map(typeof, args)...)
-  tape, result, shadow_result = forward(Const(f), args...)
-  reverse(Const(f), args..., _sensitivity(result), tape)
+  # forward, reverse = autodiff_thunk(ReverseSplitWithPrimal, Const{typeof(f)}, Active, map(typeof, args)...)
+  # tape, result, shadow_result = forward(Const(f), args...)
+  # reverse(Const(f), args..., _sensitivity(result), tape)
 
   # Take III, it may be more efficient to have the function write the loss into Ref(0.0)?
-  # Some cases work, but Flux.withgradient(m -> m(3), Duplicated(model)) does not.
-  # dup_loss = DuplicatedNoNeed(Ref(0.0), Ref(1.0))
+  dup_loss = DuplicatedNoNeed(Ref(0f0), Ref(1f0))
   # result = autodiff(Reverse, Const(_ref_loss!), Const, dup_loss, Const(f), args...)
+  _, result = autodiff(ReverseWithPrimal, Const(_ref_loss!), Const, dup_loss, Const(f), args...)
 
   (; val = result, grad = map(_grad_or_nothing, args))
 end
@@ -87,18 +87,17 @@ end
 _sensitivity(y) = error("""`Flux.withgradient(f, xs...)` expects that `y = f(xs...)` is a real numnber,
                            or else a Tuple or NamedTuple whose first element is a real number.""")
 
-# function _ref_loss!(out::Ref, f, args...)  # for Take III above
-#   val = f(args...)
-#   @show val
-#   out[] = _get_loss(val)  # saves loss by mutation
-#   val  # returns the whole thing
-# end
+function _ref_loss!(out::Ref, f, args...)  # for Take III above
+  val = f(args...)
+  out[] = _get_loss(val)  # saves loss by mutation
+  val  # returns the whole thing
+end
 
-# @inline _get_loss(y::Real) = y
-# @inline _get_loss(ys::Tuple{Real,Vararg}) = ys[1]
-# @inline _get_loss(ys::NamedTuple{S, <:Tuple{Real,Vararg}}) where S = ys[1]
-# _get_loss(y) = error("""`Flux.withgradient(f, xs...)` expects that `y = f(xs...)` is a real numnber,
-#                         or else a Tuple or NamedTuple whose first element is a real number.""")
+@inline _get_loss(y::Real) = y
+@inline _get_loss(ys::Tuple{Real,Vararg}) = ys[1]
+@inline _get_loss(ys::NamedTuple{S, <:Tuple{Real,Vararg}}) where S = ys[1]
+_get_loss(y) = error("""`Flux.withgradient(f, xs...)` expects that `y = f(xs...)` is a real number,
+                        or else a Tuple or NamedTuple whose first element is a real number.""")
 
 
 ### Flux.Train, for train!
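The hunks above switch to "Take III": rather than asking Enzyme for an `Active` return of possibly abstract type, the differentiated call writes its scalar loss into a `Ref` whose shadow is seeded with one, while `ReverseWithPrimal` still hands back the full primal so `withgradient` can return Tuples or NamedTuples unchanged. A minimal standalone sketch of that pattern, outside Flux, with a hypothetical `loss_into_ref!` helper standing in for `_ref_loss!`:

using Enzyme

function loss_into_ref!(out::Ref, f, x)   # plays the role of `_ref_loss!`
    val = f(x)
    out[] = val        # save the scalar loss by mutation
    val                # return the whole primal
end

f(x) = sum(abs2, x)
x  = Float32[1, 2, 3]
dx = zero(x)                                      # shadow that accumulates the gradient
dup_loss = DuplicatedNoNeed(Ref(0f0), Ref(1f0))   # loss Ref, shadow seeded with one

_, val = Enzyme.autodiff(ReverseWithPrimal, Const(loss_into_ref!), Const,
                         dup_loss, Const(f), Duplicated(x, dx))

val   # 14.0f0, the primal returned by ReverseWithPrimal
dx    # ≈ 2 .* x, pulled back from the seeded loss Ref

Seeding the shadow `Ref` with one is what replaces the `_sensitivity(result)` call of the abandoned split-mode "Take II".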
diff --git a/test/ext_enzyme/enzyme.jl b/test/ext_enzyme/enzyme.jl
index 775cc41d68..2f0a6f47bf 100644
--- a/test/ext_enzyme/enzyme.jl
+++ b/test/ext_enzyme/enzyme.jl
@@ -216,6 +216,6 @@ end
   @test Flux.gradient(|>, z, Duplicated(sum ∘ LayerNorm(3)))[1] ≈ [0.0, 0.0, 0.0]
   @test Flux.gradient(|>, z, Const(sum ∘ LayerNorm(3)))[2] === nothing
 
-  @test_broken Flux.withgradient(sum ∘ LayerNorm(3), z).grad[1] ≈ [0.0, 0.0, 0.0]  # AssertionError: Base.allocatedinline(actualRetType) returns false: actualRetType = Any, rettype = Active{Any}
-  @test_broken Flux.withgradient(|>, z, Duplicated(sum ∘ LayerNorm(3))).grad[1] ≈ [0.0, 0.0, 0.0]  # AssertionError: Base.allocatedinline(actualRetType) returns false: actualRetType = Any, rettype = Active{Any}
+  @test Flux.withgradient(sum ∘ LayerNorm(3), z).grad[1] ≈ [0.0, 0.0, 0.0]
+  @test Flux.withgradient(|>, z, Duplicated(sum ∘ LayerNorm(3))).grad[1] ≈ [0.0, 0.0, 0.0]
 end
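With the `@test_broken` markers dropped, the Enzyme path of `Flux.withgradient` is expected to work for `Duplicated` models too. A hedged usage sketch; the model, input, and loss below are illustrative, not taken from the patch, and the one-argument `Duplicated(...)` form is the same one the tests above rely on:

using Flux, Enzyme

model = Duplicated(Dense(3 => 2))   # one-argument form, as in the tests above
x = randn(Float32, 3)

out = Flux.withgradient(m -> sum(abs2, m(x)), model)
out.val      # the scalar loss, read back from the seeded Ref
out.grad[1]  # gradient for the Dense layer, a nested structure matching the model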