From be5b47fad5fc9c0a3e22f239ec8517df60ffdf4c Mon Sep 17 00:00:00 2001
From: Brian Chen <ToucheSir@users.noreply.github.com>
Date: Wed, 22 Jun 2022 19:35:04 -0700
Subject: [PATCH 1/3] Improved type stability with explicit params

We can disable accumulating (implicit) parameters to the gradient cache
in explicit mode. This can dramatically improve type stability because
`accum_param` will return a `Union{Nothing, [grad type]}` otherwise.
---
 src/compiler/interface.jl | 34 ++++++++++++++++++++++++++--------
 src/lib/array.jl          |  4 ++--
 src/lib/broadcast.jl      | 12 ++++++++----
 src/lib/lib.jl            |  3 ++-
 test/compiler.jl          | 17 +++++++++++------
 5 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index d5428e97e..f429102f6 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -4,11 +4,11 @@ using Core: Typeof
 import Base: copy!, IdSet
 import Base.Broadcast: broadcasted, materialize!
 
-mutable struct Context <: AContext
+mutable struct Context{I} <: AContext
   cache::Union{IdDict{Any,Any},Nothing}
 end
 
-Context() = Context(nothing)
+Context() = Context{false}(nothing)
 
 cache(cx::Context) = cx.cache === nothing ? (cx.cache = IdDict()) : cx.cache
 
@@ -36,10 +36,28 @@ _pullback(f, args...) = _pullback(Context(), f, args...)
 tailmemaybe(::Nothing) = nothing
 tailmemaybe(x::Tuple) = Base.tail(x)
 
-function pullback(f, args...)
-  y, back = _pullback(f, args...)
+@inline pullback(f, args...) = pullback(f, Context(), args...)
+function pullback(f, cx::AContext, args...)
+  y, back = _pullback(cx, f, args...)
   y, Δ -> tailmemaybe(back(Δ))
 end
+function pullback(cx::Context, f, args...)
+  ChainRulesCore.ignore_derivatives() do
+    @warn """
+    Incorrect argument order for pullback, please use:
+
+      pullback(f, __context__::Context, args)
+    
+    instead of:
+
+      pullback(__context__::Context, f, args)
+    
+    This is usually caused by a call to pullback in a higher-order @adjoint.
+    The above warning will become an error in Zygote 0.7.
+    """
+  end
+  return pullback(f, cx, args...)
+end
 
 sensitivity(y::Number) = one(y)
 sensitivity(y::Complex) = error("Output is complex, so the gradient is not defined.")
@@ -334,21 +352,21 @@ function Base.map(f, gs1::Grads, gss::ADictOrGrads...)
 end
 
 function Base.map!(f, gsout::Grads, gss::ADictOrGrads...)
-  all(issetequal(gsout.params, keys(gs)) for gs in gss) || 
+  all(issetequal(gsout.params, keys(gs)) for gs in gss) ||
     throw(ArgumentError("map! expects Grads objects with the same Params."))
   for p in gsout.params
-    gsout[p] = f((_getformap(gs, p) for gs in gss)...) 
+    gsout[p] = f((_getformap(gs, p) for gs in gss)...)
   end
   return gsout
 end
 
 function _getformap(gs, p)
   g = gs[p]
-  isnothing(g) ? fill!(similar(p), 0) : g 
+  isnothing(g) ? fill!(similar(p), 0) : g
 end
 
 function pullback(f, ps::Params)
-  cx = Context()
+  cx = Context{true}(nothing)
   y, back = _pullback(cx, f)
   y, function (Δ)
     for p in ps
diff --git a/src/lib/array.jl b/src/lib/array.jl
index 4e72713d9..293801b21 100644
--- a/src/lib/array.jl
+++ b/src/lib/array.jl
@@ -310,7 +310,7 @@ end
 
 @adjoint function sum(f, xs::AbstractArray{<:AbstractArray}; kws...)
   @assert !haskey(kws, :init) # TODO add init support (julia 1.6)
-  return pullback(__context__, (f, xs) -> sum(f.(xs); kws...), f, xs)
+  return pullback((f, xs) -> sum(f.(xs); kws...), __context__, f, xs)
 end
 
 @adjoint function sum(xs::AbstractArray{Bool}; dims = :)
@@ -318,7 +318,7 @@ end
 end
 
 function _pullback(cx::AContext, ::typeof(prod), f, xs::AbstractArray)
-  y, back = pullback(cx, ((f, xs) -> prod(f.(xs))), f, xs)
+  y, back = pullback((f, xs) -> prod(f.(xs)), cx, f, xs)
   y, ȳ -> (nothing, back(ȳ)...)
 end
 
diff --git a/src/lib/broadcast.jl b/src/lib/broadcast.jl
index b3c16e823..8c0d3c54c 100644
--- a/src/lib/broadcast.jl
+++ b/src/lib/broadcast.jl
@@ -30,6 +30,10 @@ using Base.Broadcast: Broadcasted, AbstractArrayStyle, broadcasted, materialize
 # Utilities
 # =========
 
+# ChainRules already marks this non-differentiable,
+# But inference can still give up because of the Zygote -> CR wrapper layer
+@nograd Broadcast.combine_styles
+
 accum_sum(xs; dims = :) = reduce(accum, xs, dims = dims)
 
 # Work around reducedim_init issue
@@ -82,16 +86,16 @@ _minus(::Nothing) = nothing
 @adjoint broadcasted(::typeof(*), x::Numeric, y::Numeric) = x.*y,
    Δ -> (nothing, unbroadcast(x, Δ .* conj.(y)), unbroadcast(y, Δ .* conj.(x)))
 @adjoint broadcasted(::typeof(*), x::Number, y::AbstractArray{<:Number}) =
-  _pullback(*, x, y)  # this uses dot(y,Δ) instead of sum(Δ .* conj.(y))
+  _pullback(__context__, *, x, y)  # this uses dot(y,Δ) instead of sum(Δ .* conj.(y))
 @adjoint broadcasted(::typeof(*), x::AbstractArray{<:Number}, y::Number) =
-  _pullback(*, x, y)
+  _pullback(__context__, *, x, y)
 
 @adjoint function broadcasted(::typeof(/), x::Numeric, y::Numeric)
   res = x ./ y
   res, Δ -> (nothing, unbroadcast(x, Δ ./ conj.(y)), unbroadcast(y, .-Δ .* conj.(res ./ y)))
 end
 @adjoint broadcasted(::typeof(/), x::AbstractArray{<:Number}, y::Number) =
-  _pullback(/, x, y)
+  _pullback(__context__, /, x, y)
 
 @adjoint function broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::Numeric, exp::Val{p}) where p
   y = Base.literal_pow.(^, x, exp)
@@ -273,7 +277,7 @@ using GPUArraysCore  # replaces @require CUDA block, weird indenting to preserve
   # Not the ChainRules.rrule which will use the Zygote.Context and thus not be GPU compatible
   @adjoint function sum(f, xs::AbstractGPUArray; kws...)
     @assert !haskey(kws, :init) # TODO add init support (julia 1.6)
-    return pullback(__context__, (f, xs) -> sum(f.(xs); kws...), f, xs)
+    return pullback((f, xs) -> sum(f.(xs); kws...), __context__, f, xs)
   end
 
   @adjoint function Base.convert(::Type{T}, xs::Array)  where {T<:AbstractGPUArray}
diff --git a/src/lib/lib.jl b/src/lib/lib.jl
index 22bda1e19..52a734809 100644
--- a/src/lib/lib.jl
+++ b/src/lib/lib.jl
@@ -21,7 +21,7 @@ accum(x, y) =
 
 accum(x, y, zs...) = accum(accum(x, y), zs...)
 
-accum(x::Tuple, ys::Tuple...) = accum.(x, ys...)
+accum(x::Tuple, ys::Tuple...) = map(accum, x, ys...)
 accum(x::AbstractArray, ys::AbstractArray...) = accum.(x, ys...)
 
 @generated function accum(x::NamedTuple, y::NamedTuple)
@@ -48,6 +48,7 @@ end
 
 @adjoint Base.typeassert(x, T) = Base.typeassert(x, T), Δ -> (Δ, nothing)
 
+accum_param(::Context{false}, _, Δ) = Δ
 @generated function accum_param(cx::Context, x, Δ)
   isbitstype(x) && return :(Δ)
   quote
diff --git a/test/compiler.jl b/test/compiler.jl
index bc37d271e..c5ddf1f38 100644
--- a/test/compiler.jl
+++ b/test/compiler.jl
@@ -1,5 +1,5 @@
 using Zygote, Test
-using Zygote: pullback, @adjoint
+using Zygote: pullback, @adjoint, Context
 
 macro test_inferred(ex)
   :(let res = nothing
@@ -160,13 +160,18 @@ end
 @testset "inference for `getproperty`" begin
     Gaussian = _Gaussian(:getproperty)
     g = Gaussian(randn(3), randn(3, 3))
-    y, back = @inferred pullback(x -> x.m, g)
-    @test y == getfield(g, :m)
-    # This type instability is due to the handling of non-bitstypes in `accum_param`
+    y_explicit, back_explicit = @inferred pullback(x -> x.m, g)
+    y_implicit, back_implicit = @inferred pullback(x -> x.m, Context{true}(nothing), g)
+    @test y_explicit == y_implicit == getfield(g, :m)
+
+    ∇args = ((m = [1.0, 0.0, 0.0], P = nothing),)
     if VERSION > v"1.7-"
-      @test Base.return_types(back, Tuple{Vector{Float64}}) == Any[Union{Tuple{Nothing}, typeof(((m = [1.0, 0.0, 0.0], P = nothing),))}]
+      # This type instability is due to the handling of non-bitstypes in `accum_param`
+      @test Base.return_types(back_implicit, Tuple{Vector{Float64}}) == Any[Union{Tuple{Nothing}, typeof(∇args)}]
+      # But the same should infer if implicit parameters are disabled
+      @test Base.return_types(back_explicit, Tuple{Vector{Float64}}) == Any[typeof(∇args)]
     end
-    @test back([1., 0, 0]) == ((m = [1.0, 0.0, 0.0], P = nothing),)
+    @test back_explicit([1., 0, 0]) == back_implicit([1., 0, 0]) == ∇args
 
     Base.getproperty(g::Gaussian, s::Symbol) = 2getfield(g, s)
     y, back = pullback(x -> x.m, g)

From e9a60757196368f8999e2413f2658a3d9cfffab4 Mon Sep 17 00:00:00 2001
From: Brian Chen <ToucheSir@users.noreply.github.com>
Date: Fri, 24 Jun 2022 12:43:33 -0700
Subject: [PATCH 2/3] basic comment for Context{I}

---
 src/compiler/interface.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index f429102f6..ee2a69528 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -4,6 +4,9 @@ using Core: Typeof
 import Base: copy!, IdSet
 import Base.Broadcast: broadcasted, materialize!
 
+# Internal container used to track accumulated gradients of mutable types (including params).
+# Type param I ∈ (true, false) indicates whether implicit params are in use.
+# By default, this should be false unless pullback(f, ::Params) is called.
 mutable struct Context{I} <: AContext
   cache::Union{IdDict{Any,Any},Nothing}
 end
@@ -47,11 +50,11 @@ function pullback(cx::Context, f, args...)
     Incorrect argument order for pullback, please use:
 
       pullback(f, __context__::Context, args)
-    
+
     instead of:
 
       pullback(__context__::Context, f, args)
-    
+
     This is usually caused by a call to pullback in a higher-order @adjoint.
     The above warning will become an error in Zygote 0.7.
     """

From 3433cdd310cf3f262ea72370caba13028d8d4e0e Mon Sep 17 00:00:00 2001
From: Brian Chen <ToucheSir@users.noreply.github.com>
Date: Sun, 31 Jul 2022 08:36:57 -0700
Subject: [PATCH 3/3] Add accum_param tests

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
---
 test/features.jl | 61 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/test/features.jl b/test/features.jl
index cdfe7329e..d4f68d36b 100644
--- a/test/features.jl
+++ b/test/features.jl
@@ -476,7 +476,7 @@ end
   @test_broken gradient(x -> abs2(x[1].x) + 7 * x[1].x.re, [Ref(1+im)]) == ([(x = 9.0 + 2.0im,)],)
   @test_broken gradient(x -> abs2(x[1].x) + 7 * real(x[1].x), [Ref(1+im)]) == ([(x = 9.0 + 2.0im,)],)  # worked on 0.6.0, 0.6.20
 
-  @test_broken gradient(x -> abs2(x[].x) + 7 * real(x[].x), Ref(Ref(1+im))) == ((x = 9.0 + 2.0im,),)  # gives nothing, same in 0.6.0
+  @test gradient(x -> abs2(x[].x) + 7 * real(x[].x), Ref(Ref(1+im))) == ((x = (x = 9.0 + 2.0im,),),) # gave `nothing` from 0.6.0 to 0.6.41
 
   # Array of mutables:
   @test gradient(x -> sum(getindex.(x).^2), Ref.(1:3))[1] == [(;x=2i) for i in 1:3]
@@ -490,6 +490,59 @@ end
   @test gradient(x -> sum(sum, Ref(x) .* [1,2,3]), [4,5]) == ([6.0, 6.0],)
 end
 
+@testset "mutable accum_param bugs" begin
+  mutable struct Mut{T}; x::T; end
+  struct Imm{T}; x::T; end
+
+  # Indexing a tuple containing a mutable struct gave `nothing`
+  x1 = (Mut(3.0),)
+  x2 = (Imm(3.0),)
+  x3 = (Ref(3.0),)
+  @test gradient(x -> x[1].x^2, x1)[1] == ((x = 6.0,),)  # fails on v0.6.0 v0.6.41
+  @test gradient(x -> x[1].x^2, x2)[1] == ((x = 6.0,),)
+  @test gradient(x -> x[1].x^2, x3)[1] == ((x = 6.0,),)  # fails on v0.6.0 v0.6.41
+  i1 = 1
+  @test gradient(x -> x[i1].x^2, x1)[1] == ((x = 6.0,),)  # fails on v0.6.0 v0.6.41
+  @test gradient(x -> x[i1].x^2, x2)[1] == ((x = 6.0,),)
+  @test gradient(x -> x[i1].x^2, x3)[1] == ((x = 6.0,),)  # fails on v0.6.0 v0.6.41
+
+  @test gradient(x -> x[1][1].x^2, [x1])[1] == [((x = 6.0,),)]  # fails on v0.6.0 v0.6.41
+  @test gradient(x -> x[1][1].x^2, [x2])[1] == [((x = 6.0,),)]
+  @test gradient(x -> x[1][1].x^2, [x3])[1] == [((x = 6.0,),)]  # fails on v0.6.0 v0.6.41
+
+  # When `getfield` returns a mutable struct, it gave `nothing`:
+  x4 = Imm(Mut(4.0))
+  x5 = Mut(Mut(4.0))
+  x6 = Imm(Imm(4.0))
+  @test gradient(x -> x.x.x^3, x4)[1] == (x = (x = 48.0,),)  # fails on v0.6.0 v0.6.41
+  @test gradient(x -> x.x.x^3, x5)[1] == (x = (x = 48.0,),)  # fails on v0.6.0
+  @test gradient(x -> x.x.x^3, x6)[1] == (x = (x = 48.0,),)  # fails on v0.6.41
+
+  @test gradient(x -> x[2].x.x^3, [x4, x4])[1] == [nothing, (x = (x = 48.0,),)]  # fails on v0.6.0 v0.6.41
+  @test gradient(x -> x[2].x.x^3, [x4, x5])[1] == [nothing, (x = (x = 48.0,),)]  # fails on v0.6.0
+  @test gradient(x -> x[2].x.x^3, [x4, x6])[1] == [nothing, (x = (x = 48.0,),)]  # fails on v0.6.41
+
+  # Check when using implicit parameters, Params cases used to pass:
+  y1 = [3.0]
+  y2 = (Mut(y1),)
+  y3 = (Imm(y1),)
+  @test gradient(x -> sum(x[1].x)^2, y2)[1] == ((x = [6.0],),)  # fails on v0.6.0 v0.6.41
+  @test gradient(() -> sum(y2[1].x)^2, Params([y1]))[y1] == [6.0]
+  @test gradient(x -> sum(x[1].x)^2, y3)[1] == ((x = [6.0],),)
+  @test gradient(() -> sum(y3[1].x)^2, Params([y1]))[y1] == [6.0]
+
+  @test gradient(x -> sum(x[1].x .+ x[1].x)^3, y2)[1] == ((x = [216.0],),)  # fails on v0.6.0 v0.6.41
+  @test gradient(() -> sum(y2[1].x .+ y2[1].x)^3, Params([y1]))[y1] == [216.0]
+  @test gradient(x -> sum(x[1].x .+ x[1].x)^3, y3)[1] == ((x = [216.0],),)
+  @test gradient(() -> sum(y3[1].x .+ y3[1].x)^3, Params([y1]))[y1] == [216.0]
+
+  i1 = 1
+  @test gradient(x -> sum(x[i1].x .+ x[1].x)^3, y2)[1] == ((x = [216.0],),)  # fails on v0.6.0 v0.6.41
+  @test gradient(() -> sum(y2[i1].x .+ y2[1].x)^3, Params([y1]))[y1] == [216.0]
+  @test gradient(x -> sum(x[i1].x .+ x[1].x)^3, y3)[1] == ((x = [216.0],),)
+  @test gradient(() -> sum(y3[i1].x .+ y3[1].x)^3, Params([y1]))[y1] == [216.0]
+end
+
 @testset "NamedTuples" begin
   @test gradient(x -> x.a, (a=1, b=2)) == ((a = 1, b = nothing),)
   @test gradient(x -> x[1].a, [(a=1, b=2)]) == ([(a = 1, b = nothing)],)
@@ -517,7 +570,7 @@ end
   @test (x->10*(x => 2)[2])'(100) === nothing
 
   @test gradient(x-> (:x => x)[2], 17) == (1,)
-    
+
   d = Dict(:x=>1.0, :y=>3.0);
   @test gradient(d -> Dict(:x => d[:x])[:x], d) == (Dict(:x => 1),)
 end
@@ -546,7 +599,7 @@ end
   # zip
   if VERSION >= v"1.5"
     # On Julia 1.4 and earlier, [x/y for (x,y) in zip(10:14, 1:10)] is a DimensionMismatch,
-    # while on 1.5 - 1.7 it stops early. 
+    # while on 1.5 - 1.7 it stops early.
 
     @test gradient(10:14, 1:10) do xs, ys
       sum([x/y for (x,y) in zip(xs, ys)])
@@ -608,7 +661,7 @@ end
 
   # Iterators.Product with enumerate
   @test gradient([2 3; 4 5]) do xs
-    sum([x^i+y for (i,x) in enumerate(xs), y in xs]) 
+    sum([x^i+y for (i,x) in enumerate(xs), y in xs])
   end == ([8 112; 36 2004],)
 end