Add destructure, take II (#54)
* destructure, take II

* add a test

* tidy

* replace append! with reduce(vcat, ...)

* testset names

* rename everything

* tweak

* two broken tests

* make len positional, fix a bug

* second derivatives

* arrays of arrays

* more... the dimensionmismatch bug is not here

* warnings
mcabbott authored Feb 14, 2022
1 parent 4155bcd commit 1e34fa2
Showing 8 changed files with 341 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -12,7 +12,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
ChainRulesCore = "1"
Functors = "0.2.7"
Functors = "0.2.8"
julia = "1.6"

[extras]
7 changes: 7 additions & 0 deletions docs/src/api.md
@@ -42,6 +42,13 @@ optimiser to act on all suitable fields. To restrict this, define `trainable`:
Optimisers.trainable
```

Such restrictions are also obeyed by this function for flattening a model:

```@docs
Optimisers.destructure
Optimisers.Restructure
```
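
For instance (an illustrative sketch, not part of this diff; `Affine` and its `trainable` override are hypothetical), only the fields returned by `trainable` end up in the flat vector:

```julia
using Optimisers, Functors

struct Affine; W; b; end
Functors.@functor Affine
Optimisers.trainable(a::Affine) = (; W = a.W)  # freeze the bias b

v, re = destructure(Affine([1.0 2.0], [3.0]))
v             # [1.0, 2.0]; the frozen b does not appear
re([10, 20])  # Affine([10.0 20.0], [3.0]); b is restored unchanged
```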

## Rule Definition

```@docs
5 changes: 4 additions & 1 deletion src/Optimisers.jl
@@ -4,8 +4,11 @@ using Functors: functor, fmap, isleaf
using LinearAlgebra

include("interface.jl")
include("rules.jl")

include("destructure.jl")
export destructure, total, total2

include("rules.jl")
export Descent, ADAM, Momentum, Nesterov, RMSProp,
ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, OADAM, AdaBelief,
WeightDecay, ClipGrad, ClipNorm, OptimiserChain
152 changes: 152 additions & 0 deletions src/destructure.jl
@@ -0,0 +1,152 @@

using ChainRulesCore: ChainRulesCore, NoTangent, ProjectTo, unthunk
const NoT = NoTangent()

"""
destructure(model) -> vector, reconstructor
Copies all [`trainable`](@ref), [`isnumeric`](@ref) parameters in the model
to a vector, and returns also a function which reverses this transformation.
Differentiable.
# Example
```jldoctest
julia> v, re = destructure((x=[1.0, 2.0], y=(sin, [3 + 4im])))
(ComplexF64[1.0 + 0.0im, 2.0 + 0.0im, 3.0 + 4.0im], Restructure(NamedTuple, ..., 3))
julia> re([3, 5-im, 7+11im])
(x = [3.0, 5.0], y = (sin, ComplexF64[7.0 + 11.0im]))
```
"""
function destructure(x)
  flat, off, len = _flatten(x)
  flat, Restructure(x, off, len)
end

"""
Restructure(Model, ..., length)
This is what [`destructure`](@ref) returns, and `re(p)` will re-build the model with
new parameters from vector `p`. If the model is callable, then `re(x, p) == re(p)(x)`.
# Example
```julia
julia> using Flux, Optimisers
julia> _, re = destructure(Dense([1 2; 3 4], [0, 0], sigmoid))
([1, 3, 2, 4, 0, 0], Restructure(Dense, ..., 6))
julia> m = re(-4:1)
Dense(2, 2, σ) # 6 parameters
julia> m([0.2, 0.3]) ≈ re([0.2, 0.3], -4:1)
true
```
"""
struct Restructure{T,S}
  model::T
  offsets::S
  length::Int
end
(re::Restructure)(flat::AbstractVector) = _rebuild(re.model, re.offsets, flat, re.length)
(re::Restructure)(x, flat::AbstractVector) = re(flat)(x)
Base.show(io::IO, re::Restructure{T}) where T = print(io, "Restructure(", T.name.name, ", ..., ", re.length, ")")
Base.length(re::Restructure) = re.length

# This flattens a model, and returns a web of offsets for later use:
function _flatten(x)
  isnumeric(x) && return vcat(_vec(x)), 0, length(x)  # trivial case
  arrays = AbstractVector[]
  len = Ref(0)
  off = fmap(x; exclude = isnumeric, walk = (f, z) -> map(f, _trainable(z))) do y
    push!(arrays, _vec(y))
    o = len[]
    len[] = o + length(y)
    o
  end
  reduce(vcat, arrays), off, len[]
end
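# For example (illustrative note, not part of the diff):
#   _flatten((x = [1.0, 2.0], y = [3.0])) == ([1.0, 2.0, 3.0], (x = 0, y = 2), 3)
# where the middle result mirrors the model, each leaf replaced by its
# zero-based offset into the flat vector.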

_vec(x::Number) = LinRange(x,x,1)
_vec(x::AbstractArray) = vec(x)
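# (Added note: LinRange(x, x, 1) above wraps a scalar as a one-element
# AbstractVector without allocating an Array, so scalar leaves can be vcat-ed.)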

function ChainRulesCore.rrule(::typeof(_flatten), x)
  flat, off, len = _flatten(x)
  _maybewarn()
  _flatten_back((dflat, _, _)) = (NoT, _rebuild(x, off, unthunk(dflat), len; walk = _Tangent_biwalk, prune = NoT))
  (flat, off, len), _flatten_back
end

# This reconstructs either a model like x, or a gradient for it:
function _rebuild(x, off, flat::AbstractVector, len = length(flat); walk = _trainable_biwalk, kw...)
  len == length(flat) || throw(DimensionMismatch("Rebuild expected a vector of length $len, got $(length(flat))"))
  fmap(x, off; exclude = isnumeric, walk, kw...) do y, o
    _getat(y, o, flat)
  end
end

_getat(y::Number, o::Int, flat::AbstractVector) = ProjectTo(y)(flat[o + 1])
_getat(y::AbstractArray, o::Int, flat::AbstractVector) =
  ProjectTo(y)(reshape(flat[o .+ (1:length(y))], axes(y)))  # ProjectTo is just correcting eltypes
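# Illustrative: with flat = 1:6 and y = ones(2, 2) at offset o = 1,
# _getat(y, 1, flat) reshapes flat[2:5] column-major to [2.0 4.0; 3.0 5.0].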

function _trainable_biwalk(f, x, aux)
  ch, re = functor(typeof(x), x)
  au, _ = functor(typeof(x), aux)
  _trainmap(f, ch, _trainable(x), au) |> re
end

function _trainmap(f, ch, tr, aux)
  map(ch, tr, aux) do c, t, a  # isnothing(t) indicates non-trainable field, safe given isnumeric(c)
    isnothing(t) ? c : f(t, a)
  end
end

function _Tangent_biwalk(f, x, aux)  # use with prune = NoT
  ch, re = functor(typeof(x), x)
  au, _ = functor(typeof(x), aux)
  y = _trainmap(f, ch, _trainable(x), au)
  y isa Tuple{} && return NoT
  p = ProjectTo(x)
  if p isa ProjectTo  # e.g. Array, NamedTuple
    p(y)
  else  # p === identity for unknown structs
    Tangent{typeof(x), typeof(y)}(y)
  end
end

function ChainRulesCore.rrule(::typeof(_rebuild), x, off, flat, len; kw...)
  _rebuild_back(dx) = (NoT, NoT, NoT, _grad!(x, unthunk(dx), off, _zero(flat)), NoT)
  _rebuild(x, off, flat, len; kw...), _rebuild_back
end

_zero(x) = map!(zero, similar(x, float(eltype(x))), x) # mutable zero array for _grad!
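# (Added note: the float eltype means integer parameter arrays still get a
# float buffer to accumulate gradients into.)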
ChainRulesCore.@non_differentiable _zero(x)

# This is the gradient of model reconstruction, accumulating duplicates:
function _grad!(x, dx, off, flat::AbstractVector)
  x′, _ = functor(typeof(x), x)
  dx′, _ = functor(typeof(x), base(dx))
  off′, _ = functor(typeof(x), off)
  foreach((xᵢ, dxᵢ, oᵢ) -> _grad!(xᵢ, dxᵢ, oᵢ, flat), x′, dx′, off′)
  flat
end
function _grad!(x, dx, off::Integer, flat::AbstractVector)
  @views flat[off .+ (1:length(x))] .+= dx  # must visit all tied nodes
  flat
end
_grad!(x, dx::Zero, off, flat::AbstractVector) = dx
_grad!(x, dx::Zero, off::Integer, flat::AbstractVector) = dx  # ambiguity
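# Illustrative: for a tied model like (x = v, y = v), both leaves share one
# offset, so their gradients accumulate (.+=) into the same slice of `flat`;
# this is why the Integer-offset method above adds rather than assigns.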

# These are only needed for 2nd derivatives:
function ChainRulesCore.rrule(::typeof(_grad!), x, dx, off, flat)
  @warn "second derivatives of Restructure may not work yet, sorry!" maxlog=3
  _grad_back(dflat) = (NoT, NoT, _rebuild(x, off, unthunk(dflat); walk = _Tangent_biwalk, prune = NoT), NoT, NoT)
  _grad!(x, dx, off, flat), _grad_back
end
base(dx::Tangent{<:Tangent}) = backing(dx).backing  # might be needed for gradient(gradient(destructure))
base(dx::Tangent{Any, <:NamedTuple{(:backing,)}}) = base(backing(dx).backing)  # Zygote version
_maybewarn() = nothing
function ChainRulesCore.rrule(::typeof(_maybewarn))
  @warn "second derivatives of destructure may not work yet, sorry!" maxlog=3
  nothing, _ -> (NoT,)
end
5 changes: 3 additions & 2 deletions src/interface.jl
@@ -70,9 +70,10 @@ trainable(x) = functor(x)[1]

_trainable(x) = _trainable(functor(x)[1], trainable(x))
_trainable(ch::NamedTuple, tr::NamedTuple) = merge(map(_ -> nothing, ch), tr)
-_trainable(ch::Tuple, tr::Tuple) = tr
+_trainable(ch::Tuple{Vararg{Any,N}}, tr::Tuple{Vararg{Any,N}}) where N = tr
+_trainable(ch::AbstractArray, tr::AbstractArray) = tr
function _trainable(ch::NamedTuple, tr::Tuple) # for old Flux-style no-names tuple
-  @warn "trainable(x) should now return a NamedTuple with the field names, not a Tuple"
+  @warn "trainable(x) should now return a NamedTuple with the field names, not a Tuple" maxlog=3
  map(c -> c in tr ? c : nothing, ch)
end
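# Illustrative (not part of the diff): a layer with one frozen field should now do
#   Optimisers.trainable(l::MyLayer) = (; weight = l.weight)   # NamedTuple, new style
# rather than the old Flux-style tuple (l.weight,), which the fallback above
# still accepts with a warning.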

166 changes: 166 additions & 0 deletions test/destructure.jl
@@ -0,0 +1,166 @@

m1 = collect(1:3.0)
m2 = (collect(1:3.0), collect(4:6.0))
m3 = (x = m1, y = sin, z = collect(4:6.0))
m4 = (x = m1, y = m1, z = collect(4:6.0)) # tied
m5 = (a = (m3, true), b = (m1, false), c = (m4, true))
m6 = (a = m1, b = [4.0 + im], c = m1)
m7 = TwoThirds((sin, collect(1:3.0)), (cos, collect(4:6.0)), (tan, collect(7:9.0)))
m8 = [Foo(m1, m1), (a = true, b = Foo([4.0], false), c = ()), [[5.0]]]

@testset "flatten & rebuild" begin
@test destructure(m1)[1] isa Vector{Float64}
@test destructure(m1)[1] == 1:3
@test destructure(m2)[1] == 1:6
@test destructure(m3)[1] == 1:6
@test destructure(m4)[1] == 1:6
@test destructure(m5)[1] == vcat(1:6, 4:6)
@test destructure(m6)[1] == vcat(1:3, 4 + im)

@test destructure(m1)[2](7:9) == [7,8,9]
@test destructure(m2)[2](4:9) == ([4,5,6], [7,8,9])
@test destructure(m3)[2](4:9) == (x = [4,5,6], y = sin, z = [7,8,9])
m4′ = destructure(m4)[2](4:9)
@test m4′ == (x = [4,5,6], y = [4,5,6], z = [7,8,9])
@test m4′.x === m4′.y
m5′ = destructure(m5)[2](reverse(1:9))
@test m5′.a[1].x === m5′.b[1]
@test m5′.b[2] === false
m6′ = destructure(m6)[2]((4:7) .+ (1:4) .* im)
@test m6′.a isa Vector{Float64}
@test m6′.a == 4:6
@test m6′.a === m6′.c
@test m6′.b == [7 + 4im]

# struct, trainable
@test destructure(m7)[1] == 1:3
m7′ = destructure(m7)[2]([10,20,30])
@test m7′.a == (sin, [10,20,30])
@test m7′.b == (cos, [4,5,6])
@test m7′.c == (tan, [7,8,9])

@test destructure(m8)[1] == 1:5
m8′ = destructure(m8)[2](1:5)
@test m8′[1].x === m8′[1].y
@test m8′[2].b.y === false
@test m8′[3][1] == [5.0]

# errors
@test_throws Exception destructure(m7)[2]([10,20])
@test_throws Exception destructure(m7)[2]([10,20,30,40])
end

@testset "gradient of flatten" begin
@test gradient(m -> destructure(m)[1][1], m1)[1] == [1,0,0]
@test gradient(m -> destructure(m)[1][2], m2)[1] == ([0,1,0], [0,0,0])
@test gradient(m -> destructure(m)[1][3], (m1, m1))[1] == ([0,0,1], nothing)
@test gradient(m -> destructure(m)[1][1], m3)[1] == (x = [1,0,0], y = nothing, z = [0,0,0])
@test gradient(m -> destructure(m)[1][2], m4)[1] == (x = [0,1,0], y = nothing, z = [0,0,0])

g5 = gradient(m -> destructure(m)[1][3], m5)[1]
@test g5.a[1].x == [0,0,1]
@test g5.a[2] === nothing

g6 = gradient(m -> imag(destructure(m)[1][4]), m6)[1]
@test g6.a == [0,0,0]
@test g6.a isa Vector{Float64}
@test g6.b == [0+im]

g8 = gradient(m -> sum(abs2, destructure(m)[1]), m8)[1]
@test g8[1].x == [2,4,6]
@test g8[2].b.x == [8]
@test g8[3] == [[10.0]]

@testset "second derivative" begin
@test gradient([1,2,3.0]) do v
sum(abs2, gradient(m -> sum(abs2, destructure(m)[1]), (v, [4,5,6.0]))[1][1])
end[1] [8,16,24]
# With Diffractor, non-leaf _grad!(x, dx, off, flat::AbstractVector) gets double-wrapped dx:
# off = (0, 3), dx = Tangent{Tangent{Tuple{Vector{Float64}, Vector{Float64}}, ...
# until you add explicit double-unwrap: base(dx::Tangent{<:Tangent}) = backing(dx).backing
# With Zygote, instead:
# dx = Tangent{Any}(backing = Tangent{Any}([4.0, 8.0, 12.0], ZeroTangent()),)

@test gradient([1,2,3.0]) do v
sum(gradient(m -> sum(destructure(m)[1])^3, (v, [4,5,6.0]))[1][1])
end[1] == [378, 378, 378]

@test_broken gradient([1,2,3.0]) do v
sum(abs2, gradient(m -> sum(abs2, destructure(m)[1]), (x = v, y = sin, z = [4,5,6.0]))[1][1])
end[1] [8,16,24]
# Zygote error in (::typeof(∂(canonicalize)))(Δ::NamedTuple{(:backing,), Tuple{NamedTuple{(:x, :y, :z)
# Diffractor error in perform_optic_transform
end
end

@testset "gradient of rebuild" begin
re1 = destructure(m1)[2]
@test gradient(x -> re1(x)[1], rand(3))[1] == [1,0,0]
re2 = destructure(m2)[2]
@test gradient(x -> re2(x)[1][2], rand(6))[1] == [0,1,0,0,0,0]
re3 = destructure(m3)[2]
@test gradient(x -> re3(x).x[3], rand(6))[1] == [0,0,1,0,0,0]
@test gradient(x -> re3(x).z[1], rand(6))[1] == [0,0,0,1,0,0]

re4 = destructure(m4)[2]
@test gradient(x -> re4(x).x[1], rand(6))[1] == [1,0,0,0,0,0]
@test gradient(x -> re4(x).y[2], rand(6))[1] == [0,1,0,0,0,0]
@test gradient(rand(6)) do x
m = re4(x)
m.x[1] + 2*m.y[2] + 3*m.z[3]
end[1] == [1,2,0, 0,0,3]

re7 = destructure(m7)[2]
@test gradient(x -> re7(x).a[2][3], rand(3))[1] == [0,0,1]
@test gradient(x -> re7(x).b[2][2], rand(3))[1] == [0,0,0]
@test gradient(x -> re7(x).c[2][1], rand(3))[1] == [0,0,0]

v8, re8 = destructure(m8)
@test gradient(x -> sum(abs2, re8(x)[1].y), v8)[1] == [2,4,6,0,0]
@test gradient(x -> only(sum(re8(x)[3]))^2, v8)[1] == [0,0,0,0,10]

@testset "second derivative" begin
@test_broken gradient(collect(1:6.0)) do y
sum(abs2, gradient(x -> sum(abs2, re2(x)[1]), y)[1])
end[1] [8,16,24,0,0,0]
# ERROR: Need an adjoint for constructor ChainRulesCore.Tangent{Any, Tuple{Vector{Float64}, ChainRulesCore.ZeroTangent}}. Gradient is of type Tuple{Vector{Float64}, Vector{Float64}}
# with Zygote, which can be fixed by:
# Zygote.@adjoint Tangent{T,B}(x::Tuple) where {T,B<:Tuple} = Tangent{T,B}(x), dx -> (dx,)

@test_broken gradient(collect(1:6.0)) do y
sum(abs2, gradient(x -> sum(abs2, re3(x).z), y)[1])
end[1] [0,0,0,32,40,48]
# Not fixed by this:
# Zygote.@adjoint Tangent{T,B}(x::NamedTuple) where {T,B<:NamedTuple} = Tangent{T,B}(x), dx -> (dx,)
end
end

@testset "Flux issue 1826" begin
v, re = destructure((x=[1,2.0], y=[3,4,5.0]))
@test gradient(zero(v)) do w
m = re(w)
5 * sum(m.x) + 7 * sum(m[2]) # uses both x and y
end == ([5.0, 5.0, 7.0, 7.0, 7.0],)
# This, using only x, was broken on Flux:
@test gradient(w -> sum(re(w).x), zero(v)) == ([1.0, 1.0, 0.0, 0.0, 0.0],)

sh = [7,7.0];
v, re = destructure((x=sh, y=[3.0,4.0], z=sh)) # shared array in the model
@test v == [7, 7, 3, 4]
@test re([1,10,100,1000]) == (x = [1, 10], y = [100, 1000], z = [1, 10])

@test gradient(zero(v)) do w
m = re(w)
3 * sum(m.x) + 13 * sum(m.z) # no dependence on y, but two distinct gradient arrays
end == ([16, 16, 0, 0],) # Flux gave ([3.0, 3.0, 13.0, 13.0],)

@test gradient(zero(v)) do w
m = re(w)
4(sum(m.x) + sum(m.z)) # now two gradients are ===, so it eliminates one
end == ([8,8,0,0],)

@test gradient(zero(v)) do w
m = re(w)
4(sum(m.x) + sum(m.y)) + 13*sum(m.z) # again two gradients are ===, so it eliminates one
end == ([17,17,4,4],) # Flux gave ([4.0, 4.0, 13.0, 13.0],)
end
