diff --git a/NEWS.md b/NEWS.md index 28b5856a24..863107aa8c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,11 +4,13 @@ See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a compl ## v0.15.0 * Recurrent layers have undergone a complete redesign in [PR 2500](https://github.com/FluxML/Flux.jl/pull/2500). -* `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally. Instead, they now take the previous state as input and return the updated state as output. -* These layers (`RNN`, `LSTM`, `GRU`) now process entire sequences at once, rather than one element at a time. -* The `Recur` wrapper has been deprecated and removed. -* The `reset!` function has also been removed; state management is now entirely up to the user. -* `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing. + * `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing: `rnncell(x_t, h_t) -> h_{t+1}`. + * `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally; it has to be explicitly passed to the layer. Moreover, they now process entire sequences at once, rather than one element at a time: `rnn(x, h) -> h′`. + * The `Recur` wrapper has been deprecated and removed. + * The `reset!` function has also been removed; state management is now entirely up to the user. +* The `Flux.Optimise` module has been deprecated in favor of the Optimisers.jl package. + Now Flux re-exports the optimisers from Optimisers.jl. Most users will be unaffected by this change. + The module is still available for now, but will be removed in a future release. ## v0.14.22 * Data movement between devices is now provided by [MLDataDevices.jl](https://github.com/LuxDL/MLDataDevices.jl). diff --git a/docs/src/guide/models/overview.md b/docs/src/guide/models/overview.md index 71eff0d33f..8bb88833c8 100644 --- a/docs/src/guide/models/overview.md +++ b/docs/src/guide/models/overview.md @@ -95,7 +95,7 @@ Under the hood, the Flux [`Flux.train!`](@ref) function uses *a loss function* a julia> using Flux: train!
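To make the recurrent-layer bullets above concrete, here is a minimal sketch of the explicit-state API they describe; the feature sizes, batch size, and zero initial state are illustrative, and the exact output layout follows the layer docstrings rather than this example.

```julia
using Flux

# One time step with a cell: the state is an explicit argument and return value.
cell = RNNCell(3 => 5)
x_t = rand(Float32, 3, 16)      # features × batch for a single step
h_t = zeros(Float32, 5, 16)     # the user now owns the state (no Recur, no reset!)
h_next = cell(x_t, h_t)         # rnncell(x_t, h_t) -> h_{t+1}

# A whole sequence with the layer: input is features × time × batch.
rnn = RNN(3 => 5)
x = rand(Float32, 3, 10, 16)
h = rnn(x, h_t)                 # rnn(x, h) -> h′, processing all steps at once
```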
julia> opt = Descent() -Descent(0.1) +Descent(0.1f0) julia> data = [(x_train, y_train)] 1-element Vector{Tuple{Matrix{Int64}, Matrix{Int64}}}: diff --git a/src/Flux.jl b/src/Flux.jl index 5dd193ef2b..2804803947 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -12,6 +12,8 @@ using MLUtils const stack = MLUtils.stack # now exported by Base import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions using Optimisers: freeze!, thaw!, adjust!, trainables +@reexport using Optimisers + using Random: default_rng using Zygote, ChainRulesCore using Zygote: Params, @adjoint, gradient, pullback @@ -56,13 +58,8 @@ export Chain, Dense, Embedding, EmbeddingBag, )) include("optimise/Optimise.jl") -using .Optimise -export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, SignDecay, ClipValue, ClipNorm - -export ClipGrad, OptimiserChain # these are const defined in deprecations, for ClipValue, Optimiser +using .Optimise: Optimise +export ClipValue # this is const defined in deprecations, for ClipGrad include("train.jl") using .Train diff --git a/src/deprecations.jl b/src/deprecations.jl index 0d842f786b..57ea3bf72a 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -41,31 +41,40 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error """) train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) = - train!(loss, model, data, _old_to_new(opt); cb) + train!(loss, model, data, __old_to_new(opt); cb) # Next, to use the new `setup` with the still-exported old-style `Adam` etc: import .Train: setup -setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) # ... and allow accidental use of `Optimisers.setup` to do the same: -Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) + + +function __old_to_new(rule) + Base.depwarn("""Optimisers from Flux.Optimise module are deprecated. + Use optimisers from Optimisers.jl instead.""", :__old_to_new) + return _old_to_new(rule) +end for T in [:Descent, :Adam, :Momentum, :Nesterov, :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief, # :InvDecay, :ExpDecay, :SignDecay, ] - @eval function _old_to_new(rule::$T) + @eval function _old_to_new(rule::Optimise.$T) args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T)) Optimisers.$T(args...) end end -_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) -const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too. -_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now -_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields -_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs -const ClipGrad = Optimise.ClipValue -_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred +_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) +# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too. 
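In practice, the deprecation shims above funnel code from the implicit `Params`/`Flux.Optimise` style towards the Optimisers.jl rules that Flux now re-exports. A rough sketch of the intended usage (the model, data shapes, and learning rates are made up for illustration):

```julia
using Flux

model = Dense(4 => 2)
x, y = rand(Float32, 4, 8), rand(Float32, 2, 8)

# New explicit style: rules come from Optimisers.jl and state is set up once.
rule = OptimiserChain(WeightDecay(1f-4), Adam(1f-3))   # replaces Optimise.Optimiser(...)
opt_state = Flux.setup(rule, model)

g = gradient(m -> Flux.mse(m(x), y), model)[1]
Flux.update!(opt_state, model, g)

# Passing an old-style rule, e.g. Flux.Optimise.Adam(1f-3), to Flux.setup still
# works for now, but goes through __old_to_new and emits a deprecation warning.
```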
+const Optimiser = Optimisers.OptimiserChain +_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now +_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields +_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs +# const ClipGrad = Optimise.ClipValue +const ClipValue = Optimisers.ClipGrad +_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred _old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule") @@ -83,8 +92,21 @@ function update!(opt::Optimise.AbstractOptimiser, model, grad) # to accept only arrays. Remove if this causes problems! # update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄) error("""Invalid input to `update!`. - * For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)` - * For the explicit style, `update(state, model, grad)` needs `state = Flux.setup(opt, model)`. + * For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)` + * For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end + +# TODO this friendly error should go in Optimisers.jl. +# remove after https://github.com/FluxML/Optimisers.jl/pull/181 +function update!(opt::Optimisers.AbstractRule, model, grad) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end +function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. """) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 8ba07b95a8..14ed11e319 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ @@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling) (102, 4, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ diff --git a/src/layers/show.jl b/src/layers/show.jl index 67cf49e996..f3fc170ec5 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -104,15 +104,15 @@ function _layer_show(io::IO, layer, indent::Int=0, name=nothing) _str = isnothing(name) ? "" : "$name = " str = _str * _layer_string(io, layer) print(io, " "^indent, str, indent==0 ? "" : ",") - if !isempty(params(layer)) + if !isempty(trainables(layer)) print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer); init=0)), " parameters"; + printstyled(io, "# ", underscorise(sum(length, trainables(layer); init=0)), " parameters"; color=:light_black) - nonparam = _childarray_sum(length, layer) - sum(length, params(layer), init=0) + nonparam = _childarray_sum(length, layer) - sum(length, trainables(layer), init=0) if nonparam > 0 printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? 
" non-trainable" : ""; color=:light_black) end - _nan_show(io, params(layer)) + _nan_show(io, trainables(layer)) end indent==0 || println(io) end @@ -127,7 +127,7 @@ function _layer_string(::IO, a::AbstractArray) end function _big_finale(io::IO, m) - ps = params(m) + ps = trainables(m) if length(ps) > 2 pars = underscorise(sum(length, ps; init=0)) bytes = Base.format_bytes(Base.summarysize(m)) diff --git a/src/outputsize.jl b/src/outputsize.jl index 5d6132d059..c413405048 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -302,8 +302,6 @@ function ChainRulesCore.rrule(::typeof(striplazy), m) striplazy(m), _ -> error("striplazy should never be used within a gradient") end -params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.") - Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") function Base.show(io::IO, l::LazyLayer) diff --git a/test/data.jl b/test/data.jl index b97c4dae80..77ba99d133 100644 --- a/test/data.jl +++ b/test/data.jl @@ -80,18 +80,20 @@ using Random # test interaction with `train!` θ = ones(2) X = zeros(2, 10) - loss(x) = sum((x .- θ).^2) + loss(θ, x) = sum((x .- θ).^2) d = DataLoader(X) - Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) + opt_state = Flux.setup(Descent(0.1), θ) + Flux.train!(loss, θ, ncycle(d, 10), opt_state) @test norm(θ) < 1e-4 # test interaction with `train!` θ = zeros(2) X = ones(2, 10) Y = fill(2, 10) - loss(x, y) = sum((y - x'*θ).^2) + loss(θ, x, y) = sum((y - x'*θ).^2) d = DataLoader((X, Y)) - Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) + opt_state = Flux.setup(Descent(0.1), θ) + Flux.train!(loss, θ, ncycle(d, 10), opt_state) @test norm(θ .- 1) < 1e-10 # specify the rng diff --git a/test/ext_cuda/cuda.jl b/test/ext_cuda/cuda.jl index e4be91cd02..066998f14c 100644 --- a/test/ext_cuda/cuda.jl +++ b/test/ext_cuda/cuda.jl @@ -21,7 +21,7 @@ CUDA.allowscalar(false) m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) cm = gpu(m) - @test all(p isa CuArray for p in Flux.params(cm)) + @test all(p isa CuArray for p in Flux.trainables(cm)) @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} xs = rand(5, 5) diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index 7b7ceb7114..cdb8f003e9 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -110,8 +110,8 @@ end l = cl((2,2), 1=>3, bias = false) |> gpu ip = zeros(Float32, 28,28,1,1) |> gpu @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + gs = gradient(l -> sum(l(ip)), l)[1] + @test gs.bias === nothing end @testset "Dense without bias" begin @@ -119,8 +119,8 @@ end ip = zeros(Float32, 3, 7) |> gpu @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + gs = gradient(l -> sum(l(ip)), l)[1] + @test gs.bias === nothing end @testset "Extended BatchNorm" begin @@ -133,13 +133,13 @@ end μ_cpu = copy(m_cpu.μ) m_cpu(x_cpu) @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test !(m_cpu.μ ≈ μ_cpu) μ_gpu = copy(m_gpu.μ) m_gpu(x_gpu) @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test !(m_gpu.μ ≈ μ_gpu) @test Array(m_gpu.μ) ≈ m_cpu.μ @@ -149,14 +149,14 @@ end μ_cpu = copy(m_cpu.μ) m_cpu(x_cpu) @test m_cpu.μ ≈ μ_cpu - gradient(() -> 
sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test m_cpu.μ ≈ μ_cpu testmode!(m_gpu) μ_gpu = copy(m_gpu.μ) m_gpu(x_gpu) @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test m_gpu.μ ≈ μ_gpu ## In trainmode, always track statistics @@ -165,7 +165,7 @@ end m_cpu(x_cpu) @test !(m_cpu.μ ≈ μ_cpu) μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test !(m_cpu.μ ≈ μ_cpu) trainmode!(m_gpu) @@ -173,44 +173,28 @@ end m_gpu(x_gpu) @test !(m_gpu.μ ≈ μ_gpu) μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) end @testset "Two-streams Bilinear" begin x = zeros(Float32,10,9) |> gpu y = zeros(Float32,2,9) |> gpu b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + @test size(b(x, y)) == (3,9) + @test sum(abs2, b(x, y)) ≈ 0f0 + test_gradients(b |> cpu, x |> cpu, y |> cpu, + test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y))) end @testset "Two-streams Bilinear" begin x = zeros(Float32,10,9) |> gpu y = zeros(Float32,2,9) |> gpu b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + @test size(b(x, y)) == (3,9) + @test sum(abs2, b(x, y)) ≈ 0f0 + test_gradients(b |> cpu, x |> cpu, y |> cpu, + test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y))) end @testset "Parallel" begin @@ -228,15 +212,9 @@ end end @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu layer_cpu = Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] - end + test_gradients(layer_cpu, randn(2, 2, 2, 2), + test_gpu=true, compare_finite_diff=false, loss=(m, x) -> mean(abs2, m(x))) end end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 8e33340611..c95c8c8288 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -42,11 +42,11 @@ using Flux: activations @testset "Activations" begin c = Chain(Dense(3,5,relu), Dense(5,1,relu)) X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c) c2 = Chain(enc = c[1], dec = c[2]) @test Flux.activations(c, X) == Flux.activations(c2, X) - 
@test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c2) end @testset "Dense" begin @@ -158,9 +158,9 @@ using Flux: activations @test mo(input) == target end - @testset "params" begin + @testset "trainables" begin mo = Maxout(()->Dense(32, 64), 4) - ps = Flux.params(mo) + ps = Flux.trainables(mo) @test length(ps) == 8 #4 alts, each with weight and bias end end @@ -198,7 +198,7 @@ using Flux: activations x = randn(Float32,11,7) b = Flux.Bilinear(11, 11, 3) @test size(b(x)) == (3,7) - @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + test_gradients(b, x) end @testset "constructors" begin @@ -447,16 +447,15 @@ end @testset "gradients of Chain{Vector}" begin m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) m1v = Chain([m1[1], m1[2]]) - @test sum(length, params(m1)) == sum(length, params(m1v)) + @test sum(length, Flux.trainables(m1)) == sum(length, Flux.trainables(m1v)) x1 = randn(Float32,3,5) @test m1(x1) ≈ m1v(x1) y1 = rand(Bool,2,5) - g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) - g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) - @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] - @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] + g1 = gradient(m1 -> Flux.logitcrossentropy(m1(x1), y1), m1)[1] + g1v = gradient(m1v -> Flux.logitcrossentropy(m1v(x1), y1), m1v)[1] + check_equal_leaves(g1, g1v) @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] z1 = rand(22); diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 2e75a1e39d..8780fef957 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -43,28 +43,30 @@ end @test sum(op) == prod(size(op)) @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) - bias = Conv((2,2), 1=>3, bias = false) |> lmap - op = bias(ip) + model = Conv((2,2), 1=>3, bias = false) |> lmap + op = model(ip) @test sum(op) ≈ 0.f0 - gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) - @test bias.bias ∉ gs.params + g = gradient(m -> sum(m(ip)), model)[1] + @test g.bias isa Nothing end - # Train w/o bias and make sure no convergence happens - # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = false); - ip = zeros(Float32, 28,28,1,1) - op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() - - for _ = 1:10^3 - gs = gradient(Flux.params(bias)) do - Flux.Losses.mse(bias(ip), op) + @testset "no bias train" begin + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + model = Conv((2, 2), 1=>3, bias = false); + ip = zeros(Float32, 28,28,1,1) + op = zeros(Float32, 27,27,3,1) .+ 2.f0 + opt_state = Flux.setup(Descent(), model) + + for _ = 1:10^3 + g = gradient(model) do m + Flux.mse(m(ip), op) + end[1] + Flux.update!(opt_state, model, g) end - Flux.Optimise.update!(opt, params(bias), gs) - end - @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 + @test Flux.Losses.mse(model(ip), op) ≈ 4.f0 + end @testset "Grouped Conv" begin ip = rand(Float32, 28, 100, 2) @@ -164,11 +166,11 @@ end m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] x = zeros(Float32, 5, 5, 2, 4) m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] # test ConvTranspose supports groups argument x = randn(Float32, 10, 10, 2, 3) @@ -178,7 +180,7 @@ end m2 = ConvTranspose((3,3), 2=>4, 
groups=2, pad=SamePad()) @test size(m2.weight) == (3,3,2,2) @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m2)[1] x = randn(Float32, 10, 2,1) m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index be7c5dec92..f678297eaa 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -129,7 +129,7 @@ end 2.0 4.0 6.0] @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -211,9 +211,9 @@ end @inferred m(x) end - @test length(Flux.params(BatchNorm(10))) == 2 - @test length(Flux.params(BatchNorm(10, affine=true))) == 2 - @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + @test length(Flux.trainables(BatchNorm(10))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=true))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=false))) == 0 @test BatchNorm(5; active=true).active === true @test_throws Exception BatchNorm(5; active=:something_else) @@ -224,7 +224,7 @@ end let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -287,7 +287,7 @@ end x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float64.(x) y = m(x) μ = mean(x, dims=1) @@ -300,7 +300,7 @@ end let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 x = Float64.(x) y = m(x) @@ -345,9 +345,9 @@ end @inferred m(x) end - @test length(Flux.params(InstanceNorm(10))) == 0 - @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 - @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + @test length(Flux.trainables(InstanceNorm(10))) == 0 + @test length(Flux.trainables(InstanceNorm(10, affine=true))) == 2 + @test length(Flux.trainables(InstanceNorm(10, affine=false))) == 0 @test InstanceNorm(5; active=true).active === true @test_throws Exception InstanceNorm(5; active=:something_else) @@ -370,10 +370,10 @@ end m = LayerNorm((2,3,4)) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 m = LayerNorm((2,3,4), affine=false) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 end @testset "GroupNorm" begin @@ -383,7 +383,7 @@ end let m = GroupNorm(4,2), sizes = (3,4,2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) diff --git a/test/optimise.jl b/test/optimise.jl deleted file mode 100644 index c63ba85727..0000000000 --- a/test/optimise.jl +++ /dev/null @@ -1,222 +0,0 @@ -using Flux.Optimise -using Flux.Optimise: runall -using Flux: Params, gradient -import FillArrays, ComponentArrays -import Optimisers -using Test -using Random - -@testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - 
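The `test/optimise.jl` file removed here exercised the implicit `Params`/`Optimise.update!` interface. Written against the explicit API that the rest of this PR adopts, the same kind of convergence check looks roughly like this (a sketch only, not part of the test suite):

```julia
using Flux, Random

Random.seed!(84)
w  = randn(10, 10)                      # fixed target map
w′ = randn(10, 10)                      # array being optimised
loss(w′, x) = Flux.mse(w * x, w′ * x)

opt_state = Flux.setup(Adam(), w′)      # explicit state instead of Params([w′])
for _ in 1:10^5
    x = rand(10)
    g = gradient(w′ -> loss(w′, x), w′)[1]
    Flux.update!(opt_state, w′, g)      # mutates w′ in place
end
loss(w′, rand(10, 10)) < 0.01           # the bound the old test asserted
```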
Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay, SignDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Training Loop" begin - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - Flux.train!(loss, Flux.params(r), (r,), Descent()) -end - -@testset "Stop on NaN" begin - m = Dense(1 => 1) - m.weight .= 0 - CNT = 0 - @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i - CNT += 1 - (i == 51 ? NaN32 : 1f0) * sum(m([1.0])) - end - @test CNT == 51 # stopped early - @test m.weight[1] ≈ -5 # did not corrupt weights -end - -@testset "ExpDecay" begin - - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) - end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 - end - end - @test flag == 1 - # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. 
- end - @test decay_steps == ground_truth - @test o.eta == o.clip -end - -@testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 -end - -@testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 -end - -@testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> w.a + sum(w.c.b), w)[1] - Flux.update!(opt_state, w, gs) - @test w.a ≈ wold.a - 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> sum(w), w)[1] - Flux.update!(opt_state, w, gs) - @test w ≈ wold .- 0.1 -end - -# Flux PR #1776 -# We need to test that optimisers like Adam that maintain an internal momentum -# estimate properly calculate the second-order statistics on the gradients as -# the flow backward through the model. Previously, we would calculate second- -# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which -# wreaks all sorts of havoc on our training loops. 
This test ensures that -# a simple optimization is montonically decreasing (up to learning step effects) -@testset "Momentum Optimisers and complex values" begin - # Test every optimiser that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end - - params = Flux.Params([w]) - opt = opt_ctor(1e-2) - - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) - end - end -end diff --git a/test/outputsize.jl b/test/outputsize.jl index 55cb823c5c..fe217c0fc9 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -248,7 +248,7 @@ end @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" @test Flux.striplazy(ld) isa Dense - @test_throws Exception Flux.params(lm) + @test_throws Exception Flux.trainables(lm) @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) diff --git a/test/runtests.jl b/test/runtests.jl index ff6660be14..6f5a2e7d84 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,5 @@ using Flux using Flux: OneHotArray, OneHotMatrix, OneHotVector -using Flux: params using Test using Random, Statistics, LinearAlgebra using IterTools: ncycle @@ -32,8 +31,7 @@ Random.seed!(0) include("loading.jl") end - @testset "Optimise / Train" begin - include("optimise.jl") + @testset "Train" begin include("train.jl") include("tracker.jl") end diff --git a/test/test_utils.jl b/test/test_utils.jl index 25a4f1af47..c736943f1c 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -19,7 +19,6 @@ function finitediff_withgradient(f, x...) return y, FiniteDifferences.grad(fdm, f, x...) end - function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4) fmapstructure_with_path(a, b) do kp, x, y if x isa AbstractArray @@ -30,7 +29,6 @@ function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4) end end - function test_gradients( f, xs...; diff --git a/test/train.jl b/test/train.jl index 5a1fd0592e..a021b6f22a 100644 --- a/test/train.jl +++ b/test/train.jl @@ -90,38 +90,28 @@ end @testset "Explicit Flux.update! 
features" begin m = Chain(Dense(2=>3, tanh), Dense(3=>1), only) - x = rand(2) + x = rand(Float32, 2) y1 = m(x) # before - # Implicit gradient - gold = Zygote.gradient(() -> m(x), Flux.params(m)) - @test gold isa Flux.Zygote.Grads - @test_throws ErrorException Flux.update!(Flux.Adam(), m, gold) # friendly - Flux.update!(Flux.Adam(), Flux.params(m), gold) - y2 = m(x) - @test y2 < y1 - # Explicit gradient gs = Zygote.gradient(marg -> marg(x), m) @test gs isa Tuple - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs) # friendly - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs[1]) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs[1]) # friendly s = Flux.setup(Adam(), m) @info "ignore this warning, just testing an upgrade path:" Flux.update!(s, m, gs) # Chain + Tuple can be unambiguously sorted out + y2 = m(x) + @test y2 < y1 + Flux.update!(s, m, gs[1]) # finally, this is the correct thing y3 = m(x) @test y3 < y2 - Flux.update!(s, m, gs[1]) # finally, this is the correct thing - y4 = m(x) - @test y4 < y3 # Also check that if you import the new Adam, then Flux.setup does still work! s2 = Flux.setup(Optimisers.Adam(), m) Flux.update!(s2, m, gs[1]) - y5 = m(x) - @test y5 < y4 + y4 = m(x) + @test y4 < y3 end for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) @@ -147,25 +137,21 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) end diff1 = model.weight .- init_weight - # Take 2: the same, but with Flux.params. Was broken for a bit, no tests! - # skipping this test for Enzyme cause implicit params is unsupported - if name == "Zygote" - model.weight .= init_weight - model.bias .= 0 - pen2(x::AbstractArray) = sum(abs2, x)/2 - opt = Flux.setup(Adam(0.1), model) - - trainfn!(model, data, opt) do m, x, y - err = Flux.mse(m(x), y) - l2 = sum(pen2, Flux.params(m)) - err + 0.33 * l2 - end - - diff2 = model.weight .- init_weight - @test diff1 ≈ diff2 + # Take 2: the same, but with Optimisers.trainables. + model.weight .= init_weight + model.bias .= 0 + pen2(x::AbstractArray) = sum(abs2, x)/2 + opt = Flux.setup(Adam(0.1), model) + trainfn!(model, data, opt) do m, x, y + err = Flux.mse(m(x), y) + l2 = sum(pen2, Flux.trainables(m)) + err + 0.33 * l2 end + diff2 = model.weight .- init_weight + @test diff1 ≈ diff2 + # Take 3: using WeightDecay instead. Need the /2 above, to match exactly. model.weight .= init_weight model.bias .= 0 diff --git a/test/utils.jl b/test/utils.jl index dc46d68255..6b0a16bcf3 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -2,7 +2,7 @@ using Flux using Flux: throttle, nfan, glorot_uniform, glorot_normal, kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, lecun_normal, sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadmodel! + unsqueeze, loadmodel! using MLUtils using Statistics, LinearAlgebra using Random @@ -255,38 +255,32 @@ end end end -@testset "Params" begin +@testset "Trainables" begin m = Dense(10 => 5) - @test size.(params(m)) == [(5, 10), (5,)] + @test size.(Flux.trainables(m)) == [(5, 10), (5,)] m = RNN(10 => 5) - @test size.(params(m)) == [(5, 10), (5, 5), (5,)] + @test size.(Flux.trainables(m)) == [(5, 10), (5, 5), (5,)] # Layer duplicated in same chain, params just once pls. c = Chain(m, m) - @test size.(params(c)) == [(5, 10), (5, 5), (5,)] + @test size.(Flux.trainables(c)) == [(5, 10), (5, 5), (5,)] # Self-referential array. 
Just want params, no stack overflow pls. r = Any[nothing,m] r[1] = r - @test size.(params(r)) == [(5, 10), (5, 5), (5,)] + @test_broken size.(Flux.trainables(r)) == [(5, 10), (5, 5), (5,)] # Ensure functor explores inside Transpose but not SubArray m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi)) - @test size.(Flux.params(m)) == [(2,), (1, 2)] + @test size.(Flux.trainables(m)) == [(2,), (1, 2)] end -@testset "params gradient" begin +@testset "trainables gradient" begin m = (x=[1,2.0], y=[3.0]); - # Explicit -- was broken by #2054 - gnew = gradient(m -> (sum(norm, Flux.params(m))), m)[1] + gnew = gradient(m -> (sum(norm, Flux.trainables(m))), m)[1] @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159] @test gnew.y ≈ [1.0] - - # Implicit - gold = gradient(() -> (sum(norm, Flux.params(m))), Flux.params(m)) - @test gold[m.x] ≈ [0.4472135954999579, 0.8944271909999159] - @test gold[m.y] ≈ [1.0] end @testset "Precision" begin @@ -345,28 +339,12 @@ end o = ones(s) z = zeros(s) - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) - g = gfun(o, z) - @test gfun(o, false) == (g[1], nothing) - - g = gfun(z, o) - @test gfun(false, o) == (nothing, g[2]) - end - - @testset "Implicit" begin - gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) - g = gfun(o, z) - - gres = gfun(o, false) - @test gres[o] == g[o] - @test false ∉ gres.params + gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) + g = gfun(o, z) + @test gfun(o, false) == (g[1], nothing) - g = gfun(z, o) - gres = gfun(false, o) - @test gres[o] == g[o] - @test false ∉ gres.params - end + g = gfun(z, o) + @test gfun(false, o) == (nothing, g[2]) end end @@ -566,10 +544,10 @@ end @testset "Shared parameters" begin mat = [1 2; 3 4.0] simple = ((nothing, mat, (3, mat, 4))) - @test length(Flux.params(simple)) == 1 + @test length(Flux.trainables(simple)) == 1 oneadj = (nt = (m = mat, a = mat')) - @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 + @test length(Flux.trainables(oneadj)) == 1 # needs Functors@0.3 @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] end @@ -631,13 +609,13 @@ end model = Model(d, d) # Works - g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) + g1 = Flux.gradient(m -> sum(m(x)), model)[1] p, re = Flux.destructure(model) # Fails - g2 = Flux.gradient(p -> sum(re(p)(x)), p) + g2 = Flux.gradient(p -> sum(re(p)(x)), p)[1] - @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) + @test g2 ≈ vcat(g1.a.weight + g1.b.weight, g1.a.bias + g1.b.bias) end @testset "issue 1826" begin
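The `Flux.params` → `Flux.trainables` substitution running through these tests amounts to the following; a small sketch with arbitrary layer sizes:

```julia
using Flux

m = Chain(Dense(10 => 5, relu), Dense(5 => 2))

# trainables returns the trainable arrays, deduplicating shared parameters,
# much like the old Flux.params but without the implicit Params machinery.
ps = Flux.trainables(m)
size.(ps)                                # [(5, 10), (5,), (2, 5), (2,)]

# It also composes with explicit gradients, e.g. for an L2 penalty:
penalty(m) = sum(p -> sum(abs2, p), Flux.trainables(m)) / 2
g = gradient(m -> sum(abs2, m(rand(Float32, 10, 3))) + penalty(m), m)[1]
```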