diff --git a/src/deprecations.jl b/src/deprecations.jl
index 6148894dbe..0b146f7c05 100644
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@@ -41,13 +41,20 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error
   """)
 
 train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
-  train!(loss, model, data, _old_to_new(opt); cb)
+  train!(loss, model, data, __old_to_new(opt); cb)
 
 # Next, to use the new `setup` with the still-exported old-style `Adam` etc:
 import .Train: setup
-setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
+setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
 # ... and allow accidental use of `Optimisers.setup` to do the same:
-Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
+Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
+
+
+function __old_to_new(rule)
+  Base.depwarn("""Optimisers from  Flux.Optimise module are deprecated. 
+                   Use optimisers from Optimisers.jl instead.""", :__old_to_new)
+  return _old_to_new(rule)
+end
 
 for T in [:Descent, :Adam, :Momentum, :Nesterov,
    	      :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index 8ba07b95a8..14ed11e319 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ)  # 65 parameters
 julia> layer(randn(100, 4, 64)) |> size
 (98, 5, 64)
 
-julia> Flux.params(layer) |> length
+julia> Flux.trainables(layer) |> length
 2
 ```
 """
@@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ)  # 64 parameters
 julia> layer(randn(100, 5, 64)) |> size  # transposed convolution will increase the dimension size (upsampling)
 (102, 4, 64)
 
-julia> Flux.params(layer) |> length
+julia> Flux.trainables(layer) |> length
 2
 ```
 """
diff --git a/test/ext_cuda/cuda.jl b/test/ext_cuda/cuda.jl
index 709cef7aef..4baaa47aea 100644
--- a/test/ext_cuda/cuda.jl
+++ b/test/ext_cuda/cuda.jl
@@ -19,7 +19,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray
   m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
   cm = gpu(m)
 
-  @test all(p isa CuArray for p in Flux.params(cm))
+  @test all(p isa CuArray for p in Flux.trainables(cm))
   @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
 
   xs = rand(5, 5)
diff --git a/test/layers/basic.jl b/test/layers/basic.jl
index 95da13f0c9..e4f8b23ea9 100644
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@@ -40,11 +40,11 @@ using Flux: activations
   @testset "Activations" begin
     c = Chain(Dense(3,5,relu), Dense(5,1,relu))
     X = Float32.([1.0; 1.0; 1.0])
-    @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c))
+    @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c)
 
     c2 = Chain(enc = c[1], dec = c[2])
     @test Flux.activations(c, X) == Flux.activations(c2, X)
-    @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2))
+    @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c2)
   end
 
   @testset "Dense" begin
@@ -156,9 +156,9 @@ using Flux: activations
       @test mo(input) == target
     end
 
-    @testset "params" begin
+    @testset "trainables" begin
       mo = Maxout(()->Dense(32, 64), 4)
-      ps = Flux.params(mo)
+      ps = Flux.trainables(mo)
       @test length(ps) == 8  #4 alts, each with weight and bias
     end
   end
diff --git a/test/layers/conv.jl b/test/layers/conv.jl
index 2e75a1e39d..8780fef957 100644
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@@ -43,28 +43,30 @@ end
   @test sum(op) == prod(size(op))
 
   @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32)
-    bias = Conv((2,2), 1=>3, bias = false) |> lmap
-    op = bias(ip)
+    model = Conv((2,2), 1=>3, bias = false) |> lmap
+    op = model(ip)
     @test sum(op) ≈ 0.f0
-    gs = gradient(() -> sum(bias(ip)), Flux.params(bias))
-    @test bias.bias ∉ gs.params
+    g = gradient(m -> sum(m(ip)), model)[1]
+    @test g.bias isa Nothing
   end
 
-  # Train w/o bias and make sure no convergence happens
-  # when only bias can be converged
-  bias = Conv((2, 2), 1=>3, bias = false);
-  ip = zeros(Float32, 28,28,1,1)
-  op = zeros(Float32, 27,27,3,1) .+ 2.f0
-  opt = Descent()
-
-  for _ = 1:10^3
-    gs = gradient(Flux.params(bias)) do
-      Flux.Losses.mse(bias(ip), op)
+  @testset "no bias train" begin
+    # Train w/o bias and make sure no convergence happens
+    # when only bias can be converged
+    model = Conv((2, 2), 1=>3, bias = false);
+    ip = zeros(Float32, 28,28,1,1)
+    op = zeros(Float32, 27,27,3,1) .+ 2.f0
+    opt_state = Flux.setup(Descent(), model)
+
+    for _ = 1:10^3
+      g = gradient(model) do m
+        Flux.mse(m(ip), op)
+      end[1]
+      Flux.update!(opt_state, model, g)
     end
-    Flux.Optimise.update!(opt, params(bias), gs)
-  end
 
-  @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0
+    @test Flux.Losses.mse(model(ip), op) ≈ 4.f0
+  end
 
   @testset "Grouped Conv" begin
     ip = rand(Float32, 28, 100, 2)
@@ -164,11 +166,11 @@ end
 
   m = ConvTranspose((3,3), 1=>1)
   # Test that the gradient call does not throw: #900
-  @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads
+  g = gradient(m -> sum(m(x)), m)[1]
 
   x = zeros(Float32, 5, 5, 2, 4)
   m = ConvTranspose((3,3), 2=>3)
-  @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads
+  g = gradient(m -> sum(m(x)), m)[1]
 
   # test ConvTranspose supports groups argument
   x = randn(Float32, 10, 10, 2, 3)
@@ -178,7 +180,7 @@ end
   m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad())
   @test size(m2.weight) == (3,3,2,2)
   @test size(m1(x)) == size(m2(x))
-  @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads
+  g = gradient(m -> sum(m(x)), m2)[1]
 
   x = randn(Float32, 10, 2,1)
   m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2)
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index be7c5dec92..f678297eaa 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -129,7 +129,7 @@ end
                              2.0 4.0 6.0]
 
     @test Flux.hasaffine(m) == true
-    @test length(Flux.params(m)) == 2
+    @test length(Flux.trainables(m)) == 2
 
     @test m.β == [0, 0]  # initβ(2)
     @test m.γ == [1, 1]  # initγ(2)
@@ -211,9 +211,9 @@ end
     @inferred m(x)
   end
 
-  @test length(Flux.params(BatchNorm(10))) == 2
-  @test length(Flux.params(BatchNorm(10, affine=true))) == 2
-  @test length(Flux.params(BatchNorm(10, affine=false))) == 0
+  @test length(Flux.trainables(BatchNorm(10))) == 2
+  @test length(Flux.trainables(BatchNorm(10, affine=true))) == 2
+  @test length(Flux.trainables(BatchNorm(10, affine=false))) == 0
 
   @test BatchNorm(5; active=true).active === true
   @test_throws Exception BatchNorm(5; active=:something_else)
@@ -224,7 +224,7 @@ end
   let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2),
         x = reshape(collect(1:prod(sizes)), sizes)
 
-      @test length(Flux.params(m)) == 2
+      @test length(Flux.trainables(m)) == 2
       x = Float32.(x)
       @test m.β == [0, 0]  # initβ(2)
       @test m.γ == [1, 1]  # initγ(2)
@@ -287,7 +287,7 @@ end
       x = reshape(collect(1:prod(sizes)), sizes)
 
     @test Flux.hasaffine(m) == true
-    @test length(Flux.params(m)) == 2
+    @test length(Flux.trainables(m)) == 2
     x = Float64.(x)
     y = m(x)
     μ = mean(x, dims=1)
@@ -300,7 +300,7 @@ end
   let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2),
       x = reshape(collect(1:prod(sizes)), sizes)
     @test Flux.hasaffine(m) == false
-    @test length(Flux.params(m)) == 0
+    @test length(Flux.trainables(m)) == 0
 
     x = Float64.(x)
     y = m(x)
@@ -345,9 +345,9 @@ end
     @inferred m(x)
   end
 
-  @test length(Flux.params(InstanceNorm(10))) == 0
-  @test length(Flux.params(InstanceNorm(10, affine=true))) == 2
-  @test length(Flux.params(InstanceNorm(10, affine=false))) == 0
+  @test length(Flux.trainables(InstanceNorm(10))) == 0
+  @test length(Flux.trainables(InstanceNorm(10, affine=true))) == 2
+  @test length(Flux.trainables(InstanceNorm(10, affine=false))) == 0
 
   @test InstanceNorm(5; active=true).active === true
   @test_throws Exception InstanceNorm(5; active=:something_else)
@@ -370,10 +370,10 @@ end
 
   m = LayerNorm((2,3,4))
   @test Flux.hasaffine(m) == true
-  @test length(Flux.params(m)) == 2
+  @test length(Flux.trainables(m)) == 2
   m = LayerNorm((2,3,4), affine=false)
   @test Flux.hasaffine(m) == false
-  @test length(Flux.params(m)) == 0
+  @test length(Flux.trainables(m)) == 0
 end
 
 @testset "GroupNorm" begin
@@ -383,7 +383,7 @@ end
   let m = GroupNorm(4,2), sizes = (3,4,2),
         x = reshape(collect(1:prod(sizes)), sizes)
 
-      @test length(Flux.params(m)) == 2
+      @test length(Flux.trainables(m)) == 2
       x = Float32.(x)
       @test m.β == [0, 0, 0, 0]  # initβ(32)
       @test m.γ == [1, 1, 1, 1]  # initγ(32)
diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl
index 7df8b0d4c2..35a3b0211f 100644
--- a/test/layers/recurrent.jl
+++ b/test/layers/recurrent.jl
@@ -1,36 +1,5 @@
 using LinearAlgebra
 
-@testset "RNN gradients-implicit" begin
-    layer = Flux.Recur(Flux.RNNCell(1, 1, identity))
-    layer.cell.Wi .= 5.0
-    layer.cell.Wh .= 4.0
-    layer.cell.b .= 0.0f0
-    layer.cell.state0 .= 7.0
-    x = [[2.0f0], [3.0f0]]
-
-    # theoretical primal gradients
-    primal =
-        layer.cell.Wh .* (layer.cell.Wh * layer.cell.state0 .+ x[1] .* layer.cell.Wi) .+
-        x[2] .* layer.cell.Wi
-    ∇Wi = x[1] .* layer.cell.Wh .+ x[2]
-    ∇Wh = 2 .* layer.cell.Wh .* layer.cell.state0 .+ x[1] .* layer.cell.Wi
-    ∇b = layer.cell.Wh .+ 1
-    ∇state0 = layer.cell.Wh .^ 2
-
-    Flux.reset!(layer)
-    ps = Flux.params(layer)
-    e, g = Flux.withgradient(ps) do
-        out = [layer(xi) for xi in x]
-        sum(out[2])
-    end
-
-    @test primal[1] ≈ e
-    @test ∇Wi ≈ g[ps[1]]
-    @test ∇Wh ≈ g[ps[2]]
-    @test ∇b ≈ g[ps[3]]
-    @test ∇state0 ≈ g[ps[4]]
-
-end
 
 @testset "RNN gradients-explicit" begin
     layer = Flux.Recur(Flux.RNNCell(1, 1, identity))
@@ -70,9 +39,9 @@ end
   for r ∈ [RNN,]
     rnn = r(2 => 3)
     Flux.reset!(rnn)
-    grads_seq = gradient(Flux.params(rnn)) do
+    grads_seq = gradient(rnn) do rnn
         sum([rnn(s) for s in seq][3])
-    end
+    end[1]
     Flux.reset!(rnn);
     bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh *
                                   tanh.(rnn.cell.Wi * seq[2] + Wh *
@@ -82,7 +51,7 @@ end
                                   + rnn.cell.b)
                             + rnn.cell.b)),
                     rnn.cell.Wh)
-    @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
+    @test_broken grads_seq.cell.Wh ≈ bptt[1]
   end
 end
 
@@ -92,9 +61,9 @@ end
   for r ∈ [RNN,]
     rnn = r(2 => 3)
     Flux.reset!(rnn)
-    grads_seq = gradient(Flux.params(rnn)) do
+    grads_seq = gradient(rnn) do rnn
         sum([rnn(s) for s in seq][3])
-    end
+    end[1]
     Flux.reset!(rnn);
     bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh *
                                   tanh.(rnn.cell.Wi * seq[2] + Wh *
@@ -104,7 +73,7 @@ end
                                   + rnn.cell.b)
                             + rnn.cell.b)),
                     rnn.cell.Wh)
-    @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
+    @test_broken grads_seq.cell.Wh ≈ bptt[1]
   end
 end
 
@@ -112,9 +81,9 @@ end
   seq = rand(Float32, (2, 1, 3))
   rnn = RNN(2 => 3)
   Flux.reset!(rnn)
-  grads_seq = gradient(Flux.params(rnn)) do
+  grads_seq = gradient(rnn) do rnn
     sum(rnn(seq)[:, :, 3])
-  end
+  end[1]
   Flux.reset!(rnn);
   bptt = gradient(rnn.cell.Wh) do Wh
     # calculate state 1
@@ -131,7 +100,7 @@ end
                rnn.cell.b)
     sum(s3) # loss is sum of state 3
   end
-  @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
+  @test_broken grads_seq.cell.Wh ≈ bptt[1]
 end
 
 @testset "RNN-shapes" begin
diff --git a/test/optimise.jl b/test/optimise.jl
deleted file mode 100644
index c63ba85727..0000000000
--- a/test/optimise.jl
+++ /dev/null
@@ -1,222 +0,0 @@
-using Flux.Optimise
-using Flux.Optimise: runall
-using Flux: Params, gradient
-import FillArrays, ComponentArrays
-import Optimisers
-using Test
-using Random
-
-@testset "Optimise" begin
-  # Ensure rng has different state inside and outside the inner @testset
-  # so that w and w' are different
-  Random.seed!(84)
-  w = randn(10, 10)
-  @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(),
-                       NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(),
-                       Nesterov(), RMSProp(), Momentum()]
-    Random.seed!(42)
-    w′ = randn(10, 10)
-    b = false
-    loss(x) = Flux.Losses.mse(w*x, w′*x .+ b)
-    for t = 1: 10^5
-      θ = params([w′, b])
-      x = rand(10)
-      θ̄ = gradient(() -> loss(x), θ)
-      Optimise.update!(opt, θ, θ̄)
-    end
-    @test loss(rand(10, 10)) < 0.01
-  end
-end
-
-@testset "Optimiser" begin
-  Random.seed!(84)
-  w = randn(10, 10)
-  @testset for Opt in [InvDecay, WeightDecay, ExpDecay, SignDecay]
-    Random.seed!(42)
-    w′ = randn(10, 10)
-    loss(x) = Flux.Losses.mse(w*x, w′*x)
-    opt = Optimiser(Opt(), Adam(0.001))
-    for t = 1:10^5
-      θ = Params([w′])
-      x = rand(10)
-      θ̄ = gradient(() -> loss(x), θ)
-      Optimise.update!(opt, θ, θ̄)
-    end
-    @test loss(rand(10, 10)) < 0.01
-  end
-end
-
-@testset "Training Loop" begin
-
-  # Test multiple callbacks
-  x = 0
-  fs = [() -> (), () -> x = 1]
-  cbs = runall(fs)
-  cbs()
-  @test x == 1
-
-  r = rand(3, 3)
-  loss(x) = sum(x .* x)
-  Flux.train!(loss, Flux.params(r), (r,), Descent())
-end
-
-@testset "Stop on NaN" begin
-  m = Dense(1 => 1)
-  m.weight .= 0
-  CNT = 0
-  @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i
-    CNT += 1
-    (i == 51 ? NaN32 : 1f0) * sum(m([1.0]))
-  end
-  @test CNT == 51  # stopped early
-  @test m.weight[1] ≈ -5  # did not corrupt weights
-end
-
-@testset "ExpDecay" begin
-
-  @testset "Sanity Check" begin
-    o = ExpDecay(0.2, 0.5, 1, 1e-3)
-    p = [0.0]
-    steps = 1:8
-    eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip)
-    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
-    @test eta_actual == eta_expected
-  end
-
-  @testset "starting step" begin
-    start = 4
-    o = ExpDecay(0.2, 0.5, 1, 1e-3, start)
-    p = [0.0]
-    steps = 1:8
-    eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip)
-    eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps]
-    @test eta_actual == eta_expected
-  end
-
-  w = randn(10, 10)
-  o = ExpDecay(0.1, 0.1, 1000, 1e-4)
-  w1 = randn(10,10)
-  loss(x) = Flux.Losses.mse(w*x, w1*x)
-  flag = 1
-  decay_steps = []
-  for t = 1:10^5
-    prev_eta = o.eta
-    θ = Params([w1])
-    x = rand(10)
-    θ̄ = gradient(() -> loss(x), θ)
-    prev_grad = collect(θ̄[w1])
-    delta = Optimise.apply!(o, w1, θ̄[w1])
-    w1 .-= delta
-    new_eta = o.eta
-    if new_eta != prev_eta
-      push!(decay_steps, t)
-    end
-    array = fill(o.eta, size(prev_grad))
-    if array .* prev_grad != delta
-      flag = 0
-    end
-  end
-  @test flag == 1
-  # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
-  ground_truth = []
-  for i in 1:4
-    push!(ground_truth, 1000*i)  # Expected decay steps for this example.
-  end
-  @test decay_steps == ground_truth
-  @test o.eta == o.clip
-end
-
-@testset "Clipping" begin
-  w = randn(10, 10)
-  loss(x) = sum(w * x)
-  θ = Params([w])
-  x = 1000 * randn(10)
-  w̄ = gradient(() -> loss(x), θ)[w]
-  w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄))
-  @test all(w̄_value .<= 1)
-  w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄))
-  @test norm(w̄_norm) <= 1
-end
-
-@testset "update!: handle Fills from Zygote" begin
-  w = randn(10,10)
-  wold = copy(w)
-  g = FillArrays.Ones(size(w))
-  opt = Descent(0.1)
-  Flux.update!(opt, w, g)
-  @test w ≈ wold .- 0.1 
-
-  w = randn(3)
-  wold = copy(w)
-  θ = Flux.params([w])
-  gs = gradient(() -> w[1], θ)
-  opt = Descent(0.1)
-  Flux.update!(opt, θ, gs)
-  @test w[1] ≈ wold[1] .- 0.1
-  @test w[2:3] ≈ wold[2:3] 
-
-  ## Issue #1510
-  w = randn(10,10)
-  wold = copy(w)
-  θ = Flux.params([w])
-  gs = gradient(() -> sum(w), θ)
-  opt = Descent(0.1)
-  Flux.update!(opt, θ, gs)
-  @test w ≈ wold .- 0.1 
-end
-
-@testset "update!: handle ComponentArrays" begin
-  w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2]))
-  wold = deepcopy(w)
-  opt_state = Optimisers.setup(Optimisers.Descent(0.1), w)
-  gs = gradient(w -> w.a + sum(w.c.b), w)[1]
-  Flux.update!(opt_state, w, gs)
-  @test w.a ≈ wold.a - 0.1
-  @test w.b ≈ wold.b
-  @test w.c.b ≈ wold.c.b .- 0.1
-  @test w.c.a ≈ wold.c.a
-
-  w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2]))
-  wold = deepcopy(w)
-  opt_state = Optimisers.setup(Optimisers.Descent(0.1), w)
-  gs = gradient(w -> sum(w), w)[1]
-  Flux.update!(opt_state, w, gs)
-  @test w ≈ wold .- 0.1
-end
-
-# Flux PR #1776
-# We need to test that optimisers like Adam that maintain an internal momentum
-# estimate properly calculate the second-order statistics on the gradients as
-# the flow backward through the model.  Previously, we would calculate second-
-# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which
-# wreaks all sorts of havoc on our training loops.  This test ensures that
-# a simple optimization is montonically decreasing (up to learning step effects)
-@testset "Momentum Optimisers and complex values" begin
-  # Test every optimiser that has momentum internally
-  for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief]
-    # Our "model" is just a complex number
-    w = zeros(ComplexF32, 1)
-
-    # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x`
-    function loss()
-        # Deterministic training data is the best training data
-        x = ones(1, 1) + 1im*ones(1, 1)
-
-        # Manually implement `mse()` to allow demonstration of brokenness
-        # on older Flux builds that don't have a fixed `mse()`
-        return sum(abs2.(w * x .- conj(x)))
-    end
-
-    params = Flux.Params([w])
-    opt = opt_ctor(1e-2)
-
-    # Train for 10 iterations, enforcing that loss is monotonically decreasing
-    last_loss = Inf
-    for idx in 1:10
-        grads = Flux.gradient(loss, params)
-        @test loss() < last_loss
-        last_loss = loss()
-        Flux.update!(opt, params, grads)
-    end
-  end
-end
diff --git a/test/outputsize.jl b/test/outputsize.jl
index 55cb823c5c..fe217c0fc9 100644
--- a/test/outputsize.jl
+++ b/test/outputsize.jl
@@ -248,7 +248,7 @@ end
   @test string(ld) == "LazyLayer(Dense(2 => 3, relu))"
   @test Flux.striplazy(ld) isa Dense
 
-  @test_throws Exception Flux.params(lm)
+  @test_throws Exception Flux.trainables(lm)
   @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2])
   @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld)
   
diff --git a/test/runtests.jl b/test/runtests.jl
index ef3d67f4d7..86a9885c90 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -30,8 +30,7 @@ Random.seed!(0)
       include("loading.jl")
     end
 
-    @testset "Optimise / Train" begin
-      include("optimise.jl")
+    @testset "Train" begin
       include("train.jl")
       include("tracker.jl")
     end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 004d3035ad..fe52bb30de 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -67,17 +67,9 @@ function gpu_autodiff_test(
     checkgrad || return
 
     ### GRADIENT WITH RESPECT TO INPUT ###
-
-    y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...)
-    @test check_type(y_cpu)
-    Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu))
-    gs_cpu = back_cpu(Δ_cpu)
-
-    Δ_gpu = Δ_cpu |> gpu
-    y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...)
-    @test check_type(y_gpu)
-    gs_gpu = back_gpu(Δ_gpu)
-
+    y_cpu, gs_cpu = withgradient(x -> sum(f_cpu(x...)), xs_cpu...)
+    y_gpu, gs_gpu = withgradient(x -> sum(f_gpu(x...)), xs_gpu...)
+    
     if test_equal
         @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol
         for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu)
@@ -86,21 +78,11 @@ function gpu_autodiff_test(
     end
 
     ### GRADIENT WITH RESPECT TO f ###
-
-    ps_cpu = Flux.params(f_cpu)
-    y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu)
-    gs_cpu = back_cpu(Δ_cpu)
-
-    ps_gpu = Flux.params(f_gpu)
-    y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu)
-    gs_gpu = back_gpu(Δ_gpu)
+    g_cpu = gradient(f -> sum(f(xs_cpu...)), f_cpu)[1]
+    g_gpu = gradient(f -> sum(f(xs_gpu...)), f_gpu)[1]
 
     if test_equal
-        @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol
-        @assert length(ps_gpu) == length(ps_cpu)
-        for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu)
-            check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu]; atol, rtol, allow_nothing)
-        end
+        check_grad(g_gpu, g_cpu; atol, rtol, allow_nothing)
     end
 end
 
diff --git a/test/train.jl b/test/train.jl
index 38338c19b9..8824839965 100644
--- a/test/train.jl
+++ b/test/train.jl
@@ -93,35 +93,25 @@ end
   x = rand(2)
   y1 = m(x)  # before
 
-  # Implicit gradient
-  gold = Zygote.gradient(() -> m(x), Flux.params(m))
-  @test gold isa Flux.Zygote.Grads
-  @test_throws ErrorException Flux.update!(Flux.Adam(), m, gold)  # friendly
-  Flux.update!(Flux.Adam(), Flux.params(m), gold)
-  y2 = m(x)
-  @test y2 < y1
-
   # Explicit gradient
   gs = Zygote.gradient(marg -> marg(x), m)
   @test gs isa Tuple
-  @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs) # friendly
-  @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs[1]) # friendly
   @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs)  # friendly
   @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs[1])  # friendly
   s = Flux.setup(Adam(), m)
   @info "ignore this warning, just testing an upgrade path:"
   Flux.update!(s, m, gs)  # Chain + Tuple can be unambiguously sorted out
+  y2 = m(x)
+  @test y2 < y1
+  Flux.update!(s, m, gs[1])  # finally, this is the correct thing
   y3 = m(x)
   @test y3 < y2
-  Flux.update!(s, m, gs[1])  # finally, this is the correct thing
-  y4 = m(x)
-  @test y4 < y3
 
   # Also check that if you import the new Adam, then Flux.setup does still work!
   s2 = Flux.setup(Optimisers.Adam(), m)
   Flux.update!(s2, m, gs[1])
-  y5 = m(x)
-  @test y5 < y4
+  y4 = m(x)
+  @test y4 < y3
 end
 
 for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme"))
@@ -147,28 +137,21 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme"))
     end
     diff1 = model.weight .- init_weight
 
-    # Take 2: the same, but with Flux.params. Was broken for a bit, no tests!
-    # skipping this test for Enzyme cause implicit params is unsupported
-    if name == "Zygote"
-      model.weight .= init_weight
-      model.bias .= 0
-      pen2(x::AbstractArray) = sum(abs2, x)/2
-      opt = Flux.setup(Adam(0.1), model)
-
-      @test begin
-        trainfn!(model, data, opt) do m, x, y
-          err = Flux.mse(m(x), y)
-          l2 = sum(pen2, Flux.params(m))
-          err + 0.33 * l2
-        end
-
-        diff2 = model.weight .- init_weight
-        @test diff1 ≈ diff2
-  
-        true
-      end broken = VERSION >= v"1.11"
+    # Take 2: the same, but with Optimisers.trainables. 
+    model.weight .= init_weight
+    model.bias .= 0
+    pen2(x::AbstractArray) = sum(abs2, x)/2
+    opt = Flux.setup(Adam(0.1), model)
+
+    trainfn!(model, data, opt) do m, x, y
+      err = Flux.mse(m(x), y)
+      l2 = sum(pen2, Flux.trainables(m))
+      err + 0.33 * l2
     end
 
+    diff2 = model.weight .- init_weight
+    @test diff1 ≈ diff2
+
     # Take 3: using WeightDecay instead. Need the /2 above, to match exactly.
     model.weight .= init_weight
     model.bias .= 0
diff --git a/test/utils.jl b/test/utils.jl
index b526b63286..8cdbac2daf 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -250,41 +250,32 @@ end
   end
 end
 
-@testset "Params" begin
+@testset "Trainables" begin
   m = Dense(10, 5)
-  @test size.(params(m)) == [(5, 10), (5,)]
+  @test size.(Flux.trainables(m)) == [(5, 10), (5,)]
   m = RNN(10, 5)
-  @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)]
+  @test size.(Flux.trainables(m)) == [(5, 10), (5, 5), (5,), (5, 1)]
 
   # Layer duplicated in same chain, params just once pls.
   c = Chain(m, m)
-  @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)]
+  @test size.(Flux.trainables(c)) == [(5, 10), (5, 5), (5,), (5, 1)]
 
   # Self-referential array. Just want params, no stack overflow pls.
   r = Any[nothing,m]
   r[1] = r
-  @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)]
+  @test_broken size.(Flux.trainables(r)) == [(5, 10), (5, 5), (5,), (5, 1)]
 
   # Ensure functor explores inside Transpose but not SubArray
   m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi))
-  @test size.(Flux.params(m)) == [(2,), (1, 2)]
+  @test size.(Flux.trainables(m)) == [(2,), (1, 2)]
 end
 
-@testset "params gradient" begin
+@testset "trainables gradient" begin
   m = (x=[1,2.0], y=[3.0]);
 
-  @test begin
-    # Explicit -- was broken by #2054 / then fixed / now broken again on julia v1.11
-    gnew = gradient(m -> (sum(norm, Flux.params(m))), m)[1]
-    @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159]
-    @test gnew.y ≈ [1.0]
-    true
-  end broken = VERSION >= v"1.11"
-
-  # Implicit
-  gold = gradient(() -> (sum(norm, Flux.params(m))), Flux.params(m))
-  @test gold[m.x] ≈ [0.4472135954999579, 0.8944271909999159]
-  @test gold[m.y] ≈ [1.0]
+  gnew = gradient(m -> (sum(norm, Flux.trainables(m))), m)[1]
+  @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159]
+  @test gnew.y ≈ [1.0]
 end
 
 @testset "Precision" begin
@@ -564,10 +555,10 @@ end
 @testset "Shared parameters" begin
   mat = [1 2; 3 4.0]
   simple = ((nothing, mat, (3, mat, 4)))
-  @test length(Flux.params(simple)) == 1
+  @test length(Flux.trainables(simple)) == 1
   
   oneadj = (nt = (m = mat, a = mat'))
-  @test length(Flux.params(oneadj)) == 1  # needs Functors@0.3
+  @test length(Flux.trainables(oneadj)) == 1  # needs Functors@0.3
   
   @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4]
 
@@ -653,13 +644,13 @@ end
     model = Model(d, d)
 
     # Works
-    g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model))
+    g1 = Flux.gradient(m -> sum(m(x)), model)[1]
 
     p, re = Flux.destructure(model)
     # Fails
-    g2 = Flux.gradient(p -> sum(re(p)(x)), p)
+    g2 = Flux.gradient(p -> sum(re(p)(x)), p)[1]
 
-    @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias])
+    @test g2 ≈ vcat(g1.a.weight + g1.b.weight, g1.a.bias + g1.b.bias)
   end
 
   @testset "issue 1826" begin