diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index c9b4e450ac..58d9ae5002 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -46,7 +46,7 @@ steps:
         using Pkg
         Pkg.resolve()'
       commands: |
-        printf "[MLDataDevices]\ngpu_backend = \"Metal\"\n" > LocalPreferences.toml
+        printf "[Flux]\ngpu_backend = \"Metal\"\n" > LocalPreferences.toml
       if: build.message !~ /\[skip tests\]/
       timeout_in_minutes: 60

@@ -63,7 +63,7 @@ steps:
   - label: "AMD GPU with Julia 1"
     plugins:
       - JuliaCI/julia#v1:
-          version: "1.10"
+          version: "1"
       - JuliaCI/julia-test#v1:
       - JuliaCI/julia-coverage#v1:
           dirs:
@@ -74,7 +74,7 @@ steps:
           rocm: "*"
           rocmgpu: "*"
     commands: |
-      printf "[MLDataDevices]\ngpu_backend = \"AMDGPU\"\n" > LocalPreferences.toml
+      printf "[Flux]\ngpu_backend = \"AMDGPU\"\n" > LocalPreferences.toml
     timeout_in_minutes: 60
     env:
       JULIA_AMDGPU_CORE_MUST_LOAD: "1"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index aa4c13d212..7c4bd4c1a6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,16 +9,23 @@ on:
       - master
     tags: '*'

+# needed for julia-actions/cache to delete old caches
+permissions:
+  actions: write
+  contents: read
+
 jobs:
   test:
-    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
         version:
-          # - '1.9' # Uncomment when 1.10 is out. Replace this with the minimum Julia version that your package supports.
-          - '1'
+          - '1.10' # Replace this with the minimum Julia version that your package supports.
+          - '1'    # latest stable 1.x release
+          # - 'pre' # latest stable prerelease
+          - 'nightly' # latest nightly release
         os: [ubuntu-latest]
         arch: [x64]
         include:
@@ -29,7 +36,7 @@ jobs:
             version: '1'
             arch: aarch64
     steps:
-      - uses: actions/checkout@v4.2.1
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.version }}
@@ -47,17 +54,16 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - name: "Run test without coverage report"
         uses: julia-actions/julia-runtest@v1
-        if: ${{ !contains(fromJson('["1", "1.9"]'), matrix.version) || matrix.os != 'ubuntu-latest' }}
+        if: matrix.version != '1' || matrix.os != 'ubuntu-latest'
         with:
           coverage: false
-
       - name: "Run test with coverage report"
         uses: julia-actions/julia-runtest@v1
-        if: contains(fromJson('["1", "1.9"]'), matrix.version) && matrix.os == 'ubuntu-latest'
+        if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
       - uses: julia-actions/julia-processcoverage@v1
-        if: contains(fromJson('["1", "1.9"]'), matrix.version) && matrix.os == 'ubuntu-latest'
+        if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
       - uses: codecov/codecov-action@v4
-        if: contains(fromJson('["1", "1.9"]'), matrix.version) && matrix.os == 'ubuntu-latest'
+        if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
         with:
           file: lcov.info

diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml
deleted file mode 100644
index e340370686..0000000000
--- a/.github/workflows/nightly-ci.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: Nightly CI
-
-on:
-  pull_request:
-    branches:
-      - master
-  push:
-    branches:
-      - master
-    tags: '*'
-
-jobs:
-  test:
-    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [nightly]
-        os: [ubuntu-latest]
-        arch: [x64]
-    steps:
-      - uses: actions/checkout@v4.2.1
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: ${{ matrix.version }}
-          arch: ${{ matrix.arch }}
-      - uses: julia-actions/cache@v2
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-
-            ${{ runner.os }}-
-      - uses: julia-actions/julia-buildpkg@v1
-      - name: "Run test without coverage report"
-        uses: julia-actions/julia-runtest@v1
-        with:
-          coverage: false
diff --git a/Project.toml b/Project.toml
index d805332f20..a3219459cf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -67,4 +67,4 @@ SpecialFunctions = "2.1.2"
 Statistics = "1"
 Zygote = "0.6.67"
 cuDNN = "1"
-julia = "1.9"
+julia = "1.10"
diff --git a/ext/FluxAMDGPUExt/FluxAMDGPUExt.jl b/ext/FluxAMDGPUExt/FluxAMDGPUExt.jl
index 8e8086c1a8..b44c5106e8 100644
--- a/ext/FluxAMDGPUExt/FluxAMDGPUExt.jl
+++ b/ext/FluxAMDGPUExt/FluxAMDGPUExt.jl
@@ -6,7 +6,7 @@ import Flux
 import Flux: FluxCPUAdaptor, FluxAMDGPUAdaptor, _amd, adapt_storage, fmap
 import Flux: DenseConvDims, Conv, ConvTranspose, conv, conv_reshape_bias
 import NNlib
-
+using MLDataDevices: MLDataDevices
 using AMDGPU
 using Adapt
 using Random
diff --git a/ext/FluxCUDAExt/FluxCUDAExt.jl b/ext/FluxCUDAExt/FluxCUDAExt.jl
index 9f0dae1aa9..02a18dc4bc 100644
--- a/ext/FluxCUDAExt/FluxCUDAExt.jl
+++ b/ext/FluxCUDAExt/FluxCUDAExt.jl
@@ -10,6 +10,7 @@ using ChainRulesCore
 using Random
 using Adapt
 import Adapt: adapt_storage
+using MLDataDevices: MLDataDevices

 const USE_CUDA = Ref{Union{Nothing, Bool}}(nothing)

diff --git a/ext/FluxCUDAExt/functor.jl b/ext/FluxCUDAExt/functor.jl
index 205f366b24..08de4994b2 100644
--- a/ext/FluxCUDAExt/functor.jl
+++ b/ext/FluxCUDAExt/functor.jl
@@ -57,5 +57,5 @@ function _cuda(id::Union{Nothing, Int}, x)
 end

 function Flux._get_device(::Val{:CUDA}, id::Int)
-    return MLDataUtils.gpu_device(id+1, force=true)
+    return MLDataDevices.gpu_device(id+1, force=true)
 end
diff --git a/ext/FluxEnzymeExt/FluxEnzymeExt.jl b/ext/FluxEnzymeExt/FluxEnzymeExt.jl
index e6ce51297f..5ac7c2e577 100644
--- a/ext/FluxEnzymeExt/FluxEnzymeExt.jl
+++ b/ext/FluxEnzymeExt/FluxEnzymeExt.jl
@@ -16,10 +16,6 @@ _applyloss(loss, model, d...) = loss(model, d...)

 EnzymeRules.inactive(::typeof(Flux.Losses._check_sizes), args...) = true

-using Flux: _old_to_new  # from src/deprecations.jl
-train!(loss, model::Duplicated, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
-    train!(loss, model, data, _old_to_new(opt); cb)
-
 function train!(loss, model::Duplicated, data, rule::Optimisers.AbstractRule; cb = nothing)
     train!(loss, model, data, _rule_to_state(model, rule); cb)
 end
diff --git a/ext/FluxMetalExt/FluxMetalExt.jl b/ext/FluxMetalExt/FluxMetalExt.jl
index 27316c3b16..fe5fb2e7e5 100644
--- a/ext/FluxMetalExt/FluxMetalExt.jl
+++ b/ext/FluxMetalExt/FluxMetalExt.jl
@@ -4,7 +4,7 @@ import Flux
 import Flux: FluxCPUAdaptor, FluxMetalAdaptor, _metal, _isleaf, adapt_storage, fmap
 import NNlib
 using ChainRulesCore
-
+using MLDataDevices: MLDataDevices
 using Metal
 using Adapt
 using Random
diff --git a/ext/FluxMetalExt/functor.jl b/ext/FluxMetalExt/functor.jl
index ad59cc7117..443c824e7e 100644
--- a/ext/FluxMetalExt/functor.jl
+++ b/ext/FluxMetalExt/functor.jl
@@ -35,6 +35,6 @@ end

 function Flux._get_device(::Val{:Metal}, id::Int)
     @assert id == 0 "Metal backend only supports one device at the moment"
-    return MLDataDevices.gpu_device()
+    return MLDataDevices.gpu_device(force=true)
 end

diff --git a/src/Flux.jl b/src/Flux.jl
index 2763e26499..47142dbdda 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -18,13 +18,14 @@ using Zygote: Params, @adjoint, gradient, pullback
 using Zygote.ForwardDiff: value
 export gradient

-@reexport using MLDataDevices: MLDataDevices, gpu_backend!, supported_gpu_backends, reset_gpu_device!,
+@reexport using MLDataDevices: MLDataDevices, supported_gpu_backends, reset_gpu_device!,
                     default_device_rng,
                     gpu_device, cpu_device, xla_device,
                     CPUDevice,
                     CUDADevice, AMDGPUDevice, MetalDevice, oneAPIDevice,
                     XLADevice,
                     # get_device, # we define get_device here for retrocompatibility
+                    # gpu_backend!, # have to define here due to https://github.com/JuliaPackaging/Preferences.jl/issues/39
                     get_device_type,
                     DeviceIterator

@@ -104,7 +105,7 @@ include("losses/Losses.jl")
 using .Losses

 include("devices.jl")
-export get_device
+export get_device, gpu_backend!

 # Distributed Training
 include("distributed/backend.jl")
diff --git a/src/deprecations.jl b/src/deprecations.jl
index 8dadadfd6d..6148894dbe 100644
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@@ -1,47 +1,6 @@
 # v0.13 deprecations

-function Broadcast.broadcasted(f::Recur, args...)
-  # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12
-  Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order.
-      Re-writing this as a comprehension would be better.""", :broadcasted)
-  map(f, args...)  # map isn't really safe either, but
-end
-
-@deprecate frequencies(xs) group_counts(xs)
-
-struct Zeros
-  function Zeros()
-    Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", :Zeros)
-    false
-  end
-end
-Zeros(args...) = Zeros()  # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros())
-
-function Optimise.update!(x::AbstractArray, x̄)
-  Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!)
-  x .-= x̄
-end
-
-function Diagonal(size::Integer...; kw...)
-  Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal)
-  Scale(size...; kw...)
-end
-function Diagonal(size::Tuple; kw...)
-  Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal)
-  Scale(size...; kw...)
-end
-
-# Deprecate this eventually once saving models w/o structure is no more
-function loadparams!(m, xs)
-  Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", :loadparams!)
-  for (p, x) in zip(params(m), xs)
-    size(p) == size(x) ||
-      error("Expected param size $(size(p)), got $(size(x))")
-    copyto!(p, x)
-  end
-end
-
 # Channel notation: Changed to match Conv, but very softly deprecated!
 # Perhaps change to @deprecate for v0.15, but there is no plan to remove these.
 Dense(in::Integer, out::Integer, σ = identity; kw...) =
@@ -56,32 +15,6 @@ LSTMCell(in::Integer, out::Integer; kw...) = LSTMCell(in => out; kw...)
 GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...)
 GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...)

-# Optimisers with old naming convention
-Base.@deprecate_binding ADAM Adam
-Base.@deprecate_binding NADAM NAdam
-Base.@deprecate_binding ADAMW AdamW
-Base.@deprecate_binding RADAM RAdam
-Base.@deprecate_binding OADAM OAdam
-Base.@deprecate_binding ADAGrad AdaGrad
-Base.@deprecate_binding ADADelta AdaDelta
-
-# Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working
-Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader"
-
-@deprecate paramtype(T,m) _paramtype(T,m) false  # internal method, renamed to make this clear
-
-@deprecate rng_from_array() Random.default_rng()
-
-function istraining()
-  Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining)
-  false
-end
-ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),)
-
-function _isactive(m)
-  Base.depwarn("_isactive(m) is deprecated, use _isactive(m,x)", :_isactive, force=true)
-  _isactive(m, 1:0)
-end

 #=
   # Valid method in Optimise, old implicit style, is:
@@ -110,7 +43,6 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error

 train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
     train!(loss, model, data, _old_to_new(opt); cb)
-
 # Next, to use the new `setup` with the still-exported old-style `Adam` etc:
 import .Train: setup
 setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
@@ -179,33 +111,6 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple,
     """)
 end

-"""
-    trainmode!(m, active)
-
-!!! warning
-    This two-argument method is deprecated.
-
-Possible values of `active` are:
-- `true` for training, or
-- `false` for testing, same as [`testmode!`](@ref)`(m)`
-- `:auto` or `nothing` for Flux to detect training automatically.
-"""
-function trainmode!(m, active::Bool)
-  Base.depwarn("trainmode!(m, active::Bool) is deprecated", :trainmode)
-  testmode!(m, !active)
-end
-
-# Greek-letter keywords deprecated in Flux 0.13
-# Arguments (old => new, :function, "β" => "beta")
-function _greek_ascii_depwarn(βbeta::Pair, func = :loss, names = "" => "")
-  Base.depwarn(LazyString("function ", func, " no longer accepts greek-letter keyword ", names.first, """
-      please use ascii """, names.second, " instead"), func)
-  βbeta.first
-end
-_greek_ascii_depwarn(βbeta::Pair{Nothing}, _...) = βbeta.second
-
-ChainRulesCore.@non_differentiable _greek_ascii_depwarn(::Any...)
-
 # v0.14 deprecations

 @deprecate default_rng_value() Random.default_rng()
diff --git a/src/functor.jl b/src/functor.jl
index c76646729a..16efa99a20 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -189,11 +189,19 @@ _isleaf(::Union{Transpose, Adjoint, PermutedDimsArray}) = false

 _isleaf(::AbstractRNG) = true

+# Remove when
+# https://github.com/JuliaPackaging/Preferences.jl/issues/39
+# is resolved
+function gpu_backend!(backend::String)
+    @set_preferences!("gpu_backend" => backend)
+    MLDataDevices.gpu_backend!(backend)
+end
+
 # the order below is important
 const GPU_BACKENDS = ("CUDA", "AMDGPU", "Metal", "CPU")
-const GPU_BACKEND_ORDER = Dict(collect(zip(GPU_BACKENDS, 1:length(GPU_BACKENDS))))
-const GPU_BACKEND = load_preference(MLDataDevices, "gpu_backend", "CUDA")
-
+# const GPU_BACKEND = load_preference(MLDataDevices, "gpu_backend", "CUDA")
+# https://github.com/JuliaPackaging/Preferences.jl/issues/39
+const GPU_BACKEND = @load_preference("gpu_backend", "CUDA")

 """
     gpu(m)
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index c6663cca88..99092f9756 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -191,10 +191,9 @@ struct LayerNorm{F,D,T,N}
   affine::Bool
 end

-function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, eps::Real=1f-5, ϵ=nothing)
-  ε = _greek_ascii_depwarn(ϵ => eps, :LayerNorm, "ϵ" => "eps")
+function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, eps::Real=1f-5)
   diag = affine ? Scale(size..., λ) : λ!=identity ? Base.Fix1(broadcast, λ) : identity
-  return LayerNorm(λ, diag, ε, size, affine)
+  return LayerNorm(λ, diag, eps, size, affine)
 end
 LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...)
 LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]; kw...)
@@ -328,9 +327,8 @@ end

 function BatchNorm(chs::Int, λ=identity;
           initβ=zeros32, initγ=ones32,
           affine::Bool=true, track_stats::Bool=true, active::Union{Bool,Nothing}=nothing,
-          eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
+          eps::Real=1f-5, momentum::Real=0.1f0)

-  ε = _greek_ascii_depwarn(ϵ => eps, :BatchNorm, "ϵ" => "eps")
   β = affine ? initβ(chs) : nothing
   γ = affine ? initγ(chs) : nothing
@@ -338,7 +336,7 @@ function BatchNorm(chs::Int, λ=identity;
   σ² = track_stats ? ones32(chs) : nothing

   return BatchNorm(λ, β, γ,
-          μ, σ², ε, momentum,
+          μ, σ², eps, momentum,
           affine, track_stats,
           active, chs)
 end
@@ -421,9 +419,7 @@ end
 function InstanceNorm(chs::Int, λ=identity;
                     initβ=zeros32, initγ=ones32,
                     affine::Bool=false, track_stats::Bool=false, active::Union{Bool,Nothing}=nothing,
-                    eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
-
-  ε = _greek_ascii_depwarn(ϵ => eps, :InstanceNorm, "ϵ" => "eps")
+                    eps::Real=1f-5, momentum::Real=0.1f0)

   β = affine ? initβ(chs) : nothing
   γ = affine ? initγ(chs) : nothing
@@ -431,7 +427,7 @@ function InstanceNorm(chs::Int, λ=identity;
   σ² = track_stats ? ones32(chs) : nothing

   return InstanceNorm(λ, β, γ,
-          μ, σ², ε, momentum,
+          μ, σ², eps, momentum,
           affine, track_stats,
           active, chs)
 end
@@ -520,9 +516,7 @@ end
 function GroupNorm(chs::Int, G::Int, λ=identity;
                   initβ=zeros32, initγ=ones32,
                   affine::Bool=true, active::Union{Bool,Nothing}=nothing,
-                  eps::Real=1f-5, momentum::Real=0.1f0, ϵ=nothing)
-
-  ε = _greek_ascii_depwarn(ϵ => eps, :GroupNorm, "ϵ" => "eps")
+                  eps::Real=1f-5, momentum::Real=0.1f0)

   chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")

@@ -535,7 +529,7 @@ function GroupNorm(chs::Int, G::Int, λ=identity;
   return GroupNorm(G, λ,
                    β, γ,
                    μ, σ²,
-                   ε, momentum,
+                   eps, momentum,
                    affine, track_stats,
                    active, chs)
 end
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 2565ea2e84..4fb739e0c3 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -34,11 +34,10 @@ julia> isapprox(std(y; dims=1, corrected=false), ones(1, 10), atol=1e-5)
 true
 ```
 """
-@inline function normalise(x::AbstractArray; dims=ndims(x), eps=ofeltype(x, 1e-5), ϵ=nothing)
-  ε = _greek_ascii_depwarn(ϵ => eps, :InstanceNorm, "ϵ" => "eps")
+@inline function normalise(x::AbstractArray; dims=ndims(x), eps=ofeltype(x, 1e-5))
   μ = mean(x, dims=dims)
   σ = std(x, dims=dims, mean=μ, corrected=false)
-  return @. (x - μ) / (σ + ε)
+  return @. (x - μ) / (σ + eps)
 end

 """
diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl
index 34315baadd..5b4a1d697b 100644
--- a/src/losses/Losses.jl
+++ b/src/losses/Losses.jl
@@ -4,7 +4,7 @@ using Statistics
 using Zygote
 using Zygote: @adjoint
 using ChainRulesCore
-using ..Flux: ofeltype, epseltype, _greek_ascii_depwarn
+using ..Flux: ofeltype, epseltype
 using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss
 import Base.Broadcast: broadcasted

diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index 7897cc5754..5f9778e93c 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -66,10 +66,9 @@ julia> Flux.msle(Float32[0.9, 1.8, 2.7], 1:3)
 0.011100831f0
 ```
 """
-function msle(ŷ, y; agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :msle, "ϵ" => "eps")
+function msle(ŷ, y; agg = mean, eps::Real = epseltype(ŷ))
     _check_sizes(ŷ, y)
-    agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^2 )
+    agg((log.((ŷ .+ eps) ./ (y .+ eps))) .^2 )
 end

 function _huber_metric(abs_error, δ)
@@ -101,9 +100,8 @@ julia> Flux.huber_loss(ŷ, 1:3, delta=0.05)  # changes behaviour as |ŷ - y| > δ
 0.003750000000000005
 ```
 """
-function huber_loss(ŷ, y; agg = mean, delta::Real = 1, δ = nothing)
-    delta_tmp = _greek_ascii_depwarn(δ => delta, :huber_loss, "δ" => "delta")
-    δ = ofeltype(ŷ, delta_tmp)
+function huber_loss(ŷ, y; agg = mean, delta::Real = 1)
+    δ = ofeltype(ŷ, delta)
     _check_sizes(ŷ, y)
     abs_error = abs.(ŷ .- y)

@@ -230,10 +228,9 @@ julia> Flux.crossentropy(y_model, y_smooth)
 1.5776052f0
 ```
 """
-function crossentropy(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :crossentropy, "ϵ" => "eps")
+function crossentropy(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ))
     _check_sizes(ŷ, y)
-    agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims))
+    agg(.-sum(xlogy.(y, ŷ .+ eps); dims = dims))
 end

 """
@@ -319,10 +316,9 @@ julia> Flux.crossentropy(y_prob, y_hot)
 0.43989f0
 ```
 """
-function binarycrossentropy(ŷ, y; agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :binarycrossentropy, "ϵ" => "eps")
+function binarycrossentropy(ŷ, y; agg = mean, eps::Real = epseltype(ŷ))
     _check_sizes(ŷ, y)
-    agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)))
+    agg(@.(-xlogy(y, ŷ + eps) - xlogy(1 - y, 1 - ŷ + eps)))
 end

 """
@@ -390,11 +386,10 @@ julia> Flux.kldivergence(p1, p2; eps = 0)  # about 17.3 with the regulator
 Inf
 ```
 """
-function kldivergence(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :kldivergence, "ϵ" => "eps")
+function kldivergence(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ))
     _check_sizes(ŷ, y)
     entropy = agg(sum(xlogx.(y); dims = dims))
-    cross_entropy = crossentropy(ŷ, y; dims, agg, eps=ϵ)
+    cross_entropy = crossentropy(ŷ, y; dims, agg, eps)
     return entropy + cross_entropy
 end

@@ -531,13 +526,12 @@ Calculated as:

 """
 function tversky_loss(ŷ, y; beta::Real = 0.7, β = nothing)
-  beta_temp = _greek_ascii_depwarn(β => beta, :tversky_loss, "β" => "beta")
-  β = ofeltype(ŷ, beta_temp)
-  _check_sizes(ŷ, y)
-  #TODO add agg
-  num = sum(y .* ŷ) + 1
-  den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
-  1 - num / den
+    β = ofeltype(ŷ, beta)
+    _check_sizes(ŷ, y)
+    #TODO add agg
+    num = sum(y .* ŷ) + 1
+    den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
+    1 - num / den
 end

 """
@@ -568,12 +562,10 @@ julia> Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385
 true
 ```
 """
-function binary_focal_loss(ŷ, y; agg=mean, gamma=2, eps::Real=epseltype(ŷ), ϵ = nothing, γ = nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :binary_focal_loss, "ϵ" => "eps")
-    gamma_temp = _greek_ascii_depwarn(γ => gamma, :binary_focal_loss, "γ" => "gamma")
-    γ = gamma_temp isa Integer ? gamma_temp : ofeltype(ŷ, gamma_temp)
+function binary_focal_loss(ŷ, y; agg=mean, gamma=2, eps::Real=epseltype(ŷ))
+    γ = gamma isa Integer ? gamma : ofeltype(ŷ, gamma)
     _check_sizes(ŷ, y)
-    ŷϵ = ŷ .+ ϵ
+    ŷϵ = ŷ .+ eps
     p_t = y .* ŷϵ + (1 .- y) .* (1 .- ŷϵ)
     ce = .-log.(p_t)
     weight = (1 .- p_t) .^ γ
@@ -616,11 +608,9 @@ See also: [`Losses.binary_focal_loss`](@ref) for binary (not one-hot) labels

 """
 function focal_loss(ŷ, y; dims=1, agg=mean, gamma=2, eps::Real=epseltype(ŷ), ϵ=nothing, γ=nothing)
-    ϵ = _greek_ascii_depwarn(ϵ => eps, :focal_loss, "ϵ" => "eps")
-    gamma_temp = _greek_ascii_depwarn(γ => gamma, :focal_loss, "γ" => "gamma")
-    γ = gamma_temp isa Integer ? gamma_temp : ofeltype(ŷ, gamma_temp)
+    γ = gamma isa Integer ? gamma : ofeltype(ŷ, gamma)
     _check_sizes(ŷ, y)
-    ŷϵ = ŷ .+ ϵ
+    ŷϵ = ŷ .+ eps
     agg(sum(@. -y * (1 - ŷϵ)^γ * log(ŷϵ); dims))
 end

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 6c1b78919f..be7c5dec92 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -446,7 +446,7 @@ end
   @test Zygote.hessian_reverse(sum∘m1, [1.0,2.0,3.0]) == zeros(3, 3)

   m2 = Chain(BatchNorm(3), sum)
-  @test_broken Zygote.hessian_reverse(m2, Float32[1 2; 3 4; 5 6]) == zeros(Float32, 6, 6)
+  @test Zygote.hessian_reverse(m2, Float32[1 2; 3 4; 5 6]) == zeros(Float32, 6, 6) broken = VERSION >= v"1.11"
 end

 @testset "ForwardDiff" begin
diff --git a/test/layers/show.jl b/test/layers/show.jl
index 6910e5fa08..95ddca0571 100644
--- a/test/layers/show.jl
+++ b/test/layers/show.jl
@@ -76,7 +76,7 @@ end

 # Bug when no children, https://github.com/FluxML/Flux.jl/issues/2208
 struct NoFields end
-Flux.@functor NoFields
+Flux.@layer NoFields

 @testset "show with no fields" begin
   str = repr("text/plain", Chain(Dense(1=>1), Dense(1=>1), NoFields()))
diff --git a/test/losses.jl b/test/losses.jl
index a5ce1139df..1285f30148 100644
--- a/test/losses.jl
+++ b/test/losses.jl
@@ -76,7 +76,7 @@ y_dis[1,:], y_dis[2,:] = y_dis[2,:], y_dis[1,:]
   @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed
   @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*log.(ylp))
   @test crossentropy(ylp, yl) ≈ -sum(yl.*log.(ylp))
-  @test iszero(crossentropy(y_same, ya, ϵ=0))  # ε is deprecated
+  @test iszero(crossentropy(y_same, ya, eps=0))
   @test iszero(crossentropy(ya, ya, eps=0))
   @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed)
   @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed)
@@ -92,15 +92,15 @@ logŷ, y = randn(3), rand(3)
 yls = y.*(1-2sf).+sf

 @testset "binarycrossentropy" begin
-  @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); ϵ=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ))
+  @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); eps=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ))
   @test binarycrossentropy(σ.(logŷ), y; eps=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ)))
   @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))))
   @test binarycrossentropy([0.1,0.2,0.9], 1) ≈ -mean(log, [0.1,0.2,0.9])  # constant label
 end

 @testset "logitbinarycrossentropy" begin
-  @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ=0)
-  @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ=0)
+  @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); eps=0)
+  @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; eps=0)
 end

 y = onehotbatch([1], 0:1)
@@ -152,7 +152,7 @@ end

 @testset "tversky_loss" begin
   @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
-  @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ -0.09490740740740744
+  @test Flux.tversky_loss(ŷ, y, beta=0.8) ≈ -0.09490740740740744
   @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
 end

@@ -180,7 +180,7 @@
        0.4 0.7]
   @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385
   @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222
-  @test Flux.binary_focal_loss(ŷ, y; γ=0.0) ≈ Flux.binarycrossentropy(ŷ, y)
+  @test Flux.binary_focal_loss(ŷ, y; gamma=0.0) ≈ Flux.binarycrossentropy(ŷ, y)
 end

 @testset "focal_loss" begin
diff --git a/test/outputsize.jl b/test/outputsize.jl
index 0eab572eb7..55cb823c5c 100644
--- a/test/outputsize.jl
+++ b/test/outputsize.jl
@@ -10,7 +10,6 @@
   @test outputsize(m, (10,); padbatch=true) == (2, 1)
   @test outputsize(m, (10, 30)) == (2, 30)

-  @info "Don't mind the following error, it's for testing purpose."
   m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2))
   @test_throws DimensionMismatch outputsize(m, (10,))

diff --git a/test/runtests.jl b/test/runtests.jl
index c48b281c92..ef3d67f4d7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -65,14 +65,6 @@ Random.seed!(0)
     @testset "functors" begin
       include("functors.jl")
     end
-
-    @static if VERSION == v"1.9"
-      using Documenter
-      @testset "Docs" begin
-        DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true)
-        doctest(Flux)
-      end
-    end
   else
     @info "Skipping CPU tests."
   end
@@ -143,10 +135,10 @@ Random.seed!(0)
   end

   if get(ENV, "FLUX_TEST_ENZYME", "true") == "true"
-    @testset "Enzyme" begin
-      import Enzyme
-      include("ext_enzyme/enzyme.jl")
-    end
+      @testset "Enzyme" begin
+        import Enzyme
+        include("ext_enzyme/enzyme.jl")
+      end
   else
     @info "Skipping Enzyme tests, set FLUX_TEST_ENZYME=true to run them."
   end
diff --git a/test/train.jl b/test/train.jl
index 96bd0d22a4..38338c19b9 100644
--- a/test/train.jl
+++ b/test/train.jl
@@ -155,7 +155,7 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme"))
     pen2(x::AbstractArray) = sum(abs2, x)/2
     opt = Flux.setup(Adam(0.1), model)

-    @test_broken begin
+    @test begin
       trainfn!(model, data, opt) do m, x, y
         err = Flux.mse(m(x), y)
         l2 = sum(pen2, Flux.params(m))
@@ -166,7 +166,7 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme"))

       @test diff1 ≈ diff2
       true
-    end
+    end broken = VERSION >= v"1.11"
   end

   # Take 3: using WeightDecay instead. Need the /2 above, to match exactly.
diff --git a/test/utils.jl b/test/utils.jl
index e05d5f4562..b526b63286 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -273,14 +273,14 @@ end
 @testset "params gradient" begin
   m = (x=[1,2.0], y=[3.0]);

-  @test_broken begin
-    # Explicit -- was broken by #2054 / then fixed / now broken again on julia v0.11
+  @test begin
+    # Explicit -- was broken by #2054 / then fixed / now broken again on julia v1.11
     gnew = gradient(m -> (sum(norm, Flux.params(m))), m)[1]
     @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159]
     @test gnew.y ≈ [1.0]
     true
-  end
-
+  end broken = VERSION >= v"1.11"
+
   # Implicit
   gold = gradient(() -> (sum(norm, Flux.params(m))), Flux.params(m))
   @test gold[m.x] ≈ [0.4472135954999579, 0.8944271909999159]
@@ -689,6 +689,6 @@ end

 # make sure rng_from_array is non_differentiable
 @testset "rng_from_array" begin
-  m(x) = (rand(rng_from_array(x)) * x)[1]
+  m(x) = (rand(Flux.rng_from_array(x)) * x)[1]
   gradient(m, ones(2))
 end
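
Illustrative usage sketch of the changed API (not part of the patch; the array values below are made up, and the backend call only records a preference):

```julia
using Flux

ŷ = softmax(rand(Float32, 3, 4); dims = 1)   # made-up predictions
y = Flux.onehotbatch([1, 2, 3, 1], 1:3)      # made-up targets

# Greek-letter keywords are removed; use the ASCII names instead:
Flux.crossentropy(ŷ, y; eps = 1f-7)          # was ϵ = ...
Flux.huber_loss(ŷ, y; delta = 0.5)           # was δ = ...
Flux.tversky_loss(ŷ, y; beta = 0.8)          # was β = ...

# The GPU backend preference is now written under the [Flux] section of
# LocalPreferences.toml via Flux's own wrapper (rather than MLDataDevices'):
Flux.gpu_backend!("CUDA")
```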