deprecation of params and Optimise (continued) (#2526)
CarloLucibello authored Nov 17, 2024
1 parent 6fffc31 commit 1fedc0d
Showing 27 changed files with 374 additions and 444 deletions.
3 changes: 3 additions & 0 deletions .buildkite/pipeline.yml
@@ -14,6 +14,7 @@ steps:
env:
FLUX_TEST_CUDA: "true"
FLUX_TEST_CPU: "false"
FLUX_TEST_ENZYME: "false"
timeout_in_minutes: 60

# - label: "GPU nightly"
@@ -53,6 +54,7 @@ steps:
env:
FLUX_TEST_METAL: "true"
FLUX_TEST_CPU: "false"
FLUX_TEST_ENZYME: "false"
matrix:
setup:
julia:
@@ -82,6 +84,7 @@ steps:
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
FLUX_TEST_AMDGPU: "true"
FLUX_TEST_CPU: "false"
FLUX_TEST_ENZYME: "false"
JULIA_NUM_THREADS: 4
env:
SECRET_CODECOV_TOKEN: "fAV/xwuaV0l5oaIYSAXRQIor8h7yHdlrpLUZFwNVnchn7rDk9UZoz0oORG9vlKLc1GK2HhaPRAy+fTkJ3GM/8Y0phHh3ANK8f5UsGm2DUTNsnf6u9izgnwnoRTcsWu+vSO0fyYrxBvBCoJwljL+yZbDFz3oE16DP7HPIzxfQagm+o/kMEszVuoUXhuLXXH0LxT6pXl214qjqs04HfMRmKIIiup48NB6fBLdhGlQz64MdMNHBfgDa/fafB7eNvn0X6pEOxysoy6bDQLUhKelOXgcDx1UsTo34Yiqr+QeJPAeKcO//PWurwQhPoUoHfLad2da9DN4uQk4YQLqAlcIuAA==;U2FsdGVkX1+mRXF2c9soCXT7DYymY3msM+vrpaifiTp8xA+gMpbQ0G63WY3tJ+6V/fJcVnxYoKZVXbjcg8fl4Q=="
1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@ LocalPreferences.toml
.DS_Store
docs/mymodel.bson
prova.jl
benchmarks/
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,6 +1,6 @@
name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.15-DEV"
version = "0.15.0-DEV"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
1 change: 0 additions & 1 deletion ext/FluxEnzymeExt/FluxEnzymeExt.jl
@@ -2,7 +2,6 @@ module FluxEnzymeExt

using Flux
import Flux.Train: train!, _rule_to_state
import Flux.Optimise
import Optimisers
import Enzyme
using Enzyme: EnzymeRules, Active, Const, Duplicated, autodiff, ReverseWithPrimal
99 changes: 40 additions & 59 deletions src/Flux.jl
@@ -9,14 +9,14 @@ using MacroTools: @forward

@reexport using NNlib
using MLUtils
const stack = MLUtils.stack # now exported by Base
import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions
using Optimisers: freeze!, thaw!, adjust!, trainables

using Optimisers: Optimisers, destructure, freeze!, thaw!, adjust!, trainables, update!
import Optimisers: trainable
@reexport using Optimisers

using Random: default_rng
using Zygote, ChainRulesCore
using Zygote: Params, @adjoint, gradient, pullback
using Zygote: @adjoint, gradient, pullback
using Zygote.ForwardDiff: value
export gradient

@@ -31,10 +31,6 @@ export gradient
get_device_type,
DeviceIterator


# Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.)
Optimisers.base(dx::Zygote.Grads) = error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`")

export Chain, Dense, Embedding, EmbeddingBag,
Maxout, SkipConnection, Parallel, PairwiseFusion,
RNNCell, LSTMCell, GRUCell, GRUv3Cell,
@@ -55,12 +51,43 @@ export Chain, Dense, Embedding, EmbeddingBag,
Bilinear, Scale,
# utils
outputsize, state, create_bias, @layer,
# from OneHotArrays.jl
onehot, onehotbatch, onecold,
# from Train
setup, train!,
# from Optimisers.jl
destructure, freeze!, thaw!, adjust!, trainables, update!, trainable,
# init
glorot_uniform,
glorot_normal,
kaiming_uniform,
kaiming_normal,
truncated_normal,
lecun_normal,
orthogonal,
sparse_init,
identity_init,
# Losses
binary_focal_loss,
binarycrossentropy,
crossentropy,
dice_coeff_loss,
focal_loss,
hinge_loss,
huber_loss,
kldivergence,
label_smoothing,
logitbinarycrossentropy,
logitcrossentropy,
mae,
mse,
msle,
poisson_loss,
siamese_contrastive_loss,
squared_hinge_loss,
tversky_loss,
))

include("optimise/Optimise.jl")
using .Optimise: Optimise
export ClipValue # this is const defined in deprecations, for ClipGrad

include("train.jl")
using .Train
using .Train: setup
@@ -69,18 +96,6 @@ using Adapt, Functors, OneHotArrays
include("utils.jl")
include("functor.jl")

@compat(public, (
# from OneHotArrays.jl
onehot, onehotbatch, onecold,
# from Functors.jl
functor, @functor, KeyPath, haskeypath, getkeypath,
# from Optimise/Train/Optimisers.jl
setup, update!, destructure, freeze!, adjust!, params, trainable, trainables
))

# Pirate error to catch a common mistake.
Functors.functor(::Type{<:MLUtils.DataLoader}, x) = error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.")

include("layers/show.jl")
include("layers/macro.jl")

@@ -97,8 +112,6 @@ include("loading.jl")
include("outputsize.jl")
export @autosize

include("deprecations.jl")

include("losses/Losses.jl")
using .Losses

@@ -110,38 +123,6 @@ include("distributed/backend.jl")
include("distributed/public_api.jl")
export MPIBackend, NCCLBackend, DistributedUtils

@compat(public, (
# init
glorot_uniform,
glorot_normal,
kaiming_uniform,
kaiming_normal,
truncated_normal,
lecun_normal,
orthogonal,
sparse_init,
identity_init,

# Losses
binary_focal_loss,
binarycrossentropy,
crossentropy,
dice_coeff_loss,
focal_loss,
hinge_loss,
huber_loss,
kldivergence,
label_smoothing,
logitbinarycrossentropy,
logitcrossentropy,
mae,
mse,
msle,
poisson_loss,
siamese_contrastive_loss,
squared_hinge_loss,
tversky_loss,
))

include("deprecations.jl")

end # module
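
For reference, the explicit training style that this reorganisation of src/Flux.jl standardises on (Optimisers.jl state via `setup`, explicit `gradient`, then `update!`) looks roughly like the sketch below. This loop is not part of the diff; `model`, `data`, and `loss` are hypothetical placeholders.

    opt_state = Flux.setup(Adam(), model)                 # build optimiser state once, outside the loop
    for (x, y) in data
        grads = Flux.gradient(m -> loss(m, x, y), model)  # explicit gradient with respect to the model itself
        Flux.update!(opt_state, model, grads[1])          # mutate the model and optimiser state in place
    end
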
171 changes: 50 additions & 121 deletions src/deprecations.jl
@@ -16,125 +16,8 @@ GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...)
GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...)


#=
# Valid method in Optimise, old implicit style, is:
train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ())
# Valid methods in Train, new explict style, are:
train!(loss, model, data, opt) # preferred
train!(loss, model, data, opt::Optimisers.AbstractRule) # if you forget setup
# Provide friendly errors for what happens if you mix these up:
=#
import .Optimise: train!

train!(loss, ps::Params, data, opt; cb=nothing) = error(
"""can't mix implict Params with explict state!
To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module.
But better to use the new explicit style, in which `m` itself is the 2nd argument.
""")

train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error(
"""can't mix implict Params with explict rule from Optimisers.jl
To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module.
But better to use the new explicit style, in which `m` itself is the 2nd argument.
""")

train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
train!(loss, model, data, __old_to_new(opt); cb)

# Next, to use the new `setup` with the still-exported old-style `Adam` etc:
import .Train: setup
setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
# ... and allow accidental use of `Optimisers.setup` to do the same:
Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)


function __old_to_new(rule)
Base.depwarn("""Optimisers from Flux.Optimise module are deprecated.
Use optimisers from Optimisers.jl instead.""", :__old_to_new)
return _old_to_new(rule)
end

for T in [:Descent, :Adam, :Momentum, :Nesterov,
:AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
# :InvDecay, :ExpDecay,
:SignDecay,
]
@eval function _old_to_new(rule::Optimise.$T)
args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T))
Optimisers.$T(args...)
end
end
_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
const Optimiser = Optimisers.OptimiserChain
_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
# const ClipGrad = Optimise.ClipValue
const ClipValue = Optimisers.ClipGrad
_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred

_old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule")

# This allows you to mix and match, like Flux.setup(OptimiserChain(Optimisers.SignDecay(), Flux.Descent()), [1,2,3.])
Optimisers.OptimiserChain(rules::Union{Optimisers.AbstractRule, Optimise.AbstractOptimiser}...) =
Optimisers.OptimiserChain(map(_old_to_new, rules))
_old_to_new(rule::Optimisers.AbstractRule) = rule

# Since `update!` should be called in a loop, it makes less sense to call `setup` for you if you forgot.
# But let's make sure that such uses give a helpful error:
import .Optimise: update!

function update!(opt::Optimise.AbstractOptimiser, model, grad)
# This error method requires narrowing the main worker method of Flux.Optimise
# to accept only arrays. Remove if this causes problems!
# update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄)
error("""Invalid input to `update!`.
* For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)`
* For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end

# TODO this friendly error should go in Optimisers.jl.
# remove after https://github.com/FluxML/Optimisers.jl/pull/181
function update!(opt::Optimisers.AbstractRule, model, grad)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end
function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end

# An easy error to make is to pass result of explicit gradient(...), not gradient(...)[1]
# Can't catch every case, but can catch many simple Flux models:

function update!(opt, model::Chain, grads::Tuple)
# Zygote will make a NamedTuple{(:layers,)} for the gradient of Chain, Diffractor a Tangent
@warn """explicit `update!(opt, model, grad)` wants the gradient for the model alone,
not the whole tuple from `gradient(m -> loss(m, x, y), model)`. You probably want `grads[1]`."""
update!(opt, model, grads[1])
end

function update!(opt::Optimise.AbstractOptimiser, model::Chain, grads::Tuple) # ambiguity
update!(opt, model, grads[1]) # calls error case "Invalid input" just above
end

# One more easy error to catch is using explicit gradient with `params(m)`:
#### v0.14 deprecations ###########################

function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, NamedTuple})
error("""can't mix implicit Params with explicit gradients!
* For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)` with implicit gradient.
* For the explicit style, `update(state, model, grad)` needs the model itself, and `state = Flux.setup(opt, model)`.
""")
end


# v0.14 deprecations
@deprecate default_rng_value() Random.default_rng()


@@ -179,14 +62,14 @@ const FluxCUDAAdaptor = CUDADevice
const FluxAMDGPUAdaptor = AMDGPUDevice
const FluxMetalAdaptor = MetalDevice

# v0.15 deprecations
######## v0.15 deprecations #########################

# Enable these when 0.15 is released, and delete const ClipGrad = Optimise.ClipValue etc:
# Enable these when 0.16 is released, and delete const ClipGrad = Optimise.ClipValue etc:
# Base.@deprecate_binding Optimiser OptimiserChain
# Base.@deprecate_binding ClipValue ClipGrad

# train!(loss::Function, ps::Zygote.Params, data, opt) = throw(ArgumentError(
# """On Flux 0.15, `train!` no longer accepts implicit `Zygote.Params`.
# """On Flux 0.16, `train!` no longer accepts implicit `Zygote.Params`.
# Instead of `train!(loss_xy, Flux.params(model), data, Adam())`
# it now needs `opt = Flux.setup(Adam(), model); train!(loss_mxy, model, data, opt)`
# where `loss_mxy` accepts the model as its first argument.
@@ -197,3 +80,49 @@ function reset!(x)
Base.depwarn("reset!(m) is deprecated. You can remove this call as it is no more needed.", :reset!)
return x
end

function params!(p::Zygote.Params, x, seen = IdSet())
if x isa AbstractArray{<:Number} && Functors.isleaf(x)
return push!(p, x)
elseif x in seen
nothing
else
_check_new_macro(x) # complains if you used @functor not @layer
push!(seen, x)
for child in trainable(x)
params!(p, child, seen)
end
end
end

function params(m...)
Base.depwarn("""
Flux.params(m...) is deprecated. Use `Flux.trainable(model)` for parameter collection
and the explicit `gradient(m -> loss(m, x, y), model)` for gradient computation.
""", :params)
ps = Params()
params!(ps, m)
return ps
end

# Allows caching of the parameters when params is called within gradient() to fix #2040.
# @non_differentiable params(m...) # https://github.com/FluxML/Flux.jl/pull/2054
# That speeds up implicit use, and silently breaks explicit use.
# From @macroexpand Zygote.@non_differentiable params(m...) and https://github.com/FluxML/Zygote.jl/pull/1248
Zygote._pullback(::Zygote.Context{true}, ::typeof(params), m...) = params(m), _ -> nothing

include("optimise/Optimise.jl") ## deprecated Module


# TODO this friendly error should go in Optimisers.jl.
# remove after https://github.com/FluxML/Optimisers.jl/pull/181
function Optimisers.update!(opt::Optimisers.AbstractRule, model, grad)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end
function Optimisers.update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple)
error("""Invalid input to `update!`.
`update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
""")
end
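
As a migration note for the `params` deprecation above: a flat collection of trainable arrays can instead be obtained with `trainables`, which this commit imports from Optimisers.jl, and gradients are taken explicitly with `gradient(m -> loss(m, x, y), model)` as the warning text says. A minimal sketch, not part of the diff; the `Dense(2 => 3)` layer is only an illustrative stand-in for a real model.

    using Flux
    model = Dense(2 => 3)
    ps = Flux.trainables(model)   # vector of trainable arrays: the 3×2 weight matrix and the length-3 bias
    sum(length, ps)               # total trainable parameter count, here 6 + 3 = 9
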