From 2134bb1e83dca82e679b2b8269182de0f975e945 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Tue, 22 Oct 2024 07:50:05 +0200
Subject: [PATCH] news

---
 NEWS.md                             | 8 ++++++++
 docs/src/guide/models/recurrence.md | 6 ++----
 docs/src/reference/models/layers.md | 2 --
 src/layers/show.jl                  | 2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 654ae70c07..28b5856a24 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,14 @@
 See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a complete list of PRs merged before each release.
 
+## v0.15.0
+* Recurrent layers have undergone a complete redesign in [PR 2500](https://github.com/FluxML/Flux.jl/pull/2500).
+* `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally. Instead, they now take the previous state as input and return the updated state as output.
+* These layers (`RNN`, `LSTM`, `GRU`) now process entire sequences at once, rather than one element at a time.
+* The `Recur` wrapper has been deprecated and removed.
+* The `reset!` function has also been removed; state management is now entirely up to the user.
+* `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing.
+
 ## v0.14.22
 * Data movement between devices is now provided by [MLDataDevices.jl](https://github.com/LuxDL/MLDataDevices.jl).
 
diff --git a/docs/src/guide/models/recurrence.md b/docs/src/guide/models/recurrence.md
index 7827062f22..7acaec8edb 100644
--- a/docs/src/guide/models/recurrence.md
+++ b/docs/src/guide/models/recurrence.md
@@ -169,14 +169,13 @@ X = [seq_1, seq_2]
 Y = [y1, y2]
 data = zip(X,Y)
 
-Flux.reset!(m)
 [m(x) for x in seq_init]
 
 opt = Flux.setup(Adam(1e-3), m)
 Flux.train!(loss, m, data, opt)
 ```
 
-In this previous example, model's state is first reset with `Flux.reset!`. Then, there's a warmup that is performed over a sequence of length 1 by feeding it with `seq_init`, resulting in a warmup state. The model can then be trained for 1 epoch, where 2 batches are provided (`seq_1` and `seq_2`) and all the timesteps outputs are considered for the loss.
+In the previous example, a warmup is first performed over a sequence of length 1 by feeding `seq_init` to the model, resulting in a warmup state. The model can then be trained for 1 epoch, where 2 batches are provided (`seq_1` and `seq_2`) and all the timestep outputs are considered for the loss.
 
 In this scenario, it is important to note that a single continuous sequence is considered. Since the model state is not reset between the 2 batches, the state of the model flows through the batches, which only makes sense in the context where `seq_1` is the continuation of `seq_init` and so on.
 
@@ -187,7 +186,7 @@ x = [rand(Float32, 2, 4) for i = 1:3]
 y = [rand(Float32, 1, 4) for i = 1:3]
 ```
 
-That would mean that we have 4 sentences (or samples), each with 2 features (let's say a very small embedding!) and each with a length of 3 (3 words per sentence). Computing `m(batch[1])`, would still represent `x1 -> y1` in our diagram and returns the first word output, but now for each of the 4 independent sentences (second dimension of the input matrix). We do not need to use `Flux.reset!(m)` here; each sentence in the batch will output in its own "column", and the outputs of the different sentences won't mix.
+That would mean that we have 4 sentences (or samples), each with 2 features (let's say a very small embedding!) and each with a length of 3 (3 words per sentence). Computing `m(batch[1])` would still represent `x1 -> y1` in our diagram and return the first word output, but now for each of the 4 independent sentences (the second dimension of the input matrix). Each sentence in the batch will produce its output in its own "column", and the outputs of the different sentences won't mix.
 
 To illustrate, we go through an example of batching with our implementation of `rnn_cell`. The implementation doesn't need to change; the batching comes for "free" from the way Julia does broadcasting and the rules of matrix multiplication.
 
@@ -223,7 +222,6 @@ In many situations, such as when dealing with a language model, the sentences in
 
 ```julia
 function loss(x, y)
-  Flux.reset!(m)
   sum(mse(m(xi), yi) for (xi, yi) in zip(x, y))
 end
 ```
diff --git a/docs/src/reference/models/layers.md b/docs/src/reference/models/layers.md
index 509702e30c..e2f680e500 100644
--- a/docs/src/reference/models/layers.md
+++ b/docs/src/reference/models/layers.md
@@ -112,8 +112,6 @@ RNN
 LSTM
 GRU
 GRUv3
-Flux.Recur
-Flux.reset!
 ```
 
 ## Normalisation & Regularisation
diff --git a/src/layers/show.jl b/src/layers/show.jl
index a03ddf3754..4d906f1b49 100644
--- a/src/layers/show.jl
+++ b/src/layers/show.jl
@@ -14,7 +14,7 @@ function _macro_big_show(ex)
       end
     end
 
-    # Don't show Chain(Tuple(...)), always splat that. And ignore Recur's non-trainable state:
+    # Don't show Chain(Tuple(...)), always splat that. And ignore non-trainable buffers:
     Flux._show_children(x::$ex) = _flat_children(trainable(x))
   end
 end
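
For reviewers who want to see what the redesigned API looks like from user code, below is a minimal sketch of the explicit state handling described in the v0.15.0 NEWS entry above. It is based only on those bullet points, not on the new docstrings: the `2 => 5` sizes, the zero initial state, the `(features, timesteps, batch)` layout, and the assumption that `RNNCell` returns the updated hidden state are all illustrative.

```julia
using Flux

# Single time-step processing with the newly exported cell (illustrative sizes).
cell = RNNCell(2 => 5)                     # 2 input features, 5 hidden units
xs   = [rand(Float32, 2, 4) for _ in 1:3]  # 3 time steps, batch of 4

# The user now owns the state: no Recur wrapper, no Flux.reset!.
function run_cell(cell, xs, h)
    for x in xs
        h = cell(x, h)  # assumed: previous state in, updated state out
    end
    return h
end

h_final = run_cell(cell, xs, zeros(Float32, 5, 4))

# Whole-sequence processing, as the NEWS entry describes; the
# (features, timesteps, batch) layout is an assumption.
rnn = RNN(2 => 5)
X   = rand(Float32, 2, 3, 4)
Hs  = rnn(X, zeros(Float32, 5, 4))
```

Keeping the state in user code is also why the `Flux.reset!` calls removed from `recurrence.md` above are no longer needed: independent batches simply start again from a fresh initial state.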