From 898b558192d7babba7023b6930fc9b8d4f07962d Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 8 Jun 2022 00:18:14 +0530
Subject: [PATCH 1/9] Add doctests for losses and stateless layers

---
 src/layers/stateless.jl |  25 ++++++++
 src/losses/functions.jl | 122 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 136 insertions(+), 11 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 34e365ae9d..5298b634bb 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -32,6 +32,31 @@ end
 Normalise `x` to mean 0 and standard deviation 1 across the dimension(s) given by `dims`.
 Per default, `dims` is the last dimension. 
 `ϵ` is a small additive factor added to the denominator for numerical stability.
+
+# Examples
+```jldoctest
+julia> x = [9, 10, 20, 60];
+
+julia> Flux.std(x)
+24.01908963026423
+
+julia> y = Flux.normalise(x);
+
+julia> Flux.std(y)
+1.1546999832655012
+
+julia> x = rand(1:100, 10, 2);
+
+julia> Flux.std(x, dims=1)
+1×2 Matrix{Float64}:
+ 28.5324  34.6425
+
+julia> y = Flux.normalise(x, dims=1);
+
+julia> Flux.std(y, dims=1)
+1×2 Matrix{Float64}:
+ 1.05409  1.05409
+```
 """
 @inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5))
   μ = mean(x, dims=dims)
diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index 532d9f3dfb..3cc0f7d827 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -80,6 +80,17 @@ given the prediction `ŷ` and true values `y`.
                  | 0.5 * |ŷ - y|^2,            for |ŷ - y| <= δ
     Huber loss = |
                  |  δ * (|ŷ - y| - 0.5 * δ), otherwise
+
+# Example
+```jldoctest
+julia> ŷ = [1.1, 2.1, 3.1];
+
+julia> Flux.huber_loss(ŷ, 1:3)  # default δ = 1 > |ŷ - y|
+0.005000000000000009
+
+julia> Flux.huber_loss(ŷ, 1:3, δ=0.05)  # changes behaviour as |ŷ - y| > δ
+0.003750000000000005
+```
 """
 function huber_loss(ŷ, y; agg = mean, δ = ofeltype(ŷ, 1))
    _check_sizes(ŷ, y)
@@ -377,12 +388,22 @@ function kldivergence(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ))
 end
 
 """
-    poisson_loss(ŷ, y)
+    poisson_loss(ŷ, y; agg = mean)
 
-# Return how much the predicted distribution `ŷ` diverges from the expected Poisson
-# distribution `y`; calculated as `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
+Return how much the predicted distribution `ŷ` diverges from the expected Poisson
+distribution `y`; calculated as -
+
+    `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
 
 [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
+
+# Example
+```jldoctest
+julia> y_model = [1, 3, 3];  # data should only take integral values
+
+julia> poisson_loss(y_model, 1:3)
+0.5023128522198171
+```
 """
 function poisson_loss(ŷ, y; agg = mean)
   _check_sizes(ŷ, y)
@@ -392,11 +413,32 @@ end
 """
     hinge_loss(ŷ, y; agg = mean)
 
-Return the [hinge_loss loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
+Return the [hinge_loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
 prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
-`sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.
 
+    `sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.
+
+Usually used with classifiers like Support Vector Machines.
 See also: [`squared_hinge_loss`](@ref)
+
+# Example
+```jldoctest
+julia> y_true = [1, -1, 1, 1];
+
+julia> y_pred = [0.1, 0.3, 1, 1.5];
+
+julia> Flux.hinge_loss(y_pred, y_true)
+0.55
+
+julia> Flux.hinge_loss(y_pred[1], y_true[1])  # same sign but |ŷ| < 1
+0.9
+
+julia> Flux.hinge_loss(y_pred[end], y_true[end])  # same sign but |ŷ| >= 1 -> loss = 0
+0.0
+
+julia> Flux.hinge_loss(y_pred[2], y_true[2])  # opposite signs -> loss != 0
+1.3
+```
 """
 function hinge_loss(ŷ, y; agg = mean)
   _check_sizes(ŷ, y)
@@ -407,9 +449,31 @@ end
     squared_hinge_loss(ŷ, y)
 
 Return the squared hinge_loss loss given the prediction `ŷ` and true labels `y`
-(containing 1 or -1); calculated as `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.
+(containing 1 or -1); calculated as
 
+    `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.
+
+Usually used with classifiers like Support Vector Machines.
 See also: [`hinge_loss`](@ref)
+
+# Example
+```jldoctest
+julia> y_true = [1, -1, 1, 1];
+
+julia> y_pred = [0.1, 0.3, 1, 1.5];
+
+julia> Flux.squared_hinge_loss(y_pred, y_true)
+0.625
+
+julia> Flux.squared_hinge_loss(y_pred[1], y_true[1])  # same sign but |ŷ| < 1
+0.81
+
+julia> Flux.squared_hinge_loss(y_pred[end], y_true[end])  # same sign and |ŷ| >= 1 -> loss = 0
+0.0
+
+julia> Flux.squared_hinge_loss(y_pred[2], y_true[2])  # opposite signs -> loss != 0
+1.6900000000000002
+```
 """
 function squared_hinge_loss(ŷ, y; agg = mean)
   _check_sizes(ŷ, y)
@@ -422,9 +486,20 @@ end
 Return a loss based on the dice coefficient.
 Used in the [V-Net](https://arxiv.org/abs/1606.04797) image segmentation
 architecture.
-Similar to the F1_score. Calculated as:
+The dice coefficient is similar to the F1_score. Loss calculated as:
 
     1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)
+
+# Example
+```jldoctest
+julia> y_pred = [1.1, 2.1, 3.1];
+
+julia> Flux.dice_coeff_loss(y_pred, 1:3)
+0.000992391663909964
+
+julia> 1 - Flux.dice_coeff_loss(y_pred, 1:3)  # ~ F1 score for image segmentation
+0.99900760833609
+```
 """
 function dice_coeff_loss(ŷ, y; smooth = ofeltype(ŷ, 1.0))
   _check_sizes(ŷ, y)
@@ -438,7 +513,23 @@ Return the [Tversky loss](https://arxiv.org/abs/1706.05721).
 Used with imbalanced data to give more weight to false negatives.
 Larger β weigh recall more than precision (by placing more emphasis on false negatives)
 Calculated as:
+
     1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
+
+# Example
+```jldoctest
+julia> ŷ = [1, 0, 1, 1, 0];
+
+julia> y = [1, 0, 0, 1, 0];  # one false negative data point
+
+julia> Flux.tversky_loss(ŷ, y)
+0.18918918918918926
+
+julia> y = [1, 1, 1, 1, 0];  # No false negatives, but a false positive
+
+julia> Flux.tversky_loss(ŷ, y)  # loss is smaller as more weight given to the false negatives
+0.06976744186046513
+```
 """
 function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7))
     _check_sizes(ŷ, y)
@@ -456,6 +547,8 @@ The input, 'ŷ', is expected to be normalized (i.e. [softmax](@ref Softmax) out
 
 For `γ == 0`, the loss is mathematically equivalent to [`Losses.binarycrossentropy`](@ref).
 
+See also: [`Losses.focal_loss`](@ref) for multi-class setting
+
 # Example
 ```jldoctest
 julia> y = [0  1  0
@@ -473,9 +566,6 @@ julia> ŷ = [0.268941  0.5  0.268941
 julia> Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385
 true
 ```
-
-See also: [`Losses.focal_loss`](@ref) for multi-class setting
-
 """
 function binary_focal_loss(ŷ, y; agg=mean, γ=2, ϵ=epseltype(ŷ))
     _check_sizes(ŷ, y)
@@ -536,7 +626,17 @@ which can be useful for training Siamese Networks. It is given by
     agg(@. (1 - y) * ŷ^2 + y * max(0, margin - ŷ)^2)                           
                                  
 Specify `margin` to set the baseline for distance at which pairs are dissimilar.
-                                    
+
+# Example
+```jldoctest
+julia> ŷ = [0.5, 1.5, 2.5];
+
+julia> Flux.siamese_contrastive_loss(ŷ, 1:3)
+-4.833333333333333
+
+julia> Flux.siamese_contrastive_loss(ŷ, 1:3, margin = 2)
+-4.0
+```
 """
 function siamese_contrastive_loss(ŷ, y; agg = mean, margin::Real = 1)
     _check_sizes(ŷ, y)

From 0a7f42a96e0e64f9d20325709b93b4e1099512ef Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 8 Jun 2022 00:47:17 +0530
Subject: [PATCH 2/9] Fix and update docs for `utils.jl`

---
 docs/src/utilities.md   |  2 +
 src/layers/basic.jl     |  6 +--
 src/layers/conv.jl      |  6 +--
 src/layers/normalise.jl | 16 ++++----
 src/utils.jl            | 91 ++++++++++++++++++++++++++++-------------
 5 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/docs/src/utilities.md b/docs/src/utilities.md
index 6e6226a45f..1d45d07485 100644
--- a/docs/src/utilities.md
+++ b/docs/src/utilities.md
@@ -42,7 +42,9 @@ Flux.orthogonal
 Flux.sparse_init
 Flux.identity_init
 Flux.ones32
+Flux.zeros32
 Flux.rand32
+Flux.randn32
 ```
 
 ## Changing the type of model parameters
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 42813cb5f7..51c5fda9b1 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -155,7 +155,7 @@ struct Dense{F, M<:AbstractMatrix, B}
   bias::B
   σ::F
   function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix, F}
-    b = create_bias(W, bias, size(W,1))
+    b = _create_bias(W, bias, size(W,1))
     new{F,M,typeof(b)}(W, b, σ)
   end
 end
@@ -228,7 +228,7 @@ struct Scale{F, A<:AbstractArray, B}
   bias::B
   σ::F
   function Scale(scale::A, bias::B = true, σ::F = identity) where {A<:AbstractArray, B<:Union{Bool, AbstractArray}, F}
-    b = create_bias(scale, bias, size(scale)...)
+    b = _create_bias(scale, bias, size(scale)...)
     new{F, A, typeof(b)}(scale, b, σ)
   end
 end
@@ -403,7 +403,7 @@ struct Bilinear{F,A,B}
   σ::F
   function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray, F}
     ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights"))
-    b = create_bias(W, bias, size(W,1))
+    b = _create_bias(W, bias, size(W,1))
     new{F,A,typeof(b)}(W, b, σ)
   end
 end
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index 003395c15d..36aa5c8430 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -156,7 +156,7 @@ function Conv(w::AbstractArray{T,N}, b = true, σ = identity;
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride)
-  bias = create_bias(w, b, size(w, N))
+  bias = _create_bias(w, b, size(w, N))
   return Conv(σ, w, bias, stride, pad, dilation, groups)
 end
 
@@ -293,7 +293,7 @@ function ConvTranspose(w::AbstractArray{T,N}, bias = true, σ = identity;
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride)
-  b = create_bias(w, bias, size(w, N-1) * groups)
+  b = _create_bias(w, bias, size(w, N-1) * groups)
   return ConvTranspose(σ, w, b, stride, pad, dilation, groups)
 end
 
@@ -441,7 +441,7 @@ function CrossCor(w::AbstractArray{T,N}, bias = true, σ = identity;
   stride = expand(Val(N-2), stride)
   dilation = expand(Val(N-2), dilation)
   pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride)
-  b = create_bias(w, bias, size(w, N))
+  b = _create_bias(w, bias, size(w, N))
   return CrossCor(σ, w, b, stride, pad, dilation)
 end
 
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index f1f6c22033..0874be9d98 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -10,7 +10,7 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)
 
 """
-    dropout([rng = rng_from_array(x)], x, p; dims=:, active=true)
+    dropout([rng = _rng_from_array(x)], x, p; dims=:, active=true)
 
 The dropout function. If `active` is `true`,
 for each input, either sets that input to `0` (with probability
@@ -34,7 +34,7 @@ function dropout(rng, x, p; dims=:, active::Bool=true)
   y = dropout_mask(rng, x, p, dims=dims)
   return x .* y
 end
-dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
+dropout(x, p; kwargs...) = dropout(_rng_from_array(x), x, p; kwargs...)
 
 dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
 dropout_mask(rng, x::CuArray, p; kwargs...) =
@@ -51,7 +51,7 @@ end
 ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any)
 
 """
-    Dropout(p; dims=:, rng = rng_from_array())
+    Dropout(p; dims=:, rng = _rng_from_array())
 
 Dropout layer.
 
@@ -96,9 +96,9 @@ mutable struct Dropout{F,D,R<:AbstractRNG}
   active::Union{Bool, Nothing}
   rng::R
 end
-Dropout(p, dims, active) = Dropout(p, dims, active, rng_from_array())
+Dropout(p, dims, active) = Dropout(p, dims, active, _rng_from_array())
 
-function Dropout(p; dims=:, rng = rng_from_array())
+function Dropout(p; dims=:, rng = _rng_from_array())
   @assert 0 ≤ p ≤ 1
   Dropout(p, dims, nothing, rng)
 end
@@ -121,7 +121,7 @@ function Base.show(io::IO, d::Dropout)
 end
 
 """
-    AlphaDropout(p; rng = rng_from_array())
+    AlphaDropout(p; rng = _rng_from_array())
 
 A dropout layer. Used in
 [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -155,8 +155,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG}
     new{typeof(p), typeof(rng)}(p, active, rng)
   end
 end
-AlphaDropout(p, active) = AlphaDropout(p, active, rng_from_array())
-AlphaDropout(p; rng = rng_from_array()) = AlphaDropout(p, nothing, rng)
+AlphaDropout(p, active) = AlphaDropout(p, active, _rng_from_array())
+AlphaDropout(p; rng = _rng_from_array()) = AlphaDropout(p, nothing, rng)
 
 @functor AlphaDropout
 trainable(a::AlphaDropout) = (;)
diff --git a/src/utils.jl b/src/utils.jl
index 85dc8b711f..e830a6a71d 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -34,7 +34,7 @@ ofeltype(x, y) = convert(float(eltype(x)), y)
 epseltype(x) = eps(float(eltype(x)))
 
 """
-    rng_from_array([x])
+    _rng_from_array([x])
 
 Create an instance of the RNG most appropriate for `x`.
 The current defaults are:
@@ -43,12 +43,12 @@ The current defaults are:
   - Julia version is < 1.7: `Random.GLOBAL_RNG`
   - Julia version is >= 1.7: `Random.default_rng()`
 """
-rng_from_array(::AbstractArray) = rng_from_array()
-rng_from_array(::CuArray) = CUDA.default_rng()
+_rng_from_array(::AbstractArray) = _rng_from_array()
+_rng_from_array(::CuArray) = CUDA.default_rng()
 if VERSION >= v"1.7"
-  rng_from_array() = Random.default_rng()
+  _rng_from_array() = Random.default_rng()
 else
-  rng_from_array() = Random.GLOBAL_RNG
+  _rng_from_array() = Random.GLOBAL_RNG
 end
 
 """
@@ -91,8 +91,8 @@ function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real=1)
   scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...)))
   (rand(rng, Float32, dims...) .- 0.5f0) .* scale
 end
-glorot_uniform(dims::Integer...; kw...) = glorot_uniform(rng_from_array(), dims...; kw...)
-glorot_uniform(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...)
+glorot_uniform(dims::Integer...; kw...) = glorot_uniform(_rng_from_array(), dims...; kw...)
+glorot_uniform(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable glorot_uniform(::Any...)
 
@@ -134,8 +134,8 @@ function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real=1)
   std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...)))
   randn(rng, Float32, dims...) .* std
 end
-glorot_normal(dims::Integer...; kwargs...) = glorot_normal(rng_from_array(), dims...; kwargs...)
-glorot_normal(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...)
+glorot_normal(dims::Integer...; kwargs...) = glorot_normal(_rng_from_array(), dims...; kwargs...)
+glorot_normal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable glorot_normal(::Any...)
 
@@ -169,8 +169,8 @@ function kaiming_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = √2)
   return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound
 end
 
-kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(rng_from_array(), dims...; kwargs...)
-kaiming_uniform(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...)
+kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(_rng_from_array(), dims...; kwargs...)
+kaiming_uniform(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable kaiming_uniform(::Any...)
 
@@ -206,7 +206,7 @@ function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2f0)
   return randn(rng, Float32, dims...) .* std
 end
 
-kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(rng_from_array(), dims...; kwargs...)
+kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(_rng_from_array(), dims...; kwargs...)
 kaiming_normal(rng::AbstractRNG; init_kwargs...) = (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable kaiming_normal(::Any...)
@@ -252,8 +252,8 @@ function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1,
   return xs
 end
 
-truncated_normal(dims::Integer...; kwargs...) = truncated_normal(rng_from_array(), dims...; kwargs...)
-truncated_normal(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...)
+truncated_normal(dims::Integer...; kwargs...) = truncated_normal(_rng_from_array(), dims...; kwargs...)
+truncated_normal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable truncated_normal(::Any...)
 
@@ -313,8 +313,8 @@ function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...)
   return reshape(orthogonal(rng, rows, cols; kwargs...), dims)
 end
 
-orthogonal(dims::Integer...; kwargs...) = orthogonal(rng_from_array(), dims...; kwargs...)
-orthogonal(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...)
+orthogonal(dims::Integer...; kwargs...) = orthogonal(_rng_from_array(), dims...; kwargs...)
+orthogonal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable orthogonal(::Any...)
 
@@ -361,8 +361,8 @@ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01)
   return mapslices(shuffle, sparse_array, dims=1)
 end
 
-sparse_init(dims::Integer...; kwargs...) = sparse_init(rng_from_array(), dims...; kwargs...)
-sparse_init(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...)
+sparse_init(dims::Integer...; kwargs...) = sparse_init(_rng_from_array(), dims...; kwargs...)
+sparse_init(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable sparse_init(::Any...)
 
@@ -452,7 +452,7 @@ end
 
 # For consistency, it accepts an RNG, but ignores it:
 identity_init(::AbstractRNG, dims::Integer...; kwargs...) = identity_init(dims...; kwargs...)
-identity_init(rng::AbstractRNG=rng_from_array(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...)
+identity_init(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable identity_init(::Any...)
 
@@ -461,33 +461,40 @@ zeros32(dims::Integer...) = Base.zeros(Float32, dims...)
 
 """
     ones32(size...) = ones(Float32, size...)
-    zeros32(size...) = zeros(Float32, size...)
 
-Return an `Array{Float32}` of the given `size`.
+Return an `Array{Float32}` of the given `size` filled with 1s.
 """
 ones32(dims...) = Base.ones(Float32, dims...)
 
-@doc @doc(ones32)
+"""
+    zeros32(size...) = zeros(Float32, size...)
+
+Return an `Array{Float32}` of the given `size` filled with 0s.
+"""
 zeros32(dims...) = Base.zeros(Float32, dims...)
 
 """
     rand32([rng], size...)
-    randn32([rng], size...)
 
-Return an `Array{Float32}` of the given `size`, filled like `rand` or `randn`.
+Return an `Array{Float32}` of the given `size`, filled like `rand`.
 When the size is not provided, `rand32(rng::AbstractRNG)` returns a function.
 """
 rand32(dims::Integer...) = Base.rand(Float32, dims...)
 rand32(rng::AbstractRNG, dims::Integer...) = Base.rand(rng, Float32, dims...)
 rand32(rng::AbstractRNG) = (dims...,) -> Base.rand(rng, Float32, dims...)
 
-@doc @doc(rand32)
+"""
+    randn32([rng], size...)
+
+Return an `Array{Float32}` of the given `size`, filled like `randn`.
+When the size is not provided, `randn32(rng::AbstractRNG)` returns a function.
+"""
 randn32(dims::Integer...) = Base.randn(Float32, dims...)
 randn32(rng::AbstractRNG, dims::Integer...) = Base.randn(rng, Float32, dims...)
 randn32(rng::AbstractRNG) = (dims...,) -> Base.randn(rng, Float32, dims...)
 
 """
-    create_bias(weights, bias, size...)
+    _create_bias(weights, bias, size...)
 
 Return a bias parameter for a layer, based on the value given
 to the constructor's keyword `bias=bias`.
@@ -497,10 +504,10 @@ to the constructor's keyword `bias=bias`.
 * `bias::AbstractArray` uses the array provided, provided it has the correct size.
   It does not at present correct the `eltype` to match that of `weights`.
 """
-function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...)
+function _create_bias(weights::AbstractArray, bias::Bool, dims::Integer...)
   bias ? fill!(similar(weights, dims...), 0) : false
 end
-function create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...)
+function _create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...)
   size(bias) == dims || throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))"))
   bias
 end
@@ -518,6 +525,34 @@ Normally, the throttled function will run as much as it can, without ever
 going more than once per `wait` duration; but if you'd like to disable the
 execution on the leading edge, pass `leading=false`. To enable execution on
 the trailing edge, pass `trailing=true`.
+
+# Examples
+```jldoctest
+julia> a = Flux.throttle(() -> println("Flux"), 2);
+
+julia> a()
+Flux
+
+julia> a()
+Flux
+
+julia> for i = 1:4  # sleeps for 1 second -> the function can be called in alternate iterations
+           a()
+           sleep(1)
+       end
+Flux
+Flux
+
+julia> for i = 1:4  # sleeps for 2 second -> the function can be called in the next iteration
+           a()
+           sleep(2)
+       end
+Flux
+Flux
+Flux
+Flux
+
+```
 """
 function throttle(f, timeout; leading=true, trailing=false)
   cooldown = true

From 0c7bdb946e67a1c82fac797b0535a3c28627e4e9 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 8 Jun 2022 01:02:44 +0530
Subject: [PATCH 3/9] Minor typos

---
 src/losses/functions.jl | 6 +++---
 src/utils.jl            | 4 ----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index 3cc0f7d827..352e5e4672 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -393,7 +393,7 @@ end
 Return how much the predicted distribution `ŷ` diverges from the expected Poisson
 distribution `y`; calculated as -
 
-    `sum(ŷ .- y .* log.(ŷ)) / size(y, 2)`.
+    sum(ŷ .- y .* log.(ŷ)) / size(y, 2)
 
 [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson).
 
@@ -416,7 +416,7 @@ end
 Return the [hinge_loss](https://en.wikipedia.org/wiki/Hinge_loss) given the
 prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as
 
-    `sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)`.
+    sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)
 
 Usually used with classifiers like Support Vector Machines.
 See also: [`squared_hinge_loss`](@ref)
@@ -451,7 +451,7 @@ end
 Return the squared hinge_loss loss given the prediction `ŷ` and true labels `y`
 (containing 1 or -1); calculated as
 
-    `sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)`.
+    sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)
 
 Usually used with classifiers like Support Vector Machines.
 See also: [`hinge_loss`](@ref)
diff --git a/src/utils.jl b/src/utils.jl
index e830a6a71d..15525c6357 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -533,9 +533,6 @@ julia> a = Flux.throttle(() -> println("Flux"), 2);
 julia> a()
 Flux
 
-julia> a()
-Flux
-
 julia> for i = 1:4  # sleeps for 1 second -> the function can be called in alternate iterations
            a()
            sleep(1)
@@ -551,7 +548,6 @@ Flux
 Flux
 Flux
 Flux
-
 ```
 """
 function throttle(f, timeout; leading=true, trailing=false)

From dcbdf07534ae46b972b6927ffa838593a41c3dc1 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Thu, 9 Jun 2022 20:18:13 +0530
Subject: [PATCH 4/9] Fix the implementation and docstring of `tversky_loss`

---
 src/losses/functions.jl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index 352e5e4672..b4eb2f4d50 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -511,31 +511,31 @@ end
 
 Return the [Tversky loss](https://arxiv.org/abs/1706.05721).
 Used with imbalanced data to give more weight to false negatives.
-Larger β weigh recall more than precision (by placing more emphasis on false negatives)
+Larger values of β weigh recall more than precision (by placing more emphasis on false negatives).
 Calculated as:
 
-    1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + β*(1 .- y) .* ŷ + (1 - β)*y .* (1 .- ŷ)) + 1)
+    1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + (1 - β)*(1 .- y) .* ŷ + β*y .* (1 .- ŷ)) + 1)
 
 # Example
 ```jldoctest
-julia> ŷ = [1, 0, 1, 1, 0];
+julia> y = [0, 1, 0, 1, 1, 1];
 
-julia> y = [1, 0, 0, 1, 0];  # one false negative data point
+julia> ŷ_fp = [1, 1, 1, 1, 1, 1];  # 2 false positives -> 2 wrong predictions
 
-julia> Flux.tversky_loss(ŷ, y)
-0.18918918918918926
+julia> ŷ_fnp = [1, 1, 0, 1, 1, 0];  # 1 false negative, 1 false positive -> 2 wrong predictions
 
-julia> y = [1, 1, 1, 1, 0];  # No false negatives, but a false positive
+julia> Flux.tversky_loss(ŷ_fnp, y)
+0.19999999999999996
 
-julia> Flux.tversky_loss(ŷ, y)  # loss is smaller as more weight given to the false negatives
-0.06976744186046513
+julia> Flux.tversky_loss(ŷ_fp, y)  # should be smaller than tversky_loss(ŷ_fnp, y), as FN is given more weight
+0.1071428571428571
 ```
 """
 function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7))
     _check_sizes(ŷ, y)
     #TODO add agg
     num = sum(y .* ŷ) + 1
-    den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
+    den = sum(y .* ŷ + (1 - β) * (1 .- y) .* ŷ + β * y .* (1 .- ŷ)) + 1
     1 - num / den
 end
 

From 02c8a950154c8ea6f9aaea5167f01ab858d786c0 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Tue, 14 Jun 2022 16:58:58 +0530
Subject: [PATCH 5/9] Clean up the doctests + fix tests

---
 src/layers/stateless.jl | 18 ++++++------------
 src/losses/functions.jl | 30 +++++++++++++++---------------
 src/utils.jl            |  2 +-
 test/losses.jl          |  4 ++--
 4 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 5298b634bb..06c8b6a4a9 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -35,27 +35,21 @@ Per default, `dims` is the last dimension.
 
 # Examples
 ```jldoctest
-julia> x = [9, 10, 20, 60];
+julia> using Statistics
 
-julia> Flux.std(x)
-24.01908963026423
+julia> x = [9, 10, 20, 60];
 
 julia> y = Flux.normalise(x);
 
-julia> Flux.std(y)
-1.1546999832655012
+julia> isapprox(std(y), 1, atol=0.2) && std(y) != std(x)
+true
 
 julia> x = rand(1:100, 10, 2);
 
-julia> Flux.std(x, dims=1)
-1×2 Matrix{Float64}:
- 28.5324  34.6425
-
 julia> y = Flux.normalise(x, dims=1);
 
-julia> Flux.std(y, dims=1)
-1×2 Matrix{Float64}:
- 1.05409  1.05409
+julia> isapprox(std(y, dims=1), ones(1, 2), atol=0.2) && std(y, dims=1) != std(x, dims=1)
+true
 ```
 """
 @inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5))
diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index b4eb2f4d50..e40f7c1cff 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -401,7 +401,7 @@ distribution `y`; calculated as -
 ```jldoctest
 julia> y_model = [1, 3, 3];  # data should only take integral values
 
-julia> poisson_loss(y_model, 1:3)
+julia> Flux.poisson_loss(y_model, 1:3)
 0.5023128522198171
 ```
 """
@@ -430,14 +430,14 @@ julia> y_pred = [0.1, 0.3, 1, 1.5];
 julia> Flux.hinge_loss(y_pred, y_true)
 0.55
 
-julia> Flux.hinge_loss(y_pred[1], y_true[1])  # same sign but |ŷ| < 1
-0.9
+julia> Flux.hinge_loss(y_pred[1], y_true[1]) != 0  # same sign but |ŷ| < 1
+true
 
-julia> Flux.hinge_loss(y_pred[end], y_true[end])  # same sign but |ŷ| >= 1 -> loss = 0
-0.0
+julia> Flux.hinge_loss(y_pred[end], y_true[end]) == 0  # same sign but |ŷ| >= 1
+true
 
-julia> Flux.hinge_loss(y_pred[2], y_true[2])  # opposite signs -> loss != 0
-1.3
+julia> Flux.hinge_loss(y_pred[2], y_true[2]) != 0 # opposite signs
+true
 ```
 """
 function hinge_loss(ŷ, y; agg = mean)
@@ -465,14 +465,14 @@ julia> y_pred = [0.1, 0.3, 1, 1.5];
 julia> Flux.squared_hinge_loss(y_pred, y_true)
 0.625
 
-julia> Flux.squared_hinge_loss(y_pred[1], y_true[1])  # same sign but |ŷ| < 1
-0.81
+julia> Flux.squared_hinge_loss(y_pred[1], y_true[1]) != 0
+true
 
-julia> Flux.squared_hinge_loss(y_pred[end], y_true[end])  # same sign and |ŷ| >= 1 -> loss = 0
-0.0
+julia> Flux.squared_hinge_loss(y_pred[end], y_true[end]) == 0
+true
 
-julia> Flux.squared_hinge_loss(y_pred[2], y_true[2])  # opposite signs -> loss != 0
-1.6900000000000002
+julia> Flux.squared_hinge_loss(y_pred[2], y_true[2]) != 0
+true
 ```
 """
 function squared_hinge_loss(ŷ, y; agg = mean)
@@ -527,8 +527,8 @@ julia> ŷ_fnp = [1, 1, 0, 1, 1, 0];  # 1 false negative, 1 false positive -> 2
 julia> Flux.tversky_loss(ŷ_fnp, y)
 0.19999999999999996
 
-julia> Flux.tversky_loss(ŷ_fp, y)  # should be smaller than tversky_loss(ŷ_fnp, y), as FN is given more weight
-0.1071428571428571
+julia> Flux.tversky_loss(ŷ_fp, y) < Flux.tversky_loss(ŷ_fnp, y)  # FN is given more weight
+true
 ```
 """
 function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7))
diff --git a/src/utils.jl b/src/utils.jl
index 15525c6357..c67d1a6270 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -535,7 +535,7 @@ Flux
 
 julia> for i = 1:4  # sleeps for 1 second -> the function can be called in alternate iterations
            a()
-           sleep(1)
+           sleep(1.5)
        end
 Flux
 Flux
diff --git a/test/losses.jl b/test/losses.jl
index 2ca697a657..c1061bb388 100644
--- a/test/losses.jl
+++ b/test/losses.jl
@@ -163,8 +163,8 @@ y = [1.0 0.5 0.3 2.4]
 end
 
 @testset "tversky_loss" begin
-  @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
-  @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ -0.09490740740740744
+  @test Flux.tversky_loss(ŷ, y) ≈ 0.028747433264887046
+  @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ 0.050200803212851364
   @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
 end
 

From 9dec787668046fc09e72b897e5df777084006d9a Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Tue, 14 Jun 2022 20:00:22 +0530
Subject: [PATCH 6/9] Fix doctests

---
 src/utils.jl | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/utils.jl b/src/utils.jl
index c67d1a6270..44ba48bde4 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -530,24 +530,12 @@ the trailing edge, pass `trailing=true`.
 ```jldoctest
 julia> a = Flux.throttle(() -> println("Flux"), 2);
 
-julia> a()
-Flux
-
-julia> for i = 1:4  # sleeps for 1 second -> the function can be called in alternate iterations
+julia> for i = 1:4  # a() actually runs only on every other iteration
            a()
-           sleep(1.5)
+           sleep(1)
        end
 Flux
 Flux
-
-julia> for i = 1:4  # sleeps for 2 second -> the function can be called in the next iteration
-           a()
-           sleep(2)
-       end
-Flux
-Flux
-Flux
-Flux
 ```
 """
 function throttle(f, timeout; leading=true, trailing=false)

From 4773a697f59c1d4dd10e58629f5e8f65a6111c5d Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Sat, 6 Aug 2022 14:32:48 +0530
Subject: [PATCH 7/9] `rng_from_array()` -> `default_rng_value()`

---
 docs/src/utilities.md   |  2 ++
 src/layers/normalise.jl | 12 ++++-----
 src/utils.jl            | 58 +++++++++++++++++++++++------------------
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/docs/src/utilities.md b/docs/src/utilities.md
index 1d45d07485..28f6bc4a18 100644
--- a/docs/src/utilities.md
+++ b/docs/src/utilities.md
@@ -45,6 +45,8 @@ Flux.ones32
 Flux.zeros32
 Flux.rand32
 Flux.randn32
+Flux.rng_from_array
+Flux.default_rng_value
 ```
 
 ## Changing the type of model parameters
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 0874be9d98..18cb4ac479 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -51,7 +51,7 @@ end
 ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any)
 
 """
-    Dropout(p; dims=:, rng = _rng_from_array())
+    Dropout(p; dims=:, rng = default_rng_value())
 
 Dropout layer.
 
@@ -96,9 +96,9 @@ mutable struct Dropout{F,D,R<:AbstractRNG}
   active::Union{Bool, Nothing}
   rng::R
 end
-Dropout(p, dims, active) = Dropout(p, dims, active, _rng_from_array())
+Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value())
 
-function Dropout(p; dims=:, rng = _rng_from_array())
+function Dropout(p; dims=:, rng = default_rng_value())
   @assert 0 ≤ p ≤ 1
   Dropout(p, dims, nothing, rng)
 end
@@ -121,7 +121,7 @@ function Base.show(io::IO, d::Dropout)
 end
 
 """
-    AlphaDropout(p; rng = _rng_from_array())
+    AlphaDropout(p; rng = default_rng_value())
 
 A dropout layer. Used in
 [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -155,8 +155,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG}
     new{typeof(p), typeof(rng)}(p, active, rng)
   end
 end
-AlphaDropout(p, active) = AlphaDropout(p, active, _rng_from_array())
-AlphaDropout(p; rng = _rng_from_array()) = AlphaDropout(p, nothing, rng)
+AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value())
+AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng)
 
 @functor AlphaDropout
 trainable(a::AlphaDropout) = (;)
diff --git a/src/utils.jl b/src/utils.jl
index 44ba48bde4..9624bfbefb 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -43,16 +43,24 @@ The current defaults are:
   - Julia version is < 1.7: `Random.GLOBAL_RNG`
   - Julia version is >= 1.7: `Random.default_rng()`
 """
-_rng_from_array(::AbstractArray) = _rng_from_array()
-_rng_from_array(::CuArray) = CUDA.default_rng()
+rng_from_array(::AbstractArray) = default_rng_value()
+rng_from_array(::CuArray) = CUDA.default_rng()
+
 if VERSION >= v"1.7"
-  _rng_from_array() = Random.default_rng()
+  @doc """
+      default_rng_value()
+
+  Create an instance of the default RNG depending on Julia's version.
+  - Julia version is < 1.7: `Random.GLOBAL_RNG`
+  - Julia version is >= 1.7: `Random.default_rng()`
+  """
+  default_rng_value() = Random.default_rng()
 else
-  _rng_from_array() = Random.GLOBAL_RNG
+  default_rng_value() = Random.GLOBAL_RNG
 end
 
 """
-    glorot_uniform([rng=GLOBAL_RNG], size...; gain = 1) -> Array
+    glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array
     glorot_uniform([rng]; kw...) -> Function
 
 Return an `Array{Float32}` of the given `size` containing random numbers drawn from a uniform
@@ -91,13 +99,13 @@ function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real=1)
   scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...)))
   (rand(rng, Float32, dims...) .- 0.5f0) .* scale
 end
-glorot_uniform(dims::Integer...; kw...) = glorot_uniform(_rng_from_array(), dims...; kw...)
-glorot_uniform(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...)
+glorot_uniform(dims::Integer...; kw...) = glorot_uniform(default_rng_value(), dims...; kw...)
+glorot_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable glorot_uniform(::Any...)
 
 """
-    glorot_normal([rng=GLOBAL_RNG], size...; gain = 1) -> Array
+    glorot_normal([rng = default_rng_value()], size...; gain = 1) -> Array
     glorot_normal([rng]; kw...) -> Function
 
 Return an `Array{Float32}` of the given `size` containing random numbers drawn from a normal
@@ -134,13 +142,13 @@ function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real=1)
   std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...)))
   randn(rng, Float32, dims...) .* std
 end
-glorot_normal(dims::Integer...; kwargs...) = glorot_normal(_rng_from_array(), dims...; kwargs...)
-glorot_normal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...)
+glorot_normal(dims::Integer...; kwargs...) = glorot_normal(default_rng_value(), dims...; kwargs...)
+glorot_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable glorot_normal(::Any...)
 
 """
-    kaiming_uniform([rng=GLOBAL_RNG], size...; gain = √2) -> Array
+    kaiming_uniform([rng = default_rng_value()], size...; gain = √2) -> Array
     kaiming_uniform([rng]; kw...) -> Function
 
 Return an `Array{Float32}` of the given `size` containing random numbers drawn from a uniform distribution
@@ -169,13 +177,13 @@ function kaiming_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = √2)
   return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound
 end
 
-kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(_rng_from_array(), dims...; kwargs...)
-kaiming_uniform(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...)
+kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(default_rng_value(), dims...; kwargs...)
+kaiming_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable kaiming_uniform(::Any...)
 
 """
-    kaiming_normal([rng=GLOBAL_RNG], size...; gain = √2) -> Array
+    kaiming_normal([rng = default_rng_value()], size...; gain = √2) -> Array
     kaiming_normal([rng]; kw...) -> Function
 
 Return an `Array{Float32}` of the given `size` containing random numbers taken from a normal
@@ -206,13 +214,13 @@ function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2f0)
   return randn(rng, Float32, dims...) .* std
 end
 
-kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(_rng_from_array(), dims...; kwargs...)
+kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(default_rng_value(), dims...; kwargs...)
 kaiming_normal(rng::AbstractRNG; init_kwargs...) = (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable kaiming_normal(::Any...)
 
 """
-    truncated_normal([rng=GLOBAL_RNG], size...; mean = 0, std = 1, lo = -2, hi = 2) -> Array
+    truncated_normal([rng = default_rng_value()], size...; mean = 0, std = 1, lo = -2, hi = 2) -> Array
     truncated_normal([rng]; kw...) -> Function
   
 Return an `Array{Float32}` of the given `size` where each element is drawn from a truncated normal distribution.
@@ -252,13 +260,13 @@ function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1,
   return xs
 end
 
-truncated_normal(dims::Integer...; kwargs...) = truncated_normal(_rng_from_array(), dims...; kwargs...)
-truncated_normal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...)
+truncated_normal(dims::Integer...; kwargs...) = truncated_normal(default_rng_value(), dims...; kwargs...)
+truncated_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable truncated_normal(::Any...)
 
 """
-    orthogonal([rng=GLOBAL_RNG], size...; gain = 1) -> Array
+    orthogonal([rng = default_rng_value()], size...; gain = 1) -> Array
     orthogonal([rng]; kw...) -> Function
 
 Return an `Array{Float32}` of the given `size` which is a (semi) orthogonal matrix, as described in [1].
@@ -313,13 +321,13 @@ function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...)
   return reshape(orthogonal(rng, rows, cols; kwargs...), dims)
 end
 
-orthogonal(dims::Integer...; kwargs...) = orthogonal(_rng_from_array(), dims...; kwargs...)
-orthogonal(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...)
+orthogonal(dims::Integer...; kwargs...) = orthogonal(default_rng_value(), dims...; kwargs...)
+orthogonal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable orthogonal(::Any...)
 
 """
-    sparse_init([rng=GLOBAL_RNG], rows, cols; sparsity, std = 0.01) -> Array
+    sparse_init([rng = default_rng_value()], rows, cols; sparsity, std = 0.01) -> Array
     sparse_init([rng]; kw...) -> Function
 
 Return a `Matrix{Float32}` of size `rows, cols` where each column contains a fixed fraction of
@@ -361,8 +369,8 @@ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01)
   return mapslices(shuffle, sparse_array, dims=1)
 end
 
-sparse_init(dims::Integer...; kwargs...) = sparse_init(_rng_from_array(), dims...; kwargs...)
-sparse_init(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...)
+sparse_init(dims::Integer...; kwargs...) = sparse_init(default_rng_value(), dims...; kwargs...)
+sparse_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable sparse_init(::Any...)
 
@@ -452,7 +460,7 @@ end
 
 # For consistency, it accepts an RNG, but ignores it:
 identity_init(::AbstractRNG, dims::Integer...; kwargs...) = identity_init(dims...; kwargs...)
-identity_init(rng::AbstractRNG=_rng_from_array(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...)
+identity_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...)
 
 ChainRulesCore.@non_differentiable identity_init(::Any...)
 

From a67400fc195d4c80e253eb068d114c1a0c9a5825 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Sat, 6 Aug 2022 15:52:11 +0530
Subject: [PATCH 8/9] fix tests (typos)

---
 src/layers/normalise.jl | 4 ++--
 src/utils.jl            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 18cb4ac479..446575f355 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -10,7 +10,7 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)
 
 """
-    dropout([rng = _rng_from_array(x)], x, p; dims=:, active=true)
+    dropout([rng = rng_from_array(x)], x, p; dims=:, active=true)
 
 The dropout function. If `active` is `true`,
 for each input, either sets that input to `0` (with probability
@@ -34,7 +34,7 @@ function dropout(rng, x, p; dims=:, active::Bool=true)
   y = dropout_mask(rng, x, p, dims=dims)
   return x .* y
 end
-dropout(x, p; kwargs...) = dropout(_rng_from_array(x), x, p; kwargs...)
+dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
 
 dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
 dropout_mask(rng, x::CuArray, p; kwargs...) =
diff --git a/src/utils.jl b/src/utils.jl
index 9624bfbefb..ad963d5ed6 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -34,7 +34,7 @@ ofeltype(x, y) = convert(float(eltype(x)), y)
 epseltype(x) = eps(float(eltype(x)))
 
 """
-    _rng_from_array([x])
+    rng_from_array(x)
 
 Create an instance of the RNG most appropriate for `x`.
 The current defaults are:

From f49ec343b9ee7910015d1a6b9e97a08318017c6c Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Tue, 16 Aug 2022 20:33:23 +0530
Subject: [PATCH 9/9] Revert `tversky_loss` changes

---
 src/losses/functions.jl | 16 +---------------
 test/losses.jl          |  4 ++--
 2 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index e40f7c1cff..1bb14b2e74 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -516,26 +516,12 @@ Calculated as:
 
     1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + (1 - β)*(1 .- y) .* ŷ + β*y .* (1 .- ŷ)) + 1)
 
-# Example
-```jldoctest
-julia> y = [0, 1, 0, 1, 1, 1];
-
-julia> ŷ_fp = [1, 1, 1, 1, 1, 1];  # 2 false positive -> 2 wrong predictions
-
-julia> ŷ_fnp = [1, 1, 0, 1, 1, 0];  # 1 false negative, 1 false positive -> 2 wrong predictions
-
-julia> Flux.tversky_loss(ŷ_fnp, y)
-0.19999999999999996
-
-julia> Flux.tversky_loss(ŷ_fp, y) < Flux.tversky_loss(ŷ_fnp, y)  # FN is given more weight
-true
-```
 """
 function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7))
     _check_sizes(ŷ, y)
     #TODO add agg
     num = sum(y .* ŷ) + 1
-    den = sum(y .* ŷ + (1 - β) * (1 .- y) .* ŷ + β * y .* (1 .- ŷ)) + 1
+    den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
     1 - num / den
 end
 
diff --git a/test/losses.jl b/test/losses.jl
index c1061bb388..2ca697a657 100644
--- a/test/losses.jl
+++ b/test/losses.jl
@@ -163,8 +163,8 @@ y = [1.0 0.5 0.3 2.4]
 end
 
 @testset "tversky_loss" begin
-  @test Flux.tversky_loss(ŷ, y) ≈ 0.028747433264887046
-  @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ 0.050200803212851364
+  @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383
+  @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ -0.09490740740740744
   @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
 end