From 8b189d08dfcad5b1f91f2e6d3ecad53c71d2324d Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 00:24:32 +0530 Subject: [PATCH 01/15] Add doctests in `upsample.jl` --- src/layers/upsample.jl | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index 9deb413bb7..3649f0f01e 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -75,9 +75,41 @@ end """ PixelShuffle(r::Int) -Pixel shuffling layer with upscale factor `r`. +Pixel shuffling layer with upscale factor `r`. Usually used for generating higher +resolution images while upscaling them. See [`NNlib.pixel_shuffle`](@ref). + +# Examples +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +julia> p = PixelShuffle(2); + +julia> xs = rand(2, 2, 4, 1) # an image with 4 channels having 2X2 pixels in each channel +2×2×4×1 Array{Float64, 4}: +[:, :, 1, 1] = + 0.826452 0.0519244 + 0.0686387 0.438346 + +[:, :, 2, 1] = + 0.343179 0.445101 + 0.543927 0.740905 + +[:, :, 3, 1] = + 0.105997 0.422996 + 0.32957 0.167205 + +[:, :, 4, 1] = + 0.825737 0.98609 + 0.757365 0.294784 + +julia> p(xs) # an image with only 1 channel with 4X4 pixels in the single channel +4×4×1×1 Array{Float64, 4}: +[:, :, 1, 1] = + 0.826452 0.105997 0.0519244 0.422996 + 0.343179 0.825737 0.445101 0.98609 + 0.0686387 0.32957 0.438346 0.167205 + 0.543927 0.757365 0.740905 0.294784 +``` """ struct PixelShuffle r::Int From 2a0ed9bd902475ccae3250954dcdb814fdddeea2 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 00:26:37 +0530 Subject: [PATCH 02/15] Add doctests in `recurrent.jl` --- docs/src/models/layers.md | 1 + src/layers/recurrent.jl | 135 ++++++++++++++++++++++++++++++++------ 2 files changed, 117 insertions(+), 19 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 81fbb60a2d..34300ca840 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -42,6 +42,7 @@ Much like the core layers above, but can be used to process sequence data (as we RNN LSTM GRU +GRUv3 Flux.Recur Flux.reset! ``` diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 7c98f2394f..3ef902d3e5 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -63,28 +63,97 @@ in the background. 
`cell` should be a model of the form: For example, here's a recurrent network that keeps a running total of its inputs: -```julia -accum(h, x) = (h + x, x) -rnn = Flux.Recur(accum, 0) -rnn(2) # 2 -rnn(3) # 3 -rnn.state # 5 -rnn.(1:10) # apply to a sequence -rnn.state # 60 +# Examples +```jldoctest +julia> accum(h, x) = (h + x, x) +accum (generic function with 1 method) + +julia> rnn = Flux.Recur(accum, 0) +Recur(accum) + +julia> rnn(2) +2 + +julia> rnn(3) +3 + +julia> rnn.state +5 + +julia> rnn.(1:10) # apply to a sequence +10-element Vector{Int64}: + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +julia> rnn.state +60 ``` Folding over a 3d Array of dimensions `(features, batch, time)` is also supported: -```julia -accum(h, x) = (h .+ x, x) -rnn = Flux.Recur(accum, zeros(Int, 1, 1)) -rnn([2]) # 2 -rnn([3]) # 3 -rnn.state # 5 -rnn(reshape(1:10, 1, 1, :)) # apply to a sequence of (features, batch, time) -rnn.state # 60 -``` +```jldoctest +julia> accum(h, x) = (h .+ x, x) +accum (generic function with 1 method) + +julia> rnn = Flux.Recur(accum, zeros(Int, 1, 1)) +Recur(accum) + +julia> rnn([2]) +1-element Vector{Int64}: + 2 + +julia> rnn([3]) +1-element Vector{Int64}: + 3 + +julia> rnn.state +1×1 Matrix{Int64}: + 5 + +julia> rnn(reshape(1:10, 1, 1, :)) # apply to a sequence of (features, batch, time) +1×1×10 Array{Int64, 3}: +[:, :, 1] = + 1 + +[:, :, 2] = + 2 + +[:, :, 3] = + 3 + +[:, :, 4] = + 4 +[:, :, 5] = + 5 + +[:, :, 6] = + 6 + +[:, :, 7] = + 7 + +[:, :, 8] = + 8 + +[:, :, 9] = + 9 + +[:, :, 10] = + 10 + +julia> rnn.state +1×1 Matrix{Int64}: + 60 +``` """ mutable struct Recur{T,S} cell::T @@ -107,8 +176,36 @@ Base.show(io::IO, m::Recur) = print(io, "Recur(", m.cell, ")") Reset the hidden state of a recurrent layer back to its original value. Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: -```julia -rnn.state = hidden(rnn.cell) + + rnn.state = hidden(rnn.cell) + +# Examples +```jldoctest +julia> r = RNN(3 => 5); + +julia> r.state +5×1 Matrix{Float32}: + 0.0 + 0.0 + 0.0 + 0.0 + 0.0 + +julia> r(rand(Float32, 3)); r.state +5×1 Matrix{Float32}: + -0.32719195 + -0.45280662 + -0.50386846 + -0.14782222 + 0.23584609 + +julia> Flux.reset!(r) +5×1 Matrix{Float32}: + 0.0 + 0.0 + 0.0 + 0.0 + 0.0 ``` """ reset!(m::Recur) = (m.state = m.cell.state0) From 4b9e2fba2dac131568375d110f1efc5ca949d118 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 13:58:04 +0530 Subject: [PATCH 03/15] Add doctests in `normalise.jl` --- src/layers/normalise.jl | 138 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 7 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 4c696d916d..da85bc7d61 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -55,7 +55,7 @@ ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) """ Dropout(p; dims=:, rng = rng_from_array()) -Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input. +Dropout layer. In the forward pass, applies the [`Flux.dropout`](@ref) function on the input. To apply dropout along certain dimension(s), specify the `dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input @@ -65,6 +65,35 @@ Specify `rng` to use a custom RNG instead of the default. Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. 
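As a quick sketch of the `dims` behaviour described above (illustrative only, not part of this patch's hunks; the input size and probability are made up), `dims = 3` keeps or zeroes whole channels of a WHCN input:

```julia
using Flux

m = Dropout(0.4; dims = 3)      # the random mask varies only along the channel dimension
Flux.trainmode!(m)              # enable dropout outside of a training loop

x = ones(Float32, 2, 2, 3, 1)   # WHCN input with 3 channels
y = m(x)

# Each channel comes out either entirely zeroed or entirely kept (and rescaled by 1/(1 - p)),
# so no channel is ever partially dropped.
all(c -> all(iszero, y[:, :, c, 1]) || all(!iszero, y[:, :, c, 1]), 1:3)  # true
```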
+ +# Examples +```jldoctest +julia> m = Chain(Dense(2 => 2), Dropout(1)) +Chain( + Dense(2 => 2), # 6 parameters + Dropout(1), +) + +julia> Flux.trainmode!(m); # activating the layer without actually training it + +julia> m([1, 2]) # drops neurons with a probability of 1 +2-element Vector{Float32}: + -0.0 + -0.0 + +julia> m = Chain(Dense(2 => 2), Dropout(0.5)) +Chain( + Dense(2 => 2), # 6 parameters + Dropout(0.5), +) + +julia> Flux.trainmode!(m); # activating the layer without actually training it + +julia> m([1, 2]) # drops neurons with a probability of 0.5 +2-element Vector{Float32}: + -4.537827 + -0.0 +``` """ mutable struct Dropout{F,D,R<:AbstractRNG} p::F @@ -105,6 +134,33 @@ The AlphaDropout layer ensures that mean and variance of activations remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. + +# Examples +```jldoctest +julia> x = randn(20,1); + +julia> m = Chain(Dense(20 => 10, selu), AlphaDropout(0.5)) +Chain( + Dense(20 => 10, selu), # 210 parameters + AlphaDropout{Float64, Random.TaskLocalRNG}(0.5, nothing, Random.TaskLocalRNG()), +) + +julia> Flux.trainmode!(m); + +julia> y = m(x); + +julia> Flux.std(x) +1.097500619939126 + +julia> Flux.std(y) # maintains the standard deviation of the input +1.1504012188827453 + +julia> Flux.mean(x) # maintains the mean of the input +-0.3217018554158738 + +julia> Flux.mean(y) +-0.2526866470385106 +``` """ mutable struct AlphaDropout{F,R<:AbstractRNG} p::F @@ -154,6 +210,27 @@ If `affine=true`, it also applies a learnable shift and rescaling using the [`Scale`](@ref) layer. See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref). + +# Examples +```jldoctest +julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images + +julia> m = LayerNorm(3); + +julia> y = m(xs); + +julia> Flux.std(xs[:, :, :, 1]) +0.28713812337208383 + +julia> Flux.std(y[:, :, :, 1]) # normalises each image (or all channels in an image) +1.018993632693022 + +julia> Flux.std(xs[:, :, :, 2]) +0.22540260537916373 + +julia> Flux.std(y[:, :, :, 2]) # normalises each image (or all channels in an image) +1.018965249873791 +``` """ struct LayerNorm{F,D,T,N} λ::F @@ -256,12 +333,17 @@ Use [`testmode!`](@ref) during inference. # Examples ```julia -m = Chain( - Dense(28^2 => 64), - BatchNorm(64, relu), - Dense(64 => 10), - BatchNorm(10), - softmax) +julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images + +julia> Flux.std(xs) +2.6822461565718467 + +julia> m = BatchNorm(3); + +julia> Flux.trainmode!(m); # activating the layer without actually training it + +julia> Flux.std(m(xs)) # normalises the complete batch +1.0093209961092855 ``` """ mutable struct BatchNorm{F,V,N,W} @@ -339,6 +421,27 @@ that will be used to renormalize the input in test phase. **Warning**: the defaults for `affine` and `track_stats` used to be `true` in previous Flux versions (< v0.12). 
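A small sketch of how the keywords in that warning look in practice (not taken from this patch, and the parameter counts assume the current defaults `affine = false`, `track_stats = false`):

```julia
using Flux

# With the current defaults the layer has no trainable parameters.
length(Flux.params(InstanceNorm(3)))                      # 0

# Passing the keywords explicitly restores the pre-v0.12 behaviour:
# a learnable shift β and scale γ, plus running statistics for test time.
m = InstanceNorm(3; affine = true, track_stats = true)
length(Flux.params(m))                                    # 2 (β and γ)
```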
+ +# Examples +```jldoctest +julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images + +julia> m = InstanceNorm(3); + +julia> y = m(xs); + +julia> Flux.std(xs[:, :, 1, 1]) # original standard deviation of the first channel of image 1 +0.2989802650787384 + +julia> Flux.std(y[:, :, 1, 1]) # each channel of the batch is normalised +1.0606027381538408 + +julia> Flux.std(xs[:, :, 2, 2]) # original standard deviation of the second channel of image 2 +0.28662705400461197 + +julia> Flux.std(y[:, :, 2, 2]) # each channel of the batch is normalised +1.06058729821187 +``` """ mutable struct InstanceNorm{F,V,N,W} λ::F # activation function @@ -416,6 +519,27 @@ through to learnable per-channel bias `β` and scale `γ` parameters. If `track_stats=true`, accumulates mean and var statistics in training phase that will be used to renormalize the input in test phase. + +# Examples +```jldoctest +julia> xs = rand(3, 3, 4, 2); # a batch of 2 3X3X4 images + +julia> m = GroupNorm(4, 2); + +julia> y = m(xs); + +julia> Flux.std(xs[:, :, 1:2, 1]) # original standard deviation of the first 2 channels of image 1 +0.307588490584917 + +julia> Flux.std(y[:, :, 1:2, 1]) # normalises channels in groups of 2 (as specified) +1.0289339365431291 + +julia> Flux.std(xs[:, :, 3:4, 2]) # original standard deviation of the last 2 channels of image 2 +0.3111566100804274 + +julia> Flux.std(y[:, :, 3:4, 2]) # normalises channels in groups of 2 (as specified) +1.0289352493058574 +``` """ mutable struct GroupNorm{F,V,N,W} G::Int # number of groups From 69e996ababf6e4a8951b768bdc0a2ac1b84e64b0 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 14:05:15 +0530 Subject: [PATCH 04/15] Typos --- src/layers/normalise.jl | 2 +- src/layers/upsample.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index da85bc7d61..fecd5a3732 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -87,7 +87,7 @@ Chain( Dropout(0.5), ) -julia> Flux.trainmode!(m); # activating the layer without actually training it +julia> Flux.trainmode!(m); julia> m([1, 2]) # drops neurons with a probability of 0.5 2-element Vector{Float32}: diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index 3649f0f01e..47bf84b49c 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -102,7 +102,7 @@ julia> xs = rand(2, 2, 4, 1) # an image with 4 channels having 2X2 pixels in ea 0.825737 0.98609 0.757365 0.294784 -julia> p(xs) # an image with only 1 channel with 4X4 pixels in the single channel +julia> p(xs) # upsampled image with only 1 channel 4×4×1×1 Array{Float64, 4}: [:, :, 1, 1] = 0.826452 0.105997 0.0519244 0.422996 From ef69936663b0d79f54f2f24ba5166b915a0850d6 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 15:23:03 +0530 Subject: [PATCH 05/15] Remove redundant randomness, add docfilters, and make them stricter for Dropout layer --- src/layers/normalise.jl | 16 ++++++---------- src/layers/recurrent.jl | 30 +++++++++++------------------- src/layers/upsample.jl | 31 +++++-------------------------- 3 files changed, 22 insertions(+), 55 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index fecd5a3732..e6caca7122 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -67,7 +67,7 @@ Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. # Examples -```jldoctest +```jldoctest; filter = r"[+-]?(?:(?:[0-9])(?:\\.\\d+)?)|(?:1)(?:\\.0+)?" 
julia> m = Chain(Dense(2 => 2), Dropout(1)) Chain( Dense(2 => 2), # 6 parameters @@ -136,14 +136,10 @@ remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. # Examples -```jldoctest +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> x = randn(20,1); -julia> m = Chain(Dense(20 => 10, selu), AlphaDropout(0.5)) -Chain( - Dense(20 => 10, selu), # 210 parameters - AlphaDropout{Float64, Random.TaskLocalRNG}(0.5, nothing, Random.TaskLocalRNG()), -) +julia> m = Chain(Dense(20 => 10, selu), AlphaDropout(0.5)); julia> Flux.trainmode!(m); @@ -212,7 +208,7 @@ using the [`Scale`](@ref) layer. See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref). # Examples -```jldoctest +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images julia> m = LayerNorm(3); @@ -423,7 +419,7 @@ that will be used to renormalize the input in test phase. in previous Flux versions (< v0.12). # Examples -```jldoctest +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images julia> m = InstanceNorm(3); @@ -521,7 +517,7 @@ If `track_stats=true`, accumulates mean and var statistics in training phase that will be used to renormalize the input in test phase. # Examples -```jldoctest +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> xs = rand(3, 3, 4, 2); # a batch of 2 3X3X4 images julia> m = GroupNorm(4, 2); diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 3ef902d3e5..64726aaa40 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -180,31 +180,23 @@ Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: rnn.state = hidden(rnn.cell) # Examples -```jldoctest -julia> r = RNN(3 => 5); +```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +julia> r = RNN(1 => 1); + +julia> a = Vector{Float32}([1]) +1-element Vector{Float32}: + 1.0 julia> r.state -5×1 Matrix{Float32}: - 0.0 - 0.0 - 0.0 - 0.0 +1×1 Matrix{Float32}: 0.0 -julia> r(rand(Float32, 3)); r.state -5×1 Matrix{Float32}: - -0.32719195 - -0.45280662 - -0.50386846 - -0.14782222 - 0.23584609 +julia> r(a); r.state +1×1 Matrix{Float32}: + 0.61431444 julia> Flux.reset!(r) -5×1 Matrix{Float32}: - 0.0 - 0.0 - 0.0 - 0.0 +1×1 Matrix{Float32}: 0.0 ``` """ diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index 47bf84b49c..662e056adc 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -81,34 +81,13 @@ resolution images while upscaling them. See [`NNlib.pixel_shuffle`](@ref). 
# Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +```jldoctest julia> p = PixelShuffle(2); -julia> xs = rand(2, 2, 4, 1) # an image with 4 channels having 2X2 pixels in each channel -2×2×4×1 Array{Float64, 4}: -[:, :, 1, 1] = - 0.826452 0.0519244 - 0.0686387 0.438346 - -[:, :, 2, 1] = - 0.343179 0.445101 - 0.543927 0.740905 - -[:, :, 3, 1] = - 0.105997 0.422996 - 0.32957 0.167205 - -[:, :, 4, 1] = - 0.825737 0.98609 - 0.757365 0.294784 - -julia> p(xs) # upsampled image with only 1 channel -4×4×1×1 Array{Float64, 4}: -[:, :, 1, 1] = - 0.826452 0.105997 0.0519244 0.422996 - 0.343179 0.825737 0.445101 0.98609 - 0.0686387 0.32957 0.438346 0.167205 - 0.543927 0.757365 0.740905 0.294784 +julia> xs = rand(2, 2, 4, 1); # an image with 4 channels having 2X2 pixels in each channel + +julia> p(xs) |> size # upsampled image with only 1 channel +(4, 4, 1, 1) ``` """ struct PixelShuffle From e7ad7f8478d3de7f86c5e50fdcc03f7f1634905b Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 16:08:15 +0530 Subject: [PATCH 06/15] Remove doctest from `Dropout` layer --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index e6caca7122..9b31813f89 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -67,7 +67,7 @@ Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. # Examples -```jldoctest; filter = r"[+-]?(?:(?:[0-9])(?:\\.\\d+)?)|(?:1)(?:\\.0+)?" +```julia julia> m = Chain(Dense(2 => 2), Dropout(1)) Chain( Dense(2 => 2), # 6 parameters From 142918e6a3326091651fae4d2dd42ff9e8ec9c73 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sat, 11 Jun 2022 23:33:59 +0530 Subject: [PATCH 07/15] Update src/layers/recurrent.jl Co-authored-by: Brian Chen --- src/layers/recurrent.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 64726aaa40..c1d8ec057c 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -183,7 +183,7 @@ Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: ```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" julia> r = RNN(1 => 1); -julia> a = Vector{Float32}([1]) +julia> a = ones(Float32, 1) 1-element Vector{Float32}: 1.0 From a24d7592dcf1553040fdf1aec6d4d15946834acf Mon Sep 17 00:00:00 2001 From: Saransh Date: Sun, 12 Jun 2022 00:53:17 +0530 Subject: [PATCH 08/15] Update docstrings of `Recur` and `PixelShuffle` --- src/layers/recurrent.jl | 9 +++++-- src/layers/upsample.jl | 55 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index c1d8ec057c..5059449f38 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -80,7 +80,7 @@ julia> rnn(3) julia> rnn.state 5 -julia> rnn.(1:10) # apply to a sequence +julia> rnn(1:10) # apply to a sequence 10-element Vector{Int64}: 1 2 @@ -118,7 +118,12 @@ julia> rnn.state 1×1 Matrix{Int64}: 5 -julia> rnn(reshape(1:10, 1, 1, :)) # apply to a sequence of (features, batch, time) +julia> vec = rnn(reshape(1:10, 1, 1, :)); # apply to a sequence of (features, batch, time) + +julia> size(vec) +(1, 1, 10) + +julia> vec 1×1×10 Array{Int64, 3}: [:, :, 1] = 1 diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index 662e056adc..c71a9acc8d 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -84,10 +84,57 @@ See [`NNlib.pixel_shuffle`](@ref). 
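For reference, a rough sketch of the size rule the example below relies on (illustrative only, not part of this hunk): `PixelShuffle(r)` trades `r^2` channels for an `r`-fold larger spatial grid and simply forwards to `NNlib.pixel_shuffle`.

```julia
using Flux

r = 2
x = rand(Float32, 3, 5, 4, 1)            # (W, H, C*r^2, N) with C = 1

p = PixelShuffle(r)
size(p(x))                               # (6, 10, 1, 1), i.e. (W*r, H*r, C, N)

# The layer is a thin wrapper, so this should agree with the underlying function.
p(x) == Flux.NNlib.pixel_shuffle(x, r)   # true
```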
```jldoctest julia> p = PixelShuffle(2); -julia> xs = rand(2, 2, 4, 1); # an image with 4 channels having 2X2 pixels in each channel - -julia> p(xs) |> size # upsampled image with only 1 channel -(4, 4, 1, 1) +julia> xs = [2row + col + channel/10 for row in 1:2, col in 1:2, channel in 1:4, n in 1:1] +2×2×4×1 Array{Float64, 4}: +[:, :, 1, 1] = + 3.1 4.1 + 5.1 6.1 + +[:, :, 2, 1] = + 3.2 4.2 + 5.2 6.2 + +[:, :, 3, 1] = + 3.3 4.3 + 5.3 6.3 + +[:, :, 4, 1] = + 3.4 4.4 + 5.4 6.4 + +julia> p(xs) +4×4×1×1 Array{Float64, 4}: +[:, :, 1, 1] = + 3.1 3.3 4.1 4.3 + 3.2 3.4 4.2 4.4 + 5.1 5.3 6.1 6.3 + 5.2 5.4 6.2 6.4 + +julia> xs = [3row + col + channel/10 for row in 1:2, col in 1:3, channel in 1:4, n in 1:1] +2×3×4×1 Array{Float64, 4}: +[:, :, 1, 1] = + 4.1 5.1 6.1 + 7.1 8.1 9.1 + +[:, :, 2, 1] = + 4.2 5.2 6.2 + 7.2 8.2 9.2 + +[:, :, 3, 1] = + 4.3 5.3 6.3 + 7.3 8.3 9.3 + +[:, :, 4, 1] = + 4.4 5.4 6.4 + 7.4 8.4 9.4 + +julia> p(xs) +4×6×1×1 Array{Float64, 4}: +[:, :, 1, 1] = + 4.1 4.3 5.1 5.3 6.1 6.3 + 4.2 4.4 5.2 5.4 6.2 6.4 + 7.1 7.3 8.1 8.3 9.1 9.3 + 7.2 7.4 8.2 8.4 9.2 9.4 ``` """ struct PixelShuffle From 10a2b52a73d7369e05b15d2e40d3c62714c90418 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sun, 12 Jun 2022 02:04:08 +0530 Subject: [PATCH 09/15] Clean the doctests of `normalise.jl` --- docs/src/models/layers.md | 1 - src/layers/normalise.jl | 114 ++++++++++++++------------------------ src/layers/recurrent.jl | 16 ------ 3 files changed, 43 insertions(+), 88 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 34300ca840..ad5a99b737 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -68,7 +68,6 @@ These layers don't affect the structure of the network but may improve training ```@docs Flux.normalise BatchNorm -Flux.dropout Dropout AlphaDropout LayerNorm diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 9b31813f89..f543db0c09 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -67,32 +67,24 @@ Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. # Examples -```julia -julia> m = Chain(Dense(2 => 2), Dropout(1)) -Chain( - Dense(2 => 2), # 6 parameters - Dropout(1), -) +```jldoctest +julia> m = Chain(Dense(1 => 1), Dropout(1)); + +julia> Flux.trainmode!(m); -julia> Flux.trainmode!(m); # activating the layer without actually training it +julia> y = m([1]); -julia> m([1, 2]) # drops neurons with a probability of 1 -2-element Vector{Float32}: - -0.0 - -0.0 +julia> count(i->(i == 0), y) == m[2].p # number of zeros == 1 +true -julia> m = Chain(Dense(2 => 2), Dropout(0.5)) -Chain( - Dense(2 => 2), # 6 parameters - Dropout(0.5), -) +julia> m = Chain(Dense(1 => 1), Dropout(0.5)); julia> Flux.trainmode!(m); -julia> m([1, 2]) # drops neurons with a probability of 0.5 -2-element Vector{Float32}: - -4.537827 - -0.0 +julia> y = m([1]); + +julia> m[2].p - 0.5 <= count(i->(i == 0), y) <= m[2].p + 0.5 # number of zeros can be 0 or 1 +true ``` """ mutable struct Dropout{F,D,R<:AbstractRNG} @@ -136,7 +128,9 @@ remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. 
# Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +```jldoctest +julia> using Statistics + julia> x = randn(20,1); julia> m = Chain(Dense(20 => 10, selu), AlphaDropout(0.5)); @@ -145,17 +139,8 @@ julia> Flux.trainmode!(m); julia> y = m(x); -julia> Flux.std(x) -1.097500619939126 - -julia> Flux.std(y) # maintains the standard deviation of the input -1.1504012188827453 - -julia> Flux.mean(x) # maintains the mean of the input --0.3217018554158738 - -julia> Flux.mean(y) --0.2526866470385106 +julia> isapprox(std(x), std(y), rtol=0.6) +true ``` """ mutable struct AlphaDropout{F,R<:AbstractRNG} @@ -208,24 +193,20 @@ using the [`Scale`](@ref) layer. See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref). # Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +```jldoctest +julia> using Statistics + julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images julia> m = LayerNorm(3); julia> y = m(xs); -julia> Flux.std(xs[:, :, :, 1]) -0.28713812337208383 - -julia> Flux.std(y[:, :, :, 1]) # normalises each image (or all channels in an image) -1.018993632693022 +julia> isapprox(std(y[:, :, :, 1]), 1, atol=0.1) && std(xs[:, :, :, 1]) != std(y[:, :, :, 1]) +true -julia> Flux.std(xs[:, :, :, 2]) -0.22540260537916373 - -julia> Flux.std(y[:, :, :, 2]) # normalises each image (or all channels in an image) -1.018965249873791 +julia> isapprox(std(y[:, :, :, 2]), 1, atol=0.1) && std(xs[:, :, :, 2]) != std(y[:, :, :, 2]) +true ``` """ struct LayerNorm{F,D,T,N} @@ -329,17 +310,16 @@ Use [`testmode!`](@ref) during inference. # Examples ```julia -julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images +julia> using Statistics -julia> Flux.std(xs) -2.6822461565718467 +julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images julia> m = BatchNorm(3); -julia> Flux.trainmode!(m); # activating the layer without actually training it +julia> Flux.trainmode!(m); -julia> Flux.std(m(xs)) # normalises the complete batch -1.0093209961092855 +julia> isapprox(std(m(xs)), 1, atol=0.1) && std(xs) != std(m(xs)) +true ``` """ mutable struct BatchNorm{F,V,N,W} @@ -419,24 +399,20 @@ that will be used to renormalize the input in test phase. in previous Flux versions (< v0.12). # Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +```jldoctest +julia> using Statistics + julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images julia> m = InstanceNorm(3); julia> y = m(xs); -julia> Flux.std(xs[:, :, 1, 1]) # original standard deviation of the first channel of image 1 -0.2989802650787384 +julia> isapprox(std(y[:, :, 1, 1]), 1, atol=0.1) && std(xs[:, :, 1, 1]) != std(y[:, :, 1, 1]) +true -julia> Flux.std(y[:, :, 1, 1]) # each channel of the batch is normalised -1.0606027381538408 - -julia> Flux.std(xs[:, :, 2, 2]) # original standard deviation of the second channel of image 2 -0.28662705400461197 - -julia> Flux.std(y[:, :, 2, 2]) # each channel of the batch is normalised -1.06058729821187 +julia> isapprox(std(y[:, :, 2, 2]), 1, atol=0.1) && std(xs[:, :, 2, 2]) != std(y[:, :, 2, 2]) +true ``` """ mutable struct InstanceNorm{F,V,N,W} @@ -517,24 +493,20 @@ If `track_stats=true`, accumulates mean and var statistics in training phase that will be used to renormalize the input in test phase. 
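As an aside (a sketch, not from this patch), the two extremes of `G` are a handy way to see what the grouping does: one group per channel normalises each channel on its own, much like `InstanceNorm`, while a single group normalises all channels of a sample together.

```julia
using Flux, Statistics

xs = rand(Float32, 6, 6, 4, 2)

# G equal to the number of channels: statistics are per channel and per sample.
std(GroupNorm(4, 4)(xs)[:, :, 1, 1])   # roughly 1 for every single channel

# G equal to 1: one mean/variance over all channels of each sample.
std(GroupNorm(4, 1)(xs)[:, :, :, 1])   # roughly 1 over the whole sample
```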
# Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" +```jldoctest +julia> using Statistics + julia> xs = rand(3, 3, 4, 2); # a batch of 2 3X3X4 images julia> m = GroupNorm(4, 2); julia> y = m(xs); -julia> Flux.std(xs[:, :, 1:2, 1]) # original standard deviation of the first 2 channels of image 1 -0.307588490584917 - -julia> Flux.std(y[:, :, 1:2, 1]) # normalises channels in groups of 2 (as specified) -1.0289339365431291 - -julia> Flux.std(xs[:, :, 3:4, 2]) # original standard deviation of the last 2 channels of image 2 -0.3111566100804274 +julia> isapprox(std(y[:, :, 1:2, 1]), 1, atol=0.1) && std(xs[:, :, 1:2, 1]) != std(y[:, :, 1:2, 1]) +true -julia> Flux.std(y[:, :, 3:4, 2]) # normalises channels in groups of 2 (as specified) -1.0289352493058574 +julia> isapprox(std(y[:, :, 3:4, 2]), 1, atol=0.1) && std(xs[:, :, 3:4, 2]) != std(y[:, :, 3:4, 2]) +true ``` """ mutable struct GroupNorm{F,V,N,W} diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 5059449f38..929f5b2d71 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -79,22 +79,6 @@ julia> rnn(3) julia> rnn.state 5 - -julia> rnn(1:10) # apply to a sequence -10-element Vector{Int64}: - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - -julia> rnn.state -60 ``` Folding over a 3d Array of dimensions `(features, batch, time)` is also supported: From 430f7a09575e593935e76feace27a74a23b58817 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sun, 12 Jun 2022 13:01:56 +0530 Subject: [PATCH 10/15] Update src/layers/normalise.jl Co-authored-by: Brian Chen --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index f543db0c09..ee5a3d78f3 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -74,7 +74,7 @@ julia> Flux.trainmode!(m); julia> y = m([1]); -julia> count(i->(i == 0), y) == m[2].p # number of zeros == 1 +julia> y == [0] true julia> m = Chain(Dense(1 => 1), Dropout(0.5)); From 4beb2a2135b507d12c652d40829f293f6fa4af7a Mon Sep 17 00:00:00 2001 From: Saransh Date: Sun, 12 Jun 2022 13:45:15 +0530 Subject: [PATCH 11/15] Clean the doctests further --- src/layers/normalise.jl | 8 ++++---- src/layers/recurrent.jl | 45 ++++++++++++----------------------------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index ee5a3d78f3..7ac31ade8b 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -77,13 +77,13 @@ julia> y = m([1]); julia> y == [0] true -julia> m = Chain(Dense(1 => 1), Dropout(0.5)); +julia> m = Chain(Dense(1000 => 1000), Dropout(0.5)); julia> Flux.trainmode!(m); -julia> y = m([1]); +julia> y = m(ones(1000)); -julia> m[2].p - 0.5 <= count(i->(i == 0), y) <= m[2].p + 0.5 # number of zeros can be 0 or 1 +julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) true ``` """ @@ -139,7 +139,7 @@ julia> Flux.trainmode!(m); julia> y = m(x); -julia> isapprox(std(x), std(y), rtol=0.6) +julia> isapprox(std(x), std(y), atol=0.6) true ``` """ diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 929f5b2d71..baf6190f32 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -102,41 +102,22 @@ julia> rnn.state 1×1 Matrix{Int64}: 5 -julia> vec = rnn(reshape(1:10, 1, 1, :)); # apply to a sequence of (features, batch, time) +julia> out = rnn(reshape(1:10, 1, 1, :)); # apply to a sequence of (features, batch, time) -julia> size(vec) +julia> out |> size (1, 1, 10) -julia> vec -1×1×10 Array{Int64, 3}: -[:, :, 1] = - 
1 - -[:, :, 2] = - 2 - -[:, :, 3] = - 3 - -[:, :, 4] = - 4 - -[:, :, 5] = - 5 - -[:, :, 6] = - 6 - -[:, :, 7] = - 7 - -[:, :, 8] = - 8 - -[:, :, 9] = - 9 - -[:, :, 10] = +julia> vec(out) +10-element Vector{Int64}: + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 10 julia> rnn.state From 004b369059cadc11727fd442242230af8b73a44a Mon Sep 17 00:00:00 2001 From: Saransh Date: Wed, 15 Jun 2022 01:31:40 +0530 Subject: [PATCH 12/15] Fix the shape of images in comments --- src/layers/normalise.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ac31ade8b..c3594b6950 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -196,7 +196,7 @@ See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [ ```jldoctest julia> using Statistics -julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images +julia> xs = rand(3, 3, 3, 2); # a batch of 2 images, each having 3 channels julia> m = LayerNorm(3); @@ -312,7 +312,7 @@ Use [`testmode!`](@ref) during inference. ```julia julia> using Statistics -julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images +julia> xs = rand(3, 3, 3, 2); # a batch of 2 images, each having 3 channels julia> m = BatchNorm(3); @@ -402,7 +402,7 @@ in previous Flux versions (< v0.12). ```jldoctest julia> using Statistics -julia> xs = rand(3, 3, 3, 2); # a batch of 2 3X3X3 images +julia> xs = rand(3, 3, 3, 2); # a batch of 2 images, each having 3 channels julia> m = InstanceNorm(3); @@ -496,7 +496,7 @@ that will be used to renormalize the input in test phase. ```jldoctest julia> using Statistics -julia> xs = rand(3, 3, 4, 2); # a batch of 2 3X3X4 images +julia> xs = rand(3, 3, 4, 2); # a batch of 2 images, each having 4 channels julia> m = GroupNorm(4, 2); From ce0e64cf985db4a62329842f74bf0c822d5be2ac Mon Sep 17 00:00:00 2001 From: Saransh Date: Wed, 15 Jun 2022 19:57:58 +0530 Subject: [PATCH 13/15] Use more data points for the failing doctest --- src/layers/normalise.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index c3594b6950..762521703b 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -131,15 +131,15 @@ Does nothing to the input once [`testmode!`](@ref) is true. 
```jldoctest julia> using Statistics -julia> x = randn(20,1); +julia> x = randn(1000,1); -julia> m = Chain(Dense(20 => 10, selu), AlphaDropout(0.5)); +julia> m = Chain(Dense(1000 => 1000, selu), AlphaDropout(0.2)); julia> Flux.trainmode!(m); julia> y = m(x); -julia> isapprox(std(x), std(y), atol=0.6) +julia> isapprox(std(x), std(y), atol=0.2) true ``` """ From 565cf2452d107fd1b968b6d083a682767a26a8a7 Mon Sep 17 00:00:00 2001 From: Saransh Date: Fri, 24 Jun 2022 17:37:24 +0530 Subject: [PATCH 14/15] Use the dims kwarg --- src/layers/normalise.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 762521703b..ef145ad102 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -202,10 +202,7 @@ julia> m = LayerNorm(3); julia> y = m(xs); -julia> isapprox(std(y[:, :, :, 1]), 1, atol=0.1) && std(xs[:, :, :, 1]) != std(y[:, :, :, 1]) -true - -julia> isapprox(std(y[:, :, :, 2]), 1, atol=0.1) && std(xs[:, :, :, 2]) != std(y[:, :, :, 2]) +julia> isapprox(std(y, dims=1:3), ones(1, 1, 1, 2), atol=0.1) && std(y, dims=1:3) != std(xs, dims=1:3) true ``` """ @@ -408,10 +405,7 @@ julia> m = InstanceNorm(3); julia> y = m(xs); -julia> isapprox(std(y[:, :, 1, 1]), 1, atol=0.1) && std(xs[:, :, 1, 1]) != std(y[:, :, 1, 1]) -true - -julia> isapprox(std(y[:, :, 2, 2]), 1, atol=0.1) && std(xs[:, :, 2, 2]) != std(y[:, :, 2, 2]) +julia> isapprox(std(y, dims=1:2), ones(1, 1, 3, 2), atol=0.2) && std(y, dims=1:2) != std(xs, dims=1:2) true ``` """ From 2ab42cdc1378b9b73bc5fd219451b80534740e46 Mon Sep 17 00:00:00 2001 From: Saransh Date: Sun, 26 Jun 2022 19:22:40 +0530 Subject: [PATCH 15/15] Update the doctests of `Flux.reset!` --- src/layers/recurrent.jl | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index baf6190f32..760933bb96 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -150,23 +150,29 @@ Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: rnn.state = hidden(rnn.cell) # Examples -```jldoctest; filter = r"[+-]?([0-9]*[.])?[0-9]+" -julia> r = RNN(1 => 1); +```jldoctest +julia> r = Flux.RNNCell(relu, ones(1,1), zeros(1,1), ones(1,1), zeros(1,1)); # users should use the RNN wrapper struct instead + +julia> y = Flux.Recur(r, ones(1,1)); -julia> a = ones(Float32, 1) -1-element Vector{Float32}: +julia> y.state +1×1 Matrix{Float64}: 1.0 -julia> r.state -1×1 Matrix{Float32}: - 0.0 +julia> y(ones(1,1)) # relu(1*1 + 1) +1×1 Matrix{Float64}: + 2.0 -julia> r(a); r.state -1×1 Matrix{Float32}: - 0.61431444 +julia> y.state +1×1 Matrix{Float64}: + 2.0 + +julia> Flux.reset!(y) +1×1 Matrix{Float64}: + 0.0 -julia> Flux.reset!(r) -1×1 Matrix{Float32}: +julia> y.state +1×1 Matrix{Float64}: 0.0 ``` """
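To try the examples added across these patches locally, the docstring doctests can be run with Documenter. A rough sketch follows; the `DocTestSetup` call mirrors the usual Documenter convention and is an assumption here, not something taken from the patches:

```julia
using Documenter, Flux

# Make `using Flux` implicit in every jldoctest block before running them.
DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true)

# Check only the docstring doctests, without building the manual pages.
doctest(Flux; manual = false)
```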