From 39c08fcfae44c88484c3a92fd4b449ec4bdafda3 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Thu, 6 Oct 2022 00:24:41 +0530 Subject: [PATCH] Run JuliaFormatter again --- docs/make.jl | 113 +++++++------ perf/bench_utils.jl | 6 +- perf/recurrent.jl | 2 +- perf/vgg.jl | 76 ++++----- src/Flux.jl | 78 +++++++-- src/cuda/cudnn.jl | 38 ++++- src/deprecations.jl | 66 +++++--- src/functor.jl | 22 ++- src/layers/basic.jl | 158 +++++++++++------- src/layers/conv.jl | 261 ++++++++++++++++++++---------- src/layers/normalise.jl | 134 ++++++++------- src/layers/recurrent.jl | 172 ++++++++++++-------- src/layers/show.jl | 74 +++++++-- src/layers/upsample.jl | 12 +- src/loading.jl | 24 +-- src/losses/Losses.jl | 29 ++-- src/losses/functions.jl | 6 +- src/losses/utils.jl | 14 +- src/optimise/Optimise.jl | 29 +++- src/optimise/optimisers.jl | 123 ++++++++------ src/optimise/train.jl | 32 ++-- src/outputsize.jl | 38 +++-- src/utils.jl | 33 ++-- test/ctc-gpu.jl | 23 +-- test/ctc.jl | 23 +-- test/cuda/cuda.jl | 4 +- test/cuda/curnn.jl | 84 +++++----- test/cuda/layers.jl | 186 +++++++++++++-------- test/cuda/losses.jl | 24 +-- test/cuda/test_utils.jl | 17 +- test/data.jl | 18 ++- test/layers/basic.jl | 62 ++++--- test/layers/conv.jl | 65 ++++---- test/layers/normalisation.jl | 305 +++++++++++++++++++---------------- test/layers/recurrent.jl | 206 +++++++++++++---------- test/layers/upsample.jl | 26 +-- test/losses.jl | 179 ++++++++++++-------- test/optimise.jl | 108 ++++++++----- test/outputsize.jl | 35 +++- test/runtests.jl | 26 +-- test/utils.jl | 222 ++++++++++++++----------- 41 files changed, 1946 insertions(+), 1207 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 2e73f638a9..ecfaaa256c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,62 +1,61 @@ -using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, - ChainRulesCore +using Documenter, + Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) makedocs(; - modules = [ - Flux, - NNlib, - Functors, - MLUtils, - BSON, - Optimisers, - OneHotArrays, - Zygote, - ChainRulesCore, - Base, - ], - doctest = false, - sitename = "Flux", - # strict = [:cross_references,], - pages = [ - "Home" => "index.md", - "Building Models" => [ - "Overview" => "models/overview.md", - "Basics" => "models/basics.md", - "Recurrence" => "models/recurrence.md", - "Layer Reference" => "models/layers.md", - "Loss Functions" => "models/losses.md", - "Regularisation" => "models/regularisation.md", - "Custom Layers" => "models/advanced.md", - "NNlib.jl" => "models/nnlib.md", - "Activation Functions" => "models/activation.md", - ], - "Handling Data" => [ - "MLUtils.jl" => "data/mlutils.md", - "OneHotArrays.jl" => "data/onehot.md", - ], - "Training Models" => [ - "Optimisers" => "training/optimisers.md", - "Training" => "training/training.md", - "Callback Helpers" => "training/callbacks.md", - "Zygote.jl" => "training/zygote.md", - ], - "GPU Support" => "gpu.md", - "Model Tools" => [ - "Saving & Loading" => "saving.md", - "Shape Inference" => "outputsize.md", - "Weight Initialisation" => "utilities.md", - "Functors.jl" => "models/functors.md", - ], - "Performance Tips" => "performance.md", - "Flux's Ecosystem" => "ecosystem.md", - ], - format = Documenter.HTML(; sidebar_sitename = false, - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true")) + modules = [ + Flux, + NNlib, + 
Functors, + MLUtils, + BSON, + Optimisers, + OneHotArrays, + Zygote, + ChainRulesCore, + Base, + ], + doctest = false, + sitename = "Flux", + # strict = [:cross_references,], + pages = [ + "Home" => "index.md", + "Building Models" => [ + "Overview" => "models/overview.md", + "Basics" => "models/basics.md", + "Recurrence" => "models/recurrence.md", + "Layer Reference" => "models/layers.md", + "Loss Functions" => "models/losses.md", + "Regularisation" => "models/regularisation.md", + "Custom Layers" => "models/advanced.md", + "NNlib.jl" => "models/nnlib.md", + "Activation Functions" => "models/activation.md", + ], + "Handling Data" => + ["MLUtils.jl" => "data/mlutils.md", "OneHotArrays.jl" => "data/onehot.md"], + "Training Models" => [ + "Optimisers" => "training/optimisers.md", + "Training" => "training/training.md", + "Callback Helpers" => "training/callbacks.md", + "Zygote.jl" => "training/zygote.md", + ], + "GPU Support" => "gpu.md", + "Model Tools" => [ + "Saving & Loading" => "saving.md", + "Shape Inference" => "outputsize.md", + "Weight Initialisation" => "utilities.md", + "Functors.jl" => "models/functors.md", + ], + "Performance Tips" => "performance.md", + "Flux's Ecosystem" => "ecosystem.md", + ], + format = Documenter.HTML(; + sidebar_sitename = false, + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true", + ), +) -deploydocs(; repo = "github.com/FluxML/Flux.jl.git", - target = "build", - push_preview = true) +deploydocs(; repo = "github.com/FluxML/Flux.jl.git", target = "build", push_preview = true) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index d7897851a4..f719b01c99 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -24,19 +24,19 @@ function run_benchmark(model, x; cuda = true) fw(model, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fw($model, $x)) teardown = (GC.gc(); CUDA.reclaim()) println(" backward") bw(back) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(bw($back)) teardown = (GC.gc(); CUDA.reclaim()) println(" forw and back") fwbw(model, ps, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown = (GC.gc(); CUDA.reclaim()) else println(" forward") fw(model, x) #warmup diff --git a/perf/recurrent.jl b/perf/recurrent.jl index 9002e248d6..bf4a2474da 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -51,7 +51,7 @@ end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] rnn_benchmark_sweep(rnn_type) do n, ts - return [randn(Float32, n, n) for _ in 1:ts], "Vec" + return [randn(Float32, n, n) for _ = 1:ts], "Vec" end end diff --git a/perf/vgg.jl b/perf/vgg.jl index dad9d1aad1..d86fdd6fe1 100644 --- a/perf/vgg.jl +++ b/perf/vgg.jl @@ -6,43 +6,45 @@ using CUDA using Zygote: pullback function vgg16() - return Chain(Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - MaxPool((2, 2)), - Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - MaxPool((2, 2)), - Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 
3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - MaxPool((2, 2)), - Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - flatten, - Dense(512, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dropout(0.5), - Dense(4096, 10)) + return Chain( + Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + MaxPool((2, 2)), + Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + MaxPool((2, 2)), + Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + MaxPool((2, 2)), + Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + flatten, + Dense(512, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dropout(0.5), + Dense(4096, 10), + ) end let model = vgg16(), x = rand(Float32, 32, 32, 3, 64) diff --git a/src/Flux.jl b/src/Flux.jl index 11aa0ca917..d2e2783199 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -15,29 +15,75 @@ export gradient # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) function Optimisers.base(dx::Zygote.Grads) - return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") + return error( + "Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`", + ) end -export Chain, Dense, Embedding, Maxout, SkipConnection, Parallel, PairwiseFusion, - RNN, LSTM, GRU, GRUv3, - SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv, - AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool, - Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - Upsample, PixelShuffle, - fmap, cpu, gpu, f32, f64, - testmode!, trainmode! +export Chain, + Dense, + Embedding, + Maxout, + SkipConnection, + Parallel, + PairwiseFusion, + RNN, + LSTM, + GRU, + GRUv3, + SamePad, + Conv, + CrossCor, + ConvTranspose, + DepthwiseConv, + AdaptiveMaxPool, + AdaptiveMeanPool, + GlobalMaxPool, + GlobalMeanPool, + MaxPool, + MeanPool, + Dropout, + AlphaDropout, + LayerNorm, + BatchNorm, + InstanceNorm, + GroupNorm, + Upsample, + PixelShuffle, + fmap, + cpu, + gpu, + f32, + f64, + testmode!, + trainmode! 
include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs using .Optimise: skip -export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, ClipValue, ClipNorm +export Descent, + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + OAdam, + AdamW, + RAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + ClipValue, + ClipNorm using CUDA -const use_cuda = Ref{Union{Nothing, Bool}}(nothing) +const use_cuda = Ref{Union{Nothing,Bool}}(nothing) using Adapt, Functors, OneHotArrays include("utils.jl") @@ -45,7 +91,9 @@ include("functor.jl") # Pirate error to catch a common mistake. function Functors.functor(::Type{<:MLUtils.DataLoader}, x) - return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") + return error( + "`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.", + ) end include("layers/stateless.jl") diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index c20a7f873c..6ffa43e16a 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,17 +1,39 @@ import NNlibCUDA: batchnorm, ∇batchnorm -function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}}, - cache = nothing) where {T <: Union{Float32, Float64}} +function (BN::Flux.BatchNorm)( + x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, + cache = nothing, +) where {T<:Union{Float32,Float64}} @assert BN.affine "BatchNorm: only affine=true supported on gpu" @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, - training = Flux._isactive(BN))) + @assert length(BN.β) == size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" + return BN.λ.( + batchnorm( + BN.γ, + BN.β, + x, + BN.μ, + BN.σ², + BN.momentum; + cache = cache, + alpha = 1, + beta = 0, + eps = BN.ϵ, + training = Flux._isactive(BN), + ) + ) end -function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, - momentum; kw...) +function ChainRulesCore.rrule( + ::typeof(batchnorm), + g, + b, + x, + running_mean, + running_var, + momentum; + kw..., +) y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) function batchnorm_pullback(Δ) grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) diff --git a/src/deprecations.jl b/src/deprecations.jl index b6f56183f7..6d29cb6fd1 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,32 +1,49 @@ # v0.12 deprecations function ones(dims...) - Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", - :ones; force = true) + Base.depwarn( + "Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", + :ones; + force = true, + ) return Base.ones(Float32, dims...) end ones(T::Type, dims...) = Base.ones(T, dims...) function zeros(dims...) - Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", - :zeros; force = true) + Base.depwarn( + "Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) 
or Base.zeros(Float32, size...)", + :zeros; + force = true, + ) return Base.zeros(Float32, dims...) end zeros(T::Type, dims...) = Base.zeros(T, dims...) function ones32(::Type, dims...) - throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) + throw( + ArgumentError( + "Flux.ones32 is always Float32, use Base.ones to specify the element type", + ), + ) end function zeros32(::Type, dims...) - throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) + throw( + ArgumentError( + "Flux.zeros32 is always Float32, use Base.zeros to specify the element type", + ), + ) end # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 - Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. - Re-writing this as a comprehension would be better.""", :broadcasted) + Base.depwarn( + """Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. +Re-writing this as a comprehension would be better.""", + :broadcasted, + ) return map(f, args...) # map isn't really safe either, but end @@ -34,37 +51,46 @@ end struct Zeros function Zeros() - Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", - :Zeros) + Base.depwarn( + "Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", + :Zeros, + ) return false end end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", - :update!) + Base.depwarn( + "`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", + :update!, + ) return x .-= x̄ end function Diagonal(size::Integer...; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal) + Base.depwarn( + "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal, + ) return Scale(size...; kw...) end function Diagonal(size::Tuple; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal) + Base.depwarn( + "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal, + ) return Scale(size...; kw...) end # Deprecate this eventually once saving models w/o structure is no more function loadparams!(m, xs) - Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", - :loadparams!) + Base.depwarn( + "loadparams! will be deprecated eventually. Use loadmodel! instead.", + :loadparams!, + ) for (p, x) in zip(params(m), xs) - size(p) == size(x) || - error("Expected param size $(size(p)), got $(size(x))") + size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))") copyto!(p, x) end end diff --git a/src/functor.jl b/src/functor.jl index 993ea95693..4463aaced7 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -104,7 +104,9 @@ else end adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) - return error("Cannot map RNG of type $(typeof(x)) to GPU. 
GPU execution only supports Random.default_rng().") + return error( + "Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().", + ) end # TODO: figure out the correct design for OneElement @@ -116,8 +118,10 @@ struct FluxCPUAdaptor end adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x -function adapt_storage(to::FluxCPUAdaptor, - x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} +function adapt_storage( + to::FluxCPUAdaptor, + x::T, +) where {T<:CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} return adapt(Array, x) end adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x @@ -129,10 +133,13 @@ function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray) return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx))) end -function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, - x::CUDA.AbstractGPUArray) +function ChainRulesCore.rrule( + ::typeof(Adapt.adapt_storage), + to::FluxCPUAdaptor, + x::CUDA.AbstractGPUArray, +) return adapt_storage(to, x), - dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) + dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) end # CPU/GPU movement conveniences @@ -206,7 +213,8 @@ function check_use_cuda() end if !(use_cuda[]) @info """The GPU function is being called but the GPU is not accessible. - Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog = + 1 end end end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 97d0d957ee..647b237144 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -32,7 +32,7 @@ For large models, there is a special type-unstable path which can reduce compila times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. This feature is somewhat experimental, beware! """ -struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}} +struct Chain{T<:Union{Tuple,NamedTuple,AbstractVector}} layers::T end @@ -44,16 +44,22 @@ function Chain(; kw...) return Chain(values(kw)) end -@forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, - Base.iterate, Base.lastindex, Base.keys, Base.firstindex +@forward Chain.layers Base.getindex, +Base.length, +Base.first, +Base.last, +Base.iterate, +Base.lastindex, +Base.keys, +Base.firstindex @functor Chain (c::Chain)(x) = _applychain(c.layers, x) -@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N} - symbols = vcat(:x, [gensym() for _ in 1:N]) - calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N] +@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ = 1:N]) + calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i = 1:N] return Expr(:block, calls...) 
end @@ -156,18 +162,22 @@ julia> Flux.params(d1) # no trainable bias Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]]) ``` """ -struct Dense{F, M <: AbstractMatrix, B} +struct Dense{F,M<:AbstractMatrix,B} weight::M bias::B σ::F - function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F} + function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix,F} b = _create_bias(W, bias, size(W, 1)) - return new{F, M, typeof(b)}(W, b, σ) + return new{F,M,typeof(b)}(W, b, σ) end end -function Dense((in, out)::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, bias = true) +function Dense( + (in, out)::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + bias = true, +) return Dense(init(out, in), bias, σ) end @@ -229,15 +239,17 @@ julia> Flux.params(b) Params([[1 2 3 4]]) ``` """ -struct Scale{F, A <: AbstractArray, B} +struct Scale{F,A<:AbstractArray,B} scale::A bias::B σ::F - function Scale(scale::A, bias::B = true, - σ::F = identity) where {A <: AbstractArray, - B <: Union{Bool, AbstractArray}, F} + function Scale( + scale::A, + bias::B = true, + σ::F = identity, + ) where {A<:AbstractArray,B<:Union{Bool,AbstractArray},F} b = _create_bias(scale, bias, size(scale)...) - return new{F, A, typeof(b)}(scale, b, σ) + return new{F,A,typeof(b)}(scale, b, σ) end end @@ -245,7 +257,7 @@ function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = return Scale(init(s1, s23...), bias, _act) end function Scale(size_act...; bias = true, init = ones32) - return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end]) + return Scale(size_act[1:(end-1)]...; bias, init, _act = size_act[end]) end @functor Scale @@ -298,11 +310,11 @@ julia> Flux.outputsize(m3, (5, 11)) (7, 11) ``` """ -struct Maxout{T <: Tuple} +struct Maxout{T<:Tuple} layers::T end Maxout(layers...) = Maxout(layers) -Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) +Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ = 1:n_alts)...) @functor Maxout @@ -346,7 +358,7 @@ true See also [`Parallel`](@ref), [`Maxout`](@ref). """ -struct SkipConnection{T, F} +struct SkipConnection{T,F} layers::T connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b end @@ -409,24 +421,28 @@ julia> Flux.Bilinear(rand(4, 8, 16), false, tanh) # first dim of weight is the Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ -struct Bilinear{F, A, B} +struct Bilinear{F,A,B} weight::A bias::B σ::F - function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F} + function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray,F} ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) b = _create_bias(W, bias, size(W, 1)) - return new{F, A, typeof(b)}(W, b, σ) + return new{F,A,typeof(b)}(W, b, σ) end end @functor Bilinear -function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, σ = identity; - bias = true, init = glorot_uniform) +function Bilinear( + ((in1, in2), out)::Pair{<:Tuple,<:Integer}, + σ = identity; + bias = true, + init = glorot_uniform, +) return Bilinear(init(out, in1, in2), bias, σ) end -function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) +function Bilinear((in12, out)::Pair{<:Integer,<:Integer}, σ = identity; kw...) return Bilinear((in12, in12) => out, σ; kw...) 
end @@ -436,8 +452,11 @@ function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) d_z, d_x, d_y = size(W) d_x == size(x, 1) && d_y == size(y, 1) || throw(DimensionMismatch("number of rows in data must match W")) - size(x, 2) == size(y, 2) || - throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) + size(x, 2) == size(y, 2) || throw( + DimensionMismatch( + "Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))", + ), + ) # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) @@ -454,14 +473,21 @@ end function (a::Bilinear)(x::AbstractVector, y::AbstractVector) return vec(a(reshape(x, :, 1), reshape(y, :, 1))) end -(a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) +(a::Bilinear)(x::NTuple{2,AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) if size(l.weight, 2) == size(l.weight, 3) print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) else - print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", - size(l.weight, 1)) + print( + io, + "Bilinear((", + size(l.weight, 2), + ", ", + size(l.weight, 3), + ") => ", + size(l.weight, 1), + ) end l.σ == identity || print(io, ", ", l.σ) l.bias === false && print(io, "; bias=false") @@ -511,7 +537,7 @@ julia> model2[:β] == model2[2] true ``` """ -struct Parallel{F, T <: Union{Tuple, NamedTuple}} +struct Parallel{F,T<:Union{Tuple,NamedTuple}} connection::F layers::T end @@ -520,7 +546,11 @@ Parallel(connection, layers...) = Parallel(connection, layers) function Parallel(connection; kw...) layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) + throw( + ArgumentError( + "a Parallel layer cannot have a named sub-layer called `connection` or `layers`", + ), + ) end isempty(layers) && return Parallel(connection, ()) return Parallel(connection, layers) @@ -535,7 +565,11 @@ function _parallel_check(layers, xs) nl = length(layers) nx = length(xs) if (nl != nx) - throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) + throw( + ArgumentError( + "Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs", + ), + ) end end ChainRulesCore.@non_differentiable _parallel_check(nl, nx) @@ -547,7 +581,7 @@ end Base.getindex(m::Parallel, i) = m.layers[i] Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i]) -function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) +function Base.getindex(m::Parallel{<:Any,<:NamedTuple}, i::AbstractVector) return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -605,7 +639,7 @@ end A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above). """ -struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}} +struct PairwiseFusion{F,T<:Union{Tuple,NamedTuple}} connection::F layers::T end @@ -614,7 +648,11 @@ PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers) function PairwiseFusion(connection; kw...) 
layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) + throw( + ArgumentError( + "a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`", + ), + ) end isempty(layers) && return PairwiseFusion(connection, ()) return PairwiseFusion(connection, layers) @@ -624,7 +662,11 @@ function _pairwise_check(x, layers, T) lx = length(x) N = length(layers) if T <: Tuple && lx != N - throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) + throw( + ArgumentError( + "PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs", + ), + ) end end ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T) @@ -635,19 +677,24 @@ function (m::PairwiseFusion)(x::T) where {T} end (m::PairwiseFusion)(xs...) = m(xs) -@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}}, connection, - x::T) where {N, T} - y_symbols = [gensym() for _ in 1:(N + 1)] +@generated function applypairwisefusion( + layers::Tuple{Vararg{<:Any,N}}, + connection, + x::T, +) where {N,T} + y_symbols = [gensym() for _ = 1:(N+1)] getinput(i) = T <: Tuple ? :(x[$i]) : :x - calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] - for i in 1:(N - 1) - push!(calls, - quote - $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) - $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) - end) + calls = [:($(y_symbols[N+1]) = $(getinput(1)))] + for i = 1:(N-1) + push!( + calls, + quote + $(y_symbols[i]) = layers[$i]($(y_symbols[N+1])) + $(y_symbols[N+1]) = connection($(y_symbols[i]), $(getinput(i + 1))) + end, + ) end - push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) + push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N+1])))) push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) return Expr(:block, calls...) end @@ -661,7 +708,7 @@ Base.getindex(m::PairwiseFusion, i) = m.layers[i] function Base.getindex(m::PairwiseFusion, i::AbstractVector) return PairwiseFusion(m.connection, m.layers[i]) end -function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) +function Base.getindex(m::PairwiseFusion{<:Any,<:NamedTuple}, i::AbstractVector) return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -710,15 +757,18 @@ end @functor Embedding -Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(init(out, in)) +Embedding((in, out)::Pair{<:Integer,<:Integer}; init = randn32) = Embedding(init(out, in)) (m::Embedding)(x::Integer) = m.weight[:, x] (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x) (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...) 
-function (m::Embedding)(x::Union{OneHotVector{T, L}, OneHotMatrix{T, L}}) where {T, L} - size(m.weight, 2) == L || - throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) +function (m::Embedding)(x::Union{OneHotVector{T,L},OneHotMatrix{T,L}}) where {T,L} + size(m.weight, 2) == L || throw( + DimensionMismatch( + "Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L", + ), + ) return m(onecold(x)) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 5cd8782606..b620983dbc 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) -_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end-(length(y)-length(x)-1)):end]...) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -48,10 +48,10 @@ julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 """ struct SamePad end -function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N} +function calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N} return expand(Val(2 * N), pad) end -function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T} +function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 # Effective kernel size, including dilation @@ -127,13 +127,13 @@ julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size (42, 42, 7, 50) ``` """ -struct Conv{N, M, F, A, V} +struct Conv{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} groups::Int end @@ -159,19 +159,34 @@ julia> Flux.params(layer) |> length 2 ``` """ -function Conv(w::AbstractArray{T, N}, b = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} - @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups." +function Conv( + w::AbstractArray{T,N}, + b = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1, +) where {T,N} + @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups." stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(Conv, pad, size(w)[1:(N-2)], dilation, stride) bias = _create_bias(w, b, size(w, N)) return Conv(σ, w, bias, stride, pad, dilation, groups) end -function Conv(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - bias = true) where {N} +function Conv( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true, +) where {N} weight = convfilter(k, ch; init, groups) return Conv(weight, bias, σ; stride, pad, dilation, groups) end @@ -187,19 +202,29 @@ distribution. This is internally used by the [`Conv`](@ref) layer. 
""" -function convfilter(filter::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}; - init = glorot_uniform, groups = 1) where {N} +function convfilter( + filter::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}; + init = glorot_uniform, + groups = 1, +) where {N} cin, cout = ch - @assert cin % groups==0 "Input channel dimension must be divisible by groups." - @assert cout % groups==0 "Output channel dimension must be divisible by groups." + @assert cin % groups == 0 "Input channel dimension must be divisible by groups." + @assert cout % groups == 0 "Output channel dimension must be divisible by groups." return init(filter..., cin ÷ groups, cout) end @functor Conv function conv_dims(c::Conv, x::AbstractArray) - return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, - dilation = c.dilation, groups = c.groups) + return DenseConvDims( + x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups, + ) end ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any) @@ -214,7 +239,7 @@ _channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups _channels_out(l::Conv) = size(l.weight, ndims(l.weight)) function Base.show(io::IO, l::Conv) - print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, "Conv(", size(l.weight)[1:(ndims(l.weight)-2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") @@ -263,18 +288,18 @@ julia> ConvTranspose((5, 5), 3 => 7; stride = 3, pad = SamePad())(xs) |> size (300, 300, 7, 50) ``` """ -struct ConvTranspose{N, M, F, A, V} +struct ConvTranspose{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} groups::Int end _channels_in(l::ConvTranspose) = size(l.weight)[end] -_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups +_channels_out(l::ConvTranspose) = size(l.weight)[end-1] * l.groups """ ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups]) @@ -300,19 +325,33 @@ julia> Flux.params(layer) |> length 2 ``` """ -function ConvTranspose(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} +function ConvTranspose( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1, +) where {T,N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(ConvTranspose, pad, size(w)[1:(N-2)], dilation, stride) b = _create_bias(w, bias, size(w, N - 1) * groups) return ConvTranspose(σ, w, b, stride, pad, dilation, groups) end -function ConvTranspose(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - groups = 1, - bias = true) where {N} +function ConvTranspose( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true, +) where {N} weight = convfilter(k, reverse(ch); init, groups) return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @@ -322,17 +361,21 @@ end function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) # Calculate size of "input", from ∇conv_data()'s perspective... 
combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) - I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+ - (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad - C_in = size(c.weight)[end - 1] * c.groups + I = + (size(x)[1:(end-2)] .- 1) .* c.stride .+ 1 .+ + (size(c.weight)[1:(end-2)] .- 1) .* c.dilation .- combined_pad + C_in = size(c.weight)[end-1] * c.groups batch_size = size(x)[end] # Create DenseConvDims() that looks like the corresponding conv() w_size = size(c.weight) - return DenseConvDims((I..., C_in, batch_size), w_size; - stride = c.stride, - padding = c.pad, - dilation = c.dilation, - groups = c.groups) + return DenseConvDims( + (I..., C_in, batch_size), + w_size; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups, + ) end ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any) @@ -344,14 +387,19 @@ function (c::ConvTranspose)(x::AbstractArray) end function Base.show(io::IO, l::ConvTranspose) - print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight)-2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") end -function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N, T}, dilation, - stride) where {N, T} +function calc_padding( + ::Type{ConvTranspose}, + pad::SamePad, + k::NTuple{N,T}, + dilation, + stride, +) where {N,T} return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) end @@ -379,17 +427,29 @@ julia> DepthwiseConv((5, 5), 3 => 9; stride = 2, pad = 2)(xs) |> size (50, 50, 9, 50) ``` """ -function DepthwiseConv(k::NTuple{<:Any, Integer}, ch::Pair{<:Integer, <:Integer}, - σ = identity; - stride = 1, pad = 0, dilation = 1, bias = true, - init = glorot_uniform) +function DepthwiseConv( + k::NTuple{<:Any,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + bias = true, + init = glorot_uniform, +) return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init) end -function DepthwiseConv(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T, N} - w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :) - return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation) +function DepthwiseConv( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, +) where {T,N} + w2 = reshape(w, size(w)[1:(end-2)]..., 1, :) + return Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation) end """ @@ -419,13 +479,13 @@ julia> CrossCor((5, 5), 3 => 7; stride = 3, pad = (2, 0))(xs) |> size (34, 32, 7, 50) ``` """ -struct CrossCor{N, M, F, A, V} +struct CrossCor{N,M,F,A,V} σ::F weight::A bias::V - stride::NTuple{N, Int} - pad::NTuple{M, Int} - dilation::NTuple{N, Int} + stride::NTuple{N,Int} + pad::NTuple{M,Int} + dilation::NTuple{N,Int} end """ @@ -449,18 +509,31 @@ julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) ``` """ -function CrossCor(w::AbstractArray{T, N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T, N} +function CrossCor( + w::AbstractArray{T,N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, +) where {T,N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride) + pad = calc_padding(CrossCor, pad, size(w)[1:(N-2)], dilation, stride) b = _create_bias(w, bias, size(w, N)) return 
CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - bias = true) where {N} +function CrossCor( + k::NTuple{N,Integer}, + ch::Pair{<:Integer,<:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + bias = true, +) where {N} weight = convfilter(k, ch; init = init) return CrossCor(weight, bias, σ; stride, pad, dilation) end @@ -473,8 +546,13 @@ function crosscor(x, w, ddims::DenseConvDims) end function crosscor_dims(c::CrossCor, x::AbstractArray) - return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, - dilation = c.dilation) + return DenseConvDims( + x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + ) end ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any) @@ -486,9 +564,14 @@ function (c::CrossCor)(x::AbstractArray) end function Base.show(io::IO, l::CrossCor) - print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)]) - print(io, ", ", size(l.weight, ndims(l.weight) - 1), " => ", - size(l.weight, ndims(l.weight))) + print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight)-2)]) + print( + io, + ", ", + size(l.weight, ndims(l.weight) - 1), + " => ", + size(l.weight, ndims(l.weight)), + ) _print_conv_opt(io, l) return print(io, ")") end @@ -516,13 +599,13 @@ julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) true ``` """ -struct AdaptiveMaxPool{S, O} - out::NTuple{O, Int} - AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) +struct AdaptiveMaxPool{S,O} + out::NTuple{O,Int} + AdaptiveMaxPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) end -function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:(end - 2)] +function (a::AdaptiveMaxPool{S})(x::AbstractArray{T,S}) where {S,T} + insize = size(x)[1:(end-2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -558,13 +641,13 @@ julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) true ``` """ -struct AdaptiveMeanPool{S, O} - out::NTuple{O, Int} - AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) +struct AdaptiveMeanPool{S,O} + out::NTuple{O,Int} + AdaptiveMeanPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) end -function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:(end - 2)] +function (a::AdaptiveMeanPool{S})(x::AbstractArray{T,S}) where {S,T} + insize = size(x)[1:(end-2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -605,7 +688,7 @@ function (g::GlobalMaxPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end - 2)] + k = x_size[1:(end-2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -639,7 +722,7 @@ function (g::GlobalMeanPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end - 2)] + k = x_size[1:(end-2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -689,13 +772,13 @@ julia> layer(rand(Float32, 100, 7, 50)) |> size (34, 7, 50) ``` """ -struct MaxPool{N, M} - k::NTuple{N, Int} - pad::NTuple{M, Int} - stride::NTuple{N, Int} +struct MaxPool{N,M} + k::NTuple{N,Int} + pad::NTuple{M,Int} + stride::NTuple{N,Int} end -function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} +function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MaxPool, pad, k, 1, stride) return MaxPool(k, pad, stride) @@ 
-748,13 +831,13 @@ julia> m(xs) |> size (20, 20, 7, 50) ``` """ -struct MeanPool{N, M} - k::NTuple{N, Int} - pad::NTuple{M, Int} - stride::NTuple{N, Int} +struct MeanPool{N,M} + k::NTuple{N,Int} + pad::NTuple{M,Int} + stride::NTuple{N,Int} end -function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} +function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MeanPool, pad, k, 1, stride) return MeanPool(k, pad, stride) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 43c0a317c6..437d709463 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -38,7 +38,11 @@ dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function dropout_mask(rng, x::CuArray, p; kwargs...) - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) + throw( + ArgumentError( + "x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.", + ), + ) end dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function _dropout_mask(rng, x, p; dims = :) @@ -92,10 +96,10 @@ julia> isapprox(count(==(0), y) / length(y), 0.5; atol = 0.1) true ``` """ -mutable struct Dropout{F, D, R <: AbstractRNG} +mutable struct Dropout{F,D,R<:AbstractRNG} p::F dims::D - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} rng::R end Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) @@ -150,13 +154,13 @@ julia> isapprox(std(x), std(y); atol = 0.2) true ``` """ -mutable struct AlphaDropout{F, R <: AbstractRNG} +mutable struct AlphaDropout{F,R<:AbstractRNG} p::F - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} rng::R function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - return new{typeof(p), typeof(rng)}(p, active, rng) + return new{typeof(p),typeof(rng)}(p, active, rng) end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) @@ -216,21 +220,25 @@ julia> isapprox(std(y; dims = 1:3), ones(1, 1, 1, 2); atol = 0.1) && true ``` """ -struct LayerNorm{F, D, T, N} +struct LayerNorm{F,D,T,N} λ::F diag::D ϵ::T - size::NTuple{N, Int} + size::NTuple{N,Int} affine::Bool end -function LayerNorm(size::Tuple{Vararg{Int}}, λ = identity; affine::Bool = true, - ϵ::Real = 1.0f-5) +function LayerNorm( + size::Tuple{Vararg{Int}}, + λ = identity; + affine::Bool = true, + ϵ::Real = 1.0f-5, +) diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity return LayerNorm(λ, diag, ϵ, size, affine) end LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...) -LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...) +LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end-1)]), size_act[end]; kw...) @functor LayerNorm @@ -247,8 +255,12 @@ end # Compute the statistics on the slices specified by reduce_dims. # reduce_dims=[1,...,N-2,N] for BatchNorm # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm -function _norm_layer_forward(l, x::AbstractArray{T, N}; reduce_dims, - affine_shape) where {T, N} +function _norm_layer_forward( + l, + x::AbstractArray{T,N}; + reduce_dims, + affine_shape, +) where {T,N} if !_isactive(l) && l.track_stats # testmode with tracked stats stats_shape = ntuple(i -> i == N - 1 ? 
size(x, N - 1) : 1, N) μ = reshape(l.μ, stats_shape) @@ -271,7 +283,7 @@ end @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ) -function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N} +function _track_stats!(bn, x::AbstractArray{T,N}, μ, σ², reduce_dims) where {T,N} V = eltype(bn.σ²) mtm = bn.momentum res_mtm = one(V) - mtm @@ -328,7 +340,7 @@ julia> isapprox(std(m(xs)), 1; atol = 0.1) && std(xs) != std(m(xs)) true ``` """ -mutable struct BatchNorm{F, V, N, W} +mutable struct BatchNorm{F,V,N,W} λ::F # activation function β::V # bias γ::V # scale @@ -338,23 +350,26 @@ mutable struct BatchNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end -function BatchNorm(chs::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = true, track_stats = true, - ϵ = 1.0f-5, momentum = 0.1f0) +function BatchNorm( + chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = true, + ϵ = 1.0f-5, + momentum = 0.1f0, +) β = affine ? initβ(chs) : nothing γ = affine ? initγ(chs) : nothing μ = track_stats ? zeros32(chs) : nothing σ² = track_stats ? ones32(chs) : nothing - return BatchNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end @functor BatchNorm @@ -363,7 +378,7 @@ trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;) function (BN::BatchNorm)(x) @assert size(x, ndims(x) - 1) == BN.chs N = ndims(x) - reduce_dims = [1:(N - 2); N] + reduce_dims = [1:(N-2); N] affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) return _norm_layer_forward(BN, x; reduce_dims, affine_shape) end @@ -419,7 +434,7 @@ julia> isapprox(std(y; dims = 1:2), ones(1, 1, 3, 2); atol = 0.2) && true ``` """ -mutable struct InstanceNorm{F, V, N, W} +mutable struct InstanceNorm{F,V,N,W} λ::F # activation function β::V # bias γ::V # scale @@ -429,17 +444,25 @@ mutable struct InstanceNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end -function InstanceNorm(chs::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = false, track_stats = false, - ϵ = 1.0f-5, momentum = 0.1f0) +function InstanceNorm( + chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = false, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0, +) if track_stats - Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :InstanceNorm) + Base.depwarn( + "`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :InstanceNorm, + ) end β = affine ? initβ(chs) : nothing @@ -447,10 +470,7 @@ function InstanceNorm(chs::Int, λ = identity; μ = track_stats ? zeros32(chs) : nothing σ² = track_stats ? ones32(chs) : nothing - return InstanceNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end @functor InstanceNorm @@ -460,7 +480,7 @@ function (l::InstanceNorm)(x) @assert ndims(x) > 2 @assert size(x, ndims(x) - 1) == l.chs N = ndims(x) - reduce_dims = 1:(N - 2) + reduce_dims = 1:(N-2) affine_shape = ntuple(i -> i == N - 1 ? 
size(x, N - 1) : 1, N) return _norm_layer_forward(l, x; reduce_dims, affine_shape) end @@ -522,7 +542,7 @@ true ``` # number of groups ``` """ -mutable struct GroupNorm{F, V, N, W} +mutable struct GroupNorm{F,V,N,W} G::Int # number of groups λ::F # activation function β::V # bias @@ -533,20 +553,29 @@ mutable struct GroupNorm{F, V, N, W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool, Nothing} + active::Union{Bool,Nothing} chs::Int # number of channels end @functor GroupNorm trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) -function GroupNorm(chs::Int, G::Int, λ = identity; - initβ = zeros32, initγ = ones32, - affine = true, track_stats = false, - ϵ = 1.0f-5, momentum = 0.1f0) +function GroupNorm( + chs::Int, + G::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0, +) if track_stats - Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :GroupNorm) + Base.depwarn( + "`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :GroupNorm, + ) end chs % G == 0 || @@ -557,12 +586,7 @@ function GroupNorm(chs::Int, G::Int, λ = identity; μ = track_stats ? zeros32(G) : nothing σ² = track_stats ? ones32(G) : nothing - return GroupNorm(G, λ, - β, γ, - μ, σ², - ϵ, momentum, - affine, track_stats, - nothing, chs) + return GroupNorm(G, λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, nothing, chs) end function (gn::GroupNorm)(x) @@ -570,9 +594,9 @@ function (gn::GroupNorm)(x) @assert size(x, ndims(x) - 1) == gn.chs N = ndims(x) sz = size(x) - x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N]) + x = reshape(x, sz[1:(N-2)]..., sz[N-1] ÷ gn.G, gn.G, sz[N]) N = ndims(x) - reduce_dims = 1:(N - 2) + reduce_dims = 1:(N-2) affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N) x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) return reshape(x, sz) @@ -598,4 +622,4 @@ scale parameters, `false` otherwise. See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref). """ -hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine +hasaffine(l::Union{BatchNorm,InstanceNorm,LayerNorm,GroupNorm}) = l.affine diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index e1e1b55519..5fdf1e7d00 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -19,13 +19,13 @@ function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c) end # Type stable and AD-friendly helper for iterating over the last dimension of an array -function eachlastdim(A::AbstractArray{T, N}) where {T, N} +function eachlastdim(A::AbstractArray{T,N}) where {T,N} inds_before = ntuple(_ -> :, N - 1) return (view(A, inds_before..., i) for i in axes(A, N)) end # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77 -function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} +function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N} dys = unthunk(dys_raw) i1 = findfirst(dy -> dy isa AbstractArray, dys) if isnothing(i1) # all slices are Zero! 
@@ -44,7 +44,7 @@ function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} return ProjectTo(x)(dx) end -function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N} +function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N} lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) return collect(eachlastdim(x)), lastdims end @@ -126,7 +126,7 @@ julia> rnn.state 60 ``` """ -mutable struct Recur{T, S} +mutable struct Recur{T,S} cell::T state::S end @@ -183,7 +183,7 @@ reset!(m) = foreach(reset!, functor(m)[1]) flip(f, xs) = reverse([f(x) for x in reverse(xs)]) -function (m::Recur)(x::AbstractArray{T, 3}) where {T} +function (m::Recur)(x::AbstractArray{T,3}) where {T} h = [m(x_t) for x_t in eachlastdim(x)] sze = size(h[1]) return reshape(reduce(hcat, h), sze[1], sze[2], length(h)) @@ -192,23 +192,31 @@ end # Vanilla RNN struct RNNCell{F,I,H,V,S} - σ::F - Wi::I - Wh::H - b::V - state0::S + σ::F + Wi::I + Wh::H + b::V + state0::S end -function RNNCell((in, out)::Pair, σ = tanh; init = Flux.glorot_uniform, initb = zeros32, - init_state = zeros32) +function RNNCell( + (in, out)::Pair, + σ = tanh; + init = Flux.glorot_uniform, + initb = zeros32, + init_state = zeros32, +) return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1)) end -function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,I,H,V,T} - Wi, Wh, b = m.Wi, m.Wh, m.b - σ = NNlib.fast_act(m.σ, x) - h = σ.(Wi*x .+ Wh*h .+ b) - return h, reshape_cell_output(h, x) +function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {F,I,H,V,T} + Wi, Wh, b = m.Wi, m.Wh, m.b + σ = NNlib.fast_act(m.σ, x) + h = σ.(Wi * x .+ Wh * h .+ b) + return h, reshape_cell_output(h, x) end @functor RNNCell @@ -295,29 +303,38 @@ Recur(m::RNNCell) = Recur(m, m.state0) # LSTM struct LSTMCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S + Wi::I + Wh::H + b::V + state0::S end -function LSTMCell((in, out)::Pair; - init = glorot_uniform, - initb = zeros32, - init_state = zeros32) - cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), - (init_state(out, 1), init_state(out, 1))) +function LSTMCell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + cell = LSTMCell( + init(out * 4, in), + init(out * 4, out), + initb(out * 4), + (init_state(out, 1), init_state(out, 1)), + ) cell.b[gate(out, 2)] .= 1 return cell end -function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - b, o = m.b, size(h, 1) - g = muladd(m.Wi, x, muladd(m.Wh, h, b)) - input, forget, cell, output = multigate(g, o, Val(4)) - c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) - h′ = @. sigmoid_fast(output) * tanh_fast(c′) - return (h′, c′), reshape_cell_output(h′, x) +function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})( + (h, c), + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,T} + b, o = m.b, size(h, 1) + g = muladd(m.Wi, x, muladd(m.Wh, h, b)) + input, forget, cell, output = multigate(g, o, Val(4)) + c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) + h′ = @. 
sigmoid_fast(output) * tanh_fast(c′) + return (h′, c′), reshape_cell_output(h′, x) end @functor LSTMCell @@ -376,25 +393,37 @@ function _gru_output(gxs, ghs, bs) end struct GRUCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S + Wi::I + Wh::H + b::V + state0::S end -function GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, - init_state = zeros32) - return GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), - init_state(out, 1)) +function GRUCell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + return GRUCell( + init(out * 3, in), + init(out * 3, out), + initb(out * 3), + init_state(out, 1), + ) end -function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,T} + Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) + gxs, ghs, bs = + multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUCell @@ -448,26 +477,39 @@ Recur(m::GRUCell) = Recur(m, m.state0) # GRU v3 struct GRUv3Cell{I,H,V,HH,S} - Wi::I - Wh::H - b::V - Wh_h̃::HH - state0::S + Wi::I + Wh::H + b::V + Wh_h̃::HH + state0::S end -function GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, - init_state = zeros32) - return GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), - init(out, out), init_state(out, 1)) +function GRUv3Cell( + (in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32, +) + return GRUv3Cell( + init(out * 3, in), + init(out * 2, out), + initb(out * 3), + init(out, out), + init_state(out, 1), + ) end -function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,HH,T} - Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})( + h, + x::Union{AbstractVecOrMat{T},OneHotArray}, +) where {I,H,V,HH,T} + Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) + gxs, ghs, bs = + multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) + h′ = @. 
(1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUv3Cell diff --git a/src/layers/show.jl b/src/layers/show.jl index 8918bcd51c..b2e69d0b75 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -1,6 +1,11 @@ for T in [ - :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion, # container types + :Chain, + :Parallel, + :SkipConnection, + :Recur, + :Maxout, + :PairwiseFusion, # container types ] @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL @@ -25,8 +30,8 @@ function _big_show(io::IO, obj, indent::Int = 0, name = nothing) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) end - elseif obj isa Parallel{<:Any, <:NamedTuple} || - obj isa PairwiseFusion{<:Any, <:NamedTuple} + elseif obj isa Parallel{<:Any,<:NamedTuple} || + obj isa PairwiseFusion{<:Any,<:NamedTuple} _big_show(io, obj.connection, indent + 2) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) @@ -58,8 +63,17 @@ _show_children(p::Parallel) = (p.connection, p.layers...) _show_children(f::PairwiseFusion) = (f.connection, f.layers...) for T in [ - :Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding, - :BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm, + :Conv, + :ConvTranspose, + :CrossCor, + :Dense, + :Scale, + :Bilinear, + :Embedding, + :BatchNorm, + :LayerNorm, + :InstanceNorm, + :GroupNorm, ] @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) if !get(io, :compact, false) @@ -76,12 +90,22 @@ function _layer_show(io::IO, layer, indent::Int = 0, name = nothing) print(io, " "^indent, str, indent == 0 ? "" : ",") if !isempty(params(layer)) print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; - color = :light_black) + printstyled( + io, + "# ", + underscorise(sum(length, params(layer))), + " parameters"; + color = :light_black, + ) nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) if nonparam > 0 - printstyled(io, ", plus ", underscorise(nonparam), - indent == 0 ? " non-trainable" : ""; color = :light_black) + printstyled( + io, + ", plus ", + underscorise(nonparam), + indent == 0 ? 
" non-trainable" : ""; + color = :light_black, + ) end _nan_show(io, params(layer)) end @@ -96,15 +120,35 @@ function _big_finale(io::IO, m) noncnt = _childarray_sum(_ -> 1, m) - length(ps) if noncnt > 0 nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; - color = :light_black) + printstyled( + io, + " "^08, + "# Total: ", + length(ps), + " trainable arrays, "; + color = :light_black, + ) println(io, pars, " parameters,") - printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, - " parameters, summarysize "; color = :light_black) + printstyled( + io, + " "^10, + "# plus ", + noncnt, + " non-trainable, ", + nonparam, + " parameters, summarysize "; + color = :light_black, + ) print(io, bytes, ".") else - printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; - color = :light_black) + printstyled( + io, + " "^18, + "# Total: ", + length(ps), + " arrays, "; + color = :light_black, + ) print(io, pars, " parameters, ", bytes, ".") end end diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index dad2a512bb..d67190a49b 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -31,7 +31,7 @@ julia> m(ones(2, 2, 1, 1)) |> size (4, 5, 1, 1) ``` """ -struct Upsample{mode, S, T} +struct Upsample{mode,S,T} scale::S size::T end @@ -42,26 +42,26 @@ function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) if !(isnothing(scale) ⊻ isnothing(size)) throw(ArgumentError("Either scale or size should be specified (but not both).")) end - return Upsample{mode, typeof(scale), typeof(size)}(scale, size) + return Upsample{mode,typeof(scale),typeof(size)}(scale, size) end Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale) (m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale) -function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} +function (m::Upsample{:nearest,Int})(x::AbstractArray{T,N}) where {T,N} return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2)) end -function (m::Upsample{:nearest, Nothing})(x::AbstractArray) +function (m::Upsample{:nearest,Nothing})(x::AbstractArray) return NNlib.upsample_nearest(x; size = m.size) end (m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale) -function (m::Upsample{:bilinear, Nothing})(x::AbstractArray) +function (m::Upsample{:bilinear,Nothing})(x::AbstractArray) return NNlib.upsample_bilinear(x; size = m.size) end (m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale) -function (m::Upsample{:trilinear, Nothing})(x::AbstractArray) +function (m::Upsample{:trilinear,Nothing})(x::AbstractArray) return NNlib.upsample_trilinear(x; size = m.size) end diff --git a/src/loading.jl b/src/loading.jl index 35e3868189..0dd73a0d59 100644 --- a/src/loading.jl +++ b/src/loading.jl @@ -23,16 +23,19 @@ function loadleaf!(dst::AbstractArray, src::AbstractArray, err) end function _tie_check(dst::Bool, src::AbstractArray) - return iszero(dst) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") + return iszero(dst) || error( + "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", + ) end function _tie_check(dst::AbstractArray, src::Bool) - return (iszero(dst) && iszero(src)) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") + return (iszero(dst) && iszero(src)) || 
error( + "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", + ) end function _tie_check(dst::AbstractArray, src::AbstractArray) - return (dst == src) || - error("Encountered tied destination parameters with untied and mismatched sources.") + return (dst == src) || error( + "Encountered tied destination parameters with untied and mismatched sources.", + ) end _tie_check(dst, src) = true @@ -97,10 +100,13 @@ but copying a `src` value of `true` will error. function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet()) ldsts = _filter_children(filter, functor(dst)[1]) lsrcs = _filter_children(filter, functor(src)[1]) - (keys(ldsts) == keys(lsrcs)) || - throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) + (keys(ldsts) == keys(lsrcs)) || throw( + ArgumentError("Tried to load $src into $dst but the structures do not match."), + ) - err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") + err = DimensionMismatch( + "Tried to load $src into $dst but the parameter sizes do not match.", + ) foreach(ldsts, lsrcs) do ldst, lsrc if ldst in cache # we already loaded this parameter before _tie_check(ldst, lsrc) && return ldst diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index 863d075916..a35f93af03 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -9,17 +9,24 @@ using CUDA using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss import Base.Broadcast: broadcasted -export mse, mae, msle, - label_smoothing, - crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy, - kldivergence, - huber_loss, - tversky_loss, - dice_coeff_loss, - poisson_loss, - hinge_loss, squared_hinge_loss, - binary_focal_loss, focal_loss, siamese_contrastive_loss +export mse, + mae, + msle, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy, + kldivergence, + huber_loss, + tversky_loss, + dice_coeff_loss, + poisson_loss, + hinge_loss, + squared_hinge_loss, + binary_focal_loss, + focal_loss, + siamese_contrastive_loss include("utils.jl") include("functions.jl") diff --git a/src/losses/functions.jl b/src/losses/functions.jl index 65b6b2fe60..674fe3065c 100644 --- a/src/losses/functions.jl +++ b/src/losses/functions.jl @@ -157,7 +157,7 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed) true ``` """ -function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1) +function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1) if !(0 < α < 1) throw(ArgumentError("α must be between 0 and 1")) end @@ -320,7 +320,7 @@ julia> Flux.crossentropy(y_prob, y_hot) """ function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) - return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ))) + return agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ))) end """ @@ -351,7 +351,7 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin) """ function logitbinarycrossentropy(ŷ, y; agg = mean) _check_sizes(ŷ, y) - return agg(@.((1 - y) * ŷ-logσ(ŷ))) + return agg(@.((1 - y) * ŷ - logσ(ŷ))) end """ diff --git a/src/losses/utils.jl b/src/losses/utils.jl index cda3e4a557..43aab12a05 100644 --- a/src/losses/utils.jl +++ b/src/losses/utils.jl @@ -21,17 +21,19 @@ end @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric) res = xlogy.(x, y) return res, - Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), - 
Zygote.unbroadcast(y, Δ .* x ./ y)) + Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y)) end ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y) # should help Diffractor's broadcasting -ChainRulesCore.@scalar_rule xlogx(x) (log(y)+true) +ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true) function _check_sizes(ŷ::AbstractArray, y::AbstractArray) - for d in 1:max(ndims(ŷ), ndims(y)) - size(ŷ, d) == size(y, d) || - throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))")) + for d = 1:max(ndims(ŷ), ndims(y)) + size(ŷ, d) == size(y, d) || throw( + DimensionMismatch( + "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))", + ), + ) end end _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1 diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index fa78f513d8..5bc95d0ab2 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -3,11 +3,30 @@ module Optimise using LinearAlgebra import ArrayInterface -export train!, update!, - Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief, - InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, - ClipValue, ClipNorm +export train!, + update!, + Descent, + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + AdamW, + RAdam, + OAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + stop, + skip, + Optimiser, + ClipValue, + ClipNorm include("optimisers.jl") include("train.jl") diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index f4d9687384..e7e40012c6 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -172,9 +172,9 @@ opt = Adam(0.001, (0.9, 0.8)) """ mutable struct Adam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) @@ -183,9 +183,12 @@ function apply!(o::Adam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -215,9 +218,9 @@ opt = RAdam(0.001, (0.9, 0.8)) """ mutable struct RAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) @@ -226,11 +229,14 @@ function apply!(o::RAdam, x, Δ) η, β = o.eta, o.beta ρ∞ = 2 / (1 - β[2]) - 1 - mt, vt, βp, t = get!(o.state, - x) do - return (zero(x), zero(x), Float64[β[1], β[2]], - Ref(1)) - end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}} + mt, vt, βp, t = get!(o.state, x) do + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + Ref(1), + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. 
vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -267,9 +273,9 @@ opt = AdaMax(0.001, (0.9, 0.995)) """ mutable struct AdaMax <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) @@ -278,9 +284,12 @@ function apply!(o::AdaMax, x, Δ) η, β = o.eta, o.beta mt, ut, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. ut = max(β[2] * ut, abs(Δ)) @@ -311,9 +320,9 @@ opt = OAdam(0.001, (0.9, 0.995)) """ mutable struct OAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) @@ -321,11 +330,14 @@ OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) function apply!(o::OAdam, x, Δ) η, β = o.eta, o.beta - mt, vt, Δ_, βp = get!(o.state, - x) do - return (zero(x), zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}} + mt, vt, Δ_, βp = get!(o.state, x) do + return ( + zero(x), + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -390,14 +402,14 @@ opt = AdaDelta(0.89) mutable struct AdaDelta <: AbstractOptimiser rho::Float64 epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) function apply!(o::AdaDelta, x, Δ) ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)} + acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) # DON'T remove epsilon from numerator # or even out of the square roots @@ -427,9 +439,9 @@ opt = AMSGrad(0.001, (0.89, 0.995)) """ mutable struct AMSGrad <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) @@ -438,9 +450,12 @@ function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta mt, vt, v̂t = get!(o.state, x) do - return (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), - fill!(similar(x), o.epsilon)) - end::NTuple{3, typeof(x)} + return ( + fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon), + ) + end::NTuple{3,typeof(x)} @. mt = β[1] * mt + (1 - β[1]) * Δ @. 
vt = β[2] * vt + (1 - β[2]) * Δ^2 @@ -469,9 +484,9 @@ opt = NAdam(0.002, (0.89, 0.995)) """ mutable struct NAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) @@ -480,15 +495,19 @@ function apply!(o::NAdam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[o.beta[1], o.beta[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[o.beta[1], o.beta[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} β1p, β2p = βp @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / - (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η + @. Δ = + (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / + (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η βp .= βp .* β return Δ @@ -539,9 +558,9 @@ opt = AdaBelief(0.001, (0.9, 0.8)) """ mutable struct AdaBelief <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64, Float64} + beta::Tuple{Float64,Float64} epsilon::Float64 - state::IdDict{Any, Any} + state::IdDict{Any,Any} end AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) @@ -550,9 +569,12 @@ function apply!(o::AdaBelief, x, Δ) η, β = o.eta, o.beta mt, st, βp = get!(o.state, x) do - return (zero(x), zero(x), - Float64[β[1], β[2]]) - end::Tuple{typeof(x), typeof(x), Vector{Float64}} + return ( + zero(x), + zero(x), + Float64[β[1], β[2]], + ) + end::Tuple{typeof(x),typeof(x),Vector{Float64}} #= st is a variance and can go to zero. This is in contrast to Adam, which uses the second moment which is usually far enough from zero. This is problematic, since st @@ -587,8 +609,12 @@ end Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) -@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, - Base.setindex! +@forward Optimiser.os Base.getindex, +Base.first, +Base.last, +Base.lastindex, +Base.push!, +Base.setindex! @forward Optimiser.os Base.iterate Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) @@ -623,10 +649,10 @@ opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2)) """ mutable struct InvDecay <: AbstractOptimiser gamma::Float64 - state::IdDict{Any, Int} + state::IdDict{Any,Int} end -InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) +InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any,Int}()) function apply!(o::InvDecay, x, Δ) γ = o.gamma @@ -683,7 +709,8 @@ end function apply!(o::ExpDecay, x, Δ) η, s, decay, start = o.eta, o.step, o.decay, o.start n = o.current[x] = get(o.current, x, 0) + 1 - if n > start && n % s == 0 && + if n > start && + n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1 η = max(η * decay, o.clip) o.eta = η diff --git a/src/optimise/train.jl b/src/optimise/train.jl index e32451b0da..bead5860f0 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -50,9 +50,11 @@ end ``` """ function skip() - Base.depwarn("""Flux.skip() will be removed from Flux 0.14. - and should be replaced with `continue` in an ordinary `for` loop.""", - :skip) + Base.depwarn( + """Flux.skip() will be removed from Flux 0.14. 
+ and should be replaced with `continue` in an ordinary `for` loop.""", + :skip, + ) throw(SkipException()) end @@ -77,8 +79,11 @@ end ``` """ function stop() - Base.depwarn("""Flux.stop() will be removed from Flux 0.14. - It should be replaced with `break` in an ordinary `for` loop.""", :stop) + Base.depwarn( + """Flux.stop() will be removed from Flux 0.14. + It should be replaced with `break` in an ordinary `for` loop.""", + :stop, + ) throw(StopException()) end @@ -173,11 +178,14 @@ hello ``` """ macro epochs(n, ex) - Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. - As an alternative, you can write a simple `for i in 1:epochs` loop.""", - Symbol("@epochs"); force = true) - return :(@progress for i in 1:($(esc(n))) - @info "Epoch $i" - $(esc(ex)) - end) + Base.depwarn( + """The macro `@epochs` will be removed from Flux 0.14. + As an alternative, you can write a simple `for i in 1:epochs` loop.""", + Symbol("@epochs"); + force = true, + ) + return :(@progress for i = 1:($(esc(n))) + @info "Epoch $i" + $(esc(ex)) + end) end diff --git a/src/outputsize.jl b/src/outputsize.jl index ec87107adc..65f006d54f 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -11,19 +11,33 @@ Unlike `Nothing` and `Missing` it is a number: `Nil <: Real <: Number`. """ struct Nil <: Real end -@doc @doc(Nil) -const nil = Nil() +@doc @doc(Nil) const nil = Nil() -Nil(::T) where {T <: Number} = nil -(::Type{T})(::Nil) where {T <: Number} = nil +Nil(::T) where {T<:Number} = nil +(::Type{T})(::Nil) where {T<:Number} = nil Base.convert(::Type{Nil}, ::Number) = nil Base.float(::Type{Nil}) = Nil -for f in [:copy, :zero, :one, :oneunit, - :+, :-, :abs, :abs2, :inv, - :exp, :log, :log1p, :log2, :log10, - :sqrt, :tanh, :conj] +for f in [ + :copy, + :zero, + :one, + :oneunit, + :+, + :-, + :abs, + :abs2, + :inv, + :exp, + :log, + :log1p, + :log2, + :log10, + :sqrt, + :tanh, + :conj, +] @eval Base.$f(::Nil) = nil end @@ -167,8 +181,12 @@ end for (fn, Dims) in ((:conv, DenseConvDims),) @eval begin function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) - return fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), - size(a)[end]) + return fill( + nil, + NNlib.output_size(dims)..., + NNlib.channels_out(dims), + size(a)[end], + ) end function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) diff --git a/src/utils.jl b/src/utils.jl index 2d0137a182..07d99e8b97 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(dims::Tuple) = nfan(dims...) -nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels +nfan(dims...) = prod(dims[1:(end-2)]) .* (dims[end-1], dims[end]) # In case of convolution kernels ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) @@ -270,11 +270,18 @@ julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100))) 1.0f0 ``` """ -function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, - hi = 2) +function truncated_normal( + rng::AbstractRNG, + dims::Integer...; + mean = 0, + std = 1, + lo = -2, + hi = 2, +) norm_cdf(x) = 0.5 * (1 + erf(x / √2)) if (mean < lo - 2 * std) || (mean > hi + 2 * std) - @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." 
maxlog=1 + @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog = + 1 end l = norm_cdf((lo - mean) / std) u = norm_cdf((hi - mean) / std) @@ -347,7 +354,7 @@ end function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...) dims = (d1, ds...) - rows = prod(dims[1:(end - 1)]) + rows = prod(dims[1:(end-1)]) cols = dims[end] return reshape(orthogonal(rng, rows, cols; kwargs...), dims) end @@ -356,8 +363,8 @@ function orthogonal(dims::Integer...; kwargs...) return orthogonal(default_rng_value(), dims...; kwargs...) end function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...) - return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., - kwargs...) + return (dims::Integer...; kwargs...) -> + orthogonal(rng, dims...; init_kwargs..., kwargs...) end ChainRulesCore.@non_differentiable orthogonal(::Any...) @@ -396,7 +403,11 @@ julia> count(iszero, ans.weight; dims = 1) """ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01) if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + throw( + ArgumentError( + "Only 2-dimensional outputs are supported for sparse initialization.", + ), + ) end rows, cols = dims prop_zero = min(1.0, sparsity) @@ -495,10 +506,10 @@ end # Assume convolution function identity_init(dims::Integer...; gain::Real = 1, shift = 0) - nin, nout = dims[end - 1], dims[end] - centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + nin, nout = dims[end-1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end-2)]) weights = zeros32(dims...) - for i in 1:min(nin, nout) + for i = 1:min(nin, nout) weights[centers..., i, i] = gain end return circshift(weights, shift) diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index 1ed898cd21..6439a3a8f5 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -10,7 +10,7 @@ using CUDA function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i in 1:length(x) + for i = 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -30,7 +30,7 @@ end g1 = gradient(ctc_loss, x_cu, y)[1] g1 = g1 |> collect g2 = ctc_ngradient(x, y) - @test g1≈g2 rtol=1e-5 atol=1e-5 + @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 # test that GPU loss matches CPU implementation l1 = ctc_loss(x_cu, y) @@ -42,18 +42,23 @@ end y = [1, 2] @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; - 0.0729422 0.447346 0.16457] + g = [ + -0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457 + ] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray y = [1, 2] |> CuArray @test ctc_loss(x_cu, y) ≈ 8.02519869363453 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + g = [ + -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 + ] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 end 
diff --git a/test/ctc.jl b/test/ctc.jl index 88386ff0e7..059b14f292 100644 --- a/test/ctc.jl +++ b/test/ctc.jl @@ -9,7 +9,7 @@ using LinearAlgebra function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i in 1:length(x) + for i = 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -27,25 +27,30 @@ end y = rand(1:9, 30) g1 = gradient(ctc_loss, x, y)[1] g2 = ctc_ngradient(x, y) - @test g1≈g2 rtol=1e-5 atol=1e-5 + @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 # tests using hand-calculated values x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] y = [1, 2] @test ctc_loss(x, y) ≈ 3.6990738275138035 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; - 0.0729422 0.447346 0.16457] + g = [ + -0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457 + ] ghat = gradient(ctc_loss, x, y)[1] - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] y = [1, 2] @test ctc_loss(x, y) ≈ 8.02519869363453 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + g = [ + -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 + ] ghat = gradient(ctc_loss, x, y)[1] - @test g≈ghat rtol=1e-5 atol=1e-5 + @test g ≈ ghat rtol = 1e-5 atol = 1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 033a08df95..25648eb787 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -21,7 +21,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray cm = gpu(m) @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2} + @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} xs = rand(5, 5) ys = Flux.onehotbatch(1:5, 1:5) @@ -81,7 +81,7 @@ end M = 2.0 * I(10) |> collect Q = cholesky(M) Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any, <:CuArray} + @test Q_gpu isa Cholesky{<:Any,<:CuArray} Q_cpu = Q_gpu |> cpu @test Q_cpu == cholesky(eltype(Q_gpu).(M)) end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index 2ff137d34f..a46b5684c8 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -11,55 +11,55 @@ using Flux, CUDA, Test @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) end -@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) - rnn = R(10, 5) - curnn = fmap(gpu, rnn) +@testset "RNN" begin + @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) + rnn = R(10, 5) + curnn = fmap(gpu, rnn) - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) - cux = gpu(x) + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? 
rand(Float32, 10) : rand(Float32, 10, batch_size) + cux = gpu(x) - y, back = pullback((r, x) -> r(x), rnn, x) - cuy, cuback = pullback((r, x) -> r(x), curnn, cux) + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) - @test y ≈ collect(cuy) + @test y ≈ collect(cuy) - ȳ = randn(size(y)) - m̄, x̄ = back(ȳ) - cum̄, cux̄ = cuback(gpu(ȳ)) + ȳ = randn(size(y)) + m̄, x̄ = back(ȳ) + cum̄, cux̄ = cuback(gpu(ȳ)) - @test x̄ ≈ collect(cux̄) - @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) - @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) - @test m̄[].cell.b ≈ collect(cum̄[].cell.b) - if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) + @test x̄ ≈ collect(cux̄) + @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) + @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) + @test m̄[].cell.b ≈ collect(cum̄[].cell.b) + if m̄[].state isa Tuple + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end + else + @test m̄[].state ≈ collect(cum̄[].state) end - else - @test m̄[].state ≈ collect(cum̄[].state) - end - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = + batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) - cuy = (curnn(cuohx); curnn(cuohx)) - @test y ≈ collect(cuy) + cuy = (curnn(cuohx); curnn(cuohx)) + @test y ≈ collect(cuy) - Flux.reset!(rnn) - Flux.reset!(curnn) - fx = rand(Float32, 10, batch_size, 3) - cufx = gpu(fx) - fy = (rnn(fx); rnn(fx)) + Flux.reset!(rnn) + Flux.reset!(curnn) + fx = rand(Float32, 10, batch_size, 3) + cufx = gpu(fx) + fy = (rnn(fx); rnn(fx)) - cufy = (curnn(cufx); curnn(cufx)) - @test fy ≈ collect(cufy) -end end + cufy = (curnn(cufx); curnn(cufx)) + @test fy ≈ collect(cufy) + end +end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 631b103839..e2da95931d 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -13,63 +13,70 @@ end # TODO: These layers get into scalar indexing issues. const BROKEN_LAYERS = Union{} -const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; - test_cpu = true) +const ACTIVATIONS = [identity, relu, tanh, sigmoid, exp, softplus, elu, selu] + +function gpu_gradtest( + name::String, + layers::Vector, + x_cpu = nothing, + args...; + test_cpu = true, +) isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) 
- ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1.0f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3 - if isnothing(xg_cpu) - @test isnothing(xg_gpu) - else - if layer === GroupedConvTranspose - @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3 + @testset "$name GPU grad tests" begin + for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1.0f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa + Flux.Zygote.Grads + else + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu ≈ y_cpu rtol = 1.0f-3 atol = 1.0f-3 + if isnothing(xg_cpu) + @test isnothing(xg_gpu) else - @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3 + if layer === GroupedConvTranspose + @test Array(xg_gpu) ≈ xg_cpu rtol = 2.0f-2 atol = 1.0f-3 + else + @test Array(xg_gpu) ≈ xg_cpu rtol = 1.0f-3 atol = 1.0f-3 + end end end - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3 + @test gs_gpu isa Flux.Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + if isnothing(gs_cpu[p_cpu]) + @test isnothing(gs_gpu[p_gpu]) + else + @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol = 1.0f-3 atol = + 1.0f-3 + end end end end end end - end end + end end # Just to give testset in gpu_gradtest meaningful labels @@ -82,29 +89,68 @@ GroupedConvTranspose(args...) 
= ConvTranspose(args...; groups = 5) for act in ACTIVATIONS r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2, 2), 1 => 3, act; - test_cpu = false) + conv_layers = [ + Conv, + ConvNoBias, + ConvTranspose, + ConvTransposeNoBias, + CrossCor, + CrossCorNoBias, + DepthwiseConv, + DepthwiseConvNoBias, + ] + gpu_gradtest( + "Convolution with $act", + conv_layers, + r, + (2, 2), + 1 => 3, + act; + test_cpu = false, + ) groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), - (3, 3), 100 => 25, act; test_cpu = true) + gpu_gradtest( + "GroupedConvolution with $act", + groupedconv, + rand(Float32, 28, 28, 100, 2), + (3, 3), + 100 => 25, + act; + test_cpu = true, + ) batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28, 28, 3, 4), 3, act; - test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5, 4), 5, act; - test_cpu = false) + gpu_gradtest( + "BatchNorm 1 with $act", + batch_norm, + rand(Float32, 28, 28, 3, 4), + 3, + act; + test_cpu = false, + ) #TODO fix errors + gpu_gradtest( + "BatchNorm 2 with $act", + batch_norm, + rand(Float32, 5, 4), + 5, + act; + test_cpu = false, + ) instancenorm = [InstanceNorm] gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act; test_cpu = false) groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28, 28, 3, 1), 3, 1, act; - test_cpu = false) + gpu_gradtest( + "GroupNorm with $act", + groupnorm, + rand(Float32, 28, 28, 3, 1), + 3, + 1, + act; + test_cpu = false, + ) end r = rand(Float32, 28, 28, 1, 1) @@ -137,8 +183,13 @@ gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, - OneHotMatrix([1, 2, 2], 5), 5, 2) +gpu_gradtest( + "Embedding OneHotMatrix repeated indices", + embedding, + OneHotMatrix([1, 2, 2], 5), + 5, + 2, +) @testset "function layers" begin x = rand(Float32, 3, 3) @@ -287,8 +338,11 @@ end end @testset "Dropout RNGs" begin - @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), - 0.1) + @test_throws ArgumentError Flux.dropout( + MersenneTwister(), + CUDA.rand(Float32, 2, 3), + 0.1, + ) @testset for layer in (Dropout, AlphaDropout) m = layer(0.1; rng = MersenneTwister(123)) @test_throws ErrorException gpu(m) diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl index 3ecbceb46e..467d3ed46e 100644 --- a/test/cuda/losses.jl +++ b/test/cuda/losses.jl @@ -1,5 +1,5 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, - binary_focal_loss, focal_loss +using Flux.Losses: + crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss @testset "Losses" begin x = [1.0, 2.0, 3.0] @@ -14,16 +14,22 @@ using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] - y = 
[0 1 0 - 1 0 1] + x = [ + 0.268941 0.5 0.268941 + 0.731059 0.5 0.731059 + ] + y = [ + 0 1 0 + 1 0 1 + ] @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) x = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) @testset "GPU grad tests" begin diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl index 466b08c8b9..027d13a612 100644 --- a/test/cuda/test_utils.jl +++ b/test/cuda/test_utils.jl @@ -7,10 +7,10 @@ function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) end check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) - @test g_cpu≈g_gpu rtol=rtol atol=atol + @test g_cpu ≈ g_gpu rtol = rtol atol = atol end function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) - @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol + @test g_cpu ≈ collect(g_gpu) rtol = rtol atol = atol end function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) @@ -27,8 +27,13 @@ function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) end end -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal = true, rtol = 1e-4, atol = 1e-4) +function gpu_autodiff_test( + f_cpu, + xs_cpu::Array{Float32}...; + test_equal = true, + rtol = 1e-4, + atol = 1e-4, +) check_type(x) = false check_type(x::Float32) = true check_type(x::CuArray{Float32}) = true @@ -50,7 +55,7 @@ function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) check_grad(g_gpu, g_cpu, atol, rtol) end @@ -66,7 +71,7 @@ function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol @assert length(ps_gpu) == length(ps_cpu) for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) diff --git a/test/data.jl b/test/data.jl index 8ee1d58a89..3d2083af4f 100644 --- a/test/data.jl +++ b/test/data.jl @@ -36,7 +36,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)} - @test eltype(batches) == Tuple{typeof(X), typeof(Y)} + @test eltype(batches) == Tuple{typeof(X),typeof(Y)} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -53,7 +53,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} - @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} + @test eltype(batches) == NamedTuple{(:x, :y),Tuple{typeof(X),typeof(Y)}} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -69,7 +69,7 @@ using Random d = DataLoader([1:10;]; shuffle = true) cd = collect(zip(d, d)) # skip the first since it used to be different also before fixing the bug - @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10] + @test [cd[i][1] for i = 2:10] != [cd[i][2] for i = 2:10] # test interaction with `train!` θ = ones(2) @@ -89,7 +89,13 @@ using Random @test norm(θ .- 1) < 1e-10 # specify the rng - d = map(identity, - DataLoader(X; batchsize = 2, shuffle 
= true, - rng = Random.seed!(Random.default_rng(), 5))) + d = map( + identity, + DataLoader( + X; + batchsize = 2, + shuffle = true, + rng = Random.seed!(Random.default_rng(), 5), + ), + ) end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 93e74e7915..f3600850a8 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -2,16 +2,18 @@ using Test, Random import Flux: activations @testset "basic" begin - @testset "helpers" begin @testset "activations" begin - dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) - x = randn(10) - @test activations(dummy_model, x)[1] == x .^ 2 - @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) - @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) - - @test activations(Chain(), x) == () - @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type - end end + @testset "helpers" begin + @testset "activations" begin + dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x .^ 2 + @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) + + @test activations(Chain(), x) == () + @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type + end + end @testset "Chain" begin @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) @@ -87,10 +89,9 @@ import Flux: activations @test Dense(10, 2, identity; init = ones)(ones(10, 1)) == 10 * ones(2, 1) @test Dense(10, 2, identity; init = ones)([ones(10, 1) 2 * ones(10, 1)]) == [10 20; 10 20] - @test Dense(10, 2, identity; init = ones, bias = false)([ones(10, 1) 2 * - ones(10, - 1)]) == - [10 20; 10 20] + @test Dense(10, 2, identity; init = ones, bias = false)( + [ones(10, 1) 2 * ones(10, 1)], + ) == [10 20; 10 20] end end @@ -158,8 +159,9 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input)) == - (10, 4) + @test size( + SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input), + ) == (10, 4) end end @@ -217,8 +219,9 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) == - (10, 4) + @test size( + Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input), + ) == (10, 4) @test size(Parallel(hcat; one = Dense(10, 10), two = identity)(input)) == (10, 4) end @@ -226,8 +229,9 @@ import Flux: activations @testset "vararg input" begin inputs = randn(10), randn(5), randn(4) @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == - (2,) + @test size( + Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs), + ) == (2,) @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3) # wrong number of inputs @test Parallel(+, sin, cos)(pi / 2) ≈ 1 end @@ -237,10 +241,16 @@ import Flux: activations @test m[1] == m[:one] @test m[1:2] == m - @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), - two = identity) # reserved names - @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), - two = identity) + @test_throws ArgumentError Parallel( + hcat, + layers = Dense(10, 10), + two = identity, + ) # reserved names + @test_throws ArgumentError Parallel( + hcat, + connection = Dense(10, 10), + two = identity, + ) @test m == fmap(identity, 
m) # does not forget names @@ -249,7 +259,7 @@ import Flux: activations end @testset "trivial cases" begin - @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple + @test Parallel(hcat) isa Parallel{typeof(hcat),Tuple{}} # not a NamedTuple @test Parallel(hcat)(1) == hcat() @test Parallel(hcat, inv)(2) == hcat(1 / 2) # still calls connection once. end @@ -314,7 +324,7 @@ import Flux: activations x = rand(1:vocab_size, 3, 4) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (embed_size, 3, 4) @test m(2) ≈ m.weight[:, 2] diff --git a/test/layers/conv.jl b/test/layers/conv.jl index c5e7845833..51082723fb 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -25,12 +25,15 @@ end @testset "CNN" begin r = zeros(Float32, 28, 28, 1, 5) - m = Chain(Conv((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - Conv((2, 2), 16 => 8, relu), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) + m = Chain( + Conv((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + Conv((2, 2), 16 => 8, relu), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax, + ) @test size(m(r)) == (10, 5) @@ -56,7 +59,7 @@ end op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0 opt = Descent() - for _ in 1:(10^3) + for _ = 1:(10^3) gs = gradient(Flux.params(bias)) do return Flux.Losses.mse(bias(ip), op) end @@ -113,7 +116,7 @@ end @test _channels_out(ConvTranspose((5, 6), 2 => 2; groups = 2)) == 2 for Layer in [Conv, ConvTranspose] - for _ in 1:10 + for _ = 1:10 groups = rand(1:10) kernel_size = Tuple(rand(1:5) for _ in rand(1:3)) cin = rand(1:5) * groups @@ -135,7 +138,7 @@ end @test y_hat[2, 2] ≈ 9.0 @test y_hat[end, 1] ≈ 4.0 @test y_hat[1, end] ≈ 3.0 - @test y_hat[1, end - 1] ≈ 6.0 + @test y_hat[1, end-1] ≈ 6.0 @test y_hat[end, end] ≈ 2.0 end @@ -203,19 +206,22 @@ end w = rand(Float32, 2, 2, 1, 1) y = CrossCor(w, [0.0]) - @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7 + @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol = 2e-7 r = zeros(Float32, 28, 28, 1, 5) - m = Chain(CrossCor((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - CrossCor((2, 2), 16 => 8, relu; bias = false), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) + m = Chain( + CrossCor((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + CrossCor((2, 2), 16 => 8, relu; bias = false), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax, + ) @test size(m(r)) == (10, 5) @test y(x) != Conv(w, [0.0])(x) - @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7 + @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol = 1e-7 end @testset "Conv with non quadratic window #700" begin @@ -224,17 +230,17 @@ end l = Conv((3, 3), 1 => 1) expected = zeros(eltype(l.weight), 5, 5, 1, 1) - expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight + expected[2:(end-1), 2:(end-1), 1, 1] = l.weight @test expected ≈ l(data) l = Conv((3, 1), 1 => 1) expected = zeros(eltype(l.weight), 5, 7, 1, 1) - expected[2:(end - 1), 4, 1, 1] = l.weight + expected[2:(end-1), 4, 1, 1] = l.weight @test expected ≈ l(data) l = Conv((1, 3), 1 => 1) expected = zeros(eltype(l.weight), 7, 5, 1, 1) - expected[4, 2:(end - 1), 1, 1] = l.weight + expected[4, 2:(end-1), 1, 1] = l.weight @test expected ≈ l(data) @test begin @@ -244,9 +250,9 @@ end end end -@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, - CrossCor), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) +@testset 
"$ltype SamePad kernelsize $k" for ltype in + (Conv, ConvTranspose, DepthwiseConv, CrossCor), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k, 1 => 1; pad = SamePad()) @@ -258,24 +264,25 @@ end stride = 3 l = ltype(k, 1 => 1; pad = SamePad(), stride = stride) if ltype == ConvTranspose - @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)] + @test size(l(data))[1:(end-2)] == stride .* size(data)[1:(end-2)] else - @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride) + @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], stride) end end @testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k; pad = SamePad()) - @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k) + @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], k) end @testset "bugs fixed" begin -# https://github.com/FluxML/Flux.jl/issues/1421 -@test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} end + # https://github.com/FluxML/Flux.jl/issues/1421 + @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} +end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] @test fun(rand(2, 3, 4)).bias isa Vector{Float64} diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 1d4be15240..32e99245d6 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -3,123 +3,128 @@ using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = [1.0, 2.0, 3.0] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = rand(100) - m = Dropout(0.9; rng_kwargs...) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - testmode!(m, false) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - - x = rand(Float32, 100) - m = Chain(Dense(100, 100), - Dropout(0.9; rng_kwargs...)) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - - x = rand(100, 50) - m = Dropout(0.5; dims = 2, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) - @test minimum(c) == maximum(c) - m = Dropout(0.5; dims = 1, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) - @test minimum(c) == maximum(c) - - # issue #1084 - m = Dropout(0.9; rng_kwargs...) 
- x = rand(100) - - testmode!(m) - y = m(x) - @test count(a -> a == 0, y) == 0 - trainmode!(m) - y = m(x) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) - @test count(a -> a == 0, y) == 0 - - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG +@testset "Dropout" begin + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = [1.0, 2.0, 3.0] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = rand(100) + m = Dropout(0.9; rng_kwargs...) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + testmode!(m, false) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + + x = rand(Float32, 100) + m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + + x = rand(100, 50) + m = Dropout(0.5; dims = 2, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) + @test minimum(c) == maximum(c) + m = Dropout(0.5; dims = 1, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) + @test minimum(c) == maximum(c) + + # issue #1084 + m = Dropout(0.9; rng_kwargs...) + x = rand(100) + + testmode!(m) + y = m(x) + @test count(a -> a == 0, y) == 0 + trainmode!(m) + y = m(x) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) + @test count(a -> a == 0, y) == 0 + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng isa Random._GLOBAL_RNG + @test cpu(m).rng === only(values(rng_kwargs)) end - else - @test cpu(m).rng === only(values(rng_kwargs)) - end -end end - -@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0, 2.0, 3.0] - @test x == AlphaDropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) - - x = randn(1000) # large enough to prevent flaky test - m = AlphaDropout(0.5; rng_kwargs...) 
- - y = evalwgrad(m, x) - # Should preserve unit mean and variance - @test mean(y)≈0 atol=0.2 - @test var(y)≈1 atol=0.2 - - testmode!(m, true) # should override istraining - @test evalwgrad(m, x) == x - - testmode!(m, false) - y = evalwgrad(m, x) - @test mean(y)≈0 atol=0.2 - @test var(y)≈1 atol=0.2 - - # Known good value ranges - # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 - x = ones(100) - if isempty(rng_kwargs) - @test 40 < sum(evalwgrad(m, x)) < 130 - else - # FIXME: this breaks spuriously for MersenneTwister - @test_skip 40 < sum(evalwgrad(m, x)) < 130 end +end + +@testset "AlphaDropout" begin + @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0, 2.0, 3.0] + @test x == AlphaDropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) + + x = randn(1000) # large enough to prevent flaky test + m = AlphaDropout(0.5; rng_kwargs...) + + y = evalwgrad(m, x) + # Should preserve unit mean and variance + @test mean(y) ≈ 0 atol = 0.2 + @test var(y) ≈ 1 atol = 0.2 - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG + testmode!(m, true) # should override istraining + @test evalwgrad(m, x) == x + + testmode!(m, false) + y = evalwgrad(m, x) + @test mean(y) ≈ 0 atol = 0.2 + @test var(y) ≈ 1 atol = 0.2 + + # Known good value ranges + # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 + x = ones(100) + if isempty(rng_kwargs) + @test 40 < sum(evalwgrad(m, x)) < 130 + else + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 + end + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng isa Random._GLOBAL_RNG + @test cpu(m).rng === only(values(rng_kwargs)) end - else - @test cpu(m).rng === only(values(rng_kwargs)) end -end end +end @testset "BatchNorm" begin - let m = BatchNorm(2), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2), x = [ + 1.0 3.0 5.0 + 2.0 4.0 6.0 + ] @test Flux.hasaffine(m) == true @test length(Flux.params(m)) == 2 @@ -162,8 +167,10 @@ end end end # with activation function - let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2, sigmoid), x = [ + 1.0 3.0 5.0 + 2.0 4.0 6.0 + ] y = m(x) @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) @inferred m(x) @@ -203,7 +210,8 @@ end @testset "InstanceNorm" begin # begin tests - let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2), + let m = InstanceNorm(2; affine = true, track_stats = true), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test length(Flux.params(m)) == 2 @@ -235,21 +243,23 @@ end # (1. - .1) * 0 + .1 * (5. + 11.) 
/ 2 = .8 N = ndims(x) @test m.μ ≈ [0.5, 0.8] - n = prod(size(x, i) for i in 1:(N - 2)) + n = prod(size(x, i) for i = 1:(N-2)) corr = n / (n - 1) - σ² = var(x; dims = 1:(N - 2), corrected = false) + σ² = var(x; dims = 1:(N-2), corrected = false) @test m.σ² ≈ 0.1 * corr * vec(mean(σ²; dims = N)) .+ 0.9 * 1 y = m(x) @test length(m.μ) == 2 @test length(m.σ²) == 2 - @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5 + @test y ≈ (x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol = + 1.0e-5 @inferred m(x) end # with activation function - let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) x = Float64.(x) @@ -260,13 +270,14 @@ end y = m(x) # inference time after a training step μ = reshape(m.μ, affine_shape...) σ² = reshape(m.σ², affine_shape...) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end # with activation function - let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true @@ -275,12 +286,13 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end - let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), + let m = InstanceNorm(2, sigmoid), + sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false @@ -290,12 +302,13 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 @inferred m(x) end - let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(InstanceNorm(2; affine = true)), + sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) @@ -306,12 +319,13 @@ end end # check that μ, σ², and the output are the correct size for higher rank tensors - let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6), + let m = InstanceNorm(2; affine = true, track_stats = true), + sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) y = evalwgrad(m, x) - @test size(m.μ) == (sizes[end - 1],) - @test size(m.σ²) == (sizes[end - 1],) + @test size(m.μ) == (sizes[end-1],) + @test size(m.σ²) == (sizes[end-1],) @test size(y) == sizes @inferred m(x) @@ -319,11 +333,11 @@ end # show that instance norm is equal to batch norm when channel and batch dims are squashed let m_inorm = trainmode!(InstanceNorm(2; affine = true)), - m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + m_bnorm = trainmode!(BatchNorm(12)), + sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @test m_inorm(x) == - reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes) + @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:(end-2)]..., :, 1))), sizes) end let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1) @@ -365,7 +379,8 @@ end # begin 
tests squeeze(x) = dropdims(x; dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions - let m = GroupNorm(4, 2; track_stats = true), sizes = (3, 4, 2), + let m = GroupNorm(4, 2; track_stats = true), + sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) @test length(Flux.params(m)) == 2 @@ -409,19 +424,20 @@ end y = m(x) out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5) - @test y≈reshape(out, size(x)) atol=1.0e-5 + @test y ≈ reshape(out, size(x)) atol = 1.0e-5 end # with activation function - let m = GroupNorm(4, 2, sigmoid; track_stats = true), sizes = (3, 4, 2), + let m = GroupNorm(4, 2, sigmoid; track_stats = true), + sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) x = Float32.(x) μ_affine_shape = ones(Int, length(sizes) + 1) - μ_affine_shape[end - 1] = 2 # Number of groups + μ_affine_shape[end-1] = 2 # Number of groups affine_shape = ones(Int, length(sizes) + 1) - affine_shape[end - 2] = 2 # Channels per group - affine_shape[end - 1] = 2 # Number of groups + affine_shape[end-2] = 2 # Channels per group + affine_shape[end-1] = 2 # Number of groups affine_shape[1] = sizes[1] affine_shape[end] = sizes[end] @@ -429,12 +445,18 @@ end y = m(x) x_ = reshape(x, affine_shape...) - out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./ - sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)), og_shape) - @test y≈out atol=1e-7 + out = reshape( + sigmoid.( + (x_ .- reshape(m.μ, μ_affine_shape...)) ./ + sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ) + ), + og_shape, + ) + @test y ≈ out atol = 1e-7 end - let m = trainmode!(GroupNorm(2, 2; track_stats = true)), sizes = (2, 4, 1, 2, 3), + let m = trainmode!(GroupNorm(2, 2; track_stats = true)), + sizes = (2, 4, 1, 2, 3), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) @@ -443,7 +465,8 @@ end end # check that μ, σ², and the output are the correct size for higher rank tensors - let m = GroupNorm(4, 2; track_stats = true), sizes = (5, 5, 3, 4, 4, 6), + let m = GroupNorm(4, 2; track_stats = true), + sizes = (5, 5, 3, 4, 4, 6), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) y = evalwgrad(m, x) @@ -453,7 +476,8 @@ end end # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode!(InstanceNorm(4; affine = true)), GN = trainmode!(GroupNorm(4, 4)), + let IN = trainmode!(InstanceNorm(4; affine = true)), + GN = trainmode!(GroupNorm(4, 4)), sizes = (2, 2, 3, 4, 5), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @@ -461,7 +485,8 @@ end end # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4, 4)), + let BN = trainmode!(BatchNorm(4)), + GN = trainmode!(GroupNorm(4, 4)), sizes = (2, 2, 3, 4, 1), x = Float32.(reshape(collect(1:prod(sizes)), sizes)) diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index 19c1506095..225c4d83a9 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -2,7 +2,7 @@ using LinearAlgebra # Ref FluxML/Flux.jl#1209 1D input @testset "BPTT-1D" begin - seq = [rand(Float32, 2) for i in 1:3] + seq = [rand(Float32, 2) for i = 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -10,23 +10,29 @@ using LinearAlgebra return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + - Wh * - tanh.(rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi 
* seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) + bptt = gradient( + Wh -> sum( + tanh.( + rnn.cell.Wi * seq[3] + + Wh * + tanh.( + rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b + ) + + rnn.cell.b + ), + ), + rnn.cell.Wh, + ) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end # Ref FluxML/Flux.jl#1209 2D input @testset "BPTT-2D" begin - seq = [rand(Float32, (2, 1)) for i in 1:3] + seq = [rand(Float32, (2, 1)) for i = 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -34,16 +40,22 @@ end return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + - Wh * - tanh.(rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) + bptt = gradient( + Wh -> sum( + tanh.( + rnn.cell.Wi * seq[3] + + Wh * + tanh.( + rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b + ) + + rnn.cell.b + ), + ), + rnn.cell.Wh, + ) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end @@ -58,46 +70,44 @@ end Flux.reset!(rnn) bptt = gradient(rnn.cell.Wh) do Wh # calculate state 1 - s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + - Wh * rnn.cell.state0 + - rnn.cell.b) + s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + Wh * rnn.cell.state0 + rnn.cell.b) #calculate state 2 - s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + - Wh * s1 + - rnn.cell.b) + s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + Wh * s1 + rnn.cell.b) #calculate state 3 - s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + - Wh * s2 + - rnn.cell.b) + s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + Wh * s2 + rnn.cell.b) return sum(s3) # loss is sum of state 3 end @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end -@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] - m1 = R(3 => 5) - m2 = R(3 => 5) - m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation - x1 = rand(Float32, 3) - x2 = rand(Float32, 3, 1) - x3 = rand(Float32, 3, 1, 2) - Flux.reset!(m1) - Flux.reset!(m2) - Flux.reset!(m3) - @test size(m1(x1)) == (5,) - @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m2(x2)) == (5, 1) - @test size(m2(x2)) == (5, 1) - @test size(m3(x3)) == (5, 1, 2) - @test size(m3(x3)) == (5, 1, 2) -end end +@testset "RNN-shapes" begin + @testset for R in [RNN, GRU, LSTM, GRUv3] + m1 = R(3 => 5) + m2 = R(3 => 5) + m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation + x1 = rand(Float32, 3) + x2 = rand(Float32, 3, 1) + x3 = rand(Float32, 3, 1, 2) + Flux.reset!(m1) + Flux.reset!(m2) + Flux.reset!(m3) + @test size(m1(x1)) == (5,) + @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m2(x2)) == (5, 1) + @test size(m2(x2)) == (5, 1) + @test size(m3(x3)) == (5, 1, 2) + @test size(m3(x3)) == (5, 1, 2) + end +end -@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] - m = R(3 => 5) - x = rand(Float64, 3, 1) - Flux.reset!(m) - @test_throws MethodError m(x) -end end +@testset "RNN-input-state-eltypes" begin + @testset for R in [RNN, GRU, LSTM, GRUv3] + m = R(3 => 5) + x = rand(Float64, 3, 1) + Flux.reset!(m) + @test_throws MethodError m(x) + end +end @testset "multigate" begin x = rand(6, 5) @@ -113,14 +123,19 @@ end x = rand(3, 3, 1, 2, 4) @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) @test collect(@inferred(Flux.eachlastdim(x))) == 
collect(eachslice(x; dims = ndims(x))) - slicedim = (size(x)[1:(end - 1)]..., 1) + slicedim = (size(x)[1:(end-1)]..., 1) res, (dx,) = Flux.withgradient(x) do x x1, _, x3, _ = Flux.eachlastdim(x) return sum(x1) + sum(x3 .* 3) end @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) - @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), - fill(3, slicedim), fill(0, slicedim); dims = ndims(x)) + @test dx ≈ cat( + fill(1, slicedim), + fill(0, slicedim), + fill(3, slicedim), + fill(0, slicedim); + dims = ndims(x), + ) end @testset "∇eachlastdim" begin @@ -132,40 +147,57 @@ end NoTangent = Flux.Zygote.NoTangent abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()] @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x)) - x2 = rand(Float64, x_size[1:(end - 1)]) - x3 = rand(Float64, x_size[1:(end - 1)]) + x2 = rand(Float64, x_size[1:(end-1)]) + x3 = rand(Float64, x_size[1:(end-1)]) mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()] - @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:(end - 1)]), - x2, - x3, - zeros(x_size[1:(end - 1)]); dims = ndims(x)) + @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ + cat(zeros(x_size[1:(end-1)]), x2, x3, zeros(x_size[1:(end-1)]); dims = ndims(x)) end @testset "Different Internal Matrix Types" begin - R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - # don't want to pull in SparseArrays just for this test, but there aren't any - # non-square structured matrix types in LinearAlgebra. so we will use a different - # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the - # same type. - L = Flux.Recur(Flux.LSTMCell(rand(5*4, 3), rand(1:20, 5*4, 5), rand(5*4), (rand(5, 1), rand(5, 1)))) - G = Flux.Recur(Flux.GRUCell(rand(5*3, 3), rand(1:20, 5*3, 5), rand(5*3), rand(5, 1))) - G3 = Flux.Recur(Flux.GRUv3Cell(rand(5*3, 3), rand(1:20, 5*2, 5), rand(5*3), Tridiagonal(rand(5, 5)), rand(5, 1))) + R = Flux.Recur( + Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)), + ) + # don't want to pull in SparseArrays just for this test, but there aren't any + # non-square structured matrix types in LinearAlgebra. so we will use a different + # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the + # same type. 
+ L = Flux.Recur( + Flux.LSTMCell( + rand(5 * 4, 3), + rand(1:20, 5 * 4, 5), + rand(5 * 4), + (rand(5, 1), rand(5, 1)), + ), + ) + G = Flux.Recur( + Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), rand(5, 1)), + ) + G3 = Flux.Recur( + Flux.GRUv3Cell( + rand(5 * 3, 3), + rand(1:20, 5 * 2, 5), + rand(5 * 3), + Tridiagonal(rand(5, 5)), + rand(5, 1), + ), + ) - for m in [R, L, G, G3] + for m in [R, L, G, G3] - x1 = rand(3) - x2 = rand(3, 1) - x3 = rand(3, 1, 2) - Flux.reset!(m) - @test size(m(x1)) == (5,) - Flux.reset!(m) - @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - end + x1 = rand(3) + x2 = rand(3, 1) + x3 = rand(3, 1, 2) + Flux.reset!(m) + @test size(m(x1)) == (5,) + Flux.reset!(m) + @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + end end diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl index c4e1c30341..66831d3d68 100644 --- a/test/layers/upsample.jl +++ b/test/layers/upsample.jl @@ -2,19 +2,19 @@ m = Upsample(:bilinear; scale = (2, 3)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 12, 2, 3) m = Upsample(:bilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (9, 12, 2, 3) m = Upsample(:bilinear; size = (4, 6)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (4, 6, 2, 3) end @@ -22,19 +22,19 @@ end m = Upsample(:trilinear; scale = (2, 3, 2)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (6, 12, 4, 3, 4) m = Upsample(:trilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (9, 12, 6, 3, 4) m = Upsample(:trilinear; size = (4, 6, 4)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32, 5} + @test y isa Array{Float32,5} @test size(y) == (4, 6, 4, 3, 4) end @@ -42,24 +42,24 @@ end x = rand(Float32, 3, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (6, 2, 3) x = rand(Float32, 3, 4, 2, 3) m = Upsample(:nearest; scale = (2, 3)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 12, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 4, 2, 3) m = Upsample(:nearest; scale = 2) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 8, 2, 3) m = Upsample(2) @@ -68,7 +68,7 @@ end m = Upsample(:nearest; size = (6, 8)) y = m(x) - @test y isa Array{Float32, 4} + @test y isa Array{Float32,4} @test size(y) == (6, 8, 2, 3) end @@ -76,12 +76,12 @@ end m = PixelShuffle(2) x = rand(Float32, 3, 18, 3) y = m(x) - @test y isa Array{Float32, 3} + @test y isa Array{Float32,3} @test size(y) == (6, 9, 3) m = PixelShuffle(3) x = rand(Float32, 3, 4, 18, 3) y = m(x) - @test y isa Array{Float32, 4} + @test y isa 
Array{Float32,4} @test size(y) == (9, 12, 2, 3) end diff --git a/test/losses.jl b/test/losses.jl index f8f261c7c8..7984941c78 100644 --- a/test/losses.jl +++ b/test/losses.jl @@ -1,22 +1,35 @@ using Test using Flux: onehotbatch, σ -using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy +using Flux.Losses: + mse, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy using Flux.Losses: xlogx, xlogy # group here all losses, used in tests -const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, - Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, - Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, +const ALL_LOSSES = [ + Flux.Losses.mse, + Flux.Losses.mae, + Flux.Losses.msle, + Flux.Losses.crossentropy, + Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, + Flux.Losses.logitbinarycrossentropy, Flux.Losses.kldivergence, Flux.Losses.huber_loss, Flux.Losses.tversky_loss, Flux.Losses.dice_coeff_loss, Flux.Losses.poisson_loss, - Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, - Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, - Flux.Losses.siamese_contrastive_loss] + Flux.Losses.hinge_loss, + Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, + Flux.Losses.focal_loss, + Flux.Losses.siamese_contrastive_loss, +] @testset "xlogx & xlogy" begin @test iszero(xlogx(0)) @@ -45,13 +58,19 @@ y = [1, 1, 0, 0] @test mse(0 + 0im, 1 + 1im) == 2 end -@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end +@testset "mae" begin + @test Flux.mae(ŷ, y) ≈ 1 / 2 +end -@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end +@testset "huber_loss" begin + @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 +end y = [123.0, 456.0, 789.0] ŷ = [345.0, 332.0, 789.0] -@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end +@testset "msle" begin + @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 +end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) @@ -105,8 +124,10 @@ yls = y .* (1 - 2sf) .+ sf -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ)) @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈ mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ))) - @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - - (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) + @test binarycrossentropy(σ.(logŷ), y) ≈ mean( + -y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - + (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))), + ) @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9]) # constant label end @@ -170,68 +191,94 @@ end @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end -@testset "no spurious promotions" begin for T in (Float32, Float64) - y = rand(T, 2) - ŷ = rand(T, 2) - for f in ALL_LOSSES - fwd, back = Flux.pullback(f, ŷ, y) - @test fwd isa T - @test eltype(back(one(T))[1]) == T +@testset "no spurious promotions" begin + for T in (Float32, Float64) + y = rand(T, 2) + ŷ = rand(T, 2) + for f in ALL_LOSSES + fwd, back = Flux.pullback(f, ŷ, y) + @test fwd isa T + @test eltype(back(one(T))[1]) == T + end end -end end +end @testset "binary_focal_loss" begin - y = [0 1 0 - 1 0 1] - ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] - - y1 = [1 0 - 0 1] - ŷ1 = [0.6 0.3 - 0.4 0.7] + y = [ + 0 1 0 + 1 0 1 + ] + ŷ = [ + 0.268941 0.5 0.268941 + 0.731059 0.5 0.731059 + ] + + y1 = [ + 1 0 + 0 1 + ] + ŷ1 = [ + 0.6 0.3 + 0.4 0.7 + ] @test 
Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222 @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y) end @testset "focal_loss" begin - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y1 = [1 0 - 0 0 - 0 1] - ŷ1 = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] + y1 = [ + 1 0 + 0 0 + 0 1 + ] + ŷ1 = [ + 0.4 0.2 + 0.5 0.5 + 0.1 0.3 + ] @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628 @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157 @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y) end @testset "siamese_contrastive_loss" begin - y = [1 0 - 0 0 - 0 1] - ŷ = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] - y1 = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] + y = [ + 1 0 + 0 0 + 0 1 + ] + ŷ = [ + 0.4 0.2 + 0.5 0.5 + 0.1 0.3 + ] + y1 = [ + 1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0 + ] ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y2 = [1 - 0 - 0 - 1 - 1] - ŷ2 = [0.6 - 0.4 - 0.1 - 0.2 - 0.7] + y2 = [ + 1 + 0 + 0 + 1 + 1 + ] + ŷ2 = [ + 0.6 + 0.4 + 0.1 + 0.2 + 0.7 + ] @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 0.5f0) ≈ 0.10000000000000002 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 1.5f0) ≈ 0.5333333333333333 @@ -246,10 +293,14 @@ end @test Flux.siamese_contrastive_loss(ŷ1, y1; margin = 0) ≈ 0.13161165f0 @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 - @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, - y1, - margin = -0.5) - @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, - y, - margin = -1) + @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss( + ŷ1, + y1, + margin = -0.5, + ) + @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss( + ŷ, + y, + margin = -1, + ) end diff --git a/test/optimise.jl b/test/optimise.jl index 49e5c6c913..9f9f788c01 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -10,14 +10,27 @@ using Random # so that w and w' are different Random.seed!(84) w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] + @testset for opt in [ + AdamW(), + AdaGrad(0.1), + AdaMax(), + AdaDelta(0.9), + AMSGrad(), + NAdam(), + RAdam(), + Descent(0.1), + Adam(), + OAdam(), + AdaBelief(), + Nesterov(), + RMSProp(), + Momentum(), + ] Random.seed!(42) w′ = randn(10, 10) b = false loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b) - for t in 1:(10^5) + for t = 1:(10^5) θ = params([w′, b]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -35,7 +48,7 @@ end w′ = randn(10, 10) loss(x) = Flux.Losses.mse(w * x, w′ * x) opt = Optimiser(Opt(), Adam(0.001)) - for t in 1:(10^5) + for t = 1:(10^5) θ = Params([w′]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -48,26 +61,32 @@ end @testset "Training Loop" begin i = 0 l = 1 - Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent()) + Flux.train!( + () -> (sleep(0.1); Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent(), + ) @test i == 0 #all skipped - Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent()) + Flux.train!( + () -> 
(sleep(0.1); i == 8 && Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent(), + ) @test i == 8 #skip after i hit 8 i = 0 - Flux.train!(() -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(); - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) + Flux.train!( + () -> (sleep(0.1); i += 1; l), + Params([]), + Iterators.repeated((), 100), + Descent(); + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1), + ) @test 3 < i < 50 @@ -109,7 +128,7 @@ end loss(x) = Flux.Losses.mse(w * x, w1 * x) flag = 1 decay_steps = [] - for t in 1:(10^5) + for t = 1:(10^5) prev_eta = o.eta θ = Params([w1]) x = rand(10) @@ -129,7 +148,7 @@ end @test flag == 1 # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). ground_truth = [] - for i in 1:4 + for i = 1:4 push!(ground_truth, 1000 * i) # Expected decay steps for this example. end @test decay_steps == ground_truth @@ -204,30 +223,31 @@ end # wreaks all sorts of havoc on our training loops. This test ensures that # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin -# Test every optimizer that has momentum internally -for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im * ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end + # Test every optimizer that has momentum internally + for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] + # Our "model" is just a complex number + w = zeros(ComplexF32, 1) + + # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` + function loss() + # Deterministic training data is the best training data + x = ones(1, 1) + 1im * ones(1, 1) + + # Manually implement `mse()` to allow demonstration of brokenness + # on older Flux builds that don't have a fixed `mse()` + return sum(abs2.(w * x .- conj(x))) + end - params = Flux.Params([w]) - opt = opt_ctor(1e-2) + params = Flux.Params([w]) + opt = opt_ctor(1e-2) - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) + # Train for 10 iterations, enforcing that loss is monotonically decreasing + last_loss = Inf + for idx = 1:10 + grads = Flux.gradient(loss, params) + @test loss() < last_loss + last_loss = loss() + Flux.update!(opt, params, grads) + end end -end end +end diff --git a/test/outputsize.jl b/test/outputsize.jl index c3c2c7ae7b..64eda2af31 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -3,7 +3,7 @@ @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) m = Dense(10, 5) - @test_throws DimensionMismatch outputsize(m, (5, 2))==(5, 1) + @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) @test outputsize(m, (10,); padbatch = true) == (5, 1) m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) @@ -55,12 +55,33 @@ end @test outputsize(m, (2, 7), (3, 7)) == (13, 7) end -@testset "activations" begin @testset for f in [celu, elu, gelu, 
hardsigmoid, hardtanh, - leakyrelu, lisht, logcosh, logσ, mish, - relu, relu6, rrelu, selu, σ, softplus, - softshrink, softsign, swish, tanhshrink, trelu] - @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) -end end +@testset "activations" begin + @testset for f in [ + celu, + elu, + gelu, + hardsigmoid, + hardtanh, + leakyrelu, + lisht, + logcosh, + logσ, + mish, + relu, + relu6, + rrelu, + selu, + σ, + softplus, + softshrink, + softsign, + swish, + tanhshrink, + trelu, + ] + @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) + end +end @testset "conv" begin m = Conv((3, 3), 3 => 16) diff --git a/test/runtests.jl b/test/runtests.jl index 4189ea0dd5..2a1b2913ca 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,12 +10,18 @@ using CUDA Random.seed!(0) -@testset verbose=true "Flux.jl" begin - @testset "Utils" begin include("utils.jl") end +@testset verbose = true "Flux.jl" begin + @testset "Utils" begin + include("utils.jl") + end - @testset "Optimise" begin include("optimise.jl") end + @testset "Optimise" begin + include("optimise.jl") + end - @testset "Data" begin include("data.jl") end + @testset "Data" begin + include("data.jl") + end @testset "Losses" begin include("losses.jl") @@ -38,11 +44,13 @@ Random.seed!(0) include("outputsize.jl") end - @testset "CUDA" begin if CUDA.functional() - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" - end end + @testset "CUDA" begin + if CUDA.functional() + include("cuda/runtests.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end + end @static if VERSION == v"1.6" using Documenter diff --git a/test/utils.jl b/test/utils.jl index f71e336c8a..7da452ba02 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,8 +1,22 @@ using Flux -using Flux: throttle, nfan, glorot_uniform, glorot_normal, - kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, - sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadparams!, loadmodel! +using Flux: + throttle, + nfan, + glorot_uniform, + glorot_normal, + kaiming_normal, + kaiming_uniform, + orthogonal, + truncated_normal, + sparse_init, + identity_init, + unstack, + batch, + unbatch, + unsqueeze, + params, + loadparams!, + loadmodel! using MLUtils using StatsBase: var, std using Statistics, LinearAlgebra @@ -71,8 +85,10 @@ end end @testset "Basics: $init" for init in [ - glorot_uniform, glorot_normal, - kaiming_uniform, kaiming_normal, + glorot_uniform, + glorot_normal, + kaiming_uniform, + kaiming_normal, orthogonal, sparse_init, truncated_normal, @@ -89,7 +105,7 @@ end end @test size(init(3, 4)) == (3, 4) # only init(size...) 
is accepted: - @test_throws MethodError size(init((3, 4, 5)))==(3, 4, 5) + @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5) # rng, and currying: @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) @@ -164,8 +180,8 @@ end for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] expected_zeros = ceil(Integer, n_in * sparsity) v = sparse_init(n_in, n_out; sparsity = sparsity, std = σ) - @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) - @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + @test all([sum(v[:, col] .== 0) == expected_zeros for col = 1:n_out]) + @test 0.9 * σ < std(v[v.!=0]) < 1.1 * σ end @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32 @@ -173,9 +189,9 @@ end @testset "truncated_normal" begin m = truncated_normal(100, 100) - @test minimum(m)≈-2 atol=0.05 # default arguments - @test maximum(m)≈2 atol=0.05 - @test mean(m)≈0 atol=0.1 + @test minimum(m) ≈ -2 atol = 0.05 # default arguments + @test maximum(m) ≈ 2 atol = 0.05 + @test mean(m) ≈ 0 atol = 0.1 size100 = (100, 100, 100) for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] @@ -225,15 +241,12 @@ end indata = reshape(collect(Float32, 1:9), 3, 3) @test l(indata) == indata end - @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, - ConvTranspose, - CrossCor), - kernelsize in ((1,), - (3,), - (1, 3), - (3, 5), - (3, 5, - 7)) + @testset "$layer ID mapping with kernelsize $kernelsize" for layer in ( + Conv, + ConvTranspose, + CrossCor, + ), + kernelsize in ((1,), (3,), (1, 3), (3, 5), (3, 5, 7)) nch = 3 l = layer(kernelsize, nch => nch; init = identity_init, pad = SamePad()) @@ -244,10 +257,18 @@ end @testset "Inception identity" begin insize = 7 path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad()) - path2 = Conv((3, 5), insize => 3; init = identity_init(; shift = (0, 0, 2, 0)), - pad = SamePad()) - path3 = Conv((5, 7), insize => 2; init = identity_init(; shift = (0, 0, 5, 0)), - pad = SamePad()) + path2 = Conv( + (3, 5), + insize => 3; + init = identity_init(; shift = (0, 0, 2, 0)), + pad = SamePad(), + ) + path3 = Conv( + (5, 7), + insize => 2; + init = identity_init(; shift = (0, 0, 5, 0)), + pad = SamePad(), + ) block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3) indata = randn(Float32, 9, 9, 7, 2) @@ -295,7 +316,7 @@ end @test f32(m).bias === m.bias === false @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *), - s in ((1,), (2, 3)) + s in ((1,), (2, 3)) o = ones(s) z = zeros(s) @@ -346,10 +367,12 @@ end end @testset "Batching" begin - stacked_array = [8 9 3 5 - 9 6 6 9 - 9 1 7 2 - 7 4 10 6] + stacked_array = [ + 8 9 3 5 + 9 6 6 9 + 9 1 7 2 + 7 4 10 6 + ] unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] @test unbatch(stacked_array) == unstacked_array @test batch(unstacked_array) == stacked_array @@ -359,20 +382,18 @@ end @test unbatch([1, 2, 3]) == [1, 2, 3] # generic iterable - @test batch(ones(2) for i in 1:3) == ones(2, 3) - @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3] + @test batch(ones(2) for i = 1:3) == ones(2, 3) + @test unbatch(ones(2, 3)) == [ones(2) for i = 1:3] end @testset "Param remapping" begin ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) 
# accepts dims in reverse order to Dense dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) - dm(bias) = Chain(dl(3, 5, bias), - dl(5, 4, bias), - dl(4, 3, bias)) + dm(bias) = Chain(dl(3, 5, bias), dl(5, 4, bias), dl(4, 3, bias)) nobias(n) = false - testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, - dm(bt))) + testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in + enumerate(zip(m, dm(bt))) @test l1.weight == l2.weight @test l1.bias == l2.bias @test_skip typeof(l1.bias) === typeof(l2.bias) @@ -420,8 +441,12 @@ end # tests for BatchNorm and Dropout m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) - m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), - Dropout(0.1)) + m2 = Chain( + Conv((3, 3), 3 => 16), + BatchNorm(16), + x -> reshape(x, :, size(x)[end]), + Dropout(0.1), + ) m2[2].μ .= rand(Float32, size(m2[2].μ)...) loadmodel!(m1, m2) # non-trainable parameters are copied as well @@ -436,36 +461,40 @@ end # tests MaxPool # tests testmode!/trainmode! is not copied # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model - chain1 = Chain(Dropout(0.2), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 32 => 16, relu), - Dropout(0.2), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 16 => 10, relu), - Dropout(0.2), - x -> reshape(x, :, size(x, 4)), - Dropout(0.2), - Dense(90, 10), - softmax) - chain2 = Chain([Dropout(0.1), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 32 => 16, relu), - Dropout(0.1), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 16 => 10, relu), - Dropout(0.1), - x -> reshape(x, :, size(x, 4)), - Dropout(0.1), - Dense(90, 10), - softmax]) + chain1 = Chain( + Dropout(0.2), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 32 => 16, relu), + Dropout(0.2), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 16 => 10, relu), + Dropout(0.2), + x -> reshape(x, :, size(x, 4)), + Dropout(0.2), + Dense(90, 10), + softmax, + ) + chain2 = Chain([ + Dropout(0.1), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 32 => 16, relu), + Dropout(0.1), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 16 => 10, relu), + Dropout(0.1), + x -> reshape(x, :, size(x, 4)), + Dropout(0.1), + Dense(90, 10), + softmax, + ]) chain2[3].μ .= 5.0f0 chain2[3].σ² .= 2.0f0 testmode!(chain2) @@ -473,7 +502,7 @@ end for (dst, src) in zip(chain1, chain2) if dst isa Dropout @test dst.p == 0.2 - elseif dst isa Union{Conv, Dense} + elseif dst isa Union{Conv,Dense} @test dst.weight == src.weight @test dst.bias == src.bias elseif dst isa MaxPool @@ -486,12 +515,12 @@ end end # copy only a subset of the model - chain1[end - 1].weight .= 1.0f0 + chain1[end-1].weight .= 1.0f0 chain1[3].μ .= 3.0f0 chain1[2].bias .= 5.0f0 - loadmodel!(chain2[end - 1], chain1[end - 1]) + loadmodel!(chain2[end-1], chain1[end-1]) loadmodel!(chain2[3], chain1[3]) - @test chain2[end - 1].weight == chain1[end - 1].weight + @test chain2[end-1].weight == chain1[end-1].weight @test chain2[3].μ == chain1[3].μ @test chain2[2].bias != chain1[2].bias @@ -602,17 +631,18 @@ end @test modules[5] === m2 @test modules[6] === m3 - mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), - Dense(2, 2, abs2))) + mod_par = Flux.modules( + Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), 
Dense(2, 2, abs2)), + ) @test length(mod_par) == 5 mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4))) @test length(mod_rnn) == 6 @test mod_rnn[end] isa Flux.LSTMCell - mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), - +), - LayerNorm(8))) + mod_skip = Flux.modules( + Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), +), LayerNorm(8)), + ) @test length(mod_skip) == 6 @test mod_skip[end] isa Flux.Scale end @@ -631,7 +661,7 @@ end end n_iter = 0 - for i in 1:length(v) + for i = 1:length(v) trigger(i) && break n_iter += 1 end @@ -653,8 +683,11 @@ end end @testset "distance" begin - es = Flux.early_stopping(identity, 10; - distance = (best_score, score) -> score - best_score) + es = Flux.early_stopping( + identity, + 10; + distance = (best_score, score) -> score - best_score, + ) n_iter = 0 while n_iter < 99 @@ -718,8 +751,7 @@ end return out = m.dense(x) end - model = TwoDenses(Dense(3, 1), - Dense(3, 2)) + model = TwoDenses(Dense(3, 1), Dense(3, 2)) p, re = Flux.destructure(model) x = [1.0, 2.0, 3.0] @@ -781,8 +813,10 @@ end n_outputs = [3, 7] data = rand(Float32, n_input, n_batch) - model = Chain(Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2]))) + model = Chain( + Dense(n_input, n_shared), + Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])), + ) pvec, re = Flux.destructure(model) loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term @@ -792,16 +826,20 @@ end end end -@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional() - struct Wrapped{T} - x::T +@testset "Rrule" begin + @testset "issue 2033" begin + if CUDA.functional() + struct Wrapped{T} + x::T + end + y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) + @test y isa Wrapped{<:CuArray} + end end - y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) - @test y isa Wrapped{<:CuArray} -end end end +end # make sure rng_from_array is non_differentiable @testset "rng_from_array" begin - m(x) = (rand(rng_from_array(x)) * x)[1] + m(x) = (rand(rng_from_array(x))*x)[1] gradient(m, ones(2)) end
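
The hunks above are purely mechanical reformatting of the test suite. For reference, a minimal sketch of rerunning the formatter over a local checkout with JuliaFormatter.jl; calling format(".") with default options is an assumption and may not reproduce these exact hunks if a project-specific configuration was used:

    # Assumes JuliaFormatter.jl is installed in the active Julia environment.
    using JuliaFormatter
    # Rewrites every *.jl file under the current directory in place and
    # returns a Bool indicating whether the files were already formatted.
    format(".")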