From 289aa9d7c252700cd5345e10b85c7c6a194a2501 Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Thu, 6 Oct 2022 19:24:59 +0530
Subject: [PATCH] Add .JuliaFormatter.toml and reformat the codebase in SciML style

---
 .JuliaFormatter.toml         |   9 +
 docs/make.jl                 | 108 ++++++------
 perf/bench_utils.jl          |   6 +-
 perf/recurrent.jl            |   2 +-
 perf/vgg.jl                  |  76 ++++-----
 src/Flux.jl                  | 118 +++++++------
 src/cuda/cudnn.jl            |  52 +++---
 src/deprecations.jl          |  66 +++-----
 src/functor.jl               |  23 +--
 src/layers/basic.jl          | 173 ++++++++-----------
 src/layers/conv.jl           | 315 ++++++++++++++++-------------------
 src/layers/normalise.jl      | 134 +++++++--------
 src/layers/recurrent.jl      | 166 +++++++++---------
 src/layers/show.jl           |  74 ++++----
 src/layers/upsample.jl       |  12 +-
 src/loading.jl               |  24 +--
 src/losses/Losses.jl         |  34 ++--
 src/losses/functions.jl      |   6 +-
 src/losses/utils.jl          |  14 +-
 src/optimise/Optimise.jl     |  46 ++---
 src/optimise/optimisers.jl   | 131 +++++++--------
 src/optimise/train.jl        |  34 ++--
 src/outputsize.jl            |  14 +-
 src/utils.jl                 |  37 ++--
 test/ctc-gpu.jl              |  24 ++-
 test/ctc.jl                  |  24 ++-
 test/cuda/cuda.jl            |   4 +-
 test/cuda/curnn.jl           |  97 ++++++-----
 test/cuda/layers.jl          | 197 ++++++++++------------
 test/cuda/losses.jl          |  23 +--
 test/cuda/test_utils.jl      |  20 +--
 test/data.jl                 |  20 +--
 test/layers/basic.jl         |  64 ++++---
 test/layers/conv.jl          |  67 ++++----
 test/layers/normalisation.jl | 266 ++++++++++++++---------------
 test/layers/recurrent.jl     | 162 ++++++++----------
 test/layers/upsample.jl      |  26 +--
 test/losses.jl               | 162 +++++++-----------
 test/optimise.jl             |  89 +++++-----
 test/outputsize.jl           |  54 +++---
 test/runtests.jl             |  26 +--
 test/utils.jl                | 227 ++++++++++++-------------
 42 files changed, 1448 insertions(+), 1778 deletions(-)
 create mode 100644 .JuliaFormatter.toml

diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
new file mode 100644
index 0000000000..07fcf66f1f
--- /dev/null
+++ b/.JuliaFormatter.toml
@@ -0,0 +1,9 @@
+style = "sciml"
+whitespace_in_kwargs = true
+format_docstrings = true
+always_for_in = true
+join_lines_based_on_source = true
+separate_kwargs_with_semicolon = true
+always_use_return = true
+margin = 92
+indent = 4
diff --git a/docs/make.jl b/docs/make.jl
index ecfaaa256c..dcf4081aaf 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,61 +1,61 @@
 using Documenter,
-    Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore
+      Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore
 
 DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true)
 
 makedocs(;
-    modules = [
-        Flux,
-        NNlib,
-        Functors,
-        MLUtils,
-        BSON,
-        Optimisers,
-        OneHotArrays,
-        Zygote,
-        ChainRulesCore,
-        Base,
-    ],
-    doctest = false,
-    sitename = "Flux",
-    # strict = [:cross_references,],
-    pages = [
-        "Home" => "index.md",
-        "Building Models" => [
-            "Overview" => "models/overview.md",
-            "Basics" => "models/basics.md",
-            "Recurrence" => "models/recurrence.md",
-            "Layer Reference" => "models/layers.md",
-            "Loss Functions" => "models/losses.md",
-            "Regularisation" => "models/regularisation.md",
-            "Custom Layers" => "models/advanced.md",
-            "NNlib.jl" => "models/nnlib.md",
-            "Activation Functions" => "models/activation.md",
-        ],
-        "Handling Data" =>
-            ["MLUtils.jl" => "data/mlutils.md", "OneHotArrays.jl" => "data/onehot.md"],
-        "Training Models" => [
-            "Optimisers" => "training/optimisers.md",
-            "Training" => "training/training.md",
-            "Callback Helpers" => "training/callbacks.md",
-            "Zygote.jl" => "training/zygote.md",
-        ],
-        "GPU Support" => "gpu.md",
-        "Model Tools" => [
-            "Saving & Loading" => "saving.md",
-            "Shape Inference" => "outputsize.md",
-            "Weight Initialisation" => "utilities.md",
-            "Functors.jl" => "models/functors.md",
-        ],
-        "Performance Tips" => "performance.md",
-        "Flux's Ecosystem" => "ecosystem.md",
-    ],
-    format = Documenter.HTML(;
-        sidebar_sitename = false,
-        analytics = "UA-36890222-9",
-        assets = ["assets/flux.css"],
-        prettyurls = get(ENV, "CI", nothing) == "true",
-    ),
-)
+         modules = [
+             Flux,
+             NNlib,
+             Functors,
+             MLUtils,
+             BSON,
+             Optimisers,
+             OneHotArrays,
+             Zygote,
+             ChainRulesCore,
+             Base,
+         ],
+         doctest = false,
+         sitename = "Flux",
+         # strict = [:cross_references,],
+         pages = [
+             "Home" => "index.md",
+             "Building Models" => [
+                 "Overview" => "models/overview.md",
+                 "Basics" => "models/basics.md",
+                 "Recurrence" => "models/recurrence.md",
+                 "Layer Reference" => "models/layers.md",
+                 "Loss Functions" => "models/losses.md",
+                 "Regularisation" => "models/regularisation.md",
+                 "Custom Layers" => "models/advanced.md",
+                 "NNlib.jl" => "models/nnlib.md",
+                 "Activation Functions" => "models/activation.md",
+             ],
+             "Handling Data" => [
+                 "MLUtils.jl" => "data/mlutils.md",
+                 "OneHotArrays.jl" => "data/onehot.md",
+             ],
+             "Training Models" => [
+                 "Optimisers" => "training/optimisers.md",
+                 "Training" => "training/training.md",
+                 "Callback Helpers" => "training/callbacks.md",
+                 "Zygote.jl" => "training/zygote.md",
+             ],
+             "GPU Support" => "gpu.md",
+             "Model Tools" => [
+                 "Saving & Loading" => "saving.md",
+                 "Shape Inference" => "outputsize.md",
+                 "Weight Initialisation" => "utilities.md",
+                 "Functors.jl" => "models/functors.md",
+             ],
+             "Performance Tips" => "performance.md",
+             "Flux's Ecosystem" => "ecosystem.md",
+         ],
+         format = Documenter.HTML(;
+                                  sidebar_sitename = false,
+                                  analytics = "UA-36890222-9",
+                                  assets = ["assets/flux.css"],
+                                  prettyurls = get(ENV, "CI", nothing) == "true"))
 
 deploydocs(; repo = "github.com/FluxML/Flux.jl.git", target = "build", push_preview = true)
diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index f719b01c99..d7897851a4 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -24,19 +24,19 @@ function run_benchmark(model, x; cuda = true)
         fw(model, x)
         GC.gc()
         CUDA.reclaim() #warmup
-        @btime CUDA.@sync(fw($model, $x)) teardown = (GC.gc(); CUDA.reclaim())
+        @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim())
 
         println("  backward")
         bw(back)
         GC.gc()
         CUDA.reclaim() #warmup
-        @btime CUDA.@sync(bw($back)) teardown = (GC.gc(); CUDA.reclaim())
+        @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim())
 
         println("  forw and back")
         fwbw(model, ps, x)
         GC.gc()
         CUDA.reclaim() #warmup
-        @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown = (GC.gc(); CUDA.reclaim())
+        @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim())
     else
         println("  forward")
         fw(model, x)  #warmup
diff --git a/perf/recurrent.jl b/perf/recurrent.jl
index bf4a2474da..9002e248d6 100644
--- a/perf/recurrent.jl
+++ b/perf/recurrent.jl
@@ -51,7 +51,7 @@ end
 
 for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
     rnn_benchmark_sweep(rnn_type) do n, ts
-        return [randn(Float32, n, n) for _ = 1:ts], "Vec"
+        return [randn(Float32, n, n) for _ in 1:ts], "Vec"
     end
 end
 
diff --git a/perf/vgg.jl b/perf/vgg.jl
index d86fdd6fe1..dad9d1aad1 100644
--- a/perf/vgg.jl
+++ b/perf/vgg.jl
@@ -6,45 +6,43 @@ using CUDA
 using Zygote: pullback
 
 function vgg16()
-    return Chain(
-        Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(64),
-        Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(64),
-        MaxPool((2, 2)),
-        Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(128),
-        Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(128),
-        MaxPool((2, 2)),
-        Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(256),
-        Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(256),
-        Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(256),
-        MaxPool((2, 2)),
-        Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        MaxPool((2, 2)),
-        Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
-        BatchNorm(512),
-        MaxPool((2, 2)),
-        flatten,
-        Dense(512, 4096, relu),
-        Dropout(0.5),
-        Dense(4096, 4096, relu),
-        Dropout(0.5),
-        Dense(4096, 10),
-    )
+    return Chain(Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(64),
+                 Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(64),
+                 MaxPool((2, 2)),
+                 Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(128),
+                 Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(128),
+                 MaxPool((2, 2)),
+                 Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(256),
+                 Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(256),
+                 Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(256),
+                 MaxPool((2, 2)),
+                 Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 MaxPool((2, 2)),
+                 Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)),
+                 BatchNorm(512),
+                 MaxPool((2, 2)),
+                 flatten,
+                 Dense(512, 4096, relu),
+                 Dropout(0.5),
+                 Dense(4096, 4096, relu),
+                 Dropout(0.5),
+                 Dense(4096, 10))
 end
 
 let model = vgg16(), x = rand(Float32, 32, 32, 3, 64)
diff --git a/src/Flux.jl b/src/Flux.jl
index d2e2783199..b4e56dadfb 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -15,75 +15,73 @@ export gradient
 
 # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.)
 function Optimisers.base(dx::Zygote.Grads)
-    return error(
-        "Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`",
-    )
+    return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`")
 end
 
 export Chain,
-    Dense,
-    Embedding,
-    Maxout,
-    SkipConnection,
-    Parallel,
-    PairwiseFusion,
-    RNN,
-    LSTM,
-    GRU,
-    GRUv3,
-    SamePad,
-    Conv,
-    CrossCor,
-    ConvTranspose,
-    DepthwiseConv,
-    AdaptiveMaxPool,
-    AdaptiveMeanPool,
-    GlobalMaxPool,
-    GlobalMeanPool,
-    MaxPool,
-    MeanPool,
-    Dropout,
-    AlphaDropout,
-    LayerNorm,
-    BatchNorm,
-    InstanceNorm,
-    GroupNorm,
-    Upsample,
-    PixelShuffle,
-    fmap,
-    cpu,
-    gpu,
-    f32,
-    f64,
-    testmode!,
-    trainmode!
+       Dense,
+       Embedding,
+       Maxout,
+       SkipConnection,
+       Parallel,
+       PairwiseFusion,
+       RNN,
+       LSTM,
+       GRU,
+       GRUv3,
+       SamePad,
+       Conv,
+       CrossCor,
+       ConvTranspose,
+       DepthwiseConv,
+       AdaptiveMaxPool,
+       AdaptiveMeanPool,
+       GlobalMaxPool,
+       GlobalMeanPool,
+       MaxPool,
+       MeanPool,
+       Dropout,
+       AlphaDropout,
+       LayerNorm,
+       BatchNorm,
+       InstanceNorm,
+       GroupNorm,
+       Upsample,
+       PixelShuffle,
+       fmap,
+       cpu,
+       gpu,
+       f32,
+       f64,
+       testmode!,
+       trainmode!
 
 include("optimise/Optimise.jl")
 using .Optimise
 using .Optimise: @epochs
 using .Optimise: skip
 export Descent,
-    Adam,
-    Momentum,
-    Nesterov,
-    RMSProp,
-    AdaGrad,
-    AdaMax,
-    AdaDelta,
-    AMSGrad,
-    NAdam,
-    OAdam,
-    AdamW,
-    RAdam,
-    AdaBelief,
-    InvDecay,
-    ExpDecay,
-    WeightDecay,
-    ClipValue,
-    ClipNorm
+       Adam,
+       Momentum,
+       Nesterov,
+       RMSProp,
+       AdaGrad,
+       AdaMax,
+       AdaDelta,
+       AMSGrad,
+       NAdam,
+       OAdam,
+       AdamW,
+       RAdam,
+       AdaBelief,
+       InvDecay,
+       ExpDecay,
+       WeightDecay,
+       ClipValue,
+       ClipNorm
 
 using CUDA
-const use_cuda = Ref{Union{Nothing,Bool}}(nothing)
+const use_cuda = Ref{Union{Nothing, Bool}}(nothing)
 
 using Adapt, Functors, OneHotArrays
 include("utils.jl")
@@ -91,9 +89,7 @@ include("functor.jl")
 
 # Pirate error to catch a common mistake.
 function Functors.functor(::Type{<:MLUtils.DataLoader}, x)
-    return error(
-        "`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.",
-    )
+    return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.")
 end
 
 include("layers/stateless.jl")
diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl
index 6ffa43e16a..40805f20c2 100644
--- a/src/cuda/cudnn.jl
+++ b/src/cuda/cudnn.jl
@@ -1,39 +1,31 @@
 import NNlibCUDA: batchnorm, ∇batchnorm
 
-function (BN::Flux.BatchNorm)(
-    x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}},
-    cache = nothing,
-) where {T<:Union{Float32,Float64}}
+function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}},
+                              cache = nothing) where {T <: Union{Float32, Float64}}
     @assert BN.affine "BatchNorm: only affine=true supported on gpu"
     @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu"
-    @assert length(BN.β) == size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels"
-    return BN.λ.(
-        batchnorm(
-            BN.γ,
-            BN.β,
-            x,
-            BN.μ,
-            BN.σ²,
-            BN.momentum;
-            cache = cache,
-            alpha = 1,
-            beta = 0,
-            eps = BN.ϵ,
-            training = Flux._isactive(BN),
-        )
-    )
+    @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels"
+    return BN.λ.(batchnorm(BN.γ,
+                           BN.β,
+                           x,
+                           BN.μ,
+                           BN.σ²,
+                           BN.momentum;
+                           cache = cache,
+                           alpha = 1,
+                           beta = 0,
+                           eps = BN.ϵ,
+                           training = Flux._isactive(BN)))
 end
 
-function ChainRulesCore.rrule(
-    ::typeof(batchnorm),
-    g,
-    b,
-    x,
-    running_mean,
-    running_var,
-    momentum;
-    kw...,
-)
+function ChainRulesCore.rrule(::typeof(batchnorm),
+                              g,
+                              b,
+                              x,
+                              running_mean,
+                              running_var,
+                              momentum;
+                              kw...)
     y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...)
     function batchnorm_pullback(Δ)
         grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...)
diff --git a/src/deprecations.jl b/src/deprecations.jl
index 6d29cb6fd1..cb5689e360 100644
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@@ -1,49 +1,35 @@
 # v0.12 deprecations
 
 function ones(dims...)
-    Base.depwarn(
-        "Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)",
-        :ones;
-        force = true,
-    )
+    Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)",
+                 :ones;
+                 force = true)
     return Base.ones(Float32, dims...)
 end
 ones(T::Type, dims...) = Base.ones(T, dims...)
 
 function zeros(dims...)
-    Base.depwarn(
-        "Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)",
-        :zeros;
-        force = true,
-    )
+    Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)",
+                 :zeros;
+                 force = true)
     return Base.zeros(Float32, dims...)
 end
 zeros(T::Type, dims...) = Base.zeros(T, dims...)
 
 function ones32(::Type, dims...)
-    throw(
-        ArgumentError(
-            "Flux.ones32 is always Float32, use Base.ones to specify the element type",
-        ),
-    )
+    throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type"))
 end
 function zeros32(::Type, dims...)
-    throw(
-        ArgumentError(
-            "Flux.zeros32 is always Float32, use Base.zeros to specify the element type",
-        ),
-    )
+    throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type"))
 end
 
 # v0.13 deprecations
 
 function Broadcast.broadcasted(f::Recur, args...)
     # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12
-    Base.depwarn(
-        """Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order.
-Re-writing this as a comprehension would be better.""",
-        :broadcasted,
-    )
+    Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order.
+         Re-writing this as a comprehension would be better.""",
+                 :broadcasted)
     return map(f, args...)  # map isn't really safe either, but 
 end
 
@@ -51,44 +37,34 @@ end
 
 struct Zeros
     function Zeros()
-        Base.depwarn(
-            "Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead",
-            :Zeros,
-        )
+        Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead",
+                     :Zeros)
         return false
     end
 end
 Zeros(args...) = Zeros()  # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros())
 
 function Optimise.update!(x::AbstractArray, x̄)
-    Base.depwarn(
-        "`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.",
-        :update!,
-    )
+    Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.",
+                 :update!)
     return x .-= x̄
 end
 
 function Diagonal(size::Integer...; kw...)
-    Base.depwarn(
-        "Flux.Diagonal is now Flux.Scale, and also allows an activation function.",
-        :Diagonal,
-    )
+    Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.",
+                 :Diagonal)
     return Scale(size...; kw...)
 end
 function Diagonal(size::Tuple; kw...)
-    Base.depwarn(
-        "Flux.Diagonal is now Flux.Scale, and also allows an activation function.",
-        :Diagonal,
-    )
+    Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.",
+                 :Diagonal)
     return Scale(size...; kw...)
 end
 
 # Deprecate this eventually once saving models w/o structure is no more
 function loadparams!(m, xs)
-    Base.depwarn(
-        "loadparams! will be deprecated eventually. Use loadmodel! instead.",
-        :loadparams!,
-    )
+    Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.",
+                 :loadparams!)
     for (p, x) in zip(params(m), xs)
         size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))")
         copyto!(p, x)
diff --git a/src/functor.jl b/src/functor.jl
index 4463aaced7..5f946fa069 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -104,9 +104,7 @@ else
 end
 adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x
 function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG)
-    return error(
-        "Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().",
-    )
+    return error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().")
 end
 
 # TODO: figure out the correct design for OneElement
@@ -118,10 +116,8 @@ struct FluxCPUAdaptor end
 adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x)
 adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x
 adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x
-function adapt_storage(
-    to::FluxCPUAdaptor,
-    x::T,
-) where {T<:CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix}
+function adapt_storage(to::FluxCPUAdaptor,
+                       x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix}
     return adapt(Array, x)
 end
 adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x
@@ -133,13 +129,11 @@ function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray)
     return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx)))
 end
 
-function ChainRulesCore.rrule(
-    ::typeof(Adapt.adapt_storage),
-    to::FluxCPUAdaptor,
-    x::CUDA.AbstractGPUArray,
-)
+function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage),
+                              to::FluxCPUAdaptor,
+                              x::CUDA.AbstractGPUArray)
     return adapt_storage(to, x),
-    dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)))
+           dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)))
 end
 
 # CPU/GPU movement conveniences
@@ -213,8 +207,7 @@ function check_use_cuda()
         end
         if !(use_cuda[])
             @info """The GPU function is being called but the GPU is not accessible. 
-                     Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog =
-                1
+                     Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1
         end
     end
 end
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
index 647b237144..72e8208268 100644
--- a/src/layers/basic.jl
+++ b/src/layers/basic.jl
@@ -32,7 +32,7 @@ For large models, there is a special type-unstable path which can reduce compila
 times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`.
 This feature is somewhat experimental, beware!
 """
-struct Chain{T<:Union{Tuple,NamedTuple,AbstractVector}}
+struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}}
     layers::T
 end
 
@@ -45,21 +45,21 @@ function Chain(; kw...)
 end
 
 @forward Chain.layers Base.getindex,
-Base.length,
-Base.first,
-Base.last,
-Base.iterate,
-Base.lastindex,
-Base.keys,
-Base.firstindex
+                      Base.length,
+                      Base.first,
+                      Base.last,
+                      Base.iterate,
+                      Base.lastindex,
+                      Base.keys,
+                      Base.firstindex
 
 @functor Chain
 
 (c::Chain)(x) = _applychain(c.layers, x)
 
-@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N}
-    symbols = vcat(:x, [gensym() for _ = 1:N])
-    calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i = 1:N]
+@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N}
+    symbols = vcat(:x, [gensym() for _ in 1:N])
+    calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N]
     return Expr(:block, calls...)
 end
 
@@ -162,22 +162,20 @@ julia> Flux.params(d1)  # no trainable bias
 Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]])
 ```
 """
-struct Dense{F,M<:AbstractMatrix,B}
+struct Dense{F, M <: AbstractMatrix, B}
     weight::M
     bias::B
     σ::F
-    function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix,F}
+    function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F}
         b = _create_bias(W, bias, size(W, 1))
-        return new{F,M,typeof(b)}(W, b, σ)
+        return new{F, M, typeof(b)}(W, b, σ)
     end
 end
 
-function Dense(
-    (in, out)::Pair{<:Integer,<:Integer},
-    σ = identity;
-    init = glorot_uniform,
-    bias = true,
-)
+function Dense((in, out)::Pair{<:Integer, <:Integer},
+               σ = identity;
+               init = glorot_uniform,
+               bias = true)
     return Dense(init(out, in), bias, σ)
 end
 
@@ -239,17 +237,16 @@ julia> Flux.params(b)
 Params([[1 2 3 4]])
 ```
 """
-struct Scale{F,A<:AbstractArray,B}
+struct Scale{F, A <: AbstractArray, B}
     scale::A
     bias::B
     σ::F
-    function Scale(
-        scale::A,
-        bias::B = true,
-        σ::F = identity,
-    ) where {A<:AbstractArray,B<:Union{Bool,AbstractArray},F}
+    function Scale(scale::A,
+                   bias::B = true,
+                   σ::F = identity) where {A <: AbstractArray,
+                                           B <: Union{Bool, AbstractArray}, F}
         b = _create_bias(scale, bias, size(scale)...)
-        return new{F,A,typeof(b)}(scale, b, σ)
+        return new{F, A, typeof(b)}(scale, b, σ)
     end
 end
 
@@ -257,7 +254,7 @@ function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act =
     return Scale(init(s1, s23...), bias, _act)
 end
 function Scale(size_act...; bias = true, init = ones32)
-    return Scale(size_act[1:(end-1)]...; bias, init, _act = size_act[end])
+    return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end])
 end
 
 @functor Scale
@@ -310,11 +307,11 @@ julia> Flux.outputsize(m3, (5, 11))
 (7, 11)
 ```
 """
-struct Maxout{T<:Tuple}
+struct Maxout{T <: Tuple}
     layers::T
 end
 Maxout(layers...) = Maxout(layers)
-Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ = 1:n_alts)...)
+Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...)
 
 @functor Maxout
 
@@ -358,7 +355,7 @@ true
 
 See also [`Parallel`](@ref), [`Maxout`](@ref).
 """
-struct SkipConnection{T,F}
+struct SkipConnection{T, F}
     layers::T
     connection::F  #user can pass arbitrary connections here, such as (a,b) -> a + b
 end
@@ -421,28 +418,26 @@ julia> Flux.Bilinear(rand(4, 8, 16), false, tanh)  # first dim of weight is the
 Bilinear((8, 16) => 4, tanh; bias=false)  # 512 parameters
 ```
 """
-struct Bilinear{F,A,B}
+struct Bilinear{F, A, B}
     weight::A
     bias::B
     σ::F
-    function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray,F}
+    function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F}
         ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights"))
         b = _create_bias(W, bias, size(W, 1))
-        return new{F,A,typeof(b)}(W, b, σ)
+        return new{F, A, typeof(b)}(W, b, σ)
     end
 end
 
 @functor Bilinear
 
-function Bilinear(
-    ((in1, in2), out)::Pair{<:Tuple,<:Integer},
-    σ = identity;
-    bias = true,
-    init = glorot_uniform,
-)
+function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer},
+                  σ = identity;
+                  bias = true,
+                  init = glorot_uniform)
     return Bilinear(init(out, in1, in2), bias, σ)
 end
-function Bilinear((in12, out)::Pair{<:Integer,<:Integer}, σ = identity; kw...)
+function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...)
     return Bilinear((in12, in12) => out, σ; kw...)
 end
 
@@ -452,11 +447,8 @@ function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix)
     d_z, d_x, d_y = size(W)
     d_x == size(x, 1) && d_y == size(y, 1) ||
         throw(DimensionMismatch("number of rows in data must match W"))
-    size(x, 2) == size(y, 2) || throw(
-        DimensionMismatch(
-            "Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))",
-        ),
-    )
+    size(x, 2) == size(y, 2) ||
+        throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))"))
 
     # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s]
     Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :))
@@ -473,21 +465,19 @@ end
 function (a::Bilinear)(x::AbstractVector, y::AbstractVector)
     return vec(a(reshape(x, :, 1), reshape(y, :, 1)))
 end
-(a::Bilinear)(x::NTuple{2,AbstractArray}) = a(x[1], x[2])
+(a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2])
 
 function Base.show(io::IO, l::Bilinear)
     if size(l.weight, 2) == size(l.weight, 3)
         print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1))
     else
-        print(
-            io,
-            "Bilinear((",
-            size(l.weight, 2),
-            ", ",
-            size(l.weight, 3),
-            ") => ",
-            size(l.weight, 1),
-        )
+        print(io,
+              "Bilinear((",
+              size(l.weight, 2),
+              ", ",
+              size(l.weight, 3),
+              ") => ",
+              size(l.weight, 1))
     end
     l.σ == identity || print(io, ", ", l.σ)
     l.bias === false && print(io, "; bias=false")
@@ -537,7 +527,7 @@ julia> model2[:β] == model2[2]
 true
 ```
 """
-struct Parallel{F,T<:Union{Tuple,NamedTuple}}
+struct Parallel{F, T <: Union{Tuple, NamedTuple}}
     connection::F
     layers::T
 end
@@ -546,11 +536,7 @@ Parallel(connection, layers...) = Parallel(connection, layers)
 function Parallel(connection; kw...)
     layers = NamedTuple(kw)
     if :layers in keys(layers) || :connection in keys(layers)
-        throw(
-            ArgumentError(
-                "a Parallel layer cannot have a named sub-layer called `connection` or `layers`",
-            ),
-        )
+        throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`"))
     end
     isempty(layers) && return Parallel(connection, ())
     return Parallel(connection, layers)
@@ -565,11 +551,7 @@ function _parallel_check(layers, xs)
     nl = length(layers)
     nx = length(xs)
     if (nl != nx)
-        throw(
-            ArgumentError(
-                "Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs",
-            ),
-        )
+        throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs"))
     end
 end
 ChainRulesCore.@non_differentiable _parallel_check(nl, nx)
@@ -581,7 +563,7 @@ end
 
 Base.getindex(m::Parallel, i) = m.layers[i]
 Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i])
-function Base.getindex(m::Parallel{<:Any,<:NamedTuple}, i::AbstractVector)
+function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector)
     return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i]))
 end
 
@@ -639,7 +621,7 @@ end
 
 A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above).
 """
-struct PairwiseFusion{F,T<:Union{Tuple,NamedTuple}}
+struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}}
     connection::F
     layers::T
 end
@@ -648,11 +630,7 @@ PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers)
 function PairwiseFusion(connection; kw...)
     layers = NamedTuple(kw)
     if :layers in keys(layers) || :connection in keys(layers)
-        throw(
-            ArgumentError(
-                "a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`",
-            ),
-        )
+        throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`"))
     end
     isempty(layers) && return PairwiseFusion(connection, ())
     return PairwiseFusion(connection, layers)
@@ -662,11 +640,7 @@ function _pairwise_check(x, layers, T)
     lx = length(x)
     N = length(layers)
     if T <: Tuple && lx != N
-        throw(
-            ArgumentError(
-                "PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs",
-            ),
-        )
+        throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs"))
     end
 end
 ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T)
@@ -677,24 +651,20 @@ function (m::PairwiseFusion)(x::T) where {T}
 end
 (m::PairwiseFusion)(xs...) = m(xs)
 
-@generated function applypairwisefusion(
-    layers::Tuple{Vararg{<:Any,N}},
-    connection,
-    x::T,
-) where {N,T}
-    y_symbols = [gensym() for _ = 1:(N+1)]
+@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}},
+                                        connection,
+                                        x::T) where {N, T}
+    y_symbols = [gensym() for _ in 1:(N + 1)]
     getinput(i) = T <: Tuple ? :(x[$i]) : :x
-    calls = [:($(y_symbols[N+1]) = $(getinput(1)))]
-    for i = 1:(N-1)
-        push!(
-            calls,
-            quote
-                $(y_symbols[i]) = layers[$i]($(y_symbols[N+1]))
-                $(y_symbols[N+1]) = connection($(y_symbols[i]), $(getinput(i + 1)))
-            end,
-        )
+    calls = [:($(y_symbols[N + 1]) = $(getinput(1)))]
+    for i in 1:(N - 1)
+        push!(calls,
+              quote
+                  $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1]))
+                  $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1)))
+              end)
     end
-    push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N+1]))))
+    push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1]))))
     push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...))))
     return Expr(:block, calls...)
 end
@@ -708,7 +678,7 @@ Base.getindex(m::PairwiseFusion, i) = m.layers[i]
 function Base.getindex(m::PairwiseFusion, i::AbstractVector)
     return PairwiseFusion(m.connection, m.layers[i])
 end
-function Base.getindex(m::PairwiseFusion{<:Any,<:NamedTuple}, i::AbstractVector)
+function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector)
     return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i]))
 end
 
@@ -757,18 +727,15 @@ end
 
 @functor Embedding
 
-Embedding((in, out)::Pair{<:Integer,<:Integer}; init = randn32) = Embedding(init(out, in))
+Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(init(out, in))
 
 (m::Embedding)(x::Integer) = m.weight[:, x]
 (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x)
 (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...)
 
-function (m::Embedding)(x::Union{OneHotVector{T,L},OneHotMatrix{T,L}}) where {T,L}
-    size(m.weight, 2) == L || throw(
-        DimensionMismatch(
-            "Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L",
-        ),
-    )
+function (m::Embedding)(x::Union{OneHotVector{T, L}, OneHotMatrix{T, L}}) where {T, L}
+    size(m.weight, 2) == L ||
+        throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L"))
     return m(onecold(x))
 end
 
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index b620983dbc..91a7000249 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -1,7 +1,7 @@
 using NNlib: conv, ∇conv_data, depthwiseconv, output_size
 
 # pad dims of x with dims of y until ndims(x) == ndims(y)
-_paddims(x::Tuple, y::Tuple) = (x..., y[(end-(length(y)-length(x)-1)):end]...)
+_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...)
 
 expand(N, i::Tuple) = i
 expand(N, i::Integer) = ntuple(_ -> i, N)
@@ -48,10 +48,10 @@ julia> layer3(xs) |> size  # output size = `ceil(input_size/stride)` = 50
 """
 struct SamePad end
 
-function calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N}
+function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N}
     return expand(Val(2 * N), pad)
 end
-function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T}
+function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T}
     #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285
 
     # Effective kernel size, including dilation
@@ -127,13 +127,13 @@ julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size
 (42, 42, 7, 50)
 ```
 """
-struct Conv{N,M,F,A,V}
+struct Conv{N, M, F, A, V}
     σ::F
     weight::A
     bias::V
-    stride::NTuple{N,Int}
-    pad::NTuple{M,Int}
-    dilation::NTuple{N,Int}
+    stride::NTuple{N, Int}
+    pad::NTuple{M, Int}
+    dilation::NTuple{N, Int}
     groups::Int
 end
 
@@ -159,34 +159,30 @@ julia> Flux.params(layer) |> length
 2
 ```
 """
-function Conv(
-    w::AbstractArray{T,N},
-    b = true,
-    σ = identity;
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    groups = 1,
-) where {T,N}
-    @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups."
+function Conv(w::AbstractArray{T, N},
+              b = true,
+              σ = identity;
+              stride = 1,
+              pad = 0,
+              dilation = 1,
+              groups = 1) where {T, N}
+    @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups."
     stride = expand(Val(N - 2), stride)
     dilation = expand(Val(N - 2), dilation)
-    pad = calc_padding(Conv, pad, size(w)[1:(N-2)], dilation, stride)
+    pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride)
     bias = _create_bias(w, b, size(w, N))
     return Conv(σ, w, bias, stride, pad, dilation, groups)
 end
 
-function Conv(
-    k::NTuple{N,Integer},
-    ch::Pair{<:Integer,<:Integer},
-    σ = identity;
-    init = glorot_uniform,
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    groups = 1,
-    bias = true,
-) where {N}
+function Conv(k::NTuple{N, Integer},
+              ch::Pair{<:Integer, <:Integer},
+              σ = identity;
+              init = glorot_uniform,
+              stride = 1,
+              pad = 0,
+              dilation = 1,
+              groups = 1,
+              bias = true) where {N}
     weight = convfilter(k, ch; init, groups)
     return Conv(weight, bias, σ; stride, pad, dilation, groups)
 end
@@ -202,29 +198,25 @@ distribution.
 
 This is internally used by the [`Conv`](@ref) layer.
 """
-function convfilter(
-    filter::NTuple{N,Integer},
-    ch::Pair{<:Integer,<:Integer};
-    init = glorot_uniform,
-    groups = 1,
-) where {N}
+function convfilter(filter::NTuple{N, Integer},
+                    ch::Pair{<:Integer, <:Integer};
+                    init = glorot_uniform,
+                    groups = 1) where {N}
     cin, cout = ch
-    @assert cin % groups == 0 "Input channel dimension must be divisible by groups."
-    @assert cout % groups == 0 "Output channel dimension must be divisible by groups."
+    @assert cin % groups==0 "Input channel dimension must be divisible by groups."
+    @assert cout % groups==0 "Output channel dimension must be divisible by groups."
     return init(filter..., cin ÷ groups, cout)
 end
 
 @functor Conv
 
 function conv_dims(c::Conv, x::AbstractArray)
-    return DenseConvDims(
-        x,
-        c.weight;
-        stride = c.stride,
-        padding = c.pad,
-        dilation = c.dilation,
-        groups = c.groups,
-    )
+    return DenseConvDims(x,
+                         c.weight;
+                         stride = c.stride,
+                         padding = c.pad,
+                         dilation = c.dilation,
+                         groups = c.groups)
 end
 
 ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any)
@@ -239,7 +231,7 @@ _channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups
 _channels_out(l::Conv) = size(l.weight, ndims(l.weight))
 
 function Base.show(io::IO, l::Conv)
-    print(io, "Conv(", size(l.weight)[1:(ndims(l.weight)-2)])
+    print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)])
     print(io, ", ", _channels_in(l), " => ", _channels_out(l))
     _print_conv_opt(io, l)
     return print(io, ")")
@@ -288,18 +280,18 @@ julia> ConvTranspose((5, 5), 3 => 7; stride = 3, pad = SamePad())(xs) |> size
 (300, 300, 7, 50)
 ```
 """
-struct ConvTranspose{N,M,F,A,V}
+struct ConvTranspose{N, M, F, A, V}
     σ::F
     weight::A
     bias::V
-    stride::NTuple{N,Int}
-    pad::NTuple{M,Int}
-    dilation::NTuple{N,Int}
+    stride::NTuple{N, Int}
+    pad::NTuple{M, Int}
+    dilation::NTuple{N, Int}
     groups::Int
 end
 
 _channels_in(l::ConvTranspose) = size(l.weight)[end]
-_channels_out(l::ConvTranspose) = size(l.weight)[end-1] * l.groups
+_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups
 
 """
     ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups])
@@ -325,33 +317,29 @@ julia> Flux.params(layer) |> length
 2
 ```
 """
-function ConvTranspose(
-    w::AbstractArray{T,N},
-    bias = true,
-    σ = identity;
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    groups = 1,
-) where {T,N}
+function ConvTranspose(w::AbstractArray{T, N},
+                       bias = true,
+                       σ = identity;
+                       stride = 1,
+                       pad = 0,
+                       dilation = 1,
+                       groups = 1) where {T, N}
     stride = expand(Val(N - 2), stride)
     dilation = expand(Val(N - 2), dilation)
-    pad = calc_padding(ConvTranspose, pad, size(w)[1:(N-2)], dilation, stride)
+    pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride)
     b = _create_bias(w, bias, size(w, N - 1) * groups)
     return ConvTranspose(σ, w, b, stride, pad, dilation, groups)
 end
 
-function ConvTranspose(
-    k::NTuple{N,Integer},
-    ch::Pair{<:Integer,<:Integer},
-    σ = identity;
-    init = glorot_uniform,
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    groups = 1,
-    bias = true,
-) where {N}
+function ConvTranspose(k::NTuple{N, Integer},
+                       ch::Pair{<:Integer, <:Integer},
+                       σ = identity;
+                       init = glorot_uniform,
+                       stride = 1,
+                       pad = 0,
+                       dilation = 1,
+                       groups = 1,
+                       bias = true) where {N}
     weight = convfilter(k, reverse(ch); init, groups)
     return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups)
 end
@@ -361,21 +349,18 @@ end
 function conv_transpose_dims(c::ConvTranspose, x::AbstractArray)
     # Calculate size of "input", from ∇conv_data()'s perspective...
     combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
-    I =
-        (size(x)[1:(end-2)] .- 1) .* c.stride .+ 1 .+
-        (size(c.weight)[1:(end-2)] .- 1) .* c.dilation .- combined_pad
-    C_in = size(c.weight)[end-1] * c.groups
+    I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+
+        (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad
+    C_in = size(c.weight)[end - 1] * c.groups
     batch_size = size(x)[end]
     # Create DenseConvDims() that looks like the corresponding conv()
     w_size = size(c.weight)
-    return DenseConvDims(
-        (I..., C_in, batch_size),
-        w_size;
-        stride = c.stride,
-        padding = c.pad,
-        dilation = c.dilation,
-        groups = c.groups,
-    )
+    return DenseConvDims((I..., C_in, batch_size),
+                         w_size;
+                         stride = c.stride,
+                         padding = c.pad,
+                         dilation = c.dilation,
+                         groups = c.groups)
 end
 
 ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any)
@@ -387,19 +372,17 @@ function (c::ConvTranspose)(x::AbstractArray)
 end
 
 function Base.show(io::IO, l::ConvTranspose)
-    print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight)-2)])
+    print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)])
     print(io, ", ", _channels_in(l), " => ", _channels_out(l))
     _print_conv_opt(io, l)
     return print(io, ")")
 end
 
-function calc_padding(
-    ::Type{ConvTranspose},
-    pad::SamePad,
-    k::NTuple{N,T},
-    dilation,
-    stride,
-) where {N,T}
+function calc_padding(::Type{ConvTranspose},
+                      pad::SamePad,
+                      k::NTuple{N, T},
+                      dilation,
+                      stride) where {N, T}
     return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride)
 end
 
@@ -427,29 +410,25 @@ julia> DepthwiseConv((5, 5), 3 => 9; stride = 2, pad = 2)(xs) |> size
 (50, 50, 9, 50)
 ```
 """
-function DepthwiseConv(
-    k::NTuple{<:Any,Integer},
-    ch::Pair{<:Integer,<:Integer},
-    σ = identity;
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    bias = true,
-    init = glorot_uniform,
-)
+function DepthwiseConv(k::NTuple{<:Any, Integer},
+                       ch::Pair{<:Integer, <:Integer},
+                       σ = identity;
+                       stride = 1,
+                       pad = 0,
+                       dilation = 1,
+                       bias = true,
+                       init = glorot_uniform)
     return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init)
 end
 
-function DepthwiseConv(
-    w::AbstractArray{T,N},
-    bias = true,
-    σ = identity;
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-) where {T,N}
-    w2 = reshape(w, size(w)[1:(end-2)]..., 1, :)
-    return Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation)
+function DepthwiseConv(w::AbstractArray{T, N},
+                       bias = true,
+                       σ = identity;
+                       stride = 1,
+                       pad = 0,
+                       dilation = 1) where {T, N}
+    w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :)
+    return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation)
 end
 
 """
@@ -479,13 +458,13 @@ julia> CrossCor((5, 5), 3 => 7; stride = 3, pad = (2, 0))(xs) |> size
 (34, 32, 7, 50)
 ```
 """
-struct CrossCor{N,M,F,A,V}
+struct CrossCor{N, M, F, A, V}
     σ::F
     weight::A
     bias::V
-    stride::NTuple{N,Int}
-    pad::NTuple{M,Int}
-    dilation::NTuple{N,Int}
+    stride::NTuple{N, Int}
+    pad::NTuple{M, Int}
+    dilation::NTuple{N, Int}
 end
 
 """
@@ -509,31 +488,27 @@ julia> layer(randn(100, 4, 64)) |> size
 (98, 5, 64)
 ```
 """
-function CrossCor(
-    w::AbstractArray{T,N},
-    bias = true,
-    σ = identity;
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-) where {T,N}
+function CrossCor(w::AbstractArray{T, N},
+                  bias = true,
+                  σ = identity;
+                  stride = 1,
+                  pad = 0,
+                  dilation = 1) where {T, N}
     stride = expand(Val(N - 2), stride)
     dilation = expand(Val(N - 2), dilation)
-    pad = calc_padding(CrossCor, pad, size(w)[1:(N-2)], dilation, stride)
+    pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride)
     b = _create_bias(w, bias, size(w, N))
     return CrossCor(σ, w, b, stride, pad, dilation)
 end
 
-function CrossCor(
-    k::NTuple{N,Integer},
-    ch::Pair{<:Integer,<:Integer},
-    σ = identity;
-    init = glorot_uniform,
-    stride = 1,
-    pad = 0,
-    dilation = 1,
-    bias = true,
-) where {N}
+function CrossCor(k::NTuple{N, Integer},
+                  ch::Pair{<:Integer, <:Integer},
+                  σ = identity;
+                  init = glorot_uniform,
+                  stride = 1,
+                  pad = 0,
+                  dilation = 1,
+                  bias = true) where {N}
     weight = convfilter(k, ch; init = init)
     return CrossCor(weight, bias, σ; stride, pad, dilation)
 end
@@ -546,13 +521,11 @@ function crosscor(x, w, ddims::DenseConvDims)
 end
 
 function crosscor_dims(c::CrossCor, x::AbstractArray)
-    return DenseConvDims(
-        x,
-        c.weight;
-        stride = c.stride,
-        padding = c.pad,
-        dilation = c.dilation,
-    )
+    return DenseConvDims(x,
+                         c.weight;
+                         stride = c.stride,
+                         padding = c.pad,
+                         dilation = c.dilation)
 end
 
 ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any)
@@ -564,14 +537,12 @@ function (c::CrossCor)(x::AbstractArray)
 end
 
 function Base.show(io::IO, l::CrossCor)
-    print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight)-2)])
-    print(
-        io,
-        ", ",
-        size(l.weight, ndims(l.weight) - 1),
-        " => ",
-        size(l.weight, ndims(l.weight)),
-    )
+    print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)])
+    print(io,
+          ", ",
+          size(l.weight, ndims(l.weight) - 1),
+          " => ",
+          size(l.weight, ndims(l.weight)))
     _print_conv_opt(io, l)
     return print(io, ")")
 end
@@ -599,13 +570,13 @@ julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs)
 true
 ```
 """
-struct AdaptiveMaxPool{S,O}
-    out::NTuple{O,Int}
-    AdaptiveMaxPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out)
+struct AdaptiveMaxPool{S, O}
+    out::NTuple{O, Int}
+    AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out)
 end
 
-function (a::AdaptiveMaxPool{S})(x::AbstractArray{T,S}) where {S,T}
-    insize = size(x)[1:(end-2)]
+function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T}
+    insize = size(x)[1:(end - 2)]
     outsize = a.out
     stride = insize .÷ outsize
     k = insize .- (outsize .- 1) .* stride
@@ -641,13 +612,13 @@ julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs)
 true
 ```
 """
-struct AdaptiveMeanPool{S,O}
-    out::NTuple{O,Int}
-    AdaptiveMeanPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out)
+struct AdaptiveMeanPool{S, O}
+    out::NTuple{O, Int}
+    AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out)
 end
 
-function (a::AdaptiveMeanPool{S})(x::AbstractArray{T,S}) where {S,T}
-    insize = size(x)[1:(end-2)]
+function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T}
+    insize = size(x)[1:(end - 2)]
     outsize = a.out
     stride = insize .÷ outsize
     k = insize .- (outsize .- 1) .* stride
@@ -688,7 +659,7 @@ function (g::GlobalMaxPool)(x)
     # Input size
     x_size = size(x)
     # Kernel size
-    k = x_size[1:(end-2)]
+    k = x_size[1:(end - 2)]
     # Pooling dimensions
     pdims = PoolDims(x, k)
 
@@ -722,7 +693,7 @@ function (g::GlobalMeanPool)(x)
     # Input size
     x_size = size(x)
     # Kernel size
-    k = x_size[1:(end-2)]
+    k = x_size[1:(end - 2)]
     # Pooling dimensions
     pdims = PoolDims(x, k)
 
@@ -772,13 +743,13 @@ julia> layer(rand(Float32, 100, 7, 50)) |> size
 (34, 7, 50)
 ```
 """
-struct MaxPool{N,M}
-    k::NTuple{N,Int}
-    pad::NTuple{M,Int}
-    stride::NTuple{N,Int}
+struct MaxPool{N, M}
+    k::NTuple{N, Int}
+    pad::NTuple{M, Int}
+    stride::NTuple{N, Int}
 end
 
-function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N}
+function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N}
     stride = expand(Val(N), stride)
     pad = calc_padding(MaxPool, pad, k, 1, stride)
     return MaxPool(k, pad, stride)
@@ -831,13 +802,13 @@ julia> m(xs) |> size
 (20, 20, 7, 50)
 ```
 """
-struct MeanPool{N,M}
-    k::NTuple{N,Int}
-    pad::NTuple{M,Int}
-    stride::NTuple{N,Int}
+struct MeanPool{N, M}
+    k::NTuple{N, Int}
+    pad::NTuple{M, Int}
+    stride::NTuple{N, Int}
 end
 
-function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N}
+function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N}
     stride = expand(Val(N), stride)
     pad = calc_padding(MeanPool, pad, k, 1, stride)
     return MeanPool(k, pad, stride)
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 437d709463..3af09ce237 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -38,11 +38,7 @@ dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)
 
 dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
 function dropout_mask(rng, x::CuArray, p; kwargs...)
-    throw(
-        ArgumentError(
-            "x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.",
-        ),
-    )
+    throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays."))
 end
 dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
 function _dropout_mask(rng, x, p; dims = :)
@@ -96,10 +92,10 @@ julia> isapprox(count(==(0), y) / length(y), 0.5; atol = 0.1)
 true
 ```
 """
-mutable struct Dropout{F,D,R<:AbstractRNG}
+mutable struct Dropout{F, D, R <: AbstractRNG}
     p::F
     dims::D
-    active::Union{Bool,Nothing}
+    active::Union{Bool, Nothing}
     rng::R
 end
 Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value())
@@ -154,13 +150,13 @@ julia> isapprox(std(x), std(y); atol = 0.2)
 true
 ```
 """
-mutable struct AlphaDropout{F,R<:AbstractRNG}
+mutable struct AlphaDropout{F, R <: AbstractRNG}
     p::F
-    active::Union{Bool,Nothing}
+    active::Union{Bool, Nothing}
     rng::R
     function AlphaDropout(p, active, rng)
         @assert 0 ≤ p ≤ 1
-        return new{typeof(p),typeof(rng)}(p, active, rng)
+        return new{typeof(p), typeof(rng)}(p, active, rng)
     end
 end
 AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value())
@@ -220,25 +216,23 @@ julia> isapprox(std(y; dims = 1:3), ones(1, 1, 1, 2); atol = 0.1) &&
 true
 ```
 """
-struct LayerNorm{F,D,T,N}
+struct LayerNorm{F, D, T, N}
     λ::F
     diag::D
     ϵ::T
-    size::NTuple{N,Int}
+    size::NTuple{N, Int}
     affine::Bool
 end
 
-function LayerNorm(
-    size::Tuple{Vararg{Int}},
-    λ = identity;
-    affine::Bool = true,
-    ϵ::Real = 1.0f-5,
-)
+function LayerNorm(size::Tuple{Vararg{Int}},
+                   λ = identity;
+                   affine::Bool = true,
+                   ϵ::Real = 1.0f-5)
     diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity
     return LayerNorm(λ, diag, ϵ, size, affine)
 end
 LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...)
-LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end-1)]), size_act[end]; kw...)
+LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...)
 
 @functor LayerNorm
 
@@ -255,12 +249,10 @@ end
 # Compute the statistics on the slices specified by reduce_dims.
 # reduce_dims=[1,...,N-2,N] for BatchNorm
 # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm
-function _norm_layer_forward(
-    l,
-    x::AbstractArray{T,N};
-    reduce_dims,
-    affine_shape,
-) where {T,N}
+function _norm_layer_forward(l,
+                             x::AbstractArray{T, N};
+                             reduce_dims,
+                             affine_shape) where {T, N}
     if !_isactive(l) && l.track_stats # testmode with tracked stats
         stats_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N)
         μ = reshape(l.μ, stats_shape)
@@ -283,7 +275,7 @@ end
 
 @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ)
 
-function _track_stats!(bn, x::AbstractArray{T,N}, μ, σ², reduce_dims) where {T,N}
+function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N}
     V = eltype(bn.σ²)
     mtm = bn.momentum
     res_mtm = one(V) - mtm
@@ -340,7 +332,7 @@ julia> isapprox(std(m(xs)), 1; atol = 0.1) && std(xs) != std(m(xs))
 true
 ```
 """
-mutable struct BatchNorm{F,V,N,W}
+mutable struct BatchNorm{F, V, N, W}
     λ::F  # activation function
     β::V  # bias
     γ::V  # scale
@@ -350,20 +342,18 @@ mutable struct BatchNorm{F,V,N,W}
     momentum::N
     affine::Bool
     track_stats::Bool
-    active::Union{Bool,Nothing}
+    active::Union{Bool, Nothing}
     chs::Int # number of channels
 end
 
-function BatchNorm(
-    chs::Int,
-    λ = identity;
-    initβ = zeros32,
-    initγ = ones32,
-    affine = true,
-    track_stats = true,
-    ϵ = 1.0f-5,
-    momentum = 0.1f0,
-)
+function BatchNorm(chs::Int,
+                   λ = identity;
+                   initβ = zeros32,
+                   initγ = ones32,
+                   affine = true,
+                   track_stats = true,
+                   ϵ = 1.0f-5,
+                   momentum = 0.1f0)
     β = affine ? initβ(chs) : nothing
     γ = affine ? initγ(chs) : nothing
     μ = track_stats ? zeros32(chs) : nothing
@@ -378,7 +368,7 @@ trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;)
 function (BN::BatchNorm)(x)
     @assert size(x, ndims(x) - 1) == BN.chs
     N = ndims(x)
-    reduce_dims = [1:(N-2); N]
+    reduce_dims = [1:(N - 2); N]
     affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N)
     return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
 end
@@ -434,7 +424,7 @@ julia> isapprox(std(y; dims = 1:2), ones(1, 1, 3, 2); atol = 0.2) &&
 true
 ```
 """
-mutable struct InstanceNorm{F,V,N,W}
+mutable struct InstanceNorm{F, V, N, W}
     λ::F  # activation function
     β::V  # bias
     γ::V  # scale
@@ -444,25 +434,21 @@ mutable struct InstanceNorm{F,V,N,W}
     momentum::N
     affine::Bool
     track_stats::Bool
-    active::Union{Bool,Nothing}
+    active::Union{Bool, Nothing}
     chs::Int # number of channels
 end
 
-function InstanceNorm(
-    chs::Int,
-    λ = identity;
-    initβ = zeros32,
-    initγ = ones32,
-    affine = false,
-    track_stats = false,
-    ϵ = 1.0f-5,
-    momentum = 0.1f0,
-)
+function InstanceNorm(chs::Int,
+                      λ = identity;
+                      initβ = zeros32,
+                      initγ = ones32,
+                      affine = false,
+                      track_stats = false,
+                      ϵ = 1.0f-5,
+                      momentum = 0.1f0)
     if track_stats
-        Base.depwarn(
-            "`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.",
-            :InstanceNorm,
-        )
+        Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.",
+                     :InstanceNorm)
     end
 
     β = affine ? initβ(chs) : nothing
@@ -480,7 +466,7 @@ function (l::InstanceNorm)(x)
     @assert ndims(x) > 2
     @assert size(x, ndims(x) - 1) == l.chs
     N = ndims(x)
-    reduce_dims = 1:(N-2)
+    reduce_dims = 1:(N - 2)
     affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N)
     return _norm_layer_forward(l, x; reduce_dims, affine_shape)
 end
@@ -542,7 +528,7 @@ true
 ```  # number of groups
 ```
 """
-mutable struct GroupNorm{F,V,N,W}
+mutable struct GroupNorm{F, V, N, W}
     G::Int  # number of groups
     λ::F  # activation function
     β::V  # bias
@@ -553,29 +539,25 @@ mutable struct GroupNorm{F,V,N,W}
     momentum::N
     affine::Bool
     track_stats::Bool
-    active::Union{Bool,Nothing}
+    active::Union{Bool, Nothing}
     chs::Int # number of channels
 end
 
 @functor GroupNorm
 trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;)
 
-function GroupNorm(
-    chs::Int,
-    G::Int,
-    λ = identity;
-    initβ = zeros32,
-    initγ = ones32,
-    affine = true,
-    track_stats = false,
-    ϵ = 1.0f-5,
-    momentum = 0.1f0,
-)
+function GroupNorm(chs::Int,
+                   G::Int,
+                   λ = identity;
+                   initβ = zeros32,
+                   initγ = ones32,
+                   affine = true,
+                   track_stats = false,
+                   ϵ = 1.0f-5,
+                   momentum = 0.1f0)
     if track_stats
-        Base.depwarn(
-            "`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.",
-            :GroupNorm,
-        )
+        Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.",
+                     :GroupNorm)
     end
 
     chs % G == 0 ||
@@ -594,9 +576,9 @@ function (gn::GroupNorm)(x)
     @assert size(x, ndims(x) - 1) == gn.chs
     N = ndims(x)
     sz = size(x)
-    x = reshape(x, sz[1:(N-2)]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
+    x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N])
     N = ndims(x)
-    reduce_dims = 1:(N-2)
+    reduce_dims = 1:(N - 2)
     affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N)
     x = _norm_layer_forward(gn, x; reduce_dims, affine_shape)
     return reshape(x, sz)
@@ -622,4 +604,4 @@ scale parameters, `false` otherwise.
 
 See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
 """
-hasaffine(l::Union{BatchNorm,InstanceNorm,LayerNorm,GroupNorm}) = l.affine
+hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine
diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl
index 5fdf1e7d00..bec0c539bb 100644
--- a/src/layers/recurrent.jl
+++ b/src/layers/recurrent.jl
@@ -19,13 +19,13 @@ function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c)
 end
 
 # Type stable and AD-friendly helper for iterating over the last dimension of an array
-function eachlastdim(A::AbstractArray{T,N}) where {T,N}
+function eachlastdim(A::AbstractArray{T, N}) where {T, N}
     inds_before = ntuple(_ -> :, N - 1)
     return (view(A, inds_before..., i) for i in axes(A, N))
 end
 
 # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77
-function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N}
+function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N}
     dys = unthunk(dys_raw)
     i1 = findfirst(dy -> dy isa AbstractArray, dys)
     if isnothing(i1)  # all slices are Zero!
@@ -44,7 +44,7 @@ function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N}
     return ProjectTo(x)(dx)
 end
 
-function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N}
+function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N}
     lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x))
     return collect(eachlastdim(x)), lastdims
 end
@@ -126,7 +126,7 @@ julia> rnn.state
  60
 ```
 """
-mutable struct Recur{T,S}
+mutable struct Recur{T, S}
     cell::T
     state::S
 end
@@ -183,7 +183,7 @@ reset!(m) = foreach(reset!, functor(m)[1])
 
 flip(f, xs) = reverse([f(x) for x in reverse(xs)])
 
-function (m::Recur)(x::AbstractArray{T,3}) where {T}
+function (m::Recur)(x::AbstractArray{T, 3}) where {T}
     h = [m(x_t) for x_t in eachlastdim(x)]
     sze = size(h[1])
     return reshape(reduce(hcat, h), sze[1], sze[2], length(h))
@@ -191,7 +191,7 @@ end
 
 # Vanilla RNN
 
-struct RNNCell{F,I,H,V,S}
+struct RNNCell{F, I, H, V, S}
     σ::F
     Wi::I
     Wh::H
@@ -199,20 +199,19 @@ struct RNNCell{F,I,H,V,S}
     state0::S
 end
 
-function RNNCell(
-    (in, out)::Pair,
-    σ = tanh;
-    init = Flux.glorot_uniform,
-    initb = zeros32,
-    init_state = zeros32,
-)
+function RNNCell((in, out)::Pair,
+                 σ = tanh;
+                 init = Flux.glorot_uniform,
+                 initb = zeros32,
+                 init_state = zeros32)
     return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1))
 end
 
-function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(
-    h,
-    x::Union{AbstractVecOrMat{T},OneHotArray},
-) where {F,I,H,V,T}
+# Vanilla RNN forward step: `h` is the previous hidden state, `x` the current input.
+# Dispatch requires `x` to be a `OneHotArray` or to share eltype `T` with `state0`.
+function (m::RNNCell{F, I, H, V, <:AbstractMatrix{T}})(
+    h, x::Union{AbstractVecOrMat{T}, OneHotArray}
+) where {F, I, H, V, T}
     Wi, Wh, b = m.Wi, m.Wh, m.b
     σ = NNlib.fast_act(m.σ, x)
     h = σ.(Wi * x .+ Wh * h .+ b)
@@ -286,49 +285,52 @@ julia> r(rand(Float32, 3, 10)) |> size # batch size of 10
     ```
 
 # Note:
-  `RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`.
 
-  ```julia
-  julia> using LinearAlgebra
+`RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`.
 
-  julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)))
+```julia
+julia> using LinearAlgebra
 
-  julia> r(rand(4, 10)) |> size # batch size of 10
-  (5, 10)
-  ```
+julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)))
+
+julia> r(rand(4, 10)) |> size # batch size of 10
+(5, 10)
+```
 """
 RNN(a...; ka...) = Recur(RNNCell(a...; ka...))
 Recur(m::RNNCell) = Recur(m, m.state0)
 
 # LSTM
 
-struct LSTMCell{I,H,V,S}
+struct LSTMCell{I, H, V, S}
     Wi::I
     Wh::H
     b::V
     state0::S
 end
 
-function LSTMCell(
-    (in, out)::Pair;
-    init = glorot_uniform,
-    initb = zeros32,
-    init_state = zeros32,
-)
-    cell = LSTMCell(
-        init(out * 4, in),
-        init(out * 4, out),
-        initb(out * 4),
-        (init_state(out, 1), init_state(out, 1)),
-    )
+function LSTMCell((in, out)::Pair;
+                  init = glorot_uniform,
+                  initb = zeros32,
+                  init_state = zeros32)
+    cell = LSTMCell(init(out * 4, in),
+                    init(out * 4, out),
+                    initb(out * 4),
+                    (init_state(out, 1), init_state(out, 1)))
     cell.b[gate(out, 2)] .= 1
     return cell
 end
 
-function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})(
-    (h, c),
-    x::Union{AbstractVecOrMat{T},OneHotArray},
-) where {I,H,V,T}
+# LSTM forward step.
+#
+# `(h, c)` is the incoming state pair and `x` is the current input. The gate
+# pre-activations are computed together via nested `muladd` calls and then split by
+# `multigate` into the input, forget, cell and output parts.
+# Dispatch requires `x` to be a `OneHotArray` or to share eltype `T` with the two
+# `state0` matrices.
+function (m::LSTMCell{I, H, V, <:NTuple{2, AbstractMatrix{T}}})(
+    (h, c), x::Union{AbstractVecOrMat{T}, OneHotArray}
+) where {I, H, V, T}
     b, o = m.b, size(h, 1)
     g = muladd(m.Wi, x, muladd(m.Wh, h, b))
     input, forget, cell, output = multigate(g, o, Val(4))
@@ -379,7 +381,8 @@ julia> l(rand(Float32, 3, 10)) |> size # batch size of 10
     Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref).
 
 # Note:
-  `LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref).
+
+`LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref).
 """
 LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...))
 Recur(m::LSTMCell) = Recur(m, m.state0)
@@ -392,34 +395,30 @@ function _gru_output(gxs, ghs, bs)
     return r, z
 end
 
-struct GRUCell{I,H,V,S}
+struct GRUCell{I, H, V, S}
     Wi::I
     Wh::H
     b::V
     state0::S
 end
 
-function GRUCell(
-    (in, out)::Pair;
-    init = glorot_uniform,
-    initb = zeros32,
-    init_state = zeros32,
-)
-    return GRUCell(
-        init(out * 3, in),
-        init(out * 3, out),
-        initb(out * 3),
-        init_state(out, 1),
-    )
+function GRUCell((in, out)::Pair;
+                 init = glorot_uniform,
+                 initb = zeros32,
+                 init_state = zeros32)
+    return GRUCell(init(out * 3, in),
+                   init(out * 3, out),
+                   initb(out * 3),
+                   init_state(out, 1))
 end
 
-function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(
-    h,
-    x::Union{AbstractVecOrMat{T},OneHotArray},
-) where {I,H,V,T}
+# GRU forward step: `h` is the previous hidden state, `x` the current input.
+function (m::GRUCell{I, H, V, <:AbstractMatrix{T}})(
+    h, x::Union{AbstractVecOrMat{T}, OneHotArray}
+) where {I, H, V, T}
     Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1)
-    gxs, ghs, bs =
-        multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), multigate(b, o, Val(3))
+    gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)),
+                   multigate(b, o, Val(3))
     r, z = _gru_output(gxs, ghs, bs)
     h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3])
     h′ = @. (1 - z) * h̃ + z * h
@@ -469,14 +468,15 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10
     Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref).
 
 # Note:
-  `GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref).
+
+`GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref).
 """
 GRU(a...; ka...) = Recur(GRUCell(a...; ka...))
 Recur(m::GRUCell) = Recur(m, m.state0)
 
 # GRU v3
 
-struct GRUv3Cell{I,H,V,HH,S}
+struct GRUv3Cell{I, H, V, HH, S}
     Wi::I
     Wh::H
     b::V
@@ -484,28 +484,27 @@ struct GRUv3Cell{I,H,V,HH,S}
     state0::S
 end
 
-function GRUv3Cell(
-    (in, out)::Pair;
-    init = glorot_uniform,
-    initb = zeros32,
-    init_state = zeros32,
-)
-    return GRUv3Cell(
-        init(out * 3, in),
-        init(out * 2, out),
-        initb(out * 3),
-        init(out, out),
-        init_state(out, 1),
-    )
+function GRUv3Cell((in, out)::Pair;
+                   init = glorot_uniform,
+                   initb = zeros32,
+                   init_state = zeros32)
+    return GRUv3Cell(init(out * 3, in),
+                     init(out * 2, out),
+                     initb(out * 3),
+                     init(out, out),
+                     init_state(out, 1))
 end
 
-function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(
-    h,
-    x::Union{AbstractVecOrMat{T},OneHotArray},
-) where {I,H,V,HH,T}
+# GRUv3 forward step: `h` is the previous hidden state, `x` the current input.
+# Unlike `GRUCell`, the reset gate `r` scales `h` before the `Wh_h̃` product rather
+# than scaling the already-multiplied hidden contribution.
+# Dispatch requires `x` to be a `OneHotArray` or to share eltype `T` with `state0`.
+function (m::GRUv3Cell{I, H, V, HH, <:AbstractMatrix{T}})(
+    h, x::Union{AbstractVecOrMat{T}, OneHotArray}
+) where {I, H, V, HH, T}
     Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1)
-    gxs, ghs, bs =
-        multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), multigate(b, o, Val(3))
+    gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)),
+                   multigate(b, o, Val(3))
     r, z = _gru_output(gxs, ghs, bs)
     h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3])
     h′ = @. (1 - z) * h̃ + z * h
@@ -555,7 +554,8 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10
     Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref).
 
 # Note:
-  `GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref).
+
+`GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The  `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref).
 """
 GRUv3(a...; ka...) = Recur(GRUv3Cell(a...; ka...))
 Recur(m::GRUv3Cell) = Recur(m, m.state0)
diff --git a/src/layers/show.jl b/src/layers/show.jl
index b2e69d0b75..e21b0ae445 100644
--- a/src/layers/show.jl
+++ b/src/layers/show.jl
@@ -30,8 +30,8 @@ function _big_show(io::IO, obj, indent::Int = 0, name = nothing)
             for k in Base.keys(obj)
                 _big_show(io, obj[k], indent + 2, k)
             end
-        elseif obj isa Parallel{<:Any,<:NamedTuple} ||
-               obj isa PairwiseFusion{<:Any,<:NamedTuple}
+        elseif obj isa Parallel{<:Any, <:NamedTuple} ||
+               obj isa PairwiseFusion{<:Any, <:NamedTuple}
             _big_show(io, obj.connection, indent + 2)
             for k in Base.keys(obj)
                 _big_show(io, obj[k], indent + 2, k)
@@ -90,22 +90,18 @@ function _layer_show(io::IO, layer, indent::Int = 0, name = nothing)
     print(io, " "^indent, str, indent == 0 ? "" : ",")
     if !isempty(params(layer))
         print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str)))
-        printstyled(
-            io,
-            "# ",
-            underscorise(sum(length, params(layer))),
-            " parameters";
-            color = :light_black,
-        )
+        printstyled(io,
+                    "# ",
+                    underscorise(sum(length, params(layer))),
+                    " parameters";
+                    color = :light_black)
         nonparam = _childarray_sum(length, layer) - sum(length, params(layer))
         if nonparam > 0
-            printstyled(
-                io,
-                ", plus ",
-                underscorise(nonparam),
-                indent == 0 ? " non-trainable" : "";
-                color = :light_black,
-            )
+            printstyled(io,
+                        ", plus ",
+                        underscorise(nonparam),
+                        indent == 0 ? " non-trainable" : "";
+                        color = :light_black)
         end
         _nan_show(io, params(layer))
     end
@@ -120,35 +116,29 @@ function _big_finale(io::IO, m)
         noncnt = _childarray_sum(_ -> 1, m) - length(ps)
         if noncnt > 0
             nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps))
-            printstyled(
-                io,
-                " "^08,
-                "# Total: ",
-                length(ps),
-                " trainable arrays, ";
-                color = :light_black,
-            )
+            printstyled(io,
+                        " "^08,
+                        "# Total: ",
+                        length(ps),
+                        " trainable arrays, ";
+                        color = :light_black)
             println(io, pars, " parameters,")
-            printstyled(
-                io,
-                " "^10,
-                "# plus ",
-                noncnt,
-                " non-trainable, ",
-                nonparam,
-                " parameters, summarysize ";
-                color = :light_black,
-            )
+            printstyled(io,
+                        " "^10,
+                        "# plus ",
+                        noncnt,
+                        " non-trainable, ",
+                        nonparam,
+                        " parameters, summarysize ";
+                        color = :light_black)
             print(io, bytes, ".")
         else
-            printstyled(
-                io,
-                " "^18,
-                "# Total: ",
-                length(ps),
-                " arrays, ";
-                color = :light_black,
-            )
+            printstyled(io,
+                        " "^18,
+                        "# Total: ",
+                        length(ps),
+                        " arrays, ";
+                        color = :light_black)
             print(io, pars, " parameters, ", bytes, ".")
         end
     end
diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl
index d67190a49b..dad2a512bb 100644
--- a/src/layers/upsample.jl
+++ b/src/layers/upsample.jl
@@ -31,7 +31,7 @@ julia> m(ones(2, 2, 1, 1)) |> size
 (4, 5, 1, 1)
 ```
 """
-struct Upsample{mode,S,T}
+struct Upsample{mode, S, T}
     scale::S
     size::T
 end
@@ -42,26 +42,26 @@ function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing)
     if !(isnothing(scale) ⊻ isnothing(size))
         throw(ArgumentError("Either scale or size should be specified (but not both)."))
     end
-    return Upsample{mode,typeof(scale),typeof(size)}(scale, size)
+    return Upsample{mode, typeof(scale), typeof(size)}(scale, size)
 end
 
 Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale)
 
 (m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale)
-function (m::Upsample{:nearest,Int})(x::AbstractArray{T,N}) where {T,N}
+function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N}
     return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2))
 end
-function (m::Upsample{:nearest,Nothing})(x::AbstractArray)
+function (m::Upsample{:nearest, Nothing})(x::AbstractArray)
     return NNlib.upsample_nearest(x; size = m.size)
 end
 
 (m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale)
-function (m::Upsample{:bilinear,Nothing})(x::AbstractArray)
+function (m::Upsample{:bilinear, Nothing})(x::AbstractArray)
     return NNlib.upsample_bilinear(x; size = m.size)
 end
 
 (m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale)
-function (m::Upsample{:trilinear,Nothing})(x::AbstractArray)
+function (m::Upsample{:trilinear, Nothing})(x::AbstractArray)
     return NNlib.upsample_trilinear(x; size = m.size)
 end
 
diff --git a/src/loading.jl b/src/loading.jl
index 0dd73a0d59..35e3868189 100644
--- a/src/loading.jl
+++ b/src/loading.jl
@@ -23,19 +23,16 @@ function loadleaf!(dst::AbstractArray, src::AbstractArray, err)
 end
 
 function _tie_check(dst::Bool, src::AbstractArray)
-    return iszero(dst) || error(
-        "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.",
-    )
+    return iszero(dst) ||
+           error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.")
 end
 function _tie_check(dst::AbstractArray, src::Bool)
-    return (iszero(dst) && iszero(src)) || error(
-        "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.",
-    )
+    return (iszero(dst) && iszero(src)) ||
+           error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.")
 end
 function _tie_check(dst::AbstractArray, src::AbstractArray)
-    return (dst == src) || error(
-        "Encountered tied destination parameters with untied and mismatched sources.",
-    )
+    return (dst == src) ||
+           error("Encountered tied destination parameters with untied and mismatched sources.")
 end
 _tie_check(dst, src) = true
 
@@ -100,13 +97,10 @@ but copying a `src` value of `true` will error.
 function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet())
     ldsts = _filter_children(filter, functor(dst)[1])
     lsrcs = _filter_children(filter, functor(src)[1])
-    (keys(ldsts) == keys(lsrcs)) || throw(
-        ArgumentError("Tried to load $src into $dst but the structures do not match."),
-    )
+    (keys(ldsts) == keys(lsrcs)) ||
+        throw(ArgumentError("Tried to load $src into $dst but the structures do not match."))
 
-    err = DimensionMismatch(
-        "Tried to load $src into $dst but the parameter sizes do not match.",
-    )
+    err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.")
     foreach(ldsts, lsrcs) do ldst, lsrc
         if ldst in cache # we already loaded this parameter before
             _tie_check(ldst, lsrc) && return ldst
diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl
index a35f93af03..a6126bf4e5 100644
--- a/src/losses/Losses.jl
+++ b/src/losses/Losses.jl
@@ -10,23 +10,23 @@ using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss
 import Base.Broadcast: broadcasted
 
 export mse,
-    mae,
-    msle,
-    label_smoothing,
-    crossentropy,
-    logitcrossentropy,
-    binarycrossentropy,
-    logitbinarycrossentropy,
-    kldivergence,
-    huber_loss,
-    tversky_loss,
-    dice_coeff_loss,
-    poisson_loss,
-    hinge_loss,
-    squared_hinge_loss,
-    binary_focal_loss,
-    focal_loss,
-    siamese_contrastive_loss
+       mae,
+       msle,
+       label_smoothing,
+       crossentropy,
+       logitcrossentropy,
+       binarycrossentropy,
+       logitbinarycrossentropy,
+       kldivergence,
+       huber_loss,
+       tversky_loss,
+       dice_coeff_loss,
+       poisson_loss,
+       hinge_loss,
+       squared_hinge_loss,
+       binary_focal_loss,
+       focal_loss,
+       siamese_contrastive_loss
 
 include("utils.jl")
 include("functions.jl")
diff --git a/src/losses/functions.jl b/src/losses/functions.jl
index 674fe3065c..65b6b2fe60 100644
--- a/src/losses/functions.jl
+++ b/src/losses/functions.jl
@@ -157,7 +157,7 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed)
 true
 ```
 """
-function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1)
+function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1)
     if !(0 < α < 1)
         throw(ArgumentError("α must be between 0 and 1"))
     end
@@ -320,7 +320,7 @@ julia> Flux.crossentropy(y_prob, y_hot)
 """
 function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ))
     _check_sizes(ŷ, y)
-    return agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)))
+    return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ)))
 end
 
 """
@@ -351,7 +351,7 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin)
 """
 function logitbinarycrossentropy(ŷ, y; agg = mean)
     _check_sizes(ŷ, y)
-    return agg(@.((1 - y) * ŷ - logσ(ŷ)))
+    return agg(@.((1 - y) * ŷ-logσ(ŷ)))
 end
 
 """
diff --git a/src/losses/utils.jl b/src/losses/utils.jl
index 43aab12a05..cda3e4a557 100644
--- a/src/losses/utils.jl
+++ b/src/losses/utils.jl
@@ -21,19 +21,17 @@ end
 @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric)
     res = xlogy.(x, y)
     return res,
-    Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y))
+           Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)),
+                 Zygote.unbroadcast(y, Δ .* x ./ y))
 end
 
 ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y)  # should help Diffractor's broadcasting
-ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true)
+ChainRulesCore.@scalar_rule xlogx(x) (log(x)+true)  # partial w.r.t. the only input, `x`
 
 function _check_sizes(ŷ::AbstractArray, y::AbstractArray)
-    for d = 1:max(ndims(ŷ), ndims(y))
-        size(ŷ, d) == size(y, d) || throw(
-            DimensionMismatch(
-                "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))",
-            ),
-        )
+    for d in 1:max(ndims(ŷ), ndims(y))
+        size(ŷ, d) == size(y, d) ||
+            throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))"))
     end
 end
 _check_sizes(ŷ, y) = nothing  # pass-through, for constant label e.g. y = 1
diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl
index 5bc95d0ab2..be4e556092 100644
--- a/src/optimise/Optimise.jl
+++ b/src/optimise/Optimise.jl
@@ -4,29 +4,29 @@ using LinearAlgebra
 import ArrayInterface
 
 export train!,
-    update!,
-    Descent,
-    Adam,
-    Momentum,
-    Nesterov,
-    RMSProp,
-    AdaGrad,
-    AdaMax,
-    AdaDelta,
-    AMSGrad,
-    NAdam,
-    AdamW,
-    RAdam,
-    OAdam,
-    AdaBelief,
-    InvDecay,
-    ExpDecay,
-    WeightDecay,
-    stop,
-    skip,
-    Optimiser,
-    ClipValue,
-    ClipNorm
+       update!,
+       Descent,
+       Adam,
+       Momentum,
+       Nesterov,
+       RMSProp,
+       AdaGrad,
+       AdaMax,
+       AdaDelta,
+       AMSGrad,
+       NAdam,
+       AdamW,
+       RAdam,
+       OAdam,
+       AdaBelief,
+       InvDecay,
+       ExpDecay,
+       WeightDecay,
+       stop,
+       skip,
+       Optimiser,
+       ClipValue,
+       ClipNorm
 
 include("optimisers.jl")
 include("train.jl")
diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl
index e7e40012c6..940b3ca5d0 100644
--- a/src/optimise/optimisers.jl
+++ b/src/optimise/optimisers.jl
@@ -172,9 +172,9 @@ opt = Adam(0.001, (0.9, 0.8))
 """
 mutable struct Adam <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict())
 Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state)
@@ -183,12 +183,10 @@ function apply!(o::Adam, x, Δ)
     η, β = o.eta, o.beta
 
     mt, vt, βp = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            Float64[β[1], β[2]],
-        )
-    end::Tuple{typeof(x),typeof(x),Vector{Float64}}
+        return (zero(x),
+                zero(x),
+                Float64[β[1], β[2]])
+    end::Tuple{typeof(x), typeof(x), Vector{Float64}}
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
@@ -218,9 +216,9 @@ opt = RAdam(0.001, (0.9, 0.8))
 """
 mutable struct RAdam <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict())
 RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state)
@@ -229,14 +227,13 @@ function apply!(o::RAdam, x, Δ)
     η, β = o.eta, o.beta
     ρ∞ = 2 / (1 - β[2]) - 1
 
-    mt, vt, βp, t = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            Float64[β[1], β[2]],
-            Ref(1),
-        )
-    end::Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}}
+    # Look up the state stored for `x`, initialising it on first use.
+    mt, vt, βp, t = get!(o.state, x) do
+        return (zero(x),
+                zero(x),
+                Float64[β[1], β[2]],
+                Ref(1))
+    end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}}
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
@@ -273,9 +270,9 @@ opt = AdaMax(0.001, (0.9, 0.995))
 """
 mutable struct AdaMax <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict())
 AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state)
@@ -284,12 +281,10 @@ function apply!(o::AdaMax, x, Δ)
     η, β = o.eta, o.beta
 
     mt, ut, βp = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            Float64[β[1], β[2]],
-        )
-    end::Tuple{typeof(x),typeof(x),Vector{Float64}}
+        return (zero(x),
+                zero(x),
+                Float64[β[1], β[2]])
+    end::Tuple{typeof(x), typeof(x), Vector{Float64}}
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. ut = max(β[2] * ut, abs(Δ))
@@ -320,9 +315,9 @@ opt = OAdam(0.001, (0.9, 0.995))
 """
 mutable struct OAdam <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict())
 OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state)
@@ -330,14 +325,13 @@ OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state)
 function apply!(o::OAdam, x, Δ)
     η, β = o.eta, o.beta
 
-    mt, vt, Δ_, βp = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            zero(x),
-            Float64[β[1], β[2]],
-        )
-    end::Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}}
+    # Look up the state stored for `x`, initialising it on first use.
+    mt, vt, Δ_, βp = get!(o.state, x) do
+        return (zero(x),
+                zero(x),
+                zero(x),
+                Float64[β[1], β[2]])
+    end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}}
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
@@ -402,14 +396,14 @@ opt = AdaDelta(0.89)
 mutable struct AdaDelta <: AbstractOptimiser
     rho::Float64
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict())
 AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state)
 
 function apply!(o::AdaDelta, x, Δ)
     ρ = o.rho
-    acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)}
+    acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)}
     @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
     # DON'T remove epsilon from numerator
     # or even out of the square roots
@@ -439,9 +433,9 @@ opt = AMSGrad(0.001, (0.89, 0.995))
 """
 mutable struct AMSGrad <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict())
 AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state)
@@ -450,12 +444,10 @@ function apply!(o::AMSGrad, x, Δ)
     η, β = o.eta, o.beta
 
     mt, vt, v̂t = get!(o.state, x) do
-        return (
-            fill!(similar(x), o.epsilon),
-            fill!(similar(x), o.epsilon),
-            fill!(similar(x), o.epsilon),
-        )
-    end::NTuple{3,typeof(x)}
+        return (fill!(similar(x), o.epsilon),
+                fill!(similar(x), o.epsilon),
+                fill!(similar(x), o.epsilon))
+    end::NTuple{3, typeof(x)}
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. vt = β[2] * vt + (1 - β[2]) * Δ^2
@@ -484,9 +476,9 @@ opt = NAdam(0.002, (0.89, 0.995))
 """
 mutable struct NAdam <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict())
 NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state)
@@ -495,19 +487,16 @@ function apply!(o::NAdam, x, Δ)
     η, β = o.eta, o.beta
 
     mt, vt, βp = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            Float64[o.beta[1], o.beta[2]],
-        )
-    end::Tuple{typeof(x),typeof(x),Vector{Float64}}
+        return (zero(x),
+                zero(x),
+                Float64[o.beta[1], o.beta[2]])
+    end::Tuple{typeof(x), typeof(x), Vector{Float64}}
     β1p, β2p = βp
 
     @. mt = β[1] * mt + (1 - β[1]) * Δ
     @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
-    @. Δ =
-        (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) /
-        (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η
+    @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) /
+           (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η
     βp .= βp .* β
 
     return Δ
@@ -558,9 +547,9 @@ opt = AdaBelief(0.001, (0.9, 0.8))
 """
 mutable struct AdaBelief <: AbstractOptimiser
     eta::Float64
-    beta::Tuple{Float64,Float64}
+    beta::Tuple{Float64, Float64}
     epsilon::Float64
-    state::IdDict{Any,Any}
+    state::IdDict{Any, Any}
 end
 AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict())
 AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state)
@@ -569,12 +558,10 @@ function apply!(o::AdaBelief, x, Δ)
     η, β = o.eta, o.beta
 
     mt, st, βp = get!(o.state, x) do
-        return (
-            zero(x),
-            zero(x),
-            Float64[β[1], β[2]],
-        )
-    end::Tuple{typeof(x),typeof(x),Vector{Float64}}
+        return (zero(x),
+                zero(x),
+                Float64[β[1], β[2]])
+    end::Tuple{typeof(x), typeof(x), Vector{Float64}}
 
     #= st is a variance and can go to zero. This is in contrast to Adam, which uses the
     second moment which is usually far enough from zero. This is problematic, since st
@@ -610,11 +597,11 @@ end
 Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...])
 
 @forward Optimiser.os Base.getindex,
-Base.first,
-Base.last,
-Base.lastindex,
-Base.push!,
-Base.setindex!
+                      Base.first,
+                      Base.last,
+                      Base.lastindex,
+                      Base.push!,
+                      Base.setindex!
 @forward Optimiser.os Base.iterate
 
 Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...)
@@ -649,10 +636,10 @@ opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2))
 """
 mutable struct InvDecay <: AbstractOptimiser
     gamma::Float64
-    state::IdDict{Any,Int}
+    state::IdDict{Any, Int}
 end
 
-InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any,Int}())
+InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}())
 
 function apply!(o::InvDecay, x, Δ)
     γ = o.gamma
diff --git a/src/optimise/train.jl b/src/optimise/train.jl
index bead5860f0..243356fcf2 100644
--- a/src/optimise/train.jl
+++ b/src/optimise/train.jl
@@ -50,11 +50,9 @@ end
 ```
 """
 function skip()
-    Base.depwarn(
-        """Flux.skip() will be removed from Flux 0.14.
-           and should be replaced with `continue` in an ordinary `for` loop.""",
-        :skip,
-    )
+    Base.depwarn("""Flux.skip() will be removed from Flux 0.14.
+                    and should be replaced with `continue` in an ordinary `for` loop.""",
+                 :skip)
     throw(SkipException())
 end
 
@@ -79,11 +77,9 @@ end
 ```
 """
 function stop()
-    Base.depwarn(
-        """Flux.stop() will be removed from Flux 0.14.
-           It should be replaced with `break` in an ordinary `for` loop.""",
-        :stop,
-    )
+    Base.depwarn("""Flux.stop() will be removed from Flux 0.14.
+                    It should be replaced with `break` in an ordinary `for` loop.""",
+                 :stop)
     throw(StopException())
 end
 
@@ -178,14 +174,12 @@ hello
 ```
 """
 macro epochs(n, ex)
-    Base.depwarn(
-        """The macro `@epochs` will be removed from Flux 0.14.
-           As an alternative, you can write a simple `for i in 1:epochs` loop.""",
-        Symbol("@epochs");
-        force = true,
-    )
-    return :(@progress for i = 1:($(esc(n)))
-        @info "Epoch $i"
-        $(esc(ex))
-    end)
+    Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14.
+                    As an alternative, you can write a simple `for i in 1:epochs` loop.""",
+                 Symbol("@epochs");
+                 force = true)
+    return :(@progress for i in 1:($(esc(n)))
+                 @info "Epoch $i"
+                 $(esc(ex))
+             end)
 end
diff --git a/src/outputsize.jl b/src/outputsize.jl
index 65f006d54f..c31bbbba2a 100644
--- a/src/outputsize.jl
+++ b/src/outputsize.jl
@@ -13,8 +13,8 @@ struct Nil <: Real end
 
 @doc @doc(Nil) const nil = Nil()
 
-Nil(::T) where {T<:Number} = nil
-(::Type{T})(::Nil) where {T<:Number} = nil
+Nil(::T) where {T <: Number} = nil
+(::Type{T})(::Nil) where {T <: Number} = nil
 Base.convert(::Type{Nil}, ::Number) = nil
 
 Base.float(::Type{Nil}) = Nil
@@ -181,12 +181,10 @@ end
 for (fn, Dims) in ((:conv, DenseConvDims),)
     @eval begin
         function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims)
-            return fill(
-                nil,
-                NNlib.output_size(dims)...,
-                NNlib.channels_out(dims),
-                size(a)[end],
-            )
+            return fill(nil,
+                        NNlib.output_size(dims)...,
+                        NNlib.channels_out(dims),
+                        size(a)[end])
         end
 
         function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims)
diff --git a/src/utils.jl b/src/utils.jl
index 07d99e8b97..6751bc1096 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out
 nfan(n) = 1, n # A vector is treated as a n×1 matrix
 nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices
 nfan(dims::Tuple) = nfan(dims...)
-nfan(dims...) = prod(dims[1:(end-2)]) .* (dims[end-1], dims[end]) # In case of convolution kernels
+nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels
 
 ofeltype(x, y) = convert(float(eltype(x)), y)
 epseltype(x) = eps(float(eltype(x)))
@@ -270,18 +270,15 @@ julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100)))
 1.0f0
 ```
 """
-function truncated_normal(
-    rng::AbstractRNG,
-    dims::Integer...;
-    mean = 0,
-    std = 1,
-    lo = -2,
-    hi = 2,
-)
+function truncated_normal(rng::AbstractRNG,
+                          dims::Integer...;
+                          mean = 0,
+                          std = 1,
+                          lo = -2,
+                          hi = 2)
     norm_cdf(x) = 0.5 * (1 + erf(x / √2))
     if (mean < lo - 2 * std) || (mean > hi + 2 * std)
-        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog =
-            1
+        @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1
     end
     l = norm_cdf((lo - mean) / std)
     u = norm_cdf((hi - mean) / std)
@@ -354,7 +351,7 @@ end
 
 function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...)
     dims = (d1, ds...)
-    rows = prod(dims[1:(end-1)])
+    rows = prod(dims[1:(end - 1)])
     cols = dims[end]
     return reshape(orthogonal(rng, rows, cols; kwargs...), dims)
 end
@@ -363,8 +360,8 @@ function orthogonal(dims::Integer...; kwargs...)
     return orthogonal(default_rng_value(), dims...; kwargs...)
 end
 function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...)
-    return (dims::Integer...; kwargs...) ->
-        orthogonal(rng, dims...; init_kwargs..., kwargs...)
+    return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs...,
+                                                       kwargs...)
 end
 
 ChainRulesCore.@non_differentiable orthogonal(::Any...)
@@ -403,11 +400,7 @@ julia> count(iszero, ans.weight; dims = 1)
 """
 function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01)
     if length(dims) != 2
-        throw(
-            ArgumentError(
-                "Only 2-dimensional outputs are supported for sparse initialization.",
-            ),
-        )
+        throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization."))
     end
     rows, cols = dims
     prop_zero = min(1.0, sparsity)
@@ -506,10 +499,10 @@ end
 
 # Assume convolution
 function identity_init(dims::Integer...; gain::Real = 1, shift = 0)
-    nin, nout = dims[end-1], dims[end]
-    centers = map(d -> cld(d, 2), dims[1:(end-2)])
+    nin, nout = dims[end - 1], dims[end]
+    centers = map(d -> cld(d, 2), dims[1:(end - 2)])
     weights = zeros32(dims...)
-    for i = 1:min(nin, nout)
+    for i in 1:min(nin, nout)
         weights[centers..., i, i] = gain
     end
     return circshift(weights, shift)
diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl
index 6439a3a8f5..e85d56590e 100644
--- a/test/ctc-gpu.jl
+++ b/test/ctc-gpu.jl
@@ -10,7 +10,7 @@ using CUDA
 function ctc_ngradient(x, y)
     f = Flux.Losses.ctc_loss
     grads = zero(x)
-    for i = 1:length(x)
+    for i in 1:length(x)
         δ = sqrt(eps())
         tmp = x[i]
         x[i] = tmp - δ / 2
@@ -30,7 +30,7 @@ end
     g1 = gradient(ctc_loss, x_cu, y)[1]
     g1 = g1 |> collect
     g2 = ctc_ngradient(x, y)
-    @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5
+    @test g1≈g2 rtol=1e-5 atol=1e-5
 
     # test that GPU loss matches CPU implementation
     l1 = ctc_loss(x_cu, y)
@@ -42,23 +42,19 @@ end
     y = [1, 2]
     @test ctc_loss(x_cu, y) ≈ 3.6990738275138035
 
-    g = [
-        -0.317671 -0.427729 0.665241
-        0.244728 -0.0196172 -0.829811
-        0.0729422 0.447346 0.16457
-    ]
+    g = [-0.317671 -0.427729 0.665241
+         0.244728 -0.0196172 -0.829811
+         0.0729422 0.447346 0.16457]
     ghat = gradient(ctc_loss, x_cu, y)[1] |> collect
-    @test g ≈ ghat rtol = 1e-5 atol = 1e-5
+    @test g≈ghat rtol=1e-5 atol=1e-5
 
     x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray
     y = [1, 2] |> CuArray
     @test ctc_loss(x_cu, y) ≈ 8.02519869363453
 
-    g = [
-        -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063
-        0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307
-        -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07
-    ]
+    g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063
+         0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307
+         -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07]
     ghat = gradient(ctc_loss, x_cu, y)[1] |> collect
-    @test g ≈ ghat rtol = 1e-5 atol = 1e-5
+    @test g≈ghat rtol=1e-5 atol=1e-5
 end
diff --git a/test/ctc.jl b/test/ctc.jl
index 059b14f292..6e5681f977 100644
--- a/test/ctc.jl
+++ b/test/ctc.jl
@@ -9,7 +9,7 @@ using LinearAlgebra
 function ctc_ngradient(x, y)
     f = Flux.Losses.ctc_loss
     grads = zero(x)
-    for i = 1:length(x)
+    for i in 1:length(x)
         δ = sqrt(eps())
         tmp = x[i]
         x[i] = tmp - δ / 2
@@ -27,30 +27,26 @@ end
     y = rand(1:9, 30)
     g1 = gradient(ctc_loss, x, y)[1]
     g2 = ctc_ngradient(x, y)
-    @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5
+    @test g1≈g2 rtol=1e-5 atol=1e-5
 
     # tests using hand-calculated values
     x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0]
     y = [1, 2]
     @test ctc_loss(x, y) ≈ 3.6990738275138035
 
-    g = [
-        -0.317671 -0.427729 0.665241
-        0.244728 -0.0196172 -0.829811
-        0.0729422 0.447346 0.16457
-    ]
+    g = [-0.317671 -0.427729 0.665241
+         0.244728 -0.0196172 -0.829811
+         0.0729422 0.447346 0.16457]
     ghat = gradient(ctc_loss, x, y)[1]
-    @test g ≈ ghat rtol = 1e-5 atol = 1e-5
+    @test g≈ghat rtol=1e-5 atol=1e-5
 
     x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0]
     y = [1, 2]
     @test ctc_loss(x, y) ≈ 8.02519869363453
 
-    g = [
-        -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063
-        0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307
-        -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07
-    ]
+    g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063
+         0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307
+         -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07]
     ghat = gradient(ctc_loss, x, y)[1]
-    @test g ≈ ghat rtol = 1e-5 atol = 1e-5
+    @test g≈ghat rtol=1e-5 atol=1e-5
 end
diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl
index 25648eb787..033a08df95 100644
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@@ -21,7 +21,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray
     cm = gpu(m)
 
     @test all(p isa CuArray for p in params(cm))
-    @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
+    @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2}
 
     xs = rand(5, 5)
     ys = Flux.onehotbatch(1:5, 1:5)
@@ -81,7 +81,7 @@ end
         M = 2.0 * I(10) |> collect
         Q = cholesky(M)
         Q_gpu = Q |> gpu
-        @test Q_gpu isa Cholesky{<:Any,<:CuArray}
+        @test Q_gpu isa Cholesky{<:Any, <:CuArray}
         Q_cpu = Q_gpu |> cpu
         @test Q_cpu == cholesky(eltype(Q_gpu).(M))
     end
diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl
index a46b5684c8..93e6dea6aa 100644
--- a/test/cuda/curnn.jl
+++ b/test/cuda/curnn.jl
@@ -11,55 +11,52 @@ using Flux, CUDA, Test
     @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi])
 end
 
-@testset "RNN" begin
-    @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5)
-        rnn = R(10, 5)
-        curnn = fmap(gpu, rnn)
-
-        Flux.reset!(rnn)
-        Flux.reset!(curnn)
-        x = batch_size == 1 ? rand(Float32, 10) : rand(Float32, 10, batch_size)
-        cux = gpu(x)
-
-        y, back = pullback((r, x) -> r(x), rnn, x)
-        cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
-
-        @test y ≈ collect(cuy)
-
-        ȳ = randn(size(y))
-        m̄, x̄ = back(ȳ)
-        cum̄, cux̄ = cuback(gpu(ȳ))
-
-        @test x̄ ≈ collect(cux̄)
-        @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi)
-        @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh)
-        @test m̄[].cell.b ≈ collect(cum̄[].cell.b)
-        if m̄[].state isa Tuple
-            for (x, cx) in zip(m̄[].state, cum̄[].state)
-                @test x ≈ collect(cx)
-            end
-        else
-            @test m̄[].state ≈ collect(cum̄[].state)
+@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5)
+    rnn = R(10, 5)
+    curnn = fmap(gpu, rnn)
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    x = batch_size == 1 ? rand(Float32, 10) : rand(Float32, 10, batch_size)
+    cux = gpu(x)
+
+    y, back = pullback((r, x) -> r(x), rnn, x)
+    cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
+
+    @test y ≈ collect(cuy)
+
+    ȳ = randn(size(y))
+    m̄, x̄ = back(ȳ)
+    cum̄, cux̄ = cuback(gpu(ȳ))
+
+    @test x̄ ≈ collect(cux̄)
+    @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi)
+    @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh)
+    @test m̄[].cell.b ≈ collect(cum̄[].cell.b)
+    if m̄[].state isa Tuple
+        for (x, cx) in zip(m̄[].state, cum̄[].state)
+            @test x ≈ collect(cx)
         end
-
-        Flux.reset!(rnn)
-        Flux.reset!(curnn)
-        ohx =
-            batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) :
-            Flux.onehotbatch(rand(1:10, batch_size), 1:10)
-        cuohx = gpu(ohx)
-        y = (rnn(ohx); rnn(ohx))
-
-        cuy = (curnn(cuohx); curnn(cuohx))
-        @test y ≈ collect(cuy)
-
-        Flux.reset!(rnn)
-        Flux.reset!(curnn)
-        fx = rand(Float32, 10, batch_size, 3)
-        cufx = gpu(fx)
-        fy = (rnn(fx); rnn(fx))
-
-        cufy = (curnn(cufx); curnn(cufx))
-        @test fy ≈ collect(cufy)
+    else
+        @test m̄[].state ≈ collect(cum̄[].state)
     end
-end
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    ohx = batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) :
+          Flux.onehotbatch(rand(1:10, batch_size), 1:10)
+    cuohx = gpu(ohx)
+    y = (rnn(ohx); rnn(ohx))
+
+    cuy = (curnn(cuohx); curnn(cuohx))
+    @test y ≈ collect(cuy)
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    fx = rand(Float32, 10, batch_size, 3)
+    cufx = gpu(fx)
+    fy = (rnn(fx); rnn(fx))
+
+    cufy = (curnn(cufx); curnn(cufx))
+    @test fy ≈ collect(cufy)
+end end
diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index e2da95931d..eb9382c42d 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -15,68 +15,63 @@ const BROKEN_LAYERS = Union{}
 
 const ACTIVATIONS = [identity, relu, tanh, sigmoid, exp, softplus, elu, selu]
 
-function gpu_gradtest(
-    name::String,
-    layers::Vector,
-    x_cpu = nothing,
-    args...;
-    test_cpu = true,
-)
+function gpu_gradtest(name::String,
+                      layers::Vector,
+                      x_cpu = nothing,
+                      args...;
+                      test_cpu = true)
     isnothing(x_cpu) && error("Missing input to test the layers against.")
-    @testset "$name GPU grad tests" begin
-        for layer in layers
-            @testset "$layer Layer GPU grad test" begin
-
-                # compute output and grad of parameters
-                l_cpu = layer(args...)
-                ps_cpu = Flux.params(l_cpu)
-                y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu)
-                gs_cpu = back_cpu(1.0f0)
-
-                x_gpu = gpu(x_cpu)
-                l_gpu = l_cpu |> gpu
-                ps_gpu = Flux.params(l_gpu)
-
-                if typeof(l_gpu) <: BROKEN_LAYERS
-                    @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa
-                                 Flux.Zygote.Grads
-                else
-                    y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu)
-                    gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix
-
-                    # compute grad of input
-                    xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1]
-                    xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1]
-
-                    # test
-                    if test_cpu
-                        @test y_gpu ≈ y_cpu rtol = 1.0f-3 atol = 1.0f-3
-                        if isnothing(xg_cpu)
-                            @test isnothing(xg_gpu)
+    @testset "$name GPU grad tests" begin for layer in layers
+        @testset "$layer Layer GPU grad test" begin
+
+            # compute output and grad of parameters
+            l_cpu = layer(args...)
+            ps_cpu = Flux.params(l_cpu)
+            y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu)
+            gs_cpu = back_cpu(1.0f0)
+
+            x_gpu = gpu(x_cpu)
+            l_gpu = l_cpu |> gpu
+            ps_gpu = Flux.params(l_gpu)
+
+            if typeof(l_gpu) <: BROKEN_LAYERS
+                @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa
+                             Flux.Zygote.Grads
+            else
+                y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu)
+                gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix
+
+                # compute grad of input
+                xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1]
+                xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1]
+
+                # test
+                if test_cpu
+                    @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3
+                    if isnothing(xg_cpu)
+                        @test isnothing(xg_gpu)
+                    else
+                        if layer === GroupedConvTranspose
+                            @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3
                         else
-                            if layer === GroupedConvTranspose
-                                @test Array(xg_gpu) ≈ xg_cpu rtol = 2.0f-2 atol = 1.0f-3
-                            else
-                                @test Array(xg_gpu) ≈ xg_cpu rtol = 1.0f-3 atol = 1.0f-3
-                            end
+                            @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3
                         end
                     end
-                    @test gs_gpu isa Flux.Zygote.Grads
-                    for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu)
-                        if isnothing(gs_cpu[p_cpu])
-                            @test isnothing(gs_gpu[p_gpu])
-                        else
-                            @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray
-                            if test_cpu
-                                @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol = 1.0f-3 atol =
-                                    1.0f-3
-                            end
+                end
+                @test gs_gpu isa Flux.Zygote.Grads
+                for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu)
+                    if isnothing(gs_cpu[p_cpu])
+                        @test isnothing(gs_gpu[p_gpu])
+                    else
+                        @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray
+                        if test_cpu
+                            @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3
                         end
                     end
                 end
             end
         end
-    end
+    end end
 end
 
 # Just to give testset in gpu_gradtest meaningful labels
@@ -99,58 +94,48 @@ for act in ACTIVATIONS
         DepthwiseConv,
         DepthwiseConvNoBias,
     ]
-    gpu_gradtest(
-        "Convolution with $act",
-        conv_layers,
-        r,
-        (2, 2),
-        1 => 3,
-        act;
-        test_cpu = false,
-    )
+    gpu_gradtest("Convolution with $act",
+                 conv_layers,
+                 r,
+                 (2, 2),
+                 1 => 3,
+                 act;
+                 test_cpu = false)
 
     groupedconv = [GroupedConv, GroupedConvTranspose]
-    gpu_gradtest(
-        "GroupedConvolution with $act",
-        groupedconv,
-        rand(Float32, 28, 28, 100, 2),
-        (3, 3),
-        100 => 25,
-        act;
-        test_cpu = true,
-    )
+    gpu_gradtest("GroupedConvolution with $act",
+                 groupedconv,
+                 rand(Float32, 28, 28, 100, 2),
+                 (3, 3),
+                 100 => 25,
+                 act;
+                 test_cpu = true)
 
     batch_norm = [BatchNorm]
-    gpu_gradtest(
-        "BatchNorm 1 with $act",
-        batch_norm,
-        rand(Float32, 28, 28, 3, 4),
-        3,
-        act;
-        test_cpu = false,
-    ) #TODO fix errors
-    gpu_gradtest(
-        "BatchNorm 2 with $act",
-        batch_norm,
-        rand(Float32, 5, 4),
-        5,
-        act;
-        test_cpu = false,
-    )
+    gpu_gradtest("BatchNorm 1 with $act",
+                 batch_norm,
+                 rand(Float32, 28, 28, 3, 4),
+                 3,
+                 act;
+                 test_cpu = false) #TODO fix errors
+    gpu_gradtest("BatchNorm 2 with $act",
+                 batch_norm,
+                 rand(Float32, 5, 4),
+                 5,
+                 act;
+                 test_cpu = false)
 
     instancenorm = [InstanceNorm]
     gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act; test_cpu = false)
 
     groupnorm = [GroupNorm]
-    gpu_gradtest(
-        "GroupNorm with $act",
-        groupnorm,
-        rand(Float32, 28, 28, 3, 1),
-        3,
-        1,
-        act;
-        test_cpu = false,
-    )
+    gpu_gradtest("GroupNorm with $act",
+                 groupnorm,
+                 rand(Float32, 28, 28, 3, 1),
+                 3,
+                 1,
+                 act;
+                 test_cpu = false)
 end
 
 r = rand(Float32, 28, 28, 1, 1)
@@ -183,13 +168,11 @@ gpu_gradtest("Embedding integer index", embedding, 1, 5, 2)
 gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2)
 gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2)
 gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2)
-gpu_gradtest(
-    "Embedding OneHotMatrix repeated indices",
-    embedding,
-    OneHotMatrix([1, 2, 2], 5),
-    5,
-    2,
-)
+gpu_gradtest("Embedding OneHotMatrix repeated indices",
+             embedding,
+             OneHotMatrix([1, 2, 2], 5),
+             5,
+             2)
 
 @testset "function layers" begin
     x = rand(Float32, 3, 3)
@@ -338,11 +321,9 @@ end
 end
 
 @testset "Dropout RNGs" begin
-    @test_throws ArgumentError Flux.dropout(
-        MersenneTwister(),
-        CUDA.rand(Float32, 2, 3),
-        0.1,
-    )
+    @test_throws ArgumentError Flux.dropout(MersenneTwister(),
+                                            CUDA.rand(Float32, 2, 3),
+                                            0.1)
     @testset for layer in (Dropout, AlphaDropout)
         m = layer(0.1; rng = MersenneTwister(123))
         @test_throws ErrorException gpu(m)
diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl
index 467d3ed46e..1383bd04cd 100644
--- a/test/cuda/losses.jl
+++ b/test/cuda/losses.jl
@@ -1,5 +1,6 @@
 using Flux.Losses:
-    crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss
+                   crossentropy, binarycrossentropy, logitbinarycrossentropy,
+                   binary_focal_loss, focal_loss
 
 @testset "Losses" begin
     x = [1.0, 2.0, 3.0]
@@ -14,22 +15,16 @@ using Flux.Losses:
     @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y))
     @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y))
 
-    x = [
-        0.268941 0.5 0.268941
-        0.731059 0.5 0.731059
-    ]
-    y = [
-        0 1 0
-        1 0 1
-    ]
+    x = [0.268941 0.5 0.268941
+         0.731059 0.5 0.731059]
+    y = [0 1 0
+         1 0 1]
     @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y))
 
     x = softmax(reshape(-7:7, 3, 5) .* 1.0f0)
-    y = [
-        1 0 0 0 1
-        0 1 0 1 0
-        0 0 1 0 0
-    ]
+    y = [1 0 0 0 1
+         0 1 0 1 0
+         0 0 1 0 0]
     @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y))
 
     @testset "GPU grad tests" begin
diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl
index 027d13a612..77fdba5c89 100644
--- a/test/cuda/test_utils.jl
+++ b/test/cuda/test_utils.jl
@@ -7,10 +7,10 @@ function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol)
 end
 check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true
 function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol)
-    @test g_cpu ≈ g_gpu rtol = rtol atol = atol
+    @test g_cpu≈g_gpu rtol=rtol atol=atol
 end
 function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol)
-    @test g_cpu ≈ collect(g_gpu) rtol = rtol atol = atol
+    @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol
 end
 
 function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol)
@@ -27,13 +27,11 @@ function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol)
     end
 end
 
-function gpu_autodiff_test(
-    f_cpu,
-    xs_cpu::Array{Float32}...;
-    test_equal = true,
-    rtol = 1e-4,
-    atol = 1e-4,
-)
+function gpu_autodiff_test(f_cpu,
+                           xs_cpu::Array{Float32}...;
+                           test_equal = true,
+                           rtol = 1e-4,
+                           atol = 1e-4)
     check_type(x) = false
     check_type(x::Float32) = true
     check_type(x::CuArray{Float32}) = true
@@ -55,7 +53,7 @@ function gpu_autodiff_test(
     gs_gpu = back_gpu(Δ_gpu)
 
     if test_equal
-        @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol
+        @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol
         for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu)
             check_grad(g_gpu, g_cpu, atol, rtol)
         end
@@ -71,7 +69,7 @@ function gpu_autodiff_test(
     gs_gpu = back_gpu(Δ_gpu)
 
     if test_equal
-        @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol
+        @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol
         @assert length(ps_gpu) == length(ps_cpu)
         for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu)
             check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol)
diff --git a/test/data.jl b/test/data.jl
index 3d2083af4f..0b66f6b50c 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -36,7 +36,7 @@ using Random
     # @inferred first(d)
     batches = collect(d)
     # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)}
-    @test eltype(batches) == Tuple{typeof(X),typeof(Y)}
+    @test eltype(batches) == Tuple{typeof(X), typeof(Y)}
     @test length(batches) == 3
     @test length(batches[1]) == 2
     @test length(batches[2]) == 2
@@ -53,7 +53,7 @@ using Random
     # @inferred first(d)
     batches = collect(d)
     # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
-    @test eltype(batches) == NamedTuple{(:x, :y),Tuple{typeof(X),typeof(Y)}}
+    @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}}
     @test length(batches) == 3
     @test length(batches[1]) == 2
     @test length(batches[2]) == 2
@@ -69,7 +69,7 @@ using Random
     d = DataLoader([1:10;]; shuffle = true)
     cd = collect(zip(d, d))
     # skip the first since it used to be different also before fixing the bug
-    @test [cd[i][1] for i = 2:10] != [cd[i][2] for i = 2:10]
+    @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10]
 
     # test interaction with `train!`
     θ = ones(2)
@@ -89,13 +89,9 @@ using Random
     @test norm(θ .- 1) < 1e-10
 
     # specify the rng
-    d = map(
-        identity,
-        DataLoader(
-            X;
-            batchsize = 2,
-            shuffle = true,
-            rng = Random.seed!(Random.default_rng(), 5),
-        ),
-    )
+    d = map(identity,
+            DataLoader(X;
+                       batchsize = 2,
+                       shuffle = true,
+                       rng = Random.seed!(Random.default_rng(), 5)))
 end
diff --git a/test/layers/basic.jl b/test/layers/basic.jl
index f3600850a8..0a3e73879e 100644
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@@ -2,18 +2,16 @@ using Test, Random
 import Flux: activations
 
 @testset "basic" begin
-    @testset "helpers" begin
-        @testset "activations" begin
-            dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x))
-            x = randn(10)
-            @test activations(dummy_model, x)[1] == x .^ 2
-            @test activations(dummy_model, x)[2] == (x .^ 2 .- 3)
-            @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3)
-
-            @test activations(Chain(), x) == ()
-            @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type
-        end
-    end
+    @testset "helpers" begin @testset "activations" begin
+        dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x))
+        x = randn(10)
+        @test activations(dummy_model, x)[1] == x .^ 2
+        @test activations(dummy_model, x)[2] == (x .^ 2 .- 3)
+        @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3)
+
+        @test activations(Chain(), x) == ()
+        @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type
+    end end
 
     @testset "Chain" begin
         @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10))
@@ -89,9 +87,10 @@ import Flux: activations
             @test Dense(10, 2, identity; init = ones)(ones(10, 1)) == 10 * ones(2, 1)
             @test Dense(10, 2, identity; init = ones)([ones(10, 1) 2 * ones(10, 1)]) ==
                   [10 20; 10 20]
-            @test Dense(10, 2, identity; init = ones, bias = false)(
-                [ones(10, 1) 2 * ones(10, 1)],
-            ) == [10 20; 10 20]
+            @test Dense(10, 2, identity; init = ones, bias = false)([ones(10, 1) 2 *
+                                                                                 ones(10,
+                                                                                      1)]) ==
+                  [10 20; 10 20]
         end
     end
 
@@ -159,9 +158,8 @@ import Flux: activations
 
         @testset "concat size" begin
             input = randn(10, 2)
-            @test size(
-                SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input),
-            ) == (10, 4)
+            @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input)) ==
+                  (10, 4)
         end
     end
 
@@ -219,9 +217,8 @@ import Flux: activations
 
         @testset "concat size" begin
             input = randn(10, 2)
-            @test size(
-                Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input),
-            ) == (10, 4)
+            @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) ==
+                  (10, 4)
             @test size(Parallel(hcat; one = Dense(10, 10), two = identity)(input)) ==
                   (10, 4)
         end
@@ -229,9 +226,8 @@ import Flux: activations
         @testset "vararg input" begin
             inputs = randn(10), randn(5), randn(4)
             @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,)
-            @test size(
-                Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs),
-            ) == (2,)
+            @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) ==
+                  (2,)
             @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3)  # wrong number of inputs
             @test Parallel(+, sin, cos)(pi / 2) ≈ 1
         end
@@ -241,16 +237,12 @@ import Flux: activations
             @test m[1] == m[:one]
             @test m[1:2] == m
 
-            @test_throws ArgumentError Parallel(
-                hcat,
-                layers = Dense(10, 10),
-                two = identity,
-            ) # reserved names
-            @test_throws ArgumentError Parallel(
-                hcat,
-                connection = Dense(10, 10),
-                two = identity,
-            )
+            @test_throws ArgumentError Parallel(hcat,
+                                                layers = Dense(10, 10),
+                                                two = identity) # reserved names
+            @test_throws ArgumentError Parallel(hcat,
+                                                connection = Dense(10, 10),
+                                                two = identity)
 
             @test m == fmap(identity, m)  # does not forget names
 
@@ -259,7 +251,7 @@ import Flux: activations
         end
 
         @testset "trivial cases" begin
-            @test Parallel(hcat) isa Parallel{typeof(hcat),Tuple{}}  # not a NamedTuple
+            @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}}  # not a NamedTuple
             @test Parallel(hcat)(1) == hcat()
             @test Parallel(hcat, inv)(2) == hcat(1 / 2)  # still calls connection once.
         end
@@ -324,7 +316,7 @@ import Flux: activations
 
         x = rand(1:vocab_size, 3, 4)
         y = m(x)
-        @test y isa Array{Float32,3}
+        @test y isa Array{Float32, 3}
         @test size(y) == (embed_size, 3, 4)
 
         @test m(2) ≈ m.weight[:, 2]
diff --git a/test/layers/conv.jl b/test/layers/conv.jl
index 51082723fb..1733b5e40b 100644
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@@ -25,15 +25,13 @@ end
 
 @testset "CNN" begin
     r = zeros(Float32, 28, 28, 1, 5)
-    m = Chain(
-        Conv((2, 2), 1 => 16, relu),
-        MaxPool((2, 2)),
-        Conv((2, 2), 16 => 8, relu),
-        MaxPool((2, 2)),
-        x -> reshape(x, :, size(x, 4)),
-        Dense(288, 10),
-        softmax,
-    )
+    m = Chain(Conv((2, 2), 1 => 16, relu),
+              MaxPool((2, 2)),
+              Conv((2, 2), 16 => 8, relu),
+              MaxPool((2, 2)),
+              x -> reshape(x, :, size(x, 4)),
+              Dense(288, 10),
+              softmax)
 
     @test size(m(r)) == (10, 5)
 
@@ -59,7 +57,7 @@ end
     op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0
     opt = Descent()
 
-    for _ = 1:(10^3)
+    for _ in 1:(10^3)
         gs = gradient(Flux.params(bias)) do
             return Flux.Losses.mse(bias(ip), op)
         end
@@ -116,7 +114,7 @@ end
     @test _channels_out(ConvTranspose((5, 6), 2 => 2; groups = 2)) == 2
 
     for Layer in [Conv, ConvTranspose]
-        for _ = 1:10
+        for _ in 1:10
             groups = rand(1:10)
             kernel_size = Tuple(rand(1:5) for _ in rand(1:3))
             cin = rand(1:5) * groups
@@ -138,7 +136,7 @@ end
     @test y_hat[2, 2] ≈ 9.0
     @test y_hat[end, 1] ≈ 4.0
     @test y_hat[1, end] ≈ 3.0
-    @test y_hat[1, end-1] ≈ 6.0
+    @test y_hat[1, end - 1] ≈ 6.0
     @test y_hat[end, end] ≈ 2.0
 end
 
@@ -206,22 +204,20 @@ end
     w = rand(Float32, 2, 2, 1, 1)
     y = CrossCor(w, [0.0])
 
-    @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol = 2e-7
+    @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7
 
     r = zeros(Float32, 28, 28, 1, 5)
-    m = Chain(
-        CrossCor((2, 2), 1 => 16, relu),
-        MaxPool((2, 2)),
-        CrossCor((2, 2), 16 => 8, relu; bias = false),
-        MaxPool((2, 2)),
-        x -> reshape(x, :, size(x, 4)),
-        Dense(288, 10),
-        softmax,
-    )
+    m = Chain(CrossCor((2, 2), 1 => 16, relu),
+              MaxPool((2, 2)),
+              CrossCor((2, 2), 16 => 8, relu; bias = false),
+              MaxPool((2, 2)),
+              x -> reshape(x, :, size(x, 4)),
+              Dense(288, 10),
+              softmax)
 
     @test size(m(r)) == (10, 5)
     @test y(x) != Conv(w, [0.0])(x)
-    @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol = 1e-7
+    @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7
 end
 
 @testset "Conv with non quadratic window #700" begin
@@ -230,17 +226,17 @@ end
 
     l = Conv((3, 3), 1 => 1)
     expected = zeros(eltype(l.weight), 5, 5, 1, 1)
-    expected[2:(end-1), 2:(end-1), 1, 1] = l.weight
+    expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight
     @test expected ≈ l(data)
 
     l = Conv((3, 1), 1 => 1)
     expected = zeros(eltype(l.weight), 5, 7, 1, 1)
-    expected[2:(end-1), 4, 1, 1] = l.weight
+    expected[2:(end - 1), 4, 1, 1] = l.weight
     @test expected ≈ l(data)
 
     l = Conv((1, 3), 1 => 1)
     expected = zeros(eltype(l.weight), 7, 5, 1, 1)
-    expected[4, 2:(end-1), 1, 1] = l.weight
+    expected[4, 2:(end - 1), 1, 1] = l.weight
     @test expected ≈ l(data)
 
     @test begin
@@ -250,9 +246,9 @@ end
     end
 end
 
-@testset "$ltype SamePad kernelsize $k" for ltype in
-                                            (Conv, ConvTranspose, DepthwiseConv, CrossCor),
-    k in ((1,), (2,), (3,), (4, 5), (6, 7, 8))
+@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv,
+                                                      CrossCor),
+                                            k in ((1,), (2,), (3,), (4, 5), (6, 7, 8))
 
     data = ones(Float32, (k .+ 3)..., 1, 1)
     l = ltype(k, 1 => 1; pad = SamePad())
@@ -264,25 +260,25 @@ end
     stride = 3
     l = ltype(k, 1 => 1; pad = SamePad(), stride = stride)
     if ltype == ConvTranspose
-        @test size(l(data))[1:(end-2)] == stride .* size(data)[1:(end-2)]
+        @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)]
     else
-        @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], stride)
+        @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride)
     end
 end
 
 @testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool),
-    k in ((1,), (2,), (3,), (4, 5), (6, 7, 8))
+                                            k in ((1,), (2,), (3,), (4, 5), (6, 7, 8))
 
     data = ones(Float32, (k .+ 3)..., 1, 1)
 
     l = ltype(k; pad = SamePad())
-    @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], k)
+    @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k)
 end
 
 @testset "bugs fixed" begin
     # https://github.com/FluxML/Flux.jl/issues/1421
     @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64}
 end
 
 @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv]
     @test fun(rand(2, 3, 4)).bias isa Vector{Float64}
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 32e99245d6..e4cfcb9d35 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -3,128 +3,122 @@ using Zygote: pullback
 
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
-@testset "Dropout" begin
-    @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
-        x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im]
-        @test x == Dropout(0.1; rng_kwargs...)(x)
-        @test x == evalwgrad(Dropout(0; rng_kwargs...), x)
-        @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x)
-
-        x = [1.0, 2.0, 3.0]
-        @test x == Dropout(0.1; rng_kwargs...)(x)
-        @test x == evalwgrad(Dropout(0; rng_kwargs...), x)
-        @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x)
-
-        x = rand(100)
-        m = Dropout(0.9; rng_kwargs...)
-        y = evalwgrad(m, x)
-        @test count(a -> a == 0, y) > 50
-        testmode!(m, true)
-        y = evalwgrad(m, x) # should override istraining
-        @test count(a -> a == 0, y) == 0
-        testmode!(m, false)
-        y = evalwgrad(m, x)
-        @test count(a -> a == 0, y) > 50
-
-        x = rand(Float32, 100)
-        m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...))
-        y = evalwgrad(m, x)
-        @test count(a -> a == 0, y) > 50
-        testmode!(m, true)
-        y = evalwgrad(m, x) # should override istraining
-        @test count(a -> a == 0, y) == 0
-
-        x = rand(100, 50)
-        m = Dropout(0.5; dims = 2, rng_kwargs...)
-        y = m(x)
-        c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
-        @test minimum(c) == maximum(c)
-        m = Dropout(0.5; dims = 1, rng_kwargs...)
-        y = m(x)
-        c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50)
-        @test minimum(c) == maximum(c)
-
-        # issue #1084
-        m = Dropout(0.9; rng_kwargs...)
-        x = rand(100)
-
-        testmode!(m)
-        y = m(x)
-        @test count(a -> a == 0, y) == 0
-        trainmode!(m)
-        y = m(x)
-        @test count(a -> a == 0, y) > 50
-
-        y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true)
-        @test count(a -> a == 0, y) > 50
-
-        y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false)
-        @test count(a -> a == 0, y) == 0
-
-        # CPU RNGs map onto CPU ok
-        if isempty(rng_kwargs)
-            if VERSION >= v"1.7"
-                @test cpu(m).rng isa Random.TaskLocalRNG
-            else
-                @test cpu(m).rng isa Random._GLOBAL_RNG
-            end
+@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
+    x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im]
+    @test x == Dropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(Dropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x)
+
+    x = [1.0, 2.0, 3.0]
+    @test x == Dropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(Dropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x)
+
+    x = rand(100)
+    m = Dropout(0.9; rng_kwargs...)
+    y = evalwgrad(m, x)
+    @test count(a -> a == 0, y) > 50
+    testmode!(m, true)
+    y = evalwgrad(m, x) # should override istraining
+    @test count(a -> a == 0, y) == 0
+    testmode!(m, false)
+    y = evalwgrad(m, x)
+    @test count(a -> a == 0, y) > 50
+
+    x = rand(Float32, 100)
+    m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...))
+    y = evalwgrad(m, x)
+    @test count(a -> a == 0, y) > 50
+    testmode!(m, true)
+    y = evalwgrad(m, x) # should override istraining
+    @test count(a -> a == 0, y) == 0
+
+    x = rand(100, 50)
+    m = Dropout(0.5; dims = 2, rng_kwargs...)
+    y = m(x)
+    c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
+    @test minimum(c) == maximum(c)
+    m = Dropout(0.5; dims = 1, rng_kwargs...)
+    y = m(x)
+    c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50)
+    @test minimum(c) == maximum(c)
+
+    # issue #1084
+    m = Dropout(0.9; rng_kwargs...)
+    x = rand(100)
+
+    testmode!(m)
+    y = m(x)
+    @test count(a -> a == 0, y) == 0
+    trainmode!(m)
+    y = m(x)
+    @test count(a -> a == 0, y) > 50
+
+    y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true)
+    @test count(a -> a == 0, y) > 50
+
+    y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false)
+    @test count(a -> a == 0, y) == 0
+
+    # CPU RNGs map onto CPU ok
+    if isempty(rng_kwargs)
+        if VERSION >= v"1.7"
+            @test cpu(m).rng isa Random.TaskLocalRNG
         else
-            @test cpu(m).rng === only(values(rng_kwargs))
+            @test cpu(m).rng isa Random._GLOBAL_RNG
         end
+    else
+        @test cpu(m).rng === only(values(rng_kwargs))
+    end
+end end
+
+@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
+    x = [1.0, 2.0, 3.0]
+    @test x == AlphaDropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x)
+
+    x = randn(1000) # large enough to prevent flaky test
+    m = AlphaDropout(0.5; rng_kwargs...)
+
+    y = evalwgrad(m, x)
+    # Should preserve unit mean and variance
+    @test mean(y)≈0 atol=0.2
+    @test var(y)≈1 atol=0.2
+
+    testmode!(m, true) # should override istraining
+    @test evalwgrad(m, x) == x
+
+    testmode!(m, false)
+    y = evalwgrad(m, x)
+    @test mean(y)≈0 atol=0.2
+    @test var(y)≈1 atol=0.2
+
+    # Known good value ranges
+    # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
+    x = ones(100)
+    if isempty(rng_kwargs)
+        @test 40 < sum(evalwgrad(m, x)) < 130
+    else
+        # FIXME: this breaks spuriously for MersenneTwister
+        @test_skip 40 < sum(evalwgrad(m, x)) < 130
     end
-end
-
-@testset "AlphaDropout" begin
-    @testset for rng_kwargs in ((), (; rng = MersenneTwister()))
-        x = [1.0, 2.0, 3.0]
-        @test x == AlphaDropout(0.1; rng_kwargs...)(x)
-        @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x)
-        @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x)
-
-        x = randn(1000) # large enough to prevent flaky test
-        m = AlphaDropout(0.5; rng_kwargs...)
-
-        y = evalwgrad(m, x)
-        # Should preserve unit mean and variance
-        @test mean(y) ≈ 0 atol = 0.2
-        @test var(y) ≈ 1 atol = 0.2
-
-        testmode!(m, true) # should override istraining
-        @test evalwgrad(m, x) == x
-
-        testmode!(m, false)
-        y = evalwgrad(m, x)
-        @test mean(y) ≈ 0 atol = 0.2
-        @test var(y) ≈ 1 atol = 0.2
-
-        # Known good value ranges
-        # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
-        x = ones(100)
-        if isempty(rng_kwargs)
-            @test 40 < sum(evalwgrad(m, x)) < 130
-        else
-            # FIXME: this breaks spuriously for MersenneTwister
-            @test_skip 40 < sum(evalwgrad(m, x)) < 130
-        end
 
-        # CPU RNGs map onto CPU ok
-        if isempty(rng_kwargs)
-            if VERSION >= v"1.7"
-                @test cpu(m).rng isa Random.TaskLocalRNG
-            else
-                @test cpu(m).rng isa Random._GLOBAL_RNG
-            end
+    # CPU RNGs map onto CPU ok
+    if isempty(rng_kwargs)
+        if VERSION >= v"1.7"
+            @test cpu(m).rng isa Random.TaskLocalRNG
         else
-            @test cpu(m).rng === only(values(rng_kwargs))
+            @test cpu(m).rng isa Random._GLOBAL_RNG
         end
+    else
+        @test cpu(m).rng === only(values(rng_kwargs))
     end
-end
+end end
 
 @testset "BatchNorm" begin
-    let m = BatchNorm(2), x = [
-            1.0 3.0 5.0
-            2.0 4.0 6.0
-        ]
+    let m = BatchNorm(2), x = [1.0 3.0 5.0
+                               2.0 4.0 6.0]
         @test Flux.hasaffine(m) == true
         @test length(Flux.params(m)) == 2
 
@@ -167,10 +161,8 @@ end
     end
 
     # with activation function
-    let m = BatchNorm(2, sigmoid), x = [
-            1.0 3.0 5.0
-            2.0 4.0 6.0
-        ]
+    let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0
+                                        2.0 4.0 6.0]
         y = m(x)
         @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
         @inferred m(x)
@@ -243,16 +235,15 @@ end
         # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8
         N = ndims(x)
         @test m.μ ≈ [0.5, 0.8]
-        n = prod(size(x, i) for i = 1:(N-2))
+        n = prod(size(x, i) for i in 1:(N - 2))
         corr = n / (n - 1)
-        σ² = var(x; dims = 1:(N-2), corrected = false)
+        σ² = var(x; dims = 1:(N - 2), corrected = false)
         @test m.σ² ≈ 0.1 * corr * vec(mean(σ²; dims = N)) .+ 0.9 * 1
 
         y = m(x)
         @test length(m.μ) == 2
         @test length(m.σ²) == 2
-        @test y ≈ (x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol =
-            1.0e-5
+        @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5
 
         @inferred m(x)
     end
@@ -270,7 +261,7 @@ end
         y = m(x) # inference time after a training step
         μ = reshape(m.μ, affine_shape...)
         σ² = reshape(m.σ², affine_shape...)
-        @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7
+        @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7
 
         @inferred m(x)
     end
@@ -286,7 +277,7 @@ end
         y = m(x)
         μ = mean(x; dims = 1)
         σ² = var(x; dims = 1, corrected = false)
-        @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7
+        @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7
 
         @inferred m(x)
     end
@@ -302,7 +293,7 @@ end
         y = m(x)
         μ = mean(x; dims = 1)
         σ² = var(x; dims = 1, corrected = false)
-        @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7
+        @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7
 
         @inferred m(x)
     end
@@ -324,8 +315,8 @@ end
         x = reshape(Float32.(collect(1:prod(sizes))), sizes)
 
         y = evalwgrad(m, x)
-        @test size(m.μ) == (sizes[end-1],)
-        @test size(m.σ²) == (sizes[end-1],)
+        @test size(m.μ) == (sizes[end - 1],)
+        @test size(m.σ²) == (sizes[end - 1],)
         @test size(y) == sizes
 
         @inferred m(x)
@@ -337,7 +328,8 @@ end
         sizes = (5, 5, 3, 4, 2, 6),
         x = reshape(Float32.(collect(1:prod(sizes))), sizes)
 
-        @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:(end-2)]..., :, 1))), sizes)
+        @test m_inorm(x) ==
+              reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes)
     end
 
     let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1)
@@ -424,7 +416,7 @@ end
 
         y = m(x)
         out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5)
-        @test y ≈ reshape(out, size(x)) atol = 1.0e-5
+        @test y≈reshape(out, size(x)) atol=1.0e-5
     end
     # with activation function
     let m = GroupNorm(4, 2, sigmoid; track_stats = true),
@@ -433,11 +425,11 @@ end
 
         x = Float32.(x)
         μ_affine_shape = ones(Int, length(sizes) + 1)
-        μ_affine_shape[end-1] = 2 # Number of groups
+        μ_affine_shape[end - 1] = 2 # Number of groups
 
         affine_shape = ones(Int, length(sizes) + 1)
-        affine_shape[end-2] = 2 # Channels per group
-        affine_shape[end-1] = 2 # Number of groups
+        affine_shape[end - 2] = 2 # Channels per group
+        affine_shape[end - 1] = 2 # Number of groups
         affine_shape[1] = sizes[1]
         affine_shape[end] = sizes[end]
 
@@ -445,14 +437,10 @@ end
 
         y = m(x)
         x_ = reshape(x, affine_shape...)
-        out = reshape(
-            sigmoid.(
-                (x_ .- reshape(m.μ, μ_affine_shape...)) ./
-                sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)
-            ),
-            og_shape,
-        )
-        @test y ≈ out atol = 1e-7
+        out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./
+                               sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)),
+                      og_shape)
+        @test y≈out atol=1e-7
     end
 
     let m = trainmode!(GroupNorm(2, 2; track_stats = true)),
diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl
index 225c4d83a9..2ca108ba6b 100644
--- a/test/layers/recurrent.jl
+++ b/test/layers/recurrent.jl
@@ -2,7 +2,7 @@ using LinearAlgebra
 
 # Ref FluxML/Flux.jl#1209 1D input
 @testset "BPTT-1D" begin
-    seq = [rand(Float32, 2) for i = 1:3]
+    seq = [rand(Float32, 2) for i in 1:3]
     for r in [RNN]
         rnn = r(2 => 3)
         Flux.reset!(rnn)
@@ -10,29 +10,22 @@ using LinearAlgebra
             return sum([rnn(s) for s in seq][3])
         end
         Flux.reset!(rnn)
-        bptt = gradient(
-            Wh -> sum(
-                tanh.(
-                    rnn.cell.Wi * seq[3] +
-                    Wh *
-                    tanh.(
-                        rnn.cell.Wi * seq[2] +
-                        Wh *
-                        tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) +
-                        rnn.cell.b
-                    ) +
-                    rnn.cell.b
-                ),
-            ),
-            rnn.cell.Wh,
-        )
+        bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] +
+                                        Wh *
+                                        tanh.(rnn.cell.Wi * seq[2] +
+                                              Wh *
+                                              tanh.(rnn.cell.Wi * seq[1] +
+                                                    Wh * rnn.cell.state0 + rnn.cell.b) +
+                                              rnn.cell.b) +
+                                        rnn.cell.b)),
+                        rnn.cell.Wh)
         @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
     end
 end
 
 # Ref FluxML/Flux.jl#1209 2D input
 @testset "BPTT-2D" begin
-    seq = [rand(Float32, (2, 1)) for i = 1:3]
+    seq = [rand(Float32, (2, 1)) for i in 1:3]
     for r in [RNN]
         rnn = r(2 => 3)
         Flux.reset!(rnn)
@@ -40,22 +33,15 @@ end
             return sum([rnn(s) for s in seq][3])
         end
         Flux.reset!(rnn)
-        bptt = gradient(
-            Wh -> sum(
-                tanh.(
-                    rnn.cell.Wi * seq[3] +
-                    Wh *
-                    tanh.(
-                        rnn.cell.Wi * seq[2] +
-                        Wh *
-                        tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) +
-                        rnn.cell.b
-                    ) +
-                    rnn.cell.b
-                ),
-            ),
-            rnn.cell.Wh,
-        )
+        bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] +
+                                        Wh *
+                                        tanh.(rnn.cell.Wi * seq[2] +
+                                              Wh *
+                                              tanh.(rnn.cell.Wi * seq[1] +
+                                                    Wh * rnn.cell.state0 + rnn.cell.b) +
+                                              rnn.cell.b) +
+                                        rnn.cell.b)),
+                        rnn.cell.Wh)
         @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
     end
 end
@@ -80,34 +66,30 @@ end
     @test grads_seq[rnn.cell.Wh] ≈ bptt[1]
 end
 
-@testset "RNN-shapes" begin
-    @testset for R in [RNN, GRU, LSTM, GRUv3]
-        m1 = R(3 => 5)
-        m2 = R(3 => 5)
-        m3 = R(3, 5)  # leave one to test the silently deprecated "," not "=>" notation
-        x1 = rand(Float32, 3)
-        x2 = rand(Float32, 3, 1)
-        x3 = rand(Float32, 3, 1, 2)
-        Flux.reset!(m1)
-        Flux.reset!(m2)
-        Flux.reset!(m3)
-        @test size(m1(x1)) == (5,)
-        @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape
-        @test size(m2(x2)) == (5, 1)
-        @test size(m2(x2)) == (5, 1)
-        @test size(m3(x3)) == (5, 1, 2)
-        @test size(m3(x3)) == (5, 1, 2)
-    end
-end
+@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3]
+    m1 = R(3 => 5)
+    m2 = R(3 => 5)
+    m3 = R(3, 5)  # leave one to test the silently deprecated "," not "=>" notation
+    x1 = rand(Float32, 3)
+    x2 = rand(Float32, 3, 1)
+    x3 = rand(Float32, 3, 1, 2)
+    Flux.reset!(m1)
+    Flux.reset!(m2)
+    Flux.reset!(m3)
+    @test size(m1(x1)) == (5,)
+    @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape
+    @test size(m2(x2)) == (5, 1)
+    @test size(m2(x2)) == (5, 1)
+    @test size(m3(x3)) == (5, 1, 2)
+    @test size(m3(x3)) == (5, 1, 2)
+end end
 
-@testset "RNN-input-state-eltypes" begin
-    @testset for R in [RNN, GRU, LSTM, GRUv3]
-        m = R(3 => 5)
-        x = rand(Float64, 3, 1)
-        Flux.reset!(m)
-        @test_throws MethodError m(x)
-    end
-end
+@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3]
+    m = R(3 => 5)
+    x = rand(Float64, 3, 1)
+    Flux.reset!(m)
+    @test_throws MethodError m(x)
+end end
 
 @testset "multigate" begin
     x = rand(6, 5)
@@ -123,19 +105,17 @@ end
     x = rand(3, 3, 1, 2, 4)
     @test length(Flux.eachlastdim(x)) == size(x, ndims(x))
     @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims = ndims(x)))
-    slicedim = (size(x)[1:(end-1)]..., 1)
+    slicedim = (size(x)[1:(end - 1)]..., 1)
     res, (dx,) = Flux.withgradient(x) do x
         x1, _, x3, _ = Flux.eachlastdim(x)
         return sum(x1) + sum(x3 .* 3)
     end
     @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3))
-    @test dx ≈ cat(
-        fill(1, slicedim),
-        fill(0, slicedim),
-        fill(3, slicedim),
-        fill(0, slicedim);
-        dims = ndims(x),
-    )
+    @test dx ≈ cat(fill(1, slicedim),
+                   fill(0, slicedim),
+                   fill(3, slicedim),
+                   fill(0, slicedim);
+                   dims = ndims(x))
 end
 
 @testset "∇eachlastdim" begin
@@ -147,44 +127,34 @@ end
     NoTangent = Flux.Zygote.NoTangent
     abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()]
     @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x))
-    x2 = rand(Float64, x_size[1:(end-1)])
-    x3 = rand(Float64, x_size[1:(end-1)])
+    x2 = rand(Float64, x_size[1:(end - 1)])
+    x3 = rand(Float64, x_size[1:(end - 1)])
     mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()]
     @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈
-          cat(zeros(x_size[1:(end-1)]), x2, x3, zeros(x_size[1:(end-1)]); dims = ndims(x))
+          cat(zeros(x_size[1:(end - 1)]), x2, x3, zeros(x_size[1:(end - 1)]);
+              dims = ndims(x))
 end
 
 @testset "Different Internal Matrix Types" begin
-    R = Flux.Recur(
-        Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)),
-    )
+    R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5),
+                                rand(5, 1)))
     # don't want to pull in SparseArrays just for this test, but there aren't any
     # non-square structured matrix types in LinearAlgebra. so we will use a different
     # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the
     # same type.
-    L = Flux.Recur(
-        Flux.LSTMCell(
-            rand(5 * 4, 3),
-            rand(1:20, 5 * 4, 5),
-            rand(5 * 4),
-            (rand(5, 1), rand(5, 1)),
-        ),
-    )
-    G = Flux.Recur(
-        Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), rand(5, 1)),
-    )
-    G3 = Flux.Recur(
-        Flux.GRUv3Cell(
-            rand(5 * 3, 3),
-            rand(1:20, 5 * 2, 5),
-            rand(5 * 3),
-            Tridiagonal(rand(5, 5)),
-            rand(5, 1),
-        ),
-    )
+    L = Flux.Recur(Flux.LSTMCell(rand(5 * 4, 3),
+                                 rand(1:20, 5 * 4, 5),
+                                 rand(5 * 4),
+                                 (rand(5, 1), rand(5, 1))))
+    G = Flux.Recur(Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3),
+                                rand(5, 1)))
+    G3 = Flux.Recur(Flux.GRUv3Cell(rand(5 * 3, 3),
+                                   rand(1:20, 5 * 2, 5),
+                                   rand(5 * 3),
+                                   Tridiagonal(rand(5, 5)),
+                                   rand(5, 1)))
 
     for m in [R, L, G, G3]
-
         x1 = rand(3)
         x2 = rand(3, 1)
         x3 = rand(3, 1, 2)
diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl
index 66831d3d68..c4e1c30341 100644
--- a/test/layers/upsample.jl
+++ b/test/layers/upsample.jl
@@ -2,19 +2,19 @@
     m = Upsample(:bilinear; scale = (2, 3))
     x = rand(Float32, 3, 4, 2, 3)
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (6, 12, 2, 3)
 
     m = Upsample(:bilinear; scale = 3)
     x = rand(Float32, 3, 4, 2, 3)
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (9, 12, 2, 3)
 
     m = Upsample(:bilinear; size = (4, 6))
     x = rand(Float32, 3, 4, 2, 3)
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (4, 6, 2, 3)
 end
 
@@ -22,19 +22,19 @@ end
     m = Upsample(:trilinear; scale = (2, 3, 2))
     x = rand(Float32, 3, 4, 2, 3, 4)
     y = m(x)
-    @test y isa Array{Float32,5}
+    @test y isa Array{Float32, 5}
     @test size(y) == (6, 12, 4, 3, 4)
 
     m = Upsample(:trilinear; scale = 3)
     x = rand(Float32, 3, 4, 2, 3, 4)
     y = m(x)
-    @test y isa Array{Float32,5}
+    @test y isa Array{Float32, 5}
     @test size(y) == (9, 12, 6, 3, 4)
 
     m = Upsample(:trilinear; size = (4, 6, 4))
     x = rand(Float32, 3, 4, 2, 3, 4)
     y = m(x)
-    @test y isa Array{Float32,5}
+    @test y isa Array{Float32, 5}
     @test size(y) == (4, 6, 4, 3, 4)
 end
 
@@ -42,24 +42,24 @@ end
     x = rand(Float32, 3, 2, 3)
     m = Upsample(:nearest; scale = (2,))
     y = m(x)
-    @test y isa Array{Float32,3}
+    @test y isa Array{Float32, 3}
     @test size(y) == (6, 2, 3)
 
     x = rand(Float32, 3, 4, 2, 3)
 
     m = Upsample(:nearest; scale = (2, 3))
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (6, 12, 2, 3)
 
     m = Upsample(:nearest; scale = (2,))
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (6, 4, 2, 3)
 
     m = Upsample(:nearest; scale = 2)
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (6, 8, 2, 3)
 
     m = Upsample(2)
@@ -68,7 +68,7 @@ end
 
     m = Upsample(:nearest; size = (6, 8))
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (6, 8, 2, 3)
 end
 
@@ -76,12 +76,12 @@ end
     m = PixelShuffle(2)
     x = rand(Float32, 3, 18, 3)
     y = m(x)
-    @test y isa Array{Float32,3}
+    @test y isa Array{Float32, 3}
     @test size(y) == (6, 9, 3)
 
     m = PixelShuffle(3)
     x = rand(Float32, 3, 4, 18, 3)
     y = m(x)
-    @test y isa Array{Float32,4}
+    @test y isa Array{Float32, 4}
     @test size(y) == (9, 12, 2, 3)
 end
diff --git a/test/losses.jl b/test/losses.jl
index 7984941c78..a8a41bdf43 100644
--- a/test/losses.jl
+++ b/test/losses.jl
@@ -2,12 +2,12 @@ using Test
 using Flux: onehotbatch, σ
 
 using Flux.Losses:
-    mse,
-    label_smoothing,
-    crossentropy,
-    logitcrossentropy,
-    binarycrossentropy,
-    logitbinarycrossentropy
+                   mse,
+                   label_smoothing,
+                   crossentropy,
+                   logitcrossentropy,
+                   binarycrossentropy,
+                   logitbinarycrossentropy
 using Flux.Losses: xlogx, xlogy
 
 # group here all losses, used in tests
@@ -58,19 +58,13 @@ y = [1, 1, 0, 0]
     @test mse(0 + 0im, 1 + 1im) == 2
 end
 
-@testset "mae" begin
-    @test Flux.mae(ŷ, y) ≈ 1 / 2
-end
+@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end
 
-@testset "huber_loss" begin
-    @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002
-end
+@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end
 
 y = [123.0, 456.0, 789.0]
 ŷ = [345.0, 332.0, 789.0]
-@testset "msle" begin
-    @test Flux.msle(ŷ, y) ≈ 0.38813985859136585
-end
+@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end
 
 # Now onehot y's
 y = onehotbatch([1, 1, 0, 0], 0:1)
@@ -124,10 +118,8 @@ yls = y .* (1 - 2sf) .+ sf
           -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ))
     @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈
           mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ)))
-    @test binarycrossentropy(σ.(logŷ), y) ≈ mean(
-        -y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) -
-        (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))),
-    )
+    @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) -
+               (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))))
     @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9])  # constant label
 end
 
@@ -191,94 +183,68 @@ end
     @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075
 end
 
-@testset "no spurious promotions" begin
-    for T in (Float32, Float64)
-        y = rand(T, 2)
-        ŷ = rand(T, 2)
-        for f in ALL_LOSSES
-            fwd, back = Flux.pullback(f, ŷ, y)
-            @test fwd isa T
-            @test eltype(back(one(T))[1]) == T
-        end
+@testset "no spurious promotions" begin for T in (Float32, Float64)
+    y = rand(T, 2)
+    ŷ = rand(T, 2)
+    for f in ALL_LOSSES
+        fwd, back = Flux.pullback(f, ŷ, y)
+        @test fwd isa T
+        @test eltype(back(one(T))[1]) == T
     end
-end
+end end
 
 @testset "binary_focal_loss" begin
-    y = [
-        0 1 0
-        1 0 1
-    ]
-    ŷ = [
-        0.268941 0.5 0.268941
-        0.731059 0.5 0.731059
-    ]
-
-    y1 = [
-        1 0
-        0 1
-    ]
-    ŷ1 = [
-        0.6 0.3
-        0.4 0.7
-    ]
+    y = [0 1 0
+         1 0 1]
+    ŷ = [0.268941 0.5 0.268941
+         0.731059 0.5 0.731059]
+
+    y1 = [1 0
+          0 1]
+    ŷ1 = [0.6 0.3
+          0.4 0.7]
     @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385
     @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222
     @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y)
 end
 
 @testset "focal_loss" begin
-    y = [
-        1 0 0 0 1
-        0 1 0 1 0
-        0 0 1 0 0
-    ]
+    y = [1 0 0 0 1
+         0 1 0 1 0
+         0 0 1 0 0]
     ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0)
-    y1 = [
-        1 0
-        0 0
-        0 1
-    ]
-    ŷ1 = [
-        0.4 0.2
-        0.5 0.5
-        0.1 0.3
-    ]
+    y1 = [1 0
+          0 0
+          0 1]
+    ŷ1 = [0.4 0.2
+          0.5 0.5
+          0.1 0.3]
     @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628
     @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157
     @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y)
 end
 
 @testset "siamese_contrastive_loss" begin
-    y = [
-        1 0
-        0 0
-        0 1
-    ]
-    ŷ = [
-        0.4 0.2
-        0.5 0.5
-        0.1 0.3
-    ]
-    y1 = [
-        1 0 0 0 1
-        0 1 0 1 0
-        0 0 1 0 0
-    ]
+    y = [1 0
+         0 0
+         0 1]
+    ŷ = [0.4 0.2
+         0.5 0.5
+         0.1 0.3]
+    y1 = [1 0 0 0 1
+          0 1 0 1 0
+          0 0 1 0 0]
     ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0)
-    y2 = [
-        1
-        0
-        0
-        1
-        1
-    ]
-    ŷ2 = [
-        0.6
-        0.4
-        0.1
-        0.2
-        0.7
-    ]
+    y2 = [1
+          0
+          0
+          1
+          1]
+    ŷ2 = [0.6
+          0.4
+          0.1
+          0.2
+          0.7]
     @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333
     @test Flux.siamese_contrastive_loss(ŷ, y; margin = 0.5f0) ≈ 0.10000000000000002
     @test Flux.siamese_contrastive_loss(ŷ, y; margin = 1.5f0) ≈ 0.5333333333333333
@@ -293,14 +259,10 @@ end
     @test Flux.siamese_contrastive_loss(ŷ1, y1; margin = 0) ≈ 0.13161165f0
     @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005
     @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003
-    @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(
-        ŷ1,
-        y1,
-        margin = -0.5,
-    )
-    @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(
-        ŷ,
-        y,
-        margin = -1,
-    )
+    @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1,
+                                                                                                y1,
+                                                                                                margin = -0.5)
+    @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ,
+                                                                                              y,
+                                                                                              margin = -1)
 end
diff --git a/test/optimise.jl b/test/optimise.jl
index 9f9f788c01..6b7df97baf 100644
--- a/test/optimise.jl
+++ b/test/optimise.jl
@@ -30,7 +30,7 @@ using Random
         w′ = randn(10, 10)
         b = false
         loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b)
-        for t = 1:(10^5)
+        for t in 1:(10^5)
             θ = params([w′, b])
             x = rand(10)
             θ̄ = gradient(() -> loss(x), θ)
@@ -48,7 +48,7 @@ end
         w′ = randn(10, 10)
         loss(x) = Flux.Losses.mse(w * x, w′ * x)
         opt = Optimiser(Opt(), Adam(0.001))
-        for t = 1:(10^5)
+        for t in 1:(10^5)
             θ = Params([w′])
             x = rand(10)
             θ̄ = gradient(() -> loss(x), θ)
@@ -61,32 +61,26 @@ end
 @testset "Training Loop" begin
     i = 0
     l = 1
-    Flux.train!(
-        () -> (sleep(0.1); Flux.skip(); i += 1),
-        Params([]),
-        Iterators.repeated((), 10),
-        Descent(),
-    )
+    Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1),
+                Params([]),
+                Iterators.repeated((), 10),
+                Descent())
 
     @test i == 0 #all skipped
 
-    Flux.train!(
-        () -> (sleep(0.1); i == 8 && Flux.skip(); i += 1),
-        Params([]),
-        Iterators.repeated((), 10),
-        Descent(),
-    )
+    Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1),
+                Params([]),
+                Iterators.repeated((), 10),
+                Descent())
 
     @test i == 8 #skip after i hit 8
 
     i = 0
-    Flux.train!(
-        () -> (sleep(0.1); i += 1; l),
-        Params([]),
-        Iterators.repeated((), 100),
-        Descent();
-        cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1),
-    )
+    Flux.train!(() -> (sleep(0.1); i += 1; l),
+                Params([]),
+                Iterators.repeated((), 100),
+                Descent();
+                cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1))
 
     @test 3 < i < 50
 
@@ -128,7 +122,7 @@ end
     loss(x) = Flux.Losses.mse(w * x, w1 * x)
     flag = 1
     decay_steps = []
-    for t = 1:(10^5)
+    for t in 1:(10^5)
         prev_eta = o.eta
         θ = Params([w1])
         x = rand(10)
@@ -148,7 +142,7 @@ end
     @test flag == 1
     # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1).
     ground_truth = []
-    for i = 1:4
+    for i in 1:4
         push!(ground_truth, 1000 * i)  # Expected decay steps for this example.
     end
     @test decay_steps == ground_truth
@@ -223,31 +217,30 @@ end
 # wreaks all sorts of havoc on our training loops.  This test ensures that
 # a simple optimization is montonically decreasing (up to learning step effects)
 @testset "Momentum Optimisers and complex values" begin
-    # Test every optimizer that has momentum internally
-    for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief]
-        # Our "model" is just a complex number
-        w = zeros(ComplexF32, 1)
-
-        # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x`
-        function loss()
-            # Deterministic training data is the best training data
-            x = ones(1, 1) + 1im * ones(1, 1)
-
-            # Manually implement `mse()` to allow demonstration of brokenness
-            # on older Flux builds that don't have a fixed `mse()`
-            return sum(abs2.(w * x .- conj(x)))
-        end
+# Test every optimizer that has momentum internally
+for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief]
+    # Our "model" is just a complex number
+    w = zeros(ComplexF32, 1)
+
+    # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x`
+    function loss()
+        # Deterministic training data is the best training data
+        x = ones(1, 1) + 1im * ones(1, 1)
+
+        # Manually implement `mse()` to allow demonstration of brokenness
+        # on older Flux builds that don't have a fixed `mse()`
+        return sum(abs2.(w * x .- conj(x)))
+    end
 
-        params = Flux.Params([w])
-        opt = opt_ctor(1e-2)
+    params = Flux.Params([w])
+    opt = opt_ctor(1e-2)
 
-        # Train for 10 iterations, enforcing that loss is monotonically decreasing
-        last_loss = Inf
-        for idx = 1:10
-            grads = Flux.gradient(loss, params)
-            @test loss() < last_loss
-            last_loss = loss()
-            Flux.update!(opt, params, grads)
-        end
+    # Train for 10 iterations, enforcing that loss is monotonically decreasing
+    last_loss = Inf
+    for idx in 1:10
+        grads = Flux.gradient(loss, params)
+        @test loss() < last_loss
+        last_loss = loss()
+        Flux.update!(opt, params, grads)
     end
-end
+end end
diff --git a/test/outputsize.jl b/test/outputsize.jl
index 64eda2af31..2d2baceece 100644
--- a/test/outputsize.jl
+++ b/test/outputsize.jl
@@ -3,7 +3,7 @@
     @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1)
 
     m = Dense(10, 5)
-    @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1)
+    @test_throws DimensionMismatch outputsize(m, (5, 2))  # Dense(10, 5) given 5 input rows
     @test outputsize(m, (10,); padbatch = true) == (5, 1)
 
     m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2))
@@ -55,33 +55,31 @@ end
     @test outputsize(m, (2, 7), (3, 7)) == (13, 7)
 end
 
-@testset "activations" begin
-    @testset for f in [
-        celu,
-        elu,
-        gelu,
-        hardsigmoid,
-        hardtanh,
-        leakyrelu,
-        lisht,
-        logcosh,
-        logσ,
-        mish,
-        relu,
-        relu6,
-        rrelu,
-        selu,
-        σ,
-        softplus,
-        softshrink,
-        softsign,
-        swish,
-        tanhshrink,
-        trelu,
-    ]
-        @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1)
-    end
-end
+@testset "activations" begin @testset for f in [
+    celu,
+    elu,
+    gelu,
+    hardsigmoid,
+    hardtanh,
+    leakyrelu,
+    lisht,
+    logcosh,
+    logσ,
+    mish,
+    relu,
+    relu6,
+    rrelu,
+    selu,
+    σ,
+    softplus,
+    softshrink,
+    softsign,
+    swish,
+    tanhshrink,
+    trelu,
+]
+    @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1)
+end end
 
 @testset "conv" begin
     m = Conv((3, 3), 3 => 16)
diff --git a/test/runtests.jl b/test/runtests.jl
index 2a1b2913ca..4189ea0dd5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,18 +10,12 @@ using CUDA
 
 Random.seed!(0)
 
-@testset verbose = true "Flux.jl" begin
-    @testset "Utils" begin
-        include("utils.jl")
-    end
+@testset verbose=true "Flux.jl" begin
+    @testset "Utils" begin include("utils.jl") end
 
-    @testset "Optimise" begin
-        include("optimise.jl")
-    end
+    @testset "Optimise" begin include("optimise.jl") end
 
-    @testset "Data" begin
-        include("data.jl")
-    end
+    @testset "Data" begin include("data.jl") end
 
     @testset "Losses" begin
         include("losses.jl")
@@ -44,13 +38,11 @@ Random.seed!(0)
         include("outputsize.jl")
     end
 
-    @testset "CUDA" begin
-        if CUDA.functional()
-            include("cuda/runtests.jl")
-        else
-            @warn "CUDA unavailable, not testing GPU support"
-        end
-    end
+    @testset "CUDA" begin if CUDA.functional()
+        include("cuda/runtests.jl")
+    else
+        @warn "CUDA unavailable, not testing GPU support"
+    end end
 
     @static if VERSION == v"1.6"
         using Documenter
diff --git a/test/utils.jl b/test/utils.jl
index 7da452ba02..6d2cb855e8 100644
--- a/test/utils.jl
+++ b/test/utils.jl
@@ -1,22 +1,22 @@
 using Flux
 using Flux:
-    throttle,
-    nfan,
-    glorot_uniform,
-    glorot_normal,
-    kaiming_normal,
-    kaiming_uniform,
-    orthogonal,
-    truncated_normal,
-    sparse_init,
-    identity_init,
-    unstack,
-    batch,
-    unbatch,
-    unsqueeze,
-    params,
-    loadparams!,
-    loadmodel!
+            throttle,
+            nfan,
+            glorot_uniform,
+            glorot_normal,
+            kaiming_normal,
+            kaiming_uniform,
+            orthogonal,
+            truncated_normal,
+            sparse_init,
+            identity_init,
+            unstack,
+            batch,
+            unbatch,
+            unsqueeze,
+            params,
+            loadparams!,
+            loadmodel!
 using MLUtils
 using StatsBase: var, std
 using Statistics, LinearAlgebra
@@ -105,7 +105,7 @@ end
         end
         @test size(init(3, 4)) == (3, 4)
         # only init(size...) is accepted:
-        @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5)
+        @test_throws MethodError size(init((3, 4, 5)))
 
         # rng, and currying:
         @test size(init(MersenneTwister(1), 3, 4)) == (3, 4)
@@ -180,8 +180,8 @@ end
         for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)]
             expected_zeros = ceil(Integer, n_in * sparsity)
             v = sparse_init(n_in, n_out; sparsity = sparsity, std = σ)
-            @test all([sum(v[:, col] .== 0) == expected_zeros for col = 1:n_out])
-            @test 0.9 * σ < std(v[v.!=0]) < 1.1 * σ
+            @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out])
+            @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ
         end
 
         @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32
@@ -189,9 +189,9 @@ end
 
     @testset "truncated_normal" begin
         m = truncated_normal(100, 100)
-        @test minimum(m) ≈ -2 atol = 0.05  # default arguments
-        @test maximum(m) ≈ 2 atol = 0.05
-        @test mean(m) ≈ 0 atol = 0.1
+        @test minimum(m)≈-2 atol=0.05  # default arguments
+        @test maximum(m)≈2 atol=0.05
+        @test mean(m)≈0 atol=0.1
 
         size100 = (100, 100, 100)
         for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)]
@@ -241,12 +241,15 @@ end
             indata = reshape(collect(Float32, 1:9), 3, 3)
             @test l(indata) == indata
         end
-        @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (
-                Conv,
-                ConvTranspose,
-                CrossCor,
-            ),
-            kernelsize in ((1,), (3,), (1, 3), (3, 5), (3, 5, 7))
+        @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv,
+                                                                               ConvTranspose,
+                                                                               CrossCor),
+                                                                     kernelsize in ((1,),
+                                                                                    (3,),
+                                                                                    (1, 3),
+                                                                                    (3, 5),
+                                                                                    (3, 5,
+                                                                                     7))
 
             nch = 3
             l = layer(kernelsize, nch => nch; init = identity_init, pad = SamePad())
@@ -257,18 +260,14 @@ end
         @testset "Inception identity" begin
             insize = 7
             path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad())
-            path2 = Conv(
-                (3, 5),
-                insize => 3;
-                init = identity_init(; shift = (0, 0, 2, 0)),
-                pad = SamePad(),
-            )
-            path3 = Conv(
-                (5, 7),
-                insize => 2;
-                init = identity_init(; shift = (0, 0, 5, 0)),
-                pad = SamePad(),
-            )
+            path2 = Conv((3, 5),
+                         insize => 3;
+                         init = identity_init(; shift = (0, 0, 2, 0)),
+                         pad = SamePad())
+            path3 = Conv((5, 7),
+                         insize => 2;
+                         init = identity_init(; shift = (0, 0, 5, 0)),
+                         pad = SamePad())
             block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3)
 
             indata = randn(Float32, 9, 9, 7, 2)
@@ -316,7 +315,7 @@ end
     @test f32(m).bias === m.bias === false
 
     @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *),
-        s in ((1,), (2, 3))
+                                                               s in ((1,), (2, 3))
 
         o = ones(s)
         z = zeros(s)
@@ -367,12 +366,10 @@ end
 end
 
 @testset "Batching" begin
-    stacked_array = [
-        8 9 3 5
-        9 6 6 9
-        9 1 7 2
-        7 4 10 6
-    ]
+    stacked_array = [8 9 3 5
+                     9 6 6 9
+                     9 1 7 2
+                     7 4 10 6]
     unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]]
     @test unbatch(stacked_array) == unstacked_array
     @test batch(unstacked_array) == stacked_array
@@ -382,8 +379,8 @@ end
     @test unbatch([1, 2, 3]) == [1, 2, 3]
 
     # generic iterable
-    @test batch(ones(2) for i = 1:3) == ones(2, 3)
-    @test unbatch(ones(2, 3)) == [ones(2) for i = 1:3]
+    @test batch(ones(2) for i in 1:3) == ones(2, 3)
+    @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3]
 end
 
 @testset "Param remapping" begin
@@ -392,8 +389,8 @@ end
     dm(bias) = Chain(dl(3, 5, bias), dl(5, 4, bias), dl(4, 3, bias))
 
     nobias(n) = false
-    testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in
-                                                     enumerate(zip(m, dm(bt)))
+    testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m,
+                                                                                    dm(bt)))
         @test l1.weight == l2.weight
         @test l1.bias == l2.bias
         @test_skip typeof(l1.bias) === typeof(l2.bias)
@@ -441,12 +438,10 @@ end
 
         # tests for BatchNorm and Dropout
         m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2))
-        m2 = Chain(
-            Conv((3, 3), 3 => 16),
-            BatchNorm(16),
-            x -> reshape(x, :, size(x)[end]),
-            Dropout(0.1),
-        )
+        m2 = Chain(Conv((3, 3), 3 => 16),
+                   BatchNorm(16),
+                   x -> reshape(x, :, size(x)[end]),
+                   Dropout(0.1))
         m2[2].μ .= rand(Float32, size(m2[2].μ)...)
         loadmodel!(m1, m2)
         # non-trainable parameters are copied as well
@@ -461,40 +456,38 @@ end
         # tests MaxPool
         # tests testmode!/trainmode! is not copied
         # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model
-        chain1 = Chain(
-            Dropout(0.2),
-            Conv((3, 3), 1 => 32, relu),
-            BatchNorm(32, relu),
-            MaxPool((2, 2)),
-            Dropout(0.2),
-            Conv((3, 3), 32 => 16, relu),
-            Dropout(0.2),
-            MaxPool((2, 2)),
-            Dropout(0.2),
-            Conv((3, 3), 16 => 10, relu),
-            Dropout(0.2),
-            x -> reshape(x, :, size(x, 4)),
-            Dropout(0.2),
-            Dense(90, 10),
-            softmax,
-        )
+        chain1 = Chain(Dropout(0.2),
+                       Conv((3, 3), 1 => 32, relu),
+                       BatchNorm(32, relu),
+                       MaxPool((2, 2)),
+                       Dropout(0.2),
+                       Conv((3, 3), 32 => 16, relu),
+                       Dropout(0.2),
+                       MaxPool((2, 2)),
+                       Dropout(0.2),
+                       Conv((3, 3), 16 => 10, relu),
+                       Dropout(0.2),
+                       x -> reshape(x, :, size(x, 4)),
+                       Dropout(0.2),
+                       Dense(90, 10),
+                       softmax)
         chain2 = Chain([
-            Dropout(0.1),
-            Conv((3, 3), 1 => 32, relu),
-            BatchNorm(32, relu),
-            MaxPool((3, 3)),
-            Dropout(0.1),
-            Conv((3, 3), 32 => 16, relu),
-            Dropout(0.1),
-            MaxPool((3, 3)),
-            Dropout(0.1),
-            Conv((3, 3), 16 => 10, relu),
-            Dropout(0.1),
-            x -> reshape(x, :, size(x, 4)),
-            Dropout(0.1),
-            Dense(90, 10),
-            softmax,
-        ])
+                           Dropout(0.1),
+                           Conv((3, 3), 1 => 32, relu),
+                           BatchNorm(32, relu),
+                           MaxPool((3, 3)),
+                           Dropout(0.1),
+                           Conv((3, 3), 32 => 16, relu),
+                           Dropout(0.1),
+                           MaxPool((3, 3)),
+                           Dropout(0.1),
+                           Conv((3, 3), 16 => 10, relu),
+                           Dropout(0.1),
+                           x -> reshape(x, :, size(x, 4)),
+                           Dropout(0.1),
+                           Dense(90, 10),
+                           softmax,
+                       ])
         chain2[3].μ .= 5.0f0
         chain2[3].σ² .= 2.0f0
         testmode!(chain2)
@@ -502,7 +495,7 @@ end
         for (dst, src) in zip(chain1, chain2)
             if dst isa Dropout
                 @test dst.p == 0.2
-            elseif dst isa Union{Conv,Dense}
+            elseif dst isa Union{Conv, Dense}
                 @test dst.weight == src.weight
                 @test dst.bias == src.bias
             elseif dst isa MaxPool
@@ -515,12 +508,12 @@ end
         end
 
         # copy only a subset of the model
-        chain1[end-1].weight .= 1.0f0
+        chain1[end - 1].weight .= 1.0f0
         chain1[3].μ .= 3.0f0
         chain1[2].bias .= 5.0f0
-        loadmodel!(chain2[end-1], chain1[end-1])
+        loadmodel!(chain2[end - 1], chain1[end - 1])
         loadmodel!(chain2[3], chain1[3])
-        @test chain2[end-1].weight == chain1[end-1].weight
+        @test chain2[end - 1].weight == chain1[end - 1].weight
         @test chain2[3].μ == chain1[3].μ
         @test chain2[2].bias != chain1[2].bias
 
@@ -631,18 +624,16 @@ end
     @test modules[5] === m2
     @test modules[6] === m3
 
-    mod_par = Flux.modules(
-        Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), Dense(2, 2, abs2)),
-    )
+    mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs),
+                                    Dense(2, 2, abs2)))
     @test length(mod_par) == 5
 
     mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4)))
     @test length(mod_rnn) == 6
     @test mod_rnn[end] isa Flux.LSTMCell
 
-    mod_skip = Flux.modules(
-        Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), +), LayerNorm(8)),
-    )
+    mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7),
+                                                 +), LayerNorm(8)))
     @test length(mod_skip) == 6
     @test mod_skip[end] isa Flux.Scale
 end
@@ -661,7 +652,7 @@ end
         end
 
         n_iter = 0
-        for i = 1:length(v)
+        for i in 1:length(v)
             trigger(i) && break
             n_iter += 1
         end
@@ -683,11 +674,9 @@ end
         end
 
         @testset "distance" begin
-            es = Flux.early_stopping(
-                identity,
-                10;
-                distance = (best_score, score) -> score - best_score,
-            )
+            es = Flux.early_stopping(identity,
+                                     10;
+                                     distance = (best_score, score) -> score - best_score)
 
             n_iter = 0
             while n_iter < 99
@@ -813,10 +802,8 @@ end
         n_outputs = [3, 7]
 
         data = rand(Float32, n_input, n_batch)
-        model = Chain(
-            Dense(n_input, n_shared),
-            Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])),
-        )
+        model = Chain(Dense(n_input, n_shared),
+                      Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])))
 
         pvec, re = Flux.destructure(model)
         loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx])  # loss wrt `idx`th output term
@@ -826,20 +813,16 @@ end
     end
 end
 
-@testset "Rrule" begin
-    @testset "issue 2033" begin
-        if CUDA.functional()
-            struct Wrapped{T}
-                x::T
-            end
-            y, _ = Flux.pullback(Wrapped, cu(randn(3, 3)))
-            @test y isa Wrapped{<:CuArray}
-        end
+@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional()
+    struct Wrapped{T}
+        x::T
     end
-end
+    y, _ = Flux.pullback(Wrapped, cu(randn(3, 3)))
+    @test y isa Wrapped{<:CuArray}
+end end end
 
 # make sure rng_from_array is non_differentiable
 @testset "rng_from_array" begin
-    m(x) = (rand(rng_from_array(x))*x)[1]
+    m(x) = (rand(rng_from_array(x)) * x)[1]
     gradient(m, ones(2))
 end