diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000000..07fcf66f1f --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,9 @@ +style = "sciml" +whitespace_in_kwargs = true +format_docstrings = true +always_for_in = true +join_lines_based_on_source = true +separate_kwargs_with_semicolon = true +always_use_return = true +margin = 92 +indent = 4 diff --git a/docs/make.jl b/docs/make.jl index ecfaaa256c..dcf4081aaf 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,61 +1,61 @@ using Documenter, - Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore + Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) makedocs(; - modules = [ - Flux, - NNlib, - Functors, - MLUtils, - BSON, - Optimisers, - OneHotArrays, - Zygote, - ChainRulesCore, - Base, - ], - doctest = false, - sitename = "Flux", - # strict = [:cross_references,], - pages = [ - "Home" => "index.md", - "Building Models" => [ - "Overview" => "models/overview.md", - "Basics" => "models/basics.md", - "Recurrence" => "models/recurrence.md", - "Layer Reference" => "models/layers.md", - "Loss Functions" => "models/losses.md", - "Regularisation" => "models/regularisation.md", - "Custom Layers" => "models/advanced.md", - "NNlib.jl" => "models/nnlib.md", - "Activation Functions" => "models/activation.md", - ], - "Handling Data" => - ["MLUtils.jl" => "data/mlutils.md", "OneHotArrays.jl" => "data/onehot.md"], - "Training Models" => [ - "Optimisers" => "training/optimisers.md", - "Training" => "training/training.md", - "Callback Helpers" => "training/callbacks.md", - "Zygote.jl" => "training/zygote.md", - ], - "GPU Support" => "gpu.md", - "Model Tools" => [ - "Saving & Loading" => "saving.md", - "Shape Inference" => "outputsize.md", - "Weight Initialisation" => "utilities.md", - "Functors.jl" => "models/functors.md", - ], - "Performance Tips" => "performance.md", - "Flux's Ecosystem" => "ecosystem.md", - ], - format = Documenter.HTML(; - sidebar_sitename = false, - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true", - ), -) + modules = [ + Flux, + NNlib, + Functors, + MLUtils, + BSON, + Optimisers, + OneHotArrays, + Zygote, + ChainRulesCore, + Base, + ], + doctest = false, + sitename = "Flux", + # strict = [:cross_references,], + pages = [ + "Home" => "index.md", + "Building Models" => [ + "Overview" => "models/overview.md", + "Basics" => "models/basics.md", + "Recurrence" => "models/recurrence.md", + "Layer Reference" => "models/layers.md", + "Loss Functions" => "models/losses.md", + "Regularisation" => "models/regularisation.md", + "Custom Layers" => "models/advanced.md", + "NNlib.jl" => "models/nnlib.md", + "Activation Functions" => "models/activation.md", + ], + "Handling Data" => [ + "MLUtils.jl" => "data/mlutils.md", + "OneHotArrays.jl" => "data/onehot.md", + ], + "Training Models" => [ + "Optimisers" => "training/optimisers.md", + "Training" => "training/training.md", + "Callback Helpers" => "training/callbacks.md", + "Zygote.jl" => "training/zygote.md", + ], + "GPU Support" => "gpu.md", + "Model Tools" => [ + "Saving & Loading" => "saving.md", + "Shape Inference" => "outputsize.md", + "Weight Initialisation" => "utilities.md", + "Functors.jl" => "models/functors.md", + ], + "Performance Tips" => "performance.md", + "Flux's Ecosystem" => "ecosystem.md", + ], + format = Documenter.HTML(; + sidebar_sitename = false, + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true")) deploydocs(; repo = "github.com/FluxML/Flux.jl.git", target = "build", push_preview = true) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index f719b01c99..d7897851a4 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -24,19 +24,19 @@ function run_benchmark(model, x; cuda = true) fw(model, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fw($model, $x)) teardown = (GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim()) println(" backward") bw(back) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(bw($back)) teardown = (GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim()) println(" forw and back") fwbw(model, ps, x) GC.gc() CUDA.reclaim() #warmup - @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown = (GC.gc(); CUDA.reclaim()) + @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim()) else println(" forward") fw(model, x) #warmup diff --git a/perf/recurrent.jl b/perf/recurrent.jl index bf4a2474da..9002e248d6 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -51,7 +51,7 @@ end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] rnn_benchmark_sweep(rnn_type) do n, ts - return [randn(Float32, n, n) for _ = 1:ts], "Vec" + return [randn(Float32, n, n) for _ in 1:ts], "Vec" end end diff --git a/perf/vgg.jl b/perf/vgg.jl index d86fdd6fe1..dad9d1aad1 100644 --- a/perf/vgg.jl +++ b/perf/vgg.jl @@ -6,45 +6,43 @@ using CUDA using Zygote: pullback function vgg16() - return Chain( - Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(64), - MaxPool((2, 2)), - Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(128), - MaxPool((2, 2)), - Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(256), - MaxPool((2, 2)), - Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), - BatchNorm(512), - MaxPool((2, 2)), - flatten, - Dense(512, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dropout(0.5), - Dense(4096, 10), - ) + return Chain(Conv((3, 3), 3 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + Conv((3, 3), 64 => 64, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + MaxPool((2, 2)), + Conv((3, 3), 64 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + Conv((3, 3), 128 => 128, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + MaxPool((2, 2)), + Conv((3, 3), 128 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + MaxPool((2, 2)), + Conv((3, 3), 256 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu; pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + flatten, + Dense(512, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dropout(0.5), + Dense(4096, 10)) end let model = vgg16(), x = rand(Float32, 32, 32, 3, 64) diff --git a/src/Flux.jl b/src/Flux.jl index d2e2783199..b4e56dadfb 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -15,75 +15,73 @@ export gradient # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) function Optimisers.base(dx::Zygote.Grads) - return error( - "Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`", - ) + return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") end export Chain, - Dense, - Embedding, - Maxout, - SkipConnection, - Parallel, - PairwiseFusion, - RNN, - LSTM, - GRU, - GRUv3, - SamePad, - Conv, - CrossCor, - ConvTranspose, - DepthwiseConv, - AdaptiveMaxPool, - AdaptiveMeanPool, - GlobalMaxPool, - GlobalMeanPool, - MaxPool, - MeanPool, - Dropout, - AlphaDropout, - LayerNorm, - BatchNorm, - InstanceNorm, - GroupNorm, - Upsample, - PixelShuffle, - fmap, - cpu, - gpu, - f32, - f64, - testmode!, - trainmode! + Dense, + Embedding, + Maxout, + SkipConnection, + Parallel, + PairwiseFusion, + RNN, + LSTM, + GRU, + GRUv3, + SamePad, + Conv, + CrossCor, + ConvTranspose, + DepthwiseConv, + AdaptiveMaxPool, + AdaptiveMeanPool, + GlobalMaxPool, + GlobalMeanPool, + MaxPool, + MeanPool, + Dropout, + AlphaDropout, + LayerNorm, + BatchNorm, + InstanceNorm, + GroupNorm, + Upsample, + PixelShuffle, + fmap, + cpu, + gpu, + f32, + f64, + testmode!, + trainmode! include("optimise/Optimise.jl") using .Optimise using .Optimise: @epochs using .Optimise: skip export Descent, - Adam, - Momentum, - Nesterov, - RMSProp, - AdaGrad, - AdaMax, - AdaDelta, - AMSGrad, - NAdam, - OAdam, - AdamW, - RAdam, - AdaBelief, - InvDecay, - ExpDecay, - WeightDecay, - ClipValue, - ClipNorm + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + OAdam, + AdamW, + RAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + ClipValue, + ClipNorm using CUDA -const use_cuda = Ref{Union{Nothing,Bool}}(nothing) +const use_cuda = Ref{Union{Nothing, Bool}}(nothing) using Adapt, Functors, OneHotArrays include("utils.jl") @@ -91,9 +89,7 @@ include("functor.jl") # Pirate error to catch a common mistake. function Functors.functor(::Type{<:MLUtils.DataLoader}, x) - return error( - "`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.", - ) + return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") end include("layers/stateless.jl") diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index 6ffa43e16a..40805f20c2 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,39 +1,31 @@ import NNlibCUDA: batchnorm, ∇batchnorm -function (BN::Flux.BatchNorm)( - x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, - cache = nothing, -) where {T<:Union{Float32,Float64}} +function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}}, + cache = nothing) where {T <: Union{Float32, Float64}} @assert BN.affine "BatchNorm: only affine=true supported on gpu" @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β) == size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" - return BN.λ.( - batchnorm( - BN.γ, - BN.β, - x, - BN.μ, - BN.σ², - BN.momentum; - cache = cache, - alpha = 1, - beta = 0, - eps = BN.ϵ, - training = Flux._isactive(BN), - ) - ) + @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" + return BN.λ.(batchnorm(BN.γ, + BN.β, + x, + BN.μ, + BN.σ², + BN.momentum; + cache = cache, + alpha = 1, + beta = 0, + eps = BN.ϵ, + training = Flux._isactive(BN))) end -function ChainRulesCore.rrule( - ::typeof(batchnorm), - g, - b, - x, - running_mean, - running_var, - momentum; - kw..., -) +function ChainRulesCore.rrule(::typeof(batchnorm), + g, + b, + x, + running_mean, + running_var, + momentum; + kw...) y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) function batchnorm_pullback(Δ) grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) diff --git a/src/deprecations.jl b/src/deprecations.jl index 6d29cb6fd1..cb5689e360 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,49 +1,35 @@ # v0.12 deprecations function ones(dims...) - Base.depwarn( - "Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", - :ones; - force = true, - ) + Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", + :ones; + force = true) return Base.ones(Float32, dims...) end ones(T::Type, dims...) = Base.ones(T, dims...) function zeros(dims...) - Base.depwarn( - "Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", - :zeros; - force = true, - ) + Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", + :zeros; + force = true) return Base.zeros(Float32, dims...) end zeros(T::Type, dims...) = Base.zeros(T, dims...) function ones32(::Type, dims...) - throw( - ArgumentError( - "Flux.ones32 is always Float32, use Base.ones to specify the element type", - ), - ) + throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) end function zeros32(::Type, dims...) - throw( - ArgumentError( - "Flux.zeros32 is always Float32, use Base.zeros to specify the element type", - ), - ) + throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) end # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 - Base.depwarn( - """Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. -Re-writing this as a comprehension would be better.""", - :broadcasted, - ) + Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. + Re-writing this as a comprehension would be better.""", + :broadcasted) return map(f, args...) # map isn't really safe either, but end @@ -51,44 +37,34 @@ end struct Zeros function Zeros() - Base.depwarn( - "Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", - :Zeros, - ) + Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", + :Zeros) return false end end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn( - "`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", - :update!, - ) + Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", + :update!) return x .-= x̄ end function Diagonal(size::Integer...; kw...) - Base.depwarn( - "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal, - ) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) return Scale(size...; kw...) end function Diagonal(size::Tuple; kw...) - Base.depwarn( - "Flux.Diagonal is now Flux.Scale, and also allows an activation function.", - :Diagonal, - ) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) return Scale(size...; kw...) end # Deprecate this eventually once saving models w/o structure is no more function loadparams!(m, xs) - Base.depwarn( - "loadparams! will be deprecated eventually. Use loadmodel! instead.", - :loadparams!, - ) + Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", + :loadparams!) for (p, x) in zip(params(m), xs) size(p) == size(x) || error("Expected param size $(size(p)), got $(size(x))") copyto!(p, x) diff --git a/src/functor.jl b/src/functor.jl index 4463aaced7..5f946fa069 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -104,9 +104,7 @@ else end adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) - return error( - "Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().", - ) + return error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") end # TODO: figure out the correct design for OneElement @@ -118,10 +116,8 @@ struct FluxCPUAdaptor end adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x -function adapt_storage( - to::FluxCPUAdaptor, - x::T, -) where {T<:CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} +function adapt_storage(to::FluxCPUAdaptor, + x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} return adapt(Array, x) end adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x @@ -133,13 +129,11 @@ function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray) return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx))) end -function ChainRulesCore.rrule( - ::typeof(Adapt.adapt_storage), - to::FluxCPUAdaptor, - x::CUDA.AbstractGPUArray, -) +function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), + to::FluxCPUAdaptor, + x::CUDA.AbstractGPUArray) return adapt_storage(to, x), - dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) + dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) end # CPU/GPU movement conveniences @@ -213,8 +207,7 @@ function check_use_cuda() end if !(use_cuda[]) @info """The GPU function is being called but the GPU is not accessible. - Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog = - 1 + Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 end end end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 647b237144..72e8208268 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -32,7 +32,7 @@ For large models, there is a special type-unstable path which can reduce compila times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. This feature is somewhat experimental, beware! """ -struct Chain{T<:Union{Tuple,NamedTuple,AbstractVector}} +struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}} layers::T end @@ -45,21 +45,21 @@ function Chain(; kw...) end @forward Chain.layers Base.getindex, -Base.length, -Base.first, -Base.last, -Base.iterate, -Base.lastindex, -Base.keys, -Base.firstindex + Base.length, + Base.first, + Base.last, + Base.iterate, + Base.lastindex, + Base.keys, + Base.firstindex @functor Chain (c::Chain)(x) = _applychain(c.layers, x) -@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} - symbols = vcat(:x, [gensym() for _ = 1:N]) - calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i = 1:N] +@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ in 1:N]) + calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N] return Expr(:block, calls...) end @@ -162,22 +162,20 @@ julia> Flux.params(d1) # no trainable bias Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]]) ``` """ -struct Dense{F,M<:AbstractMatrix,B} +struct Dense{F, M <: AbstractMatrix, B} weight::M bias::B σ::F - function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix,F} + function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F} b = _create_bias(W, bias, size(W, 1)) - return new{F,M,typeof(b)}(W, b, σ) + return new{F, M, typeof(b)}(W, b, σ) end end -function Dense( - (in, out)::Pair{<:Integer,<:Integer}, - σ = identity; - init = glorot_uniform, - bias = true, -) +function Dense((in, out)::Pair{<:Integer, <:Integer}, + σ = identity; + init = glorot_uniform, + bias = true) return Dense(init(out, in), bias, σ) end @@ -239,17 +237,16 @@ julia> Flux.params(b) Params([[1 2 3 4]]) ``` """ -struct Scale{F,A<:AbstractArray,B} +struct Scale{F, A <: AbstractArray, B} scale::A bias::B σ::F - function Scale( - scale::A, - bias::B = true, - σ::F = identity, - ) where {A<:AbstractArray,B<:Union{Bool,AbstractArray},F} + function Scale(scale::A, + bias::B = true, + σ::F = identity) where {A <: AbstractArray, + B <: Union{Bool, AbstractArray}, F} b = _create_bias(scale, bias, size(scale)...) - return new{F,A,typeof(b)}(scale, b, σ) + return new{F, A, typeof(b)}(scale, b, σ) end end @@ -257,7 +254,7 @@ function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = return Scale(init(s1, s23...), bias, _act) end function Scale(size_act...; bias = true, init = ones32) - return Scale(size_act[1:(end-1)]...; bias, init, _act = size_act[end]) + return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end]) end @functor Scale @@ -310,11 +307,11 @@ julia> Flux.outputsize(m3, (5, 11)) (7, 11) ``` """ -struct Maxout{T<:Tuple} +struct Maxout{T <: Tuple} layers::T end Maxout(layers...) = Maxout(layers) -Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ = 1:n_alts)...) +Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) @functor Maxout @@ -358,7 +355,7 @@ true See also [`Parallel`](@ref), [`Maxout`](@ref). """ -struct SkipConnection{T,F} +struct SkipConnection{T, F} layers::T connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b end @@ -421,28 +418,26 @@ julia> Flux.Bilinear(rand(4, 8, 16), false, tanh) # first dim of weight is the Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ -struct Bilinear{F,A,B} +struct Bilinear{F, A, B} weight::A bias::B σ::F - function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray,F} + function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F} ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) b = _create_bias(W, bias, size(W, 1)) - return new{F,A,typeof(b)}(W, b, σ) + return new{F, A, typeof(b)}(W, b, σ) end end @functor Bilinear -function Bilinear( - ((in1, in2), out)::Pair{<:Tuple,<:Integer}, - σ = identity; - bias = true, - init = glorot_uniform, -) +function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, + σ = identity; + bias = true, + init = glorot_uniform) return Bilinear(init(out, in1, in2), bias, σ) end -function Bilinear((in12, out)::Pair{<:Integer,<:Integer}, σ = identity; kw...) +function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) return Bilinear((in12, in12) => out, σ; kw...) end @@ -452,11 +447,8 @@ function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) d_z, d_x, d_y = size(W) d_x == size(x, 1) && d_y == size(y, 1) || throw(DimensionMismatch("number of rows in data must match W")) - size(x, 2) == size(y, 2) || throw( - DimensionMismatch( - "Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))", - ), - ) + size(x, 2) == size(y, 2) || + throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) @@ -473,21 +465,19 @@ end function (a::Bilinear)(x::AbstractVector, y::AbstractVector) return vec(a(reshape(x, :, 1), reshape(y, :, 1))) end -(a::Bilinear)(x::NTuple{2,AbstractArray}) = a(x[1], x[2]) +(a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) if size(l.weight, 2) == size(l.weight, 3) print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) else - print( - io, - "Bilinear((", - size(l.weight, 2), - ", ", - size(l.weight, 3), - ") => ", - size(l.weight, 1), - ) + print(io, + "Bilinear((", + size(l.weight, 2), + ", ", + size(l.weight, 3), + ") => ", + size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) l.bias === false && print(io, "; bias=false") @@ -537,7 +527,7 @@ julia> model2[:β] == model2[2] true ``` """ -struct Parallel{F,T<:Union{Tuple,NamedTuple}} +struct Parallel{F, T <: Union{Tuple, NamedTuple}} connection::F layers::T end @@ -546,11 +536,7 @@ Parallel(connection, layers...) = Parallel(connection, layers) function Parallel(connection; kw...) layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw( - ArgumentError( - "a Parallel layer cannot have a named sub-layer called `connection` or `layers`", - ), - ) + throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) end isempty(layers) && return Parallel(connection, ()) return Parallel(connection, layers) @@ -565,11 +551,7 @@ function _parallel_check(layers, xs) nl = length(layers) nx = length(xs) if (nl != nx) - throw( - ArgumentError( - "Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs", - ), - ) + throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) end end ChainRulesCore.@non_differentiable _parallel_check(nl, nx) @@ -581,7 +563,7 @@ end Base.getindex(m::Parallel, i) = m.layers[i] Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i]) -function Base.getindex(m::Parallel{<:Any,<:NamedTuple}, i::AbstractVector) +function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -639,7 +621,7 @@ end A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above). """ -struct PairwiseFusion{F,T<:Union{Tuple,NamedTuple}} +struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}} connection::F layers::T end @@ -648,11 +630,7 @@ PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers) function PairwiseFusion(connection; kw...) layers = NamedTuple(kw) if :layers in keys(layers) || :connection in keys(layers) - throw( - ArgumentError( - "a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`", - ), - ) + throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) end isempty(layers) && return PairwiseFusion(connection, ()) return PairwiseFusion(connection, layers) @@ -662,11 +640,7 @@ function _pairwise_check(x, layers, T) lx = length(x) N = length(layers) if T <: Tuple && lx != N - throw( - ArgumentError( - "PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs", - ), - ) + throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) end end ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T) @@ -677,24 +651,20 @@ function (m::PairwiseFusion)(x::T) where {T} end (m::PairwiseFusion)(xs...) = m(xs) -@generated function applypairwisefusion( - layers::Tuple{Vararg{<:Any,N}}, - connection, - x::T, -) where {N,T} - y_symbols = [gensym() for _ = 1:(N+1)] +@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}}, + connection, + x::T) where {N, T} + y_symbols = [gensym() for _ in 1:(N + 1)] getinput(i) = T <: Tuple ? :(x[$i]) : :x - calls = [:($(y_symbols[N+1]) = $(getinput(1)))] - for i = 1:(N-1) - push!( - calls, - quote - $(y_symbols[i]) = layers[$i]($(y_symbols[N+1])) - $(y_symbols[N+1]) = connection($(y_symbols[i]), $(getinput(i + 1))) - end, - ) + calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] + for i in 1:(N - 1) + push!(calls, + quote + $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) + $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) + end) end - push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N+1])))) + push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) return Expr(:block, calls...) end @@ -708,7 +678,7 @@ Base.getindex(m::PairwiseFusion, i) = m.layers[i] function Base.getindex(m::PairwiseFusion, i::AbstractVector) return PairwiseFusion(m.connection, m.layers[i]) end -function Base.getindex(m::PairwiseFusion{<:Any,<:NamedTuple}, i::AbstractVector) +function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) end @@ -757,18 +727,15 @@ end @functor Embedding -Embedding((in, out)::Pair{<:Integer,<:Integer}; init = randn32) = Embedding(init(out, in)) +Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(init(out, in)) (m::Embedding)(x::Integer) = m.weight[:, x] (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x) (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...) -function (m::Embedding)(x::Union{OneHotVector{T,L},OneHotMatrix{T,L}}) where {T,L} - size(m.weight, 2) == L || throw( - DimensionMismatch( - "Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L", - ), - ) +function (m::Embedding)(x::Union{OneHotVector{T, L}, OneHotMatrix{T, L}}) where {T, L} + size(m.weight, 2) == L || + throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) return m(onecold(x)) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index b620983dbc..91a7000249 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -1,7 +1,7 @@ using NNlib: conv, ∇conv_data, depthwiseconv, output_size # pad dims of x with dims of y until ndims(x) == ndims(y) -_paddims(x::Tuple, y::Tuple) = (x..., y[(end-(length(y)-length(x)-1)):end]...) +_paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]...) expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) @@ -48,10 +48,10 @@ julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 """ struct SamePad end -function calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N} +function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N} return expand(Val(2 * N), pad) end -function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} +function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T} #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 # Effective kernel size, including dilation @@ -127,13 +127,13 @@ julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size (42, 42, 7, 50) ``` """ -struct Conv{N,M,F,A,V} +struct Conv{N, M, F, A, V} σ::F weight::A bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} groups::Int end @@ -159,34 +159,30 @@ julia> Flux.params(layer) |> length 2 ``` """ -function Conv( - w::AbstractArray{T,N}, - b = true, - σ = identity; - stride = 1, - pad = 0, - dilation = 1, - groups = 1, -) where {T,N} - @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups." +function Conv(w::AbstractArray{T, N}, + b = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1) where {T, N} + @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups." stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(Conv, pad, size(w)[1:(N-2)], dilation, stride) + pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride) bias = _create_bias(w, b, size(w, N)) return Conv(σ, w, bias, stride, pad, dilation, groups) end -function Conv( - k::NTuple{N,Integer}, - ch::Pair{<:Integer,<:Integer}, - σ = identity; - init = glorot_uniform, - stride = 1, - pad = 0, - dilation = 1, - groups = 1, - bias = true, -) where {N} +function Conv(k::NTuple{N, Integer}, + ch::Pair{<:Integer, <:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true) where {N} weight = convfilter(k, ch; init, groups) return Conv(weight, bias, σ; stride, pad, dilation, groups) end @@ -202,29 +198,25 @@ distribution. This is internally used by the [`Conv`](@ref) layer. """ -function convfilter( - filter::NTuple{N,Integer}, - ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform, - groups = 1, -) where {N} +function convfilter(filter::NTuple{N, Integer}, + ch::Pair{<:Integer, <:Integer}; + init = glorot_uniform, + groups = 1) where {N} cin, cout = ch - @assert cin % groups == 0 "Input channel dimension must be divisible by groups." - @assert cout % groups == 0 "Output channel dimension must be divisible by groups." + @assert cin % groups==0 "Input channel dimension must be divisible by groups." + @assert cout % groups==0 "Output channel dimension must be divisible by groups." return init(filter..., cin ÷ groups, cout) end @functor Conv function conv_dims(c::Conv, x::AbstractArray) - return DenseConvDims( - x, - c.weight; - stride = c.stride, - padding = c.pad, - dilation = c.dilation, - groups = c.groups, - ) + return DenseConvDims(x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups) end ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any) @@ -239,7 +231,7 @@ _channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups _channels_out(l::Conv) = size(l.weight, ndims(l.weight)) function Base.show(io::IO, l::Conv) - print(io, "Conv(", size(l.weight)[1:(ndims(l.weight)-2)]) + print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") @@ -288,18 +280,18 @@ julia> ConvTranspose((5, 5), 3 => 7; stride = 3, pad = SamePad())(xs) |> size (300, 300, 7, 50) ``` """ -struct ConvTranspose{N,M,F,A,V} +struct ConvTranspose{N, M, F, A, V} σ::F weight::A bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} groups::Int end _channels_in(l::ConvTranspose) = size(l.weight)[end] -_channels_out(l::ConvTranspose) = size(l.weight)[end-1] * l.groups +_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups """ ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups]) @@ -325,33 +317,29 @@ julia> Flux.params(layer) |> length 2 ``` """ -function ConvTranspose( - w::AbstractArray{T,N}, - bias = true, - σ = identity; - stride = 1, - pad = 0, - dilation = 1, - groups = 1, -) where {T,N} +function ConvTranspose(w::AbstractArray{T, N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + groups = 1) where {T, N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(ConvTranspose, pad, size(w)[1:(N-2)], dilation, stride) + pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride) b = _create_bias(w, bias, size(w, N - 1) * groups) return ConvTranspose(σ, w, b, stride, pad, dilation, groups) end -function ConvTranspose( - k::NTuple{N,Integer}, - ch::Pair{<:Integer,<:Integer}, - σ = identity; - init = glorot_uniform, - stride = 1, - pad = 0, - dilation = 1, - groups = 1, - bias = true, -) where {N} +function ConvTranspose(k::NTuple{N, Integer}, + ch::Pair{<:Integer, <:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + groups = 1, + bias = true) where {N} weight = convfilter(k, reverse(ch); init, groups) return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @@ -361,21 +349,18 @@ end function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) # Calculate size of "input", from ∇conv_data()'s perspective... combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) - I = - (size(x)[1:(end-2)] .- 1) .* c.stride .+ 1 .+ - (size(c.weight)[1:(end-2)] .- 1) .* c.dilation .- combined_pad - C_in = size(c.weight)[end-1] * c.groups + I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+ + (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad + C_in = size(c.weight)[end - 1] * c.groups batch_size = size(x)[end] # Create DenseConvDims() that looks like the corresponding conv() w_size = size(c.weight) - return DenseConvDims( - (I..., C_in, batch_size), - w_size; - stride = c.stride, - padding = c.pad, - dilation = c.dilation, - groups = c.groups, - ) + return DenseConvDims((I..., C_in, batch_size), + w_size; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups) end ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any) @@ -387,19 +372,17 @@ function (c::ConvTranspose)(x::AbstractArray) end function Base.show(io::IO, l::ConvTranspose) - print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight)-2)]) + print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)]) print(io, ", ", _channels_in(l), " => ", _channels_out(l)) _print_conv_opt(io, l) return print(io, ")") end -function calc_padding( - ::Type{ConvTranspose}, - pad::SamePad, - k::NTuple{N,T}, - dilation, - stride, -) where {N,T} +function calc_padding(::Type{ConvTranspose}, + pad::SamePad, + k::NTuple{N, T}, + dilation, + stride) where {N, T} return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) end @@ -427,29 +410,25 @@ julia> DepthwiseConv((5, 5), 3 => 9; stride = 2, pad = 2)(xs) |> size (50, 50, 9, 50) ``` """ -function DepthwiseConv( - k::NTuple{<:Any,Integer}, - ch::Pair{<:Integer,<:Integer}, - σ = identity; - stride = 1, - pad = 0, - dilation = 1, - bias = true, - init = glorot_uniform, -) +function DepthwiseConv(k::NTuple{<:Any, Integer}, + ch::Pair{<:Integer, <:Integer}, + σ = identity; + stride = 1, + pad = 0, + dilation = 1, + bias = true, + init = glorot_uniform) return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init) end -function DepthwiseConv( - w::AbstractArray{T,N}, - bias = true, - σ = identity; - stride = 1, - pad = 0, - dilation = 1, -) where {T,N} - w2 = reshape(w, size(w)[1:(end-2)]..., 1, :) - return Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation) +function DepthwiseConv(w::AbstractArray{T, N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1) where {T, N} + w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :) + return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation) end """ @@ -479,13 +458,13 @@ julia> CrossCor((5, 5), 3 => 7; stride = 3, pad = (2, 0))(xs) |> size (34, 32, 7, 50) ``` """ -struct CrossCor{N,M,F,A,V} +struct CrossCor{N, M, F, A, V} σ::F weight::A bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} end """ @@ -509,31 +488,27 @@ julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) ``` """ -function CrossCor( - w::AbstractArray{T,N}, - bias = true, - σ = identity; - stride = 1, - pad = 0, - dilation = 1, -) where {T,N} +function CrossCor(w::AbstractArray{T, N}, + bias = true, + σ = identity; + stride = 1, + pad = 0, + dilation = 1) where {T, N} stride = expand(Val(N - 2), stride) dilation = expand(Val(N - 2), dilation) - pad = calc_padding(CrossCor, pad, size(w)[1:(N-2)], dilation, stride) + pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride) b = _create_bias(w, bias, size(w, N)) return CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor( - k::NTuple{N,Integer}, - ch::Pair{<:Integer,<:Integer}, - σ = identity; - init = glorot_uniform, - stride = 1, - pad = 0, - dilation = 1, - bias = true, -) where {N} +function CrossCor(k::NTuple{N, Integer}, + ch::Pair{<:Integer, <:Integer}, + σ = identity; + init = glorot_uniform, + stride = 1, + pad = 0, + dilation = 1, + bias = true) where {N} weight = convfilter(k, ch; init = init) return CrossCor(weight, bias, σ; stride, pad, dilation) end @@ -546,13 +521,11 @@ function crosscor(x, w, ddims::DenseConvDims) end function crosscor_dims(c::CrossCor, x::AbstractArray) - return DenseConvDims( - x, - c.weight; - stride = c.stride, - padding = c.pad, - dilation = c.dilation, - ) + return DenseConvDims(x, + c.weight; + stride = c.stride, + padding = c.pad, + dilation = c.dilation) end ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any) @@ -564,14 +537,12 @@ function (c::CrossCor)(x::AbstractArray) end function Base.show(io::IO, l::CrossCor) - print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight)-2)]) - print( - io, - ", ", - size(l.weight, ndims(l.weight) - 1), - " => ", - size(l.weight, ndims(l.weight)), - ) + print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, + ", ", + size(l.weight, ndims(l.weight) - 1), + " => ", + size(l.weight, ndims(l.weight))) _print_conv_opt(io, l) return print(io, ")") end @@ -599,13 +570,13 @@ julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) true ``` """ -struct AdaptiveMaxPool{S,O} - out::NTuple{O,Int} - AdaptiveMaxPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) +struct AdaptiveMaxPool{S, O} + out::NTuple{O, Int} + AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end -function (a::AdaptiveMaxPool{S})(x::AbstractArray{T,S}) where {S,T} - insize = size(x)[1:(end-2)] +function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T} + insize = size(x)[1:(end - 2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -641,13 +612,13 @@ julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) true ``` """ -struct AdaptiveMeanPool{S,O} - out::NTuple{O,Int} - AdaptiveMeanPool(out::NTuple{O,Int}) where {O} = new{O + 2,O}(out) +struct AdaptiveMeanPool{S, O} + out::NTuple{O, Int} + AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end -function (a::AdaptiveMeanPool{S})(x::AbstractArray{T,S}) where {S,T} - insize = size(x)[1:(end-2)] +function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T} + insize = size(x)[1:(end - 2)] outsize = a.out stride = insize .÷ outsize k = insize .- (outsize .- 1) .* stride @@ -688,7 +659,7 @@ function (g::GlobalMaxPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end-2)] + k = x_size[1:(end - 2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -722,7 +693,7 @@ function (g::GlobalMeanPool)(x) # Input size x_size = size(x) # Kernel size - k = x_size[1:(end-2)] + k = x_size[1:(end - 2)] # Pooling dimensions pdims = PoolDims(x, k) @@ -772,13 +743,13 @@ julia> layer(rand(Float32, 100, 7, 50)) |> size (34, 7, 50) ``` """ -struct MaxPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MaxPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} +function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MaxPool, pad, k, 1, stride) return MaxPool(k, pad, stride) @@ -831,13 +802,13 @@ julia> m(xs) |> size (20, 20, 7, 50) ``` """ -struct MeanPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MeanPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where {N} +function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} stride = expand(Val(N), stride) pad = calc_padding(MeanPool, pad, k, 1, stride) return MeanPool(k, pad, stride) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 437d709463..3af09ce237 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -38,11 +38,7 @@ dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function dropout_mask(rng, x::CuArray, p; kwargs...) - throw( - ArgumentError( - "x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.", - ), - ) + throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) end dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) function _dropout_mask(rng, x, p; dims = :) @@ -96,10 +92,10 @@ julia> isapprox(count(==(0), y) / length(y), 0.5; atol = 0.1) true ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} +mutable struct Dropout{F, D, R <: AbstractRNG} p::F dims::D - active::Union{Bool,Nothing} + active::Union{Bool, Nothing} rng::R end Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) @@ -154,13 +150,13 @@ julia> isapprox(std(x), std(y); atol = 0.2) true ``` """ -mutable struct AlphaDropout{F,R<:AbstractRNG} +mutable struct AlphaDropout{F, R <: AbstractRNG} p::F - active::Union{Bool,Nothing} + active::Union{Bool, Nothing} rng::R function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - return new{typeof(p),typeof(rng)}(p, active, rng) + return new{typeof(p), typeof(rng)}(p, active, rng) end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) @@ -220,25 +216,23 @@ julia> isapprox(std(y; dims = 1:3), ones(1, 1, 1, 2); atol = 0.1) && true ``` """ -struct LayerNorm{F,D,T,N} +struct LayerNorm{F, D, T, N} λ::F diag::D ϵ::T - size::NTuple{N,Int} + size::NTuple{N, Int} affine::Bool end -function LayerNorm( - size::Tuple{Vararg{Int}}, - λ = identity; - affine::Bool = true, - ϵ::Real = 1.0f-5, -) +function LayerNorm(size::Tuple{Vararg{Int}}, + λ = identity; + affine::Bool = true, + ϵ::Real = 1.0f-5) diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity return LayerNorm(λ, diag, ϵ, size, affine) end LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...) -LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end-1)]), size_act[end]; kw...) +LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...) @functor LayerNorm @@ -255,12 +249,10 @@ end # Compute the statistics on the slices specified by reduce_dims. # reduce_dims=[1,...,N-2,N] for BatchNorm # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm -function _norm_layer_forward( - l, - x::AbstractArray{T,N}; - reduce_dims, - affine_shape, -) where {T,N} +function _norm_layer_forward(l, + x::AbstractArray{T, N}; + reduce_dims, + affine_shape) where {T, N} if !_isactive(l) && l.track_stats # testmode with tracked stats stats_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) μ = reshape(l.μ, stats_shape) @@ -283,7 +275,7 @@ end @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ) -function _track_stats!(bn, x::AbstractArray{T,N}, μ, σ², reduce_dims) where {T,N} +function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N} V = eltype(bn.σ²) mtm = bn.momentum res_mtm = one(V) - mtm @@ -340,7 +332,7 @@ julia> isapprox(std(m(xs)), 1; atol = 0.1) && std(xs) != std(m(xs)) true ``` """ -mutable struct BatchNorm{F,V,N,W} +mutable struct BatchNorm{F, V, N, W} λ::F # activation function β::V # bias γ::V # scale @@ -350,20 +342,18 @@ mutable struct BatchNorm{F,V,N,W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool,Nothing} + active::Union{Bool, Nothing} chs::Int # number of channels end -function BatchNorm( - chs::Int, - λ = identity; - initβ = zeros32, - initγ = ones32, - affine = true, - track_stats = true, - ϵ = 1.0f-5, - momentum = 0.1f0, -) +function BatchNorm(chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = true, + ϵ = 1.0f-5, + momentum = 0.1f0) β = affine ? initβ(chs) : nothing γ = affine ? initγ(chs) : nothing μ = track_stats ? zeros32(chs) : nothing @@ -378,7 +368,7 @@ trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;) function (BN::BatchNorm)(x) @assert size(x, ndims(x) - 1) == BN.chs N = ndims(x) - reduce_dims = [1:(N-2); N] + reduce_dims = [1:(N - 2); N] affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) return _norm_layer_forward(BN, x; reduce_dims, affine_shape) end @@ -434,7 +424,7 @@ julia> isapprox(std(y; dims = 1:2), ones(1, 1, 3, 2); atol = 0.2) && true ``` """ -mutable struct InstanceNorm{F,V,N,W} +mutable struct InstanceNorm{F, V, N, W} λ::F # activation function β::V # bias γ::V # scale @@ -444,25 +434,21 @@ mutable struct InstanceNorm{F,V,N,W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool,Nothing} + active::Union{Bool, Nothing} chs::Int # number of channels end -function InstanceNorm( - chs::Int, - λ = identity; - initβ = zeros32, - initγ = ones32, - affine = false, - track_stats = false, - ϵ = 1.0f-5, - momentum = 0.1f0, -) +function InstanceNorm(chs::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = false, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0) if track_stats - Base.depwarn( - "`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :InstanceNorm, - ) + Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :InstanceNorm) end β = affine ? initβ(chs) : nothing @@ -480,7 +466,7 @@ function (l::InstanceNorm)(x) @assert ndims(x) > 2 @assert size(x, ndims(x) - 1) == l.chs N = ndims(x) - reduce_dims = 1:(N-2) + reduce_dims = 1:(N - 2) affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) return _norm_layer_forward(l, x; reduce_dims, affine_shape) end @@ -542,7 +528,7 @@ true ``` # number of groups ``` """ -mutable struct GroupNorm{F,V,N,W} +mutable struct GroupNorm{F, V, N, W} G::Int # number of groups λ::F # activation function β::V # bias @@ -553,29 +539,25 @@ mutable struct GroupNorm{F,V,N,W} momentum::N affine::Bool track_stats::Bool - active::Union{Bool,Nothing} + active::Union{Bool, Nothing} chs::Int # number of channels end @functor GroupNorm trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) -function GroupNorm( - chs::Int, - G::Int, - λ = identity; - initβ = zeros32, - initγ = ones32, - affine = true, - track_stats = false, - ϵ = 1.0f-5, - momentum = 0.1f0, -) +function GroupNorm(chs::Int, + G::Int, + λ = identity; + initβ = zeros32, + initγ = ones32, + affine = true, + track_stats = false, + ϵ = 1.0f-5, + momentum = 0.1f0) if track_stats - Base.depwarn( - "`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", - :GroupNorm, - ) + Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :GroupNorm) end chs % G == 0 || @@ -594,9 +576,9 @@ function (gn::GroupNorm)(x) @assert size(x, ndims(x) - 1) == gn.chs N = ndims(x) sz = size(x) - x = reshape(x, sz[1:(N-2)]..., sz[N-1] ÷ gn.G, gn.G, sz[N]) + x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N]) N = ndims(x) - reduce_dims = 1:(N-2) + reduce_dims = 1:(N - 2) affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N) x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) return reshape(x, sz) @@ -622,4 +604,4 @@ scale parameters, `false` otherwise. See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref). """ -hasaffine(l::Union{BatchNorm,InstanceNorm,LayerNorm,GroupNorm}) = l.affine +hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 5fdf1e7d00..bec0c539bb 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -19,13 +19,13 @@ function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c) end # Type stable and AD-friendly helper for iterating over the last dimension of an array -function eachlastdim(A::AbstractArray{T,N}) where {T,N} +function eachlastdim(A::AbstractArray{T, N}) where {T, N} inds_before = ntuple(_ -> :, N - 1) return (view(A, inds_before..., i) for i in axes(A, N)) end # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77 -function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N} +function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} dys = unthunk(dys_raw) i1 = findfirst(dy -> dy isa AbstractArray, dys) if isnothing(i1) # all slices are Zero! @@ -44,7 +44,7 @@ function ∇eachlastdim(dys_raw, x::AbstractArray{T,N}) where {T,N} return ProjectTo(x)(dx) end -function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N} +function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N} lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) return collect(eachlastdim(x)), lastdims end @@ -126,7 +126,7 @@ julia> rnn.state 60 ``` """ -mutable struct Recur{T,S} +mutable struct Recur{T, S} cell::T state::S end @@ -183,7 +183,7 @@ reset!(m) = foreach(reset!, functor(m)[1]) flip(f, xs) = reverse([f(x) for x in reverse(xs)]) -function (m::Recur)(x::AbstractArray{T,3}) where {T} +function (m::Recur)(x::AbstractArray{T, 3}) where {T} h = [m(x_t) for x_t in eachlastdim(x)] sze = size(h[1]) return reshape(reduce(hcat, h), sze[1], sze[2], length(h)) @@ -191,7 +191,7 @@ end # Vanilla RNN -struct RNNCell{F,I,H,V,S} +struct RNNCell{F, I, H, V, S} σ::F Wi::I Wh::H @@ -199,20 +199,19 @@ struct RNNCell{F,I,H,V,S} state0::S end -function RNNCell( - (in, out)::Pair, - σ = tanh; - init = Flux.glorot_uniform, - initb = zeros32, - init_state = zeros32, -) +function RNNCell((in, out)::Pair, + σ = tanh; + init = Flux.glorot_uniform, + initb = zeros32, + init_state = zeros32) return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1)) end -function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})( - h, - x::Union{AbstractVecOrMat{T},OneHotArray}, -) where {F,I,H,V,T} +function (m::RNNCell{F, I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {F, I, + H, V, T + } Wi, Wh, b = m.Wi, m.Wh, m.b σ = NNlib.fast_act(m.σ, x) h = σ.(Wi * x .+ Wh * h .+ b) @@ -286,49 +285,52 @@ julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 ``` # Note: - `RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. - ```julia - julia> using LinearAlgebra +`RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. - julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) +```julia +julia> using LinearAlgebra - julia> r(rand(4, 10)) |> size # batch size of 10 - (5, 10) - ``` +julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) + +julia> r(rand(4, 10)) |> size # batch size of 10 +(5, 10) +``` """ RNN(a...; ka...) = Recur(RNNCell(a...; ka...)) Recur(m::RNNCell) = Recur(m, m.state0) # LSTM -struct LSTMCell{I,H,V,S} +struct LSTMCell{I, H, V, S} Wi::I Wh::H b::V state0::S end -function LSTMCell( - (in, out)::Pair; - init = glorot_uniform, - initb = zeros32, - init_state = zeros32, -) - cell = LSTMCell( - init(out * 4, in), - init(out * 4, out), - initb(out * 4), - (init_state(out, 1), init_state(out, 1)), - ) +function LSTMCell((in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32) + cell = LSTMCell(init(out * 4, in), + init(out * 4, out), + initb(out * 4), + (init_state(out, 1), init_state(out, 1))) cell.b[gate(out, 2)] .= 1 return cell end -function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})( - (h, c), - x::Union{AbstractVecOrMat{T},OneHotArray}, -) where {I,H,V,T} +function (m::LSTMCell{I, H, V, <:NTuple{2, AbstractMatrix{T}}})((h, c), + x::Union{ + AbstractVecOrMat{T + }, + OneHotArray}) where { + I, + H, + V, + T + } b, o = m.b, size(h, 1) g = muladd(m.Wi, x, muladd(m.Wh, h, b)) input, forget, cell, output = multigate(g, o, Val(4)) @@ -379,7 +381,8 @@ julia> l(rand(Float32, 3, 10)) |> size # batch size of 10 Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...)) Recur(m::LSTMCell) = Recur(m, m.state0) @@ -392,34 +395,30 @@ function _gru_output(gxs, ghs, bs) return r, z end -struct GRUCell{I,H,V,S} +struct GRUCell{I, H, V, S} Wi::I Wh::H b::V state0::S end -function GRUCell( - (in, out)::Pair; - init = glorot_uniform, - initb = zeros32, - init_state = zeros32, -) - return GRUCell( - init(out * 3, in), - init(out * 3, out), - initb(out * 3), - init_state(out, 1), - ) +function GRUCell((in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32) + return GRUCell(init(out * 3, in), + init(out * 3, out), + initb(out * 3), + init_state(out, 1)) end -function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})( - h, - x::Union{AbstractVecOrMat{T},OneHotArray}, -) where {I,H,V,T} +function (m::GRUCell{I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, H, V, T + } Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) - gxs, ghs, bs = - multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), multigate(b, o, Val(3)) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), + multigate(b, o, Val(3)) r, z = _gru_output(gxs, ghs, bs) h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) h′ = @. (1 - z) * h̃ + z * h @@ -469,14 +468,15 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRU(a...; ka...) = Recur(GRUCell(a...; ka...)) Recur(m::GRUCell) = Recur(m, m.state0) # GRU v3 -struct GRUv3Cell{I,H,V,HH,S} +struct GRUv3Cell{I, H, V, HH, S} Wi::I Wh::H b::V @@ -484,28 +484,27 @@ struct GRUv3Cell{I,H,V,HH,S} state0::S end -function GRUv3Cell( - (in, out)::Pair; - init = glorot_uniform, - initb = zeros32, - init_state = zeros32, -) - return GRUv3Cell( - init(out * 3, in), - init(out * 2, out), - initb(out * 3), - init(out, out), - init_state(out, 1), - ) +function GRUv3Cell((in, out)::Pair; + init = glorot_uniform, + initb = zeros32, + init_state = zeros32) + return GRUv3Cell(init(out * 3, in), + init(out * 2, out), + initb(out * 3), + init(out, out), + init_state(out, 1)) end -function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})( - h, - x::Union{AbstractVecOrMat{T},OneHotArray}, -) where {I,H,V,HH,T} +function (m::GRUv3Cell{I, H, V, HH, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, + H, + V, + HH, + T} Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) - gxs, ghs, bs = - multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), multigate(b, o, Val(3)) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), + multigate(b, o, Val(3)) r, z = _gru_output(gxs, ghs, bs) h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) h′ = @. (1 - z) * h̃ + z * h @@ -555,7 +554,8 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRUv3(a...; ka...) = Recur(GRUv3Cell(a...; ka...)) Recur(m::GRUv3Cell) = Recur(m, m.state0) diff --git a/src/layers/show.jl b/src/layers/show.jl index b2e69d0b75..e21b0ae445 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -30,8 +30,8 @@ function _big_show(io::IO, obj, indent::Int = 0, name = nothing) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) end - elseif obj isa Parallel{<:Any,<:NamedTuple} || - obj isa PairwiseFusion{<:Any,<:NamedTuple} + elseif obj isa Parallel{<:Any, <:NamedTuple} || + obj isa PairwiseFusion{<:Any, <:NamedTuple} _big_show(io, obj.connection, indent + 2) for k in Base.keys(obj) _big_show(io, obj[k], indent + 2, k) @@ -90,22 +90,18 @@ function _layer_show(io::IO, layer, indent::Int = 0, name = nothing) print(io, " "^indent, str, indent == 0 ? "" : ",") if !isempty(params(layer)) print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str))) - printstyled( - io, - "# ", - underscorise(sum(length, params(layer))), - " parameters"; - color = :light_black, - ) + printstyled(io, + "# ", + underscorise(sum(length, params(layer))), + " parameters"; + color = :light_black) nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) if nonparam > 0 - printstyled( - io, - ", plus ", - underscorise(nonparam), - indent == 0 ? " non-trainable" : ""; - color = :light_black, - ) + printstyled(io, + ", plus ", + underscorise(nonparam), + indent == 0 ? " non-trainable" : ""; + color = :light_black) end _nan_show(io, params(layer)) end @@ -120,35 +116,29 @@ function _big_finale(io::IO, m) noncnt = _childarray_sum(_ -> 1, m) - length(ps) if noncnt > 0 nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled( - io, - " "^08, - "# Total: ", - length(ps), - " trainable arrays, "; - color = :light_black, - ) + printstyled(io, + " "^08, + "# Total: ", + length(ps), + " trainable arrays, "; + color = :light_black) println(io, pars, " parameters,") - printstyled( - io, - " "^10, - "# plus ", - noncnt, - " non-trainable, ", - nonparam, - " parameters, summarysize "; - color = :light_black, - ) + printstyled(io, + " "^10, + "# plus ", + noncnt, + " non-trainable, ", + nonparam, + " parameters, summarysize "; + color = :light_black) print(io, bytes, ".") else - printstyled( - io, - " "^18, - "# Total: ", - length(ps), - " arrays, "; - color = :light_black, - ) + printstyled(io, + " "^18, + "# Total: ", + length(ps), + " arrays, "; + color = :light_black) print(io, pars, " parameters, ", bytes, ".") end end diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index d67190a49b..dad2a512bb 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -31,7 +31,7 @@ julia> m(ones(2, 2, 1, 1)) |> size (4, 5, 1, 1) ``` """ -struct Upsample{mode,S,T} +struct Upsample{mode, S, T} scale::S size::T end @@ -42,26 +42,26 @@ function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) if !(isnothing(scale) ⊻ isnothing(size)) throw(ArgumentError("Either scale or size should be specified (but not both).")) end - return Upsample{mode,typeof(scale),typeof(size)}(scale, size) + return Upsample{mode, typeof(scale), typeof(size)}(scale, size) end Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale) (m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale) -function (m::Upsample{:nearest,Int})(x::AbstractArray{T,N}) where {T,N} +function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2)) end -function (m::Upsample{:nearest,Nothing})(x::AbstractArray) +function (m::Upsample{:nearest, Nothing})(x::AbstractArray) return NNlib.upsample_nearest(x; size = m.size) end (m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale) -function (m::Upsample{:bilinear,Nothing})(x::AbstractArray) +function (m::Upsample{:bilinear, Nothing})(x::AbstractArray) return NNlib.upsample_bilinear(x; size = m.size) end (m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale) -function (m::Upsample{:trilinear,Nothing})(x::AbstractArray) +function (m::Upsample{:trilinear, Nothing})(x::AbstractArray) return NNlib.upsample_trilinear(x; size = m.size) end diff --git a/src/loading.jl b/src/loading.jl index 0dd73a0d59..35e3868189 100644 --- a/src/loading.jl +++ b/src/loading.jl @@ -23,19 +23,16 @@ function loadleaf!(dst::AbstractArray, src::AbstractArray, err) end function _tie_check(dst::Bool, src::AbstractArray) - return iszero(dst) || error( - "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", - ) + return iszero(dst) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") end function _tie_check(dst::AbstractArray, src::Bool) - return (iszero(dst) && iszero(src)) || error( - "Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.", - ) + return (iszero(dst) && iszero(src)) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") end function _tie_check(dst::AbstractArray, src::AbstractArray) - return (dst == src) || error( - "Encountered tied destination parameters with untied and mismatched sources.", - ) + return (dst == src) || + error("Encountered tied destination parameters with untied and mismatched sources.") end _tie_check(dst, src) = true @@ -100,13 +97,10 @@ but copying a `src` value of `true` will error. function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet()) ldsts = _filter_children(filter, functor(dst)[1]) lsrcs = _filter_children(filter, functor(src)[1]) - (keys(ldsts) == keys(lsrcs)) || throw( - ArgumentError("Tried to load $src into $dst but the structures do not match."), - ) + (keys(ldsts) == keys(lsrcs)) || + throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) - err = DimensionMismatch( - "Tried to load $src into $dst but the parameter sizes do not match.", - ) + err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") foreach(ldsts, lsrcs) do ldst, lsrc if ldst in cache # we already loaded this parameter before _tie_check(ldst, lsrc) && return ldst diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index a35f93af03..a6126bf4e5 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -10,23 +10,23 @@ using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss import Base.Broadcast: broadcasted export mse, - mae, - msle, - label_smoothing, - crossentropy, - logitcrossentropy, - binarycrossentropy, - logitbinarycrossentropy, - kldivergence, - huber_loss, - tversky_loss, - dice_coeff_loss, - poisson_loss, - hinge_loss, - squared_hinge_loss, - binary_focal_loss, - focal_loss, - siamese_contrastive_loss + mae, + msle, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy, + kldivergence, + huber_loss, + tversky_loss, + dice_coeff_loss, + poisson_loss, + hinge_loss, + squared_hinge_loss, + binary_focal_loss, + focal_loss, + siamese_contrastive_loss include("utils.jl") include("functions.jl") diff --git a/src/losses/functions.jl b/src/losses/functions.jl index 674fe3065c..65b6b2fe60 100644 --- a/src/losses/functions.jl +++ b/src/losses/functions.jl @@ -157,7 +157,7 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed) true ``` """ -function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1) +function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1) if !(0 < α < 1) throw(ArgumentError("α must be between 0 and 1")) end @@ -320,7 +320,7 @@ julia> Flux.crossentropy(y_prob, y_hot) """ function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) - return agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ))) + return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ))) end """ @@ -351,7 +351,7 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin) """ function logitbinarycrossentropy(ŷ, y; agg = mean) _check_sizes(ŷ, y) - return agg(@.((1 - y) * ŷ - logσ(ŷ))) + return agg(@.((1 - y) * ŷ-logσ(ŷ))) end """ diff --git a/src/losses/utils.jl b/src/losses/utils.jl index 43aab12a05..cda3e4a557 100644 --- a/src/losses/utils.jl +++ b/src/losses/utils.jl @@ -21,19 +21,17 @@ end @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric) res = xlogy.(x, y) return res, - Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y)) + Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), + Zygote.unbroadcast(y, Δ .* x ./ y)) end ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y) # should help Diffractor's broadcasting -ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true) +ChainRulesCore.@scalar_rule xlogx(x) (log(y)+true) function _check_sizes(ŷ::AbstractArray, y::AbstractArray) - for d = 1:max(ndims(ŷ), ndims(y)) - size(ŷ, d) == size(y, d) || throw( - DimensionMismatch( - "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))", - ), - ) + for d in 1:max(ndims(ŷ), ndims(y)) + size(ŷ, d) == size(y, d) || + throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))")) end end _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1 diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index 5bc95d0ab2..be4e556092 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -4,29 +4,29 @@ using LinearAlgebra import ArrayInterface export train!, - update!, - Descent, - Adam, - Momentum, - Nesterov, - RMSProp, - AdaGrad, - AdaMax, - AdaDelta, - AMSGrad, - NAdam, - AdamW, - RAdam, - OAdam, - AdaBelief, - InvDecay, - ExpDecay, - WeightDecay, - stop, - skip, - Optimiser, - ClipValue, - ClipNorm + update!, + Descent, + Adam, + Momentum, + Nesterov, + RMSProp, + AdaGrad, + AdaMax, + AdaDelta, + AMSGrad, + NAdam, + AdamW, + RAdam, + OAdam, + AdaBelief, + InvDecay, + ExpDecay, + WeightDecay, + stop, + skip, + Optimiser, + ClipValue, + ClipNorm include("optimisers.jl") include("train.jl") diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index e7e40012c6..940b3ca5d0 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -172,9 +172,9 @@ opt = Adam(0.001, (0.9, 0.8)) """ mutable struct Adam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) @@ -183,12 +183,10 @@ function apply!(o::Adam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return ( - zero(x), - zero(x), - Float64[β[1], β[2]], - ) - end::Tuple{typeof(x),typeof(x),Vector{Float64}} + return (zero(x), + zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -218,9 +216,9 @@ opt = RAdam(0.001, (0.9, 0.8)) """ mutable struct RAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) @@ -229,14 +227,13 @@ function apply!(o::RAdam, x, Δ) η, β = o.eta, o.beta ρ∞ = 2 / (1 - β[2]) - 1 - mt, vt, βp, t = get!(o.state, x) do - return ( - zero(x), - zero(x), - Float64[β[1], β[2]], - Ref(1), - ) - end::Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} + mt, vt, βp, t = get!(o.state, + x) do + return (zero(x), + zero(x), + Float64[β[1], β[2]], + Ref(1)) + end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -273,9 +270,9 @@ opt = AdaMax(0.001, (0.9, 0.995)) """ mutable struct AdaMax <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) @@ -284,12 +281,10 @@ function apply!(o::AdaMax, x, Δ) η, β = o.eta, o.beta mt, ut, βp = get!(o.state, x) do - return ( - zero(x), - zero(x), - Float64[β[1], β[2]], - ) - end::Tuple{typeof(x),typeof(x),Vector{Float64}} + return (zero(x), + zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. ut = max(β[2] * ut, abs(Δ)) @@ -320,9 +315,9 @@ opt = OAdam(0.001, (0.9, 0.995)) """ mutable struct OAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) @@ -330,14 +325,13 @@ OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) function apply!(o::OAdam, x, Δ) η, β = o.eta, o.beta - mt, vt, Δ_, βp = get!(o.state, x) do - return ( - zero(x), - zero(x), - zero(x), - Float64[β[1], β[2]], - ) - end::Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} + mt, vt, Δ_, βp = get!(o.state, + x) do + return (zero(x), + zero(x), + zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) @@ -402,14 +396,14 @@ opt = AdaDelta(0.89) mutable struct AdaDelta <: AbstractOptimiser rho::Float64 epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) function apply!(o::AdaDelta, x, Δ) ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} + acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)} @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) # DON'T remove epsilon from numerator # or even out of the square roots @@ -439,9 +433,9 @@ opt = AMSGrad(0.001, (0.89, 0.995)) """ mutable struct AMSGrad <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) @@ -450,12 +444,10 @@ function apply!(o::AMSGrad, x, Δ) η, β = o.eta, o.beta mt, vt, v̂t = get!(o.state, x) do - return ( - fill!(similar(x), o.epsilon), - fill!(similar(x), o.epsilon), - fill!(similar(x), o.epsilon), - ) - end::NTuple{3,typeof(x)} + return (fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon)) + end::NTuple{3, typeof(x)} @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ^2 @@ -484,9 +476,9 @@ opt = NAdam(0.002, (0.89, 0.995)) """ mutable struct NAdam <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) @@ -495,19 +487,16 @@ function apply!(o::NAdam, x, Δ) η, β = o.eta, o.beta mt, vt, βp = get!(o.state, x) do - return ( - zero(x), - zero(x), - Float64[o.beta[1], o.beta[2]], - ) - end::Tuple{typeof(x),typeof(x),Vector{Float64}} + return (zero(x), + zero(x), + Float64[o.beta[1], o.beta[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} β1p, β2p = βp @. mt = β[1] * mt + (1 - β[1]) * Δ @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = - (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / - (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η + @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / + (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η βp .= βp .* β return Δ @@ -558,9 +547,9 @@ opt = AdaBelief(0.001, (0.9, 0.8)) """ mutable struct AdaBelief <: AbstractOptimiser eta::Float64 - beta::Tuple{Float64,Float64} + beta::Tuple{Float64, Float64} epsilon::Float64 - state::IdDict{Any,Any} + state::IdDict{Any, Any} end AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) @@ -569,12 +558,10 @@ function apply!(o::AdaBelief, x, Δ) η, β = o.eta, o.beta mt, st, βp = get!(o.state, x) do - return ( - zero(x), - zero(x), - Float64[β[1], β[2]], - ) - end::Tuple{typeof(x),typeof(x),Vector{Float64}} + return (zero(x), + zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} #= st is a variance and can go to zero. This is in contrast to Adam, which uses the second moment which is usually far enough from zero. This is problematic, since st @@ -610,11 +597,11 @@ end Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) @forward Optimiser.os Base.getindex, -Base.first, -Base.last, -Base.lastindex, -Base.push!, -Base.setindex! + Base.first, + Base.last, + Base.lastindex, + Base.push!, + Base.setindex! @forward Optimiser.os Base.iterate Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) @@ -649,10 +636,10 @@ opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2)) """ mutable struct InvDecay <: AbstractOptimiser gamma::Float64 - state::IdDict{Any,Int} + state::IdDict{Any, Int} end -InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any,Int}()) +InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) function apply!(o::InvDecay, x, Δ) γ = o.gamma diff --git a/src/optimise/train.jl b/src/optimise/train.jl index bead5860f0..243356fcf2 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -50,11 +50,9 @@ end ``` """ function skip() - Base.depwarn( - """Flux.skip() will be removed from Flux 0.14. - and should be replaced with `continue` in an ordinary `for` loop.""", - :skip, - ) + Base.depwarn("""Flux.skip() will be removed from Flux 0.14. + and should be replaced with `continue` in an ordinary `for` loop.""", + :skip) throw(SkipException()) end @@ -79,11 +77,9 @@ end ``` """ function stop() - Base.depwarn( - """Flux.stop() will be removed from Flux 0.14. - It should be replaced with `break` in an ordinary `for` loop.""", - :stop, - ) + Base.depwarn("""Flux.stop() will be removed from Flux 0.14. + It should be replaced with `break` in an ordinary `for` loop.""", + :stop) throw(StopException()) end @@ -178,14 +174,12 @@ hello ``` """ macro epochs(n, ex) - Base.depwarn( - """The macro `@epochs` will be removed from Flux 0.14. - As an alternative, you can write a simple `for i in 1:epochs` loop.""", - Symbol("@epochs"); - force = true, - ) - return :(@progress for i = 1:($(esc(n))) - @info "Epoch $i" - $(esc(ex)) - end) + Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. + As an alternative, you can write a simple `for i in 1:epochs` loop.""", + Symbol("@epochs"); + force = true) + return :(@progress for i in 1:($(esc(n))) + @info "Epoch $i" + $(esc(ex)) + end) end diff --git a/src/outputsize.jl b/src/outputsize.jl index 65f006d54f..c31bbbba2a 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -13,8 +13,8 @@ struct Nil <: Real end @doc @doc(Nil) const nil = Nil() -Nil(::T) where {T<:Number} = nil -(::Type{T})(::Nil) where {T<:Number} = nil +Nil(::T) where {T <: Number} = nil +(::Type{T})(::Nil) where {T <: Number} = nil Base.convert(::Type{Nil}, ::Number) = nil Base.float(::Type{Nil}) = Nil @@ -181,12 +181,10 @@ end for (fn, Dims) in ((:conv, DenseConvDims),) @eval begin function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) - return fill( - nil, - NNlib.output_size(dims)..., - NNlib.channels_out(dims), - size(a)[end], - ) + return fill(nil, + NNlib.output_size(dims)..., + NNlib.channels_out(dims), + size(a)[end]) end function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) diff --git a/src/utils.jl b/src/utils.jl index 07d99e8b97..6751bc1096 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(dims::Tuple) = nfan(dims...) -nfan(dims...) = prod(dims[1:(end-2)]) .* (dims[end-1], dims[end]) # In case of convolution kernels +nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) @@ -270,18 +270,15 @@ julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100))) 1.0f0 ``` """ -function truncated_normal( - rng::AbstractRNG, - dims::Integer...; - mean = 0, - std = 1, - lo = -2, - hi = 2, -) +function truncated_normal(rng::AbstractRNG, + dims::Integer...; + mean = 0, + std = 1, + lo = -2, + hi = 2) norm_cdf(x) = 0.5 * (1 + erf(x / √2)) if (mean < lo - 2 * std) || (mean > hi + 2 * std) - @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog = - 1 + @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 end l = norm_cdf((lo - mean) / std) u = norm_cdf((hi - mean) / std) @@ -354,7 +351,7 @@ end function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...) dims = (d1, ds...) - rows = prod(dims[1:(end-1)]) + rows = prod(dims[1:(end - 1)]) cols = dims[end] return reshape(orthogonal(rng, rows, cols; kwargs...), dims) end @@ -363,8 +360,8 @@ function orthogonal(dims::Integer...; kwargs...) return orthogonal(default_rng_value(), dims...; kwargs...) end function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...) - return (dims::Integer...; kwargs...) -> - orthogonal(rng, dims...; init_kwargs..., kwargs...) + return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., + kwargs...) end ChainRulesCore.@non_differentiable orthogonal(::Any...) @@ -403,11 +400,7 @@ julia> count(iszero, ans.weight; dims = 1) """ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01) if length(dims) != 2 - throw( - ArgumentError( - "Only 2-dimensional outputs are supported for sparse initialization.", - ), - ) + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) end rows, cols = dims prop_zero = min(1.0, sparsity) @@ -506,10 +499,10 @@ end # Assume convolution function identity_init(dims::Integer...; gain::Real = 1, shift = 0) - nin, nout = dims[end-1], dims[end] - centers = map(d -> cld(d, 2), dims[1:(end-2)]) + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) weights = zeros32(dims...) - for i = 1:min(nin, nout) + for i in 1:min(nin, nout) weights[centers..., i, i] = gain end return circshift(weights, shift) diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index 6439a3a8f5..e85d56590e 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -10,7 +10,7 @@ using CUDA function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i = 1:length(x) + for i in 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -30,7 +30,7 @@ end g1 = gradient(ctc_loss, x_cu, y)[1] g1 = g1 |> collect g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 + @test g1≈g2 rtol=1e-5 atol=1e-5 # test that GPU loss matches CPU implementation l1 = ctc_loss(x_cu, y) @@ -42,23 +42,19 @@ end y = [1, 2] @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - g = [ - -0.317671 -0.427729 0.665241 - 0.244728 -0.0196172 -0.829811 - 0.0729422 0.447346 0.16457 - ] + g = [-0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol = 1e-5 atol = 1e-5 + @test g≈ghat rtol=1e-5 atol=1e-5 x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray y = [1, 2] |> CuArray @test ctc_loss(x_cu, y) ≈ 8.02519869363453 - g = [ - -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 - ] + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol = 1e-5 atol = 1e-5 + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/ctc.jl b/test/ctc.jl index 059b14f292..6e5681f977 100644 --- a/test/ctc.jl +++ b/test/ctc.jl @@ -9,7 +9,7 @@ using LinearAlgebra function ctc_ngradient(x, y) f = Flux.Losses.ctc_loss grads = zero(x) - for i = 1:length(x) + for i in 1:length(x) δ = sqrt(eps()) tmp = x[i] x[i] = tmp - δ / 2 @@ -27,30 +27,26 @@ end y = rand(1:9, 30) g1 = gradient(ctc_loss, x, y)[1] g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol = 1e-5 atol = 1e-5 + @test g1≈g2 rtol=1e-5 atol=1e-5 # tests using hand-calculated values x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] y = [1, 2] @test ctc_loss(x, y) ≈ 3.6990738275138035 - g = [ - -0.317671 -0.427729 0.665241 - 0.244728 -0.0196172 -0.829811 - 0.0729422 0.447346 0.16457 - ] + g = [-0.317671 -0.427729 0.665241 + 0.244728 -0.0196172 -0.829811 + 0.0729422 0.447346 0.16457] ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol = 1e-5 atol = 1e-5 + @test g≈ghat rtol=1e-5 atol=1e-5 x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] y = [1, 2] @test ctc_loss(x, y) ≈ 8.02519869363453 - g = [ - -2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 - 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 - -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07 - ] + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063 + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307 + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol = 1e-5 atol = 1e-5 + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 25648eb787..033a08df95 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -21,7 +21,7 @@ using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray cm = gpu(m) @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} + @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2} xs = rand(5, 5) ys = Flux.onehotbatch(1:5, 1:5) @@ -81,7 +81,7 @@ end M = 2.0 * I(10) |> collect Q = cholesky(M) Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any,<:CuArray} + @test Q_gpu isa Cholesky{<:Any, <:CuArray} Q_cpu = Q_gpu |> cpu @test Q_cpu == cholesky(eltype(Q_gpu).(M)) end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index a46b5684c8..93e6dea6aa 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -11,55 +11,52 @@ using Flux, CUDA, Test @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) end -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) - rnn = R(10, 5) - curnn = fmap(gpu, rnn) - - Flux.reset!(rnn) - Flux.reset!(curnn) - x = batch_size == 1 ? rand(Float32, 10) : rand(Float32, 10, batch_size) - cux = gpu(x) - - y, back = pullback((r, x) -> r(x), rnn, x) - cuy, cuback = pullback((r, x) -> r(x), curnn, cux) - - @test y ≈ collect(cuy) - - ȳ = randn(size(y)) - m̄, x̄ = back(ȳ) - cum̄, cux̄ = cuback(gpu(ȳ)) - - @test x̄ ≈ collect(cux̄) - @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) - @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) - @test m̄[].cell.b ≈ collect(cum̄[].cell.b) - if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) - end - else - @test m̄[].state ≈ collect(cum̄[].state) +@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) + rnn = R(10, 5) + curnn = fmap(gpu, rnn) + + Flux.reset!(rnn) + Flux.reset!(curnn) + x = batch_size == 1 ? rand(Float32, 10) : rand(Float32, 10, batch_size) + cux = gpu(x) + + y, back = pullback((r, x) -> r(x), rnn, x) + cuy, cuback = pullback((r, x) -> r(x), curnn, cux) + + @test y ≈ collect(cuy) + + ȳ = randn(size(y)) + m̄, x̄ = back(ȳ) + cum̄, cux̄ = cuback(gpu(ȳ)) + + @test x̄ ≈ collect(cux̄) + @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi) + @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) + @test m̄[].cell.b ≈ collect(cum̄[].cell.b) + if m̄[].state isa Tuple + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) end - - Flux.reset!(rnn) - Flux.reset!(curnn) - ohx = - batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) - cuohx = gpu(ohx) - y = (rnn(ohx); rnn(ohx)) - - cuy = (curnn(cuohx); curnn(cuohx)) - @test y ≈ collect(cuy) - - Flux.reset!(rnn) - Flux.reset!(curnn) - fx = rand(Float32, 10, batch_size, 3) - cufx = gpu(fx) - fy = (rnn(fx); rnn(fx)) - - cufy = (curnn(cufx); curnn(cufx)) - @test fy ≈ collect(cufy) + else + @test m̄[].state ≈ collect(cum̄[].state) end -end + + Flux.reset!(rnn) + Flux.reset!(curnn) + ohx = batch_size == 1 ? Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) + cuohx = gpu(ohx) + y = (rnn(ohx); rnn(ohx)) + + cuy = (curnn(cuohx); curnn(cuohx)) + @test y ≈ collect(cuy) + + Flux.reset!(rnn) + Flux.reset!(curnn) + fx = rand(Float32, 10, batch_size, 3) + cufx = gpu(fx) + fy = (rnn(fx); rnn(fx)) + + cufy = (curnn(cufx); curnn(cufx)) + @test fy ≈ collect(cufy) +end end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index e2da95931d..eb9382c42d 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -15,68 +15,63 @@ const BROKEN_LAYERS = Union{} const ACTIVATIONS = [identity, relu, tanh, sigmoid, exp, softplus, elu, selu] -function gpu_gradtest( - name::String, - layers::Vector, - x_cpu = nothing, - args...; - test_cpu = true, -) +function gpu_gradtest(name::String, + layers::Vector, + x_cpu = nothing, + args...; + test_cpu = true) isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin - for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) - ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1.0f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa - Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu ≈ y_cpu rtol = 1.0f-3 atol = 1.0f-3 - if isnothing(xg_cpu) - @test isnothing(xg_gpu) + @testset "$name GPU grad tests" begin for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1.0f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa + Flux.Zygote.Grads + else + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3 + if isnothing(xg_cpu) + @test isnothing(xg_gpu) + else + if layer === GroupedConvTranspose + @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3 else - if layer === GroupedConvTranspose - @test Array(xg_gpu) ≈ xg_cpu rtol = 2.0f-2 atol = 1.0f-3 - else - @test Array(xg_gpu) ≈ xg_cpu rtol = 1.0f-3 atol = 1.0f-3 - end + @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3 end end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol = 1.0f-3 atol = - 1.0f-3 - end + end + @test gs_gpu isa Flux.Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + if isnothing(gs_cpu[p_cpu]) + @test isnothing(gs_gpu[p_gpu]) + else + @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3 end end end end end - end + end end end # Just to give testset in gpu_gradtest meaningful labels @@ -99,58 +94,48 @@ for act in ACTIVATIONS DepthwiseConv, DepthwiseConvNoBias, ] - gpu_gradtest( - "Convolution with $act", - conv_layers, - r, - (2, 2), - 1 => 3, - act; - test_cpu = false, - ) + gpu_gradtest("Convolution with $act", + conv_layers, + r, + (2, 2), + 1 => 3, + act; + test_cpu = false) groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest( - "GroupedConvolution with $act", - groupedconv, - rand(Float32, 28, 28, 100, 2), - (3, 3), - 100 => 25, - act; - test_cpu = true, - ) + gpu_gradtest("GroupedConvolution with $act", + groupedconv, + rand(Float32, 28, 28, 100, 2), + (3, 3), + 100 => 25, + act; + test_cpu = true) batch_norm = [BatchNorm] - gpu_gradtest( - "BatchNorm 1 with $act", - batch_norm, - rand(Float32, 28, 28, 3, 4), - 3, - act; - test_cpu = false, - ) #TODO fix errors - gpu_gradtest( - "BatchNorm 2 with $act", - batch_norm, - rand(Float32, 5, 4), - 5, - act; - test_cpu = false, - ) + gpu_gradtest("BatchNorm 1 with $act", + batch_norm, + rand(Float32, 28, 28, 3, 4), + 3, + act; + test_cpu = false) #TODO fix errors + gpu_gradtest("BatchNorm 2 with $act", + batch_norm, + rand(Float32, 5, 4), + 5, + act; + test_cpu = false) instancenorm = [InstanceNorm] gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act; test_cpu = false) groupnorm = [GroupNorm] - gpu_gradtest( - "GroupNorm with $act", - groupnorm, - rand(Float32, 28, 28, 3, 1), - 3, - 1, - act; - test_cpu = false, - ) + gpu_gradtest("GroupNorm with $act", + groupnorm, + rand(Float32, 28, 28, 3, 1), + 3, + 1, + act; + test_cpu = false) end r = rand(Float32, 28, 28, 1, 1) @@ -183,13 +168,11 @@ gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2) -gpu_gradtest( - "Embedding OneHotMatrix repeated indices", - embedding, - OneHotMatrix([1, 2, 2], 5), - 5, - 2, -) +gpu_gradtest("Embedding OneHotMatrix repeated indices", + embedding, + OneHotMatrix([1, 2, 2], 5), + 5, + 2) @testset "function layers" begin x = rand(Float32, 3, 3) @@ -338,11 +321,9 @@ end end @testset "Dropout RNGs" begin - @test_throws ArgumentError Flux.dropout( - MersenneTwister(), - CUDA.rand(Float32, 2, 3), - 0.1, - ) + @test_throws ArgumentError Flux.dropout(MersenneTwister(), + CUDA.rand(Float32, 2, 3), + 0.1) @testset for layer in (Dropout, AlphaDropout) m = layer(0.1; rng = MersenneTwister(123)) @test_throws ErrorException gpu(m) diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl index 467d3ed46e..1383bd04cd 100644 --- a/test/cuda/losses.jl +++ b/test/cuda/losses.jl @@ -1,5 +1,6 @@ using Flux.Losses: - crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss + crossentropy, binarycrossentropy, logitbinarycrossentropy, + binary_focal_loss, focal_loss @testset "Losses" begin x = [1.0, 2.0, 3.0] @@ -14,22 +15,16 @@ using Flux.Losses: @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - x = [ - 0.268941 0.5 0.268941 - 0.731059 0.5 0.731059 - ] - y = [ - 0 1 0 - 1 0 1 - ] + x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] + y = [0 1 0 + 1 0 1] @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) x = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y = [ - 1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0 - ] + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) @testset "GPU grad tests" begin diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl index 027d13a612..77fdba5c89 100644 --- a/test/cuda/test_utils.jl +++ b/test/cuda/test_utils.jl @@ -7,10 +7,10 @@ function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) end check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) - @test g_cpu ≈ g_gpu rtol = rtol atol = atol + @test g_cpu≈g_gpu rtol=rtol atol=atol end function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) - @test g_cpu ≈ collect(g_gpu) rtol = rtol atol = atol + @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol end function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) @@ -27,13 +27,11 @@ function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) end end -function gpu_autodiff_test( - f_cpu, - xs_cpu::Array{Float32}...; - test_equal = true, - rtol = 1e-4, - atol = 1e-4, -) +function gpu_autodiff_test(f_cpu, + xs_cpu::Array{Float32}...; + test_equal = true, + rtol = 1e-4, + atol = 1e-4) check_type(x) = false check_type(x::Float32) = true check_type(x::CuArray{Float32}) = true @@ -55,7 +53,7 @@ function gpu_autodiff_test( gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) check_grad(g_gpu, g_cpu, atol, rtol) end @@ -71,7 +69,7 @@ function gpu_autodiff_test( gs_gpu = back_gpu(Δ_gpu) if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol = rtol atol = atol + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol @assert length(ps_gpu) == length(ps_cpu) for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) diff --git a/test/data.jl b/test/data.jl index 3d2083af4f..0b66f6b50c 100644 --- a/test/data.jl +++ b/test/data.jl @@ -36,7 +36,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)} - @test eltype(batches) == Tuple{typeof(X),typeof(Y)} + @test eltype(batches) == Tuple{typeof(X), typeof(Y)} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -53,7 +53,7 @@ using Random # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} - @test eltype(batches) == NamedTuple{(:x, :y),Tuple{typeof(X),typeof(Y)}} + @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @@ -69,7 +69,7 @@ using Random d = DataLoader([1:10;]; shuffle = true) cd = collect(zip(d, d)) # skip the first since it used to be different also before fixing the bug - @test [cd[i][1] for i = 2:10] != [cd[i][2] for i = 2:10] + @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10] # test interaction with `train!` θ = ones(2) @@ -89,13 +89,9 @@ using Random @test norm(θ .- 1) < 1e-10 # specify the rng - d = map( - identity, - DataLoader( - X; - batchsize = 2, - shuffle = true, - rng = Random.seed!(Random.default_rng(), 5), - ), - ) + d = map(identity, + DataLoader(X; + batchsize = 2, + shuffle = true, + rng = Random.seed!(Random.default_rng(), 5))) end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index f3600850a8..0a3e73879e 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -2,18 +2,16 @@ using Test, Random import Flux: activations @testset "basic" begin - @testset "helpers" begin - @testset "activations" begin - dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) - x = randn(10) - @test activations(dummy_model, x)[1] == x .^ 2 - @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) - @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) - - @test activations(Chain(), x) == () - @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type - end - end + @testset "helpers" begin @testset "activations" begin + dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x .^ 2 + @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) + + @test activations(Chain(), x) == () + @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type + end end @testset "Chain" begin @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) @@ -89,9 +87,10 @@ import Flux: activations @test Dense(10, 2, identity; init = ones)(ones(10, 1)) == 10 * ones(2, 1) @test Dense(10, 2, identity; init = ones)([ones(10, 1) 2 * ones(10, 1)]) == [10 20; 10 20] - @test Dense(10, 2, identity; init = ones, bias = false)( - [ones(10, 1) 2 * ones(10, 1)], - ) == [10 20; 10 20] + @test Dense(10, 2, identity; init = ones, bias = false)([ones(10, 1) 2 * + ones(10, + 1)]) == + [10 20; 10 20] end end @@ -159,9 +158,8 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size( - SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input), - ) == (10, 4) + @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b; dims = 2))(input)) == + (10, 4) end end @@ -219,9 +217,8 @@ import Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size( - Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input), - ) == (10, 4) + @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) == + (10, 4) @test size(Parallel(hcat; one = Dense(10, 10), two = identity)(input)) == (10, 4) end @@ -229,9 +226,8 @@ import Flux: activations @testset "vararg input" begin inputs = randn(10), randn(5), randn(4) @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size( - Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs), - ) == (2,) + @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == + (2,) @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3) # wrong number of inputs @test Parallel(+, sin, cos)(pi / 2) ≈ 1 end @@ -241,16 +237,12 @@ import Flux: activations @test m[1] == m[:one] @test m[1:2] == m - @test_throws ArgumentError Parallel( - hcat, - layers = Dense(10, 10), - two = identity, - ) # reserved names - @test_throws ArgumentError Parallel( - hcat, - connection = Dense(10, 10), - two = identity, - ) + @test_throws ArgumentError Parallel(hcat, + layers = Dense(10, 10), + two = identity) # reserved names + @test_throws ArgumentError Parallel(hcat, + connection = Dense(10, 10), + two = identity) @test m == fmap(identity, m) # does not forget names @@ -259,7 +251,7 @@ import Flux: activations end @testset "trivial cases" begin - @test Parallel(hcat) isa Parallel{typeof(hcat),Tuple{}} # not a NamedTuple + @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple @test Parallel(hcat)(1) == hcat() @test Parallel(hcat, inv)(2) == hcat(1 / 2) # still calls connection once. end @@ -324,7 +316,7 @@ import Flux: activations x = rand(1:vocab_size, 3, 4) y = m(x) - @test y isa Array{Float32,3} + @test y isa Array{Float32, 3} @test size(y) == (embed_size, 3, 4) @test m(2) ≈ m.weight[:, 2] diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 51082723fb..1733b5e40b 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -25,15 +25,13 @@ end @testset "CNN" begin r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - Conv((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - Conv((2, 2), 16 => 8, relu), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), - softmax, - ) + m = Chain(Conv((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + Conv((2, 2), 16 => 8, relu), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax) @test size(m(r)) == (10, 5) @@ -59,7 +57,7 @@ end op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0 opt = Descent() - for _ = 1:(10^3) + for _ in 1:(10^3) gs = gradient(Flux.params(bias)) do return Flux.Losses.mse(bias(ip), op) end @@ -116,7 +114,7 @@ end @test _channels_out(ConvTranspose((5, 6), 2 => 2; groups = 2)) == 2 for Layer in [Conv, ConvTranspose] - for _ = 1:10 + for _ in 1:10 groups = rand(1:10) kernel_size = Tuple(rand(1:5) for _ in rand(1:3)) cin = rand(1:5) * groups @@ -138,7 +136,7 @@ end @test y_hat[2, 2] ≈ 9.0 @test y_hat[end, 1] ≈ 4.0 @test y_hat[1, end] ≈ 3.0 - @test y_hat[1, end-1] ≈ 6.0 + @test y_hat[1, end - 1] ≈ 6.0 @test y_hat[end, end] ≈ 2.0 end @@ -206,22 +204,20 @@ end w = rand(Float32, 2, 2, 1, 1) y = CrossCor(w, [0.0]) - @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol = 2e-7 + @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7 r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - CrossCor((2, 2), 1 => 16, relu), - MaxPool((2, 2)), - CrossCor((2, 2), 16 => 8, relu; bias = false), - MaxPool((2, 2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), - softmax, - ) + m = Chain(CrossCor((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + CrossCor((2, 2), 16 => 8, relu; bias = false), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), + softmax) @test size(m(r)) == (10, 5) @test y(x) != Conv(w, [0.0])(x) - @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol = 1e-7 + @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7 end @testset "Conv with non quadratic window #700" begin @@ -230,17 +226,17 @@ end l = Conv((3, 3), 1 => 1) expected = zeros(eltype(l.weight), 5, 5, 1, 1) - expected[2:(end-1), 2:(end-1), 1, 1] = l.weight + expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight @test expected ≈ l(data) l = Conv((3, 1), 1 => 1) expected = zeros(eltype(l.weight), 5, 7, 1, 1) - expected[2:(end-1), 4, 1, 1] = l.weight + expected[2:(end - 1), 4, 1, 1] = l.weight @test expected ≈ l(data) l = Conv((1, 3), 1 => 1) expected = zeros(eltype(l.weight), 7, 5, 1, 1) - expected[4, 2:(end-1), 1, 1] = l.weight + expected[4, 2:(end - 1), 1, 1] = l.weight @test expected ≈ l(data) @test begin @@ -250,9 +246,9 @@ end end end -@testset "$ltype SamePad kernelsize $k" for ltype in - (Conv, ConvTranspose, DepthwiseConv, CrossCor), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) +@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, + CrossCor), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k, 1 => 1; pad = SamePad()) @@ -264,25 +260,24 @@ end stride = 3 l = ltype(k, 1 => 1; pad = SamePad(), stride = stride) if ltype == ConvTranspose - @test size(l(data))[1:(end-2)] == stride .* size(data)[1:(end-2)] + @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)] else - @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], stride) + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride) end end @testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), - k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) data = ones(Float32, (k .+ 3)..., 1, 1) l = ltype(k; pad = SamePad()) - @test size(l(data))[1:(end-2)] == cld.(size(data)[1:(end-2)], k) + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k) end @testset "bugs fixed" begin - # https://github.com/FluxML/Flux.jl/issues/1421 - @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} -end +# https://github.com/FluxML/Flux.jl/issues/1421 +@test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] @test fun(rand(2, 3, 4)).bias isa Vector{Float64} diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 32e99245d6..e4cfcb9d35 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -3,128 +3,122 @@ using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -@testset "Dropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = [1.0, 2.0, 3.0] - @test x == Dropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(Dropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - - x = rand(100) - m = Dropout(0.9; rng_kwargs...) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - testmode!(m, false) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - - x = rand(Float32, 100) - m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) - y = evalwgrad(m, x) - @test count(a -> a == 0, y) > 50 - testmode!(m, true) - y = evalwgrad(m, x) # should override istraining - @test count(a -> a == 0, y) == 0 - - x = rand(100, 50) - m = Dropout(0.5; dims = 2, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) - @test minimum(c) == maximum(c) - m = Dropout(0.5; dims = 1, rng_kwargs...) - y = m(x) - c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) - @test minimum(c) == maximum(c) - - # issue #1084 - m = Dropout(0.9; rng_kwargs...) - x = rand(100) - - testmode!(m) - y = m(x) - @test count(a -> a == 0, y) == 0 - trainmode!(m) - y = m(x) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) - @test count(a -> a == 0, y) > 50 - - y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) - @test count(a -> a == 0, y) == 0 - - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end +@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = [1.0, 2.0, 3.0] + @test x == Dropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(Dropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) + + x = rand(100) + m = Dropout(0.9; rng_kwargs...) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + testmode!(m, false) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + + x = rand(Float32, 100) + m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) + y = evalwgrad(m, x) + @test count(a -> a == 0, y) > 50 + testmode!(m, true) + y = evalwgrad(m, x) # should override istraining + @test count(a -> a == 0, y) == 0 + + x = rand(100, 50) + m = Dropout(0.5; dims = 2, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) + @test minimum(c) == maximum(c) + m = Dropout(0.5; dims = 1, rng_kwargs...) + y = m(x) + c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) + @test minimum(c) == maximum(c) + + # issue #1084 + m = Dropout(0.9; rng_kwargs...) + x = rand(100) + + testmode!(m) + y = m(x) + @test count(a -> a == 0, y) == 0 + trainmode!(m) + y = m(x) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = true) + @test count(a -> a == 0, y) > 50 + + y = Flux.dropout(values(rng_kwargs)..., x, 0.9; active = false) + @test count(a -> a == 0, y) == 0 + + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng isa Random._GLOBAL_RNG end + else + @test cpu(m).rng === only(values(rng_kwargs)) + end +end end + +@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0, 2.0, 3.0] + @test x == AlphaDropout(0.1; rng_kwargs...)(x) + @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) + @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) + + x = randn(1000) # large enough to prevent flaky test + m = AlphaDropout(0.5; rng_kwargs...) + + y = evalwgrad(m, x) + # Should preserve unit mean and variance + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 + + testmode!(m, true) # should override istraining + @test evalwgrad(m, x) == x + + testmode!(m, false) + y = evalwgrad(m, x) + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 + + # Known good value ranges + # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 + x = ones(100) + if isempty(rng_kwargs) + @test 40 < sum(evalwgrad(m, x)) < 130 + else + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 end -end - -@testset "AlphaDropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0, 2.0, 3.0] - @test x == AlphaDropout(0.1; rng_kwargs...)(x) - @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) - @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) - - x = randn(1000) # large enough to prevent flaky test - m = AlphaDropout(0.5; rng_kwargs...) - - y = evalwgrad(m, x) - # Should preserve unit mean and variance - @test mean(y) ≈ 0 atol = 0.2 - @test var(y) ≈ 1 atol = 0.2 - - testmode!(m, true) # should override istraining - @test evalwgrad(m, x) == x - - testmode!(m, false) - y = evalwgrad(m, x) - @test mean(y) ≈ 0 atol = 0.2 - @test var(y) ≈ 1 atol = 0.2 - - # Known good value ranges - # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 - x = ones(100) - if isempty(rng_kwargs) - @test 40 < sum(evalwgrad(m, x)) < 130 - else - # FIXME: this breaks spuriously for MersenneTwister - @test_skip 40 < sum(evalwgrad(m, x)) < 130 - end - # CPU RNGs map onto CPU ok - if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end + # CPU RNGs map onto CPU ok + if isempty(rng_kwargs) + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng isa Random._GLOBAL_RNG end + else + @test cpu(m).rng === only(values(rng_kwargs)) end -end +end end @testset "BatchNorm" begin - let m = BatchNorm(2), x = [ - 1.0 3.0 5.0 - 2.0 4.0 6.0 - ] + let m = BatchNorm(2), x = [1.0 3.0 5.0 + 2.0 4.0 6.0] @test Flux.hasaffine(m) == true @test length(Flux.params(m)) == 2 @@ -167,10 +161,8 @@ end end # with activation function - let m = BatchNorm(2, sigmoid), x = [ - 1.0 3.0 5.0 - 2.0 4.0 6.0 - ] + let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0 + 2.0 4.0 6.0] y = m(x) @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) @inferred m(x) @@ -243,16 +235,15 @@ end # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8 N = ndims(x) @test m.μ ≈ [0.5, 0.8] - n = prod(size(x, i) for i = 1:(N-2)) + n = prod(size(x, i) for i in 1:(N - 2)) corr = n / (n - 1) - σ² = var(x; dims = 1:(N-2), corrected = false) + σ² = var(x; dims = 1:(N - 2), corrected = false) @test m.σ² ≈ 0.1 * corr * vec(mean(σ²; dims = N)) .+ 0.9 * 1 y = m(x) @test length(m.μ) == 2 @test length(m.σ²) == 2 - @test y ≈ (x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol = - 1.0e-5 + @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5 @inferred m(x) end @@ -270,7 +261,7 @@ end y = m(x) # inference time after a training step μ = reshape(m.μ, affine_shape...) σ² = reshape(m.σ², affine_shape...) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 @inferred m(x) end @@ -286,7 +277,7 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 @inferred m(x) end @@ -302,7 +293,7 @@ end y = m(x) μ = mean(x; dims = 1) σ² = var(x; dims = 1, corrected = false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol = 1.0e-7 + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 @inferred m(x) end @@ -324,8 +315,8 @@ end x = reshape(Float32.(collect(1:prod(sizes))), sizes) y = evalwgrad(m, x) - @test size(m.μ) == (sizes[end-1],) - @test size(m.σ²) == (sizes[end-1],) + @test size(m.μ) == (sizes[end - 1],) + @test size(m.σ²) == (sizes[end - 1],) @test size(y) == sizes @inferred m(x) @@ -337,7 +328,8 @@ end sizes = (5, 5, 3, 4, 2, 6), x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:(end-2)]..., :, 1))), sizes) + @test m_inorm(x) == + reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes) end let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1) @@ -424,7 +416,7 @@ end y = m(x) out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5) - @test y ≈ reshape(out, size(x)) atol = 1.0e-5 + @test y≈reshape(out, size(x)) atol=1.0e-5 end # with activation function let m = GroupNorm(4, 2, sigmoid; track_stats = true), @@ -433,11 +425,11 @@ end x = Float32.(x) μ_affine_shape = ones(Int, length(sizes) + 1) - μ_affine_shape[end-1] = 2 # Number of groups + μ_affine_shape[end - 1] = 2 # Number of groups affine_shape = ones(Int, length(sizes) + 1) - affine_shape[end-2] = 2 # Channels per group - affine_shape[end-1] = 2 # Number of groups + affine_shape[end - 2] = 2 # Channels per group + affine_shape[end - 1] = 2 # Number of groups affine_shape[1] = sizes[1] affine_shape[end] = sizes[end] @@ -445,14 +437,10 @@ end y = m(x) x_ = reshape(x, affine_shape...) - out = reshape( - sigmoid.( - (x_ .- reshape(m.μ, μ_affine_shape...)) ./ - sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ) - ), - og_shape, - ) - @test y ≈ out atol = 1e-7 + out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./ + sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)), + og_shape) + @test y≈out atol=1e-7 end let m = trainmode!(GroupNorm(2, 2; track_stats = true)), diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index 225c4d83a9..2ca108ba6b 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -2,7 +2,7 @@ using LinearAlgebra # Ref FluxML/Flux.jl#1209 1D input @testset "BPTT-1D" begin - seq = [rand(Float32, 2) for i = 1:3] + seq = [rand(Float32, 2) for i in 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -10,29 +10,22 @@ using LinearAlgebra return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient( - Wh -> sum( - tanh.( - rnn.cell.Wi * seq[3] + - Wh * - tanh.( - rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + - rnn.cell.b - ) + - rnn.cell.b - ), - ), - rnn.cell.Wh, - ) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end # Ref FluxML/Flux.jl#1209 2D input @testset "BPTT-2D" begin - seq = [rand(Float32, (2, 1)) for i = 1:3] + seq = [rand(Float32, (2, 1)) for i in 1:3] for r in [RNN] rnn = r(2 => 3) Flux.reset!(rnn) @@ -40,22 +33,15 @@ end return sum([rnn(s) for s in seq][3]) end Flux.reset!(rnn) - bptt = gradient( - Wh -> sum( - tanh.( - rnn.cell.Wi * seq[3] + - Wh * - tanh.( - rnn.cell.Wi * seq[2] + - Wh * - tanh.(rnn.cell.Wi * seq[1] + Wh * rnn.cell.state0 + rnn.cell.b) + - rnn.cell.b - ) + - rnn.cell.b - ), - ), - rnn.cell.Wh, - ) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end end @@ -80,34 +66,30 @@ end @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end -@testset "RNN-shapes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] - m1 = R(3 => 5) - m2 = R(3 => 5) - m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation - x1 = rand(Float32, 3) - x2 = rand(Float32, 3, 1) - x3 = rand(Float32, 3, 1, 2) - Flux.reset!(m1) - Flux.reset!(m2) - Flux.reset!(m3) - @test size(m1(x1)) == (5,) - @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m2(x2)) == (5, 1) - @test size(m2(x2)) == (5, 1) - @test size(m3(x3)) == (5, 1, 2) - @test size(m3(x3)) == (5, 1, 2) - end -end +@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] + m1 = R(3 => 5) + m2 = R(3 => 5) + m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation + x1 = rand(Float32, 3) + x2 = rand(Float32, 3, 1) + x3 = rand(Float32, 3, 1, 2) + Flux.reset!(m1) + Flux.reset!(m2) + Flux.reset!(m3) + @test size(m1(x1)) == (5,) + @test size(m1(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m2(x2)) == (5, 1) + @test size(m2(x2)) == (5, 1) + @test size(m3(x3)) == (5, 1, 2) + @test size(m3(x3)) == (5, 1, 2) +end end -@testset "RNN-input-state-eltypes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] - m = R(3 => 5) - x = rand(Float64, 3, 1) - Flux.reset!(m) - @test_throws MethodError m(x) - end -end +@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] + m = R(3 => 5) + x = rand(Float64, 3, 1) + Flux.reset!(m) + @test_throws MethodError m(x) +end end @testset "multigate" begin x = rand(6, 5) @@ -123,19 +105,17 @@ end x = rand(3, 3, 1, 2, 4) @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims = ndims(x))) - slicedim = (size(x)[1:(end-1)]..., 1) + slicedim = (size(x)[1:(end - 1)]..., 1) res, (dx,) = Flux.withgradient(x) do x x1, _, x3, _ = Flux.eachlastdim(x) return sum(x1) + sum(x3 .* 3) end @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) - @test dx ≈ cat( - fill(1, slicedim), - fill(0, slicedim), - fill(3, slicedim), - fill(0, slicedim); - dims = ndims(x), - ) + @test dx ≈ cat(fill(1, slicedim), + fill(0, slicedim), + fill(3, slicedim), + fill(0, slicedim); + dims = ndims(x)) end @testset "∇eachlastdim" begin @@ -147,44 +127,34 @@ end NoTangent = Flux.Zygote.NoTangent abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()] @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x)) - x2 = rand(Float64, x_size[1:(end-1)]) - x3 = rand(Float64, x_size[1:(end-1)]) + x2 = rand(Float64, x_size[1:(end - 1)]) + x3 = rand(Float64, x_size[1:(end - 1)]) mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()] @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ - cat(zeros(x_size[1:(end-1)]), x2, x3, zeros(x_size[1:(end-1)]); dims = ndims(x)) + cat(zeros(x_size[1:(end - 1)]), x2, x3, zeros(x_size[1:(end - 1)]); + dims = ndims(x)) end @testset "Different Internal Matrix Types" begin - R = Flux.Recur( - Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1)), - ) + R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), + rand(5, 1))) # don't want to pull in SparseArrays just for this test, but there aren't any # non-square structured matrix types in LinearAlgebra. so we will use a different # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the # same type. - L = Flux.Recur( - Flux.LSTMCell( - rand(5 * 4, 3), - rand(1:20, 5 * 4, 5), - rand(5 * 4), - (rand(5, 1), rand(5, 1)), - ), - ) - G = Flux.Recur( - Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), rand(5, 1)), - ) - G3 = Flux.Recur( - Flux.GRUv3Cell( - rand(5 * 3, 3), - rand(1:20, 5 * 2, 5), - rand(5 * 3), - Tridiagonal(rand(5, 5)), - rand(5, 1), - ), - ) + L = Flux.Recur(Flux.LSTMCell(rand(5 * 4, 3), + rand(1:20, 5 * 4, 5), + rand(5 * 4), + (rand(5, 1), rand(5, 1)))) + G = Flux.Recur(Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), + rand(5, 1))) + G3 = Flux.Recur(Flux.GRUv3Cell(rand(5 * 3, 3), + rand(1:20, 5 * 2, 5), + rand(5 * 3), + Tridiagonal(rand(5, 5)), + rand(5, 1))) for m in [R, L, G, G3] - x1 = rand(3) x2 = rand(3, 1) x3 = rand(3, 1, 2) diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl index 66831d3d68..c4e1c30341 100644 --- a/test/layers/upsample.jl +++ b/test/layers/upsample.jl @@ -2,19 +2,19 @@ m = Upsample(:bilinear; scale = (2, 3)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (6, 12, 2, 3) m = Upsample(:bilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (9, 12, 2, 3) m = Upsample(:bilinear; size = (4, 6)) x = rand(Float32, 3, 4, 2, 3) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (4, 6, 2, 3) end @@ -22,19 +22,19 @@ end m = Upsample(:trilinear; scale = (2, 3, 2)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32,5} + @test y isa Array{Float32, 5} @test size(y) == (6, 12, 4, 3, 4) m = Upsample(:trilinear; scale = 3) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32,5} + @test y isa Array{Float32, 5} @test size(y) == (9, 12, 6, 3, 4) m = Upsample(:trilinear; size = (4, 6, 4)) x = rand(Float32, 3, 4, 2, 3, 4) y = m(x) - @test y isa Array{Float32,5} + @test y isa Array{Float32, 5} @test size(y) == (4, 6, 4, 3, 4) end @@ -42,24 +42,24 @@ end x = rand(Float32, 3, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32,3} + @test y isa Array{Float32, 3} @test size(y) == (6, 2, 3) x = rand(Float32, 3, 4, 2, 3) m = Upsample(:nearest; scale = (2, 3)) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (6, 12, 2, 3) m = Upsample(:nearest; scale = (2,)) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (6, 4, 2, 3) m = Upsample(:nearest; scale = 2) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (6, 8, 2, 3) m = Upsample(2) @@ -68,7 +68,7 @@ end m = Upsample(:nearest; size = (6, 8)) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (6, 8, 2, 3) end @@ -76,12 +76,12 @@ end m = PixelShuffle(2) x = rand(Float32, 3, 18, 3) y = m(x) - @test y isa Array{Float32,3} + @test y isa Array{Float32, 3} @test size(y) == (6, 9, 3) m = PixelShuffle(3) x = rand(Float32, 3, 4, 18, 3) y = m(x) - @test y isa Array{Float32,4} + @test y isa Array{Float32, 4} @test size(y) == (9, 12, 2, 3) end diff --git a/test/losses.jl b/test/losses.jl index 7984941c78..a8a41bdf43 100644 --- a/test/losses.jl +++ b/test/losses.jl @@ -2,12 +2,12 @@ using Test using Flux: onehotbatch, σ using Flux.Losses: - mse, - label_smoothing, - crossentropy, - logitcrossentropy, - binarycrossentropy, - logitbinarycrossentropy + mse, + label_smoothing, + crossentropy, + logitcrossentropy, + binarycrossentropy, + logitbinarycrossentropy using Flux.Losses: xlogx, xlogy # group here all losses, used in tests @@ -58,19 +58,13 @@ y = [1, 1, 0, 0] @test mse(0 + 0im, 1 + 1im) == 2 end -@testset "mae" begin - @test Flux.mae(ŷ, y) ≈ 1 / 2 -end +@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end -@testset "huber_loss" begin - @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 -end +@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end y = [123.0, 456.0, 789.0] ŷ = [345.0, 332.0, 789.0] -@testset "msle" begin - @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 -end +@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) @@ -124,10 +118,8 @@ yls = y .* (1 - 2sf) .+ sf -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ)) @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈ mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ))) - @test binarycrossentropy(σ.(logŷ), y) ≈ mean( - -y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - - (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))), - ) + @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - + (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9]) # constant label end @@ -191,94 +183,68 @@ end @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end -@testset "no spurious promotions" begin - for T in (Float32, Float64) - y = rand(T, 2) - ŷ = rand(T, 2) - for f in ALL_LOSSES - fwd, back = Flux.pullback(f, ŷ, y) - @test fwd isa T - @test eltype(back(one(T))[1]) == T - end +@testset "no spurious promotions" begin for T in (Float32, Float64) + y = rand(T, 2) + ŷ = rand(T, 2) + for f in ALL_LOSSES + fwd, back = Flux.pullback(f, ŷ, y) + @test fwd isa T + @test eltype(back(one(T))[1]) == T end -end +end end @testset "binary_focal_loss" begin - y = [ - 0 1 0 - 1 0 1 - ] - ŷ = [ - 0.268941 0.5 0.268941 - 0.731059 0.5 0.731059 - ] - - y1 = [ - 1 0 - 0 1 - ] - ŷ1 = [ - 0.6 0.3 - 0.4 0.7 - ] + y = [0 1 0 + 1 0 1] + ŷ = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] + + y1 = [1 0 + 0 1] + ŷ1 = [0.6 0.3 + 0.4 0.7] @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222 @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y) end @testset "focal_loss" begin - y = [ - 1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0 - ] + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y1 = [ - 1 0 - 0 0 - 0 1 - ] - ŷ1 = [ - 0.4 0.2 - 0.5 0.5 - 0.1 0.3 - ] + y1 = [1 0 + 0 0 + 0 1] + ŷ1 = [0.4 0.2 + 0.5 0.5 + 0.1 0.3] @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628 @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157 @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y) end @testset "siamese_contrastive_loss" begin - y = [ - 1 0 - 0 0 - 0 1 - ] - ŷ = [ - 0.4 0.2 - 0.5 0.5 - 0.1 0.3 - ] - y1 = [ - 1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0 - ] + y = [1 0 + 0 0 + 0 1] + ŷ = [0.4 0.2 + 0.5 0.5 + 0.1 0.3] + y1 = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y2 = [ - 1 - 0 - 0 - 1 - 1 - ] - ŷ2 = [ - 0.6 - 0.4 - 0.1 - 0.2 - 0.7 - ] + y2 = [1 + 0 + 0 + 1 + 1] + ŷ2 = [0.6 + 0.4 + 0.1 + 0.2 + 0.7] @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 0.5f0) ≈ 0.10000000000000002 @test Flux.siamese_contrastive_loss(ŷ, y; margin = 1.5f0) ≈ 0.5333333333333333 @@ -293,14 +259,10 @@ end @test Flux.siamese_contrastive_loss(ŷ1, y1; margin = 0) ≈ 0.13161165f0 @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 - @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss( - ŷ1, - y1, - margin = -0.5, - ) - @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss( - ŷ, - y, - margin = -1, - ) + @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, + y1, + margin = -0.5) + @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, + y, + margin = -1) end diff --git a/test/optimise.jl b/test/optimise.jl index 9f9f788c01..6b7df97baf 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -30,7 +30,7 @@ using Random w′ = randn(10, 10) b = false loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b) - for t = 1:(10^5) + for t in 1:(10^5) θ = params([w′, b]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -48,7 +48,7 @@ end w′ = randn(10, 10) loss(x) = Flux.Losses.mse(w * x, w′ * x) opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:(10^5) + for t in 1:(10^5) θ = Params([w′]) x = rand(10) θ̄ = gradient(() -> loss(x), θ) @@ -61,32 +61,26 @@ end @testset "Training Loop" begin i = 0 l = 1 - Flux.train!( - () -> (sleep(0.1); Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent(), - ) + Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) @test i == 0 #all skipped - Flux.train!( - () -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), - Params([]), - Iterators.repeated((), 10), - Descent(), - ) + Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) @test i == 8 #skip after i hit 8 i = 0 - Flux.train!( - () -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(); - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1), - ) + Flux.train!(() -> (sleep(0.1); i += 1; l), + Params([]), + Iterators.repeated((), 100), + Descent(); + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) @test 3 < i < 50 @@ -128,7 +122,7 @@ end loss(x) = Flux.Losses.mse(w * x, w1 * x) flag = 1 decay_steps = [] - for t = 1:(10^5) + for t in 1:(10^5) prev_eta = o.eta θ = Params([w1]) x = rand(10) @@ -148,7 +142,7 @@ end @test flag == 1 # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). ground_truth = [] - for i = 1:4 + for i in 1:4 push!(ground_truth, 1000 * i) # Expected decay steps for this example. end @test decay_steps == ground_truth @@ -223,31 +217,30 @@ end # wreaks all sorts of havoc on our training loops. This test ensures that # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin - # Test every optimizer that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im * ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end +# Test every optimizer that has momentum internally +for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] + # Our "model" is just a complex number + w = zeros(ComplexF32, 1) + + # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` + function loss() + # Deterministic training data is the best training data + x = ones(1, 1) + 1im * ones(1, 1) + + # Manually implement `mse()` to allow demonstration of brokenness + # on older Flux builds that don't have a fixed `mse()` + return sum(abs2.(w * x .- conj(x))) + end - params = Flux.Params([w]) - opt = opt_ctor(1e-2) + params = Flux.Params([w]) + opt = opt_ctor(1e-2) - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx = 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) - end + # Train for 10 iterations, enforcing that loss is monotonically decreasing + last_loss = Inf + for idx in 1:10 + grads = Flux.gradient(loss, params) + @test loss() < last_loss + last_loss = loss() + Flux.update!(opt, params, grads) end -end +end end diff --git a/test/outputsize.jl b/test/outputsize.jl index 64eda2af31..2d2baceece 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -3,7 +3,7 @@ @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) m = Dense(10, 5) - @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) + @test_throws DimensionMismatch outputsize(m, (5, 2))==(5, 1) @test outputsize(m, (10,); padbatch = true) == (5, 1) m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) @@ -55,33 +55,31 @@ end @test outputsize(m, (2, 7), (3, 7)) == (13, 7) end -@testset "activations" begin - @testset for f in [ - celu, - elu, - gelu, - hardsigmoid, - hardtanh, - leakyrelu, - lisht, - logcosh, - logσ, - mish, - relu, - relu6, - rrelu, - selu, - σ, - softplus, - softshrink, - softsign, - swish, - tanhshrink, - trelu, - ] - @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) - end -end +@testset "activations" begin @testset for f in [ + celu, + elu, + gelu, + hardsigmoid, + hardtanh, + leakyrelu, + lisht, + logcosh, + logσ, + mish, + relu, + relu6, + rrelu, + selu, + σ, + softplus, + softshrink, + softsign, + swish, + tanhshrink, + trelu, +] + @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) +end end @testset "conv" begin m = Conv((3, 3), 3 => 16) diff --git a/test/runtests.jl b/test/runtests.jl index 2a1b2913ca..4189ea0dd5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,18 +10,12 @@ using CUDA Random.seed!(0) -@testset verbose = true "Flux.jl" begin - @testset "Utils" begin - include("utils.jl") - end +@testset verbose=true "Flux.jl" begin + @testset "Utils" begin include("utils.jl") end - @testset "Optimise" begin - include("optimise.jl") - end + @testset "Optimise" begin include("optimise.jl") end - @testset "Data" begin - include("data.jl") - end + @testset "Data" begin include("data.jl") end @testset "Losses" begin include("losses.jl") @@ -44,13 +38,11 @@ Random.seed!(0) include("outputsize.jl") end - @testset "CUDA" begin - if CUDA.functional() - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" - end - end + @testset "CUDA" begin if CUDA.functional() + include("cuda/runtests.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end end @static if VERSION == v"1.6" using Documenter diff --git a/test/utils.jl b/test/utils.jl index 7da452ba02..6d2cb855e8 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,22 +1,22 @@ using Flux using Flux: - throttle, - nfan, - glorot_uniform, - glorot_normal, - kaiming_normal, - kaiming_uniform, - orthogonal, - truncated_normal, - sparse_init, - identity_init, - unstack, - batch, - unbatch, - unsqueeze, - params, - loadparams!, - loadmodel! + throttle, + nfan, + glorot_uniform, + glorot_normal, + kaiming_normal, + kaiming_uniform, + orthogonal, + truncated_normal, + sparse_init, + identity_init, + unstack, + batch, + unbatch, + unsqueeze, + params, + loadparams!, + loadmodel! using MLUtils using StatsBase: var, std using Statistics, LinearAlgebra @@ -105,7 +105,7 @@ end end @test size(init(3, 4)) == (3, 4) # only init(size...) is accepted: - @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5) + @test_throws MethodError size(init((3, 4, 5)))==(3, 4, 5) # rng, and currying: @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) @@ -180,8 +180,8 @@ end for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] expected_zeros = ceil(Integer, n_in * sparsity) v = sparse_init(n_in, n_out; sparsity = sparsity, std = σ) - @test all([sum(v[:, col] .== 0) == expected_zeros for col = 1:n_out]) - @test 0.9 * σ < std(v[v.!=0]) < 1.1 * σ + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ end @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32 @@ -189,9 +189,9 @@ end @testset "truncated_normal" begin m = truncated_normal(100, 100) - @test minimum(m) ≈ -2 atol = 0.05 # default arguments - @test maximum(m) ≈ 2 atol = 0.05 - @test mean(m) ≈ 0 atol = 0.1 + @test minimum(m)≈-2 atol=0.05 # default arguments + @test maximum(m)≈2 atol=0.05 + @test mean(m)≈0 atol=0.1 size100 = (100, 100, 100) for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] @@ -241,12 +241,15 @@ end indata = reshape(collect(Float32, 1:9), 3, 3) @test l(indata) == indata end - @testset "$layer ID mapping with kernelsize $kernelsize" for layer in ( - Conv, - ConvTranspose, - CrossCor, - ), - kernelsize in ((1,), (3,), (1, 3), (3, 5), (3, 5, 7)) + @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, + ConvTranspose, + CrossCor), + kernelsize in ((1,), + (3,), + (1, 3), + (3, 5), + (3, 5, + 7)) nch = 3 l = layer(kernelsize, nch => nch; init = identity_init, pad = SamePad()) @@ -257,18 +260,14 @@ end @testset "Inception identity" begin insize = 7 path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad()) - path2 = Conv( - (3, 5), - insize => 3; - init = identity_init(; shift = (0, 0, 2, 0)), - pad = SamePad(), - ) - path3 = Conv( - (5, 7), - insize => 2; - init = identity_init(; shift = (0, 0, 5, 0)), - pad = SamePad(), - ) + path2 = Conv((3, 5), + insize => 3; + init = identity_init(; shift = (0, 0, 2, 0)), + pad = SamePad()) + path3 = Conv((5, 7), + insize => 2; + init = identity_init(; shift = (0, 0, 5, 0)), + pad = SamePad()) block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3) indata = randn(Float32, 9, 9, 7, 2) @@ -316,7 +315,7 @@ end @test f32(m).bias === m.bias === false @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *), - s in ((1,), (2, 3)) + s in ((1,), (2, 3)) o = ones(s) z = zeros(s) @@ -367,12 +366,10 @@ end end @testset "Batching" begin - stacked_array = [ - 8 9 3 5 - 9 6 6 9 - 9 1 7 2 - 7 4 10 6 - ] + stacked_array = [8 9 3 5 + 9 6 6 9 + 9 1 7 2 + 7 4 10 6] unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] @test unbatch(stacked_array) == unstacked_array @test batch(unstacked_array) == stacked_array @@ -382,8 +379,8 @@ end @test unbatch([1, 2, 3]) == [1, 2, 3] # generic iterable - @test batch(ones(2) for i = 1:3) == ones(2, 3) - @test unbatch(ones(2, 3)) == [ones(2) for i = 1:3] + @test batch(ones(2) for i in 1:3) == ones(2, 3) + @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3] end @testset "Param remapping" begin @@ -392,8 +389,8 @@ end dm(bias) = Chain(dl(3, 5, bias), dl(5, 4, bias), dl(4, 3, bias)) nobias(n) = false - testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in - enumerate(zip(m, dm(bt))) + testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, + dm(bt))) @test l1.weight == l2.weight @test l1.bias == l2.bias @test_skip typeof(l1.bias) === typeof(l2.bias) @@ -441,12 +438,10 @@ end # tests for BatchNorm and Dropout m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) - m2 = Chain( - Conv((3, 3), 3 => 16), - BatchNorm(16), - x -> reshape(x, :, size(x)[end]), - Dropout(0.1), - ) + m2 = Chain(Conv((3, 3), 3 => 16), + BatchNorm(16), + x -> reshape(x, :, size(x)[end]), + Dropout(0.1)) m2[2].μ .= rand(Float32, size(m2[2].μ)...) loadmodel!(m1, m2) # non-trainable parameters are copied as well @@ -461,40 +456,38 @@ end # tests MaxPool # tests testmode!/trainmode! is not copied # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model - chain1 = Chain( - Dropout(0.2), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 32 => 16, relu), - Dropout(0.2), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 16 => 10, relu), - Dropout(0.2), - x -> reshape(x, :, size(x, 4)), - Dropout(0.2), - Dense(90, 10), - softmax, - ) + chain1 = Chain(Dropout(0.2), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 32 => 16, relu), + Dropout(0.2), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 16 => 10, relu), + Dropout(0.2), + x -> reshape(x, :, size(x, 4)), + Dropout(0.2), + Dense(90, 10), + softmax) chain2 = Chain([ - Dropout(0.1), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 32 => 16, relu), - Dropout(0.1), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 16 => 10, relu), - Dropout(0.1), - x -> reshape(x, :, size(x, 4)), - Dropout(0.1), - Dense(90, 10), - softmax, - ]) + Dropout(0.1), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 32 => 16, relu), + Dropout(0.1), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 16 => 10, relu), + Dropout(0.1), + x -> reshape(x, :, size(x, 4)), + Dropout(0.1), + Dense(90, 10), + softmax, + ]) chain2[3].μ .= 5.0f0 chain2[3].σ² .= 2.0f0 testmode!(chain2) @@ -502,7 +495,7 @@ end for (dst, src) in zip(chain1, chain2) if dst isa Dropout @test dst.p == 0.2 - elseif dst isa Union{Conv,Dense} + elseif dst isa Union{Conv, Dense} @test dst.weight == src.weight @test dst.bias == src.bias elseif dst isa MaxPool @@ -515,12 +508,12 @@ end end # copy only a subset of the model - chain1[end-1].weight .= 1.0f0 + chain1[end - 1].weight .= 1.0f0 chain1[3].μ .= 3.0f0 chain1[2].bias .= 5.0f0 - loadmodel!(chain2[end-1], chain1[end-1]) + loadmodel!(chain2[end - 1], chain1[end - 1]) loadmodel!(chain2[3], chain1[3]) - @test chain2[end-1].weight == chain1[end-1].weight + @test chain2[end - 1].weight == chain1[end - 1].weight @test chain2[3].μ == chain1[3].μ @test chain2[2].bias != chain1[2].bias @@ -631,18 +624,16 @@ end @test modules[5] === m2 @test modules[6] === m3 - mod_par = Flux.modules( - Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), Dense(2, 2, abs2)), - ) + mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), + Dense(2, 2, abs2))) @test length(mod_par) == 5 mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4))) @test length(mod_rnn) == 6 @test mod_rnn[end] isa Flux.LSTMCell - mod_skip = Flux.modules( - Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), +), LayerNorm(8)), - ) + mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), + +), LayerNorm(8))) @test length(mod_skip) == 6 @test mod_skip[end] isa Flux.Scale end @@ -661,7 +652,7 @@ end end n_iter = 0 - for i = 1:length(v) + for i in 1:length(v) trigger(i) && break n_iter += 1 end @@ -683,11 +674,9 @@ end end @testset "distance" begin - es = Flux.early_stopping( - identity, - 10; - distance = (best_score, score) -> score - best_score, - ) + es = Flux.early_stopping(identity, + 10; + distance = (best_score, score) -> score - best_score) n_iter = 0 while n_iter < 99 @@ -813,10 +802,8 @@ end n_outputs = [3, 7] data = rand(Float32, n_input, n_batch) - model = Chain( - Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])), - ) + model = Chain(Dense(n_input, n_shared), + Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2]))) pvec, re = Flux.destructure(model) loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term @@ -826,20 +813,16 @@ end end end -@testset "Rrule" begin - @testset "issue 2033" begin - if CUDA.functional() - struct Wrapped{T} - x::T - end - y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) - @test y isa Wrapped{<:CuArray} - end +@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional() + struct Wrapped{T} + x::T end -end + y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) + @test y isa Wrapped{<:CuArray} +end end end # make sure rng_from_array is non_differentiable @testset "rng_from_array" begin - m(x) = (rand(rng_from_array(x))*x)[1] + m(x) = (rand(rng_from_array(x)) * x)[1] gradient(m, ones(2)) end