# Some fast paths + type fixes #2137
````diff
@@ -210,7 +210,7 @@ true
 ```
 """
 struct LayerNorm{F,D,T,N}
-  λ::F
+  λ::F # this field is not used
   diag::D
   ϵ::T
   size::NTuple{N,Int}
````
```diff
@@ -254,16 +254,16 @@ function _norm_layer_forward(
     end
   end

-  o = _norm_layer_forward(x, μ, σ², l.ϵ)
-  hasaffine(l) || return l.λ.(o)
-
-  γ = reshape(l.γ, affine_shape)
-  β = reshape(l.β, affine_shape)
-  return l.λ.(γ .* o .+ β)
+  s = (inv∘sqrt).(σ² .+ l.ϵ)  # faster to un-fuse this, fewer inv∘sqrt calls
+  if hasaffine(l)
+    γ = reshape(l.γ, affine_shape)  # ideally reshape on construction?
+    β = reshape(l.β, affine_shape)
+    return l.λ.(γ .* s .* (x .- μ) .+ β)
+  else
+    return l.λ.(s .* (x .- μ))
+  end
 end

-@inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ)
-
 function _track_stats!(
   bn, x::AbstractArray{T, N}, μ, σ², reduce_dims,
 ) where {T, N}
```
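As a sketch of why the un-fused path wins (not part of the diff; array sizes are made up): in one big fused broadcast, `sqrt.(σ² .+ ϵ)` is re-evaluated for every element of `x`, whereas materializing `s` first evaluates `inv∘sqrt` only once per statistic, since `μ` and `σ²` have the reduced per-channel shape. Both paths agree numerically:

```julia
using Statistics

x  = randn(Float32, 28, 4, 8)    # made-up sizes: (features, channels, batch)
μ  = mean(x; dims=(1, 3))
σ² = var(x; dims=(1, 3), corrected=false, mean=μ)
ϵ  = 1f-5

fused   = (x .- μ) ./ sqrt.(σ² .+ ϵ)  # old path: one fused broadcast kernel
s       = (inv∘sqrt).(σ² .+ ϵ)        # small array: only 4 inv∘sqrt calls here
unfused = s .* (x .- μ)               # new path

println(maximum(abs, fused .- unfused))  # agree up to rounding
```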
```diff
@@ -356,10 +356,9 @@ end
 @functor BatchNorm
 trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;)

-function (BN::BatchNorm)(x)
-  @assert size(x, ndims(x)-1) == BN.chs
-  N = ndims(x)
-  reduce_dims = [1:N-2; N]
+function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N}
+  size(x, N-1) == BN.chs || error("BatchNorm expected an input with $(BN.chs) channels, got size(x) == $(size(x))")
+  reduce_dims = ntuple(d -> d + (d==N-1), N-1)  # i.e. 1:N with N-1 removed
   affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
   return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
 end
```

**Review comment on lines +360 to +361:**

> Might as well take the opportunity to mark these lines and the definition of …
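A quick check of the `ntuple` trick (not part of the diff): `d + (d == N-1)` maps `1, 2, …, N-1` to `1, …, N-2, N`, i.e. `1:N` with the channel dimension removed, matching the old `[1:N-2; N]`. Unlike the `Vector{Int}`, the tuple's length is part of its type, which is what lets the forward pass infer:

```julia
# Verify the new tuple expression against the old vector for several ranks.
for N in 2:5
    old = [1:N-2; N]                         # Vector{Int}: length unknown to the compiler
    new = ntuple(d -> d + (d == N-1), N-1)   # NTuple{N-1, Int}: length in the type
    @assert collect(new) == old
end

println(ntuple(d -> d + (d == 3), 3))  # reduce_dims for N = 4: (1, 2, 4)
```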
```diff
@@ -166,11 +166,11 @@ end
 end

 # with activation function
-let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0;
-                                    2.0 4.0 6.0]
+let m = BatchNorm(2, sigmoid)
+  x = Float32[1.0 3.0 5.0; 2.0 4.0 6.0]
   y = m(x)
   @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
-  @inferred m(x)
+  @inferred m(x)  # fails when x::Matrix{Float64}, do we care?
 end

 let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
```

**Review comment:**

> do you know why this fails?

**Reply:**

> I do not. Checking branches …
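For context on what `@inferred` is checking here (a standalone illustration, not the Flux test): it errors whenever the value's actual type is narrower than what inference produced, e.g. a small `Union` from a branch whose arms return different types — the same kind of instability a `Float64` input can trigger against `Float32` parameters:

```julia
using Test

unstable(x) = x > 0 ? x : 0        # branches return typeof(x) or Int ⇒ inferred Union
stable(x)   = x > 0 ? x : zero(x)  # both branches return typeof(x)

@inferred stable(1.5)                                # passes: inferred Float64
@test_throws ErrorException @inferred unstable(1.5)  # inferred Union{Float64, Int64}
```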
**Review comment:**

> This change hits the following failure:
>
> So it's differentiating this:
> https://github.com/JuliaDiff/ChainRules.jl/blob/9a405f732758552cd945a110adb6828a997887a8/src/rulesets/Statistics/statistics.jl#L7
>
> and differentiating the rule for `unique`, which doesn't handle this case. Zygote differentiates so many things it need not touch, surely this adds startup time... you only notice when it fails.
**Reply:**

> One of its fatal flaws, you might say. Usually first-order differentiation is well-behaved because control flow and possible mutation are hidden away, but all bets are off with second order...
**Reply:**

> Even first order, I think it does a lot which it need not do. It's just that most of the resulting errors have already been found. Same thing in trying out Diffractor -- lots of errors from obscure code calculating indices for views or whatever, which to a human is obviously non-differentiable.
**Reply:**

> This one is fixed in JuliaDiff/ChainRules.jl#687
**Reply:**

> That's one definite benefit of tracing/overload-based ADs. Anything not numerically interesting gets ignored or falls away in the final tape/graph.
**Reply:**

> Yup. I presume that any kind of activity tracking would also let you eliminate most off-track things. Maybe declaring integers (and all structs not containing floats) non-differentiable would also help.
**Reply:**

> There's certainly a lot we could learn from projects like differentiable Swift (which uses activity analysis). It seems unlikely Zygote will be where such knowledge is applied, given how poorly integrated it is with the compiler.