Some fast paths + type fixes #2137

Closed · wants to merge 5 commits
Changes from 4 commits
28 changes: 26 additions & 2 deletions src/layers/basic.jl
@@ -169,9 +169,13 @@ end

function (a::Dense)(x::AbstractVecOrMat)
σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc
return σ.(a.weight * x .+ a.bias)
xT = _match_eltype(a, eltype(a.weight), x) # fixes Float64 input, etc.
return σ.(a.weight * xT .+ a.bias)
end

(a::Dense{typeof(identity), <:AbstractMatrix, Bool})(x::AbstractVecOrMat) =
a.weight * _match_eltype(a, eltype(a.weight), x) # fast path, no broadcast

(a::Dense)(x::AbstractArray) =
reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...)

@@ -185,6 +189,22 @@ end
Dense(W::LinearAlgebra.Diagonal, bias = true, σ = identity) =
Scale(W.diag, bias, σ)

_match_eltype(layer, ::Type{T}, x::AbstractArray{T}) where {T} = x # best case
function _match_eltype(layer, ::Type{Float32}, x::AbstractArray{Float64}) # common mistake
@warn "Layer with Float32 parameters got Float64 input.
The input will be converted, but any earlier layers may be very slow" layer summary(x) maxlog=1
convert(AbstractArray{Float32}, x)
end
function _match_eltype(layer, ::Type{T}, x::AbstractArray{<:Union{AbstractFloat, Integer}}) where {T}
convert(AbstractArray{T}, x)
end
_match_eltype(layer, ::Type, x::OneHotLike) = x
_match_eltype(layer, ::Type, x::AbstractArray) = x # weird types

function ChainRulesCore.rrule(::typeof(_match_eltype), layer, ::Type{T}, x::AbstractArray) where {T}
_match_eltype(layer, T, x), dx -> (NoTangent(), ZeroTangent(), NoTangent(), dx) # does not un-thunk dx
end
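
Not part of the diff, but as a quick illustration of what the new _match_eltype path buys, here is a rough sketch assuming this branch of Flux:

using Flux

d = Dense(rand(Float32, 3, 2))              # Float32 weights, identity activation
eltype(d(rand(Float64, 2, 5)))              # Float32; warns once (maxlog=1) and converts the input
eltype(d(rand(-5:5, 2, 5)))                 # Float32; integer input is converted silently
eltype(d(Flux.onehotbatch([1, 2, 1], 1:2))) # Float32; OneHot input passes through unconverted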

"""
Scale(size::Integer..., σ=identity; bias=true, init=ones32)
Scale(scale::AbstractArray, [bias, σ])
@@ -246,6 +266,9 @@ function (a::Scale)(x::AbstractArray)
σ.(a.scale .* x .+ a.bias)
end

(a::Scale{typeof(identity), <:AbstractArray, Bool})(x::AbstractArray) =
a.scale .* x

function Base.show(io::IO, l::Scale)
print(io, "Scale(", join(size(l.scale), ", "))
l.σ == identity || print(io, ", ", l.σ)
@@ -421,6 +444,7 @@ Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) = Biline

function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix)
W, b, σ = a.weight, a.bias, a.σ
xT = _match_eltype(a, eltype(a.weight), x)

d_z, d_x, d_y = size(W)
d_x == size(x,1) && d_y == size(y,1) || throw(DimensionMismatch("number of rows in data must match W"))
@@ -430,7 +454,7 @@ function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix)
Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :))

# @einsum Z[o,s] := Wy[o,i,s] * x[i,s]
Wyx = batched_mul(Wy, reshape(x, (d_x, 1, :)))
Wyx = batched_mul(Wy, reshape(xT, (d_x, 1, :)))
Z = reshape(Wyx, (d_z, :))

# @einsum out[o,s] := σ(Z[o,i] + b[o])
23 changes: 20 additions & 3 deletions src/layers/conv.jl
@@ -197,7 +197,12 @@ ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any)
function (c::Conv)(x::AbstractArray)
σ = NNlib.fast_act(c.σ, x)
cdims = conv_dims(c, x)
σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c))
xT = _match_eltype(c, eltype(c.weight), x)
σ.(conv(xT, c.weight, cdims) .+ conv_reshape_bias(c))
end
function (c::Conv{<:Any,<:Any,typeof(identity),<:AbstractArray,Bool})(x::AbstractArray)
cdims = conv_dims(c, x)
conv(x, c.weight, cdims) # fast path, no broadcast
end

_channels_in(l::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups
@@ -330,7 +335,13 @@ ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any)
function (c::ConvTranspose)(x::AbstractArray)
σ = NNlib.fast_act(c.σ, x)
cdims = conv_transpose_dims(c, x)
σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c))
xT = _match_eltype(c, eltype(c.weight), x)
σ.(∇conv_data(xT, c.weight, cdims) .+ conv_reshape_bias(c))
end
function (c::ConvTranspose{<:Any,<:Any,typeof(identity),<:AbstractArray,Bool})(x::AbstractArray)
cdims = conv_transpose_dims(c, x)
xT = _match_eltype(c, eltype(c.weight), x)
∇conv_data(xT, c.weight, cdims) # fast path, no broadcast
end

function Base.show(io::IO, l::ConvTranspose)
@@ -468,7 +479,13 @@ ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any)
function (c::CrossCor)(x::AbstractArray)
σ = NNlib.fast_act(c.σ, x)
cdims = crosscor_dims(c, x)
σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c))
xT = _match_eltype(c, eltype(c.weight), x)
σ.(crosscor(xT, c.weight, cdims) .+ conv_reshape_bias(c))
end
function (c::CrossCor{<:Any,<:Any,typeof(identity),<:AbstractArray,Bool})(x::AbstractArray)
cdims = crosscor_dims(c, x)
xT = _match_eltype(c, eltype(c.weight), x)
crosscor(xT, c.weight, cdims) # fast path, no broadcast
end

function Base.show(io::IO, l::CrossCor)
25 changes: 12 additions & 13 deletions src/layers/normalise.jl
@@ -210,7 +210,7 @@ true
```
"""
struct LayerNorm{F,D,T,N}
λ::F
λ::F # this field is not used
diag::D
ϵ::T
size::NTuple{N,Int}
@@ -254,16 +254,16 @@ function _norm_layer_forward(
end
end

o = _norm_layer_forward(x, μ, σ², l.ϵ)
hasaffine(l) || return l.λ.(o)

γ = reshape(l.γ, affine_shape)
β = reshape(l.β, affine_shape)
return l.λ.(γ .* o .+ β)
s = (inv∘sqrt).(σ² .+ l.ϵ) # faster to un-fuse this, fewer inv∘sqrt calls
if hasaffine(l)
γ = reshape(l.γ, affine_shape) # ideally reshape on construction?
β = reshape(l.β, affine_shape)
return l.λ.(γ .* s .* (x .- μ) .+ β)
else
return l.λ.(s .* (x .- μ))
end
end

@inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ)
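
As a rough sanity check of the un-fusing claim (arbitrary sizes, assumes BenchmarkTools; not part of the diff):

using BenchmarkTools, Statistics
x  = randn(Float32, 784, 128)
μ  = mean(x; dims=2)
σ² = var(x; dims=2, corrected=false)
ϵ  = 1f-5
@btime ($x .- $μ) ./ sqrt.($σ² .+ $ϵ)   # fused: sqrt runs once per element of x
@btime begin                            # un-fused: inv∘sqrt runs once per element of σ²
    s = (inv∘sqrt).($σ² .+ $ϵ)
    s .* ($x .- $μ)
end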

function _track_stats!(
bn, x::AbstractArray{T, N}, μ, σ², reduce_dims,
) where {T, N}
@@ -356,10 +356,9 @@ end
@functor BatchNorm
trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;)

function (BN::BatchNorm)(x)
@assert size(x, ndims(x)-1) == BN.chs
N = ndims(x)
reduce_dims = [1:N-2; N]
function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N}
size(x, N-1) == BN.chs || error("BatchNorm expected an input with $(BN.chs) channels, got size(x) == $(size(x))")
reduce_dims = ntuple(d -> d + (d==N-1), N-1) # i.e. 1:N with N-1 removed
Member Author

This change hits the following failure:

julia> Zygote.hessian_reverse(x -> sum(sin, mean(x.^2; dims=1)), [1 2; 3 4.0])
4×4 Matrix{Float64}:
 1.24259  2.87677  0.0      0.0
 2.87677  8.91398  0.0      0.0
 0.0      0.0      1.33701  4.35217
 0.0      0.0      4.35217  7.86527

julia> Zygote.hessian_reverse(x -> sum(sin, mean(x.^2; dims=[1])), [1 2; 3 4.0])
4×4 Matrix{Float64}:
 1.24259  2.87677  0.0      0.0
 2.87677  8.91398  0.0      0.0
 0.0      0.0      1.33701  4.35217
 0.0      0.0      4.35217  7.86527

julia> Zygote.hessian_reverse(x -> sum(sin, mean(x.^2; dims=(1,))), [1 2; 3 4.0])
ERROR: Mutating arrays is not supported -- called push!(Vector{Int64}, ...)
Stacktrace:
  [3] (::Zygote.var"#397#398"{Vector{Int64}})(#unused#::Nothing)
    @ Zygote ~/.julia/packages/Zygote/SmJK6/src/lib/array.jl:105
  [4] (::Zygote.var"#2529#back#399"{Zygote.var"#397#398"{Vector{Int64}}})(Δ::Nothing)
    @ Zygote ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:67
  [5] unique
    @ ./set.jl:176 [inlined]
  [6] (::typeof((unique)))(Δ::Nothing)
    @ Zygote ~/.julia/packages/Zygote/SmJK6/src/compiler/interface2.jl:0
  [7] _denom
    @ ~/.julia/packages/ChainRules/hVHC4/src/rulesets/Statistics/statistics.jl:7 [inlined]
  [8] (::typeof((_denom)))(Δ::Float64)
    @ Zygote ~/.julia/packages/Zygote/SmJK6/src/compiler/interface2.jl:0
  [9] #rrule#1801
    @ ~/.julia/packages/ChainRules/hVHC4/src/rulesets/Statistics/statistics.jl:13 [inlined]
 [10] (::typeof((#rrule#1801)))(Δ::Tuple{Matrix{Float64}, NamedTuple{(:n, :sum_pullback), Tuple{Float64, Nothing}}})

So it's differentiating this:

https://github.com/JuliaDiff/ChainRules.jl/blob/9a405f732758552cd945a110adb6828a997887a8/src/rulesets/Statistics/statistics.jl#L7

and differentiating the rule for unique, which doesn't handle this case.

Zygote differentiates so many things it need not touch, which surely adds startup time; you only notice when it fails.
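
For what it's worth, a hypothetical sketch of the kind of fix this points at, using a stand-in for ChainRules' internal helper (not necessarily what was merged): the denominator is pure index bookkeeping, so marking it non-differentiable keeps Zygote out of unique entirely.

using ChainRulesCore

_denom(x, dims) = prod(d -> size(x, d), unique(dims); init = 1)  # stand-in: counts the elements reduced over dims

ChainRulesCore.@non_differentiable _denom(::Any, ::Any)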

Member

One of its fatal flaws, you might say. Usually first-order differentiation is well-behaved because control flow and possible mutation are hidden away, but all bets are off with second order...

Member Author

Even at first order, I think it does a lot that it need not do; it's just that most of the resulting errors have already been found. The same thing shows up when trying out Diffractor -- lots of errors from obscure code calculating indices for views or the like, which to a human is obviously non-differentiable.

Member Author

This one is fixed in JuliaDiff/ChainRules.jl#687

Member

That's one definite benefit of tracing/overload-based ADs. Anything not numerically interesting gets ignored or falls away in the final tape/graph.

Member Author

Yup. I presume that any kind of activity tracking would also let you eliminate most off-track things. Maybe declaring integers (and all structs not containing floats) non-diff would also help.

Member

There's certainly a lot we could learn from projects like differentiable Swift (which uses activity analysis). It seems unlikely Zygote will be where such knowledge is applied given how poorly integrated it is with the compiler.

Comment on lines +360 to +361
Member

Might as well take the opportunity to mark these lines and the definition of affine_shape below as ignored. BN causes a decent amount of Zygote compilation latency, so hiding anything that doesn't need to go through AD seems reasonable.
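
A minimal sketch of that suggestion, assuming ChainRulesCore.ignore_derivatives is acceptable here (illustrative only, not part of the diff):

function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N}
  size(x, N-1) == BN.chs || error("BatchNorm expected an input with $(BN.chs) channels, got size(x) == $(size(x))")
  reduce_dims, affine_shape = ChainRulesCore.ignore_derivatives() do
    ntuple(d -> d + (d==N-1), N-1),              # i.e. 1:N with N-1 removed
    ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)  # 1 everywhere except the channel dim
  end
  return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
end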

affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
end
20 changes: 12 additions & 8 deletions src/layers/recurrent.jl
@@ -200,10 +200,11 @@ end
RNNCell((in, out)::Pair, σ=tanh; init=Flux.glorot_uniform, initb=zeros32, init_state=zeros32) =
RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out,1))

function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,I,H,V,T}
function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {F,I,H,V,T}
Wi, Wh, b = m.Wi, m.Wh, m.b
σ = NNlib.fast_act(m.σ, x)
h = σ.(Wi*x .+ Wh*h .+ b)
xT = _match_eltype(m, T, x)::AbstractArray{T} # any AbstractFloat is so converted
h = σ.(Wi*xT .+ Wh*h .+ b)
return h, reshape_cell_output(h, x)
end

@@ -305,9 +306,10 @@ function LSTMCell((in, out)::Pair;
return cell
end

function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T}
function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,T}
b, o = m.b, size(h, 1)
g = muladd(m.Wi, x, muladd(m.Wh, h, b))
xT = _match_eltype(m, T, x)::AbstractArray{T}
g = muladd(m.Wi, xT, muladd(m.Wh, h, b))
input, forget, cell, output = multigate(g, o, Val(4))
c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell)
h′ = @. sigmoid_fast(output) * tanh_fast(c′)
@@ -376,9 +378,10 @@ end
GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) =
GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), init_state(out,1))

function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T}
function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,T}
Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1)
gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3))
xT = _match_eltype(m, T, x)::AbstractArray{T}
gxs, ghs, bs = multigate(Wi*xT, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3))
r, z = _gru_output(gxs, ghs, bs)
h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3])
h′ = @. (1 - z) * h̃ + z * h
@@ -444,9 +447,10 @@ GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state =
GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3),
init(out, out), init_state(out,1))

function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,HH,T}
function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,HH,T}
Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1)
gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3))
xT = _match_eltype(m, T, x)::AbstractArray{T}
gxs, ghs, bs = multigate(Wi*xT, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3))
r, z = _gru_output(gxs, ghs, bs)
h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3])
h′ = @. (1 - z) * h̃ + z * h
16 changes: 16 additions & 0 deletions test/layers/basic.jl
@@ -89,6 +89,22 @@ import Flux: activations
@test Dense(10, 2, identity, init = ones)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
@test Dense(10, 2, identity, init = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20]
end
@testset "fast paths, type fixes, ambiguities" begin
d1 = Dense(2 => 3)
d2 = Dense(d1.weight, false)
x1 = randn(Float32, 2, 4)
@test d1(x1) ≈ d2(x1) ≈ d1.weight * x1
x2 = Float64.(x1)
@test d1(x2) ≈ d2(x2) ≈ d1.weight * x2
@test d1(x2) isa Array{Float32}
@test d2(x2) isa Array{Float32}
x3 = rand(-5:5, 2, 4)
@test d1(x3) ≈ d2(x3) ≈ d1.weight * x3
x4 = rand(Bool, 2, 4)
@test d1(x4) ≈ d2(x4) ≈ d1.weight * x4
x5 = Flux.onehotbatch(rand(Bool, 5), (true, false))
@test d1(x5) ≈ d2(x5) ≈ d1.weight * x5
end
end

@testset "Scale" begin
6 changes: 3 additions & 3 deletions test/layers/normalisation.jl
@@ -166,11 +166,11 @@ end
end

# with activation function
let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0;
2.0 4.0 6.0]
let m = BatchNorm(2, sigmoid)
x = Float32[1.0 3.0 5.0; 2.0 4.0 6.0]
y = m(x)
@test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
@inferred m(x)
@inferred m(x) # fails when x::Matrix{Float64}, do we care?
Member

Do you know why this fails?

Member Author

I do not. Checking the branches of if !_isactive(l) && l.track_stats, I get the same types on all paths.
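
A low-tech way to hunt for it, purely as a debugging sketch:

using Flux, InteractiveUtils
m = BatchNorm(2, sigmoid)
@code_warntype m([1.0 3.0 5.0; 2.0 4.0 6.0])         # Float64 input: look for Union/Any in the body
@code_warntype m(Float32[1.0 3.0 5.0; 2.0 4.0 6.0])  # Float32 input, for comparison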

end

let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
5 changes: 3 additions & 2 deletions test/layers/recurrent.jl
@@ -93,9 +93,10 @@ end
@testset "RNN-input-state-eltypes" begin
@testset for R in [RNN, GRU, LSTM, GRUv3]
m = R(3 => 5)
x = rand(Float64, 3, 1)
x = rand(Float64, 3, 1) # Float64 input is now converted
Flux.reset!(m)
@test_throws MethodError m(x)
@test m(x) isa Array{Float32}
@test m.state isa Array{Float32}
end
end
