diff --git a/src/deprecations.jl b/src/deprecations.jl index 1769a94170..6cb73d2cf2 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -34,11 +34,6 @@ struct Zeros end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) -# function Optimise.update!(x::AbstractArray, x̄) -# Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!) -# x .-= x̄ -# end - function Diagonal(size::Integer...; kw...) Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) Scale(size...; kw...) diff --git a/src/train/Train.jl b/src/train/Train.jl index 32049b9285..bbbe762fa2 100644 --- a/src/train/Train.jl +++ b/src/train/Train.jl @@ -4,7 +4,7 @@ using LinearAlgebra using Optimisers: Optimisers using Functors: fmap -export train!, update!, adjust!, FluxState, @epochs, +export train!, update!, adjust!, FluxState, Descent, Adam, Momentum, Nesterov, RMSProp, AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief #, # InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, @@ -15,7 +15,7 @@ export train!, update!, adjust!, FluxState, @epochs, """ FluxState(rule, state=missing) - + This is an interface between the all-mutable world Flux.jl likes, and the could-be-immutable world that Optimisers.jl inhabits. @@ -56,34 +56,14 @@ end ### Two styles of gradient, and their `train!` functions -using ProgressLogging: @progress, @withprogress, @logprogress +using ProgressLogging: @progress, @withprogress, @logprogress # TODO add progress logging again using Zygote: Zygote, Params -include("explicit_train.jl.jl") # new! -include("implicit_train.jl.jl") # Params etc, Zygote only +include("explicit_train.jl") # new! +include("implicit_train.jl") # Params etc, Zygote only explicit_withgradient(f, args...) = Zygote.withgradient(f, args...) # can overload this to use e.g. Yota / Diffractor -# using Requires # Flux doesn't use this right now -# @init @require Diffractor="9f5e2b26-1114-432f-b630-d3fe2085c51c" begin -# @eval function explicit_withgradient(f, args...) -# y, back = Diffractor.∂⃖¹(f, args...) -# _, grads... = back(Zygote.sensitivity(y)) -# return (; value = y, gradient = grads) -# end -# end - -#= - -using Diffractor -function Flux.Train.explicit_withgradient(f, args...) - y, back = Diffractor.∂⃖¹(f, args...) - _, grads... = back(one(y)) - return (; value = y, gradient = grads) -end - -=# - ### Misc. related utilities """ @@ -107,94 +87,4 @@ function adjust!(opt::FluxState, eta::Real) return opt end -""" - @epochs N body - -Run `body` expression `N` times. Mainly useful for quickly doing -multiple epochs of training in a REPL. - -Functionally equivalent to this loop: -``` -for _ in 1:N - body -end -``` -... but adds progress logging and `@info` messages, -and returns the result of the last iteration. - -# Examples -```jldoctest -julia> Flux.@epochs 2 println("hello") -[ Info: Epoch 1 -hello -[ Info: Epoch 2 -hello -``` -""" -macro epochs(n, ex) - @gensym val - body = :(for i in 1:$(esc(n)) - @info "Epoch $i" - $(esc(val)) = $(esc(ex)) - end) - loop = Expr(:macrocall, Symbol("@progress"), __source__, body) - Expr(:block, :($(esc(val)) = nothing), loop, :($(esc(val)))) - # TODO make this actualy return the value? Names aren't right. -# -# $loop -# # @progress for i in 1:$(esc(n)) -# # @info "Epoch $i" -# # $(esc(val)) = $(esc(ex)) -# # end -# $val # DOESN"T WORK! Expr(:macrocall, ...) ? 
-# end
-end
-
-end
-
-
-#=
-
-using Flux, Random
-data = [(rand(3,2).*[i,1,20/i], [i i]) for i in 1:50] |> shuffle!;
-
-# This exact code works on Flux@0.13. There, train! returns nothing:
-model2 = Chain(Dense(3 => 7, relu), Dense(7 => 1))
-opt2 = Flux.Adam()
-Flux.train!(Flux.params(model2), data, opt2) do x, y
-  Flux.mse(model2(x), y)
-end
-opt2  # contains an IdDict
-
-# This is the new "explicit" method of Train
-model1 = Chain(Dense(3 => 7, relu), Dense(7 => 1))
-opt1 = Flux.Adam()
-Flux.train!(model1, data, opt1) do m, x, y
-  Flux.mse(m(x), y)
-end |> sum
-opt1  # contains state tree
-
-# This is new 3-arg train!, one step not an iteration over data:
-x1, y1 = data[1]
-Flux.train!(model1, opt1) do m
-  Flux.mse(m(x1), y1)
-end
-
-
-
-
-
-julia> using ProgressLogging
-julia> @macroexpand1 @loop N body
-begin
-  x = nothing
-  @progress for i in 1:N
-    @info "step $i"
-    x = body
-  end
-  x
-end
-
-
-
-=#
\ No newline at end of file
+end # module
diff --git a/src/train/explicit_train.jl b/src/train/explicit_train.jl
index edd31b281e..673ba6c141 100644
--- a/src/train/explicit_train.jl
+++ b/src/train/explicit_train.jl
@@ -52,26 +52,28 @@ function train!(loss::Function, model, data, opt::FluxState)
     _initialise!(opt, model)
     losses = Float32[]
     s = opt.state
-    s isa IdDict && error("can't mix explicit & implicit!")
+    s isa IdDict && error("""Can't mix explicit & implicit modes!
+        Once `FluxState` is initialised by `train!` in one mode, it cannot be used in the other.""")
     for d in data
-        l, (g, _...) = Zygote.withgradient(loss, model, train_ok(d)...)
+        l, (g, _...) = explicit_withgradient(loss, model, data_splat(d)...)
         s, model = Optimisers.update!(s, model, g)
         push!(losses, l)
         opt.state = s
     end
-    return losses
+    return losses  # Not entirely sure returning losses is a good idea. Flux 0.13 returns `nothing`.
 end
 
-train_ok(x::T) where T = error("""train! expects every d in data be a Tuple or a NamedTuple, got $T
-    To allow this type, define `Flux.Optimise.train_ok(x::$T) = (x,)`""")
-train_ok(x::Tuple) = x
-train_ok(x::NamedTuple) = x
+data_splat(x::T) where T = error("""train! expects every d in data to be a Tuple or a NamedTuple, got $T
+    To allow this type, define `Flux.Train.data_splat(x::$T) = (x,)`""")
+data_splat(x::Tuple) = x
+data_splat(x::NamedTuple) = x
 
 function _initialise!(opt::FluxState, model)
   if opt.state isa Missing
     opt.state = Optimisers.setup(opt.rule, model)
     fmap(model, exclude = Optimisers.isnumeric) do x
-      Optimisers.maywrite(x) || error("model must be fully mutable for train! to work, got $(typeof(x))")
+      Optimisers.maywrite(x) || error("""model must be fully mutable for train! to work, got x::$(typeof(x))
+        If `x .+= dx` is in fact ok, define `Optimisers.maywrite(::$(typeof(x))) = true`""")
     end
   end
   opt
@@ -107,12 +109,12 @@ function train!(loss::Function, model, opt::FluxState)
     l
 end
 
+# This method lets you use Optimisers.Descent() instead of Flux.Descent(), when the rule has no state
 function train!(loss::Function, model, data, opt::Optimisers.AbstractRule)
   _initialise!(opt, model)
-  # fmap(opt.state) do x
-  #   x isa Union{Number, AbstractArray{<:Number}} && @warn "optimiser state will be lost!"
-  #   x
-  # end  # won't work as you need to look inside Leaf for non-nothings.
-  @warn "optimiser state will be lost!"
+  fmap(opt.state, exclude = x -> x isa Optimisers.Leaf) do leaf
+    leaf.state isa Nothing || @warn "Optimiser state will be lost! Please wrap the optimisation rule in `FluxState`, e.g. 
by using `Flux.Adam()`" leaf + leaf + end train!(loss, model, data, FluxState(opt)) end diff --git a/src/train/implicit_train.jl b/src/train/implicit_train.jl index 43c3b75766..eb2068eaa0 100644 --- a/src/train/implicit_train.jl +++ b/src/train/implicit_train.jl @@ -29,7 +29,7 @@ function train!(loss::Function, pars::Params, data, opt::FluxState) losses = Float32[] for d in data l, grads = Zygote.withgradient(() -> loss(batchmemaybe(d)...), pars) - update!(opt, pars, grads) + _update!(opt, pars, grads) push!(losses, l) end return losses @@ -49,7 +49,7 @@ function train!(loss::Function, pars::Params, opt::FluxState) Explicit parameters are now preferred, see `train!(loss, model, data, opt)`""", :train!, force=true) _initialise!(opt, pars) l, grads = Zygote.withgradient(() -> loss(), pars) - update!(opt, pars, grads) + _update!(opt, pars, grads) return l end @@ -68,6 +68,12 @@ Legacy method, mimicking the behaviour of Flux <= 0.13. """ function update!(opt::FluxState, xs::Params, gs) Base.depwarn("Flux.update! is a legacy function", :update!) + _initialise!(opt, xs) + _update!(opt, xs, gs) +end +# This _update! exists only so that train! above gives one depwarn, not two! +# ... and also to call _initialise! +function _update!(opt::FluxState, xs::Params, gs) for x in xs isnothing(gs[x]) && continue update!(opt, x, gs[x]) diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 019f3fd603..32e40f4186 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -55,13 +55,13 @@ end bias = Conv((2, 2), 1=>3, bias = false); ip = zeros(Float32, 28,28,1,1) op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() + opt = Flux.Descent() for _ = 1:10^3 gs = gradient(Flux.params(bias)) do Flux.Losses.mse(bias(ip), op) end - Flux.Optimise.update!(opt, params(bias), gs) + Flux.Optimise.update!(opt, Flux.params(bias), gs) end @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 @@ -168,7 +168,7 @@ end x = zeros(Float32, 5, 5, 2, 4) m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads + @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads # test ConvTranspose supports groups argument x = randn(Float32, 10, 10, 2, 3) @@ -178,7 +178,7 @@ end m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad()) @test size(m2.weight) == (3,3,2,2) @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + @test gradient(()->sum(m2(x)), Flux.params(m2)) isa Flux.Zygote.Grads x = randn(Float32, 10, 2,1) m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) diff --git a/test/optimise.jl b/test/optimise.jl deleted file mode 100644 index e922d3c0b8..0000000000 --- a/test/optimise.jl +++ /dev/null @@ -1,239 +0,0 @@ -using Flux.Optimise -using Flux.Optimise: runall -using Flux: Params, gradient -import FillArrays, ComponentArrays -using Test -using Random - -@testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 
10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Training Loop" begin - i = 0 - l = 1 - Flux.train!( - () -> (sleep(0.1); Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==0 #all skipped - - Flux.train!( - () -> (sleep(0.1); i==8 && Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==8 #skip after i hit 8 - - i = 0 - Flux.train!(() -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(), - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) - - @test 3 < i < 50 - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - Flux.train!(loss, Flux.params(r), (r,), Descent()) -end - -@testset "ExpDecay" begin - - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) - end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 - end - end - @test flag == 1 - # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. 
- end - @test decay_steps == ground_truth - @test o.eta == o.clip -end - -@testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 -end - -@testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 -end - -@testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w.a ≈ wold.a .- 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 -end - -# Flux PR #1776 -# We need to test that optimisers like Adam that maintain an internal momentum -# estimate properly calculate the second-order statistics on the gradients as -# the flow backward through the model. Previously, we would calculate second- -# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which -# wreaks all sorts of havoc on our training loops. 
This test ensures that -# a simple optimization is montonically decreasing (up to learning step effects) -@testset "Momentum Optimisers and complex values" begin - # Test every optimizer that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end - - params = Flux.Params([w]) - opt = opt_ctor(1e-2) - - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 706f126451..d9a5011879 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,8 +20,8 @@ Random.seed!(0) include("onehot.jl") end - @testset "Optimise" begin - include("optimise.jl") + @testset "Train" begin + include("train.jl") end @testset "Data" begin diff --git a/test/train.jl b/test/train.jl new file mode 100644 index 0000000000..c7af65f509 --- /dev/null +++ b/test/train.jl @@ -0,0 +1,92 @@ +using Flux.Train +using Zygote: Params, gradient + +import FillArrays, ComponentArrays + +using Test +using Random + +@testset "Implicit train!" begin # These tests pass on Flux v0.13 + Random.seed!(84) + w = randn(10, 10) + w2 = randn(10, 10) # NB outside the inner @testset, else it will be exactly == w, as the RNG seed is reset. + @testset for opt in [Descent(0.1), Adam()] + # [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), + # NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), + # Nesterov(), RMSProp(), Momentum()] + w′ = copy(w2) + b = zeros(10) + loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) + @test loss(rand(10, 10)) > 1 + Flux.train!(loss, Flux.params([w′, b]), (rand(10) for _ in 1: 10^5), opt) + @test loss(rand(10, 10)) < 0.01 + end +end + +@testset "Explicit train!" begin + Random.seed!(84) + w = randn(10, 10) + w2 = randn(10, 10) # NB outside the inner @testset, else it will be exactly == w, as the RNG seed is reset. 
+ @testset for opt in [Descent(0.1), Adam()] + @test opt isa FluxState + w′ = copy(w2) + b = zeros(10) + loss(m, x) = Flux.Losses.mse(w*x, m.weight*x .+ m.bias) + model = (weight=w′, bias=b, ignore=nothing) + @test loss(model, rand(10, 10)) > 1 + train!(loss, model, ((rand(10),) for _ in 1: 10^5), opt) + @test loss(model, rand(10, 10)) < 0.01 + end +end + +#= + +@testset "update!: handle Fills from Zygote" begin + w = randn(10,10) + wold = copy(w) + g = FillArrays.Ones(size(w)) + opt = Descent(0.1) + Flux.update!(opt, w, g) + @test w ≈ wold .- 0.1 + + w = randn(3) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> w[1], θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w[1] ≈ wold[1] .- 0.1 + @test w[2:3] ≈ wold[2:3] + + ## Issue #1510 + w = randn(10,10) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 +end + +@testset "update!: handle ComponentArrays" begin + w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w.a ≈ wold.a .- 0.1 + @test w.b ≈ wold.b + @test w.c.b ≈ wold.c.b .- 0.1 + @test w.c.a ≈ wold.c.a + + w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 +end + +=# \ No newline at end of file