From e1e5b4cec395151246247d9c37cdec43eae8fc24 Mon Sep 17 00:00:00 2001 From: abhro <5664668+abhro@users.noreply.github.com> Date: Sun, 27 Oct 2024 01:31:23 -0400 Subject: [PATCH 01/13] Fix macro signature in docstring (#184) --- src/interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/interface.jl b/src/interface.jl index ac9b90b..7341268 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -225,12 +225,12 @@ onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x) nonneg(η::Real) = η < 0 ? throw(DomainError(η, "the learning rate cannot be negative")) : η """ - @def struct Rule; eta = 0.1; beta = (0.7, 0.8); end + @def struct Rule; eta = 0.1; beta = (0.7, 0.8); end Helper macro for defining rules with default values. The types of the literal values are used in the `struct`, like this: -``` +```julia struct Rule eta::Float64 beta::Tuple{Float64, Float64} From 1443a6e893b82bf8ba5aa6180bdbbb9ac9051494 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 27 Oct 2024 09:54:59 +0100 Subject: [PATCH 02/13] fix docs (#185) --- docs/src/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.md b/docs/src/api.md index 378bf72..434ee70 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -85,4 +85,5 @@ It is defined in Functors.jl and re-exported by Optimisers.jl here for convenien Functors.KeyPath Functors.haskeypath Functors.getkeypath +Functors.setkeypath! ``` From fc65256f5ccc09c152d1a59595830de758babb67 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:13:27 +0100 Subject: [PATCH 03/13] Update CompatHelper.yml --- .github/workflows/CompatHelper.yml | 43 +++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index ce8d353..8ad0284 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,16 +1,45 @@ name: CompatHelper - on: schedule: - - cron: '00 00 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v2 + with: + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} From 74f8bc98e7fca4b75211b8234b90fd290c2c1038 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:28:03 +0100 Subject: [PATCH 04/13] allow Functor 0.5 (#186) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Project.toml b/Project.toml index dab1f6e..d204c63 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] ChainRulesCore = "1" -Functors = "0.4.9" +Functors = "0.4.9, 0.5" Statistics = "1" Zygote = "0.6.40" julia = "1.6" From 0ae05d6bd7e5a2f37c5e31ab82b9919f1c294365 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:28:22 +0100 Subject: [PATCH 05/13] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d204c63..41c9709 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" authors = ["Mike J Innes "] -version = "0.3.3" +version = "0.3.4" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" From bb71298242a7da4e60b00b27dcfa613370bfe78f Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:32:04 -0500 Subject: [PATCH 06/13] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe8fb9f..6aecb4b 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Optimisers.jl defines many standard gradient-based optimisation rules, and tools for applying them to deeply nested models. -This is the future of training for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks, -and the present for [Lux.jl](https://github.com/avik-pal/Lux.jl). +This was written as the new training system for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks, +and also used by [Lux.jl](https://github.com/avik-pal/Lux.jl). But it can be used separately on any array, or anything else understood by [Functors.jl](https://github.com/FluxML/Functors.jl). ## Installation From 2a1b2ed1a3ef8c7605ed2896ec552978f9148202 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Wed, 6 Nov 2024 01:31:16 +0100 Subject: [PATCH 07/13] make docstrings consistent (#187) * fix docstrings * address the review comments --- src/rules.jl | 91 ++++++++++++++++++++++++++++++---------------------- src/utils.jl | 1 + 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/rules.jl b/src/rules.jl index f3df9d6..bc9c099 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -8,7 +8,7 @@ """ Descent(η = 1f-1) - Descent(; eta) + Descent(; [eta]) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`. @@ -20,12 +20,13 @@ For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`. struct Descent{T} <: AbstractRule eta::T end + Descent(; eta = 1f-1) = Descent(eta) init(o::Descent, x::AbstractArray) = nothing function apply!(o::Descent, state, x, dx) - η = convert(float(eltype(x)), o.eta) + η = ofeltype(x, o.eta) return state, @lazy dx * η # @lazy creates a Broadcasted, will later fuse with x .= x .- dx end @@ -64,6 +65,8 @@ end """ Nesterov(η = 0.001, ρ = 0.9) + Nesterov(; [eta, rho]) + Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. @@ -153,27 +156,26 @@ end """ Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) + Rprop(; [eta, ell, gamma]) Optimizer using the [Rprop](https://ieeexplore.ieee.org/document/298623) algorithm. A full-batch learning algorithm that depends only on the sign of the gradient. 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Scaling factors (`ℓ::Tuple`): Multiplicative increase and decrease factors.
+- Scaling factors (`ℓ::Tuple == ell`): Multiplicative increase and decrease factors.
-- Step sizes (`Γ::Tuple`): Mminimal and maximal allowed step sizes.
+- Step sizes (`Γ::Tuple == gamma`): Minimal and maximal allowed step sizes.
 """
-struct Rprop{T} <: AbstractRule
-  eta::T
-  ell::Tuple{T,T}
-  gamma::Tuple{T,T}
+@def struct Rprop <: AbstractRule
+  eta = 1f-3
+  ell = (5f-1, 1.2f0)
+  gamma = (1f-6, 50f0)
 end
 
-Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{typeof(η)}(η, ℓ, Γ)
-
 init(o::Rprop, x::AbstractArray) = (zero(x), onevalue(o.eta, x))
 
 function apply!(o::Rprop, state, x::AbstractArray{T}, dx) where T
@@ -193,15 +195,16 @@ end
 
 """
     Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    Adam(; [eta, beta, epsilon])
 
 [Adam](https://arxiv.org/abs/1412.6980) optimiser.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
-- Machine epsilon (`ϵ`): Constant to prevent division by zero
+- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
   (no need to change default)
 """
 @def struct Adam <: AbstractRule
@@ -225,12 +228,13 @@ end
 
 """
     Lion(η = 0.001, β = (0.9, 0.999))
+    Lion(; [eta, beta])
 
 [Lion](https://arxiv.org/abs/2302.06675) optimiser.
 
 # Parameters
-- Learning rate (`η`): Magnitude by which gradients are updating the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Learning rate (`η == eta`): Magnitude by which gradients are updating the weights.
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
 """
 @def struct Lion <: AbstractRule
@@ -254,15 +258,16 @@ end
 
 """
     RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    RAdam(; [eta, beta, epsilon])
 
 [Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
-- Machine epsilon (`ϵ`): Constant to prevent division by zero
+- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
   (no need to change default)
 """
 @def struct RAdam <: AbstractRule
@@ -294,15 +299,16 @@ end
 
 """
     AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    AdaMax(; [eta, beta, epsilon])
 
 [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaMax <: AbstractRule @@ -326,16 +332,17 @@ end """ OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8) + OAdam(; [eta, beta, epsilon]) [OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam) is a variant of Adam adding an "optimistic" term suitable for adversarial training. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct OAdam <: AbstractRule @@ -361,15 +368,16 @@ end """ AdaGrad(η = 0.1, ϵ = 1e-8) + AdaGrad(; [eta, epsilon]) [AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has parameter specific learning rates based on how frequently it is updated. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaGrad <: AbstractRule @@ -391,14 +399,15 @@ end """ AdaDelta(ρ = 0.9, ϵ = 1e-8) + AdaDelta(; [rho, epsilon]) [AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning. # Parameters -- Rho (`ρ`): Factor by which the gradient is decayed at each time step. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Rho (`ρ == rho`): Factor by which the gradient is decayed at each time step. +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaDelta <: AbstractRule @@ -422,16 +431,17 @@ end """ AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8) + AMSGrad(; [eta, beta, epsilon]) The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam optimiser. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. 
-- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AMSGrad <: AbstractRule @@ -457,16 +467,17 @@ end """ NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8) + NAdam(; [eta, beta, epsilon]) [NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct NAdam <: AbstractRule @@ -515,16 +526,17 @@ AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0, epsilon = 1e-8) = """ AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16) + AdaBelief(; [eta, beta, epsilon]) The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known Adam optimiser. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ::Float32`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaBelief <: AbstractRule @@ -548,6 +560,7 @@ end """ WeightDecay(λ = 5e-4) + WeightDecay(; [lambda]) Implements ``L_2`` regularisation, also known as ridge regression, when composed with other rules as the first transformation in an [`OptimiserChain`](@ref). @@ -585,6 +598,7 @@ function adjust(r::WeightDecay; gamma = nothing, kw...) """ SignDecay(λ = 1e-3) + SignDecay(; [lambda]) Implements ``L_1`` regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an [`OptimiserChain`](@ref). @@ -615,6 +629,7 @@ end """ ClipGrad(δ = 10) + ClipGrad(; [delta]) Restricts every gradient component to obey `-δ ≤ dx[i] ≤ δ`. diff --git a/src/utils.jl b/src/utils.jl index 7c6c95b..12a19dd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -13,3 +13,4 @@ foreachvalue(f, x::Dict, ys...) = foreach(pairs(x)) do (k, v) f(v, (get(y, k, nothing) for y in ys)...) 
 end
 
+ofeltype(x, y) = convert(float(eltype(x)), y)

From e5d187c9c2ce22efe69fa6f3112ddc110d73ba97 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Wed, 6 Nov 2024 01:48:08 +0100
Subject: [PATCH 08/13] doc stable in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6aecb4b..fa318b2 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # Optimisers.jl
 
-
+[![][docs-stable-img]][docs-stable-url]
 [![][docs-dev-img]][docs-dev-url]
 [![][action-img]][action-url]
 [![][coverage-img]][coverage-url]

From 2da6d7fe64bebb84c6d2592a7753c53e36c659cb Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 7 Nov 2024 08:11:20 +0100
Subject: [PATCH 09/13] docs for nothing behavior and for walking a tree with
 keypath (#191)

* cl/zero

* Update docs/src/index.md

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>

---------

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
---
 docs/src/index.md | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/interface.jl  |  8 ++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 30ef5c4..3cb32f8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -311,8 +311,50 @@ julia> trainables(model)
 Float32[-0.8764882 0.40812716 0.1919528; -0.9123545 -0.4462516 0.6751252]
 Float32[0.0, 0.0]
 
-julia> l2reg(model) = sum([sum(abs2,p) for p in trainables(model)]);
+julia> l2reg(model) = sum([sum(abs2, p) for p in trainables(model)]);
 
 julia> g = gradient(l2reg, model)[1];
 ```
 Notice that the `BatchNorm` layer has two trainable parameters, `γ` and `β`, which are included in the list, while the `μ` and `σ²` buffers are not.
+
+Sometimes one wants to iterate over all trainable parameters in a model and the corresponding parameters of a matched structure such as a gradient or the moving average of the model.
+This can be done using `trainables(model, path=true)`. For instance, here is how to update the parameters
+of a moving average model with the parameters of the model:
+
+```julia
+for (kp, p_avg) in trainables(model_avg, path=true)
+    p = getkeypath(model, kp)
+    p_avg .= 0.99 .* p_avg .+ 0.01 .* p
+end
+```
+
+## Incomplete or nothing gradients
+
+If the gradient is not available for some parameters, or branches of the model,
+`update` will not take an optimisation step for those parameters.
+This is the case when the gradient is `nothing` or a subtype of `ChainRules.AbstractZero`.
+
+For stateful optimisers, skipping an update is generally not the same as updating with a zero gradient.
+For example, in the case of Adam, the momentum and variance are updated even if the gradient is zero:
+
+```julia-repl
+julia> x = (a = ones(2), b = ones(2))
+(a = [1.0, 1.0], b = [1.0, 1.0])
+
+julia> opt_state = Optimisers.setup(Adam(0.1), x)
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))))
+
+julia> g = (; a = ones(2), b = ones(2));  # First an update with a non-zero gradient to increase the momentum and variance
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` are the same
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+
+julia> g = (; a = zeros(2), b = nothing);  # Now an update with a zero gradient for a and no gradient for b
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` differ
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+```
diff --git a/src/interface.jl b/src/interface.jl
index 7341268..e44dec1 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -103,13 +103,21 @@ end
 subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
 subtract!(x, x̄::Zero) = x
 
+# If we get Zero from AD on a leaf we skip the optimizer step. See
+# https://github.com/FluxML/Optimisers.jl/issues/140
 _grads!(dict::IdDict, ℓ::Leaf, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, ℓ::Leaf, x, x̄s...)
   x̄s₀ = get(dict, ℓ, map(_ -> ZeroTangent(), x̄s))
   dict[ℓ] = map(+, x̄s, x̄s₀)  # adding Zero should be free. Lazy accumulation broadcasted(+, x̄, x̄₀) also possible.
   nothing
 end
+
+# If we get Zero from AD at a non-leaf node,
+# we end the recursion. The optimizer step won't be taken.
+# https://github.com/FluxML/Optimisers.jl/issues/140
 _grads!(dict::IdDict, t, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, tree, x, x̄s...)
   # The only reason _grads! takes model is that functor(typeof(x), base(x̄)) may differ from
   # functor(typeof(tree), base(x̄)), for things like Transpose

From 4a78a55f55e098a71fc96b2c2d91bb75b7a926cb Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 7 Nov 2024 08:13:54 +0100
Subject: [PATCH 10/13] fix epsilon for Float16 (#190)

---
 .gitignore        |  1 +
 src/Optimisers.jl |  2 ++
 src/rules.jl      | 21 ++++++++++-----------
 src/utils.jl      |  6 ++++++
 test/rules.jl     | 10 ++++++++++
 5 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 952f7ce..763dd6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ Manifest.toml
 .vscode/
 docs/build/
 .DS_Store
+/test.jl
\ No newline at end of file
diff --git a/src/Optimisers.jl b/src/Optimisers.jl
index 2e115c4..99fc162 100644
--- a/src/Optimisers.jl
+++ b/src/Optimisers.jl
@@ -25,6 +25,8 @@ export Descent, Adam, Momentum, Nesterov, Rprop, RMSProp,
        WeightDecay, SignDecay, ClipGrad, ClipNorm, OptimiserChain, Lion,
        AccumGrad
 
+VERSION >= v"1.11.0-DEV.469" && eval(Meta.parse("public apply!, init, setup, update, update!"))
+
 ###
 ### one-array functions
 ###
diff --git a/src/rules.jl b/src/rules.jl
index bc9c099..b4fbd2a 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -130,7 +130,7 @@ RMSProp(; eta = 0.001, rho = 0.9, epsilon = 1e-8, kw...)
= RMSProp(eta, rho, eps init(o::RMSProp, x::AbstractArray) = (zero(x), o.centred ? zero(x) : false) function apply!(o::RMSProp, state, x::AbstractArray{T}, dx) where T - η, ρ, ϵ = T(o.eta), T(o.rho), T(o.epsilon) + η, ρ, ϵ = T(o.eta), T(o.rho), _eps(T, o.epsilon) quad, lin = state @.. quad = ρ * quad + (1 - ρ) * abs2(dx) @@ -216,7 +216,7 @@ end init(o::Adam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::Adam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -279,7 +279,7 @@ end init(o::RAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta), 1) function apply!(o::RAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) ρ∞ = 2/(1-β[2]) - 1 |> real mt, vt, βt, t = state @@ -320,7 +320,7 @@ end init(o::AdaMax, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::AdaMax, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, ut, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -354,7 +354,7 @@ end init(o::OAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta), zero(x)) function apply!(o::OAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt, term = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -388,7 +388,7 @@ end init(o::AdaGrad, x::AbstractArray) = onevalue(o.epsilon, x) function apply!(o::AdaGrad, state, x::AbstractArray{T}, dx) where T - η, ϵ = T(o.eta), T(o.epsilon) + η, ϵ = T(o.eta), _eps(T, o.epsilon) acc = state @.. acc = acc + abs2(dx) @@ -418,7 +418,7 @@ end init(o::AdaDelta, x::AbstractArray) = (zero(x), zero(x)) function apply!(o::AdaDelta, state, x::AbstractArray{T}, dx) where T - ρ, ϵ = T(o.rho), T(o.epsilon) + ρ, ϵ = T(o.rho), _eps(T, o.epsilon) acc, Δacc = state @.. acc = ρ * acc + (1 - ρ) * abs2(dx) @@ -454,7 +454,7 @@ init(o::AMSGrad, x::AbstractArray) = (onevalue(o.epsilon, x), onevalue(o.epsilon, x), onevalue(o.epsilon, x)) function apply!(o::AMSGrad, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, v̂t = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -489,8 +489,7 @@ end init(o::NAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::NAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) - + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -548,7 +547,7 @@ end init(o::AdaBelief, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::AdaBelief, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, st, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx diff --git a/src/utils.jl b/src/utils.jl index 12a19dd..8f66746 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -14,3 +14,9 @@ foreachvalue(f, x::Dict, ys...) 
= foreach(pairs(x)) do (k, v) end ofeltype(x, y) = convert(float(eltype(x)), y) + +_eps(T::Type{<:AbstractFloat}, e) = T(e) +# catch complex and integers +_eps(T::Type{<:Number}, e) = _eps(real(float(T)), e) +# avoid small e being rounded to zero +_eps(T::Type{Float16}, e) = e == 0 ? T(0) : max(T(1e-7), T(e)) diff --git a/test/rules.jl b/test/rules.jl index 52a3580..9068fa1 100644 --- a/test/rules.jl +++ b/test/rules.jl @@ -267,3 +267,13 @@ end tree, x4 = Optimisers.update(tree, x3, g4) @test x4 ≈ x3 end + +@testset "Float16 epsilon" begin + # issue https://github.com/FluxML/Optimisers.jl/issues/167 + x = Float16[0.579, -0.729, 0.5493] + δx = Float16[-0.001497, 0.0001875, -0.013176] + + os = Optimisers.setup(Adam(1e-4), x); + os, x = Optimisers.update(os, x, δx) + @test x ≈ Float16[1.835, -0.886, 0.5493] rtol=1e-3 +end From 38c9d622c4a9979190b9c4c000604267aac39239 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 7 Nov 2024 08:14:53 +0100 Subject: [PATCH 11/13] Add the option couple to AdamW and set the default to match pytorch (#188) --- Project.toml | 2 +- README.md | 6 ++++++ src/rules.jl | 56 ++++++++++++++++++++++++++++++++++++++++++------ test/rules.jl | 2 +- test/runtests.jl | 3 +-- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 41c9709..0a19f49 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" authors = ["Mike J Innes "] -version = "0.3.4" +version = "0.4.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" diff --git a/README.md b/README.md index fa318b2..e15155a 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,12 @@ This was written as the new training system for [Flux.jl](https://github.com/Flu and also used by [Lux.jl](https://github.com/avik-pal/Lux.jl). But it can be used separately on any array, or anything else understood by [Functors.jl](https://github.com/FluxML/Functors.jl). + +> [!WARNING] +> With version 0.4 the default update rule for AdamW has changed to match the pytorch implementation. +> The previous rule, which is closer to the original paper, can be obtained by setting `AdamW(..., couple=false)`. +> See [this issue](https://github.com/FluxML/Flux.jl/issues/2433) for more details. + ## Installation ```julia diff --git a/src/rules.jl b/src/rules.jl index b4fbd2a..0063d70 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -501,8 +501,8 @@ function apply!(o::NAdam, state, x::AbstractArray{T}, dx) where T end """ - AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8) - AdamW(; [eta, beta, lambda, epsilon]) + AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8; couple = true) + AdamW(; [eta, beta, lambda, epsilon, couple]) [AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its weight decay regularization. @@ -516,12 +516,54 @@ Implemented as an [`OptimiserChain`](@ref) of [`Adam`](@ref) and [`WeightDecay`] - Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation. - Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) -""" -AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8) = - OptimiserChain(Adam(η, β, ϵ), WeightDecay(λ)) +- Keyword `couple`: If `true`, the weight decay is coupled with the learning rate, as in pytorch's AdamW. + This corresponds to an update of the form `x = x - η * (dx + λ * x)`, where `dx` is the + update from Adam with learning rate 1. 
+  If `false`, the weight decay is decoupled from the learning rate, in the spirit of the original paper.
+  This corresponds to an update of the form `x = x - η * dx - λ * x`.
+  Default is `true`.
+
+!!! warning "Breaking change in v0.4"
+    With version 0.4 the default update rule for AdamW has changed to match the pytorch implementation.
+    The previous rule, which is closer to the original paper, can be obtained by setting `AdamW(..., couple=false)`.
+    See [this issue](https://github.com/FluxML/Flux.jl/issues/2433) for more details.
+"""
+struct AdamW{T1,T2,T3,T4} <: AbstractRule
+  eta::T1
+  beta::T2
+  lambda::T3
+  epsilon::T4
+  couple::Bool
+end
+
+function AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8; couple::Bool = true)
+  η < 0 && throw(DomainError(η, "the learning rate cannot be negative"))
+  AdamW(η, β, λ, ϵ, couple)
+end
+
+AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0.0, epsilon = 1e-8, kw...) =
+  AdamW(eta, beta, lambda, epsilon; kw...)
+
+init(o::AdamW, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta))
+
+function apply!(o::AdamW, state, x::AbstractArray{T}, dx) where T
+  η, β, ϵ, λ = T(o.eta), T.(o.beta), T(o.epsilon), T(o.lambda)
+  mt, vt, βt = state
+
+  # standard Adam update with learning rate eta=1
+  @.. mt = β[1] * mt + (1 - β[1]) * dx
+  @.. vt = β[2] * vt + (1 - β[2]) * abs2(dx)
+  dx′ = @lazy mt / (1 - βt[1]) / (sqrt(vt / (1 - βt[2])) + ϵ)
+
+  # apply learning rate and weight decay
+  if o.couple
+    dx′′ = @lazy η * (dx′ + λ * x)
+  else
+    dx′′ = @lazy η * dx′ + λ * x
+  end
+
+  return (mt, vt, βt .* β), dx′′
+end
 
 """
     AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)
diff --git a/test/rules.jl b/test/rules.jl
index 9068fa1..499902c 100644
--- a/test/rules.jl
+++ b/test/rules.jl
@@ -15,7 +15,7 @@ RULES = [
   OptimiserChain(ClipGrad(0.5), Momentum()),
   OptimiserChain(WeightDecay(), OAdam(), ClipGrad(1)),
   # Not the default:
-  RMSProp(centred = true),
+  RMSProp(centred = true), AdamW(couple=false),
 ]
 
 name(o) = typeof(o).name.name  # just for printing testset headings
diff --git a/test/runtests.jl b/test/runtests.jl
index fc0fe57..ae2d9d0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -332,8 +332,7 @@ end
 
   @testset "keyword arguments" begin
     @test Nesterov(rho=0.8, eta=0.1) === Nesterov(0.1, 0.8)
-    @test AdamW(lambda=0.3).opts[1] == Adam()
-    @test AdamW(lambda=0.3).opts[2] == WeightDecay(0.3)
+    @test AdamW(lambda=0.3, eta=0.1) == AdamW(0.1, (0.9, 0.999), 0.3, 1.0e-8)
   end
 
   @testset "forgotten gradient" begin

From 26395239c0307fadc4d1143d9ae1bc1a6cb2711e Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:08:22 -0500
Subject: [PATCH 12/13] Add `Duplicated` methods (#192)

* add Duplicated methods

* add test

* test for shared params + minimal docs

* remove 1.6 CI

* indent by two spaces

* fix doctest
---
 .github/workflows/ci.yml       |  2 +-
 Project.toml                   | 14 ++++++--
 docs/src/index.md              |  9 +++++
 ext/OptimisersEnzymeCoreExt.jl | 60 ++++++++++++++++++++++++++++++++++
 test/runtests.jl               | 24 +++++++++++++-
 5 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 ext/OptimisersEnzymeCoreExt.jl

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c3b18c..629d90b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,9 +15,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
           - '1'
           - 'nightly'
+          - "1.10"
         os:
           - ubuntu-latest
         arch:
diff --git a/Project.toml b/Project.toml index 0a19f49..a60b58f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,21 +1,29 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" +version = "0.4.1" authors = ["Mike J Innes "] -version = "0.4.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[weakdeps] +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[extensions] +OptimisersEnzymeCoreExt = "EnzymeCore" + [compat] ChainRulesCore = "1" +EnzymeCore = "0.8.5" Functors = "0.4.9, 0.5" Statistics = "1" Zygote = "0.6.40" -julia = "1.6" +julia = "1.10" [extras] StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -23,4 +31,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "StaticArrays", "Zygote"] +test = ["Test", "EnzymeCore", "StaticArrays", "Zygote"] diff --git a/docs/src/index.md b/docs/src/index.md index 3cb32f8..a595d70 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -358,3 +358,12 @@ julia> Optimisers.update!(opt_state, x, g); julia> opt_state # the state in `a` and `b` differ (a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001)))) ``` + +## Usage with [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) + +Enzyme.jl is a new automatic differentiation package, an alternative to Zygote.jl. +It likes to store the model and the gradient together, as an object `Duplicated(x, dx)`. + +Optimisers.jl now has some methods to handle this: +* `update!(opt_state, Duplicated(model, grad))` uses the gradient to update both the model and the optimiser state, and +* `setup(::AbstractRule, ::Duplicated)` ignores the gradient and returns `setup(rule, model)`. diff --git a/ext/OptimisersEnzymeCoreExt.jl b/ext/OptimisersEnzymeCoreExt.jl new file mode 100644 index 0000000..a1c1ab9 --- /dev/null +++ b/ext/OptimisersEnzymeCoreExt.jl @@ -0,0 +1,60 @@ +module OptimisersEnzymeCoreExt + +import Optimisers: trainable, setup, update!, isnumeric, AbstractRule, _setup +import EnzymeCore: Duplicated, Const + +using Functors: fmapstructure + +trainable(x::Duplicated) = (; val = x.val) +trainable(x::Const) = (;) + +""" + setup(rule::AbstractRule, model_grad::Duplicated) + +For use with Enzyme's Duplicated, this just calls `setup(rule, model_grad.val)`. +""" +setup(rule::AbstractRule, model_grad::Duplicated) = setup(rule, model_grad.val) + +_setup(rule, x::Duplicated; cache) = throw(ArgumentError( + """Objects of type `Duplicated` are only supported by Optimisers.jl at top level, + they may not appear deep inside other objects.""" +)) + +""" + update!(opt_state, model_grad::Duplicated) + +For use with Enzyme's `Duplicated`, which holds both a model/parameters +and the corresponding gradient. 
+ +# Example + +```jldoctest +julia> using Optimisers, EnzymeCore + +julia> x_dx = Duplicated(Float16[1,2,3], Float16[1,0,-4]) +Duplicated{Vector{Float16}}(Float16[1.0, 2.0, 3.0], Float16[1.0, 0.0, -4.0]) + +julia> st = Optimisers.setup(Momentum(1/9), x_dx) # acts only on x not on dx +Leaf(Momentum(0.111111, 0.9), Float16[0.0, 0.0, 0.0]) + +julia> Optimisers.update!(st, x_dx) # mutates both arguments + +julia> x_dx +Duplicated{Vector{Float16}}(Float16[0.8887, 2.0, 3.445], Float16[1.0, 0.0, -4.0]) + +julia> st +Leaf(Momentum(0.111111, 0.9), Float16[0.1111, 0.0, -0.4443]) +``` +""" +function update!(opt_state, model_grad::Duplicated) + _, _ = update!(opt_state, model_grad.val, _grad_or_nothing(model_grad)) + nothing +end + +# This function strips the returned gradient to be Zygote-like, +# most importantly prune=nothing removes 2nd appearance of shared gradient to avoid double-counting. +_grad_or_nothing(dup::Duplicated) = fmapstructure(_grad_or_nothing, dup.dval; prune=nothing) +_grad_or_nothing(::Const) = nothing +_grad_or_nothing(x) = isnumeric(x) ? x : nothing + +end diff --git a/test/runtests.jl b/test/runtests.jl index ae2d9d0..956aa04 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ using Optimisers -using ChainRulesCore, Functors, StaticArrays, Zygote +using ChainRulesCore, Functors, StaticArrays, Zygote, EnzymeCore using LinearAlgebra, Statistics, Test, Random using Optimisers: @.., @lazy using Base.Broadcast: broadcasted, instantiate, Broadcasted @@ -534,6 +534,28 @@ end @test Optimisers._norm(bc2, p) isa Float64 end end + + @testset "Enzyme Duplicated" begin + x_dx = Duplicated(Float16[1,2,3], Float16[1,0,-4]) + st = Optimisers.setup(Momentum(1/9), x_dx) # acts only on x not on dx + @test st isa Optimisers.Leaf + @test nothing === Optimisers.update!(st, x_dx) # mutates both arguments + @test x_dx.val ≈ Float16[0.8887, 2.0, 3.445] + + shared = [1.0] + model = (x=shared, y=shared) + grad = deepcopy(model) # Enzyme produces something like this, grad.x === grad.y, already accumulated. + dup = Duplicated(model, model) + st2 = Optimisers.setup(Descent(0.1), model) + Optimisers.update!(st2, dup) + @test model.x ≈ [0.9] + shared .= 1 + Optimisers.update!(st2, model, grad) + model.x ≈ [0.8] # This is wrong, but don't make it a test. + # Ideally, perhaps the 3-arg update! could notice that grad.x===grad.y, and not accumulate the gradient in this case? + + @test_throws ArgumentError Optimisers.setup(Adam(), (; a=[1,2,3.], b=x_dx)) # Duplicated deep inside is not allowed + end end @testset verbose=true "Destructure" begin include("destructure.jl") From d842ddb21303bb256147575d3fda1ab783c617a2 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 9 Nov 2024 08:59:21 +0100 Subject: [PATCH 13/13] remove EnzymeCore dependency --- Project.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index a60b58f..aac006e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,10 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" -version = "0.4.1" +version = "0.4.2" authors = ["Mike J Innes "] [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
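
---

A short illustration of why PATCH 10 clamps `epsilon` for `Float16`. This is a sketch for readers of the series, not part of any patch; `_eps` is copied from the `src/utils.jl` hunk above, and the printed values assume standard IEEE `Float16` semantics:

```julia
# Float16's smallest positive subnormal is about 6.0e-8, so the default
# epsilon = 1e-8 underflows to exactly zero (the bug in issue #167):
Float16(1e-8) == Float16(0.0)   # true

# The patched helper keeps an exact zero, but clamps small nonzero
# epsilons up to a representable value:
_eps(T::Type{<:AbstractFloat}, e) = T(e)
_eps(T::Type{<:Number}, e) = _eps(real(float(T)), e)         # complex and integers
_eps(T::Type{Float16}, e) = e == 0 ? T(0) : max(T(1e-7), T(e))

_eps(Float16, 1e-8)   # ≈ Float16(1.2e-7), a usable nonzero epsilon
_eps(Float32, 1e-8)   # Float32(1.0e-8), other float types are unchanged
```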
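And a numerical check of the `couple` keyword from PATCH 11. The README warning states that the pre-0.4 rule is recovered with `couple = false`, and PATCH 07 shows that the old `AdamW(η, β, λ, ϵ)` was `OptimiserChain(Adam(η, β, ϵ), WeightDecay(λ))`. A minimal sketch of how one might verify this, using arbitrary toy values:

```julia
using Optimisers   # assumes Optimisers v0.4, i.e. the series above

x  = fill(1.0, 3)   # toy parameters
dx = fill(0.5, 3)   # toy gradient

# Decoupled decay, x .- η .* adam_dir .- λ .* x, as in the original paper:
old_rule = OptimiserChain(Adam(0.1), WeightDecay(0.01))   # the pre-0.4 AdamW
new_rule = AdamW(0.1, (0.9, 0.999), 0.01; couple = false)

_, x_old = Optimisers.update(Optimisers.setup(old_rule, x), copy(x), dx)
_, x_new = Optimisers.update(Optimisers.setup(new_rule, x), copy(x), dx)
x_old ≈ x_new   # expected to hold, up to floating-point rounding

# The new default couples decay and learning rate,
# x .- η .* (adam_dir .+ λ .* x), matching pytorch's AdamW:
pt_rule = AdamW(0.1, (0.9, 0.999), 0.01)
```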