Merge branch 'master' into patch-1
vpuri3 authored Nov 11, 2024
2 parents 89c6fd0 + d842ddb commit 3269bd9
Showing 14 changed files with 333 additions and 78 deletions.
43 changes: 36 additions & 7 deletions .github/workflows/CompatHelper.yml
@@ -1,16 +1,45 @@
 name: CompatHelper
 
 on:
   schedule:
-    - cron: '00 00 * * *'
-
+    - cron: 0 0 * * *
+  workflow_dispatch:
+permissions:
+  contents: write
+  pull-requests: write
 jobs:
   CompatHelper:
     runs-on: ubuntu-latest
     steps:
-      - name: Pkg.add("CompatHelper")
-        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
-      - name: CompatHelper.main()
+      - name: Check if Julia is already available in the PATH
+        id: julia_in_path
+        run: which julia
+        continue-on-error: true
+      - name: Install Julia, but only if it is not already available in the PATH
+        uses: julia-actions/setup-julia@v2
+        with:
+          version: '1'
+          arch: ${{ runner.arch }}
+        if: steps.julia_in_path.outcome != 'success'
+      - name: "Add the General registry via Git"
+        run: |
+          import Pkg
+          ENV["JULIA_PKG_SERVER"] = ""
+          Pkg.Registry.add("General")
+        shell: julia --color=yes {0}
+      - name: "Install CompatHelper"
+        run: |
+          import Pkg
+          name = "CompatHelper"
+          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
+          version = "3"
+          Pkg.add(; name, uuid, version)
+        shell: julia --color=yes {0}
+      - name: "Run CompatHelper"
+        run: |
+          import CompatHelper
+          CompatHelper.main()
+        shell: julia --color=yes {0}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
+          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
+          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -15,9 +15,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
           - '1'
           - 'nightly'
+          - "1.10"
         os:
           - ubuntu-latest
         arch:
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@ Manifest.toml
 .vscode/
 docs/build/
 .DS_Store
+/test.jl
11 changes: 7 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Optimisers"
uuid = "3bd65402-5787-11e9-1adc-39752487f4e2"
version = "0.4.2"
authors = ["Mike J Innes <mike.j.innes@gmail.com>"]
version = "0.3.3"

[deps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -12,22 +12,25 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

 [weakdeps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 
 [extensions]
 OptimisersAdaptExt = ["Adapt"]
+OptimisersEnzymeCoreExt = "EnzymeCore"
 
 [compat]
 Adapt = "4"
 ChainRulesCore = "1"
-Functors = "0.4.9"
+EnzymeCore = "0.8.5"
+Functors = "0.4.9, 0.5"
 Statistics = "1"
 Zygote = "0.6.40"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "StaticArrays", "Zygote"]
+test = ["Test", "EnzymeCore", "StaticArrays", "Zygote"]
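
Note: moving EnzymeCore into `[weakdeps]` with a matching `[extensions]` entry means the Enzyme glue code ships as a package extension, loaded only when both Optimisers and EnzymeCore are in the environment. A minimal sketch of how to verify that, assuming both packages are installed (the check itself is only illustrative):

```julia
using Optimisers
using EnzymeCore   # loading this alongside Optimisers triggers OptimisersEnzymeCoreExt

# Base.get_extension returns the extension module once loaded, or `nothing` otherwise.
ext = Base.get_extension(Optimisers, :OptimisersEnzymeCoreExt)
@show ext === nothing   # expected to print `false` when both packages are present
```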
12 changes: 9 additions & 3 deletions README.md
@@ -2,7 +2,7 @@

 # Optimisers.jl
 
-<!-- [![][docs-stable-img]][docs-stable-url] -->
+[![][docs-stable-img]][docs-stable-url]
 [![][docs-dev-img]][docs-dev-url]
 [![][action-img]][action-url]
 [![][coverage-img]][coverage-url]
Expand All @@ -21,10 +21,16 @@

 Optimisers.jl defines many standard gradient-based optimisation rules, and tools for applying them to deeply nested models.
 
-This is the future of training for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks,
-and the present for [Lux.jl](https://github.com/avik-pal/Lux.jl).
+This was written as the new training system for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks,
+and is also used by [Lux.jl](https://github.com/avik-pal/Lux.jl).
 But it can be used separately on any array, or anything else understood by [Functors.jl](https://github.com/FluxML/Functors.jl).
 
+> [!WARNING]
+> With version 0.4 the default update rule for AdamW has changed to match the PyTorch implementation.
+> The previous rule, which is closer to the original paper, can be obtained by setting `AdamW(..., couple=false)`.
+> See [this issue](https://github.com/FluxML/Flux.jl/issues/2433) for more details.
+
 ## Installation
 
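
To make the new warning concrete, here is a small sketch of the two AdamW variants. The `couple` keyword comes from the warning text itself; the learning-rate value and the exact positional signature are only illustrative:

```julia
using Optimisers

rule_default = AdamW(0.001)                  # new 0.4 default, matching the PyTorch implementation
rule_paper   = AdamW(0.001; couple = false)  # previous rule, closer to the original paper

# Either rule is used in the usual way:
x  = rand(Float32, 3)
st = Optimisers.setup(rule_default, x)
```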
1 change: 1 addition & 0 deletions docs/src/api.md
@@ -85,4 +85,5 @@ It is defined in Functors.jl and re-exported by Optimisers.jl here for convenience.
 Functors.KeyPath
 Functors.haskeypath
 Functors.getkeypath
+Functors.setkeypath!
 ```
53 changes: 52 additions & 1 deletion docs/src/index.md
@@ -311,8 +311,59 @@ julia> trainables(model)
  Float32[-0.8764882 0.40812716 0.1919528; -0.9123545 -0.4462516 0.6751252]
  Float32[0.0, 0.0]
 
-julia> l2reg(model) = sum([sum(abs2,p) for p in trainables(model)]);
+julia> l2reg(model) = sum([sum(abs2, p) for p in trainables(model)]);
 
 julia> g = gradient(l2reg, model)[1];
 ```
 Notice that the `BatchNorm` layer has two trainable parameters, `γ` and `β`, which are included in the list, while the `μ` and `σ²` buffers are not.

+Sometimes one wants to iterate over all trainable parameters in a model and the corresponding parameters of a matched structure such as a gradient or the moving average of the model.
+This can be done using `trainables(model, path=true)`. For instance, here is how to update the parameters
+of a moving average model with the parameters of the model:
+
+```julia
+for (kp, p_avg) in trainables(model_avg, path=true)
+    p = getkeypath(model, kp)
+    p_avg .= 0.99 .* p_avg .+ 0.01 .* p
+end
+```
+
+## Incomplete or nothing gradients
+
+If the gradient is not available for some parameters, or branches of the model,
+`update` will not take an optimisation step for those parameters.
+This is the case when the gradient is `nothing` or a subtype of `ChainRules.AbstractZero`.
+
+For stateful optimisers, skipping an update is generally not the same as updating with a zero gradient.
+For example, in the case of Adam, the momentum and variance are updated even if the gradient is zero:
+
+```julia-repl
+julia> x = (a = ones(2), b = ones(2))
+(a = [1.0, 1.0], b = [1.0, 1.0])
+
+julia> opt_state = Optimisers.setup(Adam(0.1), x)
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))))
+
+julia> g = (; a = ones(2), b = ones(2));  # First an update with a non-zero gradient to increase the momentum and variance
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` are the same
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+
+julia> g = (; a = zeros(2), b = nothing);  # Now an update with a zero gradient for a and no gradient for b
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` differ
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+```
+
+## Usage with [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl)
+
+Enzyme.jl is a new automatic differentiation package, an alternative to Zygote.jl.
+It likes to store the model and the gradient together, as an object `Duplicated(x, dx)`.
+
+Optimisers.jl now has some methods to handle this:
+* `update!(opt_state, Duplicated(model, grad))` uses the gradient to update both the model and the optimiser state, and
+* `setup(::AbstractRule, ::Duplicated)` ignores the gradient and returns `setup(rule, model)`.
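
As an illustration of those two methods, a minimal sketch mirroring the doctest in the new extension file below (values follow that doctest; EnzymeCore must be loaded for these methods to exist):

```julia
using Optimisers, EnzymeCore

# Enzyme's container holding a model (here just a vector) together with its gradient.
x_dx = Duplicated(Float16[1, 2, 3], Float16[1, 0, -4])

st = Optimisers.setup(Momentum(1/9), x_dx)  # acts only on the model, ignores the gradient

Optimisers.update!(st, x_dx)  # reads the stored gradient, mutates both the model and `st`
```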
60 changes: 60 additions & 0 deletions ext/OptimisersEnzymeCoreExt.jl
@@ -0,0 +1,60 @@
module OptimisersEnzymeCoreExt

import Optimisers: trainable, setup, update!, isnumeric, AbstractRule, _setup
import EnzymeCore: Duplicated, Const

using Functors: fmapstructure

trainable(x::Duplicated) = (; val = x.val)
trainable(x::Const) = (;)

"""
    setup(rule::AbstractRule, model_grad::Duplicated)

For use with Enzyme's Duplicated, this just calls `setup(rule, model_grad.val)`.
"""
setup(rule::AbstractRule, model_grad::Duplicated) = setup(rule, model_grad.val)

_setup(rule, x::Duplicated; cache) = throw(ArgumentError(
  """Objects of type `Duplicated` are only supported by Optimisers.jl at top level,
  they may not appear deep inside other objects."""
))

"""
    update!(opt_state, model_grad::Duplicated)

For use with Enzyme's `Duplicated`, which holds both a model/parameters
and the corresponding gradient.

# Example

```jldoctest
julia> using Optimisers, EnzymeCore

julia> x_dx = Duplicated(Float16[1,2,3], Float16[1,0,-4])
Duplicated{Vector{Float16}}(Float16[1.0, 2.0, 3.0], Float16[1.0, 0.0, -4.0])

julia> st = Optimisers.setup(Momentum(1/9), x_dx)  # acts only on x not on dx
Leaf(Momentum(0.111111, 0.9), Float16[0.0, 0.0, 0.0])

julia> Optimisers.update!(st, x_dx)  # mutates both arguments

julia> x_dx
Duplicated{Vector{Float16}}(Float16[0.8887, 2.0, 3.445], Float16[1.0, 0.0, -4.0])

julia> st
Leaf(Momentum(0.111111, 0.9), Float16[0.1111, 0.0, -0.4443])
```
"""
function update!(opt_state, model_grad::Duplicated)
  _, _ = update!(opt_state, model_grad.val, _grad_or_nothing(model_grad))
  nothing
end

# This function strips the returned gradient to be Zygote-like,
# most importantly prune=nothing removes 2nd appearance of shared gradient to avoid double-counting.
_grad_or_nothing(dup::Duplicated) = fmapstructure(_grad_or_nothing, dup.dval; prune=nothing)
_grad_or_nothing(::Const) = nothing
_grad_or_nothing(x) = isnumeric(x) ? x : nothing

end
2 changes: 2 additions & 0 deletions src/Optimisers.jl
@@ -25,6 +25,8 @@ export Descent, Adam, Momentum, Nesterov, Rprop, RMSProp,
        WeightDecay, SignDecay, ClipGrad, ClipNorm, OptimiserChain, Lion,
        AccumGrad
 
+VERSION >= v"1.11.0-DEV.469" && eval(Meta.parse("public apply!, init, setup, update, update!"))
+
 ###
 ### one-array functions
 ###
12 changes: 10 additions & 2 deletions src/interface.jl
@@ -103,13 +103,21 @@
 subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
 subtract!(x, x̄::Zero) = x
 
+# If we get Zero from AD on a leaf we skip the optimizer step. See
+# https://github.com/FluxML/Optimisers.jl/issues/140
+_grads!(dict::IdDict, ℓ::Leaf, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, ℓ::Leaf, x, x̄s...)
   x̄s₀ = get(dict, ℓ, map(_ -> ZeroTangent(), x̄s))
   dict[ℓ] = map(+, x̄s, x̄s₀)  # adding Zero should be free. Lazy accumulation broadcasted(+, x̄, x̄₀) also possible.
   nothing
 end
 
+# If we get Zero from AD in correspondence of a non-leaf node
+# we end the recursion. The optimizer step won't be taken.
+# https://github.com/FluxML/Optimisers.jl/issues/140
+_grads!(dict::IdDict, t, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, tree, x, x̄s...)
   # The only reason _grads! takes model is that functor(typeof(x), base(x̄)) may differ from
   # functor(typeof(tree), base(x̄)), for things like Transpose
@@ -225,12 +233,12 @@ onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x)
 nonneg(η::Real) = η < 0 ? throw(DomainError(η, "the learning rate cannot be negative")) : η
 
 """
-@def struct Rule; eta = 0.1; beta = (0.7, 0.8); end
+    @def struct Rule; eta = 0.1; beta = (0.7, 0.8); end
 Helper macro for defining rules with default values.
 The types of the literal values are used in the `struct`,
 like this:
-```
+```julia
 struct Rule
   eta::Float64
   beta::Tuple{Float64, Float64}
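
The new `_grads!` methods above are what make a missing gradient skip the optimiser step entirely (see issue 140 referenced in the comments). A small user-level sketch of the behaviour they support, using names from the documentation example earlier; the values are only illustrative:

```julia
using Optimisers

x  = (a = ones(2), b = ones(2))
st = Optimisers.setup(Momentum(0.1), x)

# `b` has no gradient (`nothing`, or a ChainRules zero caught by the new methods),
# so no step is taken for it: neither its parameters nor its optimiser state change.
g = (a = fill(0.5, 2), b = nothing)
Optimisers.update!(st, x, g);
```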
(Diffs for the remaining 4 changed files are not shown.)
