From e1e5b4cec395151246247d9c37cdec43eae8fc24 Mon Sep 17 00:00:00 2001 From: abhro <5664668+abhro@users.noreply.github.com> Date: Sun, 27 Oct 2024 01:31:23 -0400 Subject: [PATCH 01/13] Fix macro signature in docstring (#184) --- src/interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/interface.jl b/src/interface.jl index ac9b90b..7341268 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -225,12 +225,12 @@ onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x) nonneg(η::Real) = η < 0 ? throw(DomainError(η, "the learning rate cannot be negative")) : η """ - @def struct Rule; eta = 0.1; beta = (0.7, 0.8); end + @def struct Rule; eta = 0.1; beta = (0.7, 0.8); end Helper macro for defining rules with default values. The types of the literal values are used in the `struct`, like this: -``` +```julia struct Rule eta::Float64 beta::Tuple{Float64, Float64} From 1443a6e893b82bf8ba5aa6180bdbbb9ac9051494 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 27 Oct 2024 09:54:59 +0100 Subject: [PATCH 02/13] fix docs (#185) --- docs/src/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.md b/docs/src/api.md index 378bf72..434ee70 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -85,4 +85,5 @@ It is defined in Functors.jl and re-exported by Optimisers.jl here for convenien Functors.KeyPath Functors.haskeypath Functors.getkeypath +Functors.setkeypath! ``` From fc65256f5ccc09c152d1a59595830de758babb67 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:13:27 +0100 Subject: [PATCH 03/13] Update CompatHelper.yml --- .github/workflows/CompatHelper.yml | 43 +++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index ce8d353..8ad0284 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,16 +1,45 @@ name: CompatHelper - on: schedule: - - cron: '00 00 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v2 + with: + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} From 74f8bc98e7fca4b75211b8234b90fd290c2c1038 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:28:03 +0100 Subject: [PATCH 04/13] allow Functor 0.5 (#186) --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Project.toml b/Project.toml index dab1f6e..d204c63 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] ChainRulesCore = "1" -Functors = "0.4.9" +Functors = "0.4.9, 0.5" Statistics = "1" Zygote = "0.6.40" julia = "1.6" From 0ae05d6bd7e5a2f37c5e31ab82b9919f1c294365 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Tue, 5 Nov 2024 06:28:22 +0100 Subject: [PATCH 05/13] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d204c63..41c9709 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" authors = ["Mike J Innes "] -version = "0.3.3" +version = "0.3.4" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" From bb71298242a7da4e60b00b27dcfa613370bfe78f Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:32:04 -0500 Subject: [PATCH 06/13] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe8fb9f..6aecb4b 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Optimisers.jl defines many standard gradient-based optimisation rules, and tools for applying them to deeply nested models. -This is the future of training for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks, -and the present for [Lux.jl](https://github.com/avik-pal/Lux.jl). +This was written as the new training system for [Flux.jl](https://github.com/FluxML/Flux.jl) neural networks, +and also used by [Lux.jl](https://github.com/avik-pal/Lux.jl). But it can be used separately on any array, or anything else understood by [Functors.jl](https://github.com/FluxML/Functors.jl). ## Installation From 2a1b2ed1a3ef8c7605ed2896ec552978f9148202 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Wed, 6 Nov 2024 01:31:16 +0100 Subject: [PATCH 07/13] make docstrings consistent (#187) * fix docstrings * address the review comments --- src/rules.jl | 91 ++++++++++++++++++++++++++++++---------------------- src/utils.jl | 1 + 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/rules.jl b/src/rules.jl index f3df9d6..bc9c099 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -8,7 +8,7 @@ """ Descent(η = 1f-1) - Descent(; eta) + Descent(; [eta]) Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`. @@ -20,12 +20,13 @@ For each parameter `p` and its gradient `dp`, this runs `p -= η*dp`. struct Descent{T} <: AbstractRule eta::T end + Descent(; eta = 1f-1) = Descent(eta) init(o::Descent, x::AbstractArray) = nothing function apply!(o::Descent, state, x, dx) - η = convert(float(eltype(x)), o.eta) + η = ofeltype(x, o.eta) return state, @lazy dx * η # @lazy creates a Broadcasted, will later fuse with x .= x .- dx end @@ -64,6 +65,8 @@ end """ Nesterov(η = 0.001, ρ = 0.9) + Nesterov(; [eta, rho]) + Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. @@ -153,27 +156,26 @@ end """ Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) + Rprop(; [eta, ell, gamma]) Optimizer using the [Rprop](https://ieeexplore.ieee.org/document/298623) algorithm. A full-batch learning algorithm that depends only on the sign of the gradient. 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Scaling factors (`ℓ::Tuple`): Multiplicative increase and decrease factors.
+- Scaling factors (`ℓ::Tuple == ell`): Multiplicative increase and decrease factors.
-- Step sizes (`Γ::Tuple`): Mminimal and maximal allowed step sizes.
+- Step sizes (`Γ::Tuple == gamma`): Minimal and maximal allowed step sizes.
 """
-struct Rprop{T} <: AbstractRule
-  eta::T
-  ell::Tuple{T,T}
-  gamma::Tuple{T,T}
+@def struct Rprop <: AbstractRule
+  eta = 1f-3
+  ell = (5f-1, 1.2f0)
+  gamma = (1f-6, 50f0)
 end
 
-Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0)) = Rprop{typeof(η)}(η, ℓ, Γ)
-
 init(o::Rprop, x::AbstractArray) = (zero(x), onevalue(o.eta, x))
 
 function apply!(o::Rprop, state, x::AbstractArray{T}, dx) where T
@@ -193,15 +195,16 @@ end
 
 """
     Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    Adam(; [eta, beta, epsilon])
 
 [Adam](https://arxiv.org/abs/1412.6980) optimiser.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
-- Machine epsilon (`ϵ`): Constant to prevent division by zero
+- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
   (no need to change default)
 """
 @def struct Adam <: AbstractRule
@@ -225,12 +228,13 @@ end
 
 """
     Lion(η = 0.001, β = (0.9, 0.999))
+    Lion(; [eta, beta])
 
 [Lion](https://arxiv.org/abs/2302.06675) optimiser.
 
 # Parameters
-- Learning rate (`η`): Magnitude by which gradients are updating the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Learning rate (`η == eta`): Magnitude by which gradients are updating the weights.
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
 """
 @def struct Lion <: AbstractRule
@@ -254,15 +258,16 @@ end
 
 """
     RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    RAdam(; [eta, beta, epsilon])
 
 [Rectified Adam](https://arxiv.org/abs/1908.03265) optimizer.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
+- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the
   second (β2) momentum estimate.
-- Machine epsilon (`ϵ`): Constant to prevent division by zero
+- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero
   (no need to change default)
 """
 @def struct RAdam <: AbstractRule
@@ -294,15 +299,16 @@ end
 
 """
     AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+    AdaMax(; [eta, beta, epsilon])
 
 [AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.
 
 # Parameters
-- Learning rate (`η`): Amount by which gradients are discounted before updating
+- Learning rate (`η == eta`): Amount by which gradients are discounted before updating
   the weights.
-- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaMax <: AbstractRule @@ -326,16 +332,17 @@ end """ OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8) + OAdam(; [eta, beta, epsilon]) [OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam) is a variant of Adam adding an "optimistic" term suitable for adversarial training. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct OAdam <: AbstractRule @@ -361,15 +368,16 @@ end """ AdaGrad(η = 0.1, ϵ = 1e-8) + AdaGrad(; [eta, epsilon]) [AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimizer. It has parameter specific learning rates based on how frequently it is updated. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaGrad <: AbstractRule @@ -391,14 +399,15 @@ end """ AdaDelta(ρ = 0.9, ϵ = 1e-8) + AdaDelta(; [rho, epsilon]) [AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning. # Parameters -- Rho (`ρ`): Factor by which the gradient is decayed at each time step. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Rho (`ρ == rho`): Factor by which the gradient is decayed at each time step. +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaDelta <: AbstractRule @@ -422,16 +431,17 @@ end """ AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8) + AMSGrad(; [eta, beta, epsilon]) The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam optimiser. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. 
-- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AMSGrad <: AbstractRule @@ -457,16 +467,17 @@ end """ NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8) + NAdam(; [eta, beta, epsilon]) [NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam. Parameters don't need tuning. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct NAdam <: AbstractRule @@ -515,16 +526,17 @@ AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0, epsilon = 1e-8) = """ AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16) + AdaBelief(; [eta, beta, epsilon]) The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known Adam optimiser. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating +- Learning rate (`η == eta`): Amount by which gradients are discounted before updating the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the +- Decay of momentums (`β::Tuple == beta`): Exponential decay for the first (β1) and the second (β2) momentum estimate. -- Machine epsilon (`ϵ::Float32`): Constant to prevent division by zero +- Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) """ @def struct AdaBelief <: AbstractRule @@ -548,6 +560,7 @@ end """ WeightDecay(λ = 5e-4) + WeightDecay(; [lambda]) Implements ``L_2`` regularisation, also known as ridge regression, when composed with other rules as the first transformation in an [`OptimiserChain`](@ref). @@ -585,6 +598,7 @@ function adjust(r::WeightDecay; gamma = nothing, kw...) """ SignDecay(λ = 1e-3) + SignDecay(; [lambda]) Implements ``L_1`` regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an [`OptimiserChain`](@ref). @@ -615,6 +629,7 @@ end """ ClipGrad(δ = 10) + ClipGrad(; [delta]) Restricts every gradient component to obey `-δ ≤ dx[i] ≤ δ`. diff --git a/src/utils.jl b/src/utils.jl index 7c6c95b..12a19dd 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -13,3 +13,4 @@ foreachvalue(f, x::Dict, ys...) = foreach(pairs(x)) do (k, v) f(v, (get(y, k, nothing) for y in ys)...) 
 end
 
+ofeltype(x, y) = convert(float(eltype(x)), y)

From e5d187c9c2ce22efe69fa6f3112ddc110d73ba97 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Wed, 6 Nov 2024 01:48:08 +0100
Subject: [PATCH 08/13] doc stable in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6aecb4b..fa318b2 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # Optimisers.jl
 
-
+[![][docs-stable-img]][docs-stable-url]
 [![][docs-dev-img]][docs-dev-url]
 [![][action-img]][action-url]
 [![][coverage-img]][coverage-url]

From 2da6d7fe64bebb84c6d2592a7753c53e36c659cb Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 7 Nov 2024 08:11:20 +0100
Subject: [PATCH 09/13] docs for nothing behavior and for walking a tree with
 keypath (#191)

* cl/zero

* Update docs/src/index.md

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>

---------

Co-authored-by: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
---
 docs/src/index.md | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/interface.jl  |  8 ++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 30ef5c4..3cb32f8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -311,8 +311,50 @@ julia> trainables(model)
 Float32[-0.8764882 0.40812716 0.1919528; -0.9123545 -0.4462516 0.6751252]
 Float32[0.0, 0.0]
 
-julia> l2reg(model) = sum([sum(abs2,p) for p in trainables(model)]);
+julia> l2reg(model) = sum([sum(abs2, p) for p in trainables(model)]);
 
 julia> g = gradient(l2reg, model)[1];
 ```
 Notice that the `BatchNorm` layer has two trainable parameters, `γ` and `β`, which are included in the list, while the `μ` and `σ²` buffers are not.
+
+Sometimes one wants to iterate over all trainable parameters in a model and the corresponding parameters of a matched structure such as a gradient or the moving average of the model.
+This can be done using `trainables(model, path=true)`. For instance, here is how to update the parameters
+of a moving average model with the parameters of the model:
+
+```julia
+for (kp, p_avg) in trainables(model_avg, path=true)
+    p = getkeypath(model, kp)
+    p_avg .= 0.99 .* p_avg .+ 0.01 .* p
+end
+```
+
+## Incomplete or nothing gradients
+
+If the gradient is not available for some parameters, or branches of the model,
+`update` will not take an optimisation step for those parameters.
+This is the case when the gradient is `nothing` or a subtype of `ChainRules.AbstractZero`.
+
+For stateful optimisers, skipping an update is generally not the same as updating with a zero gradient.
+For example, in the case of Adam, the momentum and variance are updated even if the gradient is zero:
+
+```julia-repl
+julia> x = (a = ones(2), b = ones(2))
+(a = [1.0, 1.0], b = [1.0, 1.0])
+
+julia> opt_state = Optimisers.setup(Adam(0.1), x)
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))))
+
+julia> g = (; a = ones(2), b = ones(2));  # First an update with a non-zero gradient to increase the momentum and variance
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` are the same
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+
+julia> g = (; a = zeros(2), b = nothing);  # Now an update with a zero gradient for a and no gradient for b
+
+julia> Optimisers.update!(opt_state, x, g);
+
+julia> opt_state  # the states in `a` and `b` differ
+(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))
+```
diff --git a/src/interface.jl b/src/interface.jl
index 7341268..e44dec1 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -103,13 +103,21 @@ end
 subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
 subtract!(x, x̄::Zero) = x
 
+# If we get Zero from AD on a leaf we skip the optimizer step. See
+# https://github.com/FluxML/Optimisers.jl/issues/140
 _grads!(dict::IdDict, ℓ::Leaf, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, ℓ::Leaf, x, x̄s...)
   x̄s₀ = get(dict, ℓ, map(_ -> ZeroTangent(), x̄s))
   dict[ℓ] = map(+, x̄s, x̄s₀)  # adding Zero should be free. Lazy accumulation broadcasted(+, x̄, x̄₀) also possible.
   nothing
 end
+
+# If we get Zero from AD at a non-leaf node,
+# we end the recursion. The optimizer step won't be taken.
+# https://github.com/FluxML/Optimisers.jl/issues/140
 _grads!(dict::IdDict, t, x, ::Zero...) = nothing
+
 function _grads!(dict::IdDict, tree, x, x̄s...)
   # The only reason _grads! takes model is that functor(typeof(x), base(x̄)) may differ from
   # functor(typeof(tree), base(x̄)), for things like Transpose

From 4a78a55f55e098a71fc96b2c2d91bb75b7a926cb Mon Sep 17 00:00:00 2001
From: Carlo Lucibello
Date: Thu, 7 Nov 2024 08:13:54 +0100
Subject: [PATCH 10/13] fix epsilon for Float16 (#190)

---
 .gitignore        |  1 +
 src/Optimisers.jl |  2 ++
 src/rules.jl      | 21 ++++++++++-----------
 src/utils.jl      |  6 ++++++
 test/rules.jl     | 10 ++++++++++
 5 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 952f7ce..763dd6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ Manifest.toml
 .vscode/
 docs/build/
 .DS_Store
+/test.jl
\ No newline at end of file
diff --git a/src/Optimisers.jl b/src/Optimisers.jl
index 2e115c4..99fc162 100644
--- a/src/Optimisers.jl
+++ b/src/Optimisers.jl
@@ -25,6 +25,8 @@ export Descent, Adam, Momentum, Nesterov, Rprop, RMSProp,
        WeightDecay, SignDecay, ClipGrad, ClipNorm, OptimiserChain, Lion,
        AccumGrad
 
+VERSION >= v"1.11.0-DEV.469" && eval(Meta.parse("public apply!, init, setup, update, update!"))
+
 ###
 ### one-array functions
 ###
diff --git a/src/rules.jl b/src/rules.jl
index bc9c099..b4fbd2a 100644
--- a/src/rules.jl
+++ b/src/rules.jl
@@ -130,7 +130,7 @@ RMSProp(; eta = 0.001, rho = 0.9, epsilon = 1e-8, kw...)
= RMSProp(eta, rho, eps init(o::RMSProp, x::AbstractArray) = (zero(x), o.centred ? zero(x) : false) function apply!(o::RMSProp, state, x::AbstractArray{T}, dx) where T - η, ρ, ϵ = T(o.eta), T(o.rho), T(o.epsilon) + η, ρ, ϵ = T(o.eta), T(o.rho), _eps(T, o.epsilon) quad, lin = state @.. quad = ρ * quad + (1 - ρ) * abs2(dx) @@ -216,7 +216,7 @@ end init(o::Adam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::Adam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -279,7 +279,7 @@ end init(o::RAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta), 1) function apply!(o::RAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) ρ∞ = 2/(1-β[2]) - 1 |> real mt, vt, βt, t = state @@ -320,7 +320,7 @@ end init(o::AdaMax, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::AdaMax, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, ut, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -354,7 +354,7 @@ end init(o::OAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta), zero(x)) function apply!(o::OAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt, term = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -388,7 +388,7 @@ end init(o::AdaGrad, x::AbstractArray) = onevalue(o.epsilon, x) function apply!(o::AdaGrad, state, x::AbstractArray{T}, dx) where T - η, ϵ = T(o.eta), T(o.epsilon) + η, ϵ = T(o.eta), _eps(T, o.epsilon) acc = state @.. acc = acc + abs2(dx) @@ -418,7 +418,7 @@ end init(o::AdaDelta, x::AbstractArray) = (zero(x), zero(x)) function apply!(o::AdaDelta, state, x::AbstractArray{T}, dx) where T - ρ, ϵ = T(o.rho), T(o.epsilon) + ρ, ϵ = T(o.rho), _eps(T, o.epsilon) acc, Δacc = state @.. acc = ρ * acc + (1 - ρ) * abs2(dx) @@ -454,7 +454,7 @@ init(o::AMSGrad, x::AbstractArray) = (onevalue(o.epsilon, x), onevalue(o.epsilon, x), onevalue(o.epsilon, x)) function apply!(o::AMSGrad, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, v̂t = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -489,8 +489,7 @@ end init(o::NAdam, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::NAdam, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) - + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, vt, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx @@ -548,7 +547,7 @@ end init(o::AdaBelief, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta)) function apply!(o::AdaBelief, state, x::AbstractArray{T}, dx) where T - η, β, ϵ = T(o.eta), T.(o.beta), T(o.epsilon) + η, β, ϵ = T(o.eta), T.(o.beta), _eps(T, o.epsilon) mt, st, βt = state @.. mt = β[1] * mt + (1 - β[1]) * dx diff --git a/src/utils.jl b/src/utils.jl index 12a19dd..8f66746 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -14,3 +14,9 @@ foreachvalue(f, x::Dict, ys...) 
= foreach(pairs(x)) do (k, v) end ofeltype(x, y) = convert(float(eltype(x)), y) + +_eps(T::Type{<:AbstractFloat}, e) = T(e) +# catch complex and integers +_eps(T::Type{<:Number}, e) = _eps(real(float(T)), e) +# avoid small e being rounded to zero +_eps(T::Type{Float16}, e) = e == 0 ? T(0) : max(T(1e-7), T(e)) diff --git a/test/rules.jl b/test/rules.jl index 52a3580..9068fa1 100644 --- a/test/rules.jl +++ b/test/rules.jl @@ -267,3 +267,13 @@ end tree, x4 = Optimisers.update(tree, x3, g4) @test x4 ≈ x3 end + +@testset "Float16 epsilon" begin + # issue https://github.com/FluxML/Optimisers.jl/issues/167 + x = Float16[0.579, -0.729, 0.5493] + δx = Float16[-0.001497, 0.0001875, -0.013176] + + os = Optimisers.setup(Adam(1e-4), x); + os, x = Optimisers.update(os, x, δx) + @test x ≈ Float16[1.835, -0.886, 0.5493] rtol=1e-3 +end From 38c9d622c4a9979190b9c4c000604267aac39239 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Thu, 7 Nov 2024 08:14:53 +0100 Subject: [PATCH 11/13] Add the option couple to AdamW and set the default to match pytorch (#188) --- Project.toml | 2 +- README.md | 6 ++++++ src/rules.jl | 56 ++++++++++++++++++++++++++++++++++++++++++------ test/rules.jl | 2 +- test/runtests.jl | 3 +-- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 41c9709..0a19f49 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" authors = ["Mike J Innes "] -version = "0.3.4" +version = "0.4.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" diff --git a/README.md b/README.md index fa318b2..e15155a 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,12 @@ This was written as the new training system for [Flux.jl](https://github.com/Flu and also used by [Lux.jl](https://github.com/avik-pal/Lux.jl). But it can be used separately on any array, or anything else understood by [Functors.jl](https://github.com/FluxML/Functors.jl). + +> [!WARNING] +> With version 0.4 the default update rule for AdamW has changed to match the pytorch implementation. +> The previous rule, which is closer to the original paper, can be obtained by setting `AdamW(..., couple=false)`. +> See [this issue](https://github.com/FluxML/Flux.jl/issues/2433) for more details. + ## Installation ```julia diff --git a/src/rules.jl b/src/rules.jl index b4fbd2a..0063d70 100644 --- a/src/rules.jl +++ b/src/rules.jl @@ -501,8 +501,8 @@ function apply!(o::NAdam, state, x::AbstractArray{T}, dx) where T end """ - AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8) - AdamW(; [eta, beta, lambda, epsilon]) + AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8; couple = true) + AdamW(; [eta, beta, lambda, epsilon, couple]) [AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its weight decay regularization. @@ -516,12 +516,54 @@ Implemented as an [`OptimiserChain`](@ref) of [`Adam`](@ref) and [`WeightDecay`] - Weight decay (`λ == lambda`): Controls the strength of ``L_2`` regularisation. - Machine epsilon (`ϵ == epsilon`): Constant to prevent division by zero (no need to change default) -""" -AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8) = - OptimiserChain(Adam(η, β, ϵ), WeightDecay(λ)) +- Keyword `couple`: If `true`, the weight decay is coupled with the learning rate, as in pytorch's AdamW. + This corresponds to an update of the form `x = x - η * (dx + λ * x)`, where `dx` is the + update from Adam with learning rate 1. 
+  If `false`, the weight decay is decoupled from the learning rate, in the spirit of the original paper.
+  This corresponds to an update of the form `x = x - η * dx - λ * x`.
+  Default is `true`.
+
+!!! warning "Breaking change in v0.4"
+    With version 0.4 the default update rule for AdamW has changed to match the pytorch implementation.
+    The previous rule, which is closer to the original paper, can be obtained by setting `AdamW(..., couple=false)`.
+    See [this issue](https://github.com/FluxML/Flux.jl/issues/2433) for more details.
+"""
+struct AdamW{T1,T2,T3,T4} <: AbstractRule
+  eta::T1
+  beta::T2
+  lambda::T3
+  epsilon::T4
+  couple::Bool
+end
+
+function AdamW(η, β = (0.9, 0.999), λ = 0.0, ϵ = 1e-8; couple::Bool = true)
+  η < 0 && throw(DomainError(η, "the learning rate cannot be negative"))
+  AdamW(η, β, λ, ϵ, couple)
+end
+
+AdamW(; eta = 0.001, beta = (0.9, 0.999), lambda = 0.0, epsilon = 1e-8, kw...) =
+  AdamW(eta, beta, lambda, epsilon; kw...)
+
+init(o::AdamW, x::AbstractArray{T}) where T = (zero(x), zero(x), T.(o.beta))
+
+function apply!(o::AdamW, state, x::AbstractArray{T}, dx) where T
+  η, β, ϵ, λ = T(o.eta), T.(o.beta), T(o.epsilon), T(o.lambda)
+  mt, vt, βt = state
+
+  # standard Adam update with learning rate eta=1
+  @.. mt = β[1] * mt + (1 - β[1]) * dx
+  @.. vt = β[2] * vt + (1 - β[2]) * abs2(dx)
+  dx′ = @lazy mt / (1 - βt[1]) / (sqrt(vt / (1 - βt[2])) + ϵ)
+
+  # apply learning rate and weight decay
+  if o.couple
+    dx′′ = @lazy η * (dx′ + λ * x)
+  else
+    dx′′ = @lazy η * dx′ + λ * x
+  end
+
+  return (mt, vt, βt .* β), dx′′
+end
 
 """
     AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)
diff --git a/test/rules.jl b/test/rules.jl
index 9068fa1..499902c 100644
--- a/test/rules.jl
+++ b/test/rules.jl
@@ -15,7 +15,7 @@ RULES = [
   OptimiserChain(ClipGrad(0.5), Momentum()),
   OptimiserChain(WeightDecay(), OAdam(), ClipGrad(1)),
   # Not the default:
-  RMSProp(centred = true),
+  RMSProp(centred = true), AdamW(couple=false),
 ]
 
 name(o) = typeof(o).name.name  # just for printing testset headings
diff --git a/test/runtests.jl b/test/runtests.jl
index fc0fe57..ae2d9d0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -332,8 +332,7 @@ end
 
   @testset "keyword arguments" begin
     @test Nesterov(rho=0.8, eta=0.1) === Nesterov(0.1, 0.8)
-    @test AdamW(lambda=0.3).opts[1] == Adam()
-    @test AdamW(lambda=0.3).opts[2] == WeightDecay(0.3)
+    @test AdamW(lambda=0.3, eta=0.1) == AdamW(0.1, (0.9, 0.999), 0.3, 1.0e-8)
   end
 
   @testset "forgotten gradient" begin

From 26395239c0307fadc4d1143d9ae1bc1a6cb2711e Mon Sep 17 00:00:00 2001
From: Michael Abbott <32575566+mcabbott@users.noreply.github.com>
Date: Fri, 8 Nov 2024 16:08:22 -0500
Subject: [PATCH 12/13] Add `Duplicated` methods (#192)

* add Duplicated methods

* add test

* test for shared params + minimal docs

* remove 1.6 CI

* indent by two spaces

* fix doctest
---
 .github/workflows/ci.yml       |  2 +-
 Project.toml                   | 14 ++++++--
 docs/src/index.md              |  9 +++++
 ext/OptimisersEnzymeCoreExt.jl | 60 ++++++++++++++++++++++++++++++++++
 test/runtests.jl               | 24 +++++++++++++-
 5 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 ext/OptimisersEnzymeCoreExt.jl

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c3b18c..629d90b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,9 +15,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
           - '1'
           - 'nightly'
+          - "1.10"
         os:
           - ubuntu-latest
         arch:
diff --git a/Project.toml b/Project.toml index 0a19f49..a60b58f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,21 +1,29 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" +version = "0.4.1" authors = ["Mike J Innes "] -version = "0.4.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +[weakdeps] +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" + +[extensions] +OptimisersEnzymeCoreExt = "EnzymeCore" + [compat] ChainRulesCore = "1" +EnzymeCore = "0.8.5" Functors = "0.4.9, 0.5" Statistics = "1" Zygote = "0.6.40" -julia = "1.6" +julia = "1.10" [extras] StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -23,4 +31,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "StaticArrays", "Zygote"] +test = ["Test", "EnzymeCore", "StaticArrays", "Zygote"] diff --git a/docs/src/index.md b/docs/src/index.md index 3cb32f8..a595d70 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -358,3 +358,12 @@ julia> Optimisers.update!(opt_state, x, g); julia> opt_state # the state in `a` and `b` differ (a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001)))) ``` + +## Usage with [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) + +Enzyme.jl is a new automatic differentiation package, an alternative to Zygote.jl. +It likes to store the model and the gradient together, as an object `Duplicated(x, dx)`. + +Optimisers.jl now has some methods to handle this: +* `update!(opt_state, Duplicated(model, grad))` uses the gradient to update both the model and the optimiser state, and +* `setup(::AbstractRule, ::Duplicated)` ignores the gradient and returns `setup(rule, model)`. diff --git a/ext/OptimisersEnzymeCoreExt.jl b/ext/OptimisersEnzymeCoreExt.jl new file mode 100644 index 0000000..a1c1ab9 --- /dev/null +++ b/ext/OptimisersEnzymeCoreExt.jl @@ -0,0 +1,60 @@ +module OptimisersEnzymeCoreExt + +import Optimisers: trainable, setup, update!, isnumeric, AbstractRule, _setup +import EnzymeCore: Duplicated, Const + +using Functors: fmapstructure + +trainable(x::Duplicated) = (; val = x.val) +trainable(x::Const) = (;) + +""" + setup(rule::AbstractRule, model_grad::Duplicated) + +For use with Enzyme's Duplicated, this just calls `setup(rule, model_grad.val)`. +""" +setup(rule::AbstractRule, model_grad::Duplicated) = setup(rule, model_grad.val) + +_setup(rule, x::Duplicated; cache) = throw(ArgumentError( + """Objects of type `Duplicated` are only supported by Optimisers.jl at top level, + they may not appear deep inside other objects.""" +)) + +""" + update!(opt_state, model_grad::Duplicated) + +For use with Enzyme's `Duplicated`, which holds both a model/parameters +and the corresponding gradient. 
+ +# Example + +```jldoctest +julia> using Optimisers, EnzymeCore + +julia> x_dx = Duplicated(Float16[1,2,3], Float16[1,0,-4]) +Duplicated{Vector{Float16}}(Float16[1.0, 2.0, 3.0], Float16[1.0, 0.0, -4.0]) + +julia> st = Optimisers.setup(Momentum(1/9), x_dx) # acts only on x not on dx +Leaf(Momentum(0.111111, 0.9), Float16[0.0, 0.0, 0.0]) + +julia> Optimisers.update!(st, x_dx) # mutates both arguments + +julia> x_dx +Duplicated{Vector{Float16}}(Float16[0.8887, 2.0, 3.445], Float16[1.0, 0.0, -4.0]) + +julia> st +Leaf(Momentum(0.111111, 0.9), Float16[0.1111, 0.0, -0.4443]) +``` +""" +function update!(opt_state, model_grad::Duplicated) + _, _ = update!(opt_state, model_grad.val, _grad_or_nothing(model_grad)) + nothing +end + +# This function strips the returned gradient to be Zygote-like, +# most importantly prune=nothing removes 2nd appearance of shared gradient to avoid double-counting. +_grad_or_nothing(dup::Duplicated) = fmapstructure(_grad_or_nothing, dup.dval; prune=nothing) +_grad_or_nothing(::Const) = nothing +_grad_or_nothing(x) = isnumeric(x) ? x : nothing + +end diff --git a/test/runtests.jl b/test/runtests.jl index ae2d9d0..956aa04 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ using Optimisers -using ChainRulesCore, Functors, StaticArrays, Zygote +using ChainRulesCore, Functors, StaticArrays, Zygote, EnzymeCore using LinearAlgebra, Statistics, Test, Random using Optimisers: @.., @lazy using Base.Broadcast: broadcasted, instantiate, Broadcasted @@ -534,6 +534,28 @@ end @test Optimisers._norm(bc2, p) isa Float64 end end + + @testset "Enzyme Duplicated" begin + x_dx = Duplicated(Float16[1,2,3], Float16[1,0,-4]) + st = Optimisers.setup(Momentum(1/9), x_dx) # acts only on x not on dx + @test st isa Optimisers.Leaf + @test nothing === Optimisers.update!(st, x_dx) # mutates both arguments + @test x_dx.val ≈ Float16[0.8887, 2.0, 3.445] + + shared = [1.0] + model = (x=shared, y=shared) + grad = deepcopy(model) # Enzyme produces something like this, grad.x === grad.y, already accumulated. + dup = Duplicated(model, model) + st2 = Optimisers.setup(Descent(0.1), model) + Optimisers.update!(st2, dup) + @test model.x ≈ [0.9] + shared .= 1 + Optimisers.update!(st2, model, grad) + model.x ≈ [0.8] # This is wrong, but don't make it a test. + # Ideally, perhaps the 3-arg update! could notice that grad.x===grad.y, and not accumulate the gradient in this case? + + @test_throws ArgumentError Optimisers.setup(Adam(), (; a=[1,2,3.], b=x_dx)) # Duplicated deep inside is not allowed + end end @testset verbose=true "Destructure" begin include("destructure.jl") From d842ddb21303bb256147575d3fda1ab783c617a2 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 9 Nov 2024 08:59:21 +0100 Subject: [PATCH 13/13] remove EnzymeCore dependency --- Project.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index a60b58f..aac006e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,10 @@ name = "Optimisers" uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" -version = "0.4.1" +version = "0.4.2" authors = ["Mike J Innes "] [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
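
---

A short illustration of why PATCH 10 clamps `epsilon` for `Float16`. This is a sketch for readers of the series, not part of any patch; `_eps` is copied from the `src/utils.jl` hunk above, and the printed values assume standard IEEE `Float16` semantics:

```julia
# Float16's smallest positive subnormal is about 6.0e-8, so the default
# epsilon = 1e-8 underflows to exactly zero (the bug in issue #167):
Float16(1e-8) == Float16(0.0)   # true

# The patched helper keeps an exact zero, but clamps small nonzero
# epsilons up to a representable value:
_eps(T::Type{<:AbstractFloat}, e) = T(e)
_eps(T::Type{<:Number}, e) = _eps(real(float(T)), e)         # complex and integers
_eps(T::Type{Float16}, e) = e == 0 ? T(0) : max(T(1e-7), T(e))

_eps(Float16, 1e-8)   # ≈ Float16(1.2e-7), a usable nonzero epsilon
_eps(Float32, 1e-8)   # Float32(1.0e-8), other float types are unchanged
```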
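And a numerical check of the `couple` keyword from PATCH 11. The README warning states that the pre-0.4 rule is recovered with `couple = false`, and PATCH 07 shows that the old `AdamW(η, β, λ, ϵ)` was `OptimiserChain(Adam(η, β, ϵ), WeightDecay(λ))`. A minimal sketch of how one might verify this, using arbitrary toy values:

```julia
using Optimisers   # assumes Optimisers v0.4, i.e. the series above

x  = fill(1.0, 3)   # toy parameters
dx = fill(0.5, 3)   # toy gradient

# Decoupled decay, x .- η .* adam_dir .- λ .* x, as in the original paper:
old_rule = OptimiserChain(Adam(0.1), WeightDecay(0.01))   # the pre-0.4 AdamW
new_rule = AdamW(0.1, (0.9, 0.999), 0.01; couple = false)

_, x_old = Optimisers.update(Optimisers.setup(old_rule, x), copy(x), dx)
_, x_new = Optimisers.update(Optimisers.setup(new_rule, x), copy(x), dx)
x_old ≈ x_new   # expected to hold, up to floating-point rounding

# The new default couples decay and learning rate,
# x .- η .* (adam_dir .+ λ .* x), matching pytorch's AdamW:
pt_rule = AdamW(0.1, (0.9, 0.999), 0.01)
```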