From 64fda6271ac13803b68b21c6fa79d593cf9158fb Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Thu, 7 Nov 2024 07:15:28 +0000 Subject: [PATCH] build based on 38c9d62 --- dev/.documenter-siteinfo.json | 2 +- dev/api/index.html | 66 +++++++++++++++++----------------- dev/index.html | 2 +- dev/objects.inv | Bin 947 -> 946 bytes dev/search_index.js | 2 +- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index 04676a2..223fae9 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.6","generation_timestamp":"2024-11-07T07:14:27","documenter_version":"1.7.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.6","generation_timestamp":"2024-11-07T07:15:24","documenter_version":"1.7.0"}} \ No newline at end of file diff --git a/dev/api/index.html b/dev/api/index.html index a6012e6..d0784de 100644 --- a/dev/api/index.html +++ b/dev/api/index.html @@ -4,22 +4,22 @@ gtag('js', new Date()); gtag('config', 'UA-36890222-9', {'page_path': location.pathname + location.search + location.hash});

Optimisation Rules

Optimisers.DescentType
Descent(η = 1f-1)
-Descent(; [eta])

Classic gradient descent optimiser with learning rate η. For each parameter p and its gradient dp, this runs p -= η*dp.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
source
Optimisers.MomentumType
Momentum(η = 0.01, ρ = 0.9)
-Momentum(; [eta, rho])

Gradient descent optimizer with learning rate η and momentum ρ.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Momentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
source
Optimisers.NesterovType
Nesterov(η = 0.001, ρ = 0.9)
-Nesterov(; [eta, rho])

Gradient descent optimizer with learning rate η and Nesterov momentum ρ.

Parameters

  • Learning rate (η): Amount by which gradients are discounted before updating the weights.
  • Nesterov momentum (ρ): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
source
Optimisers.RpropType
Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0))
-Rprop(; [eta, ell, gamma])

Optimizer using the Rprop algorithm. A full-batch learning algorithm that depends only on the sign of the gradient.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.

  • Scaling factors (ℓ::Tuple == ell): Multiplicative increase and decrease factors.

  • Step sizes (Γ::Tuple == gamma): Minimal and maximal allowed step sizes.

source
Optimisers.RMSPropType
RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred = false)
-RMSProp(; [eta, rho, epsilon, centred])

Optimizer using the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.

Centred RMSProp is a variant which normalises gradients by an estimate of their variance, instead of their second moment.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Momentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
  • Keyword centred (or centered): Indicates whether to use the centred variant of the algorithm.
source
Optimisers.AdamType
Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
-Adam(; [eta, beta, epsilon])

Adam optimiser.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.RAdamType
RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
-RAdam(; [eta, beta, epsilon])

Rectified Adam optimizer.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaMaxType
AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
-AdaMax(; [eta, beta, epsilon])

AdaMax is a variant of Adam based on the ∞-norm.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.OAdamType
OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8)
-OAdam(; [eta, beta, epsilon])

OAdam (Optimistic Adam) is a variant of Adam adding an "optimistic" term suitable for adversarial training.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaGradType
AdaGrad(η = 0.1, ϵ = 1e-8)
-AdaGrad(; [eta, epsilon])

AdaGrad optimizer. It has parameter-specific learning rates, based on how frequently each parameter is updated. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaDeltaType
AdaDelta(ρ = 0.9, ϵ = 1e-8)
-AdaDelta(; [rho, epsilon])

AdaDelta is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning.

Parameters

  • Rho (ρ == rho): Factor by which the gradient is decayed at each time step.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AMSGradType
AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
-AMSGrad(; [eta, beta, epsilon])

The AMSGrad version of the Adam optimiser. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.NAdamType
NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
-NAdam(; [eta, beta, epsilon])

NAdam is a Nesterov variant of Adam. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdamWFunction
AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8)
-AdamW(; [eta, beta, lambda, epsilon])

AdamW is a variant of Adam fixing (as in repairing) its weight decay regularization. Implemented as an OptimiserChain of Adam and WeightDecay.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Weight decay (λ == lambda): Controls the strength of $L_2$ regularisation.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaBeliefType
AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)
-AdaBelief(; [eta, beta, epsilon])

The AdaBelief optimiser is a variant of the well-known Adam optimiser.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.LionType
Lion(η = 0.001, β = (0.9, 0.999))
-Lion(; [eta, beta])

Lion optimiser.

Parameters

  • Learning rate (η == eta): Magnitude by which gradients update the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
source

In addition to the main course, you may wish to order some of these condiments:

Optimisers.AccumGradType
AccumGrad(n::Int)

A rule constructed as OptimiserChain(AccumGrad(n), Rule()) will accumulate gradients for n steps, before applying Rule to the mean of these n gradients.

This is useful for training with effective batch sizes too large for the available memory. Instead of computing the gradient for batch size b at once, compute it for size b/n and accumulate n such gradients.

Example

julia> m = (x=[1f0], y=[2f0]);
+Descent(; [eta])

Classic gradient descent optimiser with learning rate η. For each parameter p and its gradient dp, this runs p -= η*dp.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
source
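For illustration, a minimal sketch of this rule in the usual setup/update workflow (the numbers are made up for the example; a bare array is itself a valid model):

julia> using Optimisers

julia> p = Float32[1, 2, 3];

julia> s = Optimisers.setup(Descent(0.1), p);

julia> s, p = Optimisers.update(s, p, [1, 1, 1]);  # gradient dp = [1, 1, 1]

julia> p  # each entry reduced by η * dp
3-element Vector{Float32}:
 0.9
 1.9
 2.9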
Optimisers.MomentumType
Momentum(η = 0.01, ρ = 0.9)
+Momentum(; [eta, rho])

Gradient descent optimizer with learning rate η and momentum ρ.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Momentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
source
Optimisers.NesterovType
Nesterov(η = 0.001, ρ = 0.9)
+Nesterov(; [eta, rho])

Gradient descent optimizer with learning rate η and Nesterov momentum ρ.

Parameters

  • Learning rate (η): Amount by which gradients are discounted before updating the weights.
  • Nesterov momentum (ρ): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
source
Optimisers.RpropType
Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0))
+Rprop(; [eta, ell, gamma])

Optimizer using the Rprop algorithm. A full-batch learning algorithm that depends only on the sign of the gradient.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.

  • Scaling factors (ℓ::Tuple == ell): Multiplicative increase and decrease factors.

  • Step sizes (Γ::Tuple == gamma): Minimal and maximal allowed step sizes.

source
Optimisers.RMSPropType
RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred = false)
+RMSProp(; [eta, rho, epsilon, centred])

Optimizer using the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.

Centred RMSProp is a variant which normalises gradients by an estimate of their variance, instead of their second moment.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Momentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
  • Keyword centred (or centered): Indicates whether to use the centred variant of the algorithm.
source
Optimisers.AdamType
Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+Adam(; [eta, beta, epsilon])

Adam optimiser.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.RAdamType
RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+RAdam(; [eta, beta, epsilon])

Rectified Adam optimizer.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaMaxType
AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+AdaMax(; [eta, beta, epsilon])

AdaMax is a variant of Adam based on the ∞-norm.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.OAdamType
OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8)
+OAdam(; [eta, beta, epsilon])

OAdam (Optimistic Adam) is a variant of Adam adding an "optimistic" term suitable for adversarial training.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaGradType
AdaGrad(η = 0.1, ϵ = 1e-8)
+AdaGrad(; [eta, epsilon])

AdaGrad optimizer. It has parameter-specific learning rates, based on how frequently each parameter is updated. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdaDeltaType
AdaDelta(ρ = 0.9, ϵ = 1e-8)
+AdaDelta(; [rho, epsilon])

AdaDelta is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning.

Parameters

  • Rho (ρ == rho): Factor by which the gradient is decayed at each time step.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AMSGradType
AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+AMSGrad(; [eta, beta, epsilon])

The AMSGrad version of the Adam optimiser. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.NAdamType
NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)
+NAdam(; [eta, beta, epsilon])

NAdam is a Nesterov variant of Adam. Parameters don't need tuning.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.AdamWType
AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8; couple = true)
+AdamW(; [eta, beta, lambda, epsilon, couple])

AdamW is a variant of Adam fixing (as in repairing) its weight decay regularization. Implemented as an OptimiserChain of Adam and WeightDecay.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Weight decay (λ == lambda): Controls the strength of $L_2$ regularisation.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
  • Keyword couple: If true, the weight decay is coupled with the learning rate, as in PyTorch's AdamW. This corresponds to an update of the form x = x - η * (dx + λ * x), where dx is the update from Adam with learning rate 1. If false, the weight decay is decoupled from the learning rate, in the spirit of the original paper. This corresponds to an update of the form x = x - η * dx - λ * x. Default is true.
Breaking change in v0.4

With version 0.4 the default update rule for AdamW has changed to match the PyTorch implementation. The previous rule, which is closer to the original paper, can be obtained by setting AdamW(..., couple=false). See this issue for more details.

source
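As a plain-arithmetic sketch of the two update forms above (the values of η, λ, x and dx are illustrative, with dx standing in for the Adam step computed with learning rate 1):

julia> η, λ, x, dx = 0.001, 0.1, [1.0], [2.0];

julia> coupled = x .- η .* (dx .+ λ .* x);  # couple = true (default): decay is scaled by η

julia> decoupled = x .- η .* dx .- λ .* x;  # couple = false: decay independent of η

Only with couple = false does changing the learning rate leave the effective strength of the weight decay unchanged.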
Optimisers.AdaBeliefType
AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)
+AdaBelief(; [eta, beta, epsilon])

The AdaBelief optimiser is a variant of the well-known Adam optimiser.

Parameters

  • Learning rate (η == eta): Amount by which gradients are discounted before updating the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
  • Machine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)
source
Optimisers.LionType
Lion(η = 0.001, β = (0.9, 0.999))
+Lion(; [eta, beta])

Lion optimiser.

Parameters

  • Learning rate (η == eta): Magnitude by which gradients update the weights.
  • Decay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.
source

In addition to the main course, you may wish to order some of these condiments:

Optimisers.AccumGradType
AccumGrad(n::Int)

A rule constructed as OptimiserChain(AccumGrad(n), Rule()) will accumulate gradients for n steps, before applying Rule to the mean of these n gradients.

This is useful for training with effective batch sizes too large for the available memory. Instead of computing the gradient for batch size b at once, compute it for size b/n and accumulate n such gradients.

Example

julia> m = (x=[1f0], y=[2f0]);
 
 julia> r = OptimiserChain(AccumGrad(2), WeightDecay(0.01), Descent(0.1));
 
@@ -33,10 +33,10 @@
 julia> Optimisers.update!(s, m, (x=[0], y=[444]));
 
 julia> m  # n=2 gradients applied at once
-(x = Float32[-0.651], y = Float32[-20.202002])
source
Optimisers.ClipNormType
ClipNorm(ω = 10, p = 2; throw = true)

Scales any gradient array for which norm(dx, p) > ω to stay at this threshold (unless p==0).

Throws an error if the norm is infinite or NaN, which you can turn off with throw = false.

Typically composed with other rules using OptimiserChain.

See also ClipGrad.

source
Optimisers.SignDecayType
SignDecay(λ = 1e-3)
-SignDecay(; [lambda])

Implements $L_1$ regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an OptimiserChain.

It does this by adding λ .* sign(x) to the gradient. This is equivalent to adding λ * sum(abs, x) == λ * norm(x, 1) to the loss.

See also WeightDecay for $L_2$ regularisation. They can be used together: OptimiserChain(SignDecay(0.012), WeightDecay(0.034), Adam()) is equivalent to adding 0.012 * norm(x, 1) + 0.017 * norm(x, 2)^2 to the loss function.

Parameters

  • Penalty (λ ≥ 0): Controls the strength of the regularisation.
source
Optimisers.WeightDecayType
WeightDecay(λ = 5e-4)
-WeightDecay(; [lambda])

Implements $L_2$ regularisation, also known as ridge regression, when composed with other rules as the first transformation in an OptimiserChain.

It does this by adding λ .* x to the gradient. This is equivalent to adding λ/2 * sum(abs2, x) == λ/2 * norm(x)^2 to the loss.

See also SignDecay for $L_1$ regularisation.

Parameters

  • Penalty (λ ≥ 0): Controls the strength of the regularisation.
source
Optimisers.OptimiserChainType
OptimiserChain(opts...)

Compose a sequence of optimisers so that each opt in opts updates the gradient, in the order specified.

With an empty sequence, OptimiserChain() is the identity, so update! will subtract the full gradient from the parameters. This is equivalent to Descent(1).

Example

julia> o = OptimiserChain(ClipGrad(1.0), Descent(0.1));
+(x = Float32[-0.651], y = Float32[-20.202002])
source
Optimisers.ClipNormType
ClipNorm(ω = 10, p = 2; throw = true)

Scales any gradient array for which norm(dx, p) > ω to stay at this threshold (unless p==0).

Throws an error if the norm is infinite or NaN, which you can turn off with throw = false.

Typically composed with other rules using OptimiserChain.

See also ClipGrad.

source
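A small sketch of composing this rule: the gradient below has norm 5, so it is rescaled to norm 1 before Descent discounts it (results rounded for display).

julia> o = OptimiserChain(ClipNorm(1.0), Descent(0.1));

julia> m = (zeros(2),);

julia> s = Optimisers.setup(o, m);

julia> _, m2 = Optimisers.update(s, m, ([3.0, 4.0],));

julia> map(x -> round.(x; digits=3), m2)  # the step taken is 0.1 .* [0.6, 0.8]
([-0.06, -0.08],)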
Optimisers.SignDecayType
SignDecay(λ = 1e-3)
+SignDecay(; [lambda])

Implements $L_1$ regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an OptimiserChain.

It does this by adding λ .* sign(x) to the gradient. This is equivalent to adding λ * sum(abs, x) == λ * norm(x, 1) to the loss.

See also WeightDecay for $L_2$ regularisation. They can be used together: OptimiserChain(SignDecay(0.012), WeightDecay(0.034), Adam()) is equivalent to adding 0.012 * norm(x, 1) + 0.017 * norm(x, 2)^2 to the loss function.

Parameters

  • Penalty (λ ≥ 0): Controls the strength of the regularisation.
source
Optimisers.WeightDecayType
WeightDecay(λ = 5e-4)
+WeightDecay(; [lambda])

Implements $L_2$ regularisation, also known as ridge regression, when composed with other rules as the first transformation in an OptimiserChain.

It does this by adding λ .* x to the gradient. This is equivalent to adding λ/2 * sum(abs2, x) == λ/2 * norm(x)^2 to the loss.

See also SignDecay for $L_1$ regularisation.

Parameters

  • Penalty (λ ≥ 0): Controls the strength of the regularisation.
source
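The combined $L_1$/$L_2$ penalty mentioned above can be constructed like this (a sketch, reusing the illustrative coefficients from the SignDecay docstring):

julia> rule = OptimiserChain(SignDecay(0.012), WeightDecay(0.034), Adam());

julia> s = Optimisers.setup(rule, (w = zeros(3),));  # both penalties adjust the gradient before Adam sees it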
Optimisers.OptimiserChainType
OptimiserChain(opts...)

Compose a sequence of optimisers so that each opt in opts updates the gradient, in the order specified.

With an empty sequence, OptimiserChain() is the identity, so update! will subtract the full gradient from the parameters. This is equivalent to Descent(1).

Example

julia> o = OptimiserChain(ClipGrad(1.0), Descent(0.1));
 
 julia> m = (zeros(3),);
 
@@ -44,7 +44,7 @@
 (Leaf(OptimiserChain(ClipGrad(1.0), Descent(0.1)), (nothing, nothing)),)
 
 julia> Optimisers.update(s, m, ([0.3, 1, 7],))[2]  # clips before discounting
-([-0.03, -0.1, -0.1],)
source

Model Interface

Optimisers.setupFunction
Optimisers.setup(rule, model) -> state_tree

Initialises the given optimiser for every trainable parameter within the model. Returns a tree of the relevant states, which must be passed to update or update!.

Example

julia> m = (x = rand(3), y = (true, false), z = tanh);
+([-0.03, -0.1, -0.1],)
source

Model Interface

Optimisers.setupFunction
Optimisers.setup(rule, model) -> state_tree

Initialises the given optimiser for every trainable parameter within the model. Returns a tree of the relevant states, which must be passed to update or update!.

Example

julia> m = (x = rand(3), y = (true, false), z = tanh);
 
 julia> Optimisers.setup(Momentum(), m)  # same field names as m
 (x = Leaf(Momentum(0.01, 0.9), [0.0, 0.0, 0.0]), y = ((), ()), z = ())

The recursion into structures uses Functors.jl, and any new structs containing parameters need to be marked with Functors.@functor before use. See the Flux docs for more about this.

julia> struct Layer; mat; fun; end
@@ -63,7 +63,7 @@
 (lay = (mat = Leaf(Momentum(0.01, 0.9), Float32[0.0 0.0; 0.0 0.0]), fun = ()), vec = Leaf(Momentum(0.01, 0.9), Float32[0.0, 0.0]))
 
 julia> destructure(model)
-(Float32[1.0, 3.0, 2.0, 4.0, 5.0, 6.0], Restructure(NamedTuple, ..., 6))
source
Optimisers.updateFunction
Optimisers.update(tree, model, gradient) -> (tree, model)

Uses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.

See also update!, which will be faster for models of ordinary Arrays or CuArrays.

Example

julia> m = (x = Float32[1,2,3], y = tanh);
+(Float32[1.0, 3.0, 2.0, 4.0, 5.0, 6.0], Restructure(NamedTuple, ..., 6))
source
Optimisers.updateFunction
Optimisers.update(tree, model, gradient) -> (tree, model)

Uses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.

See also update!, which will be faster for models of ordinary Arrays or CuArrays.

Example

julia> m = (x = Float32[1,2,3], y = tanh);
 
 julia> t = Optimisers.setup(Descent(0.1), m)
 (x = Leaf(Descent(0.1), nothing), y = ())
@@ -71,7 +71,7 @@
 julia> g = (x = [1,1,1], y = nothing);  # fake gradient
 
 julia> Optimisers.update(t, m, g)
-((x = Leaf(Descent(0.1), nothing), y = ()), (x = Float32[0.9, 1.9, 2.9], y = tanh))
source
Optimisers.update!Function
Optimisers.update!(tree, model, gradient) -> (tree, model)

Uses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.

This is used in exactly the same manner as update, but because it may mutate arrays within the old model (and the old state), it will be faster for models of ordinary Arrays or CuArrays. However, you should not rely on the old model being fully updated but rather use the returned model. (The original state tree is always mutated, as each Leaf is mutable.)

Example

julia> using StaticArrays, Zygote, Optimisers
+((x = Leaf(Descent(0.1), nothing), y = ()), (x = Float32[0.9, 1.9, 2.9], y = tanh))
source
Optimisers.update!Function
Optimisers.update!(tree, model, gradient) -> (tree, model)

Uses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.

This is used in exactly the same manner as update, but because it may mutate arrays within the old model (and the old state), it will be faster for models of ordinary Arrays or CuArrays. However, you should not rely on the old model being fully updated but rather use the returned model. (The original state tree is always mutated, as each Leaf is mutable.)

Example

julia> using StaticArrays, Zygote, Optimisers
 
 julia> m = (x = [1f0, 2f0], y = SA[4f0, 5f0]);  # partly mutable model
 
@@ -93,7 +93,7 @@
 (x = Float32[0.6666666, 1.5333333], y = Float32[4.0, 5.0])
 
 julia> t == t2  # original state tree is guaranteed to be mutated
-true
source
Optimisers.adjust!Function
Optimisers.adjust!(tree, η)

Alters the state tree = setup(rule, model) to change the parameters of the optimisation rule, without destroying its stored state. Typically used mid-way through training.

Can be applied to part of a model, by acting only on the corresponding part of the state tree.

To change just the learning rate, provide a number η::Real.

Example

julia> m = (vec = rand(Float32, 2), fun = sin);
+true
source
Optimisers.adjust!Function
Optimisers.adjust!(tree, η)

Alters the state tree = setup(rule, model) to change the parameters of the optimisation rule, without destroying its stored state. Typically used mid-way through training.

Can be applied to part of a model, by acting only on the corresponding part of the state tree.

To change just the learning rate, provide a number η::Real.

Example

julia> m = (vec = rand(Float32, 2), fun = sin);
 
 julia> st = Optimisers.setup(Nesterov(), m)  # stored momentum is initialised to zero
 (vec = Leaf(Nesterov(0.001, 0.9), Float32[0.0, 0.0]), fun = ())
@@ -116,7 +116,7 @@
 (vec = Leaf(OptimiserChain(ClipGrad(11.1), Adam(0.001, (0.777, 0.909), 1.0e-8)), (nothing, (Float32[0.0, 0.0], Float32[0.0, 0.0], (0.9, 0.999)))), fun = ())
 
 julia> Optimisers.adjust(st; beta = "no such field")  # silently ignored!
-(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())
source
Optimisers.freeze!Function
Optimisers.freeze!(tree)

Temporarily alters the state tree = setup(rule, model) so that parameters will not be updated. Undone by thaw!.

Can be applied to the state corresponding to only part of a model: for instance, with model::Chain, to freeze model.layers[1] you should call freeze!(tree.layers[1]).

Example

julia> m = (x = ([1.0], 2.0), y = [3.0]);
+(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())
source
Optimisers.freeze!Function
Optimisers.freeze!(tree)

Temporarily alters the state tree = setup(rule, model) so that parameters will not be updated. Undone by thaw!.

Can be applied to the state corresponding to only part of a model: for instance, with model::Chain, to freeze model.layers[1] you should call freeze!(tree.layers[1]).

Example

julia> m = (x = ([1.0], 2.0), y = [3.0]);
 
 julia> s = Optimisers.setup(Momentum(), m);
 
@@ -133,11 +133,11 @@
 julia> Optimisers.thaw!(s)
 
 julia> s.x
-(Leaf(Momentum(0.01, 0.9), [0.0]), ())
source
Optimisers.thaw!Function
Optimisers.thaw!(tree)

The reverse of freeze!. Applies to all parameters, mutating every Leaf(rule, state, frozen = true) to Leaf(rule, state, frozen = false).

source

Calling Functors.@functor on your model's layer types by default causes these functions to recurse into all children, and ultimately optimise all isnumeric leaf nodes. To further restrict this by ignoring some fields of a layer type, define trainable:

Optimisers.trainableFunction
trainable(x::Layer) -> NamedTuple

This may be overloaded to make optimisers ignore some fields of every Layer, which would otherwise contain trainable parameters.

Warning

This is very rarely required. Fields of struct Layer which contain functions, or integers like sizes, are always ignored anyway. Overloading trainable is only necessary when some arrays of numbers are to be optimised, and some arrays of numbers are not.

The default is Functors.children(x), usually a NamedTuple of all fields, and trainable(x) must contain a subset of these.

source
Optimisers.isnumericFunction
isnumeric(x) -> Bool

Returns true on any parameter to be adjusted by Optimisers.jl, namely arrays of non-integer numbers. Returns false on all other types.

Requires also that Functors.isleaf(x) == true, to focus on e.g. the parent of a transposed matrix, not the wrapper.

source
Optimisers.maywriteFunction
maywrite(x) -> Bool

Should return true if we are completely sure that update! can write new values into x. Otherwise false, indicating a non-mutating path. For now, this is simply x isa DenseArray, allowing Array, CuArray, etc.

source

Such restrictions are also obeyed by this function for flattening a model:

Optimisers.destructureFunction
destructure(model) -> vector, reconstructor

Copies all trainable, isnumeric parameters in the model to a vector, and also returns a function which reverses this transformation. Differentiable.

Example

julia> v, re = destructure((x=[1.0, 2.0], y=(sin, [3.0 + 4.0im])))
+(Leaf(Momentum(0.01, 0.9), [0.0]), ())
source
Optimisers.thaw!Function
Optimisers.thaw!(tree)

The reverse of freeze!. Applies to all parameters, mutating every Leaf(rule, state, frozen = true) to Leaf(rule, state, frozen = false).

source

Calling Functors.@functor on your model's layer types by default causes these functions to recurse into all children, and ultimately optimise all isnumeric leaf nodes. To further restrict this by ignoring some fields of a layer type, define trainable:

Optimisers.trainableFunction
trainable(x::Layer) -> NamedTuple

This may be overloaded to make optimisers ignore some fields of every Layer, which would otherwise contain trainable parameters.

Warning

This is very rarely required. Fields of struct Layer which contain functions, or integers like sizes, are always ignored anyway. Overloading trainable is only necessary when some arrays of numbers are to be optimised, and some arrays of numbers are not.

The default is Functors.children(x), usually a NamedTuple of all fields, and trainable(x) must contain a subset of these.

source
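A sketch of such an overload, for a hypothetical struct in which only the array W should be optimised:

julia> struct Affine; W; b; σ; end  # hypothetical layer type

julia> using Optimisers, Functors; @functor Affine

julia> Optimisers.trainable(a::Affine) = (; W = a.W);  # b and σ are left alone

julia> Optimisers.setup(Descent(0.1), Affine([1.0 2.0], [3.0], identity));  # now only W receives a Leaf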
Optimisers.isnumericFunction
isnumeric(x) -> Bool

Returns true on any parameter to be adjusted by Optimisers.jl, namely arrays of non-integer numbers. Returns false on all other types.

Requires also that Functors.isleaf(x) == true, to focus on e.g. the parent of a transposed matrix, not the wrapper.

source
Optimisers.maywriteFunction
maywrite(x) -> Bool

Should return true if we are completely sure that update! can write new values into x. Otherwise false, indicating a non-mutating path. For now, this is simply x isa DenseArray, allowing Array, CuArray, etc.

source
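Quick checks of these two predicates (a sketch: an integer array fails isnumeric, and a range is not a writable DenseArray):

julia> Optimisers.isnumeric(rand(Float32, 2)), Optimisers.isnumeric([1, 2, 3])
(true, false)

julia> Optimisers.maywrite(rand(2)), Optimisers.maywrite(1:3)
(true, false)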

Such restrictions are also obeyed by this function for flattening a model:

Optimisers.destructureFunction
destructure(model) -> vector, reconstructor

Copies all trainable, isnumeric parameters in the model to a vector, and also returns a function which reverses this transformation. Differentiable.

Example

julia> v, re = destructure((x=[1.0, 2.0], y=(sin, [3.0 + 4.0im])))
 (ComplexF64[1.0 + 0.0im, 2.0 + 0.0im, 3.0 + 4.0im], Restructure(NamedTuple, ..., 3))
 
 julia> re([3, 5, 7+11im])
-(x = [3.0, 5.0], y = (sin, ComplexF64[7.0 + 11.0im]))

If model contains various number types, they are promoted to make vector, and are usually restored by Restructure. Such restoration follows the rules of ChainRulesCore.ProjectTo, and thus will restore floating point precision, but will permit more exotic numbers like ForwardDiff.Dual.

If model contains only GPU arrays, then vector will also live on the GPU. At present, a mixture of GPU and ordinary CPU arrays is undefined behaviour.

source
Optimisers.RestructureType
Restructure(Model, ..., length)

This is what destructure returns, and re(p) will re-build the model with new parameters from vector p. If the model is callable, then re(x, p) == re(p)(x).

Example

julia> using Flux, Optimisers
+(x = [3.0, 5.0], y = (sin, ComplexF64[7.0 + 11.0im]))

If model contains various number types, they are promoted to make vector, and are usually restored by Restructure. Such restoration follows the rules of ChainRulesCore.ProjectTo, and thus will restore floating point precision, but will permit more exotic numbers like ForwardDiff.Dual.

If model contains only GPU arrays, then vector will also live on the GPU. At present, a mixture of GPU and ordinary CPU arrays is undefined behaviour.

source
Optimisers.RestructureType
Restructure(Model, ..., length)

This is what destructure returns, and re(p) will re-build the model with new parameters from vector p. If the model is callable, then re(x, p) == re(p)(x).

Example

julia> using Flux, Optimisers
 
 julia> _, re = destructure(Dense([1 2; 3 4], [0, 0], sigmoid))
 ([1, 3, 2, 4, 0, 0], Restructure(Dense, ..., 6))
@@ -146,7 +146,7 @@
 Dense(2, 2, σ)      # 6 parameters
 
 julia> m([0.2, 0.3]) ≈ re([0.2, 0.3], -4:1)
-true
source
Optimisers.trainablesFunction
trainables(x, path = false)

Return an iterable over all the trainable parameters in x, that is all the numerical arrays (see isnumeric) which are reachable through trainable.

Parameters appearing multiple times in the model (tied weights) will be present only once in the output.

If path = false, the output is a list of numerical arrays.

If path = true, the output is a list of (KeyPath, AbstractArray) pairs, where KeyPath is a type representing the path to the array in the original structure.

See also destructure for a similar operation that returns a single flat vector instead.

Examples

julia> struct MyLayer
+true
source
Optimisers.trainablesFunction
trainables(x, path = false)

Return an iterable over all the trainable parameters in x, that is all the numerical arrays (see isnumeric) which are reachable through trainable.

Parameters appearing multiple times in the model (tied weights) will be present only once in the output.

If path = false, the output is a list of numerical arrays.

If path = true, the output is a list of (KeyPath, AbstractArray) pairs, where KeyPath is a type representing the path to the array in the original structure.

See also destructure for a similar operation that returns a single flat vector instead.

Examples

julia> struct MyLayer
          w
          b
        end
@@ -178,21 +178,21 @@
 julia> getkeypath(x, KeyPath(:b, 1, "c"))
 2-element Vector{Float64}:
  3.0
- 4.0
source

Rule Definition

Optimisers.apply!Function
Optimisers.apply!(rule::RuleType, state, parameters, gradient) -> (state, gradient)

This defines the action of any optimisation rule. It should return the modified gradient which will be subtracted from the parameters, and the updated state (if any) for use at the next iteration, as a tuple (state, gradient).

For efficiency it is free to mutate the old state, but only what is returned will be used. Ideally this should check maywrite(x), which the built-in rules do via the @.. macro.

The initial state is init(rule::RuleType, parameters).

Example

julia> Optimisers.init(Descent(0.1), Float32[1,2,3]) === nothing
+ 4.0
source

Rule Definition

Optimisers.apply!Function
Optimisers.apply!(rule::RuleType, state, parameters, gradient) -> (state, gradient)

This defines the action of any optimisation rule. It should return the modified gradient which will be subtracted from the parameters, and the updated state (if any) for use at the next iteration, as a tuple (state, gradient).

For efficiency it is free to mutate the old state, but only what is returned will be used. Ideally this should check maywrite(x), which the built-in rules do via the @.. macro.

The initial state is init(rule::RuleType, parameters).

Example

julia> Optimisers.init(Descent(0.1), Float32[1,2,3]) === nothing
 true
 
 julia> Optimisers.apply!(Descent(0.1), nothing, Float32[1,2,3], [4,5,6])
-(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1f0)))
source
Optimisers.initFunction
Optimisers.init(rule::RuleType, parameters) -> state

Sets up the initial state for a given optimisation rule, and an array of parameters. This and apply! are the two functions which any new optimisation rule must define.

Examples

julia> Optimisers.init(Descent(), Float32[1,2,3])  # is `nothing`
+(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1f0)))
source
Optimisers.initFunction
Optimisers.init(rule::RuleType, parameters) -> state

Sets up the initial state for a given optimisation rule, and an array of parameters. This and apply! are the two functions which any new optimisation rule must define.

Examples

julia> Optimisers.init(Descent(), Float32[1,2,3])  # is `nothing`
 
 julia> Optimisers.init(Momentum(), [1.0, 2.0])
 2-element Vector{Float64}:
  0.0
- 0.0
source
Optimisers.@..Macro
@.. x = y + z

Sometimes-in-place broadcasting macro, for use in apply! rules. If maywrite(x) then it is just @. x = rhs, but if not, it becomes x = @. rhs.

source
Optimisers.@lazyMacro
x = @lazy y + z

Lazy broadcasting macro, for use in apply! rules. It broadcasts like @. but does not materialise, returning a Broadcasted object for later use. Beware that mutation of arguments will affect the result, and that if it is used in two places, work will be done twice.

source
Optimisers.adjustMethod
Optimisers.adjust(rule::RuleType, η::Real) -> rule

If a new optimisation rule has a learning rate which is not stored in field rule.eta, then you should add a method to adjust. (But it is simpler to just use the standard name.)

source
Optimisers.@defMacro
@def struct Rule; eta = 0.1; beta = (0.7, 0.8); end

Helper macro for defining rules with default values. The types of the literal values are used in the struct, like this:

struct Rule
+ 0.0
source
Optimisers.@..Macro
@.. x = y + z

Sometimes-in-place broadcasting macro, for use in apply! rules. If maywrite(x) then it is just @. x = rhs, but if not, it becomes x = @. rhs.

source
Optimisers.@lazyMacro
x = @lazy y + z

Lazy broadcasting macro, for use in apply! rules. It broadcasts like @. but does not materialise, returning a Broadcasted object for later use. Beware that mutation of arguments will affect the result, and that if it is used in two places, work will be done twice.

source
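A small sketch of both macros on ordinary Arrays, where maywrite(x) is true:

julia> using Optimisers

julia> x, y = [1.0, 2.0], [3.0, 4.0];

julia> Optimisers.@.. x = x + y;  # mutates x in place, since maywrite(x) holds

julia> bc = Optimisers.@lazy x - y;  # a Broadcasted object; nothing is computed yet

julia> Broadcast.materialize(bc)
2-element Vector{Float64}:
 1.0
 2.0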
Optimisers.adjustMethod
Optimisers.adjust(rule::RuleType, η::Real) -> rule

If a new optimisation rule has a learning rate which is not stored in field rule.eta, then you should add a method to adjust. (But it is simpler to just use the standard name.)

source
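For instance, a sketch for a hypothetical rule whose rate lives in a field called gamma:

julia> struct DecayDescent <: Optimisers.AbstractRule; gamma::Float64; end  # hypothetical rule

julia> Optimisers.adjust(r::DecayDescent, η::Real) = DecayDescent(η);

julia> Optimisers.adjust(DecayDescent(0.1), 0.01)
DecayDescent(0.01)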
Optimisers.@defMacro
@def struct Rule; eta = 0.1; beta = (0.7, 0.8); end

Helper macro for defining rules with default values. The types of the literal values are used in the struct, like this:

struct Rule
   eta::Float64
   beta::Tuple{Float64, Float64}
   Rule(eta, beta = (0.7, 0.8)) = eta < 0 ? error() : new(eta, beta)
   Rule(; eta = 0.1, beta = (0.7, 0.8)) = Rule(eta, beta)
-end

Any field called eta is assumed to be a learning rate, and cannot be negative.

source

KeyPath

A KeyPath is a sequence of keys that can be used to access a value within a nested structure. It is defined in Functors.jl and re-exported by Optimisers.jl here for convenience.

Functors.KeyPathType
KeyPath(keys...)

A type for representing a path of keys to a value in a nested structure. Can be constructed with a sequence of keys, or by concatenating other KeyPaths. Keys can be of type Symbol, String, or Int.

For custom types, access through symbol keys is assumed to be done with getproperty. For consistency, the method Base.propertynames is used to get the viable property names.

For string and integer keys, access is instead done with getindex.

See also getkeypath, haskeypath.

Examples

julia> kp = KeyPath(:b, 3)
+end

Any field called eta is assumed to be a learning rate, and cannot be negative.

source

KeyPath

A KeyPath is a sequence of keys that can be used to access a value within a nested structure. It is defined in Functors.jl and re-exported by Optimisers.jl here for convenience.

Functors.KeyPathType
KeyPath(keys...)

A type for representing a path of keys to a value in a nested structure. Can be constructed with a sequence of keys, or by concatenating other KeyPaths. Keys can be of type Symbol, String, or Int.

For custom types, access through symbol keys is assumed to be done with getproperty. For consistency, the method Base.propertynames is used to get the viable property names.

For string and integer keys, access is instead done with getindex.

See also getkeypath, haskeypath.

Examples

julia> kp = KeyPath(:b, 3)
 KeyPath(:b, 3)
 
 julia> KeyPath(:a, kp, :c, 4) # construct mixing keys and keypaths
@@ -239,4 +239,4 @@
   :b => Dict{Any, Any}(:c=>4, "d"=>[5, 6, 7])
 
 julia> getkeypath(x, KeyPath(:b, "d", 2))
-6
source
+6source
Functors.setkeypath!Function
setkeypath!(x, kp::KeyPath, v)

Set the value in x at the path kp to v.

See also KeyPath, getkeypath, and haskeypath.

source
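A short sketch (x here is a nested Dict, whose keys are accessed with getindex):

julia> using Functors

julia> x = Dict(:a => Dict(:b => [1, 2, 3]));

julia> setkeypath!(x, KeyPath(:a, :b), [10, 20, 30]);

julia> getkeypath(x, KeyPath(:a, :b))
3-element Vector{Int64}:
 10
 20
 30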
diff --git a/dev/index.html b/dev/index.html index f8a1cc8..9e6310c 100644 --- a/dev/index.html +++ b/dev/index.html @@ -170,4 +170,4 @@ julia> Optimisers.update!(opt_state, x, g); julia> opt_state # the state in `a` and `b` differ -(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001)))) +(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001)))) diff --git a/dev/objects.inv b/dev/objects.inv index d6d74fd1f80c18d4fa0720d2dff77781b24535a4..335825bb4b765700b47b6df57d418c65d0d37997 100644 GIT binary patch delta 59 zcmV-B0L1^Z2eJo{LjyD}FtJ8J11z0|sU8Ob=FbA_D`t!IPBCvF<421%{0wJF3v=_y R&CLE4o9;VX{{r23L*K4t8WI2i delta 60 zcmV-C0K@;X2eSu|LjyA|G_giN11(*PbX1Rn0CQ+Ds;`+XY)&zEAmc}iHT)E3NegrH S$<56EC7bR$U;hF^>qUdFz!`1; diff --git a/dev/search_index.js b/dev/search_index.js index 1738858..9dee915 100644 --- a/dev/search_index.js +++ b/dev/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"api/","page":"API","title":"API","text":"CollapsedDocStrings = true","category":"page"},{"location":"api/#Optimisation-Rules","page":"API","title":"Optimisation Rules","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.Descent\nOptimisers.Momentum\nOptimisers.Nesterov\nOptimisers.Rprop\nOptimisers.RMSProp\nOptimisers.Adam\nOptimisers.RAdam\nOptimisers.AdaMax\nOptimisers.OAdam\nOptimisers.AdaGrad\nOptimisers.AdaDelta\nOptimisers.AMSGrad\nOptimisers.NAdam\nOptimisers.AdamW\nOptimisers.AdaBelief\nOptimisers.Lion","category":"page"},{"location":"api/#Optimisers.Descent","page":"API","title":"Optimisers.Descent","text":"Descent(η = 1f-1)\nDescent(; [eta])\n\nClassic gradient descent optimiser with learning rate η. For each parameter p and its gradient dp, this runs p -= η*dp.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Momentum","page":"API","title":"Optimisers.Momentum","text":"Momentum(η = 0.01, ρ = 0.9)\nMomentum(; [eta, rho])\n\nGradient descent optimizer with learning rate η and momentum ρ.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMomentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Nesterov","page":"API","title":"Optimisers.Nesterov","text":"Nesterov(η = 0.001, ρ = 0.9)\nNesterov(; [eta, rho])\n\nGradient descent optimizer with learning rate η and Nesterov momentum ρ.\n\nParameters\n\nLearning rate (η): Amount by which gradients are discounted before updating the weights.\nNesterov momentum (ρ): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Rprop","page":"API","title":"Optimisers.Rprop","text":"Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0))\nRprop(; [eta, ell, gamma])\n\nOptimizer using the Rprop algorithm. 
A full-batch learning algorithm that depends only on the sign of the gradient.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nScaling factors (ℓ::Tuple == ell): Multiplicative increase and decrease factors.\nStep sizes (Γ::Tuple == gamma): Mminimal and maximal allowed step sizes.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.RMSProp","page":"API","title":"Optimisers.RMSProp","text":"RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred = false)\nRMSProp(; [eta, rho, epsilon, centred])\n\nOptimizer using the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.\n\nCentred RMSProp is a variant which normalises gradients by an estimate their variance, instead of their second moment.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMomentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\nKeyword centred (or centered): Indicates whether to use centred variant of the algorithm.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Adam","page":"API","title":"Optimisers.Adam","text":"Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAdam(; [eta, beta, epsilon])\n\nAdam optimiser.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.RAdam","page":"API","title":"Optimisers.RAdam","text":"RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nRAdam(; [eta, beta, epsilon])\n\nRectified Adam optimizer.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaMax","page":"API","title":"Optimisers.AdaMax","text":"AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAdaMax(; [eta, beta, epsilon])\n\nAdaMax is a variant of Adam based on the ∞-norm.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.OAdam","page":"API","title":"Optimisers.OAdam","text":"OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8)\nOAdam(; [eta, beta, epsilon])\n\nOAdam (Optimistic Adam) is a variant of Adam adding an \"optimistic\" term suitable for adversarial training.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change 
default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaGrad","page":"API","title":"Optimisers.AdaGrad","text":"AdaGrad(η = 0.1, ϵ = 1e-8)\nAdaGrad(; [eta, epsilon])\n\nAdaGrad optimizer. It has parameter specific learning rates based on how frequently it is updated. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaDelta","page":"API","title":"Optimisers.AdaDelta","text":"AdaDelta(ρ = 0.9, ϵ = 1e-8)\nAdaDelta(; [rho, epsilon])\n\nAdaDelta is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning.\n\nParameters\n\nRho (ρ == rho): Factor by which the gradient is decayed at each time step.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AMSGrad","page":"API","title":"Optimisers.AMSGrad","text":"AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAMSGrad(; [eta, beta, epsilon])\n\nThe AMSGrad version of the Adam optimiser. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.NAdam","page":"API","title":"Optimisers.NAdam","text":"NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nNAdam(; [eta, beta, epsilon])\n\nNAdam is a Nesterov variant of Adam. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdamW","page":"API","title":"Optimisers.AdamW","text":"AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8)\nAdamW(; [eta, beta, lambda, epsilon])\n\nAdamW is a variant of Adam fixing (as in repairing) its weight decay regularization. 
Implemented as an OptimiserChain of Adam and WeightDecay`.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nWeight decay (λ == lambda): Controls the strength of L_2 regularisation.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.AdaBelief","page":"API","title":"Optimisers.AdaBelief","text":"AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)\nAdaBelief(; [eta, beta, epsilon])\n\nThe AdaBelief optimiser is a variant of the well-known Adam optimiser.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Lion","page":"API","title":"Optimisers.Lion","text":"Lion(η = 0.001, β = (0.9, 0.999))\nLion(; [eta, beta])\n\nLion optimiser.\n\nParameters\n\nLearning rate (η == eta): Magnitude by which gradients are updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\n\n\n\n\n\n","category":"type"},{"location":"api/","page":"API","title":"API","text":"In addition to the main course, you may wish to order some of these condiments:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.AccumGrad\nOptimisers.ClipGrad\nOptimisers.ClipNorm\nOptimisers.SignDecay\nOptimisers.WeightDecay\nOptimisers.OptimiserChain","category":"page"},{"location":"api/#Optimisers.AccumGrad","page":"API","title":"Optimisers.AccumGrad","text":"AccumGrad(n::Int)\n\nA rule constructed OptimiserChain(AccumGrad(n), Rule()) will accumulate for n steps, before applying Rule to the mean of these n gradients.\n\nThis is useful for training with effective batch sizes too large for the available memory. 
Instead of computing the gradient for batch size b at once, compute it for size b/n and accumulate n such gradients.\n\nExample\n\njulia> m = (x=[1f0], y=[2f0]);\n\njulia> r = OptimiserChain(AccumGrad(2), WeightDecay(0.01), Descent(0.1));\n\njulia> s = Optimisers.setup(r, m);\n\njulia> Optimisers.update!(s, m, (x=[33], y=[0]));\n\njulia> m # model not yet changed\n(x = Float32[1.0], y = Float32[2.0])\n\njulia> Optimisers.update!(s, m, (x=[0], y=[444]));\n\njulia> m # n=2 gradients applied at once\n(x = Float32[-0.651], y = Float32[-20.202002])\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.ClipGrad","page":"API","title":"Optimisers.ClipGrad","text":"ClipGrad(δ = 10)\nClipGrad(; [delta])\n\nRestricts every gradient component to obey -δ ≤ dx[i] ≤ δ.\n\nTypically composed with other rules using OptimiserChain.\n\nSee also ClipNorm.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.ClipNorm","page":"API","title":"Optimisers.ClipNorm","text":"ClipNorm(ω = 10, p = 2; throw = true)\n\nScales any gradient array for which norm(dx, p) > ω to stay at this threshold (unless p==0).\n\nThrows an error if the norm is infinite or NaN, which you can turn off with throw = false.\n\nTypically composed with other rules using OptimiserChain.\n\nSee also ClipGrad.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.SignDecay","page":"API","title":"Optimisers.SignDecay","text":"SignDecay(λ = 1e-3)\nSignDecay(; [lambda])\n\nImplements L_1 regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an OptimiserChain.\n\nIt does this by adding λ .* sign(x) to the gradient. This is equivalent to adding λ * sum(abs, x) == λ * norm(x, 1) to the loss.\n\nSee also [WeightDecay] for L_2 normalisation. They can be used together: OptimiserChain(SignDecay(0.012), WeightDecay(0.034), Adam()) is equivalent to adding 0.012 * norm(x, 1) + 0.017 * norm(x, 2)^2 to the loss function.\n\nParameters\n\nPenalty (λ ≥ 0): Controls the strength of the regularisation.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.WeightDecay","page":"API","title":"Optimisers.WeightDecay","text":"WeightDecay(λ = 5e-4)\nWeightDecay(; [lambda])\n\nImplements L_2 regularisation, also known as ridge regression, when composed with other rules as the first transformation in an OptimiserChain.\n\nIt does this by adding λ .* x to the gradient. This is equivalent to adding λ/2 * sum(abs2, x) == λ/2 * norm(x)^2 to the loss.\n\nSee also [SignDecay] for L_1 normalisation.\n\nParameters\n\nPenalty (λ ≥ 0): Controls the strength of the regularisation.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.OptimiserChain","page":"API","title":"Optimisers.OptimiserChain","text":"OptimiserChain(opts...)\n\nCompose a sequence of optimisers so that each opt in opts updates the gradient, in the order specified.\n\nWith an empty sequence, OptimiserChain() is the identity, so update! will subtract the full gradient from the parameters. 
This is equivalent to Descent(1).\n\nExample\n\njulia> o = OptimiserChain(ClipGrad(1.0), Descent(0.1));\n\njulia> m = (zeros(3),);\n\njulia> s = Optimisers.setup(o, m)\n(Leaf(OptimiserChain(ClipGrad(1.0), Descent(0.1)), (nothing, nothing)),)\n\njulia> Optimisers.update(s, m, ([0.3, 1, 7],))[2] # clips before discounting\n([-0.03, -0.1, -0.1],)\n\n\n\n\n\n","category":"type"},{"location":"api/#Model-Interface","page":"API","title":"Model Interface","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.setup\nOptimisers.update\nOptimisers.update!\nOptimisers.adjust!\nOptimisers.adjust(::Any, ::Real)\nOptimisers.freeze!\nOptimisers.thaw!","category":"page"},{"location":"api/#Optimisers.setup","page":"API","title":"Optimisers.setup","text":"Optimisers.setup(rule, model) -> state_tree\n\nInitialises the given optimiser for every trainable parameter within the model. Returns a tree of the relevant states, which must be passed to update or update!.\n\nExample\n\njulia> m = (x = rand(3), y = (true, false), z = tanh);\n\njulia> Optimisers.setup(Momentum(), m) # same field names as m\n(x = Leaf(Momentum(0.01, 0.9), [0.0, 0.0, 0.0]), y = ((), ()), z = ())\n\nThe recursion into structures uses Functors.jl, and any new structs containing parameters need to be marked with Functors.@functor before use. See the Flux docs for more about this.\n\njulia> struct Layer; mat; fun; end\n\njulia> model = (lay = Layer([1 2; 3 4f0], sin), vec = [5, 6f0]);\n\njulia> Optimisers.setup(Momentum(), model) # new struct is by default ignored\n(lay = (), vec = Leaf(Momentum(0.01, 0.9), Float32[0.0, 0.0]))\n\njulia> destructure(model)\n(Float32[5.0, 6.0], Restructure(NamedTuple, ..., 2))\n\njulia> using Functors; @functor Layer # annotate this type as containing parameters\n\njulia> Optimisers.setup(Momentum(), model)\n(lay = (mat = Leaf(Momentum(0.01, 0.9), Float32[0.0 0.0; 0.0 0.0]), fun = ()), vec = Leaf(Momentum(0.01, 0.9), Float32[0.0, 0.0]))\n\njulia> destructure(model)\n(Float32[1.0, 3.0, 2.0, 4.0, 5.0, 6.0], Restructure(NamedTuple, ..., 6))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.update","page":"API","title":"Optimisers.update","text":"Optimisers.update(tree, model, gradient) -> (tree, model)\n\nUses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.\n\nSee also update!, which will be faster for models of ordinary Arrays or CuArrays.\n\nExample\n\njulia> m = (x = Float32[1,2,3], y = tanh);\n\njulia> t = Optimisers.setup(Descent(0.1), m)\n(x = Leaf(Descent(0.1), nothing), y = ())\n\njulia> g = (x = [1,1,1], y = nothing); # fake gradient\n\njulia> Optimisers.update(t, m, g)\n((x = Leaf(Descent(0.1), nothing), y = ()), (x = Float32[0.9, 1.9, 2.9], y = tanh))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.update!","page":"API","title":"Optimisers.update!","text":"Optimisers.update!(tree, model, gradient) -> (tree, model)\n\nUses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.\n\nThis is used in exactly the same manner as update, but because it may mutate arrays within the old model (and the old state), it will be faster for models of ordinary Arrays or CuArrays. 
However, you should not rely on the old model being fully updated but rather use the returned model. (The original state tree is always mutated, as each Leaf is mutable.)\n\nExample\n\njulia> using StaticArrays, Zygote, Optimisers\n\njulia> m = (x = [1f0, 2f0], y = SA[4f0, 5f0]); # partly mutable model\n\njulia> t = Optimisers.setup(Momentum(1/30, 0.9), m) # tree of states\n(x = Leaf(Momentum(0.0333333, 0.9), Float32[0.0, 0.0]), y = Leaf(Momentum(0.0333333, 0.9), Float32[0.0, 0.0]))\n\njulia> g = gradient(m -> sum(abs2.(m.x .+ m.y)), m)[1] # structural gradient\n(x = Float32[10.0, 14.0], y = Float32[10.0, 14.0])\n\njulia> t2, m2 = Optimisers.update!(t, m, g);\n\njulia> m2 # after update or update!, this is the new model\n(x = Float32[0.6666666, 1.5333333], y = Float32[3.6666667, 4.5333333])\n\njulia> m2.x === m.x # update! has re-used this array, for efficiency\ntrue\n\njulia> m # original should be discarded, may be mutated but no guarantee\n(x = Float32[0.6666666, 1.5333333], y = Float32[4.0, 5.0])\n\njulia> t == t2 # original state tree is guaranteed to be mutated\ntrue\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.adjust!","page":"API","title":"Optimisers.adjust!","text":"Optimisers.adjust!(tree, η)\n\nAlters the state tree = setup(rule, model) to change the parameters of the optimisation rule, without destroying its stored state. Typically used mid-way through training.\n\nCan be applied to part of a model, by acting only on the corresponding part of the state tree.\n\nTo change just the learning rate, provide a number η::Real.\n\nExample\n\njulia> m = (vec = rand(Float32, 2), fun = sin);\n\njulia> st = Optimisers.setup(Nesterov(), m) # stored momentum is initialised to zero\n(vec = Leaf(Nesterov(0.001, 0.9), Float32[0.0, 0.0]), fun = ())\n\njulia> st, m = Optimisers.update(st, m, (vec = [16, 88], fun = nothing)); # with fake gradient\n\njulia> st\n(vec = Leaf(Nesterov(0.001, 0.9), Float32[-0.016, -0.088]), fun = ())\n\njulia> Optimisers.adjust!(st, 0.123) # change learning rate, stored momentum untouched\n\njulia> st\n(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())\n\nTo change other parameters, adjust! also accepts keyword arguments matching the field names of the optimisation rule's type.\n\njulia> fieldnames(Adam)\n(:eta, :beta, :epsilon)\n\njulia> st2 = Optimisers.setup(OptimiserChain(ClipGrad(), Adam()), m)\n(vec = Leaf(OptimiserChain(ClipGrad(10.0), Adam(0.001, (0.9, 0.999), 1.0e-8)), (nothing, (Float32[0.0, 0.0], Float32[0.0, 0.0], (0.9, 0.999)))), fun = ())\n\njulia> Optimisers.adjust(st2; beta = (0.777, 0.909), delta = 11.1) # delta acts on ClipGrad\n(vec = Leaf(OptimiserChain(ClipGrad(11.1), Adam(0.001, (0.777, 0.909), 1.0e-8)), (nothing, (Float32[0.0, 0.0], Float32[0.0, 0.0], (0.9, 0.999)))), fun = ())\n\njulia> Optimisers.adjust(st; beta = \"no such field\") # silently ignored!\n(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.adjust-Tuple{Any, Real}","page":"API","title":"Optimisers.adjust","text":"adjust(tree, η) -> tree\n\nLike adjust!, but returns a new tree instead of mutating the old one.\n\n\n\n\n\n","category":"method"},{"location":"api/#Optimisers.freeze!","page":"API","title":"Optimisers.freeze!","text":"Optimisers.freeze!(tree)\n\nTemporarily alters the state tree = setup(rule, model) so that parameters will not be updated. 
Un-done by thaw!.\n\nCan be applied to the state corresponding to only part of a model, for instance with model::Chain, to freeze model.layers[1] you should call freeze!(tree.layers[1]).\n\nExample\n\njulia> m = (x = ([1.0], 2.0), y = [3.0]);\n\njulia> s = Optimisers.setup(Momentum(), m);\n\njulia> Optimisers.freeze!(s.x)\n\njulia> Optimisers.update!(s, m, (x = ([pi], 10pi), y = [100pi])); # with fake gradient\n\njulia> m\n(x = ([1.0], 2.0), y = [-0.14159265358979312])\n\njulia> s\n(x = (Leaf(Momentum(0.01, 0.9), [0.0], frozen = true), ()), y = Leaf(Momentum(0.01, 0.9), [3.14159]))\n\njulia> Optimisers.thaw!(s)\n\njulia> s.x\n(Leaf(Momentum(0.01, 0.9), [0.0]), ())\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.thaw!","page":"API","title":"Optimisers.thaw!","text":"Optimisers.thaw!(tree)\n\nThe reverse of freeze!. Applies to all parameters, mutating every Leaf(rule, state, frozen = true) to Leaf(rule, state, frozen = false).\n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API","title":"API","text":"Calling Functors.@functor on your model's layer types by default causes these functions to recurse into all children, and ultimately optimise all isnumeric leaf nodes. To further restrict this by ignoring some fields of a layer type, define trainable:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.trainable\nOptimisers.isnumeric\nOptimisers.maywrite","category":"page"},{"location":"api/#Optimisers.trainable","page":"API","title":"Optimisers.trainable","text":"trainable(x::Layer) -> NamedTuple\n\nThis may be overloaded to make optimisers ignore some fields of every Layer, which would otherwise contain trainable parameters.\n\nwarning: Warning\nThis is very rarely required. Fields of struct Layer which contain functions, or integers like sizes, are always ignored anyway. Overloading trainable is only necessary when some arrays of numbers are to be optimised, and some arrays of numbers are not.\n\nThe default is Functors.children(x), usually a NamedTuple of all fields, and trainable(x) must contain a subset of these.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.isnumeric","page":"API","title":"Optimisers.isnumeric","text":"isnumeric(x) -> Bool\n\nReturns true on any parameter to be adjusted by Optimisers.jl, namely arrays of non-integer numbers. Returns false on all other types.\n\nRequires also that Functors.isleaf(x) == true, to focus on e.g. the parent of a transposed matrix, not the wrapper.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.maywrite","page":"API","title":"Optimisers.maywrite","text":"maywrite(x) -> Bool\n\nShould return true if we are completely sure that update! can write new values into x. Otherwise false, indicating a non-mutating path. For now, simply x isa DenseArray allowing Array, CuArray, etc. \n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API","title":"API","text":"Such restrictions are also obeyed by this function for flattening a model:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.destructure\nOptimisers.Restructure\nOptimisers.trainables","category":"page"},{"location":"api/#Optimisers.destructure","page":"API","title":"Optimisers.destructure","text":"destructure(model) -> vector, reconstructor\n\nCopies all trainable, isnumeric parameters in the model to a vector, and returns also a function which reverses this transformation. 
Differentiable.\n\nExample\n\njulia> v, re = destructure((x=[1.0, 2.0], y=(sin, [3.0 + 4.0im])))\n(ComplexF64[1.0 + 0.0im, 2.0 + 0.0im, 3.0 + 4.0im], Restructure(NamedTuple, ..., 3))\n\njulia> re([3, 5, 7+11im])\n(x = [3.0, 5.0], y = (sin, ComplexF64[7.0 + 11.0im]))\n\nIf model contains various number types, they are promoted to make vector, and are usually restored by Restructure. Such restoration follows the rules of ChainRulesCore.ProjectTo, and thus will restore floating point precision, but will permit more exotic numbers like ForwardDiff.Dual.\n\nIf model contains only GPU arrays, then vector will also live on the GPU. At present, a mixture of GPU and ordinary CPU arrays is undefined behaviour.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.Restructure","page":"API","title":"Optimisers.Restructure","text":"Restructure(Model, ..., length)\n\nThis is what destructure returns, and re(p) will re-build the model with new parameters from vector p. If the model is callable, then re(x, p) == re(p)(x).\n\nExample\n\njulia> using Flux, Optimisers\n\njulia> _, re = destructure(Dense([1 2; 3 4], [0, 0], sigmoid))\n([1, 3, 2, 4, 0, 0], Restructure(Dense, ..., 6))\n\njulia> m = re(-4:1)\nDense(2, 2, σ) # 6 parameters\n\njulia> m([0.2, 0.3]) ≈ re([0.2, 0.3], -4:1)\ntrue\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.trainables","page":"API","title":"Optimisers.trainables","text":"trainables(x, path = false)\n\nReturn an iterable over all the trainable parameters in x, that is all the numerical arrays (see isnumeric) which are reachable through trainable.\n\nParameters appearing multiple times in the model (tied weights) will be present only once in the output.\n\nIf path = false, the output is a list of numerical arrays.\n\nIf path = true, the output is a list of (KeyPath, AbstractArray) pairs, where KeyPath is a type representing the path to the array in the original structure.\n\nSee also destructure for a similar operation that returns a single flat vector instead.\n\nExamples\n\njulia> struct MyLayer\n w\n b\n end\n\njulia> Functors.@functor MyLayer\n\njulia> Optimisers.trainable(x::MyLayer) = (; w = x.w,) # only w is trainable in this example\n\njulia> x = MyLayer([1.0,2.0,3.0], [4.0,5.0,6.0]);\n\njulia> trainables(x)\n1-element Vector{AbstractArray}:\n [1.0, 2.0, 3.0]\n\n julia> x = MyLayer((a=[1.0,2.0], b=[3.0]), [4.0,5.0,6.0]);\n\n julia> trainables(x) # collects nested parameters\n 2-element Vector{AbstractArray}:\n [1.0, 2.0]\n [3.0]\n\njulia> x = (a = [1.0,2.0], b = (Dict(\"c\" => [3.0, 4.0], \"d\" => 5.0), [6.0,7.0]));\n\njulia> for (kp, y) in trainables(x, path = true)\n println(kp, \" => \", y)\n end\nKeyPath(:a,) => [1.0, 2.0]\nKeyPath(:b, 1, \"c\") => [3.0, 4.0]\nKeyPath(:b, 2) => [6.0, 7.0]\n\njulia> getkeypath(x, KeyPath(:b, 1, \"c\"))\n2-element Vector{Float64}:\n 3.0\n 4.0\n\n\n\n\n\n","category":"function"},{"location":"api/#Rule-Definition","page":"API","title":"Rule Definition","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.apply!\nOptimisers.init\nOptimisers.@..\nOptimisers.@lazy\nOptimisers.adjust(::AbstractRule, ::Real)\nOptimisers.@def","category":"page"},{"location":"api/#Optimisers.apply!","page":"API","title":"Optimisers.apply!","text":"Optimisers.apply!(rule::RuleType, state, parameters, gradient) -> (state, gradient)\n\nThis defines the action of any optimisation rule. 
It should return the modified gradient which will be subtracted from the parameters, and the updated state (if any) for use at the next iteration, as a tuple (state, gradient).\n\nFor efficiency it is free to mutate the old state, but only what is returned will be used. Ideally this should check maywrite(x), which the built-in rules do via @...\n\nThe initial state is init(rule::RuleType, parameters).\n\nExample\n\njulia> Optimisers.init(Descent(0.1), Float32[1,2,3]) === nothing\ntrue\n\njulia> Optimisers.apply!(Descent(0.1), nothing, Float32[1,2,3], [4,5,6])\n(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1f0)))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.init","page":"API","title":"Optimisers.init","text":"Optimisers.init(rule::RuleType, parameters) -> state\n\nSets up the initial state for a given optimisation rule, and an array of parameters. This and apply! are the two functions which any new optimisation rule must define.\n\nExamples\n\njulia> Optimisers.init(Descent(), Float32[1,2,3]) # is `nothing`\n\njulia> Optimisers.init(Momentum(), [1.0, 2.0])\n2-element Vector{Float64}:\n 0.0\n 0.0\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.@..","page":"API","title":"Optimisers.@..","text":"@.. x = y + z\n\nSometimes in-place broadcasting macro, for use in apply! rules. If maywrite(x) then it is just @. x = rhs, but if not, it becomes x = @. rhs.\n\n\n\n\n\n","category":"macro"},{"location":"api/#Optimisers.@lazy","page":"API","title":"Optimisers.@lazy","text":"x = @lazy y + z\n\nLazy broadcasting macro, for use in apply! rules. It broadcasts like @. but does not materialise, returning a Broadcasted object for later use. Beware that mutation of arguments will affect the result, and that if it is used in two places, work will be done twice.\n\n\n\n\n\n","category":"macro"},{"location":"api/#Optimisers.adjust-Tuple{AbstractRule, Real}","page":"API","title":"Optimisers.adjust","text":"Optimisers.adjust(rule::RuleType, η::Real) -> rule\n\nIf a new optimisation rule has a learning rate which is not stored in field rule.eta, then you should add a method to adjust. (But it is simpler to just use the standard name.)\n\n\n\n\n\n","category":"method"},{"location":"api/#Optimisers.@def","page":"API","title":"Optimisers.@def","text":"@def struct Rule; eta = 0.1; beta = (0.7, 0.8); end\n\nHelper macro for defining rules with default values. The types of the literal values are used in the struct, like this:\n\nstruct Rule\n eta::Float64\n beta::Tuple{Float64, Float64}\n Rule(eta, beta = (0.7, 0.8)) = eta < 0 ? error() : new(eta, beta)\n Rule(; eta = 0.1, beta = (0.7, 0.8)) = Rule(eta, beta)\nend\n\nAny field called eta is assumed to be a learning rate, and cannot be negative.\n\n\n\n\n\n","category":"macro"},{"location":"api/#KeyPath","page":"API","title":"KeyPath","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"A KeyPath is a sequence of keys that can be used to access a value within a nested structure. It is defined in Functors.jl and re-exported by Optimisers.jl here for convenience.","category":"page"},{"location":"api/","page":"API","title":"API","text":"Functors.KeyPath\nFunctors.haskeypath\nFunctors.getkeypath\nFunctors.setkeypath!","category":"page"},{"location":"api/#Functors.KeyPath","page":"API","title":"Functors.KeyPath","text":"KeyPath(keys...)\n\nA type for representing a path of keys to a value in a nested structure. 
Can be constructed with a sequence of keys, or by concatenating other KeyPaths. Keys can be of type Symbol, String, or Int.\n\nFor custom types, access through symbol keys is assumed to be done with getproperty. For consistency, the method Base.propertynames is used to get the viable property names.\n\nFor string and integer keys instead, the access is done with getindex.\n\nSee also getkeypath, haskeypath.\n\nExamples\n\njulia> kp = KeyPath(:b, 3)\nKeyPath(:b, 3)\n\njulia> KeyPath(:a, kp, :c, 4) # construct mixing keys and keypaths\nKeyPath(:a, :b, 3, :c, 4)\n\njulia> struct T\n a\n b\n end\n\njulia> function Base.getproperty(x::T, k::Symbol)\n if k in fieldnames(T)\n return getfield(x, k)\n elseif k === :ab\n return \"ab\"\n else \n error()\n end\n end;\n\njulia> Base.propertynames(::T) = (:a, :b, :ab);\n\njulia> x = T(3, Dict(:c => 4, :d => 5));\n\njulia> getkeypath(x, KeyPath(:ab)) # equivalent to x.ab\n\"ab\"\n\njulia> getkeypath(x, KeyPath(:b, :c)) # equivalent to (x.b)[:c]\n4\n\n\n\n\n\n","category":"type"},{"location":"api/#Functors.haskeypath","page":"API","title":"Functors.haskeypath","text":"haskeypath(x, kp::KeyPath)\n\nReturn true if x has a value at the path kp.\n\nSee also KeyPath, getkeypath, and setkeypath!.\n\nExamples\n\njulia> x = Dict(:a => 3, :b => Dict(:c => 4, \"d\" => [5, 6, 7]))\nDict{Symbol, Any} with 2 entries:\n :a => 3\n :b => Dict{Any, Any}(:c=>4, \"d\"=>[5, 6, 7])\n\njulia> haskeypath(x, KeyPath(:a))\ntrue\n\njulia> haskeypath(x, KeyPath(:b, \"d\", 1))\ntrue\n\njulia> haskeypath(x, KeyPath(:b, \"d\", 4))\nfalse\n\n\n\n\n\n","category":"function"},{"location":"api/#Functors.getkeypath","page":"API","title":"Functors.getkeypath","text":"getkeypath(x, kp::KeyPath)\n\nReturn the value in x at the path kp.\n\nSee also KeyPath, haskeypath, and setkeypath!.\n\nExamples\n\njulia> x = Dict(:a => 3, :b => Dict(:c => 4, \"d\" => [5, 6, 7]))\nDict{Symbol, Any} with 2 entries:\n :a => 3\n :b => Dict{Any, Any}(:c=>4, \"d\"=>[5, 6, 7])\n\njulia> getkeypath(x, KeyPath(:b, \"d\", 2))\n6\n\n\n\n\n\n","category":"function"},{"location":"api/#Functors.setkeypath!","page":"API","title":"Functors.setkeypath!","text":"setkeypath!(x, kp::KeyPath, v)\n\nSet the value in x at the path kp to v.\n\nSee also KeyPath, getkeypath, and haskeypath.\n\n\n\n\n\n","category":"function"},{"location":"#Optimisers.jl","page":"Home","title":"Optimisers.jl","text":"","category":"section"},{"location":"#An-optimisation-rule","page":"Home","title":"An optimisation rule","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"A new optimiser must overload two functions, apply! and init. 
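In outline, the two methods have these shapes, matching the API reference above (a sketch with placeholder names, not runnable code):","category":"page"},{"location":"","page":"Home","title":"Home","text":"# The two methods any new rule must provide; rule, x and x̄ are placeholders:\nOptimisers.init(rule, x::AbstractArray)  # -> initial state for this array\nOptimisers.apply!(rule, state, x, x̄)     # -> (next state, transformed gradient)","category":"page"},{"location":"","page":"Home","title":"Home","text":"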
These act on one array of parameters:","category":"page"},{"location":"","page":"Home","title":"Home","text":"# Define a container to hold any optimiser specific parameters (if any):\nstruct DecayDescent <: Optimisers.AbstractRule\n eta::Float64\nend\n\n# Define an `apply!` rule which encodes how the gradients will be used to\n# update the parameters:\nfunction Optimisers.apply!(o::DecayDescent, state, x, x̄)\n T = eltype(x)\n newx̄ = T(o.eta / √state) .* x̄\n nextstate = state + 1\n return nextstate, newx̄\nend\n\n# Define the function which sets up the initial state (if any):\nOptimisers.init(o::DecayDescent, x::AbstractArray) = 1","category":"page"},{"location":"","page":"Home","title":"Home","text":"The parameters will be immediately updated to x .- newx̄, while nextstate is carried to the next iteration.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that the state is handled separately from the optimiser itself. This is a key design principle and allows users to manage their own state explicitly. It of course also makes it easier to store the state.","category":"page"},{"location":"#Usage-with-[Flux.jl](https://github.com/FluxML/Flux.jl)","page":"Home","title":"Usage with Flux.jl","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To apply such an optimiser to a whole model, setup builds a tree containing any initial state for every trainable array. Then at each step, update uses this and the gradient to adjust the model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"\nusing Flux, Metalhead, Zygote, Optimisers\n\nmodel = Metalhead.ResNet(18) |> gpu # define a model to train\nimage = rand(Float32, 224, 224, 3, 1) |> gpu; # dummy data\n@show sum(model(image)); # dummy loss function\n\nrule = Optimisers.Adam() # use the Adam optimiser with its default settings\nstate_tree = Optimisers.setup(rule, model); # initialise this optimiser's momentum etc.\n\n∇model, _ = gradient(model, image) do m, x # calculate the gradients\n sum(m(x))\nend;\n\nstate_tree, model = Optimisers.update(state_tree, model, ∇model);\n@show sum(model(image)); # reduced\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that a completely new instance of the model is returned. Internally, this is handled by Functors.jl, where we do a walk over the tree formed by the model and update the parameters using the gradients.","category":"page"},{"location":"","page":"Home","title":"Home","text":"There is also Optimisers.update! which similarly returns a new model, but is free to mutate arrays within the old one for efficiency. (The method of apply! above is likewise free to mutate arrays within its state; they are defensively copied when this rule is used with update.) For Adam(), there are two momenta per parameter, thus state is about twice the size of model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Base.summarysize(model) / 1024^2 # about 45MB\nBase.summarysize(state) / 1024^2 # about 90MB","category":"page"},{"location":"","page":"Home","title":"Home","text":"Optimisers.jl does not depend on any one automatic differentiation package, but for now the most likely source of gradients is Zygote.jl. Note that update always wants the gradient from Zygote's \"explicit\" mode, as shown above. 
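Written without the do block, the call in the example above is equivalent to this (the sum(m(x)) loss is just that example's dummy):","category":"page"},{"location":"","page":"Home","title":"Home","text":"# Explicit mode: differentiate with respect to the model object itself.\n∇model, ∇image = gradient((m, x) -> sum(m(x)), model, image);","category":"page"},{"location":"","page":"Home","title":"Home","text":"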
This ∇model is another tree structure, rather than the dictionary-like object from Zygote's \"implicit\" mode gradient(() -> loss(...), Flux.params(model)) – see Zygote's documentation for more about this difference.","category":"page"},{"location":"#Usage-with-[Lux.jl](https://github.com/avik-pal/Lux.jl)","page":"Home","title":"Usage with Lux.jl","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"The main design difference of Lux from Flux is that the tree of parameters is separate from the layer structure. It is these parameters which setup and update need to know about.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Lux describes this separation of parameter storage from model description as \"explicit\" parameters. Beware that it has nothing to do with Zygote's notion of \"explicit\" gradients. (If the same model is written in Flux and Lux, ∇model above and ∇params below will be nearly identical trees of nested NamedTuples.)","category":"page"},{"location":"","page":"Home","title":"Home","text":"\nusing Lux, Boltz, Zygote, Optimisers\n\nlux_model, params, lux_state = Boltz.resnet(:resnet18) |> gpu; # define and initialise model\nimages = rand(Float32, 224, 224, 3, 4) |> gpu; # batch of dummy data\ny, lux_state = Lux.apply(lux_model, images, params, lux_state); # run the model\n@show sum(y); # initial dummy loss\n\nrule = Optimisers.Adam()\nopt_state = Optimisers.setup(rule, params); # optimiser state based on model parameters\n\n(loss, lux_state), back = Zygote.pullback(params, images) do p, x\n y, st = Lux.apply(lux_model, x, p, lux_state)\n sum(y), st # return both the loss, and the updated lux_state\nend;\n∇params, _ = back((one.(loss), nothing)); # gradient of only the loss, with respect to parameter tree\nloss == sum(y) # not yet changed\n\nopt_state, params = Optimisers.update!(opt_state, params, ∇params);\n\ny, lux_state = Lux.apply(lux_model, images, params, lux_state);\n@show sum(y); # now reduced\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"Besides the parameters stored in params and gradually optimised, any other model state is stored in lux_state, and updated by Lux.apply. (In this example, BatchNorm has state.) This is completely unrelated to Optimisers.jl's state, although designed in a similar spirit.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Base.summarysize(lux_model) / 1024 # just 2KB\nBase.summarysize(params) / 1024^2 # about 45MB, same as Flux model\nBase.summarysize(lux_state) / 1024 # 40KB\nBase.summarysize(opt_state) / 1024^2 # about 90MB, with Adam","category":"page"},{"location":"","page":"Home","title":"Home","text":"If you are certain there is no model state, then the gradient calculation can be simplified to use Zygote.gradient instead of Zygote.pullback:","category":"page"},{"location":"","page":"Home","title":"Home","text":"∇params, _ = gradient(params, images) do p, x\n y, _ = Lux.apply(lux_model, x, p, lux_state) # discards new lux_state\n sum(y)\nend;","category":"page"},{"location":"#Non-trainable-Parameters","page":"Home","title":"Non-trainable Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Optimisers.jl uses Functors.jl to walk the structs making up the model, for which they must be annotated @functor Type. By default optimisation will alter all isnumeric arrays. 
","category":"page"},{"location":"","page":"Home","title":"Home","text":"If some arrays of a particular layer should not be treated this way, you can define a method for trainable","category":"page"},{"location":"","page":"Home","title":"Home","text":"struct Layer{T}\n alpha::T\n beta::T\n length::Int\nend\nLayer(n::Int) = Layer(randn(n), zeros(n), n)\n\nFunctors.@functor Layer\n\n# Both array fields will be, for example, moved to the GPU:\nFunctors.children(Layer(3)) # (alpha = [...], beta = [...], length)\n\nOptimisers.trainable(x::Layer) = (; alpha = x.alpha) # must be a subset of children\n\n# Only the first field will be optimised:\nst = Optimisers.setup(DecayDescent(0.1), Layer(3))","category":"page"},{"location":"#Frozen-Parameters","page":"Home","title":"Frozen Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To temporarily prevent training from affecting some parameters, use freeze! and thaw!. They work by mutating all Leafs of the state tree, or part of it.","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Flux, Optimisers\n\nx = randn(Float32, 28, 28, 1, 1);\nnet = @autosize (size(x)...,) Chain(\n Conv((3, 3), 1 => 3, stride=2, bias=false), Flux.flatten, Dense(_ => 2, relu),\n)\nopt = Optimisers.setup(Optimisers.Momentum(), net);\n\nnet.layers[3] isa Dense # now freeze this layer's parameters:\nOptimisers.freeze!(opt.layers[3])\nopt.layers[3].bias # confirm: Leaf(Momentum(...), [0.0, 0.0], frozen = true)\n\nOptimisers.update!(opt, net, gradient(m -> sum(m(x)), net)...);\n\nnet.layers[3].bias # stil zero, and its momentum is too:\n\nOptimisers.thaw!(opt)\nopt.layers[3].bias # Leaf(Momentum(...), [0.0, 0.0])","category":"page"},{"location":"#Adjusting-Hyperparameters","page":"Home","title":"Adjusting Hyperparameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To change the learning rate during training, use adjust!. This works much like freeze! by mutating the state tree, or part of it, without discarding the momenta. For the Flux model from just above:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Optimisers.adjust!(opt, 0.03) # change η for the whole model...\n\nOptimisers.adjust!(opt.layers[3], 0.04) # ... or just for one layer.","category":"page"},{"location":"","page":"Home","title":"Home","text":"To change other fields of the optimisation rule, it accepts keyword arguments:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Momentum |> fieldnames # (:eta, :rho)\n\nOptimisers.adjust!(opt, rho = 0.95) # change ρ for the whole model.","category":"page"},{"location":"#Tied-Parameters","page":"Home","title":"Tied Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If the same array appears twice (or more) in the model, Functors.jl should recognise this. Within Optimisers.jl, setup will initialise once, and use the same Leaf for both parameters. 
Then update will accumulate the gradient from both, and the updated model returned will have the tie maintained.","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Flux, Optimisers\n\nenc = Chain(Dense(40 => 20, tanh), Dense(20 => 10));\ndec = Chain(Dense(enc[1].weight', true, tanh), Dense(enc[2].weight', true, tanh));\nmodel = Chain(; enc, dec)\n\nst = Optimisers.setup(Optimisers.Adam(), model);\n\nst.layers.enc.layers[1].weight === st.layers.dec.layers[1].weight.parent # true","category":"page"},{"location":"","page":"Home","title":"Home","text":"This identification relies on ===, and will work for ordinary Arrays and CuArrays. It will not at present work for reshaped arrays, nor for immutable arrays such as those from StaticArrays.jl.","category":"page"},{"location":"#Obtaining-a-flat-parameter-vector","page":"Home","title":"Obtaining a flat parameter vector","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Instead of a nested tree-like structure, sometimes it is convenient to have all the parameters as one simple vector. Optimisers.jl contains a function destructure which creates this vector, and also creates a way to re-build the original structure with new parameters. Both flattening and re-building may be used within gradient calls.","category":"page"},{"location":"","page":"Home","title":"Home","text":"An example with Flux's model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using ForwardDiff # an example of a package which only likes one array\n\nmodel = Chain( # much smaller model example, as ForwardDiff is a slow algorithm here\n Conv((3, 3), 3 => 5, pad=1, bias=false), \n BatchNorm(5, relu), \n Conv((3, 3), 5 => 3, stride=16),\n )\nimage = rand(Float32, 224, 224, 3, 1);\n@show sum(model(image));\n\nflat, re = destructure(model)\nst = Optimisers.setup(rule, flat) # state is just one Leaf now\n\n∇flat = ForwardDiff.gradient(flat) do v\n m = re(v) # rebuild a new object like model\n sum(m(image)) # call that as before\nend\n\nst, flat = Optimisers.update(st, flat, ∇flat)\n@show sum(re(flat)(image));","category":"page"},{"location":"","page":"Home","title":"Home","text":"Here flat contains only the 283 trainable parameters, while the non-trainable ones are preserved inside re, an object of type Restructure. When defining new layers, these can be specified if necessary by overloading trainable. By default, all numeric arrays visible to Functors.jl are assumed to contain trainable parameters. Tied parameters (arrays appearing in different layers) are included only once in flat.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Lux stores only the trainable parameters in params. This can also be flattened to a plain Vector in the same way:","category":"page"},{"location":"","page":"Home","title":"Home","text":"params, lux_state = Lux.setup(Random.default_rng(), lux_model);\n\nflat, re = destructure(params)\n\n∇flat = ForwardDiff.gradient(flat) do v\n p = re(v) # rebuild an object like params\n y, _ = Lux.apply(lux_model, images, p, lux_state)\n sum(y)\nend","category":"page"},{"location":"#Collecting-all-trainable-parameters","page":"Home","title":"Collecting all trainable parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Sometimes it is useful to collect all trainable parameters in a model, similarly to what destructure does but without concatenating the arrays into a flat vector. 
This is done by trainables, which returns a list of arrays:","category":"page"},{"location":"","page":"Home","title":"Home","text":"julia> using Flux, Optimisers\n\njulia> model = Chain(Dense(2 => 3, tanh), BatchNorm(3), Dense(3 => 2));\n\njulia> trainables(model)\n6-element Vector{AbstractArray}:\n Float32[0.5756773 -0.1975264; 0.4723181 -0.7546912; -0.91631395 0.07392061]\n Float32[0.0, 0.0, 0.0]\n Float32[0.0, 0.0, 0.0]\n Float32[1.0, 1.0, 1.0]\n Float32[-0.8764882 0.40812716 0.1919528; -0.9123545 -0.4462516 0.6751252]\n Float32[0.0, 0.0]\n\njulia> l2reg(model) = sum([sum(abs2, p) for p in trainables(model)]);\n\njulia> g = gradient(l2reg, model)[1];","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that the BatchNorm layer has two trainable parameters, γ and β, which are included in the list, while the μ and σ² buffers are not.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Sometimes one wants to iterate over all trainable parameters in a model and the corresponding parameters of a matched structure such as a gradient or the moving average of the model. This can be done using trainables(model, path=true). For instance, here is how to update the parameters of a moving average model with the parameters of the model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"for (kp, p_avg) in trainables(model_avg, path=true)\n p = getkeypath(model, kp) \n p_avg .= 0.99 .* p_avg .+ 0.01 .* p\nend","category":"page"},{"location":"#Incomplete-or-nothing-gradients","page":"Home","title":"Incomplete or nothing gradients","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If the gradient is not available for some parameters, or branches of the model, update will not take an optimisation step for those parameters. This is the case when the gradient is nothing or a subtype of ChainRules.AbstractZero.","category":"page"},{"location":"","page":"Home","title":"Home","text":"For stateful optimisers, skipping an update is generally not the same as updating with a zero gradient. 
For example, in the case of Adam, the momentum and variance are updated even if the gradient is zero:","category":"page"},{"location":"","page":"Home","title":"Home","text":"julia> x = (a = ones(2), b = ones(2))\n(a = [1.0, 1.0], b = [1.0, 1.0])\n\njulia> opt_state = Optimisers.setup(Adam(0.1), x)\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))))\n\njulia> g = (; a = ones(2), b = ones(2)); # First an update with a non-zero gradient to increase the momentum and variance\n\njulia> Optimisers.update!(opt_state, x, g);\n\njulia> opt_state # the states in `a` and `b` are the same\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))\n\njulia> g = (; a = zeros(2), b = nothing); # Now an update with a zero gradient for a and no gradient for b\n\njulia> Optimisers.update!(opt_state, x, g);\n\njulia> opt_state # the states in `a` and `b` differ\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))","category":"page"}] +[{"location":"api/","page":"API","title":"API","text":"CollapsedDocStrings = true","category":"page"},{"location":"api/#Optimisation-Rules","page":"API","title":"Optimisation Rules","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.Descent\nOptimisers.Momentum\nOptimisers.Nesterov\nOptimisers.Rprop\nOptimisers.RMSProp\nOptimisers.Adam\nOptimisers.RAdam\nOptimisers.AdaMax\nOptimisers.OAdam\nOptimisers.AdaGrad\nOptimisers.AdaDelta\nOptimisers.AMSGrad\nOptimisers.NAdam\nOptimisers.AdamW\nOptimisers.AdaBelief\nOptimisers.Lion","category":"page"},{"location":"api/#Optimisers.Descent","page":"API","title":"Optimisers.Descent","text":"Descent(η = 1f-1)\nDescent(; [eta])\n\nClassic gradient descent optimiser with learning rate η. For each parameter p and its gradient dp, this runs p -= η*dp.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Momentum","page":"API","title":"Optimisers.Momentum","text":"Momentum(η = 0.01, ρ = 0.9)\nMomentum(; [eta, rho])\n\nGradient descent optimizer with learning rate η and momentum ρ.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMomentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Nesterov","page":"API","title":"Optimisers.Nesterov","text":"Nesterov(η = 0.001, ρ = 0.9)\nNesterov(; [eta, rho])\n\nGradient descent optimizer with learning rate η and Nesterov momentum ρ.\n\nParameters\n\nLearning rate (η): Amount by which gradients are discounted before updating the weights.\nNesterov momentum (ρ): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Rprop","page":"API","title":"Optimisers.Rprop","text":"Rprop(η = 1f-3, ℓ = (5f-1, 1.2f0), Γ = (1f-6, 50f0))\nRprop(; [eta, ell, gamma])\n\nOptimizer using the Rprop algorithm. 
A full-batch learning algorithm that depends only on the sign of the gradient.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nScaling factors (ℓ::Tuple == ell): Multiplicative increase and decrease factors.\nStep sizes (Γ::Tuple == gamma): Minimal and maximal allowed step sizes.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.RMSProp","page":"API","title":"Optimisers.RMSProp","text":"RMSProp(η = 0.001, ρ = 0.9, ϵ = 1e-8; centred = false)\nRMSProp(; [eta, rho, epsilon, centred])\n\nOptimizer using the RMSProp algorithm. Often a good choice for recurrent networks. Parameters other than learning rate generally don't need tuning.\n\nCentred RMSProp is a variant which normalises gradients by an estimate of their variance, instead of their second moment.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMomentum (ρ == rho): Controls the acceleration of gradient descent in the prominent direction, in effect dampening oscillations.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\nKeyword centred (or centered): Indicates whether to use centred variant of the algorithm.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Adam","page":"API","title":"Optimisers.Adam","text":"Adam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAdam(; [eta, beta, epsilon])\n\nAdam optimiser.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.RAdam","page":"API","title":"Optimisers.RAdam","text":"RAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nRAdam(; [eta, beta, epsilon])\n\nRectified Adam optimizer.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaMax","page":"API","title":"Optimisers.AdaMax","text":"AdaMax(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAdaMax(; [eta, beta, epsilon])\n\nAdaMax is a variant of Adam based on the ∞-norm.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.OAdam","page":"API","title":"Optimisers.OAdam","text":"OAdam(η = 0.001, β = (0.5, 0.9), ϵ = 1e-8)\nOAdam(; [eta, beta, epsilon])\n\nOAdam (Optimistic Adam) is a variant of Adam adding an \"optimistic\" term suitable for adversarial training.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaGrad","page":"API","title":"Optimisers.AdaGrad","text":"AdaGrad(η = 0.1, ϵ = 1e-8)\nAdaGrad(; [eta, epsilon])\n\nAdaGrad optimizer. It has parameter-specific learning rates based on how frequently each parameter is updated. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaDelta","page":"API","title":"Optimisers.AdaDelta","text":"AdaDelta(ρ = 0.9, ϵ = 1e-8)\nAdaDelta(; [rho, epsilon])\n\nAdaDelta is a version of AdaGrad adapting its learning rate based on a window of past gradient updates. Parameters don't need tuning.\n\nParameters\n\nRho (ρ == rho): Factor by which the gradient is decayed at each time step.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AMSGrad","page":"API","title":"Optimisers.AMSGrad","text":"AMSGrad(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nAMSGrad(; [eta, beta, epsilon])\n\nThe AMSGrad version of the Adam optimiser. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.NAdam","page":"API","title":"Optimisers.NAdam","text":"NAdam(η = 0.001, β = (0.9, 0.999), ϵ = 1e-8)\nNAdam(; [eta, beta, epsilon])\n\nNAdam is a Nesterov variant of Adam. Parameters don't need tuning.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdamW","page":"API","title":"Optimisers.AdamW","text":"AdamW(η = 0.001, β = (0.9, 0.999), λ = 0, ϵ = 1e-8; couple = true)\nAdamW(; [eta, beta, lambda, epsilon, couple])\n\nAdamW is a variant of Adam fixing (as in repairing) its weight decay regularization. Implemented as an OptimiserChain of Adam and WeightDecay.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nWeight decay (λ == lambda): Controls the strength of L_2 regularisation.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\nKeyword couple: If true, the weight decay is coupled with the learning rate, as in pytorch's AdamW. This corresponds to an update of the form x = x - η * (dx + λ * x), where dx is the update from Adam with learning rate 1. If false, the weight decay is decoupled from the learning rate, in the spirit of the original paper. This corresponds to an update of the form x = x - η * dx - λ * x. Default is true.\n\nwarning: Breaking change in v0.4\nWith version 0.4 the default update rule for AdamW has changed to match the pytorch implementation. The previous rule, which is closer to the original paper, can be obtained by setting AdamW(..., couple=false). See this issue for more details.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.AdaBelief","page":"API","title":"Optimisers.AdaBelief","text":"AdaBelief(η = 0.001, β = (0.9, 0.999), ϵ = 1e-16)\nAdaBelief(; [eta, beta, epsilon])\n\nThe AdaBelief optimiser is a variant of the well-known Adam optimiser.\n\nParameters\n\nLearning rate (η == eta): Amount by which gradients are discounted before updating the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\nMachine epsilon (ϵ == epsilon): Constant to prevent division by zero (no need to change default)\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.Lion","page":"API","title":"Optimisers.Lion","text":"Lion(η = 0.001, β = (0.9, 0.999))\nLion(; [eta, beta])\n\nLion optimiser.\n\nParameters\n\nLearning rate (η == eta): Magnitude by which gradients update the weights.\nDecay of momentums (β::Tuple == beta): Exponential decay for the first (β1) and the second (β2) momentum estimate.\n\n\n\n\n\n","category":"type"},{"location":"api/","page":"API","title":"API","text":"In addition to the main course, you may wish to order some of these condiments:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.AccumGrad\nOptimisers.ClipGrad\nOptimisers.ClipNorm\nOptimisers.SignDecay\nOptimisers.WeightDecay\nOptimisers.OptimiserChain","category":"page"},{"location":"api/#Optimisers.AccumGrad","page":"API","title":"Optimisers.AccumGrad","text":"AccumGrad(n::Int)\n\nA rule constructed as OptimiserChain(AccumGrad(n), Rule()) will accumulate gradients for n steps, before applying Rule to the mean of these n gradients.\n\nThis is useful for training with effective batch sizes too large for the available memory. 
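For instance (numbers purely illustrative), a target batch size of 32 can be approximated with n = 4 accumulation steps over batches of 8 samples each. 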
Instead of computing the gradient for batch size b at once, compute it for size b/n and accumulate n such gradients.\n\nExample\n\njulia> m = (x=[1f0], y=[2f0]);\n\njulia> r = OptimiserChain(AccumGrad(2), WeightDecay(0.01), Descent(0.1));\n\njulia> s = Optimisers.setup(r, m);\n\njulia> Optimisers.update!(s, m, (x=[33], y=[0]));\n\njulia> m # model not yet changed\n(x = Float32[1.0], y = Float32[2.0])\n\njulia> Optimisers.update!(s, m, (x=[0], y=[444]));\n\njulia> m # n=2 gradients applied at once\n(x = Float32[-0.651], y = Float32[-20.202002])\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.ClipGrad","page":"API","title":"Optimisers.ClipGrad","text":"ClipGrad(δ = 10)\nClipGrad(; [delta])\n\nRestricts every gradient component to obey -δ ≤ dx[i] ≤ δ.\n\nTypically composed with other rules using OptimiserChain.\n\nSee also ClipNorm.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.ClipNorm","page":"API","title":"Optimisers.ClipNorm","text":"ClipNorm(ω = 10, p = 2; throw = true)\n\nScales any gradient array for which norm(dx, p) > ω to stay at this threshold (unless p==0).\n\nThrows an error if the norm is infinite or NaN, which you can turn off with throw = false.\n\nTypically composed with other rules using OptimiserChain.\n\nSee also ClipGrad.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.SignDecay","page":"API","title":"Optimisers.SignDecay","text":"SignDecay(λ = 1e-3)\nSignDecay(; [lambda])\n\nImplements L_1 regularisation, also known as LASSO regression, when composed with other rules as the first transformation in an OptimiserChain.\n\nIt does this by adding λ .* sign(x) to the gradient. This is equivalent to adding λ * sum(abs, x) == λ * norm(x, 1) to the loss.\n\nSee also [WeightDecay] for L_2 normalisation. They can be used together: OptimiserChain(SignDecay(0.012), WeightDecay(0.034), Adam()) is equivalent to adding 0.012 * norm(x, 1) + 0.017 * norm(x, 2)^2 to the loss function.\n\nParameters\n\nPenalty (λ ≥ 0): Controls the strength of the regularisation.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.WeightDecay","page":"API","title":"Optimisers.WeightDecay","text":"WeightDecay(λ = 5e-4)\nWeightDecay(; [lambda])\n\nImplements L_2 regularisation, also known as ridge regression, when composed with other rules as the first transformation in an OptimiserChain.\n\nIt does this by adding λ .* x to the gradient. This is equivalent to adding λ/2 * sum(abs2, x) == λ/2 * norm(x)^2 to the loss.\n\nSee also [SignDecay] for L_1 normalisation.\n\nParameters\n\nPenalty (λ ≥ 0): Controls the strength of the regularisation.\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.OptimiserChain","page":"API","title":"Optimisers.OptimiserChain","text":"OptimiserChain(opts...)\n\nCompose a sequence of optimisers so that each opt in opts updates the gradient, in the order specified.\n\nWith an empty sequence, OptimiserChain() is the identity, so update! will subtract the full gradient from the parameters. 
This is equivalent to Descent(1).\n\nExample\n\njulia> o = OptimiserChain(ClipGrad(1.0), Descent(0.1));\n\njulia> m = (zeros(3),);\n\njulia> s = Optimisers.setup(o, m)\n(Leaf(OptimiserChain(ClipGrad(1.0), Descent(0.1)), (nothing, nothing)),)\n\njulia> Optimisers.update(s, m, ([0.3, 1, 7],))[2] # clips before discounting\n([-0.03, -0.1, -0.1],)\n\n\n\n\n\n","category":"type"},{"location":"api/#Model-Interface","page":"API","title":"Model Interface","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.setup\nOptimisers.update\nOptimisers.update!\nOptimisers.adjust!\nOptimisers.adjust(::Any, ::Real)\nOptimisers.freeze!\nOptimisers.thaw!","category":"page"},{"location":"api/#Optimisers.setup","page":"API","title":"Optimisers.setup","text":"Optimisers.setup(rule, model) -> state_tree\n\nInitialises the given optimiser for every trainable parameter within the model. Returns a tree of the relevant states, which must be passed to update or update!.\n\nExample\n\njulia> m = (x = rand(3), y = (true, false), z = tanh);\n\njulia> Optimisers.setup(Momentum(), m) # same field names as m\n(x = Leaf(Momentum(0.01, 0.9), [0.0, 0.0, 0.0]), y = ((), ()), z = ())\n\nThe recursion into structures uses Functors.jl, and any new structs containing parameters need to be marked with Functors.@functor before use. See the Flux docs for more about this.\n\njulia> struct Layer; mat; fun; end\n\njulia> model = (lay = Layer([1 2; 3 4f0], sin), vec = [5, 6f0]);\n\njulia> Optimisers.setup(Momentum(), model) # new struct is by default ignored\n(lay = (), vec = Leaf(Momentum(0.01, 0.9), Float32[0.0, 0.0]))\n\njulia> destructure(model)\n(Float32[5.0, 6.0], Restructure(NamedTuple, ..., 2))\n\njulia> using Functors; @functor Layer # annotate this type as containing parameters\n\njulia> Optimisers.setup(Momentum(), model)\n(lay = (mat = Leaf(Momentum(0.01, 0.9), Float32[0.0 0.0; 0.0 0.0]), fun = ()), vec = Leaf(Momentum(0.01, 0.9), Float32[0.0, 0.0]))\n\njulia> destructure(model)\n(Float32[1.0, 3.0, 2.0, 4.0, 5.0, 6.0], Restructure(NamedTuple, ..., 6))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.update","page":"API","title":"Optimisers.update","text":"Optimisers.update(tree, model, gradient) -> (tree, model)\n\nUses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.\n\nSee also update!, which will be faster for models of ordinary Arrays or CuArrays.\n\nExample\n\njulia> m = (x = Float32[1,2,3], y = tanh);\n\njulia> t = Optimisers.setup(Descent(0.1), m)\n(x = Leaf(Descent(0.1), nothing), y = ())\n\njulia> g = (x = [1,1,1], y = nothing); # fake gradient\n\njulia> Optimisers.update(t, m, g)\n((x = Leaf(Descent(0.1), nothing), y = ()), (x = Float32[0.9, 1.9, 2.9], y = tanh))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.update!","page":"API","title":"Optimisers.update!","text":"Optimisers.update!(tree, model, gradient) -> (tree, model)\n\nUses the optimiser and the gradient to change the trainable parameters in the model. Returns the improved model, and the optimiser states needed for the next update. The initial tree of states comes from setup.\n\nThis is used in exactly the same manner as update, but because it may mutate arrays within the old model (and the old state), it will be faster for models of ordinary Arrays or CuArrays. 
However, you should not rely on the old model being fully updated but rather use the returned model. (The original state tree is always mutated, as each Leaf is mutable.)\n\nExample\n\njulia> using StaticArrays, Zygote, Optimisers\n\njulia> m = (x = [1f0, 2f0], y = SA[4f0, 5f0]); # partly mutable model\n\njulia> t = Optimisers.setup(Momentum(1/30, 0.9), m) # tree of states\n(x = Leaf(Momentum(0.0333333, 0.9), Float32[0.0, 0.0]), y = Leaf(Momentum(0.0333333, 0.9), Float32[0.0, 0.0]))\n\njulia> g = gradient(m -> sum(abs2.(m.x .+ m.y)), m)[1] # structural gradient\n(x = Float32[10.0, 14.0], y = Float32[10.0, 14.0])\n\njulia> t2, m2 = Optimisers.update!(t, m, g);\n\njulia> m2 # after update or update!, this is the new model\n(x = Float32[0.6666666, 1.5333333], y = Float32[3.6666667, 4.5333333])\n\njulia> m2.x === m.x # update! has re-used this array, for efficiency\ntrue\n\njulia> m # original should be discarded, may be mutated but no guarantee\n(x = Float32[0.6666666, 1.5333333], y = Float32[4.0, 5.0])\n\njulia> t == t2 # original state tree is guaranteed to be mutated\ntrue\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.adjust!","page":"API","title":"Optimisers.adjust!","text":"Optimisers.adjust!(tree, η)\n\nAlters the state tree = setup(rule, model) to change the parameters of the optimisation rule, without destroying its stored state. Typically used mid-way through training.\n\nCan be applied to part of a model, by acting only on the corresponding part of the state tree.\n\nTo change just the learning rate, provide a number η::Real.\n\nExample\n\njulia> m = (vec = rand(Float32, 2), fun = sin);\n\njulia> st = Optimisers.setup(Nesterov(), m) # stored momentum is initialised to zero\n(vec = Leaf(Nesterov(0.001, 0.9), Float32[0.0, 0.0]), fun = ())\n\njulia> st, m = Optimisers.update(st, m, (vec = [16, 88], fun = nothing)); # with fake gradient\n\njulia> st\n(vec = Leaf(Nesterov(0.001, 0.9), Float32[-0.016, -0.088]), fun = ())\n\njulia> Optimisers.adjust!(st, 0.123) # change learning rate, stored momentum untouched\n\njulia> st\n(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())\n\nTo change other parameters, adjust! also accepts keyword arguments matching the field names of the optimisation rule's type.\n\njulia> fieldnames(Adam)\n(:eta, :beta, :epsilon)\n\njulia> st2 = Optimisers.setup(OptimiserChain(ClipGrad(), Adam()), m)\n(vec = Leaf(OptimiserChain(ClipGrad(10.0), Adam(0.001, (0.9, 0.999), 1.0e-8)), (nothing, (Float32[0.0, 0.0], Float32[0.0, 0.0], (0.9, 0.999)))), fun = ())\n\njulia> Optimisers.adjust(st2; beta = (0.777, 0.909), delta = 11.1) # delta acts on ClipGrad\n(vec = Leaf(OptimiserChain(ClipGrad(11.1), Adam(0.001, (0.777, 0.909), 1.0e-8)), (nothing, (Float32[0.0, 0.0], Float32[0.0, 0.0], (0.9, 0.999)))), fun = ())\n\njulia> Optimisers.adjust(st; beta = \"no such field\") # silently ignored!\n(vec = Leaf(Nesterov(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.adjust-Tuple{Any, Real}","page":"API","title":"Optimisers.adjust","text":"adjust(tree, η) -> tree\n\nLike adjust!, but returns a new tree instead of mutating the old one.\n\n\n\n\n\n","category":"method"},{"location":"api/#Optimisers.freeze!","page":"API","title":"Optimisers.freeze!","text":"Optimisers.freeze!(tree)\n\nTemporarily alters the state tree = setup(rule, model) so that parameters will not be updated. 
Undone by thaw!.\n\nCan be applied to the state corresponding to only part of a model; for instance, with model::Chain, to freeze model.layers[1] you should call freeze!(tree.layers[1]).\n\nExample\n\njulia> m = (x = ([1.0], 2.0), y = [3.0]);\n\njulia> s = Optimisers.setup(Momentum(), m);\n\njulia> Optimisers.freeze!(s.x)\n\njulia> Optimisers.update!(s, m, (x = ([pi], 10pi), y = [100pi])); # with fake gradient\n\njulia> m\n(x = ([1.0], 2.0), y = [-0.14159265358979312])\n\njulia> s\n(x = (Leaf(Momentum(0.01, 0.9), [0.0], frozen = true), ()), y = Leaf(Momentum(0.01, 0.9), [3.14159]))\n\njulia> Optimisers.thaw!(s)\n\njulia> s.x\n(Leaf(Momentum(0.01, 0.9), [0.0]), ())\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.thaw!","page":"API","title":"Optimisers.thaw!","text":"Optimisers.thaw!(tree)\n\nThe reverse of freeze!. Applies to all parameters, mutating every Leaf(rule, state, frozen = true) to Leaf(rule, state, frozen = false).\n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API","title":"API","text":"Calling Functors.@functor on your model's layer types by default causes these functions to recurse into all children, and ultimately optimise all isnumeric leaf nodes. To further restrict this by ignoring some fields of a layer type, define trainable:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.trainable\nOptimisers.isnumeric\nOptimisers.maywrite","category":"page"},{"location":"api/#Optimisers.trainable","page":"API","title":"Optimisers.trainable","text":"trainable(x::Layer) -> NamedTuple\n\nThis may be overloaded to make optimisers ignore some fields of every Layer, which would otherwise contain trainable parameters.\n\nwarning: Warning\nThis is very rarely required. Fields of struct Layer which contain functions, or integers like sizes, are always ignored anyway. Overloading trainable is only necessary when some arrays of numbers are to be optimised, and some arrays of numbers are not.\n\nThe default is Functors.children(x), usually a NamedTuple of all fields, and trainable(x) must contain a subset of these.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.isnumeric","page":"API","title":"Optimisers.isnumeric","text":"isnumeric(x) -> Bool\n\nReturns true on any parameter to be adjusted by Optimisers.jl, namely arrays of non-integer numbers. Returns false on all other types.\n\nRequires also that Functors.isleaf(x) == true, to focus on e.g. the parent of a transposed matrix, not the wrapper.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.maywrite","page":"API","title":"Optimisers.maywrite","text":"maywrite(x) -> Bool\n\nShould return true if we are completely sure that update! can write new values into x. Otherwise false, indicating a non-mutating path. For now, this is simply x isa DenseArray, allowing Array, CuArray, etc. \n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API","title":"API","text":"Such restrictions are also obeyed by this function for flattening a model:","category":"page"},{"location":"api/","page":"API","title":"API","text":"Optimisers.destructure\nOptimisers.Restructure\nOptimisers.trainables","category":"page"},{"location":"api/#Optimisers.destructure","page":"API","title":"Optimisers.destructure","text":"destructure(model) -> vector, reconstructor\n\nCopies all trainable, isnumeric parameters in the model to a vector, and returns also a function which reverses this transformation. 
It is differentiable.\n\nExample\n\njulia> v, re = destructure((x=[1.0, 2.0], y=(sin, [3.0 + 4.0im])))\n(ComplexF64[1.0 + 0.0im, 2.0 + 0.0im, 3.0 + 4.0im], Restructure(NamedTuple, ..., 3))\n\njulia> re([3, 5, 7+11im])\n(x = [3.0, 5.0], y = (sin, ComplexF64[7.0 + 11.0im]))\n\nIf model contains various number types, they are promoted to make vector, and are usually restored by Restructure. Such restoration follows the rules of ChainRulesCore.ProjectTo, and thus will restore floating point precision, but will permit more exotic numbers like ForwardDiff.Dual.\n\nIf model contains only GPU arrays, then vector will also live on the GPU. At present, a mixture of GPU and ordinary CPU arrays is undefined behaviour.\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.Restructure","page":"API","title":"Optimisers.Restructure","text":"Restructure(Model, ..., length)\n\nThis is what destructure returns, and re(p) will re-build the model with new parameters from vector p. If the model is callable, then re(x, p) == re(p)(x).\n\nExample\n\njulia> using Flux, Optimisers\n\njulia> _, re = destructure(Dense([1 2; 3 4], [0, 0], sigmoid))\n([1, 3, 2, 4, 0, 0], Restructure(Dense, ..., 6))\n\njulia> m = re(-4:1)\nDense(2, 2, σ) # 6 parameters\n\njulia> m([0.2, 0.3]) ≈ re([0.2, 0.3], -4:1)\ntrue\n\n\n\n\n\n","category":"type"},{"location":"api/#Optimisers.trainables","page":"API","title":"Optimisers.trainables","text":"trainables(x, path = false)\n\nReturn an iterable over all the trainable parameters in x, that is all the numerical arrays (see isnumeric) which are reachable through trainable.\n\nParameters appearing multiple times in the model (tied weights) will be present only once in the output.\n\nIf path = false, the output is a list of numerical arrays.\n\nIf path = true, the output is a list of (KeyPath, AbstractArray) pairs, where KeyPath is a type representing the path to the array in the original structure.\n\nSee also destructure for a similar operation that returns a single flat vector instead.\n\nExamples\n\njulia> struct MyLayer\n w\n b\n end\n\njulia> Functors.@functor MyLayer\n\njulia> Optimisers.trainable(x::MyLayer) = (; w = x.w,) # only w is trainable in this example\n\njulia> x = MyLayer([1.0,2.0,3.0], [4.0,5.0,6.0]);\n\njulia> trainables(x)\n1-element Vector{AbstractArray}:\n [1.0, 2.0, 3.0]\n\njulia> x = MyLayer((a=[1.0,2.0], b=[3.0]), [4.0,5.0,6.0]);\n\njulia> trainables(x) # collects nested parameters\n2-element Vector{AbstractArray}:\n [1.0, 2.0]\n [3.0]\n\njulia> x = (a = [1.0,2.0], b = (Dict(\"c\" => [3.0, 4.0], \"d\" => 5.0), [6.0,7.0]));\n\njulia> for (kp, y) in trainables(x, path = true)\n println(kp, \" => \", y)\n end\nKeyPath(:a,) => [1.0, 2.0]\nKeyPath(:b, 1, \"c\") => [3.0, 4.0]\nKeyPath(:b, 2) => [6.0, 7.0]\n\njulia> getkeypath(x, KeyPath(:b, 1, \"c\"))\n2-element Vector{Float64}:\n 3.0\n 4.0\n\n\n\n\n\n","category":"function"},{"location":"api/#Rule-Definition","page":"API","title":"Rule Definition","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"Optimisers.apply!\nOptimisers.init\nOptimisers.@..\nOptimisers.@lazy\nOptimisers.adjust(::AbstractRule, ::Real)\nOptimisers.@def","category":"page"},{"location":"api/#Optimisers.apply!","page":"API","title":"Optimisers.apply!","text":"Optimisers.apply!(rule::RuleType, state, parameters, gradient) -> (state, gradient)\n\nThis defines the action of any optimisation rule. 
It should return the modified gradient which will be subtracted from the parameters, and the updated state (if any) for use at the next iteration, as a tuple (state, gradient).\n\nFor efficiency it is free to mutate the old state, but only what is returned will be used. Ideally this should check maywrite(x), which the built-in rules do via @...\n\nThe initial state is init(rule::RuleType, parameters).\n\nExample\n\njulia> Optimisers.init(Descent(0.1), Float32[1,2,3]) === nothing\ntrue\n\njulia> Optimisers.apply!(Descent(0.1), nothing, Float32[1,2,3], [4,5,6])\n(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1f0)))\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.init","page":"API","title":"Optimisers.init","text":"Optimisers.init(rule::RuleType, parameters) -> state\n\nSets up the initial state for a given optimisation rule, and an array of parameters. This and apply! are the two functions which any new optimisation rule must define.\n\nExamples\n\njulia> Optimisers.init(Descent(), Float32[1,2,3]) # is `nothing`\n\njulia> Optimisers.init(Momentum(), [1.0, 2.0])\n2-element Vector{Float64}:\n 0.0\n 0.0\n\n\n\n\n\n","category":"function"},{"location":"api/#Optimisers.@..","page":"API","title":"Optimisers.@..","text":"@.. x = y + z\n\nSometimes in-place broadcasting macro, for use in apply! rules. If maywrite(x) then it is just @. x = rhs, but if not, it becomes x = @. rhs.\n\n\n\n\n\n","category":"macro"},{"location":"api/#Optimisers.@lazy","page":"API","title":"Optimisers.@lazy","text":"x = @lazy y + z\n\nLazy broadcasting macro, for use in apply! rules. It broadcasts like @. but does not materialise, returning a Broadcasted object for later use. Beware that mutation of arguments will affect the result, and that if it is used in two places, work will be done twice.\n\n\n\n\n\n","category":"macro"},{"location":"api/#Optimisers.adjust-Tuple{AbstractRule, Real}","page":"API","title":"Optimisers.adjust","text":"Optimisers.adjust(rule::RuleType, η::Real) -> rule\n\nIf a new optimisation rule has a learning rate which is not stored in field rule.eta, then you should add a method to adjust. (But it is simpler to just use the standard name.)\n\n\n\n\n\n","category":"method"},{"location":"api/#Optimisers.@def","page":"API","title":"Optimisers.@def","text":"@def struct Rule; eta = 0.1; beta = (0.7, 0.8); end\n\nHelper macro for defining rules with default values. The types of the literal values are used in the struct, like this:\n\nstruct Rule\n eta::Float64\n beta::Tuple{Float64, Float64}\n Rule(eta, beta = (0.7, 0.8)) = eta < 0 ? error() : new(eta, beta)\n Rule(; eta = 0.1, beta = (0.7, 0.8)) = Rule(eta, beta)\nend\n\nAny field called eta is assumed to be a learning rate, and cannot be negative.\n\n\n\n\n\n","category":"macro"},{"location":"api/#KeyPath","page":"API","title":"KeyPath","text":"","category":"section"},{"location":"api/","page":"API","title":"API","text":"A KeyPath is a sequence of keys that can be used to access a value within a nested structure. It is defined in Functors.jl and re-exported by Optimisers.jl here for convenience.","category":"page"},{"location":"api/","page":"API","title":"API","text":"Functors.KeyPath\nFunctors.haskeypath\nFunctors.getkeypath\nFunctors.setkeypath!","category":"page"},{"location":"api/#Functors.KeyPath","page":"API","title":"Functors.KeyPath","text":"KeyPath(keys...)\n\nA type for representing a path of keys to a value in a nested structure. 
Can be constructed with a sequence of keys, or by concatenating other KeyPaths. Keys can be of type Symbol, String, or Int.\n\nFor custom types, access through symbol keys is assumed to be done with getproperty. For consistency, the method Base.propertynames is used to get the viable property names.\n\nFor string and integer keys instead, the access is done with getindex.\n\nSee also getkeypath, haskeypath.\n\nExamples\n\njulia> kp = KeyPath(:b, 3)\nKeyPath(:b, 3)\n\njulia> KeyPath(:a, kp, :c, 4) # construct mixing keys and keypaths\nKeyPath(:a, :b, 3, :c, 4)\n\njulia> struct T\n a\n b\n end\n\njulia> function Base.getproperty(x::T, k::Symbol)\n if k in fieldnames(T)\n return getfield(x, k)\n elseif k === :ab\n return \"ab\"\n else \n error()\n end\n end;\n\njulia> Base.propertynames(::T) = (:a, :b, :ab);\n\njulia> x = T(3, Dict(:c => 4, :d => 5));\n\njulia> getkeypath(x, KeyPath(:ab)) # equivalent to x.ab\n\"ab\"\n\njulia> getkeypath(x, KeyPath(:b, :c)) # equivalent to (x.b)[:c]\n4\n\n\n\n\n\n","category":"type"},{"location":"api/#Functors.haskeypath","page":"API","title":"Functors.haskeypath","text":"haskeypath(x, kp::KeyPath)\n\nReturn true if x has a value at the path kp.\n\nSee also KeyPath, getkeypath, and setkeypath!.\n\nExamples\n\njulia> x = Dict(:a => 3, :b => Dict(:c => 4, \"d\" => [5, 6, 7]))\nDict{Symbol, Any} with 2 entries:\n :a => 3\n :b => Dict{Any, Any}(:c=>4, \"d\"=>[5, 6, 7])\n\njulia> haskeypath(x, KeyPath(:a))\ntrue\n\njulia> haskeypath(x, KeyPath(:b, \"d\", 1))\ntrue\n\njulia> haskeypath(x, KeyPath(:b, \"d\", 4))\nfalse\n\n\n\n\n\n","category":"function"},{"location":"api/#Functors.getkeypath","page":"API","title":"Functors.getkeypath","text":"getkeypath(x, kp::KeyPath)\n\nReturn the value in x at the path kp.\n\nSee also KeyPath, haskeypath, and setkeypath!.\n\nExamples\n\njulia> x = Dict(:a => 3, :b => Dict(:c => 4, \"d\" => [5, 6, 7]))\nDict{Symbol, Any} with 2 entries:\n :a => 3\n :b => Dict{Any, Any}(:c=>4, \"d\"=>[5, 6, 7])\n\njulia> getkeypath(x, KeyPath(:b, \"d\", 2))\n6\n\n\n\n\n\n","category":"function"},{"location":"api/#Functors.setkeypath!","page":"API","title":"Functors.setkeypath!","text":"setkeypath!(x, kp::KeyPath, v)\n\nSet the value in x at the path kp to v.\n\nSee also KeyPath, getkeypath, and haskeypath.\n\n\n\n\n\n","category":"function"},{"location":"#Optimisers.jl","page":"Home","title":"Optimisers.jl","text":"","category":"section"},{"location":"#An-optimisation-rule","page":"Home","title":"An optimisation rule","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"A new optimiser must overload two functions, apply! and init. 
These act on one array of parameters:","category":"page"},{"location":"","page":"Home","title":"Home","text":"# Define a container to hold any optimiser-specific parameters (if any):\nstruct DecayDescent <: Optimisers.AbstractRule\n eta::Float64\nend\n\n# Define an `apply!` rule which encodes how the gradients will be used to\n# update the parameters:\nfunction Optimisers.apply!(o::DecayDescent, state, x, x̄)\n T = eltype(x)\n newx̄ = T(o.eta / √state) .* x̄\n nextstate = state + 1\n return nextstate, newx̄\nend\n\n# Define the function which sets up the initial state (if any):\nOptimisers.init(o::DecayDescent, x::AbstractArray) = 1","category":"page"},{"location":"","page":"Home","title":"Home","text":"The parameters will be immediately updated to x .- newx̄, while nextstate is carried to the next iteration.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that the state is handled separately from the optimiser itself. This is a key design principle and allows users to manage their own state explicitly. It of course also makes it easier to store the state.","category":"page"},{"location":"#Usage-with-[Flux.jl](https://github.com/FluxML/Flux.jl)","page":"Home","title":"Usage with Flux.jl","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To apply such an optimiser to a whole model, setup builds a tree containing any initial state for every trainable array. Then at each step, update uses this and the gradient to adjust the model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"\nusing Flux, Metalhead, Zygote, Optimisers\n\nmodel = Metalhead.ResNet(18) |> gpu # define a model to train\nimage = rand(Float32, 224, 224, 3, 1) |> gpu; # dummy data\n@show sum(model(image)); # dummy loss function\n\nrule = Optimisers.Adam() # use the Adam optimiser with its default settings\nstate_tree = Optimisers.setup(rule, model); # initialise this optimiser's momentum etc.\n\n∇model, _ = gradient(model, image) do m, x # calculate the gradients\n sum(m(x))\nend;\n\nstate_tree, model = Optimisers.update(state_tree, model, ∇model);\n@show sum(model(image)); # reduced\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that a completely new instance of the model is returned. Internally, this is handled by Functors.jl, where we do a walk over the tree formed by the model and update the parameters using the gradients.","category":"page"},{"location":"","page":"Home","title":"Home","text":"There is also Optimisers.update! which similarly returns a new model, but is free to mutate arrays within the old one for efficiency. (The method of apply! above is likewise free to mutate arrays within its state; they are defensively copied when this rule is used with update.) For Adam(), there are two momenta per parameter, thus state_tree is about twice the size of model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Base.summarysize(model) / 1024^2 # about 45MB\nBase.summarysize(state_tree) / 1024^2 # about 90MB","category":"page"},{"location":"","page":"Home","title":"Home","text":"Optimisers.jl does not depend on any one automatic differentiation package, but for now the most likely source of gradients is Zygote.jl. Note that update always wants the gradient from Zygote's \"explicit\" mode, as shown above. 
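Without the do-block syntax, the same explicit-mode gradient can be written in one line (equivalent to the example above):\n\n∇model, _ = gradient((m, x) -> sum(m(x)), model, image); # gradient with respect to the model itself\n\n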
This ∇model is another tree structure, rather than the dictionary-like object from Zygote's \"implicit\" mode gradient(() -> loss(...), Flux.params(model)) – see Zygote's documentation for more about this difference.","category":"page"},{"location":"#Usage-with-[Lux.jl](https://github.com/avik-pal/Lux.jl)","page":"Home","title":"Usage with Lux.jl","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"The main design difference of Lux from Flux is that the tree of parameters is separate from the layer structure. It is these parameters which setup and update need to know about.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Lux describes this separation of parameter storage from model description as \"explicit\" parameters. Beware that it has nothing to do with Zygote's notion of \"explicit\" gradients. (If the same model is written in Flux and Lux, ∇model above and ∇params below will be nearly identical trees of nested NamedTuples.)","category":"page"},{"location":"","page":"Home","title":"Home","text":"\nusing Lux, Boltz, Zygote, Optimisers\n\nlux_model, params, lux_state = Boltz.resnet(:resnet18) |> gpu; # define and initialise model\nimages = rand(Float32, 224, 224, 3, 4) |> gpu; # batch of dummy data\ny, lux_state = Lux.apply(lux_model, images, params, lux_state); # run the model\n@show sum(y); # initial dummy loss\n\nrule = Optimisers.Adam()\nopt_state = Optimisers.setup(rule, params); # optimiser state based on model parameters\n\n(loss, lux_state), back = Zygote.pullback(params, images) do p, x\n y, st = Lux.apply(lux_model, x, p, lux_state)\n sum(y), st # return both the loss, and the updated lux_state\nend;\n∇params, _ = back((one.(loss), nothing)); # gradient of only the loss, with respect to parameter tree\nloss == sum(y) # not yet changed\n\nopt_state, params = Optimisers.update!(opt_state, params, ∇params);\n\ny, lux_state = Lux.apply(lux_model, images, params, lux_state);\n@show sum(y); # now reduced\n","category":"page"},{"location":"","page":"Home","title":"Home","text":"Besides the parameters stored in params and gradually optimised, any other model state is stored in lux_state, and updated by Lux.apply. (In this example, BatchNorm has state.) This is completely unrelated to Optimisers.jl's state, although designed in a similar spirit.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Base.summarysize(lux_model) / 1024 # just 2KB\nBase.summarysize(params) / 1024^2 # about 45MB, same as Flux model\nBase.summarysize(lux_state) / 1024 # 40KB\nBase.summarysize(opt_state) / 1024^2 # about 90MB, with Adam","category":"page"},{"location":"","page":"Home","title":"Home","text":"If you are certain there is no model state, then the gradient calculation can be simplified to use Zygote.gradient instead of Zygote.pullback:","category":"page"},{"location":"","page":"Home","title":"Home","text":"∇params, _ = gradient(params, images) do p, x\n y, _ = Lux.apply(lux_model, x, p, lux_state) # discards new lux_state\n sum(y)\nend;","category":"page"},{"location":"#Non-trainable-Parameters","page":"Home","title":"Non-trainable Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Optimisers.jl uses Functors.jl to walk the structs making up the model, for which they must be annotated @functor Type. By default optimisation will alter all isnumeric arrays. 
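As a small illustration of this rule, following the definition of isnumeric above (a sketch, not exhaustive):\n\nOptimisers.isnumeric([1.0, 2.0]) # true: float arrays are adjusted\nOptimisers.isnumeric([1, 2]) # false: integer arrays are ignored\nOptimisers.isnumeric(tanh) # false: functions are never parameters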
","category":"page"},{"location":"","page":"Home","title":"Home","text":"If some arrays of a particular layer should not be treated this way, you can define a method for trainable","category":"page"},{"location":"","page":"Home","title":"Home","text":"struct Layer{T}\n alpha::T\n beta::T\n length::Int\nend\nLayer(n::Int) = Layer(randn(n), zeros(n), n)\n\nFunctors.@functor Layer\n\n# Both array fields will be, for example, moved to the GPU:\nFunctors.children(Layer(3)) # (alpha = [...], beta = [...], length)\n\nOptimisers.trainable(x::Layer) = (; alpha = x.alpha) # must be a subset of children\n\n# Only the first field will be optimised:\nst = Optimisers.setup(DecayDescent(0.1), Layer(3))","category":"page"},{"location":"#Frozen-Parameters","page":"Home","title":"Frozen Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To temporarily prevent training from affecting some parameters, use freeze! and thaw!. They work by mutating all Leafs of the state tree, or part of it.","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Flux, Optimisers\n\nx = randn(Float32, 28, 28, 1, 1);\nnet = @autosize (size(x)...,) Chain(\n Conv((3, 3), 1 => 3, stride=2, bias=false), Flux.flatten, Dense(_ => 2, relu),\n)\nopt = Optimisers.setup(Optimisers.Momentum(), net);\n\nnet.layers[3] isa Dense # now freeze this layer's parameters:\nOptimisers.freeze!(opt.layers[3])\nopt.layers[3].bias # confirm: Leaf(Momentum(...), [0.0, 0.0], frozen = true)\n\nOptimisers.update!(opt, net, gradient(m -> sum(m(x)), net)...);\n\nnet.layers[3].bias # stil zero, and its momentum is too:\n\nOptimisers.thaw!(opt)\nopt.layers[3].bias # Leaf(Momentum(...), [0.0, 0.0])","category":"page"},{"location":"#Adjusting-Hyperparameters","page":"Home","title":"Adjusting Hyperparameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"To change the learning rate during training, use adjust!. This works much like freeze! by mutating the state tree, or part of it, without discarding the momenta. For the Flux model from just above:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Optimisers.adjust!(opt, 0.03) # change η for the whole model...\n\nOptimisers.adjust!(opt.layers[3], 0.04) # ... or just for one layer.","category":"page"},{"location":"","page":"Home","title":"Home","text":"To change other fields of the optimisation rule, it accepts keyword arguments:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Momentum |> fieldnames # (:eta, :rho)\n\nOptimisers.adjust!(opt, rho = 0.95) # change ρ for the whole model.","category":"page"},{"location":"#Tied-Parameters","page":"Home","title":"Tied Parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If the same array appears twice (or more) in the model, Functors.jl should recognise this. Within Optimisers.jl, setup will initialise once, and use the same Leaf for both parameters. 
Then update will accumulate the gradient from both, and the updated model returned will have the tie maintained.","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Flux, Optimisers\n\nenc = Chain(Dense(40 => 20, tanh), Dense(20 => 10));\ndec = Chain(Dense(enc[1].weight', true, tanh), Dense(enc[2].weight', true, tanh));\nmodel = Chain(; enc, dec)\n\nst = Optimisers.setup(Optimisers.Adam(), model);\n\nst.layers.enc.layers[1].weight === st.layers.dec.layers[1].weight.parent # true","category":"page"},{"location":"","page":"Home","title":"Home","text":"This identification relies on ===, and will work for ordinary Arrays and CuArrays. It will not at present work for reshaped arrays, nor for immutable arrays such as those from StaticArrays.jl.","category":"page"},{"location":"#Obtaining-a-flat-parameter-vector","page":"Home","title":"Obtaining a flat parameter vector","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Instead of a nested tree-like structure, sometimes it is convenient to have all the parameters as one simple vector. Optimisers.jl contains a function destructure which creates this vector, and also creates a way to re-build the original structure with new parameters. Both flattening and re-building may be used within gradient calls.","category":"page"},{"location":"","page":"Home","title":"Home","text":"An example with a Flux model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using ForwardDiff # an example of a package which only likes one array\n\nmodel = Chain( # much smaller model example, as ForwardDiff is a slow algorithm here\n Conv((3, 3), 3 => 5, pad=1, bias=false), \n BatchNorm(5, relu), \n Conv((3, 3), 5 => 3, stride=16),\n )\nimage = rand(Float32, 224, 224, 3, 1);\n@show sum(model(image));\n\nflat, re = destructure(model)\nst = Optimisers.setup(rule, flat) # state is just one Leaf now\n\n∇flat = ForwardDiff.gradient(flat) do v\n m = re(v) # rebuild a new object like model\n sum(m(image)) # call that as before\nend\n\nst, flat = Optimisers.update(st, flat, ∇flat)\n@show sum(re(flat)(image));","category":"page"},{"location":"","page":"Home","title":"Home","text":"Here flat contains only the 283 trainable parameters, while the non-trainable ones are preserved inside re, an object of type Restructure. When defining new layers, these can be specified if necessary by overloading trainable. By default, all numeric arrays visible to Functors.jl are assumed to contain trainable parameters. Tied parameters (arrays appearing in different layers) are included only once in flat.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Lux stores only the trainable parameters in params. This can also be flattened to a plain Vector in the same way:","category":"page"},{"location":"","page":"Home","title":"Home","text":"params, lux_state = Lux.setup(Random.default_rng(), lux_model);\n\nflat, re = destructure(params)\n\n∇flat = ForwardDiff.gradient(flat) do v\n p = re(v) # rebuild an object like params\n y, _ = Lux.apply(lux_model, images, p, lux_state)\n sum(y)\nend","category":"page"},{"location":"#Collecting-all-trainable-parameters","page":"Home","title":"Collecting all trainable parameters","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Sometimes it is useful to collect all trainable parameters in a model, similarly to what destructure does but without concatenating the arrays into a flat vector. 
This is done by trainables, which returns a list of arrays:","category":"page"},{"location":"","page":"Home","title":"Home","text":"julia> using Flux, Optimisers\n\njulia> model = Chain(Dense(2 => 3, tanh), BatchNorm(3), Dense(3 => 2));\n\njulia> trainables(model)\n6-element Vector{AbstractArray}:\n Float32[0.5756773 -0.1975264; 0.4723181 -0.7546912; -0.91631395 0.07392061]\n Float32[0.0, 0.0, 0.0]\n Float32[0.0, 0.0, 0.0]\n Float32[1.0, 1.0, 1.0]\n Float32[-0.8764882 0.40812716 0.1919528; -0.9123545 -0.4462516 0.6751252]\n Float32[0.0, 0.0]\n\njulia> l2reg(model) = sum([sum(abs2, p) for p in trainables(model)]);\n\njulia> g = gradient(l2reg, model)[1];","category":"page"},{"location":"","page":"Home","title":"Home","text":"Notice that the BatchNorm layer has two trainable parameters, γ and β, which are included in the list, while the μ and σ² buffers are not.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Sometimes one wants to iterate over all trainable parameters in a model and the corresponding parameters of a matched structure, such as a gradient or the moving average of the model. This can be done using trainables(model, path=true). For instance, here is how to update the parameters of a moving average model with the parameters of the model:","category":"page"},{"location":"","page":"Home","title":"Home","text":"for (kp, p_avg) in trainables(model_avg, path=true)\n p = getkeypath(model, kp) \n p_avg .= 0.99 .* p_avg .+ 0.01 .* p\nend","category":"page"},{"location":"#Incomplete-or-nothing-gradients","page":"Home","title":"Incomplete or nothing gradients","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If the gradient is not available for some parameters, or branches of the model, update will not take an optimisation step for those parameters. This is the case when the gradient is nothing or a subtype of ChainRules.AbstractZero.","category":"page"},{"location":"","page":"Home","title":"Home","text":"For stateful optimisers, skipping an update is generally not the same as updating with a zero gradient. For example, in the case of Adam, the momentum and variance are updated even if the gradient is zero:","category":"page"},{"location":"","page":"Home","title":"Home","text":"julia> x = (a = ones(2), b = ones(2))\n(a = [1.0, 1.0], b = [1.0, 1.0])\n\njulia> opt_state = Optimisers.setup(Adam(0.1), x)\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.0, 0.0], [0.0, 0.0], (0.9, 0.999))))\n\njulia> g = (; a = ones(2), b = ones(2)); # First an update with a non-zero gradient to increase the momentum and variance\n\njulia> Optimisers.update!(opt_state, x, g);\n\njulia> opt_state # the states in `a` and `b` are the same\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))\n\njulia> g = (; a = zeros(2), b = nothing); # Now an update with a zero gradient for a and no gradient for b\n\njulia> Optimisers.update!(opt_state, x, g);\n\njulia> opt_state # the states in `a` and `b` differ\n(a = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.09, 0.09], [0.000999, 0.000999], (0.729, 0.997003))), b = Leaf(Adam(0.1, (0.9, 0.999), 1.0e-8), ([0.1, 0.1], [0.001, 0.001], (0.81, 0.998001))))","category":"page"}] }