Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Decay functions #594

Merged
merged 8 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion docs/_scripts/meta-models.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def plot_model(model):
mod1 = (GroupedPredictor(DummyRegressor(), groups=["m"])
.fit(df[["m"]], df["yt"]))

mod2 = (GroupedPredictor(DecayEstimator(DummyRegressor(), decay=0.9), groups=["m"])
mod2 = (GroupedPredictor(DecayEstimator(DummyRegressor(), decay_func="exponential", decay_rate=0.9), groups=["m"])
.fit(df[["index", "m"]], df["yt"]))

plt.figure(figsize=(12, 3))
Expand All @@ -279,6 +279,29 @@ def plot_model(model):
plt.clf()


# --8<-- [start:decay-functions]
from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay

# One 2x2 figure with a subplot per built-in decay function.
fig = plt.figure(figsize=(12, 6))

# Pair each decay function with the name used in `decay_func="..."` and the
# keyword arguments used to showcase it (an empty dict means defaults).
for i, name, func, kwargs in zip(
    range(1, 5),
    ("exponential", "linear", "sigmoid", "stepwise"),
    (exponential_decay, linear_decay, sigmoid_decay, stepwise_decay),
    ({"decay_rate": 0.995}, {"min_value": 0.1}, {}, {"n_steps": 8})
):

    ax = fig.add_subplot(2, 2, i)
    # The decay functions ignore X (API convention); y only supplies the sample count.
    x, y = None, np.arange(1000)
    ax.plot(func(x,y, **kwargs))
    ax.set_title(f'decay_func="{name}"')

plt.tight_layout()
# --8<-- [end:decay-functions]

# Saved outside the snippet markers so the docs code block stays focused on plotting.
plt.savefig(_static_path / "decay-functions.png")
plt.clf()

# --8<-- [start:make-blobs]
import numpy as np
import matplotlib.pylab as plt
Expand Down
Binary file modified docs/_static/meta-models/baseline-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 4 additions & 4 deletions docs/_static/meta-models/confusion-balanced-grid.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
3. ])},
refit=&#x27;negatives&#x27;, return_train_score=True,
scoring={&#x27;accuracy&#x27;: make_scorer(accuracy_score),
&#x27;negatives&#x27;: &lt;function false_negatives at 0x7f418e36d3f0&gt;,
&#x27;positives&#x27;: &lt;function false_positives at 0x7f418e323490&gt;})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-1" type="checkbox" ><label for="sk-estimator-id-1" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=5,
&#x27;negatives&#x27;: &lt;function false_negatives at 0x7f33dfb60c10&gt;,
&#x27;positives&#x27;: &lt;function false_positives at 0x7f33dfb61fc0&gt;})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class="sk-container" hidden><div class="sk-item sk-dashed-wrapped"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-1" type="checkbox" ><label for="sk-estimator-id-1" class="sk-toggleable__label sk-toggleable__label-arrow">GridSearchCV</label><div class="sk-toggleable__content"><pre>GridSearchCV(cv=5,
estimator=ConfusionBalancer(alpha=1.0,
estimator=LogisticRegression(max_iter=1000)),
n_jobs=-1,
Expand All @@ -25,5 +25,5 @@
3. ])},
refit=&#x27;negatives&#x27;, return_train_score=True,
scoring={&#x27;accuracy&#x27;: make_scorer(accuracy_score),
&#x27;negatives&#x27;: &lt;function false_negatives at 0x7f418e36d3f0&gt;,
&#x27;positives&#x27;: &lt;function false_positives at 0x7f418e323490&gt;})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-2" type="checkbox" ><label for="sk-estimator-id-2" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: ConfusionBalancer</label><div class="sk-toggleable__content"><pre>ConfusionBalancer(alpha=1.0, estimator=LogisticRegression(max_iter=1000))</pre></div></div></div><div class="sk-serial"><div class="sk-item sk-dashed-wrapped"><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-3" type="checkbox" ><label for="sk-estimator-id-3" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: LogisticRegression</label><div class="sk-toggleable__content"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-4" type="checkbox" ><label for="sk-estimator-id-4" class="sk-toggleable__label sk-toggleable__label-arrow">LogisticRegression</label><div class="sk-toggleable__content"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div>
&#x27;negatives&#x27;: &lt;function false_negatives at 0x7f33dfb60c10&gt;,
&#x27;positives&#x27;: &lt;function false_positives at 0x7f33dfb61fc0&gt;})</pre></div></div></div><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-2" type="checkbox" ><label for="sk-estimator-id-2" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: ConfusionBalancer</label><div class="sk-toggleable__content"><pre>ConfusionBalancer(alpha=1.0, estimator=LogisticRegression(max_iter=1000))</pre></div></div></div><div class="sk-serial"><div class="sk-item sk-dashed-wrapped"><div class="sk-parallel"><div class="sk-parallel-item"><div class="sk-item"><div class="sk-label-container"><div class="sk-label sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-3" type="checkbox" ><label for="sk-estimator-id-3" class="sk-toggleable__label sk-toggleable__label-arrow">estimator: LogisticRegression</label><div class="sk-toggleable__content"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div><div class="sk-serial"><div class="sk-item"><div class="sk-estimator sk-toggleable"><input class="sk-toggleable__control sk-hidden--visually" id="sk-estimator-id-4" type="checkbox" ><label for="sk-estimator-id-4" class="sk-toggleable__label sk-toggleable__label-arrow">LogisticRegression</label><div class="sk-toggleable__content"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div></div></div></div></div></div></div></div></div></div></div></div></div>
Binary file modified docs/_static/meta-models/confusion-balancer-results.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/_static/meta-models/decay-functions.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/decay-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/grouped-dummy-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/grouped-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/grouped-transform.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/make-blobs.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/skewed-data.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/threshold-chart.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/_static/meta-models/ts-data.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions docs/api/decay-functions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Decay Functions

These functions are used in the [`DecayEstimator`][decay-estimator] to generate sample weights for the wrapped model.

::: sklego.meta._decay_utils.exponential_decay
options:
show_root_full_path: true
show_root_heading: true

::: sklego.meta._decay_utils.linear_decay
options:
show_root_full_path: true
show_root_heading: true

::: sklego.meta._decay_utils.sigmoid_decay
options:
show_root_full_path: true
show_root_heading: true

::: sklego.meta._decay_utils.stepwise_decay
options:
show_root_full_path: true
show_root_heading: true

[decay-estimator]: /api/meta#sklego.meta.decay_estimator.DecayEstimator
42 changes: 40 additions & 2 deletions docs/user-guide/meta-models.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,43 @@ We will create two models on this dataset. One model calculates the average valu

The decay parameter has a lot of influence on the effect of the model but one can clearly see that we shift focus to the more recent data.

### Decay Functions

scikit-lego provides a set of decay functions that can be used to decay the importance of older data. The default decay function used in `DecayEstimator` is the `exponential_decay` function (`decay_func="exponential"`).

Out of the box there are four decay functions available:

![decay-functions](/_static/meta-models/decay-functions.png)

??? example "Code for plotting the decay functions"
```py
--8<-- "docs/_scripts/meta-models.py:decay-functions"
```

The arguments of these functions can be passed along to the `DecayEstimator` class as keyword arguments:

```py
DecayEstimator(..., decay_func="linear", min_value=0.5)
```

To see which keyword arguments are available for each decay function, please refer to the [Decay Functions API section][decay-functions].

Notice that passing a string to refer to the built-in decays is just a convenience.

This means it is also possible to create a custom decay function and pass it along to the `DecayEstimator` class, **as long as** the first two arguments of the function are `X` and `y` and the return shape is the same as `y`:

```py title="Custom decay function"
def custom_decay(X, y, alpha, beta, gamma):
"""My custom decay function where the magic happens"""
...
return decay_values

DecayEstimator(...,
decay_func=custom_decay,
alpha=some_alpha, beta=some_beta, gamma=some_gamma
)
```

## Confusion Balancer

!!! warning "Disclaimer"
Expand Down Expand Up @@ -350,12 +387,13 @@ The `OutlierClassifier` can be combined with any classification model in the `St
[thresholder-api]: /api/meta#sklego.meta.thresholder.Thresholder
[grouped-predictor-api]: /api/meta#sklego.meta.grouped_predictor.GroupedPredictor
[grouped-transformer-api]: /api/meta#sklego.meta.grouped_transformer.GroupedTransformer
[decay-api]: /api/meta#sklego.meta.decay.DecayEstimator
[decay-api]: /api/meta#sklego.meta.decay_estimator.DecayEstimator
[decay-functions]: /api/decay-functions
[confusion-balancer-api]: /api/meta#sklego.meta.confusion_balancer.ConfusionBalancer
[zero-inflated-api]: /api/meta#sklego.meta.zero_inflated_regressor.ZeroInflatedRegressor
[outlier-classifier-api]: /api/meta#sklego.meta.outlier_classifier.OutlierClassifier

[standard-scaler-api]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
[stacking-classifier-api]: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn.ensemble.StackingClassifier
[dummy-regressor-api]: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html
[imb-learn]: https://imbalanced-learn.readthedocs.io/en/stable/
[imb-learn]: https://imbalanced-learn.org/stable/
1 change: 1 addition & 0 deletions mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ nav:
- Base: api/base.md
- Common: api/common.md
- Datasets: api/datasets.md
- Decay Functions: api/decay-functions.md
- Decomposition: api/decomposition.md
- Dummy: api/dummy.md
- Linear Model: api/linear-model.md
Expand Down
211 changes: 211 additions & 0 deletions sklego/meta/_decay_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import numpy as np
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add check for min_value to be strictly positive? Estimators will handle that anyway, I would say it is redundant

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general: if there's an opportunity to make a better error message than the library under the hood it may be worth the investment. In this case, it might help to explain to the user why, given the context of what a decay function is supposed to do.

Then again, it's also ok not to add an error message for every little thing that might go wrong as well. It's a balance. Feel free to omit if you prefer :)


def linear_decay(X, y, min_value=0.0, max_value=1.0):
    """Generates linearly increasing sample weights from just above `min_value` up to `max_value`,
    so that older samples (earlier rows) are down-weighted. The number of weights equals the number
    of samples in `y`.

    !!! warning
        It is up to the user to sort the dataset appropriately.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features,)
        Training data. Unused, present for API consistency by convention.
    y : array-like, shape=(n_samples,)
        Target values. Used to determine the number of samples in the decay.
    min_value : float, default=0.
        The minimum value of the decay.
    max_value : float, default=1.
        The maximum value of the decay.

    Returns
    -------
    np.ndarray, shape=(n_samples,)
        The decay values.

    Raises
    ------
    ValueError
        If `min_value` is greater than `max_value`.
    """

    if max_value < min_value:
        raise ValueError("`min_value` must be less than or equal to `max_value`")

    size = y.shape[0]
    # Build size+1 evenly spaced points and drop the first, so the smallest
    # returned weight is one step above min_value and the largest is exactly max_value.
    grid = np.linspace(min_value, max_value, size + 1)
    return grid[1:]

def exponential_decay(X, y, decay_rate=0.999):
    r"""Generates exponentially decaying sample weights following
    $w_{t-1} = decay\_rate * w_{t}$, so the most recent sample has weight `decay_rate`
    and each older sample is multiplied by `decay_rate` once more. The number of weights
    equals the number of samples in `y`.

    !!! warning
        It is up to the user to sort the dataset appropriately.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features,)
        Training data. Unused, present for API consistency by convention.
    y : array-like, shape=(n_samples,)
        Target values. Used to determine the number of samples in the decay.
    decay_rate : float, default=0.999
        The rate of decay.

    Returns
    -------
    np.ndarray, shape=(n_samples,)
        The decay values.

    Raises
    ------
    ValueError
        If `decay_rate` not between 0 and 1.
    """

    if decay_rate <= 0 or decay_rate >= 1:
        raise ValueError(
            f"`decay_rate` must be between 0. and 1., found {decay_rate}"
        )

    size = y.shape[0]
    # Oldest sample gets exponent `size`, newest gets exponent 1.
    exponents = np.arange(size, 0, -1)
    return np.power(decay_rate, exponents)

def stepwise_decay(X, y, n_steps=None, step_size=None, min_value=0.0, max_value=1.0):
    """Generates a stepwise decay that maps input data `X`, `y` onto a piecewise-constant,
    decreasing range between `max_value` and `min_value`.

    Exactly one of `n_steps` or `step_size` must be provided:

    - With `step_size`, the decay is split into `n_samples // step_size` steps, each lowering the
      value by `step_width = (max_value - min_value) / n_steps`.
    - With `n_steps`, the decay is split into `n_steps` steps, each lowering the value by
      `step_width = (max_value - min_value) / n_steps`.

    Within each step the weight is constant; consecutive steps differ by `step_width`.

    !!! warning
        It is up to the user to sort the dataset appropriately.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features,)
        Training data. Unused, present for API consistency by convention.
    y : array-like, shape=(n_samples,)
        Target values. Used to determine the number of samples in the decay.
    n_steps : int | None, default=None
        The total number of steps in the decay.
    step_size : int | None, default=None
        The number of samples for each step in the decay.
    min_value : float, default=0.
        The minimum value of the decay.
    max_value : float, default=1.
        The maximum value of the decay.

    Returns
    -------
    np.ndarray, shape=(n_samples,)
        The decay values.

    Raises
    ------
    ValueError
        - If `min_value` is greater than `max_value`.
        - If no value or both values are provided for `n_steps` or `step_size`.
        - If `step_size` less than 0 or greater than the number of samples.
        - If `n_steps` less than 0 or greater than the number of samples.
    TypeError
        - If `n_steps` is not an integer.
        - If `step_size` is not an integer.
    """

    if min_value > max_value:
        raise ValueError("`min_value` must be less than or equal to `max_value`")

    if step_size is None and n_steps is None:
        raise ValueError("Either `step_size` or `n_steps` must be provided")
    if step_size is not None and n_steps is not None:
        raise ValueError("Only one of `step_size` or `n_steps` must be provided")

    # Exactly one of the two is set at this point; validate it under its own name
    # so the error messages stay specific to the parameter the caller passed.
    param_name, param_value = (
        ("step_size", step_size) if step_size is not None else ("n_steps", n_steps)
    )

    if not isinstance(param_value, int):
        raise TypeError(f"`{param_name}` must be an integer")
    if param_value <= 0:
        raise ValueError(f"`{param_name}` must be greater than 0")

    n_samples = y.shape[0]
    if param_value > n_samples:
        raise ValueError(f"`{param_name}` must be less than or equal to the number of samples")

    # Normalize to a step count, then derive the effective step size from it so the
    # steps always tile the whole sample range (a remainder is absorbed into the steps).
    if step_size is not None:
        n_steps = n_samples // step_size
    step_size = n_samples // n_steps
    step_width = (max_value - min_value) / n_steps

    # Oldest samples sit the most steps below max_value; the newest step keeps max_value.
    return max_value - (np.arange(n_samples, 0, -1) // step_size) * step_width

def sigmoid_decay(X, y, growth_rate=None, min_value=0.0, max_value=1.0):
    """Generates a sigmoid decay that maps input data `X`, `y` onto a non-linearly decreasing
    range between `max_value` and `min_value`. The steepness of the transition is controlled by
    `growth_rate`; when not provided it defaults to `10 / n_samples`, which is a "good enough"
    default for most cases.

    !!! warning
        It is up to the user to sort the dataset appropriately.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features,)
        Training data. Unused, present for API consistency by convention.
    y : array-like, shape=(n_samples,)
        Target values. Used to determine the number of samples in the decay.
    growth_rate : float | None, default=None
        The growth rate of the sigmoid function. If not provided this will be set to `10 / n_samples`.
    min_value : float, default=0.
        The minimum value of the decay.
    max_value : float, default=1.
        The maximum value of the decay.

    Returns
    -------
    np.ndarray, shape=(n_samples,)
        The decay values.

    Raises
    ------
    ValueError
        - If `min_value` is greater than `max_value`.
        - If `growth_rate` is specified and not between 0 and 1.
    """

    if min_value > max_value:
        raise ValueError("`min_value` must be less than or equal to `max_value`")

    if growth_rate is not None and (growth_rate <= 0 or growth_rate >= 1):
        raise ValueError("`growth_rate` must be between 0. and 1.")

    n_samples = y.shape[0]
    # A validated growth_rate is strictly positive, so `or` only triggers the default for None.
    rate = growth_rate or 10 / n_samples

    # Logistic curve centered on the middle sample, rescaled from (0, 1) to (min_value, max_value).
    midpoint = n_samples // 2
    curve = 1 / (1 + np.exp(-rate * (np.arange(n_samples) - midpoint)))
    return min_value + (max_value - min_value) * curve


def _sigmoid(x, growth_rate, offset):
return 1 / (1 + np.exp(-growth_rate * (x - offset)))
Loading
Loading