diff --git a/nbs/core.ipynb b/nbs/core.ipynb index 737a12420..a4f164ed5 100644 --- a/nbs/core.ipynb +++ b/nbs/core.ipynb @@ -87,7 +87,7 @@ " TFT, VanillaTransformer,\n", " Informer, Autoformer, FEDformer,\n", " StemGNN, PatchTST, TimesNet, TimeLLM, TSMixer, TSMixerx,\n", - " MLPMultivariate\n", + " MLPMultivariate, iTransformer\n", ")" ] }, @@ -228,6 +228,7 @@ " 'tsmixer': TSMixer, 'autotsmixer': TSMixer,\n", " 'tsmixerx': TSMixerx, 'autotsmixerx': TSMixerx,\n", " 'mlpmultivariate': MLPMultivariate, 'automlpmultivariate': MLPMultivariate,\n", + " 'itransformer': iTransformer, 'autoitransformer': iTransformer\n", "}" ] }, diff --git a/nbs/imgs_models/iTransformer.png b/nbs/imgs_models/iTransformer.png new file mode 100644 index 000000000..01605df9a Binary files /dev/null and b/nbs/imgs_models/iTransformer.png differ diff --git a/nbs/models.ipynb b/nbs/models.ipynb index 0dafa2598..82331c3b2 100644 --- a/nbs/models.ipynb +++ b/nbs/models.ipynb @@ -60,6 +60,7 @@ "from neuralforecast.models.fedformer import FEDformer\n", "from neuralforecast.models.patchtst import PatchTST\n", "from neuralforecast.models.timesnet import TimesNet\n", + "from neuralforecast.models.itransformer import iTransformer\n", "\n", "from neuralforecast.models.stemgnn import StemGNN\n", "from neuralforecast.models.hint import HINT\n", @@ -2706,6 +2707,157 @@ "model.fit(dataset=dataset)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61c3be9", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "class AutoiTransformer(BaseAuto):\n", + "\n", + " default_config = {\n", + " \"input_size_multiplier\": [1, 2, 3, 4, 5],\n", + " \"h\": None,\n", + " \"n_series\": None,\n", + " \"hidden_size\": tune.choice([64, 128, 256]),\n", + " \"n_heads\": tune.choice([4, 8]),\n", + " \"learning_rate\": tune.loguniform(1e-4, 1e-1),\n", + " \"scaler_type\": tune.choice([None, 'robust', 'standard']),\n", + " \"max_steps\": tune.choice([500, 1000, 2000]),\n", + " \"batch_size\": 
tune.choice([32, 64, 128, 256]),\n", + " \"loss\": None,\n", + " \"random_seed\": tune.randint(1, 20),\n", + " }\n", + "\n", + " def __init__(self,\n", + " h,\n", + " n_series,\n", + " loss=MAE(),\n", + " valid_loss=None,\n", + " config=None, \n", + " search_alg=BasicVariantGenerator(random_state=1),\n", + " num_samples=10,\n", + " refit_with_val=False,\n", + " cpus=cpu_count(),\n", + " gpus=torch.cuda.device_count(),\n", + " verbose=False,\n", + " alias=None,\n", + " backend='ray',\n", + " callbacks=None):\n", + " \n", + " # Define search space, input/output sizes\n", + " if config is None:\n", + " config = self.get_default_config(h=h, backend=backend, n_series=n_series) \n", + "\n", + " # Always use n_series from parameters, raise exception with Optuna because we can't enforce it\n", + " if backend == 'ray':\n", + " config['n_series'] = n_series\n", + " elif backend == 'optuna':\n", + " mock_trial = MockTrial()\n", + " if ('n_series' in config(mock_trial) and config(mock_trial)['n_series'] != n_series) or ('n_series' not in config(mock_trial)):\n", + " raise Exception(f\"config needs 'n_series': {n_series}\") \n", + "\n", + " super(AutoiTransformer, self).__init__(\n", + " cls_model=iTransformer, \n", + " h=h,\n", + " loss=loss,\n", + " valid_loss=valid_loss,\n", + " config=config,\n", + " search_alg=search_alg,\n", + " num_samples=num_samples, \n", + " refit_with_val=refit_with_val,\n", + " cpus=cpus,\n", + " gpus=gpus,\n", + " verbose=verbose,\n", + " alias=alias,\n", + " backend=backend,\n", + " callbacks=callbacks, \n", + " )\n", + "\n", + " @classmethod\n", + " def get_default_config(cls, h, backend, n_series):\n", + " config = cls.default_config.copy() \n", + " config['input_size'] = tune.choice([h * x \\\n", + " for x in config[\"input_size_multiplier\"]])\n", + "\n", + " # Rolling windows with step_size=1 or step_size=h\n", + " # See `BaseWindows` and `BaseRNN`'s create_windows\n", + " config['step_size'] = tune.choice([1, h])\n", + " del 
config[\"input_size_multiplier\"]\n", + " if backend == 'optuna':\n", + " # Always use n_series from parameters\n", + " config['n_series'] = n_series\n", + " config = cls._ray_config_to_optuna(config) \n", + "\n", + " return config " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f416fa0", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(AutoiTransformer, title_level=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ffd40db", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# Use your own config or AutoiTransformer.default_config\n", + "config = dict(max_steps=1, val_check_steps=1, input_size=12, hidden_size=16)\n", + "model = AutoiTransformer(h=12, n_series=1, config=config, num_samples=1, cpus=1)\n", + "\n", + "# Fit and predict\n", + "model.fit(dataset=dataset)\n", + "y_hat = model.predict(dataset=dataset)\n", + "\n", + "# Optuna\n", + "model = AutoiTransformer(h=12, n_series=1, config=None, backend='optuna')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a2052de", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# Check Optuna\n", + "assert model.config(MockTrial())['h'] == 12\n", + "\n", + "# Unit test to test that Auto* model contains all required arguments from BaseAuto\n", + "test_args(AutoiTransformer, exclude_args=['cls_model']) \n", + "\n", + "# Unit test for situation: Optuna with updated default config\n", + "my_config = AutoiTransformer.get_default_config(h=12, n_series=1, backend='optuna')\n", + "def my_config_new(trial):\n", + " config = {**my_config(trial)}\n", + " config.update({'max_steps': 1, 'val_check_steps': 1, 'input_size': 12, 'hidden_size': 16})\n", + " return config\n", + "\n", + "model = AutoiTransformer(h=12, n_series=1, config=my_config_new, backend='optuna', num_samples=1, cpus=1)\n", + "model.fit(dataset=dataset)\n", + "\n", + "# Unit test for situation: Ray with updated default config\n", + "my_config = 
AutoiTransformer.get_default_config(h=12, n_series=1, backend='ray')\n", + "my_config['max_steps'] = 1\n", + "my_config['val_check_steps'] = 1\n", + "my_config['input_size'] = 12\n", + "my_config['hidden_size'] = 16\n", + "model = AutoiTransformer(h=12, n_series=1, config=my_config, backend='ray', num_samples=1, cpus=1)\n", + "model.fit(dataset=dataset)" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/nbs/models.itransformer.ipynb b/nbs/models.itransformer.ipynb new file mode 100644 index 000000000..16f223d3f --- /dev/null +++ b/nbs/models.itransformer.ipynb @@ -0,0 +1,1170 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp models.itransformer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from fastcore.test import test_eq\n", + "from nbdev.showdoc import show_doc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# iTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The iTransformer model simply takes the Transformer architecture but it applies the attention and feed-forward network on the inverted dimensions. This means that time points of each individual series are embedded into tokens. That way, the attention mechanisms learn multivariate correlation and the feed-forward network learns non-linear relationships.\n", + "\n", + "**References**\n", + "- [Yong Liu, Tengge Hu, Haoran Zhang, Haixu Wu, Shiyu Wang, Lintao Ma, Mingsheng Long. \"iTransformer: Inverted Transformers Are Effective for Time Series Forecasting\"](https://arxiv.org/abs/2310.06625)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Figure 1. 
Architecture of iTransformer.](imgs_models/iTransformer.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "import numpy as np\n", + "\n", + "from typing import Optional\n", + "from math import sqrt\n", + "\n", + "from neuralforecast.losses.pytorch import MAE\n", + "from neuralforecast.common._base_multivariate import BaseMultivariate\n", + "\n", + "from neuralforecast.common._modules import TransEncoder, TransEncoderLayer, AttentionLayer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Auxiliary functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.1 Attention" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", + "\n", + "class TriangularCausalMask():\n", + " def __init__(self, B, L, device=\"cpu\"):\n", + " mask_shape = [B, 1, L, L]\n", + " with torch.no_grad():\n", + " self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)\n", + "\n", + " @property\n", + " def mask(self):\n", + " return self._mask\n", + "\n", + "class FullAttention(nn.Module):\n", + " def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):\n", + " super(FullAttention, self).__init__()\n", + " self.scale = scale\n", + " self.mask_flag = mask_flag\n", + " self.output_attention = output_attention\n", + " self.dropout = nn.Dropout(attention_dropout)\n", + "\n", + " def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):\n", + " B, L, H, E = queries.shape\n", + " _, S, _, D = values.shape\n", + " scale = self.scale or 1. 
/ sqrt(E)\n", + "\n", + " scores = torch.einsum(\"blhe,bshe->bhls\", queries, keys)\n", + "\n", + " if self.mask_flag:\n", + " if attn_mask is None:\n", + " attn_mask = TriangularCausalMask(B, L, device=queries.device)\n", + "\n", + " scores.masked_fill_(attn_mask.mask, -np.inf)\n", + "\n", + " A = self.dropout(torch.softmax(scale * scores, dim=-1))\n", + " V = torch.einsum(\"bhls,bshd->blhd\", A, values)\n", + "\n", + " if self.output_attention:\n", + " return (V.contiguous(), A)\n", + " else:\n", + " return (V.contiguous(), None) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Inverted embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", + "\n", + "class DataEmbedding_inverted(nn.Module):\n", + " def __init__(self, c_in, hidden_size, dropout=0.1):\n", + " super(DataEmbedding_inverted, self).__init__()\n", + " self.value_embedding = nn.Linear(c_in, hidden_size)\n", + " self.dropout = nn.Dropout(p=dropout)\n", + "\n", + " def forward(self, x, x_mark):\n", + " x = x.permute(0, 2, 1)\n", + " # x: [Batch Variate Time]\n", + " if x_mark is None:\n", + " x = self.value_embedding(x)\n", + " else:\n", + " # the potential to take covariates (e.g. timestamps) as tokens\n", + " x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) \n", + " # x: [Batch Variate hidden_size]\n", + " return self.dropout(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "class iTransformer(BaseMultivariate):\n", + "\n", + " \"\"\" iTransformer\n", + "\n", + " **Parameters:**
\n", + " `h`: int, Forecast horizon.
\n", + " `input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].
\n", + " `n_series`: int, number of time-series.
\n", + " `futr_exog_list`: str list, future exogenous columns.
\n", + " `hist_exog_list`: str list, historic exogenous columns.
\n", + " `stat_exog_list`: str list, static exogenous columns.
\n", + " `hidden_size`: int, dimension of the model.
\n", + " `n_heads`: int, number of heads.
\n", + " `e_layers`: int, number of encoder layers.
\n", + " `d_layers`: int, number of decoder layers.
\n", + " `d_ff`: int, dimension of fully-connected layer.
\n", + " `factor`: int, attention factor.
\n", + " `dropout`: float, dropout rate.
\n", + " `use_norm`: bool, whether to normalize or not.
\n", + " `loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + " `valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + " `max_steps`: int=1000, maximum number of training steps.
\n", + " `learning_rate`: float=1e-3, Learning rate between (0, 1).
\n", + " `num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.
\n", + " `early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.
\n", + " `val_check_steps`: int=100, Number of training steps between every validation loss check.
\n", + " `batch_size`: int=32, number of different series in each batch.
\n", + " `step_size`: int=1, step size between each window of temporal data.
\n", + " `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).
\n", + " `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.
\n", + " `num_workers_loader`: int=0, workers to be used by `TimeSeriesDataLoader`.
\n", + " `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.
\n", + " `alias`: str, optional, Custom name of the model.
\n", + " `optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).
\n", + " `optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.
\n", + " `**trainer_kwargs`: int, keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).
\n", + " \n", + " **References**
\n", + " - [Yong Liu, Tengge Hu, Haoran Zhang, Haixu Wu, Shiyu Wang, Lintao Ma, Mingsheng Long. \"iTransformer: Inverted Transformers Are Effective for Time Series Forecasting\"](https://arxiv.org/abs/2310.06625)\n", + " \"\"\"\n", + "\n", + " # Class attributes\n", + " SAMPLING_TYPE = 'multivariate'\n", + "\n", + " def __init__(self,\n", + " h,\n", + " input_size,\n", + " n_series,\n", + " futr_exog_list = None,\n", + " hist_exog_list = None,\n", + " stat_exog_list = None,\n", + " hidden_size: int = 512,\n", + " n_heads: int = 8,\n", + " e_layers: int = 2,\n", + " d_layers: int = 1,\n", + " d_ff: int = 2048,\n", + " factor: int = 1,\n", + " dropout: float = 0.1,\n", + " use_norm: bool = True,\n", + " loss = MAE(),\n", + " valid_loss = None,\n", + " max_steps: int = 1000,\n", + " learning_rate: float = 1e-3,\n", + " num_lr_decays: int = -1,\n", + " early_stop_patience_steps: int =-1,\n", + " val_check_steps: int = 100,\n", + " batch_size: int = 32,\n", + " step_size: int = 1,\n", + " scaler_type: str = 'identity',\n", + " random_seed: int = 1,\n", + " num_workers_loader: int = 0,\n", + " drop_last_loader: bool = False,\n", + " optimizer = None,\n", + " optimizer_kwargs = None,\n", + " **trainer_kwargs):\n", + " \n", + " super(iTransformer, self).__init__(h=h,\n", + " input_size=input_size,\n", + " n_series=n_series,\n", + " stat_exog_list = None,\n", + " futr_exog_list = None,\n", + " hist_exog_list = None,\n", + " loss=loss,\n", + " valid_loss=valid_loss,\n", + " max_steps=max_steps,\n", + " learning_rate=learning_rate,\n", + " num_lr_decays=num_lr_decays,\n", + " early_stop_patience_steps=early_stop_patience_steps,\n", + " val_check_steps=val_check_steps,\n", + " batch_size=batch_size,\n", + " step_size=step_size,\n", + " scaler_type=scaler_type,\n", + " random_seed=random_seed,\n", + " num_workers_loader=num_workers_loader,\n", + " drop_last_loader=drop_last_loader,\n", + " optimizer=optimizer,\n", + " optimizer_kwargs=optimizer_kwargs,\n", + " 
**trainer_kwargs)\n", + " \n", + " # Asserts\n", + " if stat_exog_list is not None:\n", + " raise Exception(\"iTransformer does not support static exogenous variables\")\n", + " if futr_exog_list is not None:\n", + " raise Exception(\"iTransformer does not support future exogenous variables\")\n", + " if hist_exog_list is not None:\n", + " raise Exception(\"iTransformer does not support historical exogenous variables\")\n", + " \n", + " self.enc_in = n_series\n", + " self.dec_in = n_series\n", + " self.c_out = n_series\n", + " self.hidden_size = hidden_size\n", + " self.n_heads = n_heads\n", + " self.e_layers = e_layers\n", + " self.d_layers = d_layers\n", + " self.d_ff = d_ff\n", + " self.factor = factor\n", + " self.dropout = dropout\n", + " self.use_norm = use_norm\n", + "\n", + " # Architecture\n", + " self.enc_embedding = DataEmbedding_inverted(input_size, self.hidden_size, self.dropout)\n", + "\n", + " self.encoder = TransEncoder(\n", + " [\n", + " TransEncoderLayer(\n", + " AttentionLayer(\n", + " FullAttention(False, self.factor, attention_dropout=self.dropout), self.hidden_size, self.n_heads),\n", + " self.hidden_size,\n", + " self.d_ff,\n", + " dropout=self.dropout,\n", + " activation=F.gelu\n", + " ) for l in range(self.e_layers)\n", + " ],\n", + " norm_layer=torch.nn.LayerNorm(self.hidden_size)\n", + " )\n", + "\n", + " self.projector = nn.Linear(self.hidden_size, h, bias=True)\n", + " \n", + " def forecast(self, x_enc):\n", + " if self.use_norm:\n", + " # Normalization from Non-stationary Transformer\n", + " means = x_enc.mean(1, keepdim=True).detach()\n", + " x_enc = x_enc - means\n", + " stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)\n", + " x_enc /= stdev\n", + "\n", + " _, _, N = x_enc.shape # B L N\n", + " # B: batch_size; E: hidden_size; \n", + " # L: input_size; S: horizon(h);\n", + " # N: number of variate (tokens), can also includes covariates\n", + "\n", + " # Embedding\n", + " # B L N -> B N E (B L N -> B L 
E in the vanilla Transformer)\n", + " enc_out = self.enc_embedding(x_enc, None) # covariates (e.g timestamp) can be also embedded as tokens\n", + " \n", + " # B N E -> B N E (B L E -> B L E in the vanilla Transformer)\n", + " # the dimensions of embedded time series has been inverted, and then processed by native attn, layernorm and ffn modules\n", + " enc_out, attns = self.encoder(enc_out, attn_mask=None)\n", + "\n", + " # B N E -> B N S -> B S N \n", + " dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N] # filter the covariates\n", + "\n", + " if self.use_norm:\n", + " # De-Normalization from Non-stationary Transformer\n", + " dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.h, 1))\n", + " dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.h, 1))\n", + "\n", + " return dec_out\n", + " \n", + " def forward(self, windows_batch):\n", + " insample_y = windows_batch['insample_y']\n", + "\n", + " y_pred = self.forecast(insample_y)\n", + " y_pred = y_pred[:, -self.h:, :]\n", + " y_pred = self.loss.domain_map(y_pred)\n", + "\n", + " # domain_map might have squeezed the last dimension in case n_series == 1\n", + " if y_pred.ndim == 2:\n", + " return y_pred.unsqueeze(-1)\n", + " else:\n", + " return y_pred\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "---\n", + "\n", + "[source](https://github.com/Nixtla/neuralforecast/blob/main/neuralforecast/models/itransformer.py#L94){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "\n", + "### iTransformer\n", + "\n", + "> iTransformer (h, input_size, n_series, futr_exog_list=None,\n", + "> hist_exog_list=None, stat_exog_list=None,\n", + "> hidden_size:int=512, n_heads:int=8, e_layers:int=2,\n", + "> d_layers:int=1, d_ff:int=2048, factor:int=1,\n", + "> dropout:float=0.1, use_norm:bool=True, loss=MAE(),\n", + "> valid_loss=None, max_steps:int=1000,\n", + "> learning_rate:float=0.001, 
num_lr_decays:int=-1,\n", + "> early_stop_patience_steps:int=-1, val_check_steps:int=100,\n", + "> batch_size:int=32, step_size:int=1,\n", + "> scaler_type:str='identity', random_seed:int=1,\n", + "> num_workers_loader:int=0, drop_last_loader:bool=False,\n", + "> optimizer=None, optimizer_kwargs=None, **trainer_kwargs)\n", + "\n", + "iTransformer\n", + "\n", + "**Parameters:**
\n", + "`h`: int, Forecast horizon.
\n", + "`input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].
\n", + "`n_series`: int, number of time-series.
\n", + "`futr_exog_list`: str list, future exogenous columns.
\n", + "`hist_exog_list`: str list, historic exogenous columns.
\n", + "`stat_exog_list`: str list, static exogenous columns.
\n", + "`hidden_size`: int, dimension of the model.
\n", + "`n_heads`: int, number of heads.
\n", + "`e_layers`: int, number of encoder layers.
\n", + "`d_layers`: int, number of decoder layers.
\n", + "`d_ff`: int, dimension of fully-connected layer.
\n", + "`factor`: int, attention factor.
\n", + "`dropout`: float, dropout rate.
\n", + "`use_norm`: bool, whether to normalize or not.
\n", + "`loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + "`valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + "`max_steps`: int=1000, maximum number of training steps.
\n", + "`learning_rate`: float=1e-3, Learning rate between (0, 1).
\n", + "`num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.
\n", + "`early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.
\n", + "`val_check_steps`: int=100, Number of training steps between every validation loss check.
\n", + "`batch_size`: int=32, number of different series in each batch.
\n", + "`step_size`: int=1, step size between each window of temporal data.
\n", + "`scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).
\n", + "`random_seed`: int=1, random_seed for pytorch initializer and numpy generators.
\n", + "`num_workers_loader`: int=0, workers to be used by `TimeSeriesDataLoader`.
\n", + "`drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.
\n", + "`alias`: str, optional, Custom name of the model.
\n", + "`optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).
\n", + "`optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.
\n", + "`**trainer_kwargs`: int, keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).
\n", + "\n", + "**References**
\n", + "- [Yong Liu, Tengge Hu, Haoran Zhang, Haixu Wu, Shiyu Wang, Lintao Ma, Mingsheng Long. \"iTransformer: Inverted Transformers Are Effective for Time Series Forecasting\"](https://arxiv.org/abs/2310.06625)" + ], + "text/plain": [ + "---\n", + "\n", + "[source](https://github.com/Nixtla/neuralforecast/blob/main/neuralforecast/models/itransformer.py#L94){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "\n", + "### iTransformer\n", + "\n", + "> iTransformer (h, input_size, n_series, futr_exog_list=None,\n", + "> hist_exog_list=None, stat_exog_list=None,\n", + "> hidden_size:int=512, n_heads:int=8, e_layers:int=2,\n", + "> d_layers:int=1, d_ff:int=2048, factor:int=1,\n", + "> dropout:float=0.1, use_norm:bool=True, loss=MAE(),\n", + "> valid_loss=None, max_steps:int=1000,\n", + "> learning_rate:float=0.001, num_lr_decays:int=-1,\n", + "> early_stop_patience_steps:int=-1, val_check_steps:int=100,\n", + "> batch_size:int=32, step_size:int=1,\n", + "> scaler_type:str='identity', random_seed:int=1,\n", + "> num_workers_loader:int=0, drop_last_loader:bool=False,\n", + "> optimizer=None, optimizer_kwargs=None, **trainer_kwargs)\n", + "\n", + "iTransformer\n", + "\n", + "**Parameters:**
\n", + "`h`: int, Forecast horizon.
\n", + "`input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].
\n", + "`n_series`: int, number of time-series.
\n", + "`futr_exog_list`: str list, future exogenous columns.
\n", + "`hist_exog_list`: str list, historic exogenous columns.
\n", + "`stat_exog_list`: str list, static exogenous columns.
\n", + "`hidden_size`: int, dimension of the model.
\n", + "`n_heads`: int, number of heads.
\n", + "`e_layers`: int, number of encoder layers.
\n", + "`d_layers`: int, number of decoder layers.
\n", + "`d_ff`: int, dimension of fully-connected layer.
\n", + "`factor`: int, attention factor.
\n", + "`dropout`: float, dropout rate.
\n", + "`use_norm`: bool, whether to normalize or not.
\n", + "`loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + "`valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
\n", + "`max_steps`: int=1000, maximum number of training steps.
\n", + "`learning_rate`: float=1e-3, Learning rate between (0, 1).
\n", + "`num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.
\n", + "`early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.
\n", + "`val_check_steps`: int=100, Number of training steps between every validation loss check.
\n", + "`batch_size`: int=32, number of different series in each batch.
\n", + "`step_size`: int=1, step size between each window of temporal data.
\n", + "`scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).
\n", + "`random_seed`: int=1, random_seed for pytorch initializer and numpy generators.
\n", + "`num_workers_loader`: int=0, workers to be used by `TimeSeriesDataLoader`.
\n", + "`drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.
\n", + "`alias`: str, optional, Custom name of the model.
\n", + "`optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).
\n", + "`optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.
\n", + "`**trainer_kwargs`: int, keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).
\n", + "\n", + "**References**
\n", + "- [Yong Liu, Tengge Hu, Haoran Zhang, Haixu Wu, Shiyu Wang, Lintao Ma, Mingsheng Long. \"iTransformer: Inverted Transformers Are Effective for Time Series Forecasting\"](https://arxiv.org/abs/2310.06625)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_doc(iTransformer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "---\n", + "\n", + "### iTransformer.fit\n", + "\n", + "> iTransformer.fit (dataset, val_size=0, test_size=0, random_seed=None)\n", + "\n", + "Fit.\n", + "\n", + "The `fit` method, optimizes the neural network's weights using the\n", + "initialization parameters (`learning_rate`, `windows_batch_size`, ...)\n", + "and the `loss` function as defined during the initialization.\n", + "Within `fit` we use a PyTorch Lightning `Trainer` that\n", + "inherits the initialization's `self.trainer_kwargs`, to customize\n", + "its inputs, see [PL's trainer arguments](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).\n", + "\n", + "The method is designed to be compatible with SKLearn-like classes\n", + "and in particular to be compatible with the StatsForecast library.\n", + "\n", + "By default the `model` is not saving training checkpoints to protect\n", + "disk memory, to get them change `enable_checkpointing=True` in `__init__`.\n", + "\n", + "**Parameters:**
\n", + "`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).
\n", + "`val_size`: int, validation size for temporal cross-validation.
\n", + "`test_size`: int, test size for temporal cross-validation.
" + ], + "text/plain": [ + "---\n", + "\n", + "### iTransformer.fit\n", + "\n", + "> iTransformer.fit (dataset, val_size=0, test_size=0, random_seed=None)\n", + "\n", + "Fit.\n", + "\n", + "The `fit` method, optimizes the neural network's weights using the\n", + "initialization parameters (`learning_rate`, `windows_batch_size`, ...)\n", + "and the `loss` function as defined during the initialization.\n", + "Within `fit` we use a PyTorch Lightning `Trainer` that\n", + "inherits the initialization's `self.trainer_kwargs`, to customize\n", + "its inputs, see [PL's trainer arguments](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).\n", + "\n", + "The method is designed to be compatible with SKLearn-like classes\n", + "and in particular to be compatible with the StatsForecast library.\n", + "\n", + "By default the `model` is not saving training checkpoints to protect\n", + "disk memory, to get them change `enable_checkpointing=True` in `__init__`.\n", + "\n", + "**Parameters:**
\n", + "`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).
\n", + "`val_size`: int, validation size for temporal cross-validation.
\n", + "`test_size`: int, test size for temporal cross-validation.
" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_doc(iTransformer.fit, name='iTransformer.fit')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "---\n", + "\n", + "### iTransformer.predict\n", + "\n", + "> iTransformer.predict (dataset, test_size=None, step_size=1,\n", + "> random_seed=None, **data_module_kwargs)\n", + "\n", + "Predict.\n", + "\n", + "Neural network prediction with PL's `Trainer` execution of `predict_step`.\n", + "\n", + "**Parameters:**
\n", + "`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).
\n", + "`test_size`: int=None, test size for temporal cross-validation.
\n", + "`step_size`: int=1, Step size between each window.
\n", + "`**data_module_kwargs`: PL's TimeSeriesDataModule args, see [documentation](https://pytorch-lightning.readthedocs.io/en/1.6.1/extensions/datamodules.html#using-a-datamodule)." + ], + "text/plain": [ + "---\n", + "\n", + "### iTransformer.predict\n", + "\n", + "> iTransformer.predict (dataset, test_size=None, step_size=1,\n", + "> random_seed=None, **data_module_kwargs)\n", + "\n", + "Predict.\n", + "\n", + "Neural network prediction with PL's `Trainer` execution of `predict_step`.\n", + "\n", + "**Parameters:**
\n", + "`dataset`: NeuralForecast's `TimeSeriesDataset`, see [documentation](https://nixtla.github.io/neuralforecast/tsdataset.html).
\n", + "`test_size`: int=None, test size for temporal cross-validation.
\n", + "`step_size`: int=1, Step size between each window.
\n", + "`**data_module_kwargs`: PL's TimeSeriesDataModule args, see [documentation](https://pytorch-lightning.readthedocs.io/en/1.6.1/extensions/datamodules.html#using-a-datamodule)." + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_doc(iTransformer.predict, name='iTransformer.predict')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Usage example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pytorch_lightning as pl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from neuralforecast import NeuralForecast\n", + "from neuralforecast.utils import AirPassengersPanel, AirPassengersStatic\n", + "from neuralforecast.losses.pytorch import MSE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.\n", + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'valid_loss' is an instance of `nn.Module` and is already saved during checkpointing. 
It is recommended to ignore them using `self.save_hyperparameters(ignore=['valid_loss'])`.\n", + "Seed set to 1\n", + "GPU available: True (mps), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "---------------------------------------------------------\n", + "0 | padder | ConstantPad1d | 0 \n", + "1 | loss | MSE | 0 \n", + "2 | valid_loss | MAE | 0 \n", + "3 | scaler | TemporalNorm | 0 \n", + "4 | enc_embedding | DataEmbedding_inverted | 3.2 K \n", + "5 | encoder | TransEncoder | 135 K \n", + "6 | projector | Linear | 1.5 K \n", + "---------------------------------------------------------\n", + "140 K Trainable params\n", + "0 Non-trainable params\n", + "140 K Total params\n", + "0.562 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "db2340a0a0ea4ab79a8f3c3fbc5e8962", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00]. Skipping setting a default `ModelSummary` callback.\n", + "GPU available: True (mps), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b61339b3642d44bfb953a7b2becf4cc4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Predicting: | | 0/? 
[00:00=AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) # 12 test\n", + "\n", + "model = iTransformer(h=12,\n", + " input_size=24,\n", + " n_series=2,\n", + " hidden_size=128,\n", + " n_heads=2,\n", + " e_layers=2,\n", + " d_layers=1,\n", + " d_ff=4,\n", + " factor=1,\n", + " dropout=0.1,\n", + " use_norm=True,\n", + " loss=MSE(),\n", + " valid_loss=MAE(),\n", + " early_stop_patience_steps=3,\n", + " batch_size=32)\n", + "\n", + "fcst = NeuralForecast(models=[model], freq='M')\n", + "fcst.fit(df=Y_train_df, static_df=AirPassengersStatic, val_size=12)\n", + "forecasts = fcst.predict(futr_df=Y_test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#| eval: false\n", + "# Plot predictions\n", + "fig, ax = plt.subplots(1, 1, figsize = (20, 7))\n", + "Y_hat_df = forecasts.reset_index(drop=False).drop(columns=['unique_id','ds'])\n", + "plot_df = pd.concat([Y_test_df, Y_hat_df], axis=1)\n", + "plot_df = pd.concat([Y_train_df, plot_df])\n", + "\n", + "plot_df = plot_df[plot_df.unique_id=='Airline1'].drop('unique_id', axis=1)\n", + "plt.plot(plot_df['ds'], plot_df['y'], c='black', label='True')\n", + "plt.plot(plot_df['ds'], plot_df['iTransformer'], c='blue', label='Forecast')\n", + "ax.set_title('AirPassengers Forecast', fontsize=22)\n", + "ax.set_ylabel('Monthly Passengers', fontsize=20)\n", + "ax.set_xlabel('Year', fontsize=20)\n", + "ax.legend(prop={'size': 15})\n", + "ax.grid()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pytorch_lightning as pl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from neuralforecast import NeuralForecast\n", + "from neuralforecast.utils import AirPassengersPanel, AirPassengersStatic\n", + "from neuralforecast.losses.pytorch import MSE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.\n", + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'valid_loss' is an instance of `nn.Module` and is already saved during checkpointing. 
It is recommended to ignore them using `self.save_hyperparameters(ignore=['valid_loss'])`.\n", + "Seed set to 1\n", + "GPU available: True (mps), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + " | Name | Type | Params\n", + "---------------------------------------------------------\n", + "0 | padder | ConstantPad1d | 0 \n", + "1 | loss | MSE | 0 \n", + "2 | valid_loss | MAE | 0 \n", + "3 | scaler | TemporalNorm | 0 \n", + "4 | enc_embedding | DataEmbedding_inverted | 3.2 K \n", + "5 | encoder | TransEncoder | 135 K \n", + "6 | projector | Linear | 1.5 K \n", + "---------------------------------------------------------\n", + "140 K Trainable params\n", + "0 Non-trainable params\n", + "140 K Total params\n", + "0.562 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f118cbdd019d4bb0990b028bc6e8ddeb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00]. Skipping setting a default `ModelSummary` callback.\n", + "GPU available: True (mps), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "/Users/marcopeix/miniconda3/envs/neuralforecast/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7d07116f871b4263a25d7874684e167a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Predicting: | | 0/? 
[00:00=AirPassengersPanel['ds'].values[-12]].reset_index(drop=True) # 12 test\n", + "\n", + "model = iTransformer(h=12,\n", + " input_size=24,\n", + " n_series=1,\n", + " hidden_size=128,\n", + " n_heads=2,\n", + " e_layers=2,\n", + " d_layers=1,\n", + " d_ff=4,\n", + " factor=1,\n", + " dropout=0.1,\n", + " use_norm=True,\n", + " loss=MSE(),\n", + " valid_loss=MAE(),\n", + " early_stop_patience_steps=3,\n", + " batch_size=32)\n", + "\n", + "fcst = NeuralForecast(models=[model], freq='M')\n", + "fcst.fit(df=Y_train_df, val_size=12)\n", + "forecasts = fcst.predict(futr_df=Y_test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/neuralforecast/_modidx.py b/neuralforecast/_modidx.py index 933d3af02..0f795ee19 100644 --- a/neuralforecast/_modidx.py +++ b/neuralforecast/_modidx.py @@ -120,7 +120,12 @@ 'neuralforecast.auto.AutoVanillaTransformer.__init__': ( 'models.html#autovanillatransformer.__init__', 'neuralforecast/auto.py'), 'neuralforecast.auto.AutoVanillaTransformer.get_default_config': ( 'models.html#autovanillatransformer.get_default_config', - 'neuralforecast/auto.py')}, + 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoiTransformer': ('models.html#autoitransformer', 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoiTransformer.__init__': ( 'models.html#autoitransformer.__init__', + 'neuralforecast/auto.py'), + 'neuralforecast.auto.AutoiTransformer.get_default_config': ( 'models.html#autoitransformer.get_default_config', + 'neuralforecast/auto.py')}, 'neuralforecast.core': { 'neuralforecast.core.NeuralForecast': ('core.html#neuralforecast', 'neuralforecast/core.py'), 'neuralforecast.core.NeuralForecast.__init__': ( 'core.html#neuralforecast.__init__', 'neuralforecast/core.py'), @@ -674,6 +679,32 @@ 
'neuralforecast/models/informer.py'), 'neuralforecast.models.informer.ProbMask.mask': ( 'models.informer.html#probmask.mask', 'neuralforecast/models/informer.py')}, + 'neuralforecast.models.itransformer': { 'neuralforecast.models.itransformer.DataEmbedding_inverted': ( 'models.itransformer.html#dataembedding_inverted', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.DataEmbedding_inverted.__init__': ( 'models.itransformer.html#dataembedding_inverted.__init__', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.DataEmbedding_inverted.forward': ( 'models.itransformer.html#dataembedding_inverted.forward', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.FullAttention': ( 'models.itransformer.html#fullattention', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.FullAttention.__init__': ( 'models.itransformer.html#fullattention.__init__', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.FullAttention.forward': ( 'models.itransformer.html#fullattention.forward', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.TriangularCausalMask': ( 'models.itransformer.html#triangularcausalmask', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.TriangularCausalMask.__init__': ( 'models.itransformer.html#triangularcausalmask.__init__', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.TriangularCausalMask.mask': ( 'models.itransformer.html#triangularcausalmask.mask', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.iTransformer': ( 'models.itransformer.html#itransformer', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.iTransformer.__init__': ( 'models.itransformer.html#itransformer.__init__', + 'neuralforecast/models/itransformer.py'), + 
'neuralforecast.models.itransformer.iTransformer.forecast': ( 'models.itransformer.html#itransformer.forecast', + 'neuralforecast/models/itransformer.py'), + 'neuralforecast.models.itransformer.iTransformer.forward': ( 'models.itransformer.html#itransformer.forward', + 'neuralforecast/models/itransformer.py')}, 'neuralforecast.models.lstm': { 'neuralforecast.models.lstm.LSTM': ('models.lstm.html#lstm', 'neuralforecast/models/lstm.py'), 'neuralforecast.models.lstm.LSTM.__init__': ( 'models.lstm.html#lstm.__init__', 'neuralforecast/models/lstm.py'), diff --git a/neuralforecast/auto.py b/neuralforecast/auto.py index cbfef705b..20932619a 100644 --- a/neuralforecast/auto.py +++ b/neuralforecast/auto.py @@ -3,8 +3,8 @@ # %% auto 0 __all__ = ['AutoRNN', 'AutoLSTM', 'AutoGRU', 'AutoTCN', 'AutoDeepAR', 'AutoDilatedRNN', 'AutoMLP', 'AutoNBEATS', 'AutoNBEATSx', 'AutoNHITS', 'AutoDLinear', 'AutoNLinear', 'AutoTFT', 'AutoVanillaTransformer', 'AutoInformer', - 'AutoAutoformer', 'AutoFEDformer', 'AutoPatchTST', 'AutoTimesNet', 'AutoStemGNN', 'AutoHINT', 'AutoTSMixer', - 'AutoTSMixerx', 'AutoMLPMultivariate'] + 'AutoAutoformer', 'AutoFEDformer', 'AutoPatchTST', 'AutoiTransformer', 'AutoTimesNet', 'AutoStemGNN', + 'AutoHINT', 'AutoTSMixer', 'AutoTSMixerx', 'AutoMLPMultivariate'] # %% ../nbs/models.ipynb 2 from os import cpu_count @@ -37,6 +37,7 @@ from .models.fedformer import FEDformer from .models.patchtst import PatchTST from .models.timesnet import TimesNet +from .models.itransformer import iTransformer from .models.stemgnn import StemGNN from .models.hint import HINT @@ -1297,7 +1298,92 @@ def get_default_config(cls, h, backend, n_series=None): return config -# %% ../nbs/models.ipynb 88 +# %% ../nbs/models.ipynb 87 +class AutoiTransformer(BaseAuto): + + default_config = { + "input_size_multiplier": [1, 2, 3, 4, 5], + "h": None, + "n_series": None, + "hidden_size": tune.choice([64, 128, 256]), + "n_heads": tune.choice([4, 8]), + "learning_rate": tune.loguniform(1e-4, 1e-1), + 
"scaler_type": tune.choice([None, "robust", "standard"]), + "max_steps": tune.choice([500, 1000, 2000]), + "batch_size": tune.choice([32, 64, 128, 256]), + "loss": None, + "random_seed": tune.randint(1, 20), + } + + def __init__( + self, + h, + n_series, + loss=MAE(), + valid_loss=None, + config=None, + search_alg=BasicVariantGenerator(random_state=1), + num_samples=10, + refit_with_val=False, + cpus=cpu_count(), + gpus=torch.cuda.device_count(), + verbose=False, + alias=None, + backend="ray", + callbacks=None, + ): + + # Define search space, input/output sizes + if config is None: + config = self.get_default_config(h=h, backend=backend, n_series=n_series) + + # Always use n_series from parameters, raise exception with Optuna because we can't enforce it + if backend == "ray": + config["n_series"] = n_series + elif backend == "optuna": + mock_trial = MockTrial() + if ( + "n_series" in config(mock_trial) + and config(mock_trial)["n_series"] != n_series + ) or ("n_series" not in config(mock_trial)): + raise Exception(f"config needs 'n_series': {n_series}") + + super(AutoiTransformer, self).__init__( + cls_model=iTransformer, + h=h, + loss=loss, + valid_loss=valid_loss, + config=config, + search_alg=search_alg, + num_samples=num_samples, + refit_with_val=refit_with_val, + cpus=cpus, + gpus=gpus, + verbose=verbose, + alias=alias, + backend=backend, + callbacks=callbacks, + ) + + @classmethod + def get_default_config(cls, h, backend, n_series): + config = cls.default_config.copy() + config["input_size"] = tune.choice( + [h * x for x in config["input_size_multiplier"]] + ) + + # Rolling windows with step_size=1 or step_size=h + # See `BaseWindows` and `BaseRNN`'s create_windows + config["step_size"] = tune.choice([1, h]) + del config["input_size_multiplier"] + if backend == "optuna": + # Always use n_series from parameters + config["n_series"] = n_series + config = cls._ray_config_to_optuna(config) + + return config + +# %% ../nbs/models.ipynb 92 class 
AutoTimesNet(BaseAuto): default_config = { @@ -1365,7 +1451,7 @@ def get_default_config(cls, h, backend, n_series=None): return config -# %% ../nbs/models.ipynb 93 +# %% ../nbs/models.ipynb 97 class AutoStemGNN(BaseAuto): default_config = { @@ -1450,7 +1536,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 97 +# %% ../nbs/models.ipynb 101 class AutoHINT(BaseAuto): def __init__( @@ -1515,7 +1601,7 @@ def _fit_model(self, cls_model, config, dataset, val_size, test_size): def get_default_config(cls, h, backend, n_series=None): raise Exception("AutoHINT has no default configuration.") -# %% ../nbs/models.ipynb 102 +# %% ../nbs/models.ipynb 106 class AutoTSMixer(BaseAuto): default_config = { @@ -1601,7 +1687,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 106 +# %% ../nbs/models.ipynb 110 class AutoTSMixerx(BaseAuto): default_config = { @@ -1687,7 +1773,7 @@ def get_default_config(cls, h, backend, n_series): return config -# %% ../nbs/models.ipynb 110 +# %% ../nbs/models.ipynb 114 class AutoMLPMultivariate(BaseAuto): default_config = { diff --git a/neuralforecast/core.py b/neuralforecast/core.py index 5e6061c04..cefc05463 100644 --- a/neuralforecast/core.py +++ b/neuralforecast/core.py @@ -52,6 +52,7 @@ TSMixer, TSMixerx, MLPMultivariate, + iTransformer, ) # %% ../nbs/core.ipynb 5 @@ -156,6 +157,8 @@ def _insample_times( "autotsmixerx": TSMixerx, "mlpmultivariate": MLPMultivariate, "automlpmultivariate": MLPMultivariate, + "itransformer": iTransformer, + "autoitransformer": iTransformer, } # %% ../nbs/core.ipynb 8 diff --git a/neuralforecast/models/__init__.py b/neuralforecast/models/__init__.py index bd3fae666..4ce374098 100644 --- a/neuralforecast/models/__init__.py +++ b/neuralforecast/models/__init__.py @@ -2,6 +2,7 @@ 'MLP', 'NHITS', 'NBEATS', 'NBEATSx', 'DLinear', 'NLinear', 'TFT', 'VanillaTransformer', 'Informer', 'Autoformer', 'PatchTST', 'FEDformer', 'StemGNN', 'HINT', 
'TimesNet', 'TimeLLM', 'TSMixer', 'TSMixerx', 'MLPMultivariate', + 'iTransformer' ] from .rnn import RNN @@ -28,4 +29,5 @@ from .timellm import TimeLLM from .tsmixer import TSMixer from .tsmixerx import TSMixerx -from .mlpmultivariate import MLPMultivariate \ No newline at end of file +from .mlpmultivariate import MLPMultivariate +from .itransformer import iTransformer diff --git a/neuralforecast/models/itransformer.py b/neuralforecast/models/itransformer.py new file mode 100644 index 000000000..25b7c69f9 --- /dev/null +++ b/neuralforecast/models/itransformer.py @@ -0,0 +1,293 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/models.itransformer.ipynb. + +# %% auto 0 +__all__ = ['iTransformer'] + +# %% ../../nbs/models.itransformer.ipynb 6 +import torch +import torch.nn as nn +import torch.nn.functional as F + +import numpy as np + +from typing import Optional +from math import sqrt + +from ..losses.pytorch import MAE +from ..common._base_multivariate import BaseMultivariate + +from neuralforecast.common._modules import ( + TransEncoder, + TransEncoderLayer, + AttentionLayer, +) + +# %% ../../nbs/models.itransformer.ipynb 9 +class TriangularCausalMask: + def __init__(self, B, L, device="cpu"): + mask_shape = [B, 1, L, L] + with torch.no_grad(): + self._mask = torch.triu( + torch.ones(mask_shape, dtype=torch.bool), diagonal=1 + ).to(device) + + @property + def mask(self): + return self._mask + + +class FullAttention(nn.Module): + def __init__( + self, + mask_flag=True, + factor=5, + scale=None, + attention_dropout=0.1, + output_attention=False, + ): + super(FullAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1.0 / sqrt(E) + + scores = torch.einsum("blhe,bshe->bhls", queries, keys) + 
+ if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + +# %% ../../nbs/models.itransformer.ipynb 11 +class DataEmbedding_inverted(nn.Module): + def __init__(self, c_in, hidden_size, dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, hidden_size) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is None: + x = self.value_embedding(x) + else: + # the potential to take covariates (e.g. timestamps) as tokens + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate hidden_size] + return self.dropout(x) + +# %% ../../nbs/models.itransformer.ipynb 13 +class iTransformer(BaseMultivariate): + """iTransformer + + **Parameters:**
+ `h`: int, Forecast horizon.
+ `input_size`: int, autoregressive inputs size, y=[1,2,3,4] input_size=2 -> y_[t-2:t]=[1,2].
+ `n_series`: int, number of time-series.
+ `futr_exog_list`: str list, future exogenous columns. Not supported by iTransformer: must be left as `None`, otherwise an exception is raised.
+ `hist_exog_list`: str list, historic exogenous columns. Not supported by iTransformer: must be left as `None`, otherwise an exception is raised.
+ `stat_exog_list`: str list, static exogenous columns. Not supported by iTransformer: must be left as `None`, otherwise an exception is raised.
+ `hidden_size`: int, dimension of the model.
+ `n_heads`: int, number of heads.
+ `e_layers`: int, number of encoder layers.
+ `d_layers`: int, number of decoder layers. Stored on the model but not used when building the encoder-only architecture.
+ `d_ff`: int, dimension of fully-connected layer.
+ `factor`: int, attention factor. Passed to `FullAttention`, which currently does not use it.
+ `dropout`: float, dropout rate.
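+ `use_norm`: bool, whether to standardize each input window (subtract its mean, divide by its standard deviation) before encoding and revert that scaling on the forecasts.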
+ `loss`: PyTorch module, instantiated train loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
+ `valid_loss`: PyTorch module=`loss`, instantiated valid loss class from [losses collection](https://nixtla.github.io/neuralforecast/losses.pytorch.html).
+ `max_steps`: int=1000, maximum number of training steps.
+ `learning_rate`: float=1e-3, Learning rate between (0, 1).
+ `num_lr_decays`: int=-1, Number of learning rate decays, evenly distributed across max_steps.
+ `early_stop_patience_steps`: int=-1, Number of validation iterations before early stopping.
+ `val_check_steps`: int=100, Number of training steps between every validation loss check.
+ `batch_size`: int=32, number of different series in each batch.
+ `step_size`: int=1, step size between each window of temporal data.
+ `scaler_type`: str='identity', type of scaler for temporal inputs normalization see [temporal scalers](https://nixtla.github.io/neuralforecast/common.scalers.html).
+ `random_seed`: int=1, random_seed for pytorch initializer and numpy generators.
+ `num_workers_loader`: int=0, workers to be used by `TimeSeriesDataLoader`.
+ `drop_last_loader`: bool=False, if True `TimeSeriesDataLoader` drops last non-full batch.
+ `alias`: str, optional, Custom name of the model.
+ `optimizer`: Subclass of 'torch.optim.Optimizer', optional, user specified optimizer instead of the default choice (Adam).
+ `optimizer_kwargs`: dict, optional, list of parameters used by the user specified `optimizer`.
+ `**trainer_kwargs`: keyword trainer arguments inherited from [PyTorch Lightning's trainer](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer).
+ + **References**
+ - [Yong Liu, Tengge Hu, Haoran Zhang, Haixu Wu, Shiyu Wang, Lintao Ma, Mingsheng Long. "iTransformer: Inverted Transformers Are Effective for Time Series Forecasting"](https://arxiv.org/abs/2310.06625) + """ + + # Class attributes + SAMPLING_TYPE = "multivariate" + + def __init__( + self, + h, + input_size, + n_series, + futr_exog_list=None, + hist_exog_list=None, + stat_exog_list=None, + hidden_size: int = 512, + n_heads: int = 8, + e_layers: int = 2, + d_layers: int = 1, + d_ff: int = 2048, + factor: int = 1, + dropout: float = 0.1, + use_norm: bool = True, + loss=MAE(), + valid_loss=None, + max_steps: int = 1000, + learning_rate: float = 1e-3, + num_lr_decays: int = -1, + early_stop_patience_steps: int = -1, + val_check_steps: int = 100, + batch_size: int = 32, + step_size: int = 1, + scaler_type: str = "identity", + random_seed: int = 1, + num_workers_loader: int = 0, + drop_last_loader: bool = False, + optimizer=None, + optimizer_kwargs=None, + **trainer_kwargs + ): + + super(iTransformer, self).__init__( + h=h, + input_size=input_size, + n_series=n_series, + stat_exog_list=None, + futr_exog_list=None, + hist_exog_list=None, + loss=loss, + valid_loss=valid_loss, + max_steps=max_steps, + learning_rate=learning_rate, + num_lr_decays=num_lr_decays, + early_stop_patience_steps=early_stop_patience_steps, + val_check_steps=val_check_steps, + batch_size=batch_size, + step_size=step_size, + scaler_type=scaler_type, + random_seed=random_seed, + num_workers_loader=num_workers_loader, + drop_last_loader=drop_last_loader, + optimizer=optimizer, + optimizer_kwargs=optimizer_kwargs, + **trainer_kwargs + ) + + # Asserts + if stat_exog_list is not None: + raise Exception("iTransformer does not support static exogenous variables") + if futr_exog_list is not None: + raise Exception("iTransformer does not support future exogenous variables") + if hist_exog_list is not None: + raise Exception( + "iTransformer does not support historical exogenous variables" + ) + + self.enc_in 
= n_series + self.dec_in = n_series + self.c_out = n_series + self.hidden_size = hidden_size + self.n_heads = n_heads + self.e_layers = e_layers + self.d_layers = d_layers + self.d_ff = d_ff + self.factor = factor + self.dropout = dropout + self.use_norm = use_norm + + # Architecture + self.enc_embedding = DataEmbedding_inverted( + input_size, self.hidden_size, self.dropout + ) + + self.encoder = TransEncoder( + [ + TransEncoderLayer( + AttentionLayer( + FullAttention( + False, self.factor, attention_dropout=self.dropout + ), + self.hidden_size, + self.n_heads, + ), + self.hidden_size, + self.d_ff, + dropout=self.dropout, + activation=F.gelu, + ) + for l in range(self.e_layers) + ], + norm_layer=torch.nn.LayerNorm(self.hidden_size), + ) + + self.projector = nn.Linear(self.hidden_size, h, bias=True) + + def forecast(self, x_enc): + if self.use_norm: + # Normalization from Non-stationary Transformer + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt( + torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5 + ) + x_enc /= stdev + + _, _, N = x_enc.shape # B L N + # B: batch_size; E: hidden_size; + # L: input_size; S: horizon(h); + # N: number of variate (tokens), can also includes covariates + + # Embedding + # B L N -> B N E (B L N -> B L E in the vanilla Transformer) + enc_out = self.enc_embedding( + x_enc, None + ) # covariates (e.g timestamp) can be also embedded as tokens + + # B N E -> B N E (B L E -> B L E in the vanilla Transformer) + # the dimensions of embedded time series has been inverted, and then processed by native attn, layernorm and ffn modules + enc_out, attns = self.encoder(enc_out, attn_mask=None) + + # B N E -> B N S -> B S N + dec_out = self.projector(enc_out).permute(0, 2, 1)[ + :, :, :N + ] # filter the covariates + + if self.use_norm: + # De-Normalization from Non-stationary Transformer + dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.h, 1)) + dec_out = dec_out + (means[:, 0, 
:].unsqueeze(1).repeat(1, self.h, 1)) + + return dec_out + + def forward(self, windows_batch): + insample_y = windows_batch["insample_y"] + + y_pred = self.forecast(insample_y) + y_pred = y_pred[:, -self.h :, :] + y_pred = self.loss.domain_map(y_pred) + + # domain_map might have squeezed the last dimension in case n_series == 1 + if y_pred.ndim == 2: + return y_pred.unsqueeze(-1) + else: + return y_pred