diff --git a/nbs/common.base_recurrent.ipynb b/nbs/common.base_recurrent.ipynb index f84087969..da17c378b 100644 --- a/nbs/common.base_recurrent.ipynb +++ b/nbs/common.base_recurrent.ipynb @@ -151,14 +151,15 @@ " self.early_stop_patience_steps = early_stop_patience_steps\n", " self.val_check_steps = val_check_steps\n", "\n", - " # Scaler\n", - " self.scaler = TemporalNorm(scaler_type=scaler_type, dim=-1) # Time dimension is -1.\n", - "\n", " # Variables\n", " self.futr_exog_list = futr_exog_list if futr_exog_list is not None else []\n", " self.hist_exog_list = hist_exog_list if hist_exog_list is not None else []\n", " self.stat_exog_list = stat_exog_list if stat_exog_list is not None else []\n", "\n", + " # Scaler\n", + " self.scaler = TemporalNorm(scaler_type=scaler_type, dim=-1, # Time dimension is -1.\n", + " num_features=1+len(self.hist_exog_list)+len(self.futr_exog_list)) \n", + "\n", " # Fit arguments\n", " self.val_size = 0\n", " self.test_size = 0\n", @@ -219,13 +220,17 @@ " 'interval': 'step'}\n", " return {'optimizer': optimizer, 'lr_scheduler': scheduler}\n", "\n", - " def _normalization(self, batch, val_size=0, test_size=0):\n", + " def _get_temporal_data_cols(self, temporal_cols):\n", + " temporal_data_cols = ['y'] + list(set(temporal_cols.tolist()) &\\\n", + " set(self.hist_exog_list + self.futr_exog_list))\n", + " return temporal_data_cols\n", "\n", + " def _normalization(self, batch, val_size=0, test_size=0):\n", " temporal = batch['temporal'] # B, C, T\n", " temporal_cols = batch['temporal_cols'].copy()\n", "\n", " # Separate data and mask\n", - " temporal_data_cols = temporal_cols.drop('available_mask').tolist()\n", + " temporal_data_cols = self._get_temporal_data_cols(temporal_cols=temporal_cols)\n", " temporal_data = temporal[:, temporal_cols.get_indexer(temporal_data_cols), :]\n", " temporal_mask = temporal[:, temporal_cols.get_loc('available_mask'), :].clone()\n", "\n", @@ -679,6 +684,52 @@ "show_doc(BaseRecurrent.predict, title_level=3)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# add h=0,1 unit test for _parse_windows \n", + "from neuralforecast.losses.pytorch import MAE\n", + "from neuralforecast.utils import AirPassengersDF\n", + "from neuralforecast.tsdataset import TimeSeriesDataset, TimeSeriesDataModule\n", + "\n", + "# Declare batch\n", + "AirPassengersDF['x'] = np.array(len(AirPassengersDF))\n", + "AirPassengersDF['x2'] = np.array(len(AirPassengersDF)) * 2\n", + "dataset, indices, dates, ds = TimeSeriesDataset.from_df(df=AirPassengersDF)\n", + "data = TimeSeriesDataModule(dataset=dataset, batch_size=1, drop_last=True)\n", + "\n", + "train_loader = data.train_dataloader()\n", + "batch = next(iter(train_loader))\n", + "\n", + "# Test that hist_exog_list and futr_exog_list correctly filter data that is sent to scaler.\n", + "baserecurrent = BaseRecurrent(h=12,\n", + " input_size=117,\n", + " hist_exog_list=['x', 'x2'],\n", + " futr_exog_list=['x'],\n", + " loss=MAE(),\n", + " valid_loss=MAE(),\n", + " learning_rate=0.001,\n", + " max_steps=1,\n", + " val_check_steps=0,\n", + " batch_size=1,\n", + " valid_batch_size=1,\n", + " windows_batch_size=10,\n", + " inference_input_size=2,\n", + " start_padding_enabled=True)\n", + "\n", + "windows = baserecurrent._create_windows(batch, step='train')\n", + "\n", + "temporal_cols = windows['temporal_cols'].copy() # B, L+H, C\n", + "temporal_data_cols = baserecurrent._get_temporal_data_cols(temporal_cols=temporal_cols)\n", + "\n", + 
"test_eq(set(temporal_data_cols), set(['y', 'x', 'x2']))\n", + "test_eq(windows['temporal'].shape, torch.Size([1,len(['y', 'x', 'x2', 'available_mask']),117,12+1]))" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/nbs/common.base_windows.ipynb b/nbs/common.base_windows.ipynb index 6cb63be19..cab27b4df 100644 --- a/nbs/common.base_windows.ipynb +++ b/nbs/common.base_windows.ipynb @@ -158,15 +158,16 @@ " self.windows_batch_size = windows_batch_size\n", " self.step_size = step_size\n", "\n", - " # Scaler\n", - " self.scaler = TemporalNorm(scaler_type=scaler_type, dim=1) # Time dimension is 1.\n", - "\n", " # Variables\n", " self.futr_exog_list = futr_exog_list if futr_exog_list is not None else []\n", " self.hist_exog_list = hist_exog_list if hist_exog_list is not None else []\n", " self.stat_exog_list = stat_exog_list if stat_exog_list is not None else []\n", " self.exclude_insample_y = exclude_insample_y\n", "\n", + " # Scaler\n", + " self.scaler = TemporalNorm(scaler_type=scaler_type, dim=1, # Time dimension is 1.\n", + " num_features=1+len(self.hist_exog_list)+len(self.futr_exog_list))\n", + "\n", " # Fit arguments\n", " self.val_size = 0\n", " self.test_size = 0\n", @@ -353,6 +354,11 @@ " return windows_batch\n", " else:\n", " raise ValueError(f'Unknown step {step}')\n", + "\n", + " def _get_temporal_data_cols(self, temporal_cols):\n", + " temporal_data_cols = ['y'] + list(set(temporal_cols.tolist()) &\\\n", + " set(self.hist_exog_list + self.futr_exog_list))\n", + " return temporal_data_cols\n", " \n", " def _normalization(self, windows):\n", " # windows are already filtered by train/validation/test\n", @@ -361,7 +367,8 @@ " temporal_cols = windows['temporal_cols'].copy() # B, L+H, C\n", "\n", " # To avoid leakage uses only the lags\n", - " temporal_data_cols = temporal_cols.drop('available_mask').tolist()\n", + " #temporal_data_cols = temporal_cols.drop('available_mask').tolist()\n", + " temporal_data_cols = self._get_temporal_data_cols(temporal_cols=temporal_cols)\n", " temporal_data = temporal[:, :, temporal_cols.get_indexer(temporal_data_cols)]\n", " temporal_mask = temporal[:, :, temporal_cols.get_loc('available_mask')].clone()\n", " if self.h > 0:\n", @@ -822,6 +829,7 @@ "\n", "# Declare batch\n", "AirPassengersDF['x'] = np.array(len(AirPassengersDF))\n", + "AirPassengersDF['x2'] = np.array(len(AirPassengersDF)) * 2\n", "dataset, indices, dates, ds = TimeSeriesDataset.from_df(df=AirPassengersDF)\n", "data = TimeSeriesDataModule(dataset=dataset, batch_size=1, drop_last=True)\n", "\n", @@ -902,8 +910,51 @@ "windows = basewindows._create_windows(batch, step='predict')\n", "windows = basewindows._normalization(windows=windows)\n", "insample_y, insample_mask, outsample_y, outsample_mask, \\\n", - " hist_exog, futr_exog, stat_exog = basewindows._parse_windows(batch, windows)\n" + " hist_exog, futr_exog, stat_exog = basewindows._parse_windows(batch, windows)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54d2e850", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "\n", + "# Test that hist_exog_list and futr_exog_list correctly filter data.\n", + "# that is sent to scaler.\n", + "basewindows = BaseWindows(h=12,\n", + " input_size=500,\n", + " hist_exog_list=['x', 'x2'],\n", + " futr_exog_list=['x'],\n", + " loss=MAE(),\n", + " valid_loss=MAE(),\n", + " learning_rate=0.001,\n", + " max_steps=1,\n", + " val_check_steps=0,\n", + " batch_size=1,\n", + " valid_batch_size=1,\n", + " windows_batch_size=10,\n", + " 
inference_windows_batch_size=2,\n", + " start_padding_enabled=True)\n", + "\n", + "windows = basewindows._create_windows(batch, step='train')\n", + "\n", + "temporal_cols = windows['temporal_cols'].copy() # B, L+H, C\n", + "temporal_data_cols = basewindows._get_temporal_data_cols(temporal_cols=temporal_cols)\n", + "\n", + "test_eq(set(temporal_data_cols), set(['y', 'x', 'x2']))\n", + "test_eq(windows['temporal'].shape, torch.Size([10,500+12,len(['y', 'x', 'x2', 'available_mask'])]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf493ff9", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/nbs/common.scalers.ipynb b/nbs/common.scalers.ipynb index d6a72953a..50caf9a2f 100644 --- a/nbs/common.scalers.ipynb +++ b/nbs/common.scalers.ipynb @@ -30,7 +30,7 @@ "source": [ "# TemporalNorm\n", "\n", - "> Temporal normalization has proven to be essential in neural forecasting tasks, as it enables network's non-linearities to express themselves. Forecasting scaling methods take particular interest in the temporal dimension where most of the variance dwells, contrary to other deep learning techniques like `BatchNorm` that normalizes across batch and temporal dimensions, and `LayerNorm` that normalizes across the feature dimension. Currently we support the following techniques: `std`, `median`, `norm`, `norm1`, `invariant`.

" + "> Temporal normalization has proven to be essential in neural forecasting tasks, as it enables network's non-linearities to express themselves. Forecasting scaling methods take particular interest in the temporal dimension where most of the variance dwells, contrary to other deep learning techniques like `BatchNorm` that normalizes across batch and temporal dimensions, and `LayerNorm` that normalizes across the feature dimension. Currently we support the following techniques: `std`, `median`, `norm`, `norm1`, `invariant`, `revin`.

**References**
- [Kin G. Olivares, David Luo, Cristian Challu, Stefania La Vattiata, Max Mergenthaler, Artur Dubrawski (2023). \"HINT: Hierarchical Mixture Networks For Coherent Probabilistic Forecasting\". Neural Information Processing Systems, submitted. Working Paper version available at arxiv.](https://arxiv.org/abs/2305.07089)
- [Taesung Kim, Jinhee Kim, Yunwon Tae, Cheonbok Park, Jang-Ho Choi, Jaegul Choo (2022). \"Reversible Instance Normalization for Accurate Time-Series Forecasting against Distribution Shift\". International Conference on Learning Representations.](https://openreview.net/pdf?id=cGDAkQo1C0p)<br>
- [David Salinas, Valentin Flunkert, Jan Gasthaus, Tim Januschowski (2020). \"DeepAR: Probabilistic forecasting with autoregressive recurrent networks\". International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207019301888)
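<br>
A quick sketch of how the new `revin` option behaves (this restates the `transform` implementation added later in this diff, nothing beyond it): the learnable parameters $\hat{\gamma}, \hat{\beta}$ are folded into the shift and scale statistics computed over time, instead of being applied as a separate output-side affine, which keeps the inverse transform and the distribution scale-decouple working unchanged.<br>
$$\tilde{\mu} = \hat{\mu} + \hat{\beta}, \qquad \tilde{\sigma} = \hat{\sigma}\,\mathrm{ReLU}(\hat{\gamma} + \epsilon), \qquad \mathbf{z}_{[B,T,C]} = \frac{\mathbf{x}_{[B,T,C]} - \tilde{\mu}}{\tilde{\sigma}}$$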
" ] }, { @@ -42,6 +42,19 @@ "![Figure 1. Illustration of temporal normalization (left), layer normalization (center) and batch normalization (right). The entries in green show the components used to compute the normalizing statistics.](imgs_models/temporal_norm.png)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5400f41", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "import os\n", + "os.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\"\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -202,8 +215,17 @@ " x_range = x_max - x_min\n", " x_range[x_range==0] = 1.0\n", " x_range = x_range + eps\n", - " return x_min, x_range\n", - "\n", + " return x_min, x_range" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39fa429b", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def minmax_scaler(x, x_min, x_range):\n", " return (x - x_min) / x_range\n", "\n", @@ -263,8 +285,17 @@ " x_range = x_max - x_min\n", " x_range[x_range==0] = 1.0\n", " x_range = x_range + eps\n", - " return x_min, x_range\n", - "\n", + " return x_min, x_range" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a19ed5a8", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def minmax1_scaler(x, x_min, x_range):\n", " x = (x - x_min) / x_range\n", " z = x * (2) - 1\n", @@ -316,12 +347,21 @@ " \"\"\"\n", " x_means = masked_mean(x=x, mask=mask, dim=dim)\n", " x_stds = torch.sqrt(masked_mean(x=(x-x_means)**2, mask=mask, dim=dim))\n", - " \n", + "\n", " # Protect against division by zero\n", " x_stds[x_stds==0] = 1.0\n", " x_stds = x_stds + eps\n", - " return x_means, x_stds\n", - "\n", + " return x_means, x_stds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17f90821", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def std_scaler(x, x_means, x_stds):\n", " return (x - x_means) / x_stds\n", "\n", @@ -386,8 +426,17 @@ " # Protect against division by zero\n", " x_mad[x_mad==0] = 1.0\n", " x_mad = x_mad + eps\n", - " return x_median, x_mad\n", - "\n", + " return x_median, x_mad" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33f3cf28", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def robust_scaler(x, x_median, x_mad):\n", " return (x - x_median) / x_mad\n", "\n", @@ -450,8 +499,17 @@ " # Protect against division by zero\n", " x_mad[x_mad==0] = 1.0\n", " x_mad = x_mad + eps\n", - " return x_median, x_mad\n", - "\n", + " return x_median, x_mad" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24cca2bf", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def invariant_scaler(x, x_median, x_mad):\n", " return torch.arcsinh((x - x_median) / x_mad)\n", "\n", @@ -500,8 +558,17 @@ " x_shift = torch.zeros(shape)\n", " x_scale = torch.ones(shape)\n", "\n", - " return x_shift, x_scale\n", - "\n", + " return x_shift, x_scale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d7b313e", + "metadata": {}, + "outputs": [], + "source": [ + "#| exporti\n", "def identity_scaler(x, x_shift, x_scale):\n", " return x\n", "\n", @@ -546,18 +613,27 @@ "\n", " $$\\mathbf{z}_{[B,T,C]} = \\\\textrm{Scaler}(\\mathbf{x}_{[B,T,C]})$$\n", "\n", + " If `scaler_type` is `revin` learnable normalization parameters are added on top of\n", + " the usual normalization technique, the parameters are learned through scale decouple\n", + " 
global skip connections. The technique is available for point and probabilistic outputs.\n", + "\n", + " $$\\mathbf{\\hat{z}}_{[B,T,C]} = \\\\boldsymbol{\\hat{\\\\gamma}}_{[1,1,C]} \\mathbf{z}_{[B,T,C]} +\\\\boldsymbol{\\hat{\\\\beta}}_{[1,1,C]}$$\n", + "\n", " **Parameters:**
\n", - " `scaler_type`: str, defines the type of scaler used by TemporalNorm.\n", - " available [`identity`, `standard`, `robust`, `minmax`, `minmax1`, `invariant`].
\n", + " `scaler_type`: str, defines the type of scaler used by TemporalNorm. Available [`identity`, `standard`, `robust`, `minmax`, `minmax1`, `invariant`, `revin`].
\n", " `dim` (int, optional): Dimension over to compute scale and shift. Defaults to -1.
\n", " `eps` (float, optional): Small value to avoid division by zero. Defaults to 1e-6.
\n", - " \n", - " \"\"\" \n", - " def __init__(self, scaler_type='robust', dim=-1, eps=1e-6):\n", + " `num_features`: int=None, for RevIN-like learnable affine parameters initialization.
\n", + "\n", + " **References**
\n", + " - [Kin G. Olivares, David Luo, Cristian Challu, Stefania La Vattiata, Max Mergenthaler, Artur Dubrawski (2023). \"HINT: Hierarchical Mixture Networks For Coherent Probabilistic Forecasting\". Neural Information Processing Systems, submitted. Working Paper version available at arxiv.](https://arxiv.org/abs/2305.07089)
\n", + " \"\"\"\n", + " def __init__(self, scaler_type='robust', dim=-1, eps=1e-6, num_features=None):\n", " super().__init__()\n", " compute_statistics = {None: identity_statistics,\n", " 'identity': identity_statistics,\n", " 'standard': std_statistics,\n", + " 'revin': std_statistics,\n", " 'robust': robust_statistics,\n", " 'minmax': minmax_statistics,\n", " 'minmax1': minmax1_statistics,\n", @@ -565,6 +641,7 @@ " scalers = {None: identity_scaler,\n", " 'identity': identity_scaler,\n", " 'standard': std_scaler,\n", + " 'revin': std_scaler,\n", " 'robust': robust_scaler,\n", " 'minmax': minmax_scaler,\n", " 'minmax1': minmax1_scaler,\n", @@ -572,11 +649,14 @@ " inverse_scalers = {None: inv_identity_scaler,\n", " 'identity': inv_identity_scaler,\n", " 'standard': inv_std_scaler,\n", + " 'revin': inv_std_scaler,\n", " 'robust': inv_robust_scaler,\n", " 'minmax': inv_minmax_scaler,\n", " 'minmax1': inv_minmax1_scaler,\n", " 'invariant': inv_invariant_scaler,}\n", " assert (scaler_type in scalers.keys()), f'{scaler_type} not defined'\n", + " if (scaler_type=='revin') and (num_features is None):\n", + " raise Exception('You must pass num_features for ReVIN scaler.')\n", "\n", " self.compute_statistics = compute_statistics[scaler_type]\n", " self.scaler = scalers[scaler_type]\n", @@ -585,6 +665,18 @@ " self.dim = dim\n", " self.eps = eps\n", "\n", + " if (scaler_type=='revin'):\n", + " self._init_params(num_features=num_features)\n", + "\n", + " def _init_params(self, num_features):\n", + " # Initialize RevIN scaler params to broadcast:\n", + " if self.dim==1: # [B,T,C] [1,1,C]\n", + " self.revin_bias = nn.Parameter(torch.zeros(1,1,num_features))\n", + " self.revin_weight = nn.Parameter(torch.ones(1,1,num_features))\n", + " elif self.dim==-1: # [B,C,T] [1,C,1]\n", + " self.revin_bias = nn.Parameter(torch.zeros(1,num_features,1))\n", + " self.revin_weight = nn.Parameter(torch.ones(1,num_features,1))\n", + "\n", " #@torch.no_grad()\n", " def transform(self, x, mask):\n", " \"\"\" Center and scale the data.\n", @@ -594,13 +686,23 @@ " `mask`: torch Tensor bool, shape [batch, time] where `x` is valid and False\n", " where `x` should be masked. Mask should not be all False in any column of\n", " dimension dim to avoid NaNs from zero division.
\n", - " \n", + "\n", " **Returns:**
\n", - " `z`: torch.Tensor same shape as `x`, except scaled. \n", + " `z`: torch.Tensor same shape as `x`, except scaled.\n", " \"\"\"\n", " x_shift, x_scale = self.compute_statistics(x=x, mask=mask, dim=self.dim, eps=self.eps)\n", " self.x_shift = x_shift\n", " self.x_scale = x_scale\n", + "\n", + " # Original Revin performs this operation\n", + " # z = self.revin_weight * z\n", + " # z = z + self.revin_bias\n", + " # However this is only valid for point forecast not for\n", + " # distribution's scale decouple technique.\n", + " if self.scaler_type=='revin':\n", + " self.x_shift = self.x_shift + self.revin_bias\n", + " self.x_scale = self.x_scale * torch.relu(self.revin_weight+self.eps)\n", + "\n", " z = self.scaler(x, x_shift, x_scale)\n", " return z\n", "\n", @@ -614,13 +716,24 @@ " **Returns:**
\n", " `x`: torch.Tensor original data.\n", " \"\"\"\n", + "\n", " if x_shift is None:\n", " x_shift = self.x_shift\n", " if x_scale is None:\n", " x_scale = self.x_scale\n", "\n", + " # Original Revin performs this operation\n", + " # z = z - self.revin_bias\n", + " # z = (z / (self.revin_weight + self.eps))\n", + " # However this is only valid for point forecast not for\n", + " # distribution's scale decouple technique.\n", + "\n", " x = self.inverse_scaler(z, x_shift, x_scale)\n", - " return x" + " return x\n", + "\n", + " def forward(self, x):\n", + " # The gradients are optained from BaseWindows/BaseRecurrent forwards.\n", + " pass" ] }, { @@ -739,22 +852,13 @@ "source": [ "#| hide\n", "# Validate scalers\n", - "for scaler_type in [None, 'identity', 'standard', 'robust', 'minmax', 'minmax1', 'invariant']:\n", + "for scaler_type in [None, 'identity', 'standard', 'robust', 'minmax', 'minmax1', 'invariant', 'revin']:\n", " x = 1.0*torch.tensor(np_x)\n", " mask = torch.tensor(np_mask)\n", - " scaler = TemporalNorm(scaler_type=scaler_type, dim=1)\n", + " scaler = TemporalNorm(scaler_type=scaler_type, dim=1, num_features=np_x.shape[-1])\n", " x_scaled = scaler.transform(x=x, mask=mask)\n", " x_recovered = scaler.inverse_transform(x_scaled)\n", - " assert torch.allclose(x, x_recovered, atol=1e-5), f'Recovered data is not the same as original with {scaler_type}'" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "fb1207bd", - "metadata": {}, - "source": [ - "# Test Predict (masked)" + " assert torch.allclose(x, x_recovered, atol=1e-3), f'Recovered data is not the same as original with {scaler_type}'" ] }, { @@ -765,6 +869,8 @@ "outputs": [], "source": [ "#| hide\n", + "\n", + "# Unit test for masked predict filtering\n", "import pandas as pd\n", "\n", "from neuralforecast import NeuralForecast\n", @@ -783,6 +889,86 @@ "Y_hat = nf.predict(df=Y_df)\n", "assert pd.isnull(Y_hat).sum().sum() == 0, 'Predictions should not have NaNs'" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa6e6a40", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "\n", + "# Unit test for ReVIN, and its compatibility with distribution's scale decouple\n", + "from neuralforecast import NeuralForecast\n", + "from neuralforecast.models import NHITS, RNN\n", + "from neuralforecast.losses.pytorch import DistributionLoss, HuberLoss, GMM, MAE\n", + "from neuralforecast.tsdataset import TimeSeriesDataset\n", + "from neuralforecast.utils import AirPassengers, AirPassengersPanel, AirPassengersStatic\n", + "\n", + "Y_df = AirPassengersPanel\n", + "# del Y_df['trend']\n", + "\n", + "# Instantiate BaseWindow model and test revin dynamic dimensionality with hist_exog_list\n", + "model = NHITS(h=12,\n", + " input_size=24,\n", + " loss=GMM(n_components=10, level=[90]),\n", + " hist_exog_list=['y_[lag12]'],\n", + " max_steps=1,\n", + " early_stop_patience_steps=10,\n", + " val_check_steps=50,\n", + " scaler_type='revin',\n", + " learning_rate=1e-3)\n", + "nf = NeuralForecast(models=[model], freq='MS')\n", + "Y_hat_df = nf.cross_validation(df=Y_df, val_size=12, n_windows=1)\n", + "\n", + "# Instantiate BaseWindow model and test revin dynamic dimensionality with hist_exog_list\n", + "model = NHITS(h=12,\n", + " input_size=24,\n", + " loss=HuberLoss(),\n", + " hist_exog_list=['trend', 'y_[lag12]'],\n", + " max_steps=1,\n", + " early_stop_patience_steps=10,\n", + " val_check_steps=50,\n", + " scaler_type='revin',\n", + " learning_rate=1e-3)\n", + "nf = NeuralForecast(models=[model], 
freq='MS')\n", + "Y_hat_df = nf.cross_validation(df=Y_df, val_size=12, n_windows=1)\n", + "\n", + "# Instantiate BaseRecurrent model and test revin dynamic dimensionality with hist_exog_list\n", + "model = RNN(h=12,\n", + " input_size=24,\n", + " loss=GMM(n_components=10, level=[90]),\n", + " hist_exog_list=['trend', 'y_[lag12]'],\n", + " max_steps=1,\n", + " early_stop_patience_steps=10,\n", + " val_check_steps=50,\n", + " scaler_type='revin',\n", + " learning_rate=1e-3)\n", + "nf = NeuralForecast(models=[model], freq='MS')\n", + "Y_hat_df = nf.cross_validation(df=Y_df, val_size=12, n_windows=1)\n", + "\n", + "# Instantiate BaseRecurrent model and test revin dynamic dimensionality with hist_exog_list\n", + "model = RNN(h=12,\n", + " input_size=24,\n", + " loss=HuberLoss(),\n", + " hist_exog_list=['trend'],\n", + " max_steps=1,\n", + " early_stop_patience_steps=10,\n", + " val_check_steps=50,\n", + " scaler_type='revin',\n", + " learning_rate=1e-3)\n", + "nf = NeuralForecast(models=[model], freq='MS')\n", + "Y_hat_df = nf.cross_validation(df=Y_df, val_size=12, n_windows=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2f50bd8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/neuralforecast/common/_base_recurrent.py b/neuralforecast/common/_base_recurrent.py index a206ea51b..e6ea16b63 100644 --- a/neuralforecast/common/_base_recurrent.py +++ b/neuralforecast/common/_base_recurrent.py @@ -102,16 +102,18 @@ def __init__( self.early_stop_patience_steps = early_stop_patience_steps self.val_check_steps = val_check_steps - # Scaler - self.scaler = TemporalNorm( - scaler_type=scaler_type, dim=-1 - ) # Time dimension is -1. - # Variables self.futr_exog_list = futr_exog_list if futr_exog_list is not None else [] self.hist_exog_list = hist_exog_list if hist_exog_list is not None else [] self.stat_exog_list = stat_exog_list if stat_exog_list is not None else [] + # Scaler + self.scaler = TemporalNorm( + scaler_type=scaler_type, + dim=-1, # Time dimension is -1. + num_features=1 + len(self.hist_exog_list) + len(self.futr_exog_list), + ) + # Fit arguments self.val_size = 0 self.test_size = 0 @@ -176,12 +178,18 @@ def configure_optimizers(self): } return {"optimizer": optimizer, "lr_scheduler": scheduler} + def _get_temporal_data_cols(self, temporal_cols): + temporal_data_cols = ["y"] + list( + set(temporal_cols.tolist()) & set(self.hist_exog_list + self.futr_exog_list) + ) + return temporal_data_cols + def _normalization(self, batch, val_size=0, test_size=0): temporal = batch["temporal"] # B, C, T temporal_cols = batch["temporal_cols"].copy() # Separate data and mask - temporal_data_cols = temporal_cols.drop("available_mask").tolist() + temporal_data_cols = self._get_temporal_data_cols(temporal_cols=temporal_cols) temporal_data = temporal[:, temporal_cols.get_indexer(temporal_data_cols), :] temporal_mask = temporal[:, temporal_cols.get_loc("available_mask"), :].clone() diff --git a/neuralforecast/common/_base_windows.py b/neuralforecast/common/_base_windows.py index f4516b36a..3ccd58dbc 100644 --- a/neuralforecast/common/_base_windows.py +++ b/neuralforecast/common/_base_windows.py @@ -108,17 +108,19 @@ def __init__( self.windows_batch_size = windows_batch_size self.step_size = step_size - # Scaler - self.scaler = TemporalNorm( - scaler_type=scaler_type, dim=1 - ) # Time dimension is 1. 
- # Variables self.futr_exog_list = futr_exog_list if futr_exog_list is not None else [] self.hist_exog_list = hist_exog_list if hist_exog_list is not None else [] self.stat_exog_list = stat_exog_list if stat_exog_list is not None else [] self.exclude_insample_y = exclude_insample_y + # Scaler + self.scaler = TemporalNorm( + scaler_type=scaler_type, + dim=1, # Time dimension is 1. + num_features=1 + len(self.hist_exog_list) + len(self.futr_exog_list), + ) + # Fit arguments self.val_size = 0 self.test_size = 0 @@ -329,6 +331,12 @@ def _create_windows(self, batch, step, w_idxs=None): else: raise ValueError(f"Unknown step {step}") + def _get_temporal_data_cols(self, temporal_cols): + temporal_data_cols = ["y"] + list( + set(temporal_cols.tolist()) & set(self.hist_exog_list + self.futr_exog_list) + ) + return temporal_data_cols + def _normalization(self, windows): # windows are already filtered by train/validation/test # from the `create_windows_method` nor leakage risk @@ -336,7 +344,8 @@ def _normalization(self, windows): temporal_cols = windows["temporal_cols"].copy() # B, L+H, C # To avoid leakage uses only the lags - temporal_data_cols = temporal_cols.drop("available_mask").tolist() + # temporal_data_cols = temporal_cols.drop('available_mask').tolist() + temporal_data_cols = self._get_temporal_data_cols(temporal_cols=temporal_cols) temporal_data = temporal[:, :, temporal_cols.get_indexer(temporal_data_cols)] temporal_mask = temporal[:, :, temporal_cols.get_loc("available_mask")].clone() if self.h > 0: diff --git a/neuralforecast/common/_scalers.py b/neuralforecast/common/_scalers.py index 551c05a8a..a388dd9a8 100644 --- a/neuralforecast/common/_scalers.py +++ b/neuralforecast/common/_scalers.py @@ -1,16 +1,14 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/common.scalers.ipynb. 
# %% auto 0 -__all__ = ['masked_median', 'masked_mean', 'minmax_statistics', 'minmax_scaler', 'inv_minmax_scaler', 'minmax1_statistics', - 'minmax1_scaler', 'inv_minmax1_scaler', 'std_statistics', 'std_scaler', 'inv_std_scaler', - 'robust_statistics', 'robust_scaler', 'inv_robust_scaler', 'invariant_statistics', 'invariant_scaler', - 'inv_invariant_scaler', 'identity_statistics', 'identity_scaler', 'inv_identity_scaler', 'TemporalNorm'] +__all__ = ['masked_median', 'masked_mean', 'minmax_statistics', 'minmax1_statistics', 'std_statistics', 'robust_statistics', + 'invariant_statistics', 'identity_statistics', 'TemporalNorm'] -# %% ../../nbs/common.scalers.ipynb 4 +# %% ../../nbs/common.scalers.ipynb 5 import torch import torch.nn as nn -# %% ../../nbs/common.scalers.ipynb 7 +# %% ../../nbs/common.scalers.ipynb 8 def masked_median(x, mask, dim=-1, keepdim=True): """Masked Median @@ -56,7 +54,7 @@ def masked_mean(x, mask, dim=-1, keepdim=True): x_mean = torch.nan_to_num(x_mean, nan=0.0) return x_mean -# %% ../../nbs/common.scalers.ipynb 11 +# %% ../../nbs/common.scalers.ipynb 12 def minmax_statistics(x, mask, eps=1e-6, dim=-1): """MinMax Scaler @@ -96,7 +94,7 @@ def minmax_statistics(x, mask, eps=1e-6, dim=-1): x_range = x_range + eps return x_min, x_range - +# %% ../../nbs/common.scalers.ipynb 13 def minmax_scaler(x, x_min, x_range): return (x - x_min) / x_range @@ -104,7 +102,7 @@ def minmax_scaler(x, x_min, x_range): def inv_minmax_scaler(z, x_min, x_range): return z * x_range + x_min -# %% ../../nbs/common.scalers.ipynb 13 +# %% ../../nbs/common.scalers.ipynb 15 def minmax1_statistics(x, mask, eps=1e-6, dim=-1): """MinMax1 Scaler @@ -145,7 +143,7 @@ def minmax1_statistics(x, mask, eps=1e-6, dim=-1): x_range = x_range + eps return x_min, x_range - +# %% ../../nbs/common.scalers.ipynb 16 def minmax1_scaler(x, x_min, x_range): x = (x - x_min) / x_range z = x * (2) - 1 @@ -156,7 +154,7 @@ def inv_minmax1_scaler(z, x_min, x_range): z = (z + 1) / 2 return z * x_range + x_min -# %% ../../nbs/common.scalers.ipynb 15 +# %% ../../nbs/common.scalers.ipynb 18 def std_statistics(x, mask, dim=-1, eps=1e-6): """Standard Scaler @@ -186,7 +184,7 @@ def std_statistics(x, mask, dim=-1, eps=1e-6): x_stds = x_stds + eps return x_means, x_stds - +# %% ../../nbs/common.scalers.ipynb 19 def std_scaler(x, x_means, x_stds): return (x - x_means) / x_stds @@ -194,7 +192,7 @@ def std_scaler(x, x_means, x_stds): def inv_std_scaler(z, x_mean, x_std): return (z * x_std) + x_mean -# %% ../../nbs/common.scalers.ipynb 17 +# %% ../../nbs/common.scalers.ipynb 21 def robust_statistics(x, mask, dim=-1, eps=1e-6): """Robust Median Scaler @@ -236,7 +234,7 @@ def robust_statistics(x, mask, dim=-1, eps=1e-6): x_mad = x_mad + eps return x_median, x_mad - +# %% ../../nbs/common.scalers.ipynb 22 def robust_scaler(x, x_median, x_mad): return (x - x_median) / x_mad @@ -244,7 +242,7 @@ def robust_scaler(x, x_median, x_mad): def inv_robust_scaler(z, x_median, x_mad): return z * x_mad + x_median -# %% ../../nbs/common.scalers.ipynb 19 +# %% ../../nbs/common.scalers.ipynb 24 def invariant_statistics(x, mask, dim=-1, eps=1e-6): """Invariant Median Scaler @@ -284,7 +282,7 @@ def invariant_statistics(x, mask, dim=-1, eps=1e-6): x_mad = x_mad + eps return x_median, x_mad - +# %% ../../nbs/common.scalers.ipynb 25 def invariant_scaler(x, x_median, x_mad): return torch.arcsinh((x - x_median) / x_mad) @@ -292,7 +290,7 @@ def invariant_scaler(x, x_median, x_mad): def inv_invariant_scaler(z, x_median, x_mad): return torch.sinh(z) * x_mad + x_median 
-# %% ../../nbs/common.scalers.ipynb 21 +# %% ../../nbs/common.scalers.ipynb 27 def identity_statistics(x, mask, dim=-1, eps=1e-6): """Identity Scaler @@ -318,7 +316,7 @@ def identity_statistics(x, mask, dim=-1, eps=1e-6): return x_shift, x_scale - +# %% ../../nbs/common.scalers.ipynb 28 def identity_scaler(x, x_shift, x_scale): return x @@ -326,7 +324,7 @@ def identity_scaler(x, x_shift, x_scale): def inv_identity_scaler(z, x_shift, x_scale): return z -# %% ../../nbs/common.scalers.ipynb 24 +# %% ../../nbs/common.scalers.ipynb 31 class TemporalNorm(nn.Module): """Temporal Normalization @@ -337,20 +335,29 @@ class TemporalNorm(nn.Module): $$\mathbf{z}_{[B,T,C]} = \\textrm{Scaler}(\mathbf{x}_{[B,T,C]})$$ + If `scaler_type` is `revin` learnable normalization parameters are added on top of + the usual normalization technique, the parameters are learned through scale decouple + global skip connections. The technique is available for point and probabilistic outputs. + + $$\mathbf{\hat{z}}_{[B,T,C]} = \\boldsymbol{\hat{\\gamma}}_{[1,1,C]} \mathbf{z}_{[B,T,C]} +\\boldsymbol{\hat{\\beta}}_{[1,1,C]}$$ + **Parameters:**
- `scaler_type`: str, defines the type of scaler used by TemporalNorm. - available [`identity`, `standard`, `robust`, `minmax`, `minmax1`, `invariant`].
+ `scaler_type`: str, defines the type of scaler used by TemporalNorm. Available [`identity`, `standard`, `robust`, `minmax`, `minmax1`, `invariant`, `revin`].
`dim` (int, optional): Dimension over to compute scale and shift. Defaults to -1.
`eps` (float, optional): Small value to avoid division by zero. Defaults to 1e-6.
+ `num_features`: int=None, number of temporal channels; required to initialize the RevIN-like learnable affine parameters.<br>
+ **References**
+ - [Kin G. Olivares, David Luo, Cristian Challu, Stefania La Vattiata, Max Mergenthaler, Artur Dubrawski (2023). "HINT: Hierarchical Mixture Networks For Coherent Probabilistic Forecasting". Neural Information Processing Systems, submitted. Working Paper version available at arxiv.](https://arxiv.org/abs/2305.07089)
""" - def __init__(self, scaler_type="robust", dim=-1, eps=1e-6): + def __init__(self, scaler_type="robust", dim=-1, eps=1e-6, num_features=None): super().__init__() compute_statistics = { None: identity_statistics, "identity": identity_statistics, "standard": std_statistics, + "revin": std_statistics, "robust": robust_statistics, "minmax": minmax_statistics, "minmax1": minmax1_statistics, @@ -360,6 +367,7 @@ def __init__(self, scaler_type="robust", dim=-1, eps=1e-6): None: identity_scaler, "identity": identity_scaler, "standard": std_scaler, + "revin": std_scaler, "robust": robust_scaler, "minmax": minmax_scaler, "minmax1": minmax1_scaler, @@ -369,12 +377,15 @@ def __init__(self, scaler_type="robust", dim=-1, eps=1e-6): None: inv_identity_scaler, "identity": inv_identity_scaler, "standard": inv_std_scaler, + "revin": inv_std_scaler, "robust": inv_robust_scaler, "minmax": inv_minmax_scaler, "minmax1": inv_minmax1_scaler, "invariant": inv_invariant_scaler, } assert scaler_type in scalers.keys(), f"{scaler_type} not defined" + if (scaler_type == "revin") and (num_features is None): + raise Exception("You must pass num_features for ReVIN scaler.") self.compute_statistics = compute_statistics[scaler_type] self.scaler = scalers[scaler_type] @@ -383,6 +394,18 @@ def __init__(self, scaler_type="robust", dim=-1, eps=1e-6): self.dim = dim self.eps = eps + if scaler_type == "revin": + self._init_params(num_features=num_features) + + def _init_params(self, num_features): + # Initialize RevIN scaler params to broadcast: + if self.dim == 1: # [B,T,C] [1,1,C] + self.revin_bias = nn.Parameter(torch.zeros(1, 1, num_features)) + self.revin_weight = nn.Parameter(torch.ones(1, 1, num_features)) + elif self.dim == -1: # [B,C,T] [1,C,1] + self.revin_bias = nn.Parameter(torch.zeros(1, num_features, 1)) + self.revin_weight = nn.Parameter(torch.ones(1, num_features, 1)) + # @torch.no_grad() def transform(self, x, mask): """Center and scale the data. @@ -401,6 +424,16 @@ def transform(self, x, mask): ) self.x_shift = x_shift self.x_scale = x_scale + + # Original Revin performs this operation + # z = self.revin_weight * z + # z = z + self.revin_bias + # However this is only valid for point forecast not for + # distribution's scale decouple technique. + if self.scaler_type == "revin": + self.x_shift = self.x_shift + self.revin_bias + self.x_scale = self.x_scale * torch.relu(self.revin_weight + self.eps) + z = self.scaler(x, x_shift, x_scale) return z @@ -414,10 +447,21 @@ def inverse_transform(self, z, x_shift=None, x_scale=None): **Returns:**
 `x`: torch.Tensor original data.
 """
+
 if x_shift is None:
 x_shift = self.x_shift
 if x_scale is None:
 x_scale = self.x_scale

+ # Original Revin performs this operation
+ # z = z - self.revin_bias
+ # z = (z / (self.revin_weight + self.eps))
+ # However this is only valid for point forecast not for
+ # distribution's scale decouple technique.
+
 x = self.inverse_scaler(z, x_shift, x_scale)
 return x
+
+ def forward(self, x):
+ # The gradients are obtained from BaseWindows/BaseRecurrent forwards.
+ pass
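
For reference, a minimal usage sketch of the new `revin` option. This is only a sketch: it assumes the exported `neuralforecast.common._scalers` module path from the diff above and the `[B, T, C]` layout that `BaseWindows` uses with `dim=1`.

```python
import torch
from neuralforecast.common._scalers import TemporalNorm

# Toy batch: 2 series, 24 time steps, 3 temporal channels -> [B, T, C], so dim=1.
x = torch.randn(2, 24, 3)
mask = torch.ones_like(x)  # every observation is available

# num_features is required for 'revin' (the constructor raises otherwise)
# and should match the channel dimension C.
scaler = TemporalNorm(scaler_type='revin', dim=1, num_features=x.shape[-1])

# transform standardizes over the time dimension and folds the learnable
# affine parameters into x_shift / x_scale; inverse_transform reuses the
# stored statistics, so the round trip recovers the input at initialization.
z = scaler.transform(x=x, mask=mask)
x_rec = scaler.inverse_transform(z)
assert torch.allclose(x, x_rec, atol=1e-3)
```

Inside the models, these parameters receive their gradients through the `BaseWindows`/`BaseRecurrent` forward passes, which is why `TemporalNorm.forward` itself is a no-op.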