From 2a5c3d960599bc8218b3465fa421721c762e146c Mon Sep 17 00:00:00 2001 From: twobitunicorn Date: Mon, 4 Dec 2023 11:48:11 -0800 Subject: [PATCH 1/2] Fixed: Exception: max_epochs is deprecated, use max_steps instead. --- nbs/index.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nbs/index.ipynb b/nbs/index.ipynb index dcfb7e139..9a620a91f 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -123,8 +123,8 @@ "\n", "# Fit and predict with NBEATS and NHITS models\n", "horizon = len(Y_test_df)\n", - "models = [NBEATS(input_size=2 * horizon, h=horizon, max_epochs=50),\n", - " NHITS(input_size=2 * horizon, h=horizon, max_epochs=50)]\n", + "models = [NBEATS(input_size=2 * horizon, h=horizon, max_steps=50),\n", + " NHITS(input_size=2 * horizon, h=horizon, max_steps=50)]\n", "nf = NeuralForecast(models=models, freq='M')\n", "nf.fit(df=Y_train_df)\n", "Y_hat_df = nf.predict().reset_index()\n", From c9b84e5bb0da216ae37a39ee05b4f4bf5d6d7bd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Mon, 4 Dec 2023 16:34:45 -0600 Subject: [PATCH 2/2] add frequency validation and futr_df debugging methods (#833) --- environment.yml | 2 +- nbs/core.ipynb | 92 +++++++++++++++++++++++++++++-------- neuralforecast/_modidx.py | 4 ++ neuralforecast/core.py | 96 ++++++++++++++++++++++++++++++--------- settings.ini | 2 +- 5 files changed, 154 insertions(+), 42 deletions(-) diff --git a/environment.yml b/environment.yml index a43cbf0ee..5063661a9 100644 --- a/environment.yml +++ b/environment.yml @@ -19,4 +19,4 @@ dependencies: - black - polars - "ray[tune]>=2.2.0" - - utilsforecast>=0.0.17 + - utilsforecast>=0.0.19 diff --git a/nbs/core.ipynb b/nbs/core.ipynb index 54c5de3d9..e17638ab0 100644 --- a/nbs/core.ipynb +++ b/nbs/core.ipynb @@ -74,6 +74,7 @@ " LocalRobustScaler, \n", " LocalStandardScaler,\n", ")\n", + "from utilsforecast.validation import validate_freq\n", "\n", "import neuralforecast.config as nf_config\n", "from neuralforecast.tsdataset import TimeSeriesDataset\n", @@ -259,7 +260,7 @@ " \n", " def __init__(self, \n", " models: List[Any],\n", - " freq: str,\n", + " freq: Union[str, int],\n", " local_scaler_type: Optional[str] = None):\n", " \"\"\"\n", " The `core.StatsForecast` class allows you to efficiently fit multiple `NeuralForecast` models \n", @@ -272,9 +273,8 @@ " models : List[typing.Any]\n", " Instantiated `neuralforecast.models` \n", " see [collection here](https://nixtla.github.io/neuralforecast/models.html).\n", - " freq : str\n", - " Frequency of the data, \n", - " see [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).\n", + " freq : str or int\n", + " Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.\n", " local_scaler_type : str, optional (default=None)\n", " Scaler to apply per-serie to all features before fitting, which is inverted after predicting.\n", " Can be 'standard', 'robust', 'robust-iqr', 'minmax' or 'boxcox'\n", @@ -381,6 +381,7 @@ "\n", " # Process and save new dataset (in self)\n", " if df is not None:\n", + " validate_freq(df['ds'], self.freq)\n", " self.dataset, self.uids, self.last_dates, self.ds \\\n", " = self._prepare_fit(df=df, static_df=static_df, sort_df=sort_df, predict_only=False)\n", " self.sort_df = sort_df\n", @@ -402,6 +403,51 @@ "\n", " self._fitted = True\n", "\n", + " def make_future_dataframe(self, df: Optional[DataFrame] = None) -> DataFrame:\n", + " \"\"\"Create a dataframe with all ids and future times in the forecasting horizon.\n", + "\n", + " Parameters\n", + " ----------\n", + " df : pandas or polars DataFrame, optional (default=None)\n", + " DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables.\n", + " Only required if this is different than the one used in the fit step.\n", + " \"\"\"\n", + " if df is not None:\n", + " df = ufp.sort(df, by=['unique_id', 'ds'])\n", + " last_times_by_id = ufp.group_by_agg(\n", + " df, by='unique_id', aggs={'ds': 'max'}, maintain_order=True\n", + " )\n", + " uids = last_times_by_id['unique_id']\n", + " last_times = last_times_by_id['ds']\n", + " else:\n", + " uids = self.uids\n", + " last_times = self.last_dates\n", + " return ufp.make_future_dataframe(\n", + " uids=uids,\n", + " last_times=last_times,\n", + " freq=self.freq,\n", + " h=self.h,\n", + " id_col='unique_id',\n", + " time_col='ds',\n", + " )\n", + "\n", + " def get_missing_future(\n", + " self, futr_df: DataFrame, df: Optional[DataFrame] = None\n", + " ) -> DataFrame:\n", + " \"\"\"Get the missing ids and times combinations in `futr_df`.\n", + " \n", + " Parameters\n", + " ----------\n", + " futr_df : pandas or polars DataFrame\n", + " DataFrame with [`unique_id`, `ds`] columns and `df`'s future exogenous.\n", + " df : pandas or polars DataFrame, optional (default=None)\n", + " DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables.\n", + " Only required if this is different than the one used in the fit step.\n", + " \"\"\"\n", + " expected = self.make_future_dataframe(df)\n", + " ids = ['unique_id', 'ds']\n", + " return ufp.anti_join(expected, futr_df[ids], on=ids)\n", + "\n", " def predict(self,\n", " df: Optional[DataFrame] = None,\n", " static_df: Optional[DataFrame] = None,\n", @@ -455,6 +501,7 @@ "\n", " # Process new dataset but does not store it.\n", " if df is not None:\n", + " validate_freq(df['ds'], self.freq)\n", " dataset, uids, last_dates, _ = self._prepare_fit(\n", " df=df, static_df=static_df, sort_df=sort_df, predict_only=True\n", " )\n", @@ -474,16 +521,13 @@ " cols += [model_name + n for n in model.loss.output_names]\n", "\n", " # Placeholder dataframe for predictions with unique_id and ds\n", - " if isinstance(self.uids, pl_Series):\n", - " df_constructor = pl_DataFrame\n", - " else:\n", - " df_constructor = pd.DataFrame\n", - " starts = ufp.offset_times(last_dates, self.freq, 1)\n", - " fcsts_df = df_constructor(\n", - " {\n", - " 'unique_id': ufp.repeat(self.uids, self.h),\n", - " 'ds': ufp.time_ranges(starts, freq=self.freq, periods=self.h),\n", - " }\n", + " fcsts_df = ufp.make_future_dataframe(\n", + " uids=uids,\n", + " last_times=last_dates,\n", + " freq=self.freq,\n", + " h=self.h,\n", + " id_col='unique_id',\n", + " time_col='ds',\n", " )\n", "\n", " # Update and define new forecasting dataset\n", @@ -492,13 +536,22 @@ " else:\n", " futr_orig_rows = futr_df.shape[0]\n", " futr_df = ufp.join(futr_df, fcsts_df, on=['unique_id', 'ds'])\n", - " base_err_msg = f'`futr_df` must have one row per id and ds in the forecasting horizon ({self.h}).'\n", " if futr_df.shape[0] < fcsts_df.shape[0]:\n", - " raise ValueError(base_err_msg)\n", + " if df is None:\n", + " expected_cmd = 'make_future_dataframe()'\n", + " missing_cmd = 'get_missing_future(futr_df)'\n", + " else:\n", + " expected_cmd = 'make_future_dataframe(df)'\n", + " missing_cmd = 'get_missing_future(futr_df, df)'\n", + " raise ValueError(\n", + " 'There are missing combinations of ids and times in `futr_df`.\\n'\n", + " f'You can run the `{expected_cmd}` method to get the expected combinations or '\n", + " f'the `{missing_cmd}` method to get the missing combinations.'\n", + " )\n", " if futr_orig_rows > futr_df.shape[0]:\n", " dropped_rows = futr_orig_rows - futr_df.shape[0]\n", " warnings.warn(\n", - " f'Dropped {dropped_rows:,} unused rows from `futr_df`. ' + base_err_msg\n", + " f'Dropped {dropped_rows:,} unused rows from `futr_df`.'\n", " )\n", " if any(ufp.is_none(futr_df[col]).any() for col in needed_futr_exog):\n", " raise ValueError('Found null values in `futr_df`')\n", @@ -522,7 +575,7 @@ " fcsts = self._scalers_target_inverse_transform(fcsts, indptr)\n", "\n", " # Declare predictions pd.DataFrame\n", - " if df_constructor is pl_DataFrame:\n", + " if isinstance(self.uids, pl_Series):\n", " fcsts = pl_DataFrame(dict(zip(cols, fcsts.T)))\n", " else:\n", " fcsts = pd.DataFrame(fcsts, columns=cols)\n", @@ -583,6 +636,7 @@ "\n", " # Process and save new dataset (in self)\n", " if df is not None:\n", + " validate_freq(df['ds'], self.freq)\n", " self.dataset, self.uids, self.last_dates, self.ds = self._prepare_fit(\n", " df=df, static_df=static_df, sort_df=sort_df, predict_only=False\n", " )\n", @@ -1250,7 +1304,7 @@ "nf = NeuralForecast(models=models, freq='M')\n", "nf.fit(AirPassengersPanel_train)\n", "# not enough rows in futr_df raises an error\n", - "test_fail(lambda: nf.predict(futr_df=AirPassengersPanel_test.head()), contains='must have one row per id and ds')\n", + "test_fail(lambda: nf.predict(futr_df=AirPassengersPanel_test.head()), contains='There are missing combinations')\n", "# extra rows issues a warning\n", "with warnings.catch_warnings(record=True) as issued_warnings:\n", " warnings.simplefilter('always', UserWarning)\n", diff --git a/neuralforecast/_modidx.py b/neuralforecast/_modidx.py index 4b1694d1c..7c3ef885a 100644 --- a/neuralforecast/_modidx.py +++ b/neuralforecast/_modidx.py @@ -72,7 +72,11 @@ 'neuralforecast.core.NeuralForecast.cross_validation': ( 'core.html#neuralforecast.cross_validation', 'neuralforecast/core.py'), 'neuralforecast.core.NeuralForecast.fit': ('core.html#neuralforecast.fit', 'neuralforecast/core.py'), + 'neuralforecast.core.NeuralForecast.get_missing_future': ( 'core.html#neuralforecast.get_missing_future', + 'neuralforecast/core.py'), 'neuralforecast.core.NeuralForecast.load': ('core.html#neuralforecast.load', 'neuralforecast/core.py'), + 'neuralforecast.core.NeuralForecast.make_future_dataframe': ( 'core.html#neuralforecast.make_future_dataframe', + 'neuralforecast/core.py'), 'neuralforecast.core.NeuralForecast.predict': ( 'core.html#neuralforecast.predict', 'neuralforecast/core.py'), 'neuralforecast.core.NeuralForecast.predict_insample': ( 'core.html#neuralforecast.predict_insample', diff --git a/neuralforecast/core.py b/neuralforecast/core.py index 87d39796e..f9a7748c2 100644 --- a/neuralforecast/core.py +++ b/neuralforecast/core.py @@ -24,6 +24,7 @@ LocalRobustScaler, LocalStandardScaler, ) +from utilsforecast.validation import validate_freq import neuralforecast.config as nf_config from .tsdataset import TimeSeriesDataset @@ -158,7 +159,10 @@ def _warn_id_as_idx(): # %% ../nbs/core.ipynb 10 class NeuralForecast: def __init__( - self, models: List[Any], freq: str, local_scaler_type: Optional[str] = None + self, + models: List[Any], + freq: Union[str, int], + local_scaler_type: Optional[str] = None, ): """ The `core.StatsForecast` class allows you to efficiently fit multiple `NeuralForecast` models @@ -171,9 +175,8 @@ def __init__( models : List[typing.Any] Instantiated `neuralforecast.models` see [collection here](https://nixtla.github.io/neuralforecast/models.html). - freq : str - Frequency of the data, - see [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases). + freq : str or int + Frequency of the data. Must be a valid pandas or polars offset alias, or an integer. local_scaler_type : str, optional (default=None) Scaler to apply per-serie to all features before fitting, which is inverted after predicting. Can be 'standard', 'robust', 'robust-iqr', 'minmax' or 'boxcox' @@ -289,6 +292,7 @@ def fit( # Process and save new dataset (in self) if df is not None: + validate_freq(df["ds"], self.freq) self.dataset, self.uids, self.last_dates, self.ds = self._prepare_fit( df=df, static_df=static_df, sort_df=sort_df, predict_only=False ) @@ -314,6 +318,51 @@ def fit( self._fitted = True + def make_future_dataframe(self, df: Optional[DataFrame] = None) -> DataFrame: + """Create a dataframe with all ids and future times in the forecasting horizon. + + Parameters + ---------- + df : pandas or polars DataFrame, optional (default=None) + DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables. + Only required if this is different than the one used in the fit step. + """ + if df is not None: + df = ufp.sort(df, by=["unique_id", "ds"]) + last_times_by_id = ufp.group_by_agg( + df, by="unique_id", aggs={"ds": "max"}, maintain_order=True + ) + uids = last_times_by_id["unique_id"] + last_times = last_times_by_id["ds"] + else: + uids = self.uids + last_times = self.last_dates + return ufp.make_future_dataframe( + uids=uids, + last_times=last_times, + freq=self.freq, + h=self.h, + id_col="unique_id", + time_col="ds", + ) + + def get_missing_future( + self, futr_df: DataFrame, df: Optional[DataFrame] = None + ) -> DataFrame: + """Get the missing ids and times combinations in `futr_df`. + + Parameters + ---------- + futr_df : pandas or polars DataFrame + DataFrame with [`unique_id`, `ds`] columns and `df`'s future exogenous. + df : pandas or polars DataFrame, optional (default=None) + DataFrame with columns [`unique_id`, `ds`, `y`] and exogenous variables. + Only required if this is different than the one used in the fit step. + """ + expected = self.make_future_dataframe(df) + ids = ["unique_id", "ds"] + return ufp.anti_join(expected, futr_df[ids], on=ids) + def predict( self, df: Optional[DataFrame] = None, @@ -373,6 +422,7 @@ def predict( # Process new dataset but does not store it. if df is not None: + validate_freq(df["ds"], self.freq) dataset, uids, last_dates, _ = self._prepare_fit( df=df, static_df=static_df, sort_df=sort_df, predict_only=True ) @@ -393,16 +443,13 @@ def predict( cols += [model_name + n for n in model.loss.output_names] # Placeholder dataframe for predictions with unique_id and ds - if isinstance(self.uids, pl_Series): - df_constructor = pl_DataFrame - else: - df_constructor = pd.DataFrame - starts = ufp.offset_times(last_dates, self.freq, 1) - fcsts_df = df_constructor( - { - "unique_id": ufp.repeat(self.uids, self.h), - "ds": ufp.time_ranges(starts, freq=self.freq, periods=self.h), - } + fcsts_df = ufp.make_future_dataframe( + uids=uids, + last_times=last_dates, + freq=self.freq, + h=self.h, + id_col="unique_id", + time_col="ds", ) # Update and define new forecasting dataset @@ -411,15 +458,21 @@ def predict( else: futr_orig_rows = futr_df.shape[0] futr_df = ufp.join(futr_df, fcsts_df, on=["unique_id", "ds"]) - base_err_msg = f"`futr_df` must have one row per id and ds in the forecasting horizon ({self.h})." if futr_df.shape[0] < fcsts_df.shape[0]: - raise ValueError(base_err_msg) + if df is None: + expected_cmd = "make_future_dataframe()" + missing_cmd = "get_missing_future(futr_df)" + else: + expected_cmd = "make_future_dataframe(df)" + missing_cmd = "get_missing_future(futr_df, df)" + raise ValueError( + "There are missing combinations of ids and times in `futr_df`.\n" + f"You can run the `{expected_cmd}` method to get the expected combinations or " + f"the `{missing_cmd}` method to get the missing combinations." + ) if futr_orig_rows > futr_df.shape[0]: dropped_rows = futr_orig_rows - futr_df.shape[0] - warnings.warn( - f"Dropped {dropped_rows:,} unused rows from `futr_df`. " - + base_err_msg - ) + warnings.warn(f"Dropped {dropped_rows:,} unused rows from `futr_df`.") if any(ufp.is_none(futr_df[col]).any() for col in needed_futr_exog): raise ValueError("Found null values in `futr_df`") futr_dataset = dataset.align(futr_df) @@ -442,7 +495,7 @@ def predict( fcsts = self._scalers_target_inverse_transform(fcsts, indptr) # Declare predictions pd.DataFrame - if df_constructor is pl_DataFrame: + if isinstance(self.uids, pl_Series): fcsts = pl_DataFrame(dict(zip(cols, fcsts.T))) else: fcsts = pd.DataFrame(fcsts, columns=cols) @@ -505,6 +558,7 @@ def cross_validation( # Process and save new dataset (in self) if df is not None: + validate_freq(df["ds"], self.freq) self.dataset, self.uids, self.last_dates, self.ds = self._prepare_fit( df=df, static_df=static_df, sort_df=sort_df, predict_only=False ) diff --git a/settings.ini b/settings.ini index a478e9db6..34b4740c0 100644 --- a/settings.ini +++ b/settings.ini @@ -15,7 +15,7 @@ language = English custom_sidebar = True license = apache2 status = 2 -requirements = numpy>=1.21.6 pandas>=1.3.5 torch>=2.0.0 pytorch-lightning>=2.0.0 ray[tune]>=2.2.0 optuna utilsforecast>=0.0.17 numba +requirements = numpy>=1.21.6 pandas>=1.3.5 torch>=2.0.0 pytorch-lightning>=2.0.0 ray[tune]>=2.2.0 optuna utilsforecast>=0.0.19 numba dev_requirements = nbdev black mypy flake8 matplotlib hyperopt polars pyarrow nbs_path = nbs doc_path = _docs