Skip to content

Forecast decomposition for CatBoost models #1148

Merged
merged 9 commits into from
Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `ChangePointsLevelTransform` and base classes `PerIntervalModel`, `BaseChangePointsModelAdapter` for per-interval transforms ([#998](https://github.com/tinkoff-ai/etna/pull/998))
- Method `set_params` to change parameters of ETNA objects ([#1102](https://github.com/tinkoff-ai/etna/pull/1102))
- Function `plot_forecast_decomposition` ([#1129](https://github.com/tinkoff-ai/etna/pull/1129))
- Method `forecast_components` for forecast decomposition in `_CatBoostAdapter` ([#1135](https://github.com/tinkoff-ai/etna/issues/1135))
-
### Changed
- Add optional `features` parameter in the signature of `TSDataset.to_pandas`, `TSDataset.to_flatten` ([#809](https://github.com/tinkoff-ai/etna/pull/809))
Expand All @@ -37,6 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add saving/loading for transforms, models, pipelines, ensembles; tutorial for saving/loading ([#1068](https://github.com/tinkoff-ai/etna/pull/1068))
- Add hierarchical time series support([#1083](https://github.com/tinkoff-ai/etna/pull/1083))
- Add `WAPE` metric & `wape` functional metric ([#1085](https://github.com/tinkoff-ai/etna/pull/1085))
-
### Fixed
- Missed kwargs in TFT init([#1078](https://github.com/tinkoff-ai/etna/pull/1078))

Expand Down
78 changes: 62 additions & 16 deletions etna/models/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,26 @@ def __init__(
def _prepare_float_category_columns(self, df: pd.DataFrame):
df[self._float_category_columns] = df[self._float_category_columns].astype(str).astype("category")

def _prepare_train_pool(self, features: pd.DataFrame, target: np.ndarray) -> Pool:
    """Build the CatBoost pool used for training and record the categorical layout.

    Side effects: stores the names of ``category``-dtype columns in
    ``self._categorical``, stores the subset whose categories are floats in
    ``self._float_category_columns``, and converts that subset of ``features``
    in place via ``_prepare_float_category_columns``.

    Parameters
    ----------
    features:
        features dataframe; modified in place
    target:
        label values passed to the pool

    Returns
    -------
    :
        pool built from ``features`` and ``target``
    """
    dtypes = features.dtypes
    categorical_dtypes = dtypes[dtypes == "category"]
    self._categorical = categorical_dtypes.index.tolist()

    # positions (within the categorical columns) whose categories are floating point
    float_positions = [
        position
        for position, dtype in enumerate(categorical_dtypes)
        if issubclass(dtype.categories.dtype.type, (float, np.floating))
    ]
    self._float_category_columns = categorical_dtypes.iloc[float_positions].index
    self._prepare_float_category_columns(features)

    return Pool(features, target, cat_features=self._categorical)

def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter":
"""
Fit Catboost model.
Expand All @@ -57,22 +77,7 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter":
"""
features = df.drop(columns=["timestamp", "target"])
target = df["target"]
columns_dtypes = features.dtypes
category_columns_dtypes = columns_dtypes[columns_dtypes == "category"]
self._categorical = category_columns_dtypes.index.tolist()

# select only columns with float categories
float_category_columns_dtypes_indices = [
idx
for idx, x in enumerate(category_columns_dtypes)
if issubclass(x.categories.dtype.type, (float, np.floating))
]
float_category_columns_dtypes = category_columns_dtypes.iloc[float_category_columns_dtypes_indices]
float_category_columns = float_category_columns_dtypes.index
self._float_category_columns = float_category_columns
self._prepare_float_category_columns(features)

train_pool = Pool(features, target.values, cat_features=self._categorical)
train_pool = self._prepare_train_pool(features, target.values)
self.model.fit(train_pool)
return self

Expand Down Expand Up @@ -106,6 +111,47 @@ def get_model(self) -> CatBoostRegressor:
"""
return self.model

def forecast_components(self, df: pd.DataFrame) -> pd.DataFrame:
    """Estimate forecast components.

    The result is whatever ``predict_components`` returns for the same ``df``.

    Parameters
    ----------
    df:
        features dataframe

    Returns
    -------
    :
        dataframe with forecast components
    """
    components = self.predict_components(df=df)
    return components

def predict_components(self, df: pd.DataFrame) -> pd.DataFrame:
    """Estimate prediction components.

    Each component is the SHAP value of one feature plus an equal share of the
    expected value (the last column of the SHAP matrix), so the components of
    a row add up to the sum of that row of the SHAP matrix.

    Parameters
    ----------
    df:
        features dataframe; must contain the ``timestamp`` and ``target``
        columns, which are dropped before the features are used

    Returns
    -------
    :
        dataframe with prediction components, one ``target_component_<feature>``
        column per feature
    """
    features = df.drop(columns=["timestamp", "target"])

    # Reuse the categorical layout stored on the adapter instead of re-deriving it
    # here: inference must not overwrite `_categorical` / `_float_category_columns`.
    self._prepare_float_category_columns(features)

    # No label is passed: the pool only supplies the features for SHAP computation.
    pool = Pool(features, cat_features=self._categorical)
    shap_values = self.model.get_feature_importance(pool, type="ShapValues")

    # encapsulate expected contribution into components
    n_features = shap_values.shape[1] - 1
    expected_share = shap_values[:, -1, np.newaxis] / n_features
    components = shap_values[:, :-1] + expected_share

    component_names = [f"target_component_{name}" for name in features.columns]

    return pd.DataFrame(data=components, columns=component_names)


class CatBoostPerSegmentModel(
PerSegmentModelMixin,
Expand Down
76 changes: 76 additions & 0 deletions tests/test_models/test_catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,79 @@ def test_save_load(model, example_tsds):
horizon = 3
transforms = [LagTransform(in_column="target", lags=list(range(horizon, horizon + 3)))]
assert_model_equals_loaded_original(model=model, ts=example_tsds, transforms=transforms, horizon=horizon)


@pytest.fixture()
def ar_dataset_w_exog():
    """Two-segment daily AR dataset with exogenous features ``f1 = sin(target)`` and ``f2 = cos(target)``."""
    flat_target = generate_ar_df(start_time="2021-01-01", periods=100, n_segments=2)

    # exogenous frame: same rows as the target frame, `target` replaced by f1 / f2
    flat_exog = flat_target.drop(columns=["target"])
    flat_exog["f1"] = np.sin(flat_target["target"])
    flat_exog["f2"] = np.cos(flat_target["target"])

    wide_target = TSDataset.to_dataset(flat_target)
    wide_exog = TSDataset.to_dataset(flat_exog)
    return TSDataset(df=wide_target, df_exog=wide_exog, freq="D")


def test_forecast_prediction_components_equal(ar_dataset_w_exog):
    """Check that ``forecast_components`` and ``predict_components`` return equal frames per segment."""
    horizon = 5
    train, test = ar_dataset_w_exog.train_test_split(test_size=horizon)
    future = train.make_future(horizon)

    model = CatBoostPerSegmentModel(iterations=10)
    model.fit(train)

    # get_level_values yields one entry per column, so deduplicate to visit each segment once
    for segment in test.columns.get_level_values("segment").unique():
        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
        prediction_components = model._models[segment].predict_components(df=segment_future)
        forecast_components = model._models[segment].forecast_components(df=segment_future)
        pd.testing.assert_frame_equal(prediction_components, forecast_components)


def test_forecast_components_names(ar_dataset_w_exog):
    """Check that the component columns are named ``target_component_<feature>`` for features f1 and f2."""
    answer = {"target_component_f1", "target_component_f2"}

    horizon = 5
    train, test = ar_dataset_w_exog.train_test_split(test_size=horizon)
    future = train.make_future(horizon)

    model = CatBoostPerSegmentModel(iterations=10)
    model.fit(train)

    # get_level_values yields one entry per column, so deduplicate to visit each segment once
    for segment in test.columns.get_level_values("segment").unique():
        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
        components = model._models[segment].forecast_components(df=segment_future)
        assert set(components.columns) == answer


def test_per_segment_decomposition_sums_to_target(ar_dataset_w_exog):
    """Check that per-segment model components sum to the forecasted target for every segment."""
    horizon = 5
    ts = ar_dataset_w_exog
    train, test = ts.train_test_split(test_size=horizon)
    future = train.make_future(horizon)

    model = CatBoostPerSegmentModel(iterations=10)
    model.fit(train)

    y_pred = model.forecast(future)
    # get_level_values yields one entry per column, so deduplicate to visit each segment once
    for segment in test.columns.get_level_values("segment").unique():
        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
        components = model._models[segment].forecast_components(df=segment_future)
        y_hat_pred = np.sum(components.values, axis=1)
        np.testing.assert_allclose(y_hat_pred, y_pred[:, segment, "target"].values)


def test_multi_segment_decomposition_sums_to_target(ar_dataset_w_exog):
    """Check that multi-segment model components sum to the forecasted target for every segment."""
    horizon = 5
    ts = ar_dataset_w_exog
    train, test = ts.train_test_split(test_size=horizon)
    future = train.make_future(horizon)

    model = CatBoostMultiSegmentModel(iterations=10)
    model.fit(train)

    y_pred = model.forecast(future)
    # get_level_values yields one entry per column, so deduplicate to visit each segment once
    for segment in test.columns.get_level_values("segment").unique():
        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
        components = model._base_model.forecast_components(df=segment_future)
        y_hat_pred = np.sum(components.values, axis=1)
        np.testing.assert_allclose(y_hat_pred, y_pred[:, segment, "target"].values)