tinkoff-ai · alex-hse-repository · Mar 7, 2023 · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `ChangePointsLevelTransform` and base classes `PerIntervalModel`, `BaseChangePointsModelAdapter` for per-interval transforms ([#998](https://github.com/tinkoff-ai/etna/pull/998))
 - Method `set_params` to change parameters of ETNA objects ([#1102](https://github.com/tinkoff-ai/etna/pull/1102))
 - Function `plot_forecast_decomposition` ([#1129](https://github.com/tinkoff-ai/etna/pull/1129))
+- Method `forecast_components` for forecast decomposition in `_CatBoostAdapter` ([#1135](https://github.com/tinkoff-ai/etna/issues/1135))
 - 
 ### Changed
 - Add optional `features` parameter in the signature of `TSDataset.to_pandas`, `TSDataset.to_flatten` ([#809](https://github.com/tinkoff-ai/etna/pull/809))
@@ -37,6 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add saving/loading for transforms, models, pipelines, ensembles; tutorial for saving/loading ([#1068](https://github.com/tinkoff-ai/etna/pull/1068))
 - Add hierarchical time series support([#1083](https://github.com/tinkoff-ai/etna/pull/1083))
 - Add `WAPE` metric & `wape` functional metric ([#1085](https://github.com/tinkoff-ai/etna/pull/1085))
+-
 ### Fixed
 - Missed kwargs in TFT init([#1078](https://github.com/tinkoff-ai/etna/pull/1078))
 

diff --git a/etna/models/catboost.py b/etna/models/catboost.py
@@ -39,6 +39,26 @@ def __init__(
     def _prepare_float_category_columns(self, df: pd.DataFrame):
         df[self._float_category_columns] = df[self._float_category_columns].astype(str).astype("category")
 
+    def _prepare_train_pool(self, features: pd.DataFrame, target: np.ndarray) -> Pool:
+        """Prepare training pool for CatBoost model."""
+        columns_dtypes = features.dtypes
+        category_columns_dtypes = columns_dtypes[columns_dtypes == "category"]
+        self._categorical = category_columns_dtypes.index.tolist()
+
+        # select only columns with float categories
+        float_category_columns_dtypes_indices = [
+            idx
+            for idx, x in enumerate(category_columns_dtypes)
+            if issubclass(x.categories.dtype.type, (float, np.floating))
+        ]
+        float_category_columns_dtypes = category_columns_dtypes.iloc[float_category_columns_dtypes_indices]
+        float_category_columns = float_category_columns_dtypes.index
+        self._float_category_columns = float_category_columns
+        self._prepare_float_category_columns(features)
+
+        train_pool = Pool(features, target, cat_features=self._categorical)
+        return train_pool
+
     def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter":
         """
         Fit Catboost model.
@@ -57,22 +77,7 @@ def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_CatBoostAdapter":
         """
         features = df.drop(columns=["timestamp", "target"])
         target = df["target"]
-        columns_dtypes = features.dtypes
-        category_columns_dtypes = columns_dtypes[columns_dtypes == "category"]
-        self._categorical = category_columns_dtypes.index.tolist()
-
-        # select only columns with float categories
-        float_category_columns_dtypes_indices = [
-            idx
-            for idx, x in enumerate(category_columns_dtypes)
-            if issubclass(x.categories.dtype.type, (float, np.floating))
-        ]
-        float_category_columns_dtypes = category_columns_dtypes.iloc[float_category_columns_dtypes_indices]
-        float_category_columns = float_category_columns_dtypes.index
-        self._float_category_columns = float_category_columns
-        self._prepare_float_category_columns(features)
-
-        train_pool = Pool(features, target.values, cat_features=self._categorical)
+        train_pool = self._prepare_train_pool(features, target.values)
         self.model.fit(train_pool)
         return self
 
@@ -106,6 +111,47 @@ def get_model(self) -> CatBoostRegressor:
         """
         return self.model
 
+    def forecast_components(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Estimate forecast components.
+
+        Parameters
+        ----------
+        df:
+            features dataframe
+
+        Returns
+        -------
+        :
+            dataframe with forecast components
+        """
+        return self.predict_components(df=df)
+
+    def predict_components(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Estimate prediction components.
+
+        Parameters
+        ----------
+        df:
+            features dataframe
+
+        Returns
+        -------
+        :
+            dataframe with prediction components
+        """
+        features = df.drop(columns=["timestamp", "target"])
+
+        prediction = self.model.predict(features)
+        pool = self._prepare_train_pool(features, prediction)
+        shap_values = self.model.get_feature_importance(pool, type="ShapValues")
+
+        # encapsulate expected contribution into components
+        components = shap_values[:, :-1] + shap_values[:, -1, np.newaxis] / (shap_values.shape[1] - 1)
+
+        component_names = [f"target_component_{name}" for name in features.columns]
+
+        return pd.DataFrame(data=components, columns=component_names)
+
 
 class CatBoostPerSegmentModel(
     PerSegmentModelMixin,

diff --git a/tests/test_models/test_catboost.py b/tests/test_models/test_catboost.py
@@ -144,3 +144,79 @@ def test_save_load(model, example_tsds):
     horizon = 3
     transforms = [LagTransform(in_column="target", lags=list(range(horizon, horizon + 3)))]
     assert_model_equals_loaded_original(model=model, ts=example_tsds, transforms=transforms, horizon=horizon)
+
+
+@pytest.fixture()
+def ar_dataset_w_exog():
+    df = generate_ar_df(start_time="2021-01-01", periods=100, n_segments=2)
+    df_exog = df.copy()
+    df_exog["f1"] = np.sin(df_exog["target"])
+    df_exog["f2"] = np.cos(df_exog["target"])
+    df_exog.drop(columns=["target"], inplace=True)
+
+    df = TSDataset.to_dataset(df)
+    df_exog = TSDataset.to_dataset(df_exog)
+
+    ts = TSDataset(df=df, df_exog=df_exog, freq="D")
+    return ts
+
+
+def test_forecast_prediction_components_equal(ar_dataset_w_exog):
+    train, test = ar_dataset_w_exog.train_test_split(test_size=5)
+    future = train.make_future(5)
+
+    model = CatBoostPerSegmentModel(iterations=10)
+    model.fit(train)
+
+    for segment in test.columns.get_level_values("segment"):
+        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
+        prediction_components = model._models[segment].predict_components(df=segment_future)
+        forecast_components = model._models[segment].forecast_components(df=segment_future)
+        pd.testing.assert_frame_equal(prediction_components, forecast_components)
+
+
+def test_forecast_components_names(ar_dataset_w_exog):
+    answer = {"target_component_f1", "target_component_f2"}
+
+    train, test = ar_dataset_w_exog.train_test_split(test_size=5)
+    future = train.make_future(5)
+
+    model = CatBoostPerSegmentModel(iterations=10)
+    model.fit(train)
+
+    for segment in test.columns.get_level_values("segment"):
+        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
+        components = model._models[segment].forecast_components(df=segment_future)
+        assert set(components.columns) == answer
+
+
+def test_per_segment_decomposition_sums_to_target(ar_dataset_w_exog):
+    ts = ar_dataset_w_exog
+    train, test = ts.train_test_split(test_size=5)
+    future = train.make_future(5)
+
+    model = CatBoostPerSegmentModel(iterations=10)
+    model.fit(train)
+
+    y_pred = model.forecast(future)
+    for segment in test.columns.get_level_values("segment"):
+        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
+        components = model._models[segment].forecast_components(df=segment_future)
+        y_hat_pred = np.sum(components.values, axis=1)
+        np.testing.assert_allclose(y_hat_pred, y_pred[:, segment, "target"].values)
+
+
+def test_multi_segment_decomposition_sums_to_target(ar_dataset_w_exog):
+    ts = ar_dataset_w_exog
+    train, test = ts.train_test_split(test_size=5)
+    future = train.make_future(5)
+
+    model = CatBoostMultiSegmentModel(iterations=10)
+    model.fit(train)
+
+    y_pred = model.forecast(future)
+    for segment in test.columns.get_level_values("segment"):
+        segment_future = future[:, segment, :].droplevel("segment", axis=1).reset_index()
+        components = model._base_model.forecast_components(df=segment_future)
+        y_hat_pred = np.sum(components.values, axis=1)
+        np.testing.assert_allclose(y_hat_pred, y_pred[:, segment, "target"].values)