Skip to content

Make outliers transforms fail on new segments with understandable error #1139

Merged
merged 3 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
- Fix `SegmentEncoderTransform` to work with subset of segments and raise error on new segments ([#1103](https://github.com/tinkoff-ai/etna/pull/1103))
- Fix `SklearnTransform` in per-segment mode to work on subset of segments and raise error on new segments ([#1107](https://github.com/tinkoff-ai/etna/pull/1107))
-
- Fix `OutliersTransform` and its children to raise error on new segments ([#1139](https://github.com/tinkoff-ai/etna/pull/1139))
## [1.14.0] - 2022-12-16
### Added
- Add python 3.10 support ([#1005](https://github.com/tinkoff-ai/etna/pull/1005))
Expand Down
36 changes: 32 additions & 4 deletions etna/transforms/outliers/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import reprlib
from abc import ABC
from abc import abstractmethod
from typing import Dict
from typing import List
from typing import Optional
from typing import cast

import numpy as np
import pandas as pd
Expand All @@ -26,6 +28,7 @@ def __init__(self, in_column: str):
self.in_column = in_column
self.outliers_timestamps: Optional[Dict[str, List[pd.Timestamp]]] = None
self.original_values: Optional[Dict[str, List[pd.Timestamp]]] = None
self._fit_segments: Optional[List[str]] = None

def _save_original_values(self, ts: TSDataset):
"""
Expand Down Expand Up @@ -61,9 +64,18 @@ def fit(self, df: pd.DataFrame) -> "OutliersTransform":
ts = TSDataset(df, freq=pd.infer_freq(df.index))
self.outliers_timestamps = self.detect_outliers(ts)
self._save_original_values(ts)
self._fit_segments = ts.segments

return self

def _validate_segments(self, segments: List[str]):
self._fit_segments = cast(List[str], self._fit_segments)
new_segments = set(segments) - set(self._fit_segments)
if len(new_segments) > 0:
raise NotImplementedError(
f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
)

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Replace found outliers with NaNs.
Expand All @@ -75,13 +87,21 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.DataFrame
result:
dataframe with in_column series with filled with NaNs

Raises
------
ValueError:
If transform isn't fitted.
NotImplementedError:
If there are segments that weren't present during training.
"""
if self.outliers_timestamps is None:
raise ValueError("Transform is not fitted! Fit the Transform before calling transform method.")
result_df = df.copy()
segments = df.columns.get_level_values("segment").unique()
segments = df.columns.get_level_values("segment").unique().tolist()
self._validate_segments(segments)
for segment in segments:
result_df.loc[self.outliers_timestamps[segment], pd.IndexSlice[segment, self.in_column]] = np.NaN
return result_df
Expand All @@ -97,13 +117,21 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:

Returns
-------
result: pd.DataFrame
result:
data with reconstructed values

Raises
------
ValueError:
If transform isn't fitted.
NotImplementedError:
If there are segments that weren't present during training.
"""
if self.original_values is None or self.outliers_timestamps is None:
raise ValueError("Transform is not fitted! Fit the Transform before calling inverse_transform method.")
result = df.copy()
segments = df.columns.get_level_values("segment").unique()
segments = df.columns.get_level_values("segment").unique().tolist()
self._validate_segments(segments)
for segment in segments:
segment_ts = result[segment, self.in_column]
segment_ts[segment_ts.index.isin(self.outliers_timestamps[segment])] = self.original_values[segment]
Expand Down
18 changes: 8 additions & 10 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,10 @@ def test_inverse_transform_train_new_segments(self, transform, dataset_name, exp
TimeSeriesImputerTransform(in_column="target"),
"ts_to_fill",
),
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
],
)
def test_inverse_transform_train_new_segments_not_implemented(self, transform, dataset_name, request):
Expand Down Expand Up @@ -704,11 +708,6 @@ def test_inverse_transform_train_new_segments_failed_not_implemented(self, trans
# math
# TODO: error should be understandable, not like now
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {"change": {"target"}}),
# outliers
# TODO: error should be understandable, not like now
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
],
)
def test_inverse_transform_train_new_segments_failed_error(
Expand Down Expand Up @@ -1026,6 +1025,10 @@ def test_inverse_transform_future_new_segments(self, transform, dataset_name, ex
TimeSeriesImputerTransform(in_column="target"),
"ts_to_fill",
),
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
],
)
def test_inverse_transform_future_new_segments_not_implemented(self, transform, dataset_name, request):
Expand Down Expand Up @@ -1081,11 +1084,6 @@ def test_inverse_transform_future_new_segments_failed_not_implemented(self, tran
# TODO: error should be understandable, not like now
(DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}),
(DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}),
# outliers
# TODO: error should be understandable, not like now
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
],
)
def test_inverse_transform_future_new_segments_failed_error(
Expand Down
42 changes: 8 additions & 34 deletions tests/test_transforms/test_inference/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,10 @@ def test_transform_train_new_segments(self, transform, dataset_name, expected_ch
TimeSeriesImputerTransform(in_column="target"),
"ts_to_fill",
),
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
],
)
def test_transform_train_new_segments_not_implemented(self, transform, dataset_name, request):
Expand All @@ -653,23 +657,6 @@ def test_transform_train_new_segments_failed_not_implemented(self, transform, da
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={}
)

@to_be_fixed(raises=Exception)
@pytest.mark.parametrize(
"transform, dataset_name, expected_changes",
[
# outliers
# TODO: error should be understandable, not like now
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
],
)
def test_transform_train_new_segments_failed_error(self, transform, dataset_name, expected_changes, request):
ts = request.getfixturevalue(dataset_name)
self._test_transform_train_new_segments(
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes
)


class TestTransformFutureNewSegments:
"""Test transform on future part of new segments.
Expand Down Expand Up @@ -972,6 +959,10 @@ def test_transform_future_new_segments(self, transform, dataset_name, expected_c
TimeSeriesImputerTransform(in_column="target"),
"ts_to_fill",
),
# outliers
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers"),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"),
],
)
def test_transform_future_new_segments_not_implemented(self, transform, dataset_name, request):
Expand All @@ -995,23 +986,6 @@ def test_transform_future_new_segments_failed_not_implemented(self, transform, d
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={}
)

@to_be_fixed(raises=Exception)
@pytest.mark.parametrize(
"transform, dataset_name, expected_changes",
[
# outliers
# TODO: error should be understandable, not like now
(DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}),
(PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}),
],
)
def test_transform_future_new_segments_failed_error(self, transform, dataset_name, expected_changes, request):
ts = request.getfixturevalue(dataset_name)
self._test_transform_future_new_segments(
ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes
)


class TestTransformFutureWithTarget:
"""Test transform on future dataset with known target.
Expand Down
40 changes: 40 additions & 0 deletions tests/test_transforms/test_outliers/test_outliers_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,46 @@ def test_inverse_transform_raise_error_if_not_fitted(transform, outliers_solid_t
_ = transform.inverse_transform(df=outliers_solid_tsds.df)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to add test for _validate_segments ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What kind of test do you expect here? I don't think that testing this private method is necessary. But I test that it works when expected in test_transform_new_segments_fail and test_inverse_transform_new_segments_fail.



@pytest.mark.parametrize(
"transform",
(
MedianOutliersTransform(in_column="target"),
DensityOutliersTransform(in_column="target"),
PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel),
),
)
def test_transform_new_segments_fail(transform, outliers_solid_tsds):
df = outliers_solid_tsds.to_pandas()
train_df = df.loc[:, pd.IndexSlice["1", :]]
test_df = df.loc[:, pd.IndexSlice["2", :]]

transform.fit(train_df)
with pytest.raises(
NotImplementedError, match="This transform can't process segments that weren't present on train data"
):
_ = transform.transform(test_df)


@pytest.mark.parametrize(
"transform",
(
MedianOutliersTransform(in_column="target"),
DensityOutliersTransform(in_column="target"),
PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel),
),
)
def test_inverse_transform_new_segments_fail(transform, outliers_solid_tsds):
df = outliers_solid_tsds.to_pandas()
train_df = df.loc[:, pd.IndexSlice["1", :]]
test_df = df.loc[:, pd.IndexSlice["2", :]]

transform.fit(train_df)
with pytest.raises(
NotImplementedError, match="This transform can't process segments that weren't present on train data"
):
_ = transform.inverse_transform(test_df)


@pytest.mark.parametrize(
"transform",
(
Expand Down