diff --git a/changelog/100.bugfix.rst b/changelog/100.bugfix.rst new file mode 100644 index 0000000..5d7e322 --- /dev/null +++ b/changelog/100.bugfix.rst @@ -0,0 +1 @@ +Remove remaining references to -1 'invalidation'; validate directly on time comparison when needed diff --git a/docs/reference/internals.rst b/docs/reference/internals.rst index 59047eb..bbc8471 100644 --- a/docs/reference/internals.rst +++ b/docs/reference/internals.rst @@ -67,7 +67,7 @@ Pandas Helpers pandas_helpers.event_value pandas_helpers.post_process_event pandas_helpers.merge_windowed_event - pandas_helpers.valid_event + pandas_helpers.is_valid_event pandas_helpers.try_casting Performance diff --git a/src/seismometer/data/pandas_helpers.py b/src/seismometer/data/pandas_helpers.py index 3367276..cb81b7f 100644 --- a/src/seismometer/data/pandas_helpers.py +++ b/src/seismometer/data/pandas_helpers.py @@ -39,8 +39,6 @@ def merge_windowed_event( Joins on a set of keys and associates the first event occurring after the prediction time. The following special cases are also applied: - Invalidate late predictions - if a prediction occurs after all recorded events of the type, the prediction is - considered invalid wrt to the event and the _Value is set to -1. Early predictions drop timing - if a prediction occurs before all recorded events of the type, the label is kept for analyses but the time is removed. Imputation of no event to negative label - if no row in the events frame is present for the prediction keys, it is @@ -62,8 +60,6 @@ def merge_windowed_event( events dataframes. min_leadtime_hrs : Number, optional The number of hour offset to be required for prediction, by default 0. - If set to 1, a prediction made within the hour before the last associated event will be invalidated and set - to -1 even though it occurred before the event time. window_hrs : Optional[Number], optional The number of hours the window of predictions of interest should be limited to, by default None. If None, then all predictions occurring before a known event will be included. @@ -165,10 +161,6 @@ def merge_windowed_event( impute_val_no_time=impute_val_no_time, ) - # refactor to generalize - if merge_strategy == "forward": # For forward merges, don't count events that happen before the prediction - predictions.loc[predictions[predtime_col] > predictions[r_ref], event_val_col] = -1 - return predictions.drop(columns=r_ref) @@ -543,9 +535,14 @@ def event_value_name(event_value: str) -> str: return val -def valid_event(dataframe: pd.DataFrame, event: str) -> pd.DataFrame: - """Filters a dataframe to valid predictions, where the event value has not set to -1.""" - return dataframe[dataframe[event_value(event)] >= 0] +def is_valid_event(dataframe: pd.DataFrame, event: str, ref: str) -> pd.DataFrame: + """ + Creates a mask excluding rows (False) where the event occurs before the reference time. + If the comparison cannot be made, all rows will be considered valid (True). + """ + if event_time(event) not in dataframe.columns or ref not in dataframe.columns: + return pd.Series([True] * len(dataframe), index=dataframe.index) + return dataframe[ref] <= dataframe[event_time(event)] def _resolve_time_col(dataframe: pd.DataFrame, ref_event: str) -> str: diff --git a/src/seismometer/data/timeseries.py b/src/seismometer/data/timeseries.py index 4e047b9..74abe90 100644 --- a/src/seismometer/data/timeseries.py +++ b/src/seismometer/data/timeseries.py @@ -2,6 +2,8 @@ import pandas as pd +from .pandas_helpers import is_valid_event + def create_metric_timeseries( dataframe: pd.DataFrame, @@ -77,9 +79,9 @@ def _limit_data( ) -> pd.DataFrame: """Reduces the data to only include valid data points.""" include = dataframe[reftime].notna() - # boolean events - filter out -1 where invalid + if boolean_event: - include = include & (dataframe[event_col] >= 0) + include = include & (is_valid_event(dataframe, event_col, reftime)) reduced = dataframe[include] if time_bounds is None: diff --git a/src/seismometer/seismogram.py b/src/seismometer/seismogram.py index f575b2f..32ea381 100644 --- a/src/seismometer/seismogram.py +++ b/src/seismometer/seismogram.py @@ -312,7 +312,7 @@ def data(self, event: Optional[str] = None) -> pd.DataFrame: if event_time in self.dataframe: return self.dataframe[self._data_mask(event_val) & self._time_mask(event_time)] - return self.dataframe[self._data_mask(event_val)] + return self.dataframe def score_bins(self): """Updates the active values for notebook-scoped selections.""" @@ -392,7 +392,7 @@ def create_cohorts(self) -> None: @lru_cache def _data_mask(self, event_val): - return self.dataframe[event_val] != -1 + return pdh.is_valid_event(self.dataframe, event_val, self.predict_time) @lru_cache def _time_mask(self, event_time, keep_zero: str = None): diff --git a/tests/data/test_timeseries.py b/tests/data/test_timeseries.py index 484b34c..e81afd6 100644 --- a/tests/data/test_timeseries.py +++ b/tests/data/test_timeseries.py @@ -128,9 +128,21 @@ def test_data_limited_to_time_bound(self): compare = actual.sort_values(by=["Reference", "Group", "Value"]).reset_index(drop=True) pdt.assert_frame_equal(compare, expected) - def test_bools_filter_out_invalid(self): + def test_bools_filter_out_early(self): input_frame = create_frame([0, 1], ["A", "B"]) input_frame["Value"] = [-1, -2, 1, -1, -1, -2, -1, 0] + input_frame["Value_Time"] = pd.to_datetime( + [ + "2023-12-31", # early + "2023-11-30", # early + "2024-02-01", # good + "2023-12-31", # early + "2023-12-31", # early + "2023-12-31", # early + "2023-11-30", # early + "2024-02-01", # good + ] + ) expected = pd.DataFrame({"Reference": [pd.Timestamp("2024-01-01")] * 2, "Group": ["A", "B"], "Value": [1, 0]}) actual = undertest.create_metric_timeseries( @@ -142,7 +154,7 @@ def test_bools_filter_out_invalid(self): def test_missing_does_not_count_toward_threshold(self): input_frame = create_frame([0, 1], ["A", "A"]) input_frame["Value"] = [-1, -2, -1, -1, -1, -2, -1, 0] - + input_frame["Value_Time"] = pd.to_datetime(["2023-12-31"] * 7 + ["2024-02-01"]) actual = undertest.create_metric_timeseries( input_frame, "Reference", "Value", ["EntityId"], "Group", censor_threshold=1, boolean_event=True ) diff --git a/tests/test_seismogram.py b/tests/test_seismogram.py index a6d920d..1820eda 100644 --- a/tests/test_seismogram.py +++ b/tests/test_seismogram.py @@ -43,7 +43,7 @@ def get_test_data(): "prediction": [1, 2, 3, 4], "time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], "event1_Value": [0, 1, 0, -1], - "event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], + "event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2021-12-31"], "event2_Value": [0, 1, 0, 1], "event2_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], "cohort1": [1, 0, 1, 0],