diff --git a/changelog/100.bugfix.rst b/changelog/100.bugfix.rst new file mode 100644 index 0000000..5d7e322 --- /dev/null +++ b/changelog/100.bugfix.rst @@ -0,0 +1 @@ +Remove remaining references to -1 'invalidation'; validate directly on time comparison when needed diff --git a/changelog/76.feature.rst b/changelog/76.feature.rst deleted file mode 100644 index 343b7f5..0000000 --- a/changelog/76.feature.rst +++ /dev/null @@ -1,2 +0,0 @@ -Added additional "merge_strategy" event configuration option. -Viable merge strategies are "first", "last", "nearest", "forward", and "count". \ No newline at end of file diff --git a/changelog/81.bugfix.rst b/changelog/81.bugfix.rst deleted file mode 100644 index 2c6bbca..0000000 --- a/changelog/81.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Limits control max-widths to 1200px in most cases, allowing row wrap when needed. \ No newline at end of file diff --git a/changelog/82.feature.rst b/changelog/82.feature.rst deleted file mode 100644 index f31dbc1..0000000 --- a/changelog/82.feature.rst +++ /dev/null @@ -1,5 +0,0 @@ -Added ExploreSubgroups as a drop in replacement for sm.cohort_list -Added ExploreModelScoreComparison to compare two scores againts a shared target -Added ExploreModelTargetComparison to compare a single score across two targets -Added MultiselectDropdownWidget as a new widget for selecting cohort_dicts, uses a drop down and dismissalbe tags to keep the UX neater. -Updated handling around `primary_output` and `outputs`, so that if primary_output is in outputs, it does not get added in again during startup. \ No newline at end of file diff --git a/changelog/88.bugfix.rst b/changelog/88.bugfix.rst deleted file mode 100644 index 3d3408a..0000000 --- a/changelog/88.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Exclude time columns from ydata-profiling. \ No newline at end of file diff --git a/changelog/90.bugfix.rst b/changelog/90.bugfix.rst deleted file mode 100644 index 210e79f..0000000 --- a/changelog/90.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Temporarily remove CLI - variation in templates is not stable enough to ensure robustness. diff --git a/changelog/92.feature.rst b/changelog/92.feature.rst deleted file mode 100644 index 03f4277..0000000 --- a/changelog/92.feature.rst +++ /dev/null @@ -1,2 +0,0 @@ -Cast features to dtypes in the dictionary yml when specified in the dictionary configuration. -Cast event _Values to dtypes in the dictionary yml when specified in the dictionary configuration, done after imputation. diff --git a/changelog/93.feature.rst b/changelog/93.feature.rst deleted file mode 100644 index 1771b46..0000000 --- a/changelog/93.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add a configuration helper that can generate a dictionary file for events or predictions. diff --git a/docs/reference/internals.rst b/docs/reference/internals.rst index 59047eb..bbc8471 100644 --- a/docs/reference/internals.rst +++ b/docs/reference/internals.rst @@ -67,7 +67,7 @@ Pandas Helpers pandas_helpers.event_value pandas_helpers.post_process_event pandas_helpers.merge_windowed_event - pandas_helpers.valid_event + pandas_helpers.is_valid_event pandas_helpers.try_casting Performance diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index 872640e..c75df45 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -9,6 +9,32 @@ Breaking changes may occur between minor versions prior to the v1 release; after .. towncrier release notes start +0.2.2 +----- + +Features +~~~~~~~~ + +- Added additional "merge_strategy" event configuration option. (`#76 `__) +- Viable merge strategies are "first", "last", "nearest", "forward", and "count". (`#76 `__) +- Added ExploreSubgroups as a drop in replacement for sm.cohort_list (`#82 `__) +- Added ExploreModelScoreComparison to compare two scores againts a shared target (`#82 `__) +- Added ExploreModelTargetComparison to compare a single score across two targets (`#82 `__) +- Added MultiselectDropdownWidget as a new widget for selecting cohort_dicts, uses a drop down and dismissalbe tags to keep the UX neater. (`#82 `__) +- Updated handling around `primary_output` and `outputs`, so that if primary_output is in outputs, it does not get added in again during startup. (`#82 `__) +- Cast features to dtypes in the dictionary yml when specified in the dictionary configuration. (`#92 `__) +- Cast event _Values to dtypes in the dictionary yml when specified in the dictionary configuration, done after imputation. (`#92 `__) +- Add a configuration helper that can generate a dictionary file for events or predictions. (`#93 `__) + + +Bugfixes +~~~~~~~~ + +- Limits control max-widths to 1200px in most cases, allowing row wrap when needed. (`#81 `__) +- Exclude time columns from ydata-profiling. (`#88 `__) +- Temporarily remove CLI - variation in templates is not stable enough to ensure robustness. (`#90 `__) + + 0.2.1 ----- diff --git a/src/seismometer/_version.py b/src/seismometer/_version.py index 3ced358..b5fdc75 100644 --- a/src/seismometer/_version.py +++ b/src/seismometer/_version.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.2.2" diff --git a/src/seismometer/data/pandas_helpers.py b/src/seismometer/data/pandas_helpers.py index 3367276..cb81b7f 100644 --- a/src/seismometer/data/pandas_helpers.py +++ b/src/seismometer/data/pandas_helpers.py @@ -39,8 +39,6 @@ def merge_windowed_event( Joins on a set of keys and associates the first event occurring after the prediction time. The following special cases are also applied: - Invalidate late predictions - if a prediction occurs after all recorded events of the type, the prediction is - considered invalid wrt to the event and the _Value is set to -1. Early predictions drop timing - if a prediction occurs before all recorded events of the type, the label is kept for analyses but the time is removed. Imputation of no event to negative label - if no row in the events frame is present for the prediction keys, it is @@ -62,8 +60,6 @@ def merge_windowed_event( events dataframes. min_leadtime_hrs : Number, optional The number of hour offset to be required for prediction, by default 0. - If set to 1, a prediction made within the hour before the last associated event will be invalidated and set - to -1 even though it occurred before the event time. window_hrs : Optional[Number], optional The number of hours the window of predictions of interest should be limited to, by default None. If None, then all predictions occurring before a known event will be included. @@ -165,10 +161,6 @@ def merge_windowed_event( impute_val_no_time=impute_val_no_time, ) - # refactor to generalize - if merge_strategy == "forward": # For forward merges, don't count events that happen before the prediction - predictions.loc[predictions[predtime_col] > predictions[r_ref], event_val_col] = -1 - return predictions.drop(columns=r_ref) @@ -543,9 +535,14 @@ def event_value_name(event_value: str) -> str: return val -def valid_event(dataframe: pd.DataFrame, event: str) -> pd.DataFrame: - """Filters a dataframe to valid predictions, where the event value has not set to -1.""" - return dataframe[dataframe[event_value(event)] >= 0] +def is_valid_event(dataframe: pd.DataFrame, event: str, ref: str) -> pd.DataFrame: + """ + Creates a mask excluding rows (False) where the event occurs before the reference time. + If the comparison cannot be made, all rows will be considered valid (True). + """ + if event_time(event) not in dataframe.columns or ref not in dataframe.columns: + return pd.Series([True] * len(dataframe), index=dataframe.index) + return dataframe[ref] <= dataframe[event_time(event)] def _resolve_time_col(dataframe: pd.DataFrame, ref_event: str) -> str: diff --git a/src/seismometer/data/timeseries.py b/src/seismometer/data/timeseries.py index 4e047b9..74abe90 100644 --- a/src/seismometer/data/timeseries.py +++ b/src/seismometer/data/timeseries.py @@ -2,6 +2,8 @@ import pandas as pd +from .pandas_helpers import is_valid_event + def create_metric_timeseries( dataframe: pd.DataFrame, @@ -77,9 +79,9 @@ def _limit_data( ) -> pd.DataFrame: """Reduces the data to only include valid data points.""" include = dataframe[reftime].notna() - # boolean events - filter out -1 where invalid + if boolean_event: - include = include & (dataframe[event_col] >= 0) + include = include & (is_valid_event(dataframe, event_col, reftime)) reduced = dataframe[include] if time_bounds is None: diff --git a/src/seismometer/seismogram.py b/src/seismometer/seismogram.py index f575b2f..32ea381 100644 --- a/src/seismometer/seismogram.py +++ b/src/seismometer/seismogram.py @@ -312,7 +312,7 @@ def data(self, event: Optional[str] = None) -> pd.DataFrame: if event_time in self.dataframe: return self.dataframe[self._data_mask(event_val) & self._time_mask(event_time)] - return self.dataframe[self._data_mask(event_val)] + return self.dataframe def score_bins(self): """Updates the active values for notebook-scoped selections.""" @@ -392,7 +392,7 @@ def create_cohorts(self) -> None: @lru_cache def _data_mask(self, event_val): - return self.dataframe[event_val] != -1 + return pdh.is_valid_event(self.dataframe, event_val, self.predict_time) @lru_cache def _time_mask(self, event_time, keep_zero: str = None): diff --git a/tests/data/test_timeseries.py b/tests/data/test_timeseries.py index 484b34c..e81afd6 100644 --- a/tests/data/test_timeseries.py +++ b/tests/data/test_timeseries.py @@ -128,9 +128,21 @@ def test_data_limited_to_time_bound(self): compare = actual.sort_values(by=["Reference", "Group", "Value"]).reset_index(drop=True) pdt.assert_frame_equal(compare, expected) - def test_bools_filter_out_invalid(self): + def test_bools_filter_out_early(self): input_frame = create_frame([0, 1], ["A", "B"]) input_frame["Value"] = [-1, -2, 1, -1, -1, -2, -1, 0] + input_frame["Value_Time"] = pd.to_datetime( + [ + "2023-12-31", # early + "2023-11-30", # early + "2024-02-01", # good + "2023-12-31", # early + "2023-12-31", # early + "2023-12-31", # early + "2023-11-30", # early + "2024-02-01", # good + ] + ) expected = pd.DataFrame({"Reference": [pd.Timestamp("2024-01-01")] * 2, "Group": ["A", "B"], "Value": [1, 0]}) actual = undertest.create_metric_timeseries( @@ -142,7 +154,7 @@ def test_bools_filter_out_invalid(self): def test_missing_does_not_count_toward_threshold(self): input_frame = create_frame([0, 1], ["A", "A"]) input_frame["Value"] = [-1, -2, -1, -1, -1, -2, -1, 0] - + input_frame["Value_Time"] = pd.to_datetime(["2023-12-31"] * 7 + ["2024-02-01"]) actual = undertest.create_metric_timeseries( input_frame, "Reference", "Value", ["EntityId"], "Group", censor_threshold=1, boolean_event=True ) diff --git a/tests/test_seismogram.py b/tests/test_seismogram.py index a6d920d..1820eda 100644 --- a/tests/test_seismogram.py +++ b/tests/test_seismogram.py @@ -43,7 +43,7 @@ def get_test_data(): "prediction": [1, 2, 3, 4], "time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], "event1_Value": [0, 1, 0, -1], - "event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], + "event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2021-12-31"], "event2_Value": [0, 1, 0, 1], "event2_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"], "cohort1": [1, 0, 1, 0],