Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove explicit 'invalid value' imputation #100

Merged
merged 3 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/100.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove remaining references to -1 'invalidation'; validate directly on time comparison when needed
2 changes: 1 addition & 1 deletion docs/reference/internals.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Pandas Helpers
pandas_helpers.event_value
pandas_helpers.post_process_event
pandas_helpers.merge_windowed_event
pandas_helpers.valid_event
pandas_helpers.is_valid_event
pandas_helpers.try_casting

Performance
Expand Down
19 changes: 8 additions & 11 deletions src/seismometer/data/pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ def merge_windowed_event(
Joins on a set of keys and associates the first event occurring after the prediction time. The following special
cases are also applied:

Invalidate late predictions - if a prediction occurs after all recorded events of the type, the prediction is
considered invalid wrt to the event and the _Value is set to -1.
Early predictions drop timing - if a prediction occurs before all recorded events of the type, the label is kept
for analyses but the time is removed.
Imputation of no event to negative label - if no row in the events frame is present for the prediction keys, it is
Expand All @@ -62,8 +60,6 @@ def merge_windowed_event(
events dataframes.
min_leadtime_hrs : Number, optional
The number of hour offset to be required for prediction, by default 0.
If set to 1, a prediction made within the hour before the last associated event will be invalidated and set
to -1 even though it occurred before the event time.
window_hrs : Optional[Number], optional
The number of hours the window of predictions of interest should be limited to, by default None.
If None, then all predictions occurring before a known event will be included.
Expand Down Expand Up @@ -165,10 +161,6 @@ def merge_windowed_event(
impute_val_no_time=impute_val_no_time,
)

# refactor to generalize
if merge_strategy == "forward": # For forward merges, don't count events that happen before the prediction
predictions.loc[predictions[predtime_col] > predictions[r_ref], event_val_col] = -1

return predictions.drop(columns=r_ref)


Expand Down Expand Up @@ -543,9 +535,14 @@ def event_value_name(event_value: str) -> str:
return val


def valid_event(dataframe: pd.DataFrame, event: str) -> pd.DataFrame:
"""Filters a dataframe to valid predictions, where the event value has not set to -1."""
return dataframe[dataframe[event_value(event)] >= 0]
def is_valid_event(dataframe: pd.DataFrame, event: str, ref: str) -> pd.DataFrame:
gbowlin marked this conversation as resolved.
Show resolved Hide resolved
"""
Creates a mask excluding rows (False) where the event occurs before the reference time.
If the comparison cannot be made, all rows will be considered valid (True).
"""
if event_time(event) not in dataframe.columns or ref not in dataframe.columns:
return pd.Series([True] * len(dataframe), index=dataframe.index)
return dataframe[ref] <= dataframe[event_time(event)]


def _resolve_time_col(dataframe: pd.DataFrame, ref_event: str) -> str:
Expand Down
6 changes: 4 additions & 2 deletions src/seismometer/data/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import pandas as pd

from .pandas_helpers import is_valid_event


def create_metric_timeseries(
dataframe: pd.DataFrame,
Expand Down Expand Up @@ -77,9 +79,9 @@ def _limit_data(
) -> pd.DataFrame:
"""Reduces the data to only include valid data points."""
include = dataframe[reftime].notna()
# boolean events - filter out -1 where invalid

if boolean_event:
include = include & (dataframe[event_col] >= 0)
include = include & (is_valid_event(dataframe, event_col, reftime))
gbowlin marked this conversation as resolved.
Show resolved Hide resolved
reduced = dataframe[include]

if time_bounds is None:
Expand Down
4 changes: 2 additions & 2 deletions src/seismometer/seismogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def data(self, event: Optional[str] = None) -> pd.DataFrame:
if event_time in self.dataframe:
return self.dataframe[self._data_mask(event_val) & self._time_mask(event_time)]

return self.dataframe[self._data_mask(event_val)]
return self.dataframe

def score_bins(self):
"""Updates the active values for notebook-scoped selections."""
Expand Down Expand Up @@ -392,7 +392,7 @@ def create_cohorts(self) -> None:

@lru_cache
def _data_mask(self, event_val):
return self.dataframe[event_val] != -1
return pdh.is_valid_event(self.dataframe, event_val, self.predict_time)

@lru_cache
def _time_mask(self, event_time, keep_zero: str = None):
Expand Down
16 changes: 14 additions & 2 deletions tests/data/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,21 @@ def test_data_limited_to_time_bound(self):
compare = actual.sort_values(by=["Reference", "Group", "Value"]).reset_index(drop=True)
pdt.assert_frame_equal(compare, expected)

def test_bools_filter_out_invalid(self):
def test_bools_filter_out_early(self):
input_frame = create_frame([0, 1], ["A", "B"])
input_frame["Value"] = [-1, -2, 1, -1, -1, -2, -1, 0]
input_frame["Value_Time"] = pd.to_datetime(
[
"2023-12-31", # early
"2023-11-30", # early
"2024-02-01", # good
"2023-12-31", # early
"2023-12-31", # early
"2023-12-31", # early
"2023-11-30", # early
"2024-02-01", # good
]
)
expected = pd.DataFrame({"Reference": [pd.Timestamp("2024-01-01")] * 2, "Group": ["A", "B"], "Value": [1, 0]})

actual = undertest.create_metric_timeseries(
Expand All @@ -142,7 +154,7 @@ def test_bools_filter_out_invalid(self):
def test_missing_does_not_count_toward_threshold(self):
input_frame = create_frame([0, 1], ["A", "A"])
input_frame["Value"] = [-1, -2, -1, -1, -1, -2, -1, 0]

input_frame["Value_Time"] = pd.to_datetime(["2023-12-31"] * 7 + ["2024-02-01"])
actual = undertest.create_metric_timeseries(
input_frame, "Reference", "Value", ["EntityId"], "Group", censor_threshold=1, boolean_event=True
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_seismogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_test_data():
"prediction": [1, 2, 3, 4],
"time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"],
"event1_Value": [0, 1, 0, -1],
"event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"],
"event1_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2021-12-31"],
"event2_Value": [0, 1, 0, 1],
"event2_Time": ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"],
"cohort1": [1, 0, 1, 0],
Expand Down