addition of metrics useful for evaluating healthcare AI/ML models (ep…
eotles authored Aug 15, 2024
1 parent 50f0ba4 commit bef9f73
Showing 5 changed files with 162 additions and 59 deletions.
1 change: 1 addition & 0 deletions changelog/78.feature.rst
@@ -0,0 +1 @@
Add absolute risk reduction (ARR), number needed to treat (NNT), and net benefit with fixed rho=1/3 from the med_metrics package.
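
For orientation, a minimal sketch of the arithmetic this commit implements (example values assumed; rho is the estimated relative risk reduction, fixed here at 1/3):

    # NNT is the reciprocal of ARR, where ARR scales the per-threshold PPV by rho
    rho = 1 / 3      # estimated relative risk reduction
    ppv = 0.5        # "unit" ARR: the risk reduction a perfectly effective treatment would achieve
    arr = rho * ppv  # absolute risk reduction = 1/6
    nnt = 1 / arr    # number needed to treat = 6.0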
1 change: 1 addition & 0 deletions docs/reference/internals.rst
@@ -110,6 +110,7 @@ Performance
performance.assert_valid_performance_metrics_df
performance.calculate_bin_stats
performance.calculate_eval_ci
performance.calculate_nnt

Seismogram Loaders
~~~~~~~~~~~~~~~~~~
101 changes: 92 additions & 9 deletions src/seismometer/data/performance.py
@@ -10,9 +10,21 @@
from .confidence import PRConfidenceParam, ROCConfidenceParam, confidence_dict
from .decorators import export

DEFAULT_RHO = 1 / 3

PathLike = Union[str, Path]
COUNTS = ["TP", "FP", "TN", "FN"]
STATNAMES = COUNTS + ["Threshold", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]
STATNAMES = COUNTS + [
"Threshold",
"Accuracy",
"Sensitivity",
"Specificity",
"PPV",
"NPV",
"Flagged",
"LR+",
"NetBenefitScore",
]


@export
@@ -34,7 +46,14 @@ def assert_valid_performance_metrics_df(df: pd.DataFrame, needs_columns: list =
bool
Whether it is likely a valid performance metrics df.
"""
performance_columns = needs_columns or ["Threshold", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV"]
performance_columns = needs_columns or [
"Threshold",
"Accuracy",
"Sensitivity",
"Specificity",
"PPV",
"NPV",
]
if (df is None) or (not all([item in df.columns for item in performance_columns])):
raise ValueError(
"Passed performance frame does not have required columns: "
@@ -78,7 +97,8 @@ def calculate_bin_stats(
y_true = y_true[keep]
y_pred = y_pred[keep].round(5)

m = len(y_true)
n = len(y_true)

if not keep_score_values:
y_pred = as_percentages(y_pred)

@@ -115,22 +135,77 @@

lr = tpr / fpr

accuracy = (tps + (fps[-1] - fps)) / m
ppcr = (tps + fps) / m
# re-implementation of metrics from the med_metrics package; see calculate_nnt for the full citation
nnt = calculate_nnt(ppv)
nbs = (tps - fps * (thresholds / (100 - thresholds))) / n

accuracy = (tps + (fps[-1] - fps)) / n
ppcr = (tps + fps) / n

# NOTE: Don't set index to be threshold because it's a float and this
# makes lookup annoying due to tolerance settings
stats = pd.DataFrame(
np.column_stack(
(tps, fps, fps[-1] - fps, tps[-1] - tps, thresholds, accuracy, tpr, 1 - fpr, ppv, npv, ppcr, lr)
(
tps,
fps,
fps[-1] - fps,
tps[-1] - tps,
thresholds,
accuracy,
tpr,
1 - fpr,
ppv,
npv,
ppcr,
lr,
nbs,
nnt,
)
),
columns=STATNAMES,
columns=STATNAMES + [f"NNT@{DEFAULT_RHO:0.3n}"],
)

stats[COUNTS] = stats[COUNTS].fillna(0).astype(int) # Strengthen dtypes on counts
return stats
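
The NetBenefitScore introduced above weights false positives by the threshold odds t/(100 - t); a minimal sketch of that computation with toy values (arrays assumed, thresholds expressed as percentages):

    import numpy as np

    n = 100                                    # total scored entities
    tps = np.array([10, 8, 5])                 # true positives at each threshold
    fps = np.array([20, 10, 2])                # false positives at each threshold
    thresholds = np.array([10.0, 50.0, 90.0])  # percent thresholds

    # Matches the nbs line in calculate_bin_stats: (TP - FP * t/(100 - t)) / n
    nbs = (tps - fps * (thresholds / (100 - thresholds))) / n
    # array([ 0.07777778, -0.02, -0.13])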


@export
def calculate_nnt(arr: np.ndarray, rho: Optional[Number] = None) -> np.ndarray:
"""
Calculates NNT (Number Needed to Treat) given a 'unit' ARR (absolute risk reduction) and
the relative risk reduction.

This formulation and the related ARR and Net Benefit calculations are originally
from eotles/med_metrics [#med_metrics]_.

Parameters
----------
arr : np.ndarray
The array of absolute risk reductions for each threshold, assuming a rho of 1.
rho : Number, optional
The estimated relative risk reduction, by default DEFAULT_RHO (1/3).

Returns
-------
np.ndarray
The NNT for each threshold.

References
----------
.. [#med_metrics] eotles/med_metrics: Initial public release. Zenodo; 2024.
http://dx.doi.org/10.5281/ZENODO.10514448
"""
if rho is None:
rho = DEFAULT_RHO

# Divide by zero is ok: an ARR of 0 yields an NNT of inf
with np.errstate(invalid="ignore", divide="ignore"):
nnt = 1 / (rho * arr)

return nnt
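
A usage sketch for calculate_nnt (input values assumed): with the default rho of 1/3, a "unit" ARR of 0.5 yields an NNT of 6, and an ARR of 0 yields inf thanks to the errstate guard above.

    import numpy as np
    from seismometer.data.performance import calculate_nnt

    unit_arr = np.array([0.0, 0.25, 0.5, 1.0])
    calculate_nnt(unit_arr)           # [inf, 12., 6., 3.] with rho = 1/3
    calculate_nnt(unit_arr, rho=0.5)  # [inf,  8., 4., 2.]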


@export
def calculate_eval_ci(stats: pd.DataFrame, truth: pd.Series, output: pd.Series, conf: Number = 0.95) -> dict:
"""
@@ -167,11 +242,19 @@ def calculate_eval_ci(stats: pd.DataFrame, truth: pd.Series, output: pd.Series,
auc_interval = roc_conf.interval(roc_conf, truth, output)
# pr
aucpr_interval = aucpr_conf.interval(
aucpr_conf, metrics.auc(stats.Sensitivity, stats.PPV), stats[["TP", "FP", "TN", "FN"]].iloc[0].sum()
aucpr_conf,
metrics.auc(stats.Sensitivity, stats.PPV),
stats[["TP", "FP", "TN", "FN"]].iloc[0].sum(),
)

ci_data = {
"roc": {"Threshold": thresholds, "TPR": tpr, "FPR": fpr, "region": auc_region, "interval": auc_interval},
"roc": {
"Threshold": thresholds,
"TPR": tpr,
"FPR": fpr,
"region": auc_region,
"interval": auc_interval,
},
"pr": {"interval": aucpr_interval},
"conf": conf,
}
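
For context, a usage sketch tying together the two public entry points this file touches (synthetic data; the calculate_bin_stats signature is assumed from the hunks above):

    import numpy as np
    import pandas as pd
    from seismometer.data.performance import calculate_bin_stats, calculate_eval_ci

    rng = np.random.default_rng(0)
    truth = pd.Series(rng.integers(0, 2, size=200))   # binary labels
    output = pd.Series(rng.random(size=200))          # model scores in [0, 1]

    stats = calculate_bin_stats(truth, output)        # per-threshold stats, incl. NetBenefitScore
    ci = calculate_eval_ci(stats, truth, output, conf=0.95)
    sorted(ci)  # ['conf', 'pr', 'roc']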
48 changes: 29 additions & 19 deletions tests/data/test_cohorts.py
@@ -1,7 +1,10 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

import seismometer.data.cohorts as undertest
import seismometer.data.performance # NoQA - used in patching


def input_df():
@@ -17,24 +20,24 @@ def input_df():
def expected_df(cohorts):
data_rows = np.vstack(
(
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, "<1.0", 2, 1]] * 70,
[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0.5, 0, "<1.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, "<1.0", 2, 1]] * 21,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, "<1.0", 2, 1]] * 70,
[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0.5, 0, np.inf, "<1.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, "<1.0", 2, 1]] * 21,
# TP,FP,TN,FN, Acc,Sens,Spec, PPV, NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 2, 2, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, ">=1.0", 4, 2]] * 30,
[[1, 0, 2, 1, 0.75, 0.5, 1, 1, 2 / 3, 0.25, np.inf, ">=1.0", 4, 2]] * 10,
[[1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, ">=1.0", 4, 2]] * 10,
[[2, 1, 1, 0, 0.75, 1, 0.5, 2 / 3, 1, 0.75, 2, ">=1.0", 4, 2]] * 10,
[[2, 2, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, ">=1.0", 4, 2]] * 41,
[[0, 0, 2, 2, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, ">=1.0", 4, 2]] * 30,
[[1, 0, 2, 1, 0.75, 0.5, 1, 1, 2 / 3, 0.25, np.inf, 2, ">=1.0", 4, 2]] * 10,
[[1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, 4, ">=1.0", 4, 2]] * 10,
[[2, 1, 1, 0, 0.75, 1, 0.5, 2 / 3, 1, 0.75, 2, 3, ">=1.0", 4, 2]] * 10,
[[2, 2, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, ">=1.0", 4, 2]] * 41,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, "1.0-2.0", 2, 1]] * 50,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, "1.0-2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, "1.0-2.0", 2, 1]] * 41,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, "1.0-2.0", 2, 1]] * 50,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, 2, "1.0-2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, "1.0-2.0", 2, 1]] * 41,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, ">=2.0", 2, 1]] * 30,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, ">=2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, ">=2.0", 2, 1]] * 61,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, ">=2.0", 2, 1]] * 30,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, 2, ">=2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, ">=2.0", 2, 1]] * 61,
)
)

@@ -52,6 +55,7 @@ def expected_df(cohorts):
"NPV",
"Flagged",
"LR+",
"NNT@0.5",
"cohort",
"cohort-count",
"cohort-targetcount",
@@ -62,8 +66,8 @@
df[["TP", "FP", "TN", "FN", "cohort-count", "cohort-targetcount"]] = df[
["TP", "FP", "TN", "FN", "cohort-count", "cohort-targetcount"]
].astype(int)
df[["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]] = df[
["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]
df[["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+", "NNT@0.5"]] = df[
["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+", "NNT@0.5"]
].astype(float)

# reduce before creating category
@@ -72,19 +76,25 @@
return reduced_df


# We drop these as the values are threshold dependent and testing them would reimplement the formula.
# Instead we rely on the test cases in test_perf_stats.py.
THRESHOLD_DEPENDENT_COLUMNS = ["NetBenefitScore"]


@patch.object(seismometer.data.performance, "DEFAULT_RHO", 0.5)
class Test_Performance_Data:
def test_data_defaults(self):
df = input_df()
actual = undertest.get_cohort_performance_data(df, "tri", proba="col1", censor_threshold=0)

actual = actual.drop(columns=THRESHOLD_DEPENDENT_COLUMNS)
expected = expected_df(["<1.0", ">=1.0"])

pd.testing.assert_frame_equal(actual, expected, check_column_type=False, check_like=True, check_dtype=False)

def test_data_splits(self):
df = input_df()
actual = undertest.get_cohort_performance_data(df, "tri", proba="col1", splits=[1.0, 2.0], censor_threshold=0)

actual = actual.drop(columns=THRESHOLD_DEPENDENT_COLUMNS)
expected = expected_df(["<1.0", "1.0-2.0", ">=2.0"])

pd.testing.assert_frame_equal(actual, expected, check_column_type=False, check_like=True, check_dtype=False)
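
A note on the patched constant: the NNT column name is derived from DEFAULT_RHO by the f-string in calculate_bin_stats, which is why patching it to 0.5 makes the tests expect an "NNT@0.5" column (a quick check of the formatting, values assumed):

    DEFAULT_RHO = 0.5
    f"NNT@{DEFAULT_RHO:0.3n}"  # 'NNT@0.5' -- the column name under the patch
    f"NNT@{1 / 3:0.3n}"        # 'NNT@0.333' -- with the unpatched default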
