addition of metrics useful for evaluating healthcare AI/ML models (ep…
eotles authored Aug 15, 2024
1 parent 50f0ba4 commit bef9f73
Showing 5 changed files with 162 additions and 59 deletions.
1 change: 1 addition & 0 deletions changelog/78.feature.rst
@@ -0,0 +1 @@
Add absolute risk reduction (ARR), number needed to treat (NNT), and net benefit with fixed rho=1/3 from the med_metrics package.
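
For orientation, a minimal sketch of the arithmetic this commit implements (example values assumed; rho is the estimated relative risk reduction, fixed here at 1/3):

    # NNT is the reciprocal of ARR, where ARR scales the per-threshold PPV by rho
    rho = 1 / 3      # estimated relative risk reduction
    ppv = 0.5        # "unit" ARR: the risk reduction a perfectly effective treatment would achieve
    arr = rho * ppv  # absolute risk reduction = 1/6
    nnt = 1 / arr    # number needed to treat = 6.0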
1 change: 1 addition & 0 deletions docs/reference/internals.rst
@@ -110,6 +110,7 @@ Performance
performance.assert_valid_performance_metrics_df
performance.calculate_bin_stats
performance.calculate_eval_ci
performance.calculate_nnt

Seismogram Loaders
~~~~~~~~~~~~~~~~~~
101 changes: 92 additions & 9 deletions src/seismometer/data/performance.py
@@ -10,9 +10,21 @@
from .confidence import PRConfidenceParam, ROCConfidenceParam, confidence_dict
from .decorators import export

DEFAULT_RHO = 1 / 3

PathLike = Union[str, Path]
COUNTS = ["TP", "FP", "TN", "FN"]
STATNAMES = COUNTS + ["Threshold", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]
STATNAMES = COUNTS + [
"Threshold",
"Accuracy",
"Sensitivity",
"Specificity",
"PPV",
"NPV",
"Flagged",
"LR+",
"NetBenefitScore",
]


@export
@@ -34,7 +46,14 @@ def assert_valid_performance_metrics_df(df: pd.DataFrame, needs_columns: list =
bool
Whether it is likely a valid performance metrics df.
"""
performance_columns = needs_columns or ["Threshold", "Accuracy", "Sensitivity", "Specificity", "PPV", "NPV"]
performance_columns = needs_columns or [
"Threshold",
"Accuracy",
"Sensitivity",
"Specificity",
"PPV",
"NPV",
]
if (df is None) or (not all([item in df.columns for item in performance_columns])):
raise ValueError(
"Passed performance frame does not have required columns: "
@@ -78,7 +97,8 @@ def calculate_bin_stats(
y_true = y_true[keep]
y_pred = y_pred[keep].round(5)

m = len(y_true)
n = len(y_true)

if not keep_score_values:
y_pred = as_percentages(y_pred)

@@ -115,22 +135,77 @@

lr = tpr / fpr

accuracy = (tps + (fps[-1] - fps)) / m
ppcr = (tps + fps) / m
# re-implementation of metrics from the med_metrics package; see calculate_nnt for the full citation
nnt = calculate_nnt(ppv)
nbs = (tps - fps * (thresholds / (100 - thresholds))) / n

accuracy = (tps + (fps[-1] - fps)) / n
ppcr = (tps + fps) / n

# NOTE: Don't set index to be threshold because it's a float and this
# makes lookup annoying due to tolerance settings
stats = pd.DataFrame(
np.column_stack(
(tps, fps, fps[-1] - fps, tps[-1] - tps, thresholds, accuracy, tpr, 1 - fpr, ppv, npv, ppcr, lr)
(
tps,
fps,
fps[-1] - fps,
tps[-1] - tps,
thresholds,
accuracy,
tpr,
1 - fpr,
ppv,
npv,
ppcr,
lr,
nbs,
nnt,
)
),
columns=STATNAMES,
columns=STATNAMES + [f"NNT@{DEFAULT_RHO:0.3n}"],
)

stats[COUNTS] = stats[COUNTS].fillna(0).astype(int) # Strengthen dtypes on counts
return stats
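
The NetBenefitScore introduced above weights false positives by the threshold odds t/(100 - t); a minimal sketch of that computation with toy values (arrays assumed, thresholds expressed as percentages):

    import numpy as np

    n = 100                                    # total scored entities
    tps = np.array([10, 8, 5])                 # true positives at each threshold
    fps = np.array([20, 10, 2])                # false positives at each threshold
    thresholds = np.array([10.0, 50.0, 90.0])  # percent thresholds

    # Matches the nbs line in calculate_bin_stats: (TP - FP * t/(100 - t)) / n
    nbs = (tps - fps * (thresholds / (100 - thresholds))) / n
    # array([ 0.07777778, -0.02, -0.13])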


@export
def calculate_nnt(arr: np.ndarray, rho: Optional[Number] = None) -> np.ndarray:
"""
Calculates NNT (Number Needed to Treat) given a 'unit' ARR (absolute risk reduction) and
the relative risk reduction.

This formulation and the related ARR and Net Benefit calculations are originally
from eotles/med_metrics [#med_metrics]_.

Parameters
----------
arr : np.ndarray
The array of absolute risk reductions for each threshold, assuming a rho of 1.
rho : Number, optional
The estimated relative risk reduction, by default DEFAULT_RHO (1/3).

Returns
-------
np.ndarray
The NNT for each threshold.

References
----------
.. [#med_metrics] eotles/med_metrics: Initial public release. Zenodo; 2024.
http://dx.doi.org/10.5281/ZENODO.10514448
"""
if rho is None:
rho = DEFAULT_RHO

# Divide by zero is ok: an ARR of 0 yields an NNT of inf
with np.errstate(invalid="ignore", divide="ignore"):
nnt = 1 / (rho * arr)

return nnt
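
A usage sketch for calculate_nnt (input values assumed): with the default rho of 1/3, a "unit" ARR of 0.5 yields an NNT of 6, and an ARR of 0 yields inf thanks to the errstate guard above.

    import numpy as np
    from seismometer.data.performance import calculate_nnt

    unit_arr = np.array([0.0, 0.25, 0.5, 1.0])
    calculate_nnt(unit_arr)           # [inf, 12., 6., 3.] with rho = 1/3
    calculate_nnt(unit_arr, rho=0.5)  # [inf,  8., 4., 2.]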


@export
def calculate_eval_ci(stats: pd.DataFrame, truth: pd.Series, output: pd.Series, conf: Number = 0.95) -> dict:
"""
@@ -167,11 +242,19 @@ def calculate_eval_ci(stats: pd.DataFrame, truth: pd.Series, output: pd.Series,
auc_interval = roc_conf.interval(roc_conf, truth, output)
# pr
aucpr_interval = aucpr_conf.interval(
aucpr_conf, metrics.auc(stats.Sensitivity, stats.PPV), stats[["TP", "FP", "TN", "FN"]].iloc[0].sum()
aucpr_conf,
metrics.auc(stats.Sensitivity, stats.PPV),
stats[["TP", "FP", "TN", "FN"]].iloc[0].sum(),
)

ci_data = {
"roc": {"Threshold": thresholds, "TPR": tpr, "FPR": fpr, "region": auc_region, "interval": auc_interval},
"roc": {
"Threshold": thresholds,
"TPR": tpr,
"FPR": fpr,
"region": auc_region,
"interval": auc_interval,
},
"pr": {"interval": aucpr_interval},
"conf": conf,
}
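
For context, a usage sketch tying together the two public entry points this file touches (synthetic data; the calculate_bin_stats signature is assumed from the hunks above):

    import numpy as np
    import pandas as pd
    from seismometer.data.performance import calculate_bin_stats, calculate_eval_ci

    rng = np.random.default_rng(0)
    truth = pd.Series(rng.integers(0, 2, size=200))   # binary labels
    output = pd.Series(rng.random(size=200))          # model scores in [0, 1]

    stats = calculate_bin_stats(truth, output)        # per-threshold stats, incl. NetBenefitScore
    ci = calculate_eval_ci(stats, truth, output, conf=0.95)
    sorted(ci)  # ['conf', 'pr', 'roc']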
48 changes: 29 additions & 19 deletions tests/data/test_cohorts.py
@@ -1,7 +1,10 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

import seismometer.data.cohorts as undertest
import seismometer.data.performance # NoQA - used in patching


def input_df():
@@ -17,24 +20,24 @@ def input_df():
def expected_df(cohorts):
data_rows = np.vstack(
(
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, "<1.0", 2, 1]] * 70,
[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0.5, 0, "<1.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, "<1.0", 2, 1]] * 21,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, "<1.0", 2, 1]] * 70,
[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0.5, 0, np.inf, "<1.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, "<1.0", 2, 1]] * 21,
# TP,FP,TN,FN, Acc,Sens,Spec, PPV, NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 2, 2, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, ">=1.0", 4, 2]] * 30,
[[1, 0, 2, 1, 0.75, 0.5, 1, 1, 2 / 3, 0.25, np.inf, ">=1.0", 4, 2]] * 10,
[[1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, ">=1.0", 4, 2]] * 10,
[[2, 1, 1, 0, 0.75, 1, 0.5, 2 / 3, 1, 0.75, 2, ">=1.0", 4, 2]] * 10,
[[2, 2, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, ">=1.0", 4, 2]] * 41,
[[0, 0, 2, 2, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, ">=1.0", 4, 2]] * 30,
[[1, 0, 2, 1, 0.75, 0.5, 1, 1, 2 / 3, 0.25, np.inf, 2, ">=1.0", 4, 2]] * 10,
[[1, 1, 1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, 4, ">=1.0", 4, 2]] * 10,
[[2, 1, 1, 0, 0.75, 1, 0.5, 2 / 3, 1, 0.75, 2, 3, ">=1.0", 4, 2]] * 10,
[[2, 2, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, ">=1.0", 4, 2]] * 41,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, "1.0-2.0", 2, 1]] * 50,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, "1.0-2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, "1.0-2.0", 2, 1]] * 41,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, "1.0-2.0", 2, 1]] * 50,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, 2, "1.0-2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, "1.0-2.0", 2, 1]] * 41,
# TP,FP,TN,FN, Acc,Sens,Spec,PPV,NPV, Flag, LR+, NNT@0.5, cohort,ct,tgtct
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, ">=2.0", 2, 1]] * 30,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, ">=2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, ">=2.0", 2, 1]] * 61,
[[0, 0, 1, 1, 0.5, 0.0, 1.0, 0.0, 0.5, 0.0, np.nan, np.inf, ">=2.0", 2, 1]] * 30,
[[1, 0, 1, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, np.inf, 2, ">=2.0", 2, 1]] * 10,
[[1, 1, 0, 0, 0.5, 1, 0, 0.5, 1, 1, 1, 4, ">=2.0", 2, 1]] * 61,
)
)

@@ -52,6 +55,7 @@ def expected_df(cohorts):
"NPV",
"Flagged",
"LR+",
"NNT@0.5",
"cohort",
"cohort-count",
"cohort-targetcount",
@@ -62,8 +66,8 @@
df[["TP", "FP", "TN", "FN", "cohort-count", "cohort-targetcount"]] = df[
["TP", "FP", "TN", "FN", "cohort-count", "cohort-targetcount"]
].astype(int)
df[["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]] = df[
["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+"]
df[["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+", "NNT@0.5"]] = df[
["Accuracy", "Sensitivity", "Specificity", "PPV", "NPV", "Flagged", "LR+", "NNT@0.5"]
].astype(float)

# reduce before creating category
@@ -72,19 +76,25 @@
return reduced_df


# We drop these as the values are threshold dependent and testing them would reimplement the formula.
# Instead we rely on the test cases in test_perf_stats.py.
THRESHOLD_DEPENDENT_COLUMNS = ["NetBenefitScore"]


@patch.object(seismometer.data.performance, "DEFAULT_RHO", 0.5)
class Test_Performance_Data:
def test_data_defaults(self):
df = input_df()
actual = undertest.get_cohort_performance_data(df, "tri", proba="col1", censor_threshold=0)

actual = actual.drop(columns=THRESHOLD_DEPENDENT_COLUMNS)
expected = expected_df(["<1.0", ">=1.0"])

pd.testing.assert_frame_equal(actual, expected, check_column_type=False, check_like=True, check_dtype=False)

def test_data_splits(self):
df = input_df()
actual = undertest.get_cohort_performance_data(df, "tri", proba="col1", splits=[1.0, 2.0], censor_threshold=0)

actual = actual.drop(columns=THRESHOLD_DEPENDENT_COLUMNS)
expected = expected_df(["<1.0", "1.0-2.0", ">=2.0"])

pd.testing.assert_frame_equal(actual, expected, check_column_type=False, check_like=True, check_dtype=False)
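
A note on the patched constant: the NNT column name is derived from DEFAULT_RHO by the f-string in calculate_bin_stats, which is why patching it to 0.5 makes the tests expect an "NNT@0.5" column (a quick check of the formatting, values assumed):

    DEFAULT_RHO = 0.5
    f"NNT@{DEFAULT_RHO:0.3n}"  # 'NNT@0.5' -- the column name under the patch
    f"NNT@{1 / 3:0.3n}"        # 'NNT@0.333' -- with the unpatched default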
