feat: 52 added a module to calculate basic epidemiological indicators (…

…#172) * Added function to calculate Incidence rate * Added risk ratio function * Added type annotations * minor changes
thegraphnetwork · Sep 22, 2022 · 6f76dc3 · 6f76dc3
1 parent b26f27c
commit 6f76dc3
Show file tree

Hide file tree

Showing 12 changed files with 248 additions and 407 deletions.
diff --git a/docs/tutorials/forecast_switzerland/forecast_swiss.py b/docs/tutorials/forecast_switzerland/forecast_swiss.py
@@ -102,7 +102,6 @@ def train_eval_single_canton(
     predict_n=14,
     look_back=14,
 ):
-
     """
     Function to train and evaluate the model for one georegion.
 
@@ -410,7 +409,6 @@ def train_all_cantons(
             )
 
             if any(df_c[target_name] > 1):
-
                 ngb_m.train(
                     target_name,
                     df_c,

diff --git a/docs/tutorials/forecast_switzerland/train_models.py b/docs/tutorials/forecast_switzerland/train_models.py
@@ -11,7 +11,6 @@
     path="saved_models_dash",
 )
 
-
 target_curve_name = "total_hosp"
 predictors = ["foph_test_d", "foph_cases_d", "foph_hosp_d", "foph_hospcapacity_d"]
 ini_date = "2020-05-01"

diff --git a/epigraphhub/analysis/clustering.py b/epigraphhub/analysis/clustering.py
@@ -40,7 +40,7 @@ def get_lag(
         x = pd.Series(x).rolling(7).mean().dropna().values
         y = pd.Series(y).rolling(7).mean().dropna().values
     corr = correlate(x, y, mode="full") / np.sqrt(np.dot(x, x) * np.dot(y, y))
-    slice = np.s_[(len(corr) - maxlags) // 2 : -(len(corr) - maxlags) // 2]
+    slice = np.s_[(len(corr) - maxlags) // 2: -(len(corr) - maxlags) // 2]
     corr = corr[slice]
     lags = correlation_lags(x.size, y.size, mode="full")
     lags = lags[slice]
@@ -327,7 +327,6 @@ def plot_clusters(
     if normalize:
 
         for i in inc_canton.columns:
-
             inc_canton[i] = inc_canton[i] / max(inc_canton[i])
 
     figs = []

diff --git a/epigraphhub/analysis/epistats.py b/epigraphhub/analysis/epistats.py
@@ -1,14 +1,13 @@
 from typing import Union
 
-import arviz as az
 import numpy as np
 import pandas as pd
-import plotly.express as px
-import pymc3 as pm
 import scipy.stats as st
+from scipy.stats.contingency import relative_risk
+from scipy.stats._result_classes import RelativeRiskResult
 
 
-def prevalence(pop_size: int, positives: int, a: float = 1, b: float = 1) -> object:
+def posterior_prevalence(pop_size: int, positives: int, a: float = 1, b: float = 1) -> st.rv_continuous:
     """
     Returns the Bayesian posterior prevalence of a disease for a point in time.
     It assumes number of cases follow a binomial distribution with probability described as a beta(a,b) distribution
@@ -23,203 +22,63 @@ def prevalence(pop_size: int, positives: int, a: float = 1, b: float = 1) -> obj
     b : float, optional
         prior beta parameter beta, by default 1
 
-    Returns
-    -------
-    object
-        Returns a scipy stats frozen beta distribution that represents the posterior probability of the prevalence
+    Args:
+        pop_size: population size
+        positives: number of positives
+        a: prior beta parameter alpha
+        b: prior beta parameter beta
+
+    Returns:
+        object: Returns a scipy stats frozen beta distribution that represents the posterior probability of the prevalence
     """
     a, b = 1, 1  # prior beta parameters
     pa = a + positives
     pb = b + pop_size - positives
     return st.beta(pa, pb)
 
 
-def inf_pos_prob_cases_hosp(
-    df: pd.DataFrame,
-    alpha: float = 0.5,
-    beta: float = 0.5,
-    draws: int = 2000,
-    tune: int = 500,
-) -> az.data.inference_data.InferenceData:
-    """
-    This function compute the posterior probability distribution for the prevalence of cases and the probability of hospitalization over time.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        It takes as input a dataframe with four columns:
-            - cases: Number of cases over time.
-            - hospiotalizations: Number of hospitalizations over time.
-            - tests: Number of tests over time.
-            - tests_pos: Proportion of the positive tests over time.
-        This data frame must have a datetime index.
-    alpha:float
-        The alpha parameter of the Beta distribution
-    beta:float
-        The beta parameter of the Beta distribution
-    draws: int
-        The number of samples to draw.
-    tune: int
-        Number of iterations to tune. Samplers adjust the step sizes,
-        scalings or similar during tuning. Tuning samples will be drawn in addition to the number specified in the
-        draws argument, and will be discarded.
-
-    Returns
-    -------
-    az.data.inference_data.InferenceData
-        An array with the posterior probabilities infered.
-    """
-
-    with pm.Model() as var_bin:
-        prev = pm.Beta("prevalence", alpha, beta, shape=len(df["cases"]))
-
-        cases = pm.Binomial("cases", n=df["tests"].values, p=prev, observed=df["cases"])
-
-        probs = pm.Beta("phosp", alpha, beta, shape=len(df["cases"]))
-
-        hosp = pm.Binomial(
-            "hospitalizations", n=df["cases"], p=probs, observed=df["hospitalizations"]
-        )
-
-    with var_bin:
-        tracevb = pm.sample(draws, tune=tune, return_inferencedata=True)
-
-    return tracevb
-
-
-def plot_pos_prob_prev(
-    df: pd.DataFrame,
-    tracevb: az.data.inference_data.InferenceData,
-    ci: bool = False,
-    save: bool = False,
-    name: Union[str, None] = None,
-    plot: bool = True,
-):
+@np.vectorize
+def incidence_rate(pop_size: int, new_cases: int, scaling: float = 1e5) -> Union[float, np.ndarray, np.ndarray]:
     """
-    This function plots the posterior probability distribution of the prevalence
-    generated with the `inf_pos_prob_cases_hosp()` function.
-
+    incidence is defined as the number of new cases in a population over a period of time, typically 1 year. The incidence rate is also usually scale to 100k people to facilitate comparisons between localities with different populations.
     Parameters
     ----------
-    df : pd.DataFrame
-        A dataframe with four columns:
-        - cases: Number of cases over time.
-        - hospitalizations: Number of hospitalizations over time.
-        - tests: Number of tests over time.
-        - tests_pos: Proportion of the positive tests over time.
-        This data frame must have a datetime index.
-    tracevb : az.data.inference_data.InferenceData
-        the return of the inf_pos_prob_cases_hosp() function.
-    ci : bool, optional
-        If True the confidence interval is computed, by default False
-    save : bool, optional
-        If True the plot is saved, by default False
+    pop_size: population pop_size
+    new_cases: number of new cases observed in the period
+    scaling: number to scale the rate to. If ommitted, the rate is return as cases per 100k.
 
     Returns
     -------
-    fig
-        A plotly figure.
+    A float or a np.ndarray of floats
+
+    Examples
+    --------
+    >>> incidence_rate(1000, 5)
+    500.0
+    >>> incidence_rate([1000,5000,10000], [5,5,5])
+    array([500, 100, 50])
     """
+    IR = new_cases / pop_size * scaling
+    return IR
 
-    Prev_post = pd.DataFrame(
-        index=df["cases"].index,
-        data={
-            "median": tracevb.posterior.prevalence.median(axis=(0, 1)),
-            "lower": np.percentile(tracevb.posterior.prevalence, 2.5, axis=(0, 1)),
-            "upper": np.percentile(tracevb.posterior.prevalence, 97.5, axis=(0, 1)),
-        },
-    )
-
-    fig = px.line(Prev_post.rolling(7).mean().dropna())
-
-    if ci:
-        fig.add_scatter(
-            x=Prev_post.index,
-            y=Prev_post.lower,
-            mode="none",
-            fill="tonexty",
-            name="95% CI",
-        )
-
-    fig.add_scatter(
-        x=df["tests_pos"].index,
-        y=df["tests_pos"].values,
-        name="Test positivity",
-        mode="markers",
-    )
-    fig.update_layout(
-        title="Estimated prevalence of COVID",
-        yaxis_title="Prevalence of infected",
-        xaxis_title="Time (days)",
-    )
-    if save:
-        if name == None:
-            fig.write_image("prevalence_est.png", scale=3)
-        else:
-            fig.write_image(f"{name}.png", scale=3)
-
-    if plot:
-        fig.show()
 
-    return fig
-
-
-def plot_pos_prob_hosp(
-    df: pd.DataFrame,
-    tracevb: az.data.inference_data.InferenceData,
-    save: bool = False,
-    name: Union[str, None] = None,
-    plot: bool = False,
-):
+def risk_ratio(exposed_cases: int, exposed_total: int, control_cases: int, control_total: int) -> RelativeRiskResult:
     """
-    This function plots the posterior probability distribution of hospitalization
-    generated with the `inf_pos_prob_cases_hosp()` function.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        A dataframe with four columns:
-        - cases: Number of cases over time.
-        - hospitalizations: Number of hospitalizations over time.
-        - tests: Number of tests over time.
-        - tests_pos: Proportion of the positive tests over time.
-        This data frame must have a datetime index.
-    tracevb : az.data.inference_data.InferenceData
-        the return of the inf_pos_prob_cases_hosp() function.
-    ci : bool, optional
-        If True the confidence interval is computed, by default False
-    save : bool, optional
-        If True the plot is saved, by default False
-
-    Returns
-    -------
-    fig
-        A plotly figure.
+    Also known as relative risk, computed the risk of contracting a disease given exposure to a risk factor.
+    Parameters:
+        exposed_cases: number of cases in the exposed group
+        exposed_total: size of the exposed group
+        control_cases: number of cases in the control group
+        control_total: size of the control group
+    Returns:
+    RelativeRiskResult object
+
+    Examples:
+    >>> rr = risk_ratio(27, 122, 44, 487)
+    >>> rr.relative_risk
+    2.4495156482861398
+    >>> rr.confidence_interval(confidence_level=0.95)
+    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
     """
-
-    Phosp_post = pd.DataFrame(
-        index=df.index,
-        data={
-            "median": tracevb.posterior.phosp.median(axis=(0, 1)),
-            "lower": np.percentile(tracevb.posterior.phosp, 2.5, axis=(0, 1)),
-            "upper": np.percentile(tracevb.posterior.phosp, 97.5, axis=(0, 1)),
-        },
-    )
-
-    fig = px.line(Phosp_post.rolling(7).mean().dropna())
-
-    fig.update_layout(
-        title="Estimated Probability of Hospitalization",
-        yaxis_title="probability",
-    )
-
-    if plot:
-        fig.show()
-
-    if save:
-        if name == None:
-            fig.write_image("prob_hosp_est.png", scale=3)
-        else:
-            fig.write_image(f"{name}.png", scale=3)
-
-    return fig
+    rr = relative_risk(exposed_cases, exposed_total, control_cases, control_total)
+    return rr
diff --git a/epigraphhub/analysis/preprocessing.py b/epigraphhub/analysis/preprocessing.py
@@ -145,7 +145,6 @@ def get_next_n_days(ini_date: str, next_days: int) -> list:
     a = datetime.strptime(ini_date, "%Y-%m-%d")
 
     for i in np.arange(1, next_days + 1):
-
         d_i = datetime.strftime(a + timedelta(days=int(i)), "%Y-%m-%d")
 
         next_dates.append(datetime.strptime(d_i, "%Y-%m-%d"))
@@ -195,7 +194,7 @@ def lstm_split_data(
     data = np.empty((n_ts, look_back + predict_n, df.shape[1]))
     for i in range(n_ts):  # - predict_):
         #         print(i, df[i: look_back+i+predict_n,0])
-        data[i, :, :] = df[i : look_back + i + predict_n, :]
+        data[i, :, :] = df[i: look_back + i + predict_n, :]
     # train_size = int(n_ts * ratio)
     train_size = int(df.shape[0] * ratio) - look_back - predict_n + 1
     # print(train_size)

diff --git a/epigraphhub/data/epigraphhub_db.py b/epigraphhub/data/epigraphhub_db.py
@@ -126,7 +126,6 @@ def get_data_by_location(
         # separe the columns by comma to apply in the sql query
         s_columns = ""
         for i in columns:
-
             s_columns = s_columns + i + ","
 
         s_columns = s_columns[:-1]
@@ -135,7 +134,6 @@ def get_data_by_location(
         query = f"select {s_columns} from {schema}.{table_name}"
 
     if len(loc) == 1:
-
         query = f"select {s_columns} from {schema}.{table_name} where {loc_column} = '{loc[0]}' ;"
 
     if len(loc) > 1 and loc != "All":