From 3e29dabfcef3c0764d5518a1992e873a64079f52 Mon Sep 17 00:00:00 2001 From: Reese Richardson Date: Tue, 12 Jan 2021 21:48:50 -0600 Subject: [PATCH] Added negative log-likelihood assignment function --- data_processing/trace_selection.py | 150 +++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 data_processing/trace_selection.py diff --git a/data_processing/trace_selection.py b/data_processing/trace_selection.py new file mode 100644 index 00000000..a69cc881 --- /dev/null +++ b/data_processing/trace_selection.py @@ -0,0 +1,150 @@ +""" +Compare COVID-19 simulation outputs to data. +Used for spatial - covidregion - model +""" +import argparse +import os +import pandas as pd +import numpy as np +import scipy.stats +import sys + +sys.path.append('../') +from load_paths import load_box_paths +from datetime import date, timedelta, datetime +from processing_helpers import * + +def parse_args(): + + description = "Simulation run for modeling Covid-19" + parser = argparse.ArgumentParser(description=description) + + parser.add_argument( + "-s", + "--stem", + type=str, + help="Name of simulation experiment" + ) + parser.add_argument( + "-loc", + "--Location", + type=str, + help="Local or NUCLUSTER", + default = "Local" + ) + parser.add_argument( + "-t", "--trajectoriesName", + type=str, + help="Name of trajectoriesDat file, trajectoriesDat.csv or trajectoriesDat_trim.csv", + default='trajectoriesDat.csv' + ) + parser.add_argument( + "--deaths_weight", + type=float, + help="Weight of deaths in negative log likelihood calculation. Default is 1.0.", + default=1.0 + ) + parser.add_argument( + "--crit_weight", + type=float, + help="Weight of ICU population in negative log likelihood calculation. Default is 1.0.", + default=1.0 + ) + parser.add_argument( + "--non_icu_weight", + type=float, + help="Weight of non-ICU population in negative log likelihood calculation. Default is 1.0.", + default=1.0 + ) + parser.add_argument( + "--cli_weight", + type=float, + help="Weight of CLI admissions in negative log likelihood calculation. Default is 1.0.", + default=1.0 + ) + return parser.parse_args() + +def sum_nll(df_values, ref_df_values): + try: + x = -np.log10(scipy.stats.poisson(mu=df_values).pmf(k=ref_df_values)) + except ValueError: + print('ERROR: The simulation and reference arrays may not be the same length.') + print('Length simulation: ' + str(len(df_values))) + print('Length reference: ' + str(len(ref_df_values))) + x[np.abs(x) == np.inf] = 0 + return np.sum(x) + +def compare_sim_and_ref(df, ems_nr, ref_df, channels, data_channel_names, titles, region_label, + first_day, last_day, ymax=10000, logscale=True, weights_array=[1.0,1.0,1.0,1.0]): + + [deaths_weight, crit_weight, non_icu_weight, cli_weight] = weights_array + ref_df_trunc = ref_df[(ref_df['date'] > first_day) & (ref_df['date'] < last_day)] + df_trunc = df[(df['date'] > first_day) & (df['date'] < last_day)] + run_sample_scen_list = list(df_trunc.groupby(['run_num','sample_num','scen_num']).size().index) + rank_export_df = pd.DataFrame({'run_num':[], 'sample_num':[], 'scen_num':[], 'nll':[]}) + for x in run_sample_scen_list: + total_nll = 0 + (run_num, sample_num, scen_num) = x + df_trunc_slice = df_trunc[(df_trunc['run_num'] == run_num) & (df_trunc['sample_num'] == sample_num) & (df_trunc['scen_num'] == scen_num)] + total_nll += deaths_weight*sum_nll(df_trunc_slice['new_detected_deaths'].values, ref_df_trunc['deaths'].values) + total_nll += crit_weight*sum_nll(df_trunc_slice['crit_det'].values, ref_df_trunc['confirmed_covid_icu'].values) + total_nll += cli_weight*sum_nll(df_trunc_slice['new_detected_hospitalized'].values, ref_df_trunc['inpatient'].values) + total_nll += non_icu_weight*sum_nll(df_trunc_slice['hosp_det'].values, ref_df_trunc['covid_non_icu'].values) + rank_export_df = rank_export_df.append(pd.DataFrame({'run_num':[run_num], 'sample_num':[sample_num], 'scen_num':[scen_num], 'nll':[total_nll]})) + rank_export_df['norm_rank'] = (rank_export_df['nll'].rank()-1)/(len(rank_export_df)-1) + rank_export_df = rank_export_df.sort_values(by=['norm_rank']).reset_index(drop=True) + rank_export_df.to_csv(os.path.join(output_path,'traces_ranked_region_' + str(ems_nr) + '.csv'), index=False) + + +def compare_ems(exp_name,fname, ems_nr,first_day,last_day,weights_array): + + if ems_nr == 0: + region_suffix = "_All" + region_label = 'Illinois' + else: + region_suffix = "_EMS-" + str(ems_nr) + region_label = region_suffix.replace('_EMS-', 'COVID-19 Region ') + + column_list = ['time', 'startdate', 'scen_num', 'sample_num','run_num'] + outcome_channels = ['hosp_det_cumul', 'hosp_cumul', 'hosp_det', 'hospitalized', + 'crit_det_cumul', 'crit_cumul', 'crit_det', 'critical', + 'death_det_cumul', 'deaths'] + + for channel in outcome_channels: + column_list.append(channel + region_suffix) + + df = load_sim_data(exp_name, region_suffix=region_suffix, fname=fname,column_list=column_list) + df = df[(df['date'] >= first_day) & (df['date'] <= last_day)] + df['critical_with_suspected'] = df['critical'] + + ref_df = load_ref_df(ems_nr) + + channels = ['new_detected_deaths', 'crit_det', 'hosp_det', 'new_deaths','new_detected_hospitalized', + 'new_detected_hospitalized'] + data_channel_names = ['confirmed_covid_deaths_prev_24h', + 'confirmed_covid_icu', 'covid_non_icu', 'deaths','inpatient', 'admissions'] + titles = ['New Detected\nDeaths (EMR)', 'Critical Detected (EMR)', 'Inpatient non-ICU\nCensus (EMR)', 'New Detected\nDeaths (LL)', + 'Covid-like illness\nadmissions (IDPH)', 'New Detected\nHospitalizations (LL)'] + + compare_sim_and_ref(df, ems_nr, ref_df, channels=channels, data_channel_names=data_channel_names, titles=titles, + region_label=region_label,first_day= first_day, last_day= last_day, logscale=True, weights_array=weights_array) + +if __name__ == '__main__': + + args = parse_args() + stem = args.stem + trajectoriesName = args.trajectoriesName + Location = args.Location + weights_array = [args.deaths_weight, args.crit_weight, args.non_icu_weight, args.cli_weight] + + first_plot_day = date(2020, 3, 25) + last_plot_day = date(2021, 1, 1) + + datapath, projectpath, wdir, exe_dir, git_dir = load_box_paths(Location=Location) + + exp_names = [x for x in os.listdir(os.path.join(wdir, 'simulation_output')) if stem in x] + for exp_name in exp_names: + output_path = os.path.join(wdir, 'simulation_output',exp_name) + for ems_nr in range(0,12): + print("Start processing region " + str(ems_nr)) + compare_ems(exp_name,fname=trajectoriesName, ems_nr=int(ems_nr),first_day=first_plot_day,last_day=last_plot_day,weights_array=weights_array)