Generate chain of events for individuals #1468

Draft · wants to merge 21 commits into master from molaro/harvest-training-data

Commits (21)
dbff470
Investigate analysis of events at sim level
marghe-molaro Apr 3, 2024
bf64628
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Sep 17, 2024
05098f7
Final data-printing set-up
marghe-molaro Sep 30, 2024
16c071c
Print event chains
marghe-molaro Oct 2, 2024
ba81487
Add chains in mode 2 too and clean up in simuation
marghe-molaro Oct 2, 2024
0474624
Merged with master, and moved all logging into event module to keep t…
marghe-molaro Oct 2, 2024
b1c907c
Fix issue with tests by ensuring standard Polling and infection is ma…
marghe-molaro Oct 7, 2024
cfb4264
Switch iloc for loc
marghe-molaro Oct 7, 2024
e0327de
Change syntax of if statement
marghe-molaro Oct 7, 2024
fceee02
Change syntax of if statement and print string of event
marghe-molaro Oct 9, 2024
eaeae62
Focus on rti and print footprint
marghe-molaro Oct 10, 2024
c7bd9d0
Only store change in individual properties, not entire property row. …
marghe-molaro Oct 11, 2024
769aaec
Style fixes
marghe-molaro Oct 11, 2024
757cee3
Include printing of individual properties at the beginning and at bir…
marghe-molaro Oct 13, 2024
22a5e44
Log everything to simulation, as events logger doesn't seem to be vis…
marghe-molaro Oct 16, 2024
7faa817
Consider all modules included as of interest
marghe-molaro Oct 18, 2024
7232f97
Remove pop-wide HSI warning and make epi default even when printing c…
marghe-molaro Oct 18, 2024
98a8832
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Oct 18, 2024
a6def2d
Style fix
marghe-molaro Oct 18, 2024
ecea532
Remove data generation test, which wasn't really a test
marghe-molaro Oct 18, 2024
ae7a44c
Change dict of properties to string in logging, and add analysis files
marghe-molaro Oct 23, 2024
370 changes: 370 additions & 0 deletions src/scripts/analysis_data_generation/analysis_extract_data.py
@@ -0,0 +1,370 @@
"""Produce plots to show the health impact (deaths, dalys) each the healthcare system (overall health impact) when
running under different MODES and POLICIES (scenario_impact_of_actual_vs_funded.py)"""

# short tclose -> ideal case
# long tclose -> status quo
import argparse
from datetime import datetime
from pathlib import Path
from typing import Tuple

import pandas as pd

from tlo import Date
from tlo.analysis.utils import extract_results

# Range of years considered
min_year = 2010
max_year = 2040


def all_columns(_df):
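"""Return a Series indicating, for each column of `_df`, whether all of its values are truthy."""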
return pd.Series(_df.all())

def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None):
"""Extract the logged chains of events for individuals, together with the DALY, population and
HSI summaries over time, and write them out as CSV files for further analysis."""
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
event_chains = extract_results(
results_folder,
module='tlo.simulation',
key='event_chains',
column='0',
#column = str(i),
#custom_generate_series=get_num_dalys_by_year,
do_scaling=False
)
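# extract_results is expected to return a DataFrame with one column per (draw, run) pair,
# which is why individual entries are accessed with tuple keys such as (0, 0) below;
# only column '0' of the 'event_chains' log is read here.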
# print(event_chains.loc[0,(0, 0)])

eval_env = {
'datetime': datetime, # Add the datetime class to the eval environment
'pd': pd, # Add pandas to handle Timestamp
'Timestamp': pd.Timestamp, # Specifically add Timestamp for eval
'NaT': pd.NaT,
'nan': float('nan'), # Include NaN for eval (can also use pd.NA if preferred)
}
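# A minimal sketch of how this environment could be used (assumption: each non-empty entry of
# the event-chains column is the string repr of a Python dict, e.g.
# "{'person_ID': 3, 'event': 'SomeEvent', 'date': Timestamp('2012-01-01 00:00:00')}"):
#     parsed = eval(value, eval_env)
#     person_id = parsed['person_ID']
# The loop below simply prints the first non-empty chain entry found and then exits.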

for item in event_chains.index:
value = event_chains.loc[item, (0, 0)]
if value != '':
print('')
print(value)
exit(-1)
#dict = {}
#for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
# dict[i] = []

#for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
# event_chains = extract_results(
# results_folder,
# module='tlo.simulation'#,
# key='event_chains',
# column = str(i),
# #custom_generate_series=get_num_dalys_by_year,
# do_scaling=False
# )
# print(event_chains)
# print(event_chains.index)
# print(event_chains.columns.levels)

# for index, row in event_chains.iterrows():
# if event_chains.iloc[index,0] is not None:
# if(event_chains.iloc[index,0]['person_ID']==i): #and 'event' in event_chains.iloc[index,0].keys()):
# dict[i].append(event_chains.iloc[index,0])
#elif (event_chains.iloc[index,0]['person_ID']==i and 'event' not in event_chains.iloc[index,0].keys()):
#print(event_chains.iloc[index,0]['de_depr'])
# exit(-1)
#for item in dict[0]:
# print(item)

#exit(-1)

TARGET_PERIOD = (Date(min_year, 1, 1), Date(max_year, 1, 1))

# Definitions of general helper functions
make_graph_file_name = lambda stub: output_folder / f"{stub.replace('*', '_star_')}.png"  # noqa: E731

def target_period() -> str:
"""Returns the target period as a string of the form YYYY-YYYY"""
return "-".join(str(t.year) for t in TARGET_PERIOD)

def get_parameter_names_from_scenario_file() -> Tuple[str, ...]:
"""Get the tuple of names of the scenarios from `Scenario` class used to create the results."""
from scripts.healthsystem.impact_of_actual_vs_funded.scenario_impact_of_actual_vs_funded import (
ImpactOfHealthSystemMode,
)
e = ImpactOfHealthSystemMode()
return tuple(e._scenarios.keys())
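# The draw names returned here are used by set_param_names_as_column_index_level_0 to label
# level 0 of the column index of the extracted results.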

def get_num_deaths(_df):
"""Return total number of Deaths (total within the TARGET_PERIOD)
"""
return pd.Series(data=len(_df.loc[pd.to_datetime(_df.date).between(*TARGET_PERIOD)]))

def get_num_dalys(_df):
"""Return total number of DALYs (Stacked) by label (total within the TARGET_PERIOD)"""
return pd.Series(
data=_df
.loc[_df.year.between(*[i.year for i in TARGET_PERIOD])]
.drop(columns=['date', 'sex', 'age_range', 'year'])
.sum().sum()
)

def get_num_dalys_by_cause(_df):
"""Return number of DALYs by cause by label (total within the TARGET_PERIOD)"""
return pd.Series(
data=_df
.loc[_df.year.between(*[i.year for i in TARGET_PERIOD])]
.drop(columns=['date', 'sex', 'age_range', 'year'])
.sum()
)

def set_param_names_as_column_index_level_0(_df):
"""Set the columns index (level 0) as the param_names."""
ordered_param_names_no_prefix = {i: x for i, x in enumerate(param_names)}
names_of_cols_level0 = [ordered_param_names_no_prefix.get(col) for col in _df.columns.levels[0]]
assert len(names_of_cols_level0) == len(_df.columns.levels[0])
_df.columns = _df.columns.set_levels(names_of_cols_level0, level=0)
return _df

def find_difference_relative_to_comparison(_ser: pd.Series,
comparison: str,
scaled: bool = False,
drop_comparison: bool = True,
):
"""Find the difference in the values in a pd.Series with a multi-index, between the draws (level 0)
within the runs (level 1), relative to where draw = `comparison`.
The comparison is `X - COMPARISON`."""
return _ser \
.unstack(level=0) \
.apply(lambda x: (x - x[comparison]) / (x[comparison] if scaled else 1.0), axis=1) \
.drop(columns=([comparison] if drop_comparison else [])) \
.stack()
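# Illustrative usage (variable names are hypothetical, not taken from this script):
#     diff = find_difference_relative_to_comparison(num_dalys, comparison='No Policy')
# which, within each run, gives X - 'No Policy' for every other draw.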


def get_counts_of_hsi_by_treatment_id(_df):
"""Get the counts of the short TREATMENT_IDs occurring"""
_counts_by_treatment_id = _df \
.loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'TREATMENT_ID'] \
.apply(pd.Series) \
.sum() \
.astype(int)
return _counts_by_treatment_id.groupby(level=0).sum()

year_target = 2023
def get_counts_of_hsi_by_treatment_id_by_year(_df):
"""Get the counts of the short TREATMENT_IDs occurring"""
_counts_by_treatment_id = _df \
.loc[pd.to_datetime(_df['date']).dt.year == year_target, 'TREATMENT_ID'] \
.apply(pd.Series) \
.sum() \
.astype(int)
return _counts_by_treatment_id.groupby(level=0).sum()

def get_counts_of_hsi_by_short_treatment_id(_df):
"""Get the counts of the short TREATMENT_IDs occurring (shortened, up to first underscore)"""
_counts_by_treatment_id = get_counts_of_hsi_by_treatment_id(_df)
_short_treatment_id = _counts_by_treatment_id.index.map(lambda x: x.split('_')[0] + "*")
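# e.g. (illustrative) TREATMENT_IDs beginning 'Rti_', such as 'Rti_MajorSurgeries', all collapse to the short ID 'Rti*'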
return _counts_by_treatment_id.groupby(by=_short_treatment_id).sum()

def get_counts_of_hsi_by_short_treatment_id_by_year(_df):
"""Get the counts of the short TREATMENT_IDs occurring (shortened, up to first underscore)"""
_counts_by_treatment_id = get_counts_of_hsi_by_treatment_id_by_year(_df)
_short_treatment_id = _counts_by_treatment_id.index.map(lambda x: x.split('_')[0] + "*")
return _counts_by_treatment_id.groupby(by=_short_treatment_id).sum()


# Obtain parameter names for this scenario file
param_names = get_parameter_names_from_scenario_file()
print(param_names)

# ================================================================================================
# TIME EVOLUTION OF TOTAL DALYs
# Plot DALYs averted compared to the ``No Policy'' policy

year_target = 2023 # This global variable will be passed to custom function
def get_num_dalys_by_year(_df):
"""Return total number of DALYs (Stacked) by label (total within the TARGET_PERIOD)"""
return pd.Series(
data=_df
.loc[_df.year == year_target]
.drop(columns=['date', 'sex', 'age_range', 'year'])
.sum().sum()
)

ALL = {}
# Plot the time trend; show the year prior to the transition as well, to emphasise that until that point
# the DALYs incurred are consistent across the different policies
this_min_year = 2010
for year in range(this_min_year, max_year+1):
year_target = year
num_dalys_by_year = extract_results(
results_folder,
module='tlo.methods.healthburden',
key='dalys_stacked',
custom_generate_series=get_num_dalys_by_year,
do_scaling=True
).pipe(set_param_names_as_column_index_level_0)
ALL[year_target] = num_dalys_by_year
# Concatenate the DataFrames into a single DataFrame
concatenated_df = pd.concat(ALL.values(), keys=ALL.keys())
concatenated_df.index = concatenated_df.index.set_names(['date', 'index_original'])
concatenated_df = concatenated_df.reset_index(level='index_original', drop=True)
dalys_by_year = concatenated_df
print(dalys_by_year)
dalys_by_year.to_csv('ConvertedOutputs/Total_DALYs_with_time.csv', index=True)

# ================================================================================================
# Print population under each scenario
pop_model = extract_results(results_folder,
module="tlo.methods.demography",
key="population",
column="total",
index="date",
do_scaling=True
).pipe(set_param_names_as_column_index_level_0)

pop_model.index = pop_model.index.year
pop_model = pop_model[(pop_model.index >= this_min_year) & (pop_model.index <= max_year)]
print(pop_model)
assert dalys_by_year.index.equals(pop_model.index)
assert all(dalys_by_year.columns == pop_model.columns)
pop_model.to_csv('ConvertedOutputs/Population_with_time.csv', index=True)

# ================================================================================================
# DALYs BROKEN DOWN BY CAUSES AND YEAR
# DALYs by cause per year
# %% Quantify the health losses associated with all interventions combined.

year_target = 2023 # This global variable will be passed to custom function
def get_num_dalys_by_year_and_cause(_df):
"""Return total number of DALYs (Stacked) by label (total within the TARGET_PERIOD)"""
return pd.Series(
data=_df
.loc[_df.year == year_target]
.drop(columns=['date', 'sex', 'age_range', 'year'])
.sum()
)

ALL = {}
# Plot the time trend; show the year prior to the transition as well, to emphasise that until that point
# the DALYs incurred are consistent across the different policies
this_min_year = 2010
for year in range(this_min_year, max_year+1):
year_target = year
num_dalys_by_year = extract_results(
results_folder,
module='tlo.methods.healthburden',
key='dalys_stacked',
custom_generate_series=get_num_dalys_by_year_and_cause,
do_scaling=True
).pipe(set_param_names_as_column_index_level_0)
ALL[year_target] = num_dalys_by_year #summarize(num_dalys_by_year)

# Concatenate the DataFrames into a single DataFrame
concatenated_df = pd.concat(ALL.values(), keys=ALL.keys())

concatenated_df.index = concatenated_df.index.set_names(['date', 'cause'])

df_total = concatenated_df
df_total.to_csv('ConvertedOutputs/DALYS_by_cause_with_time.csv', index=True)

ALL = {}
# Plot the time trend; show the year prior to the transition as well, to emphasise that until that point
# the DALYs incurred are consistent across the different policies
for year in range(min_year, max_year+1):
year_target = year

hsi_delivered_by_year = extract_results(
results_folder,
module='tlo.methods.healthsystem.summary',
key='HSI_Event',
custom_generate_series=get_counts_of_hsi_by_short_treatment_id_by_year,
do_scaling=True
).pipe(set_param_names_as_column_index_level_0)
ALL[year_target] = hsi_delivered_by_year

# Concatenate the DataFrames into a single DataFrame
concatenated_df = pd.concat(ALL.values(), keys=ALL.keys())
concatenated_df.index = concatenated_df.index.set_names(['date', 'cause'])
HSI_ran_by_year = concatenated_df

del ALL

ALL = {}
# Plot the time trend; show the year prior to the transition as well, to emphasise that until that point
# the DALYs incurred are consistent across the different policies
for year in range(min_year, max_year+1):
year_target = year

hsi_not_delivered_by_year = extract_results(
results_folder,
module='tlo.methods.healthsystem.summary',
key='Never_ran_HSI_Event',
custom_generate_series=get_counts_of_hsi_by_short_treatment_id_by_year,
do_scaling=True
).pipe(set_param_names_as_column_index_level_0)
ALL[year_target] = hsi_not_delivered_by_year

# Concatenate the DataFrames into a single DataFrame
concatenated_df = pd.concat(ALL.values(), keys=ALL.keys())
concatenated_df.index = concatenated_df.index.set_names(['date', 'cause'])
HSI_never_ran_by_year = concatenated_df

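# Fill missing treatment areas with zeros, then combine delivered and never-ran HSIs to obtain the total demanded.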
HSI_never_ran_by_year = HSI_never_ran_by_year.fillna(0)
HSI_ran_by_year = HSI_ran_by_year.fillna(0)
HSI_total_by_year = HSI_ran_by_year.add(HSI_never_ran_by_year, fill_value=0)
HSI_ran_by_year.to_csv('ConvertedOutputs/HSIs_ran_by_area_with_time.csv', index=True)
HSI_never_ran_by_year.to_csv('ConvertedOutputs/HSIs_never_ran_by_area_with_time.csv', index=True)
print(HSI_ran_by_year)
print(HSI_never_ran_by_year)
print(HSI_total_by_year)

if __name__ == "__main__":
rfp = Path('resources')

parser = argparse.ArgumentParser(
description="Produce plots to show the impact each set of treatments",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--output-path",
help=(
"Directory to write outputs to. If not specified (set to None) outputs "
"will be written to value of --results-path argument."
),
type=Path,
default=None,
required=False,
)
parser.add_argument(
"--resources-path",
help="Directory containing resource files",
type=Path,
default=Path('resources'),
required=False,
)
parser.add_argument(
"--results-path",
type=Path,
help=(
"Directory containing results from running "
"src/scripts/analysis_data_generation/scenario_generate_chains.py "
),
default=None,
required=False
)
args = parser.parse_args()
assert args.results_path is not None
results_path = args.results_path

output_path = results_path if args.output_path is None else args.output_path

apply(
results_folder=results_path,
output_folder=output_path,
resourcefilepath=args.resources_path
)