From 4f18745c80a096141d619c4f38d06c8b56e6dfc1 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 12:09:55 -0400 Subject: [PATCH 01/47] partial thoughts -- not working --- README.md | 18 +++++++++- configs/extraction.yaml | 25 +++++-------- configs/pipeline.yaml | 27 ++++++++++++++ configs/preprocess.yaml | 56 ++++++++++++++++++------------ pyproject.toml | 2 ++ src/MEDS_polars_functions/utils.py | 34 ++++++++++++++++++ 6 files changed, 123 insertions(+), 39 deletions(-) create mode 100644 configs/pipeline.yaml diff --git a/README.md b/README.md index e03dda6..101f1f8 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,11 @@ more information. This package provides three things: 1. A working, scalable, simple example of how to extract and pre-process MEDS data for downstream modeling. + These examples are provided in the form of: + - A set of integration tests that are run over synthetic data to verify correctness of the ETL pipeline. + See `tests/test_extraction.py` for the ETL tests with the in-built synthetic source data. + - A working MIMIC-IV MEDS ETL pipeline that can be run over MIMIC-IV v2.2 in approximately 1 hour in serial + mode (and much faster if parallelized). See `MIMIC-IV_Example` for more details. 2. A flexible ETL for extracting MEDS data from a variety of source formats. 3. A pre-processing pipeline that can be used for models that require: - Filtering data to only include patients with a certain number of events @@ -27,7 +32,8 @@ This package provides three things: ## Installation -For now, clone this repository and run `pip install -e .` from the repository root. +For now, clone this repository and run `pip install -e .` from the repository root. To use the MIMIC-IV +example, install the optional MIMIC dependencies as well with `pip install -e .[mimic]`. ## MEDS ETL / Extraction Pipeline @@ -197,6 +203,16 @@ running multiple copies of the same script on independent workers to process the steps again need to happen in a single-threaded manner, but these steps are generally very fast and should not be a bottleneck. +## Running the Pipeline in Parallel via Hydra Multirun +We support two (optional) hydra multirun job launchers for parallelizing ETL and pre-processing pipeline +steps: [`joblib`](https://hydra.cc/docs/plugins/joblib_launcher/) (for local parallelism) and +[`submitit`](https://hydra.cc/docs/plugins/submitit_launcher/) to launch things with slurm for cluster +parallelism. + +To use either of these, you need to install additional optional dependencies: + 1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or + 2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. + ## TODOs: 1. We need to have a vehicle to cleanly separate dataset-specific variables from the general configuration diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 54708d0..c46c0af 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,6 +1,12 @@ -# Raw data -raw_cohort_dir: ??? -MEDS_cohort_dir: ??? +defaults: + - pipeline + +# Pipeline Structure +stages: + - shard_by_event + - generate_patient_shards + - convert_to_MEDS_and_subshard + - merge_subshards # Event Conversion event_conversion_config_fp: ??? 
@@ -16,16 +22,3 @@ split_fracs: row_chunksize: 200000000 n_patients_per_shard: 50000 infer_schema_length: 10000 - -# Misc -do_overwrite: False -seed: 1 - -# Hydra -hydra: - job: - name: MEDS_ETL_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${MEDS_cohort_dir}/.logs/etl/${hydra.job.name} - sweep: - dir: ${MEDS_cohort_dir}/.logs/etl/${hydra.job.name} diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml new file mode 100644 index 0000000..c25ba0a --- /dev/null +++ b/configs/pipeline.yaml @@ -0,0 +1,27 @@ + +# Global IO +input_dir: ??? +cohort_dir: ??? + +log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" + +# General pipeline variables +do_overwrite: False +seed: 1 +stages: ??? # The list of stages to this overall pipeline + +# Worker / Stage information +stage: ??? +worker: 1 +polling_time: 300 # wait time in seconds before beginning reduction steps + +# Stage-specific IO +stage_output_dir: "${cohort_dir}/${stage}" +stage_input_dir: "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}" + +# Hydra +hydra: + run: + dir: "${log_dir}/${hydra.job.name}" + sweep: + dir: "${log_dir}/${hydra.job.name}" diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 397ff93..b1a5517 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,12 +1,36 @@ -# Raw data -MEDS_cohort_dir: ??? -output_data_dir: ??? -log_dir: "${output_data_dir}/.logs" +defaults: + - pipeline + +# Pipeline Structure +stages: + - filter_patients_by_length + - add_time_derived_measurements + - preliminary_counts + - filter_codes + - fit_outlier_detection + - filter_outliers + - fit_normalization + - normalization + - tokenization + - tensorization + +stages: + filter_patients_by_length: + input_dir: ??? + output_dir: ??? + min_events_per_patient: null + min_measurements_per_patient: null + + add_time_derived_measurements + preliminary_counts + filter_codes + fit_outlier_detection + filter_outliers + fit_normalization + normalization + tokenization + tensorization -# Worker / Stage information -stage: ??? 
-worker: 1 -polling_time: 300 # wait time in seconds before beginning reduction steps # Filtering parameters min_code_occurrences: null @@ -32,11 +56,11 @@ code_processing_stages: preliminary_counts: - "code/n_occurrences" - "code/n_patients" - outlier_detection: + fit_outlier_detection: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" - normalization: + fit_normalization: - "code/n_occurrences" - "code/n_patients" - "values/n_occurrences" @@ -45,15 +69,3 @@ code_processing_stages: # Outlier detection outlier_stddev_cutoff: 4.5 - -# Misc -do_overwrite: False - -# Hydra -hydra: - job: - name: "MEDS_Preprocessor/stage_${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" - run: - dir: "${log_dir}/${hydra.job.name}" - sweep: - dir: "${log_dir}/${hydra.job.name}" diff --git a/pyproject.toml b/pyproject.toml index bbf8dee..29bba91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ dependencies = ["polars", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-c mimic = ["rootutils"] dev = ["pre-commit"] tests = ["pytest", "pytest-cov[toml]", "rootutils"] +local_parallelism = ["hydra-joblib-launcher"] +slurm_parallelism = ["hydra-submitit-launcher"] [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index 7899653..f389d49 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -3,10 +3,44 @@ import os from pathlib import Path +from omegaconf import OmegaConf import hydra import polars as pl from loguru import logger as log +def get_stage_input_dir( + raw_input_dir: str, cohort_dir: str, stages: list[str], stage: str +) -> str: + """Resolves the input directory for a stage in a MEDS pipeline. + + Args: + raw_input_dir: The raw input directory (used as the input when the stage is the 1st stage). + cohort_dir: The cohort (output) directory; used as the source for the default stage output. + stages: The stages in the pipeline. + stage: The current stage. + + Returns: + The input directory for the current stage. + + Examples: + >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage1") + '/a/b' + >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage2") + '/c/d/stage1' + """ + if stage == stages[0]: + return raw_input_dir + elif stage not in stages: + raise ValueError( + f"Can't impute input directory for {stage} as it is not in the stages list! " + f"Stages: {stages}. " + "If this is intentional, please provide the input directory explicitly or remove the " + "attempted interpolation from your config by overwriting the `stage_input_dir` parameter." + ) + return os.path.join(cohort_dir, stages[stages.index(stage) - 1]) + +# We actually call this here that way it is registered in every script when the module is imported. +OmegaConf.register_new_resolver("stage_input_idr", get_stage_input_dir, replace=True) def hydra_loguru_init() -> None: """Adds loguru output to the logs that hydra scrapes. 
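Note on the patch above: the `${stage_input_dir:...}` interpolation it adds to `configs/pipeline.yaml` relies on OmegaConf's custom resolver mechanism, but the resolver is registered under the misspelled name `stage_input_idr`, which would leave that interpolation unresolvable (plausibly part of why the subject line flags this commit as not working). A minimal, self-contained sketch of the intended mechanism, with the resolver name spelled consistently and hypothetical directory values:

```python
from omegaconf import OmegaConf


def stage_input_dir(raw_input_dir: str, cohort_dir: str, stages, stage: str) -> str:
    # The first stage reads the raw input; every later stage reads the prior stage's output.
    if stage == stages[0]:
        return raw_input_dir
    return f"{cohort_dir}/{stages[stages.index(stage) - 1]}"


# The registered name must exactly match the name used inside the ${...} interpolation.
OmegaConf.register_new_resolver("stage_input_dir", stage_input_dir, replace=True)

cfg = OmegaConf.create(
    {
        "input_dir": "/raw",
        "cohort_dir": "/cohort",
        "stages": ["shard_by_event", "generate_patient_shards"],
        "stage": "generate_patient_shards",
        "stage_input_dir": "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}",
    }
)
print(cfg.stage_input_dir)  # /cohort/shard_by_event -- resolved lazily, on first access
```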
From 84eaf5f9a6b0699a846ae99692b8770871516c70 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 13:00:59 -0400 Subject: [PATCH 02/47] New structure based on conversation with Nassim --- configs/preprocess.yaml | 103 +++++++++++++++------------------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index b1a5517..0b7e71d 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,71 +1,46 @@ defaults: - pipeline -# Pipeline Structure -stages: - - filter_patients_by_length - - add_time_derived_measurements - - preliminary_counts - - filter_codes - - fit_outlier_detection - - filter_outliers - - fit_normalization - - normalization - - tokenization - - tensorization +# Global pipeline parameters: +# 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual +# tokenization. +code_modifier_columns: ??? +# Pipeline Structure stages: - filter_patients_by_length: - input_dir: ??? - output_dir: ??? + - name: filter_patients_by_length: min_events_per_patient: null min_measurements_per_patient: null - - add_time_derived_measurements - preliminary_counts - filter_codes - fit_outlier_detection - filter_outliers - fit_normalization - normalization - tokenization - tensorization - - -# Filtering parameters -min_code_occurrences: null -min_events_per_patient: null -min_measurements_per_patient: null - -# Time-derived measurements -time_derived_measurements: - age: - dob_code: ??? - age_code: "AGE" - age_unit: "years" - time_of_day: - bin_endpoints: [6, 12, 18, 24] - -# Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual -# tokenization. -code_modifier_columns: ??? - -# Code metadata extraction. These may contain duplicates because the data may be filtered between different -# stages, depending on the pipeline in question. -code_processing_stages: - preliminary_counts: - - "code/n_occurrences" - - "code/n_patients" - fit_outlier_detection: - - "values/n_occurrences" - - "values/sum" - - "values/sum_sqd" - fit_normalization: - - "code/n_occurrences" - - "code/n_patients" - - "values/n_occurrences" - - "values/sum" - - "values/sum_sqd" - -# Outlier detection -outlier_stddev_cutoff: 4.5 + - name: add_time_derived_measurements: + age: + dob_code: ??? + age_code: "AGE" + age_unit: "years" + time_of_day: + bin_endpoints: [6, 12, 18, 24] + - name: preliminary_counts + obs_aggregations: + - "code/n_occurrences" + - "code/n_patients" + - name: filter_codes + min_code_occurrences: null + - name: fit_outlier_detection + aggregations: + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" + - name: filter_outliers + stddev_cutoff: 4.5 + - name: fit_normalization + aggregations: + - "code/n_occurrences" + - "code/n_patients" + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" + - name: normalization + - name: tokenization + - name: tensorization + +stage: ??? +stage_cfg: ${populate_stage:${stage}} From 2cde29a4be5af2d36b67001603adb4260d8131dc Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 13:21:30 -0400 Subject: [PATCH 03/47] Updated configs further and started README documentation for this. 
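Annotation: among the changes below, `configs/pipeline.yaml` gains `stage: ${current_script_name:}`, so the stage name is inferred from the running script rather than passed explicitly. The resolver itself is only registered in a later commit in this series; a minimal sketch of the intended behavior, mirroring the `current_script_name` helper that lands there:

```python
import sys
from pathlib import Path

from omegaconf import OmegaConf


def current_script_name() -> str:
    """Return the stem of the running script, e.g. "shard_events" for shard_events.py."""
    return Path(sys.argv[0]).stem


OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=True)

cfg = OmegaConf.create({"stage": "${current_script_name:}"})
print(cfg.stage)  # "shard_events" when invoked as ./scripts/extraction/shard_events.py
```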
--- README.md | 29 ++++++++++++++++++++++++++--- configs/extraction.yaml | 33 ++++++++++++++------------------- configs/pipeline.yaml | 12 ++++++------ configs/preprocess.yaml | 14 +++++++++----- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 101f1f8..9669c48 100644 --- a/README.md +++ b/README.md @@ -203,15 +203,38 @@ running multiple copies of the same script on independent workers to process the steps again need to happen in a single-threaded manner, but these steps are generally very fast and should not be a bottleneck. -## Running the Pipeline in Parallel via Hydra Multirun +## Overview of configuration manipulation + +### Pipeline configuration: Stages and OmegaConf Resolvers + +The pipeline configuration file for both the provided extraction and pre-processing pipelines are structured +to permit both ease of understanding, flexibility for user-derived modifications, and ease of use in the +simple, file-in/file-out scripts that this repository promotes. How this works is that each pipeline +(extraction and pre-processing) defines one global configuration file which is used as the Hydra specification +for all scripts in that pipeline. This file leverages some generic pipeline configuration options, specified +in `pipeline.yaml` and imported via the Hydra `defaults:` list, but also defines a list of stages with +stage-specific configurations. + +The user can specify the stage in question on the command line either manually (e.g., `stage=stage_name`) or +allow the stage name to be inferred automatically from the script name. Each script receives both the global +configuration file but also a sub-configuration (within the `stage_cfg` node in the received global +configuration) which is pre-populated with the stage-specific configuration for the stage in question and +automatically inferred input and output file paths (if not overwritten in the config file) based on the stage +name and its position in the overall pipeline. This makes it easy to leverage transformations and scripts +defined here in new configuration pipelines, simply by placing them as a stage in a broader pipeline in a +different configuration or order relative to other stages. + +### Running the Pipeline in Parallel via Hydra Multirun + We support two (optional) hydra multirun job launchers for parallelizing ETL and pre-processing pipeline steps: [`joblib`](https://hydra.cc/docs/plugins/joblib_launcher/) (for local parallelism) and [`submitit`](https://hydra.cc/docs/plugins/submitit_launcher/) to launch things with slurm for cluster parallelism. To use either of these, you need to install additional optional dependencies: - 1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or - 2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. + +1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or +2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. ## TODOs: diff --git a/configs/extraction.yaml b/configs/extraction.yaml index c46c0af..c454d31 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,24 +1,19 @@ defaults: - pipeline -# Pipeline Structure -stages: - - shard_by_event - - generate_patient_shards - - convert_to_MEDS_and_subshard - - merge_subshards - -# Event Conversion +# The event conversion configuration file is used throughout the pipeline to define the events to extract. event_conversion_config_fp: ??? 
-# Splits -external_splits_json_fp: null -split_fracs: - train: 0.8 - tuning: 0.1 - held_out: 0.1 - -# Sharding -row_chunksize: 200000000 -n_patients_per_shard: 50000 -infer_schema_length: 10000 +stages: + - name: shard_events + row_chunksize: 200000000 + infer_schema_length: 10000 + - name: split_and_shard_patients + n_patients_per_shard: 50000 + external_splits_json_fp: null + split_fracs: + train: 0.8 + tuning: 0.1 + held_out: 0.1 + - name: convert_to_sharded_events + - name: merge_to_MEDS_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index c25ba0a..a477667 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -1,4 +1,3 @@ - # Global IO input_dir: ??? cohort_dir: ??? @@ -10,17 +9,18 @@ do_overwrite: False seed: 1 stages: ??? # The list of stages to this overall pipeline -# Worker / Stage information -stage: ??? +# Mapreduce information worker: 1 polling_time: 300 # wait time in seconds before beginning reduction steps -# Stage-specific IO -stage_output_dir: "${cohort_dir}/${stage}" -stage_input_dir: "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}" +# Filling in the current stage +stage: ${current_script_name:} +stage_cfg: ${oc.create:${populate_stage:${stage}}} # Hydra hydra: + job: + name: "${stage}" run: dir: "${log_dir}/${hydra.job.name}" sweep: diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 0b7e71d..9b60579 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -8,29 +8,35 @@ code_modifier_columns: ??? # Pipeline Structure stages: - - name: filter_patients_by_length: + - name: filter_patients min_events_per_patient: null min_measurements_per_patient: null - - name: add_time_derived_measurements: + + - name: add_time_derived_measurements age: dob_code: ??? age_code: "AGE" age_unit: "years" time_of_day: bin_endpoints: [6, 12, 18, 24] + - name: preliminary_counts obs_aggregations: - "code/n_occurrences" - "code/n_patients" + - name: filter_codes min_code_occurrences: null + - name: fit_outlier_detection aggregations: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" + - name: filter_outliers stddev_cutoff: 4.5 + - name: fit_normalization aggregations: - "code/n_occurrences" @@ -38,9 +44,7 @@ stages: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" + - name: normalization - name: tokenization - name: tensorization - -stage: ??? -stage_cfg: ${populate_stage:${stage}} From aaec6f3b291a418ba6ca11b986eb9ef259584296 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 16:41:19 -0400 Subject: [PATCH 04/47] Got the custom OmegaConf resolvers working for populating the stage config. 
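Annotation: the core pattern this commit lands is a composition of two resolvers inside `pipeline.yaml`: `populate_stage` returns a plain dictionary of inferred stage parameters, and OmegaConf's built-in `oc.create` resolver wraps that dictionary into a `DictConfig` node, so downstream scripts can read keys like `cfg.stage_cfg.output_dir` directly. A minimal runnable sketch using a hypothetical stand-in for `populate_stage`:

```python
from omegaconf import OmegaConf

# Hypothetical stand-in for populate_stage: returns a plain dict of inferred parameters.
OmegaConf.register_new_resolver(
    "demo_populate_stage",
    lambda stage: {"is_metadata": False, "output_dir": f"/cohort/{stage}"},
    replace=True,
)

cfg = OmegaConf.create(
    {
        "stage": "shard_events",
        # oc.create converts the dict returned by the inner resolver into a DictConfig.
        "stage_cfg": "${oc.create:${demo_populate_stage:${stage}}}",
    }
)
print(cfg.stage_cfg.output_dir)  # /cohort/shard_events
```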
--- .pre-commit-config.yaml | 1 + configs/pipeline.yaml | 2 +- .../extraction/convert_to_sharded_events.py | 6 + src/MEDS_polars_functions/utils.py | 145 +++++++++++++++--- 4 files changed, 129 insertions(+), 25 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7540f52..1533f74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,6 +38,7 @@ repos: rev: v2.2.0 hooks: - id: autoflake + args: [--in-place, --remove-all-unused-imports] # python upgrading syntax to newer version - repo: https://github.com/asottile/pyupgrade diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index a477667..12feaea 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -15,7 +15,7 @@ polling_time: 300 # wait time in seconds before beginning reduction steps # Filling in the current stage stage: ${current_script_name:} -stage_cfg: ${oc.create:${populate_stage:${stage}}} +stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}}} # Hydra hydra: diff --git a/scripts/extraction/convert_to_sharded_events.py b/scripts/extraction/convert_to_sharded_events.py index 50fcab2..07eb7a2 100755 --- a/scripts/extraction/convert_to_sharded_events.py +++ b/scripts/extraction/convert_to_sharded_events.py @@ -22,6 +22,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + Path(cfg.raw_cohort_dir) MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index f389d49..e61f21d 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,46 +1,143 @@ """Core utilities for MEDS pipelines built with these tools.""" import os +import sys from pathlib import Path -from omegaconf import OmegaConf import hydra import polars as pl from loguru import logger as log +from omegaconf import OmegaConf + -def get_stage_input_dir( - raw_input_dir: str, cohort_dir: str, stages: list[str], stage: str -) -> str: - """Resolves the input directory for a stage in a MEDS pipeline. +def current_script_name() -> str: + """Returns the name of the script that called this function. + + Returns: + str: The name of the script that called this function. + """ + return Path(sys.argv[0]).stem + + +def populate_stage( + stage_name: str, + input_dir: str, + cohort_dir: str, + stages: list[dict], + pre_parsed_stages: list[dict] | None = None, +) -> dict: + """Populates a stage in the stages configuration with inferred stage parameters. + + Infers and adds (unless already present, in which case the provided value is used) the following + parameters to the stage configuration: + - `is_metadata`: Whether the stage is a metadata stage, which is determined to be `False` if the stage + does not have an `aggregations` parameter. + - `data_input_dir`: The input directory for the stage (either the global input directory or the previous + data stage's output directory). + - `metadata_input_dir`: The input directory for the stage (either the global input directory or the + previous metadata stage's output directory). + - `output_dir`: The output directory for the stage (the cohort directory with the stage name appended). Args: - raw_input_dir: The raw input directory (used as the input when the stage is the 1st stage). - cohort_dir: The cohort (output) directory; used as the source for the default stage output. 
- stages: The stages in the pipeline. - stage: The current stage. + stage_name: The name of the stage to populate. + input_dir: The global input directory. + cohort_dir: The cohort directory into which this overall pipeline is writing data. + stages: The stages configuration dictionaries (unresolved). + pre_parsed_stages: The stages configuration dictionaries (resolved). If specified, the function will + not re-resolve the stages in this list. Returns: - The input directory for the current stage. + dict: The populated stage configuration. + + Raises: + ValueError: If the stage is not present in the stages configuration. Examples: - >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage1") - '/a/b' - >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage2") - '/c/d/stage1' + >>> root_config = DictConfig({ + ... "input_dir": "/a/b", + ... "cohort_dir": "/c/d", + ... "stages": [ + ... {"name": "stage1"}, + ... {"name": "stage2", "is_metadata": True}, + ... {"name": "stage3", "is_metadata": None}, + ... {"name": "stage4", "data_input_dir": "/e/f", "output_dir": "/g/h"}, + ... {"name": "stage5", "aggregations": ["foo"]}, + ... {"name": "stage6"}, + ... ], + ... }) + >>> args = (root_config["input_dir"], root_config["cohort_dir"], root_config["stages"]) + >>> populate_stage("stage1", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage1', 'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', + 'output_dir': '/c/d/stage1'} + >>> populate_stage("stage2", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage2', 'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', + 'output_dir': '/c/d/stage2'} + >>> populate_stage("stage3", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage3', 'is_metadata': False, 'data_input_dir': '/c/d/stage1', + 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage3'} + >>> populate_stage("stage4", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage4', 'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, + 'metadata_input_dir': '/c/d/stage2'} + >>> populate_stage("stage5", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage5', 'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', + 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage5'} + >>> populate_stage("stage6", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage6', 'is_metadata': False, 'data_input_dir': '/g/h', + 'metadata_input_dir': '/c/d/stage5', 'output_dir': '/c/d/stage6'} + >>> populate_stage("stage7", *args) # doctest: +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + ValueError: 'stage7' is not a valid stage name. Options are: + ['stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'stage6'] """ - if stage == stages[0]: - return raw_input_dir - elif stage not in stages: + + if stage_name not in {s["name"] for s in stages}: raise ValueError( - f"Can't impute input directory for {stage} as it is not in the stages list! " - f"Stages: {stages}. " - "If this is intentional, please provide the input directory explicitly or remove the " - "attempted interpolation from your config by overwriting the `stage_input_dir` parameter." + f"'{stage_name}' is not a valid stage name. Options are: {list(s['name'] for s in stages)}" ) - return os.path.join(cohort_dir, stages[stages.index(stage) - 1]) -# We actually call this here that way it is registered in every script when the module is imported. 
-OmegaConf.register_new_resolver("stage_input_idr", get_stage_input_dir, replace=True) + pre_pop_stages_by_name = {s["name"]: s for s in pre_parsed_stages} if pre_parsed_stages else {} + pre_parsed_stages = pre_parsed_stages or [] + + prior_stages = [] + stage = None + prior_data_stage = None + prior_metadata_stage = None + for s in stages: + if s["name"] == stage_name: + stage = s + break + elif s["name"] in pre_pop_stages_by_name: + s_resolved = pre_pop_stages_by_name[s["name"]] + else: + s_resolved = populate_stage(s["name"], input_dir, cohort_dir, stages, prior_stages) + + if s_resolved["is_metadata"]: + prior_metadata_stage = s_resolved + else: + prior_data_stage = s_resolved + prior_stages.append(s_resolved) + + inferred_keys = { + "is_metadata": "aggregations" in stage, + "data_input_dir": input_dir if prior_data_stage is None else prior_data_stage["output_dir"], + "metadata_input_dir": ( + input_dir if prior_metadata_stage is None else prior_metadata_stage["output_dir"] + ), + "output_dir": os.path.join(cohort_dir, stage_name), + } + + out = {**stage} + for key, val in inferred_keys.items(): + if key not in out or out[key] is None: + out[key] = val + + return out + + +OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) +OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) + def hydra_loguru_init() -> None: """Adds loguru output to the logs that hydra scrapes. From 732a0002ad002b4595121ae59d1c7b0179005765 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 17:56:36 -0400 Subject: [PATCH 05/47] Got tests to pass (including integration) on the extraction scripts with the new hydra setup and custom resolvers --- configs/extraction.yaml | 16 +++- configs/pipeline.yaml | 5 +- .../extraction/convert_to_sharded_events.py | 9 +- scripts/extraction/merge_to_MEDS_cohort.py | 14 +-- scripts/extraction/shard_events.py | 23 +++-- .../extraction/split_and_shard_patients.py | 18 ++-- .../add_time_derived_measurements.py | 13 ++- .../preprocessing/collect_code_metadata.py | 8 +- scripts/preprocessing/filter_patients.py | 8 +- src/MEDS_polars_functions/utils.py | 85 +++++++++++-------- tests/test_extraction.py | 52 +++++++----- 11 files changed, 155 insertions(+), 96 deletions(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index c454d31..e1e985a 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -5,15 +5,23 @@ defaults: event_conversion_config_fp: ??? stages: - - name: shard_events + - shard_events + - split_and_shard_patients + - convert_to_sharded_events + - merge_to_MEDS_cohort + +stage_configs: + shard_events: row_chunksize: 200000000 infer_schema_length: 10000 - - name: split_and_shard_patients + split_and_shard_patients: + is_metadata: True + output_dir: ${cohort_dir} n_patients_per_shard: 50000 external_splits_json_fp: null split_fracs: train: 0.8 tuning: 0.1 held_out: 0.1 - - name: convert_to_sharded_events - - name: merge_to_MEDS_cohort + merge_to_MEDS_cohort: + output_dir: ${cohort_dir}/final_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 12feaea..851afd7 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -7,7 +7,8 @@ log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S} # General pipeline variables do_overwrite: False seed: 1 -stages: ??? # The list of stages to this overall pipeline +stages: ??? # The list of stages to this overall pipeline (in order) +stage_configs: ??? 
# The configurations for each stage, keyed by stage name # Mapreduce information worker: 1 @@ -15,7 +16,7 @@ polling_time: 300 # wait time in seconds before beginning reduction steps # Filling in the current stage stage: ${current_script_name:} -stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}}} +stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}, ${stage_configs}}} # Hydra hydra: diff --git a/scripts/extraction/convert_to_sharded_events.py b/scripts/extraction/convert_to_sharded_events.py index 07eb7a2..bc1eff3 100755 --- a/scripts/extraction/convert_to_sharded_events.py +++ b/scripts/extraction/convert_to_sharded_events.py @@ -28,10 +28,7 @@ def main(cfg: DictConfig): f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" ) - Path(cfg.raw_cohort_dir) - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -45,7 +42,7 @@ def main(cfg: DictConfig): default_patient_id_col = event_conversion_cfg.pop("patient_id_col", "patient_id") - patient_subsharded_dir = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_dir = Path(cfg.stage_cfg.output_dir) patient_subsharded_dir.mkdir(parents=True, exist_ok=True) OmegaConf.save(event_conversion_cfg, patient_subsharded_dir / "event_conversion_config.yaml") @@ -63,7 +60,7 @@ def main(cfg: DictConfig): event_cfgs = copy.deepcopy(event_cfgs) input_patient_id_column = event_cfgs.pop("patient_id_col", default_patient_id_col) - event_shards = list((MEDS_cohort_dir / "sub_sharded" / input_prefix).glob("*.parquet")) + event_shards = list((Path(cfg.stage_cfg.data_input_dir) / input_prefix).glob("*.parquet")) random.shuffle(event_shards) for shard_fp in event_shards: diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py index cc69d2f..1c7271d 100755 --- a/scripts/extraction/merge_to_MEDS_cohort.py +++ b/scripts/extraction/merge_to_MEDS_cohort.py @@ -7,7 +7,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.mapper import wrap as rwlock_wrap from MEDS_polars_functions.utils import hydra_loguru_init @@ -42,13 +42,17 @@ def main(cfg: DictConfig): hydra_loguru_init() - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) logger.info("Starting patient shard merging.") - patient_subsharded_dir = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_dir = Path(cfg.stage_cfg.data_input_dir) if not patient_subsharded_dir.is_dir(): raise FileNotFoundError(f"Patient sub-sharded directory not found: {patient_subsharded_dir}") @@ -57,7 +61,7 @@ def main(cfg: DictConfig): for sp in patient_splits: in_dir = patient_subsharded_dir / sp - out_fp = MEDS_cohort_dir / "final_cohort" / f"{sp}.parquet" + out_fp = Path(cfg.stage_cfg.output_dir) / f"{sp}.parquet" shard_fps = sorted(list(in_dir.glob("**/*.parquet"))) shard_fp_strs = [f" * {str(fp.resolve())}" for fp 
in shard_fps] diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index 15737c1..5ccc36f 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -190,9 +190,14 @@ def main(cfg: DictConfig): """ hydra_loguru_init() - raw_cohort_dir = Path(cfg.raw_cohort_dir) - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - row_chunksize = cfg.row_chunksize + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + + raw_cohort_dir = Path(cfg.stage_cfg.data_input_dir) + row_chunksize = cfg.stage_cfg.row_chunksize event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -226,19 +231,21 @@ def main(cfg: DictConfig): ) logger.info( f"Will read raw data from {str(raw_cohort_dir.resolve())}/$IN_FILE.parquet and write sub-sharded " - f"data to {str(MEDS_cohort_dir.resolve())}/sub_sharded/$IN_FILE/$ROW_START-$ROW_END.parquet" + f"data to {cfg.stage_cfg.output_dir}/$IN_FILE/$ROW_START-$ROW_END.parquet" ) start = datetime.now() for input_file in input_files_to_subshard: columns = prefix_to_columns[get_shard_prefix(raw_cohort_dir, input_file)] - out_dir = MEDS_cohort_dir / "sub_sharded" / get_shard_prefix(raw_cohort_dir, input_file) + out_dir = Path(cfg.stage_cfg.output_dir) / get_shard_prefix(raw_cohort_dir, input_file) out_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Processing {input_file} to {out_dir}.") logger.info(f"Performing preliminary read of {str(input_file.resolve())} to determine row count.") - df = scan_with_row_idx(input_file, columns=columns, infer_schema_length=cfg["infer_schema_length"]) + df = scan_with_row_idx( + input_file, columns=columns, infer_schema_length=cfg.stage_cfg.infer_schema_length + ) row_count = df.select(pl.len()).collect().item() @@ -272,7 +279,9 @@ def main(cfg: DictConfig): rwlock_wrap( input_file, out_fp, - partial(scan_with_row_idx, columns=columns, infer_schema_length=cfg["infer_schema_length"]), + partial( + scan_with_row_idx, columns=columns, infer_schema_length=cfg.stage_cfg.infer_schema_length + ), write_lazyframe, compute_fn, do_overwrite=cfg.do_overwrite, diff --git a/scripts/extraction/split_and_shard_patients.py b/scripts/extraction/split_and_shard_patients.py index fa5c1c2..f618da5 100755 --- a/scripts/extraction/split_and_shard_patients.py +++ b/scripts/extraction/split_and_shard_patients.py @@ -18,10 +18,16 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + logger.info("Starting patient splitting and sharding") - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - subsharded_dir = MEDS_cohort_dir / "sub_sharded" + MEDS_cohort_dir = Path(cfg.stage_cfg.output_dir) + subsharded_dir = Path(cfg.stage_cfg.data_input_dir) event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -61,8 +67,8 @@ def main(cfg: DictConfig): logger.info(f"Found {len(patient_ids)} unique patient IDs of type {patient_ids.dtype}") - if cfg.external_splits_json_fp: - external_splits_json_fp = Path(cfg.external_splits_json_fp) + if cfg.stage_cfg.external_splits_json_fp: + external_splits_json_fp = Path(cfg.stage_cfg.external_splits_json_fp) if not external_splits_json_fp.exists(): raise FileNotFoundError(f"External splits JSON file not found at {external_splits_json_fp}") @@ -79,8 
+85,8 @@ def main(cfg: DictConfig): sharded_patients = shard_patients( patients=patient_ids, external_splits=external_splits, - split_fracs_dict=cfg.split_fracs, - n_patients_per_shard=cfg.n_patients_per_shard, + split_fracs_dict=cfg.stage_cfg.split_fracs, + n_patients_per_shard=cfg.stage_cfg.n_patients_per_shard, seed=cfg.seed, ) diff --git a/scripts/preprocessing/add_time_derived_measurements.py b/scripts/preprocessing/add_time_derived_measurements.py index e5cae0d..1e01067 100644 --- a/scripts/preprocessing/add_time_derived_measurements.py +++ b/scripts/preprocessing/add_time_derived_measurements.py @@ -24,12 +24,17 @@ def main(cfg: DictConfig): hydra_loguru_init() - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - output_dir = Path(cfg.output_data_dir) + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + output_dir = Path(cfg.stage_dfg.output_dir) - final_cohort_dir = MEDS_cohort_dir / "final_cohort" + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) + + final_cohort_dir = cfg.stage_cfg.data_input_dir / "final_cohort" filtered_patients_dir = output_dir / "patients_above_length_threshold" with_time_derived_dir = output_dir / "with_time_derived_measurements" diff --git a/scripts/preprocessing/collect_code_metadata.py b/scripts/preprocessing/collect_code_metadata.py index 36f4b77..fa25bcb 100644 --- a/scripts/preprocessing/collect_code_metadata.py +++ b/scripts/preprocessing/collect_code_metadata.py @@ -9,7 +9,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.code_metadata import mapper_fntr, reducer_fntr from MEDS_polars_functions.mapper import wrap as rwlock_wrap @@ -22,6 +22,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) output_dir = Path(cfg.output_data_dir) diff --git a/scripts/preprocessing/filter_patients.py b/scripts/preprocessing/filter_patients.py index f926401..a2b6308 100644 --- a/scripts/preprocessing/filter_patients.py +++ b/scripts/preprocessing/filter_patients.py @@ -8,7 +8,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.filter_patients_by_length import ( filter_patients_by_num_events, @@ -24,6 +24,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) output_dir = Path(cfg.output_data_dir) diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index e61f21d..996673a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -6,7 +6,7 @@ import hydra import polars as pl -from loguru import logger as log +from loguru import logger from omegaconf import OmegaConf @@ -23,8 +23,9 @@ def populate_stage( stage_name: str, input_dir: str, cohort_dir: str, - stages: list[dict], - pre_parsed_stages: list[dict] | None = None, + stages: list[str], + stage_configs: dict[str, 
dict], + pre_parsed_stages: dict[str, dict] | None = None, ) -> dict: """Populates a stage in the stages configuration with inferred stage parameters. @@ -42,9 +43,11 @@ def populate_stage( stage_name: The name of the stage to populate. input_dir: The global input directory. cohort_dir: The cohort directory into which this overall pipeline is writing data. - stages: The stages configuration dictionaries (unresolved). - pre_parsed_stages: The stages configuration dictionaries (resolved). If specified, the function will - not re-resolve the stages in this list. + stages: The names of the stages processed by this pipeline in order. + stage_configs: The raw, unresolved stage configuration dictionaries for any stages with specific + arguments, keyed by stage name. + pre_parsed_stages: The stages configuration dictionaries (resolved), keyed by stage name. If + specified, the function will not re-resolve the stages in this list. Returns: dict: The populated stage configuration. @@ -53,70 +56,78 @@ def populate_stage( ValueError: If the stage is not present in the stages configuration. Examples: + >>> from omegaconf import DictConfig >>> root_config = DictConfig({ ... "input_dir": "/a/b", ... "cohort_dir": "/c/d", - ... "stages": [ - ... {"name": "stage1"}, - ... {"name": "stage2", "is_metadata": True}, - ... {"name": "stage3", "is_metadata": None}, - ... {"name": "stage4", "data_input_dir": "/e/f", "output_dir": "/g/h"}, - ... {"name": "stage5", "aggregations": ["foo"]}, - ... {"name": "stage6"}, - ... ], + ... "stages": ["stage1", "stage2", "stage3", "stage4", "stage5", "stage6"], + ... "stage_configs": { + ... "stage2": {"is_metadata": True}, + ... "stage3": {"is_metadata": None}, + ... "stage4": {"data_input_dir": "/e/f", "output_dir": "/g/h"}, + ... "stage5": {"aggregations": ["foo"]}, + ... }, ... 
}) - >>> args = (root_config["input_dir"], root_config["cohort_dir"], root_config["stages"]) + >>> args = [root_config[k] for k in ["input_dir", "cohort_dir", "stages", "stage_configs"]] >>> populate_stage("stage1", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage1', 'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', + {'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', 'output_dir': '/c/d/stage1'} >>> populate_stage("stage2", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage2', 'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', + {'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', 'output_dir': '/c/d/stage2'} >>> populate_stage("stage3", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage3', 'is_metadata': False, 'data_input_dir': '/c/d/stage1', + {'is_metadata': False, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage3'} >>> populate_stage("stage4", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage4', 'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, + {'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, 'metadata_input_dir': '/c/d/stage2'} >>> populate_stage("stage5", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage5', 'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', + {'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage5'} >>> populate_stage("stage6", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage6', 'is_metadata': False, 'data_input_dir': '/g/h', + {'is_metadata': False, 'data_input_dir': '/g/h', 'metadata_input_dir': '/c/d/stage5', 'output_dir': '/c/d/stage6'} >>> populate_stage("stage7", *args) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: 'stage7' is not a valid stage name. Options are: - ['stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'stage6'] + ValueError: 'stage7' is not a valid stage name. Options are: stage1, stage2, stage3, stage4, stage5, + stage6 """ - if stage_name not in {s["name"] for s in stages}: - raise ValueError( - f"'{stage_name}' is not a valid stage name. Options are: {list(s['name'] for s in stages)}" - ) + for s in stage_configs.keys(): + if s not in stages: + raise ValueError( + f"stage config key '{s}' is not a valid stage name. Options are: {list(stages.keys())}" + ) - pre_pop_stages_by_name = {s["name"]: s for s in pre_parsed_stages} if pre_parsed_stages else {} - pre_parsed_stages = pre_parsed_stages or [] + if stage_name not in stages: + raise ValueError(f"'{stage_name}' is not a valid stage name. 
Options are: {', '.join(stages)}") + + if pre_parsed_stages is None: + pre_parsed_stages = {} - prior_stages = [] stage = None prior_data_stage = None prior_metadata_stage = None for s in stages: - if s["name"] == stage_name: - stage = s + if s == stage_name: + stage = stage_configs.get(s, {}) break - elif s["name"] in pre_pop_stages_by_name: - s_resolved = pre_pop_stages_by_name[s["name"]] + elif s in pre_parsed_stages: + s_resolved = pre_parsed_stages[s] else: - s_resolved = populate_stage(s["name"], input_dir, cohort_dir, stages, prior_stages) + s_resolved = populate_stage(s, input_dir, cohort_dir, stages, stage_configs, pre_parsed_stages) + pre_parsed_stages[s] = s_resolved if s_resolved["is_metadata"]: prior_metadata_stage = s_resolved else: prior_data_stage = s_resolved - prior_stages.append(s_resolved) + + logger.debug( + f"Parsing stage {stage_name}:\nResolved prior data stage: {prior_data_stage}\n" + f"Resolved prior metadata stage: {prior_metadata_stage}" + ) inferred_keys = { "is_metadata": "aggregations" in stage, @@ -145,7 +156,7 @@ def hydra_loguru_init() -> None: Must be called from a hydra main! """ hydra_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir - log.add(os.path.join(hydra_path, "main.log")) + logger.add(os.path.join(hydra_path, "main.log")) def write_lazyframe(df: pl.LazyFrame, out_fp: Path) -> None: diff --git a/tests/test_extraction.py b/tests/test_extraction.py index a256864..9343d17 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -245,14 +245,14 @@ def test_extraction(): # 4. Merge to the final output. extraction_config_kwargs = { - "raw_cohort_dir": str(raw_cohort_dir.resolve()), - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "input_dir": str(raw_cohort_dir.resolve()), + "cohort_dir": str(MEDS_cohort_dir.resolve()), "event_conversion_config_fp": str(event_cfgs_yaml.resolve()), - "split_fracs.train": 4 / 6, - "split_fracs.tuning": 1 / 6, - "split_fracs.held_out": 1 / 6, - "row_chunksize": 10, - "n_patients_per_shard": 2, + "stage_configs.split_and_shard_patients.split_fracs.train": 4 / 6, + "stage_configs.split_and_shard_patients.split_fracs.tuning": 1 / 6, + "stage_configs.split_and_shard_patients.split_fracs.held_out": 1 / 6, + "stage_configs.shard_events.row_chunksize": 10, + "stage_configs.split_and_shard_patients.n_patients_per_shard": 2, "hydra.verbose": True, } @@ -269,7 +269,7 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - subsharded_dir = MEDS_cohort_dir / "sub_sharded" + subsharded_dir = MEDS_cohort_dir / "shard_events" try: out_files = list(subsharded_dir.glob("**/*.parquet")) @@ -319,24 +319,30 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - splits_fp = MEDS_cohort_dir / "splits.json" - assert splits_fp.is_file(), f"Expected splits @ {str(splits_fp.resolve())} to exist." + try: + splits_fp = MEDS_cohort_dir / "splits.json" + assert splits_fp.is_file(), f"Expected splits @ {str(splits_fp.resolve())} to exist." 
- splits = json.loads(splits_fp.read_text()) - expected_keys = ["train/0", "train/1", "tuning/0", "held_out/0"] + splits = json.loads(splits_fp.read_text()) + expected_keys = ["train/0", "train/1", "tuning/0", "held_out/0"] - expected_keys_str = ", ".join(f"'{k}'" for k in expected_keys) - got_keys_str = ", ".join(f"'{k}'" for k in splits.keys()) + expected_keys_str = ", ".join(f"'{k}'" for k in expected_keys) + got_keys_str = ", ".join(f"'{k}'" for k in splits.keys()) - assert set(splits.keys()) == set(expected_keys), ( - f"Expected splits to have keys {expected_keys_str}.\n" f"Got keys: {got_keys_str}" - ) + assert set(splits.keys()) == set(expected_keys), ( + f"Expected splits to have keys {expected_keys_str}.\n" f"Got keys: {got_keys_str}" + ) - assert splits == EXPECTED_SPLITS, ( - f"Expected splits to be {EXPECTED_SPLITS}, got {splits}. NOTE THIS MAY CHANGE IF THE SEED OR " - "DATA CHANGES -- FAILURE HERE MAY BE JUST DUE TO A NON-DETERMINISTIC SPLIT AND THE TEST NEEDING " - "TO BE UPDATED." - ) + assert splits == EXPECTED_SPLITS, ( + f"Expected splits to be {EXPECTED_SPLITS}, got {splits}. NOTE THIS MAY CHANGE IF THE SEED OR " + "DATA CHANGES -- FAILURE HERE MAY BE JUST DUE TO A NON-DETERMINISTIC SPLIT AND THE TEST " + "NEEDING TO BE UPDATED." + ) + except AssertionError as e: + print("Failed to split patients") + print(f"stderr:\n{stderr}") + print(f"stdout:\n{stdout}") + raise e # Step 3: Extract the events and sub-shard by patient stderr, stdout = run_command( @@ -347,7 +353,7 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - patient_subsharded_folder = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_folder = MEDS_cohort_dir / "convert_to_sharded_events" assert patient_subsharded_folder.is_dir(), f"Expected {patient_subsharded_folder} to be a directory." for split, expected_outputs in SUB_SHARDED_OUTPUTS.items(): From 841f6617ae0fef2859cf935c011530f11d3155b4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 28 May 2024 10:56:14 -0400 Subject: [PATCH 06/47] Updated MIMIC examples --- MIMIC-IV_Example/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 535aa56..54ec4ce 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -84,8 +84,8 @@ This is a step in 4 parts: ```bash ./scripts/extraction/shard_events.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -95,8 +95,8 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes appro ```bash ./scripts/extraction/split_and_shard_patients.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -106,8 +106,8 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less ```bash ./scripts/extraction/convert_to_sharded_events.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -121,8 +121,8 @@ and performance is not necessary; however, for larger datasets, it can be. 
```bash ./scripts/extraction/merge_to_MEDS_cohort.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` From bbd673d558da55ab45fb674cda66f5b7ec55c3f9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 17:04:12 -0400 Subject: [PATCH 07/47] Added some content to README that still needs to be re-worked a bit --- README.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9669c48..d57b82b 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,92 @@ This package provides three things: ## Installation -For now, clone this repository and run `pip install -e .` from the repository root. To use the MIMIC-IV -example, install the optional MIMIC dependencies as well with `pip install -e .[mimic]`. +- For a base installation, clone this repository and run `pip install .` from the repository root. +- For running the MIMIC-IV example, install the optional MIMIC dependencies as well with `pip install .[mimic]`. +- To support same-machine, process-based parallelism, install the optional joblib dependencies with `pip install .[local_parallelism]`. +- To support cluster-based parallelism, install the optional submitit dependencies with `pip install .[slurm_parallelism]`. +- For working on development, install the optional development dependencies with `pip install .[dev,tests]`. +- Optional dependencies can be mutually installed by combining the optional dependency names with commas in + the square brackets, e.g., `pip install .[mimic,local_parallelism]`. + +## Usage -- High Level + +The MEDS ETL and pre-processing pipelines are designed to be run in a modular, stage-based manner, with each +stage of the pipeline being run as a separate script. For a single pipeline, all scripts will take the same +arguments by leveraging the same Hydra configuration file, and to run multiple workers on a single stage in +parallel, the user can launch the same script multiple times _without changing the arguments or configuration +file_, and the scripts will automatically handle the parallelism and avoid duplicative work. This permits +tremendous flexibility in how these pipelines can be run. + +- The user can run the entire pipeline in serial, through a single shell script simply by calling each + stage's script in sequence. +- The user can leverage arbitrary scheduling systems (e.g., Slurm, LSF, Kubernetes, etc.) to run each stage + in parallel on a cluster, by constructing the appropriate worker scripts to run each stage's script and + simply launching as many worker jobs as is desired (note this will typically required a distributed file + system to work correctly, as these scripts use manually created file locks to avoid duplicative work). +- The user can run each stage in parallel on a single machine by launching multiple copies of the same + script in different terminal sessions. This can result in a significant speedup depending on the machine + configuration as it ensures that parallelism can be used with minimal file read contention. 
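+
+As a concrete, minimal illustration of the "multiple copies of the same script" pattern in the list
+above (a sketch; the stage script and its arguments mirror the MIMIC-IV example in this repository):
+
+```bash
+# Launch three workers for one stage; the file locks described above ensure each shard
+# is claimed and processed exactly once. Each line could equally run in its own terminal.
+for i in 1 2 3; do
+    ./scripts/extraction/shard_events.py \
+        input_dir=$MIMICIV_PREMEDS_DIR \
+        cohort_dir=$MIMICIV_MEDS_DIR \
+        event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml &
+done
+wait
+```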
+ +Two of these methods of parallelism, in particular local-machine parallelism and slurm-based cluster +parallelism, are supported explicitly by this package through the use of the `joblib` and `submitit` Hydra +plugins and Hydra's multirun capabilities, which will be discussed in more detail below. + +By following this design convention, each individual stage of the pipeline can be kept extremely simple (often +each stage corresponds simply to a single short "dataframe" function), can be rigorously tested, can be cached +after completion to permit easy re-suming or re-running of the pipeline, and permits extremely flexible and +efficient (through parallelization) use of the pipeline in a variety of environments, all without imposing +significant complexity, overhead, or computational dependencies on the user. + +Below we walk through usage of this mechanism for both the ETL and the model-specific pre-processing +pipelines in more detail. + +### Scripts for the ETL Pipeline + +The ETL pipeline (which is more complete, and likely to be viable for a wider range of input datasets out of +the box) relies on the following configuration files and scripts: + +Configuration: `configs/extraction.yaml` -## MEDS ETL / Extraction Pipeline +```yaml +# The event conversion configuration file is used throughout the pipeline to define the events to extract. +event_conversion_config_fp: ??? + +stages: + - shard_events + - split_and_shard_patients + - convert_to_sharded_events + - merge_to_MEDS_cohort + +stage_configs: + shard_events: + row_chunksize: 200000000 + infer_schema_length: 10000 + split_and_shard_patients: + is_metadata: true + output_dir: ${cohort_dir} + n_patients_per_shard: 50000 + external_splits_json_fp: + split_fracs: + train: 0.8 + tuning: 0.1 + held_out: 0.1 + merge_to_MEDS_cohort: + output_dir: ${cohort_dir}/final_cohort +``` + +Scripts: + +1. `shard_events.py`: Shards the input data into smaller, event-level shards. +2. `split_and_shard_patients.py`: Splits the patient population into ML splits and shards these splits into + patient-level shards. +3. `convert_to_sharded_events.py`: Converts the input, event-level shards into the MEDS event format and + sub-shards them into patient-level sub-shards. +4. `merge_to_MEDS_cohort.py`: Merges the patient-level, event-level shards into full patient-level shards. + +See the `MIMIC-IV_Example` directory for a full, worked example of the ETL on MIMIC-IV v2.2. + +## MEDS ETL / Extraction Pipeline Details ### Overview From 67f8b6c2f8f87ece5929c10155ed47639c7076d8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 18:21:01 -0400 Subject: [PATCH 08/47] Added joint script demonstrating joblib launcher --- MIMIC-IV_Example/joint_script.sh | 44 ++++++++++++++++++++++++++++++ MIMIC-IV_Example/pre_MEDS.py | 9 ++++++ scripts/extraction/shard_events.py | 5 ++++ 3 files changed, 58 insertions(+) create mode 100755 MIMIC-IV_Example/joint_script.sh diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh new file mode 100755 index 0000000..ebd397b --- /dev/null +++ b/MIMIC-IV_Example/joint_script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +echo "Running pre-MEDS conversion." 
+./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/pre_MEDS.py b/MIMIC-IV_Example/pre_MEDS.py index 1f2f223..789b882 100755 --- a/MIMIC-IV_Example/pre_MEDS.py +++ b/MIMIC-IV_Example/pre_MEDS.py @@ -94,6 +94,11 @@ def main(cfg: DictConfig): pfx = get_shard_prefix(raw_cohort_dir, in_fp) out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir) + + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + out_fp.parent.mkdir(parents=True, exist_ok=True) if pfx not in FUNCTIONS: @@ -106,6 +111,10 @@ def main(cfg: DictConfig): continue else: out_fp = MEDS_input_dir / f"{pfx}.parquet" + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + fn, need_df = FUNCTIONS[pfx] if not need_df: st = datetime.now() diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index 5ccc36f..d0533e3 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -222,6 +222,11 @@ def main(cfg: DictConfig): input_files_to_subshard.append(f) seen_files.add(get_shard_prefix(raw_cohort_dir, f)) + if not input_files_to_subshard: + raise FileNotFoundError( + f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!" + ) + random.shuffle(input_files_to_subshard) subsharding_files_strs = "\n".join([f" * {str(fp.resolve())}" for fp in input_files_to_subshard]) From 7b585819ed27bc56433d8bc967e6ee70477a6fdf Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 21:00:22 -0400 Subject: [PATCH 09/47] Minor changes mostly to joint script --- MIMIC-IV_Example/joint_script.sh | 6 +++--- src/MEDS_polars_functions/utils.py | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index ebd397b..9d7ae69 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -13,7 +13,7 @@ echo "Running pre-MEDS conversion." 
echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ @@ -28,7 +28,7 @@ echo "Splitting patients in serial" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/convert_to_sharded_events.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ @@ -37,7 +37,7 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index 996673a..b2fbbb7 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -9,6 +9,8 @@ from loguru import logger from omegaconf import OmegaConf +pl.enable_string_cache() + def current_script_name() -> str: """Returns the name of the script that called this function. @@ -124,11 +126,6 @@ def populate_stage( else: prior_data_stage = s_resolved - logger.debug( - f"Parsing stage {stage_name}:\nResolved prior data stage: {prior_data_stage}\n" - f"Resolved prior metadata stage: {prior_metadata_stage}" - ) - inferred_keys = { "is_metadata": "aggregations" in stage, "data_input_dir": input_dir if prior_data_stage is None else prior_data_stage["output_dir"], From 8aa1db7824623cef16cb209fdd8112383914f505 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 08:49:33 -0400 Subject: [PATCH 10/47] Made the locking process more robust --- src/MEDS_polars_functions/mapper.py | 109 +++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/src/MEDS_polars_functions/mapper.py b/src/MEDS_polars_functions/mapper.py index deefd0d..34275b8 100644 --- a/src/MEDS_polars_functions/mapper.py +++ b/src/MEDS_polars_functions/mapper.py @@ -8,6 +8,79 @@ from loguru import logger +LOCK_TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" + + +def get_earliest_lock(cache_directory: Path) -> datetime | None: + """Returns the earliest start time of any lock file present in a cache directory, or None if none exist. + + Args: + cache_directory: The cache directory to check for the presence of a lock file. 
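+
+    Returns:
+        The earliest recorded lock start time across all lock files in the cache directory's "locks"
+        subdirectory, or None if no lock files exist.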
+ + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> empty_directory = root / "cache_empty" + >>> empty_directory.mkdir(exist_ok=True, parents=True) + >>> cache_directory = root / "cache_with_locks" + >>> locks_directory = cache_directory / "locks" + >>> locks_directory.mkdir(exist_ok=True, parents=True) + >>> time_1 = datetime(2021, 1, 1) + >>> time_1_str = time_1.strftime(LOCK_TIME_FMT) # "2021-01-01T00:00:00.000000" + >>> lock_fp_1 = locks_directory / f"{time_1_str}.json" + >>> _ = lock_fp_1.write_text(json.dumps({"start": time_1_str})) + >>> time_2 = datetime(2021, 1, 2, 3, 4, 5) + >>> time_2_str = time_2.strftime(LOCK_TIME_FMT) # "2021-01-02T03:04:05.000000" + >>> lock_fp_2 = locks_directory / f"{time_2_str}.json" + >>> _ = lock_fp_2.write_text(json.dumps({"start": time_2_str})) + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 1, 0, 0) + >>> get_earliest_lock(empty_directory) is None + True + >>> lock_fp_1.unlink() + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 2, 3, 4, 5) + >>> directory.cleanup() + """ + locks_directory = cache_directory / "locks" + + lock_times = [ + datetime.strptime(json.loads(lock_fp.read_text())["start"], LOCK_TIME_FMT) + for lock_fp in locks_directory.glob("*.json") + ] + + return min(lock_times) if lock_times else None + + +def register_lock(cache_directory: Path) -> tuple[datetime, Path]: + """Register a lock file in a cache directory. + + Args: + cache_directory: The cache directory to register a lock file in. + + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> cache_directory = root / "cache_with_locks" + >>> lock_time, lock_fp = register_lock(cache_directory) + >>> assert (datetime.now() - lock_time).total_seconds() < 1, "Lock time should be ~ now." + >>> lock_fp.is_file() + True + >>> lock_fp.read_text() == f'{{"start": "{lock_time.strftime(LOCK_TIME_FMT)}"}}' + True + >>> directory.cleanup() + """ + + lock_directory = cache_directory / "locks" + lock_directory.mkdir(exist_ok=True, parents=True) + + lock_time = datetime.now() + lock_fp = lock_directory / f"{lock_time.strftime(LOCK_TIME_FMT)}.json" + lock_fp.write_text(json.dumps({"start": lock_time.strftime(LOCK_TIME_FMT)})) + return lock_time, lock_fp + def wrap[ DF_T @@ -108,15 +181,15 @@ def wrap[ │ 3 ┆ 5 ┆ 12 │ └─────┴─────┴─────┘ >>> shutil.rmtree(cache_directory) - >>> lock_fp = cache_directory / "lock.json" - >>> assert not lock_fp.is_file() - >>> def lock_fp_checker_fn(df: pl.DataFrame) -> pl.DataFrame: - ... print(f"Lock fp exists? {lock_fp.is_file()}") + >>> lock_dir = cache_directory / "locks" + >>> assert not lock_dir.exists() + >>> def lock_dir_checker_fn(df: pl.DataFrame) -> pl.DataFrame: + ... print(f"Lock dir exists? {lock_dir.exists()}") ... return df >>> result_computed, out_df = wrap( - ... in_fp, out_fp, read_fn, write_fn, lock_fp_checker_fn, do_return=True + ... in_fp, out_fp, read_fn, write_fn, lock_dir_checker_fn, do_return=True ... ) - Lock fp exists? True + Lock dir exists? True >>> assert result_computed >>> out_df shape: (3, 3) @@ -146,21 +219,19 @@ def wrap[ cache_directory = out_fp.parent / f".{out_fp.stem}_cache" cache_directory.mkdir(exist_ok=True, parents=True) - st_time = datetime.now() - runtime_info = {"start": str(st_time)} + earliest_lock_time = get_earliest_lock(cache_directory) + if earliest_lock_time is not None: + logger.info(f"{out_fp} is in progress as of {earliest_lock_time}. 
Returning.")
+        return (False, None) if do_return else False
 
-    lock_fp = cache_directory / "lock.json"
-    if lock_fp.is_file():
-        started_at = json.loads(lock_fp.read_text())["start"]
-        logger.info(
-            f"{out_fp} is under construction as of {started_at} as {lock_fp} exists. " "Returning None."
-        )
-        if do_return:
-            return False, None
-        else:
-            return False
+    st_time, lock_fp = register_lock(cache_directory)
 
-    lock_fp.write_text(json.dumps(runtime_info))
+    logger.info(f"Registered lock at {st_time}. Double checking no earlier locks have been registered.")
+    earliest_lock_time = get_earliest_lock(cache_directory)
+    if earliest_lock_time < st_time:
+        logger.info(f"Earlier lock found at {earliest_lock_time}. Deleting current lock and returning.")
+        lock_fp.unlink()
+        return (False, None) if do_return else False
 
     logger.info(f"Reading input dataframe from {in_fp}")
     df = read_fn(in_fp)

From 42bc74e5b4ed9accea802c3d2ea68103c4ccac46 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Fri, 31 May 2024 09:53:22 -0400
Subject: [PATCH 11/47] Added a slurm script -- yet untested

---
 MIMIC-IV_Example/README.md             | 68 ++++++++++++++++++++++-
 MIMIC-IV_Example/joint_script_slurm.sh | 77 ++++++++++++++++++++++++++
 configs/pipeline.yaml                  |  4 +-
 3 files changed, 146 insertions(+), 3 deletions(-)
 create mode 100755 MIMIC-IV_Example/joint_script_slurm.sh

diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md
index 54ec4ce..f72f9b2 100644
--- a/MIMIC-IV_Example/README.md
+++ b/MIMIC-IV_Example/README.md
@@ -72,6 +72,7 @@ root directory of this repository):
 In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total.
 
 ## Step 3: Run the MEDS extraction ETL
+### Running locally, serially
 
 We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`.
 Note this is a different directory than the pre-MEDS directory (though, of course, they can both be
@@ -80,7 +81,7 @@ subdirectories of the same root directory). This is a step in 4 parts:
 
 1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers
-   performing this sub-sharding step.
+   performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers.
 
 ```bash
 ./scripts/extraction/shard_events.py \
@@ -126,6 +127,71 @@ and performance is not necessary; however, for larger datasets, it can be.
     event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml
 ```
 
+### Running Locally, in Parallel.
+This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib`
+launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e
+.[local_parallelism]`) and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args.
+
+### Running Each Step over Slurm
+To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the
+`submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for
+modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs
+to finish before moving on to the next stage. Let `$N_PARALLEL_WORKERS` be the number of desired workers.
+
+1. Sub-shard the raw files. 
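+   The `worker="range(0,$N_PARALLEL_WORKERS)"` sweep in the command below launches one slurm job per
+   worker.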
+ +```bash +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.name="${hydra.job.name}_${worker}" \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For MIMIC-IV, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. + +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + ## Limitations / TO-DOs: Currently, some tables are ignored, including: diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh new file mode 100755 index 0000000..d6db681 --- /dev/null +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +# Note we use `--multirun` throughout here due to ensure the submitit launcher is used throughout, so that +# this doesn't fall back on running anything locally in a setting where only slurm worker nodes have +# sufficient computational resources to run the actual jobs. + +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir=$MIMICIV_RAW_DIR \ + output_dir=$MIMICIV_PREMEDS_DIR + +echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
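+
+# The hydra.launcher.* overrides below control the slurm job resources for each worker (time limit in
+# minutes, CPUs per task, memory in GB, and partition); adjust these to match your cluster.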
+ +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 851afd7..29a2dfb 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,7 +2,7 @@ input_dir: ??? cohort_dir: ??? -log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" +log_dir: "${cohort_dir}/.logs" # General pipeline variables do_overwrite: False @@ -21,7 +21,7 @@ stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, $ # Hydra hydra: job: - name: "${stage}" + name: "${stage}/${worker}/${now:%Y-%m-%d_%H-%M-%S}" run: dir: "${log_dir}/${hydra.job.name}" sweep: From f8441684823971c5e08424e0ceddd1ae9133caee Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 15:46:27 -0400 Subject: [PATCH 12/47] Updates to pipeline.yaml --- MIMIC-IV_Example/joint_script_slurm.sh | 99 +++++++++++++------------- configs/pipeline.yaml | 6 +- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index d6db681..8ce85fb 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -echo "Running pre-MEDS conversion on one worker." 
-./MIMIC-IV_Example/pre_MEDS.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - raw_cohort_dir=$MIMICIV_RAW_DIR \ - output_dir=$MIMICIV_PREMEDS_DIR +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir=$MIMICIV_RAW_DIR \ +# output_dir=$MIMICIV_PREMEDS_DIR echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -33,45 +33,46 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." hydra.launcher.cpus_per_task=10 \ hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ + "hydra.job.env_copy=[PATH]" \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -echo "Splitting patients on one worker" -./scripts/extraction/split_and_shard_patients.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +#echo "Splitting patients on one worker" +#./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" 
+# +#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 29a2dfb..5694e25 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -21,8 +21,8 @@ stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, $ # Hydra hydra: job: - name: "${stage}/${worker}/${now:%Y-%m-%d_%H-%M-%S}" + name: "${stage}_${worker}_${now:%Y-%m-%d_%H-%M-%S}" run: - dir: "${log_dir}/${hydra.job.name}" + dir: "${log_dir}" sweep: - dir: "${log_dir}/${hydra.job.name}" + dir: "${log_dir}" From 6f910b9eb5504df196dea819101f69687a698991 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 15:49:18 -0400 Subject: [PATCH 13/47] cleaned files --- MIMIC-IV_Example/README.md | 6 +- MIMIC-IV_Example/joint_script.sh | 18 ++--- MIMIC-IV_Example/joint_script_slurm.sh | 102 ++++++++++++------------- scripts/extraction/shard_events.py | 4 +- 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index f72f9b2..16e4431 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -72,6 +72,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`. @@ -128,11 +129,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index 9d7ae69..eb58e89 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -8,21 +8,21 @@ N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir="$MIMICIV_RAW_DIR" output_dir="$MIMICIV_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" @@ -30,8 +30,8 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" @@ -39,6 +39,6 @@ echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 8ce85fb..3948e87 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$MIMICIV_RAW_DIR" \ + output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index d0533e3..9ce0ac9 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -223,9 +223,7 @@ def main(cfg: DictConfig): 
seen_files.add(get_shard_prefix(raw_cohort_dir, f)) if not input_files_to_subshard: - raise FileNotFoundError( - f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!" - ) + raise FileNotFoundError(f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!") random.shuffle(input_files_to_subshard) From 4eadda50ed118b35ecc2804d176ed1c09d504404 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 18:49:12 -0400 Subject: [PATCH 14/47] Not remotely working; moving to local for dev --- MIMIC-IV_Example/README.md | 3 +- eICU_Example/README.md | 216 +++++++++++++++++++++++ eICU_Example/configs/event_configs.yaml | 219 ++++++++++++++++++++++++ eICU_Example/configs/pre_MEDS.yaml | 11 ++ eICU_Example/joint_script.sh | 44 +++++ eICU_Example/joint_script_slurm.sh | 78 +++++++++ eICU_Example/pre_MEDS.py | 200 ++++++++++++++++++++++ eICU_Example/sbatch_joint_script.sh | 24 +++ pyproject.toml | 2 +- 9 files changed, 794 insertions(+), 3 deletions(-) create mode 100644 eICU_Example/README.md create mode 100644 eICU_Example/configs/event_configs.yaml create mode 100644 eICU_Example/configs/pre_MEDS.yaml create mode 100755 eICU_Example/joint_script.sh create mode 100755 eICU_Example/joint_script_slurm.sh create mode 100755 eICU_Example/pre_MEDS.py create mode 100644 eICU_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index f72f9b2..4056319 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -33,10 +33,9 @@ Download this repository and install the requirements: ```bash git clone git@github.com:mmcdermott/MEDS_polars_functions.git cd MEDS_polars_functions -git checkout MIMIC_IV conda create -n MEDS python=3.12 conda activate MEDS -pip install .[mimic] +pip install .[examples] ``` ## Step 1: Download MIMIC-IV diff --git a/eICU_Example/README.md b/eICU_Example/README.md new file mode 100644 index 0000000..b23ae9e --- /dev/null +++ b/eICU_Example/README.md @@ -0,0 +1,216 @@ +# eICU-CRD Example + +This is an example of how to extract a MEDS dataset from [eICU-CRD +v2.0](https://physionet.org/content/eicu-crd/2.0/). All scripts in this README are assumed to +be run **not** from this directory but from the root directory of this entire repository (e.g., one directory +up from this one). + +**Status**: This is a work in progress. The code is not yet functional. Remaining work includes: + +- [ ] Implementing the pre-MEDS processing step. + - [ ] Identifying the pre-MEDS steps for eICU +- [ ] Testing the pre-MEDS processing step on live eICU-CRD. + - [ ] Test that it runs at all. + - [ ] Test that the output is as expected. +- [ ] Check the installation instructions on a fresh client. +- [ ] Testing the `configs/event_configs.yaml` configuration on eICU-CRD +- [ ] Testing the MEDS extraction ETL runs on eICU-CRD (this should be expected to work, but needs + live testing). + - [ ] Sub-sharding + - [ ] Patient split gathering + - [ ] Event extraction + - [ ] Merging +- [ ] Validating the output MEDS cohort + - [ ] Basic validation + - [ ] Detailed validation + +## Step 0: Installation + +Download this repository and install the requirements: + +```bash +git clone git@github.com:mmcdermott/MEDS_polars_functions.git +cd MEDS_polars_functions +conda create -n MEDS python=3.12 +conda activate MEDS +pip install .[examples] +``` + +## Step 1: Download eICU + +Download the eICU-CRD dataset (version 2.0) from https://physionet.org/content/eicu-crd/2.0/ following the +instructions on that page. 
You will need the raw `.csv.gz` files for this example. We will use +`$EICU_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored -- e.g., +there should be a `hosp` and `icu` subdirectory of `$EICU_RAW_DIR`. + +## Step 2: Get the data ready for base MEDS extraction + +This is a step in a few parts: + +1. Join a few tables by `hadm_id` to get the right timestamps in the right rows for processing. In + particular, we need to join: + - TODO +2. Convert the patient's static data to a more parseable form. This entails: + - Get the patient's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and + `anchor_offset` fields. + - Merge the patient's `dod` with the `deathtime` from the `admissions` table. + +After these steps, modified files or symlinks to the original files will be written in a new directory which +will be used as the input to the actual MEDS extraction ETL. We'll use `$EICU_PREMEDS_DIR` to denote this +directory. + +To run this step, you can use the following script (assumed to be run **not** from this directory but from the +root directory of this repository): + +```bash +./eICU_Example/pre_MEDS.py raw_cohort_dir=$EICU_RAW_DIR output_dir=$EICU_PREMEDS_DIR +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +## Step 3: Run the MEDS extraction ETL +### Running locally, serially + +We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`. +Note this is a different directory than the pre-MEDS directory (though, of course, they can both be +subdirectories of the same root directory). + +This is a step in 4 parts: + +1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers + performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers. + +```bash +./scripts/extraction/shard_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For eICU, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. 
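+   Per the `merge_to_MEDS_cohort` stage configuration in `configs/extraction.yaml`, the merged output is
+   written to the `final_cohort` subdirectory of `$EICU_MEDS_DIR`.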
+ +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +### Running Locally, in Parallel. +This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e +.[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. + +### Running Each Step over Slurm +To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the +`submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for +modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs +to finish before moving on to the next stage. Let `$N_PARALLEL_WORKERS` be the number of desired workers + +1. Sub-shard the raw files. + +```bash +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.name="${hydra.job.name}_${worker}" \ + hydra.launcher.partition="short" \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For eICU, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. + +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +## Limitations / TO-DOs: + +Currently, some tables are ignored, including: + +1. `admissiondrug`: The [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) notes that this is + extremely infrequently used, so we skip it. +2. + + +Lots of questions remain about how to appropriately handle timestamps of the data -- e.g., things like HCPCS +events are stored at the level of the _date_, not the _datetime_. 
How should those be slotted into the +timeline which is otherwise stored at the _datetime_ resolution? + +Other questions: + +1. How to handle merging the deathtimes between the hosp table and the patients table? +2. How to handle the dob nonsense MIMIC has? + +## Future Work + +### Pre-MEDS Processing + +If you wanted, some other processing could also be done here, such as: + +1. Converting the patient's dynamically recorded race into a static, most commonly recorded race field. diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml new file mode 100644 index 0000000..af626d1 --- /dev/null +++ b/eICU_Example/configs/event_configs.yaml @@ -0,0 +1,219 @@ +# Note that there is no "patient_id" for eICU -- patients are only differentiable during the course of a +# single health system stay. Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" + +patient_id_col: patientHealthSystemStayID + +hosp/admissions: + ed_registration: + code: ED_REGISTRATION + timestamp: col(edregtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + ed_out: + code: ED_OUT + timestamp: col(edouttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + admission: + code: + - HOSPITAL_ADMISSION + - col(admission_type) + - col(admission_location) + timestamp: col(admittime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + insurance: insurance + language: language + marital_status: marital_status + race: race + hadm_id: hadm_id + discharge: + code: + - HOSPITAL_DISCHARGE + - col(discharge_location) + timestamp: col(dischtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + # We omit the death event here as it is joined to the data in the patients table in the pre-MEDS step. + #death: + # code: DEATH + # timestamp: col(deathtime) + # timestamp_format: "%Y-%m-%d %H:%M:%S" + # death_location: death_location + # death_type: death_type + +hosp/diagnoses_icd: + diagnosis: + code: + - DIAGNOSIS + - ICD + - col(icd_version) + - col(icd_code) + hadm_id: hadm_id + timestamp: col(hadm_discharge_time) + timestamp_format: "%Y-%m-%d %H:%M:%S" + +hosp/drgcodes: + drg: + code: + - DRG + - col(drg_type) + - col(drg_code) + - col(description) + hadm_id: hadm_id + timestamp: col(hadm_discharge_time) + timestamp_format: "%Y-%m-%d %H:%M:%S" + drg_severity: drg_severity + drg_mortality: drg_mortality + +hosp/emar: + medication: + code: + - MEDICATION + - col(medication) + - col(event_txt) + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + emar_id: emar_id + emar_seq: emar_seq + +hosp/hcpcsevents: + hcpcs: + code: + - HCPCS + - col(short_description) + hadm_id: hadm_id + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/labevents: + lab: + code: + - LAB + - col(itemid) + - col(valueuom) + hadm_id: hadm_id + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + numerical_value: valuenum + text_value: value + priority: priority + +hosp/omr: + omr: + code: col(result_name) + text_value: col(result_value) + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/patients: + gender: + code: + - GENDER + - col(gender) + timestamp: null + dob: + code: DOB + timestamp: col(year_of_birth) + timestamp_format: "%Y" + death: + code: DEATH + timestamp: col(dod) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + +hosp/pharmacy: + medication_start: + code: + - MEDICATION + - START + - col(medication) + timestamp: col(starttime) + route: route + frequency: frequency + doses_per_24_hrs: doses_per_24_hrs + poe_id: poe_id + 
timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + medication_stop: + code: + - MEDICATION + - STOP + - col(medication) + timestamp: col(stoptime) + poe_id: poe_id + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + +hosp/procedures_icd: + procedure: + code: + - PROCEDURE + - ICD + - col(icd_version) + - col(icd_code) + hadm_id: hadm_id + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/transfers: + transfer: + code: + - TRANSFER_TO + - col(eventtype) + - col(careunit) + timestamp: col(intime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + +icu/icustays: + icu_admission: + code: + - ICU_ADMISSION + - col(first_careunit) + timestamp: col(intime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + icu_discharge: + code: + - ICU_DISCHARGE + - col(last_careunit) + timestamp: col(outtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + +icu/chartevents: + event: + code: + - LAB + - col(itemid) + - col(valueuom) + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + numerical_value: valuenum + text_value: value + hadm_id: hadm_id + icustay_id: stay_id + +icu/procedureevents: + start: + code: + - PROCEDURE + - START + - col(itemid) + timestamp: col(starttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + end: + code: + - PROCEDURE + - END + - col(itemid) + timestamp: col(endtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id diff --git a/eICU_Example/configs/pre_MEDS.yaml b/eICU_Example/configs/pre_MEDS.yaml new file mode 100644 index 0000000..b5cfa4c --- /dev/null +++ b/eICU_Example/configs/pre_MEDS.yaml @@ -0,0 +1,11 @@ +raw_cohort_dir: ??? +output_dir: ??? + +# Hydra +hydra: + job: + name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${output_dir}/.logs/${hydra.job.name} + sweep: + dir: ${output_dir}/.logs/${hydra.job.name} diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh new file mode 100755 index 0000000..9d7ae69 --- /dev/null +++ b/eICU_Example/joint_script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +echo "Running pre-MEDS conversion." 
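+# TODO: this script is currently a verbatim copy of the MIMIC-IV joint script; it still invokes the
+# MIMIC-IV pre-MEDS script, variable names, and event config, and has not yet been adapted to eICU.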
+./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh new file mode 100755 index 0000000..8ce85fb --- /dev/null +++ b/eICU_Example/joint_script_slurm.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +# Note we use `--multirun` throughout here due to ensure the submitit launcher is used throughout, so that +# this doesn't fall back on running anything locally in a setting where only slurm worker nodes have +# sufficient computational resources to run the actual jobs. + +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir=$MIMICIV_RAW_DIR \ +# output_dir=$MIMICIV_PREMEDS_DIR + +echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
+ +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + "hydra.job.env_copy=[PATH]" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + +#echo "Splitting patients on one worker" +#./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py new file mode 100755 index 0000000..bf0204c --- /dev/null +++ b/eICU_Example/pre_MEDS.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python + +"""Performs pre-MEDS data wrangling for eICU.""" +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import gzip +from datetime import datetime +from pathlib import Path + +import hydra +import polars as pl +from loguru import logger +from omegaconf import DictConfig + +from MEDS_polars_functions.utils import ( + get_shard_prefix, + hydra_loguru_init, + write_lazyframe, +) + +GLOBAL_TIME_ROOT = datetime(2024, 1, 1) + + +def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: + """Load a raw MIMIC file into a Polars DataFrame. + + Args: + fp: The path to the MIMIC file. + + Returns: + The Polars DataFrame containing the MIMIC data. + """ + + with gzip.open(fp, mode="rb") as f: + return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy() + +def process_patients_table(df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the patients table and converts it to a form that includes timestamps + + As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true + timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME + DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES. 
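+
+    Args:
+        df: The raw eICU `patient` table, as a Polars LazyFrame.
+
+    Returns:
+        The table with a constant `healthSystemAdmitTimestamp` column (set to `GLOBAL_TIME_ROOT`) added and
+        the relevant patient- and unit-stay-level columns selected.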
+    """
+
+    return (
+        df
+        .with_columns(
+            pl.lit(GLOBAL_TIME_ROOT, dtype=pl.Datetime).alias("healthSystemAdmitTimestamp"),
+        )
+        .select(
+            "patientHealthSystemStayID",
+            "gender",
+            "age",
+            "ethnicity",
+            # Unit stay parameters
+            "patientUnitStayID",  # The unit stay ID
+            pl.col("healthSystemAdmitTimestamp"),
+            "hospitalID",
+            "admissionHeight",
+            # "apacheAdmissionDx", This we grab from `admissiondx` later instead.
+        )
+    )
+
+
+##### MIMIC STUFF --- OLD #####
+
+def add_discharge_time_by_hadm_id(
+    df: pl.LazyFrame, discharge_time_df: pl.LazyFrame, out_column_name: str = "hadm_discharge_time"
+) -> pl.LazyFrame:
+    """Joins the two dataframes by ``"hadm_id"`` and adds the discharge time to the original dataframe."""
+
+    discharge_time_df = discharge_time_df.select("hadm_id", pl.col("dischtime").alias(out_column_name))
+    return df.join(discharge_time_df, on="hadm_id", how="left")
+
+
+def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) -> pl.LazyFrame:
+    """Fixes the static data by adding the death time to the static data and fixes the DOB nonsense.
+
+    Args:
+        raw_static_df: The raw static data.
+        death_times_df: The death times data.
+
+    Returns:
+        The fixed static data.
+    """
+
+    death_times_df = death_times_df.group_by("subject_id").agg(pl.col("deathtime").min())
+
+    return raw_static_df.join(death_times_df, on="subject_id", how="left").select(
+        "subject_id",
+        pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"),
+        (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"),
+        "gender",
+    )
+
+
+FUNCTIONS = {
+    "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])),
+    "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])),
+    "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])),
+}
+
+
+@hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS")
+def main(cfg: DictConfig):
+    """Performs pre-MEDS data wrangling for eICU.
+
+    Inputs are the raw eICU files, read from the `raw_cohort_dir` config parameter. Output files are either
+    symlinked (if they are not modified) or written in processed form to the `output_dir` config
+    parameter. Hydra is used to manage configuration parameters and logging.
+    """
+
+    hydra_loguru_init()
+
+    raw_cohort_dir = Path(cfg.raw_cohort_dir)
+    MEDS_input_dir = Path(cfg.output_dir)
+
+    all_fps = list(raw_cohort_dir.glob("**/*.csv.gz"))
+
+    dfs_to_load = {}
+
+    for in_fp in all_fps:
+        pfx = get_shard_prefix(raw_cohort_dir, in_fp)
+
+        out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir)
+
+        if out_fp.is_file():
+            print(f"Done with {pfx}. Continuing")
+            continue
+
+        out_fp.parent.mkdir(parents=True, exist_ok=True)
+
+        if pfx not in FUNCTIONS:
+            logger.info(
+                f"No function needed for {pfx}: "
+                f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}"
+            )
+            relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True)
+            out_fp.symlink_to(relative_in_fp)
+            continue
+        else:
+            out_fp = MEDS_input_dir / f"{pfx}.parquet"
+            if out_fp.is_file():
+                print(f"Done with {pfx}. 
Continuing") + continue + + fn, need_df = FUNCTIONS[pfx] + if not need_df: + st = datetime.now() + logger.info(f"Processing {pfx}...") + df = load_raw_eicu_file(in_fp) + logger.info(f" Loaded raw {in_fp} in {datetime.now() - st}") + processed_df = fn(df) + write_lazyframe(processed_df, out_fp) + logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") + else: + needed_pfx, needed_cols = need_df + if needed_pfx not in dfs_to_load: + dfs_to_load[needed_pfx] = {"fps": set(), "cols": set()} + + dfs_to_load[needed_pfx]["fps"].add(in_fp) + dfs_to_load[needed_pfx]["cols"].update(needed_cols) + + for df_to_load_pfx, fps_and_cols in dfs_to_load.items(): + fps = fps_and_cols["fps"] + cols = list(fps_and_cols["cols"]) + + df_to_load_fp = raw_cohort_dir / f"{df_to_load_pfx}.csv.gz" + + st = datetime.now() + + logger.info(f"Loading {str(df_to_load_fp.resolve())} for manipulating other dataframes...") + df = load_raw_eicu_file(df_to_load_fp, columns=cols) + logger.info(f" Loaded in {datetime.now() - st}") + + for fp in fps: + pfx = get_shard_prefix(raw_cohort_dir, fp) + out_fp = MEDS_input_dir / f"{pfx}.parquet" + + logger.info(f" Processing dependent df @ {pfx}...") + fn, _ = FUNCTIONS[pfx] + + fp_st = datetime.now() + logger.info(f" Loading {str(fp.resolve())}...") + fp_df = load_raw_eicu_file(fp) + logger.info(f" Loaded in {datetime.now() - fp_st}") + processed_df = fn(fp_df, df) + write_lazyframe(processed_df, out_fp) + logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - fp_st}") + + logger.info(f"Done! All dataframes processed and written to {str(MEDS_input_dir.resolve())}") + + +if __name__ == "__main__": + main() diff --git a/eICU_Example/sbatch_joint_script.sh b/eICU_Example/sbatch_joint_script.sh new file mode 100644 index 0000000..e031363 --- /dev/null +++ b/eICU_Example/sbatch_joint_script.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +#SBATCH -c 10 # Request one core +#SBATCH -t 0-03:00 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=300GB # Memory total in MiB (for all cores) +#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j) +#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j) + +cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +LOG_DIR="$MIMICIV_MEDS_DIR/.logs" + +echo "Running with saving to $LOG_DIR" + +mkdir -p $LOG_DIR + +PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \ + time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt diff --git a/pyproject.toml b/pyproject.toml index 29bba91..25b9527 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ dependencies = ["polars", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy"] [project.optional-dependencies] -mimic = ["rootutils"] +examples = ["rootutils"] dev = ["pre-commit"] tests = ["pytest", "pytest-cov[toml]", "rootutils"] local_parallelism = ["hydra-joblib-launcher"] From 7c2e7677c09289b4cf358346b9223d8934d7f98f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 20:04:22 -0400 Subject: [PATCH 15/47] Updated configs and added a resolver to get informative help messages from the right sources --- configs/extraction.yaml | 34 ++++++++++++++++++++++++++++++ configs/pipeline.yaml | 
15 +++++++++++++
 configs/preprocess.yaml            |  1 +
 src/MEDS_polars_functions/utils.py | 16 ++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/configs/extraction.yaml b/configs/extraction.yaml
index e1e985a..b762894 100644
--- a/configs/extraction.yaml
+++ b/configs/extraction.yaml
@@ -1,5 +1,18 @@
 defaults:
   - pipeline
+  - _self_
+
+description: |-
+  This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select
+  criteria and converts them to the flattened, MEDS format. It can be run in its entirety, with controllable
+  levels of parallelism, or in stages. Arguments:
+  - `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines
+    the events to extract from the various rows of the various input files encountered in the global input
+    directory.
+  - `input_dir`: The path to the directory containing the raw input files.
+  - `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in
+    various subfolders of this dir depending on the stage, as intermediate stages cache their output during
+    computation for efficiency of re-running and distributing.

 # The event conversion configuration file is used throughout the pipeline to define the events to extract.
 event_conversion_config_fp: ???
@@ -12,9 +25,22 @@ stages:

 stage_configs:
   shard_events:
+    description: |-
+      This stage shards the raw input events into smaller files for easier processing. Arguments:
+      - `row_chunksize`: The number of rows to read in at a time.
+      - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source
+        files are csvs).
     row_chunksize: 200000000
     infer_schema_length: 10000
   split_and_shard_patients:
+    description: |-
+      This stage splits the patients into training, tuning, and held-out sets, and further splits those sets
+      into shards. Arguments:
+      - `n_patients_per_shard`: The number of patients to include in a shard.
+      - `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially
+        held-out test sets beyond the IID held out set that will be produced (e.g., for prospective
+        datasets, etc.).
+      - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets.
     is_metadata: True
     output_dir: ${cohort_dir}
     n_patients_per_shard: 50000
@@ -24,4 +50,12 @@ stage_configs:
       tuning: 0.1
       held_out: 0.1
   merge_to_MEDS_cohort:
+    description: |-
+      This stage merges the intermediate, subsharded MEDS files produced by the conversion stage into the
+      final MEDS cohort shards. Arguments:
+      - `output_dir`: The directory in which the final, merged MEDS cohort will be written (here, the
+        `final_cohort` subdirectory of the global `cohort_dir`).
     output_dir: ${cohort_dir}/final_cohort
diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml
index 5694e25..857785f 100644
--- a/configs/pipeline.yaml
+++ b/configs/pipeline.yaml
@@ -2,6 +2,10 @@
 input_dir: ???
 cohort_dir: ???

+_default_description: |-
+  This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline
+  configuration file.
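+# NOTE: `_default_description` is consumed as a fallback by the `hydra.help` template added later in this
+# file, via `${oc.select:description, ${_default_description}}`.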
+ log_dir: "${cohort_dir}/.logs" # General pipeline variables @@ -26,3 +30,14 @@ hydra: dir: "${log_dir}" sweep: dir: "${log_dir}" + help: + app_name: "MEDS/${stage}" + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines. + + **Pipeline description:** + ${oc.select:description, ${_default_description}} + + **Stage description:** + ${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}} diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 9b60579..d65150b 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,5 +1,6 @@ defaults: - pipeline + - _self_ # Global pipeline parameters: # 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..11d738a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import inspect import os import sys from pathlib import Path @@ -12,6 +13,20 @@ pl.enable_string_cache() +def get_script_docstring() -> str: + """Returns the docstring of the main function of the script that was called. + + Returns: + str: TODO + """ + + main_module = sys.modules["__main__"] + func = getattr(main_module, "main", None) + if func and callable(func): + return inspect.getdoc(func) or "" + return "" + + def current_script_name() -> str: """Returns the name of the script that called this function. @@ -143,6 +158,7 @@ def populate_stage( return out +OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False) OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) From bc78cd448e95738413ad8888d563dbd3c926b8e5 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 09:03:55 -0400 Subject: [PATCH 16/47] Starting eICU scripts and configs --- eICU_Example/configs/event_configs.yaml | 266 +++++------------------- eICU_Example/pre_MEDS.py | 198 +++++++++++++----- 2 files changed, 202 insertions(+), 262 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index af626d1..9d467f6 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -3,217 +3,59 @@ patient_id_col: patientHealthSystemStayID -hosp/admissions: - ed_registration: - code: ED_REGISTRATION - timestamp: col(edregtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - ed_out: - code: ED_OUT - timestamp: col(edouttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - admission: - code: - - HOSPITAL_ADMISSION - - col(admission_type) - - col(admission_location) - timestamp: col(admittime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - insurance: insurance - language: language - marital_status: marital_status - race: race - hadm_id: hadm_id - discharge: - code: - - HOSPITAL_DISCHARGE - - col(discharge_location) - timestamp: col(dischtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - # We omit the death event here as it is joined to the data in the patients table in the pre-MEDS step. 
- #death: - # code: DEATH - # timestamp: col(deathtime) - # timestamp_format: "%Y-%m-%d %H:%M:%S" - # death_location: death_location - # death_type: death_type - -hosp/diagnoses_icd: - diagnosis: - code: - - DIAGNOSIS - - ICD - - col(icd_version) - - col(icd_code) - hadm_id: hadm_id - timestamp: col(hadm_discharge_time) - timestamp_format: "%Y-%m-%d %H:%M:%S" - -hosp/drgcodes: - drg: - code: - - DRG - - col(drg_type) - - col(drg_code) - - col(description) - hadm_id: hadm_id - timestamp: col(hadm_discharge_time) - timestamp_format: "%Y-%m-%d %H:%M:%S" - drg_severity: drg_severity - drg_mortality: drg_mortality - -hosp/emar: - medication: - code: - - MEDICATION - - col(medication) - - col(event_txt) - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - emar_id: emar_id - emar_seq: emar_seq - -hosp/hcpcsevents: - hcpcs: - code: - - HCPCS - - col(short_description) - hadm_id: hadm_id - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/labevents: - lab: - code: - - LAB - - col(itemid) - - col(valueuom) - hadm_id: hadm_id - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - numerical_value: valuenum - text_value: value - priority: priority - -hosp/omr: - omr: - code: col(result_name) - text_value: col(result_value) - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/patients: +patient: + dob: + code: "DOB" + timestamp: "dateOfBirth" + uniquepid: "uniquepid" gender: - code: - - GENDER - - col(gender) + code: ["GENDER", "col(gender)"] timestamp: null - dob: - code: DOB - timestamp: col(year_of_birth) - timestamp_format: "%Y" - death: - code: DEATH - timestamp: col(dod) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - -hosp/pharmacy: - medication_start: - code: - - MEDICATION - - START - - col(medication) - timestamp: col(starttime) - route: route - frequency: frequency - doses_per_24_hrs: doses_per_24_hrs - poe_id: poe_id - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - medication_stop: - code: - - MEDICATION - - STOP - - col(medication) - timestamp: col(stoptime) - poe_id: poe_id - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - -hosp/procedures_icd: - procedure: - code: - - PROCEDURE - - ICD - - col(icd_version) - - col(icd_code) - hadm_id: hadm_id - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/transfers: - transfer: - code: - - TRANSFER_TO - - col(eventtype) - - col(careunit) - timestamp: col(intime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - -icu/icustays: - icu_admission: - code: - - ICU_ADMISSION - - col(first_careunit) - timestamp: col(intime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - icu_discharge: - code: - - ICU_DISCHARGE - - col(last_careunit) - timestamp: col(outtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - -icu/chartevents: - event: - code: - - LAB - - col(itemid) - - col(valueuom) - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - numerical_value: valuenum - text_value: value - hadm_id: hadm_id - icustay_id: stay_id - -icu/procedureevents: - start: - code: - - PROCEDURE - - START - - col(itemid) - timestamp: col(starttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - end: - code: - - PROCEDURE - - END - - col(itemid) - timestamp: col(endtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id + ethnicity: + code: ["ETHNICITY", "col(ethnicity)"] + timestamp: null + 
hosp_admission:
+    code:
+      - "HOSPITAL_ADMISSION"
+      - col("hospitalAdmitSource")
+      - col("hospitalRegion")
+      - col("hospitalTeachingStatus")
+      - col("hospitalNumBedsCategory")
+    timestamp: "hospitalAdmitTimestamp"
+    hospital_id: "hospitalID"
+  hosp_discharge:
+    code:
+      - "HOSPITAL_DISCHARGE"
+      - col("hospitalDischargeStatus")
+      - col("hospitalDischargeLocation")
+    timestamp: "hospitalDischargeTimestamp"
+  unit_admission:
+    code:
+      - "UNIT_ADMISSION"
+      - col("unitAdmitSource")
+      - col("unitStayType")
+    timestamp: "unitAdmitTimestamp"
+    ward_id: "wardID"
+  unit_admission_weight:
+    code:
+      - "UNIT_ADMISSION_WEIGHT"
+    timestamp: "unitAdmitTimestamp"
+    numerical_value: "unitAdmissionWeight"
+  unit_admission_height:
+    code:
+      - "UNIT_ADMISSION_HEIGHT"
+    timestamp: "unitAdmitTimestamp"
+    numerical_value: "unitAdmissionHeight"
+  unit_discharge:
+    code:
+      - "UNIT_DISCHARGE"
+      - col("unitDischargeStatus")
+      - col("unitDischargeLocation")
+    timestamp: "unitDischargeTimestamp"
+  unit_discharge_weight:
+    code:
+      - "UNIT_DISCHARGE_WEIGHT"
+    timestamp: "unitDischargeTimestamp"
+    numerical_value: "unitDischargeWeight"
+
+timestamp_format: "%Y-%m-%d %H:%M:%S"
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py
index bf0204c..c78d506 100755
--- a/eICU_Example/pre_MEDS.py
+++ b/eICU_Example/pre_MEDS.py
@@ -1,13 +1,18 @@
 #!/usr/bin/env python
-"""Performs pre-MEDS data wrangling for eICU."""
+"""Performs pre-MEDS data wrangling for eICU.
+
+See the docstring of `main` for more information.
+"""

 import rootutils

 root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)

 import gzip
+from collections.abc import Callable, Sequence
 from datetime import datetime
 from pathlib import Path
+from typing import NamedTuple

 import hydra
 import polars as pl
@@ -20,7 +25,13 @@
     write_lazyframe,
 )

-GLOBAL_TIME_ROOT = datetime(2024, 1, 1)
+HEALTH_SYSTEM_STAY_ID = "patientHealthSystemStayID"
+UNIT_STAY_ID = "patientUnitStayID"
+PATIENT_ID = "uniquepid"
+
+# The end of year date, used for year-only timestamps in eICU. The time is set to midnight as we'll add a
+# 24-hour time component from other columns in the data.
+END_OF_YEAR = {"month": 12, "day": 31, "hour": 0, "minute": 0, "second": 0}


 def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame:
@@ -36,85 +47,172 @@
     with gzip.open(fp, mode="rb") as f:
         return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy()

-def process_patients_table(df: pl.LazyFrame) -> pl.LazyFrame:
-    """Takes the patients table and converts it to a form that includes timestamps
-
-    As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true
-    timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME
-    DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES.
-    """

+def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str):
+    expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S")
+
+    time_deltas_min = (pseudotime_col.dt.time() - expected_time).dt.total_minutes()

-    return (
-        df
-        .with_columns(
-            pl.lit(GLOBAL_TIME_ROOT, dtype=pl.Datetime).alias("healthSystemAdmitTimestamp"),
+    # Check that the time deltas are all within 1 minute
+    logger.info(
+        f"Checking that stated 24h times are consistent given offsets between {pseudotime_col.name} and "
+        f"{given_24htime_col}..."
+ ) + max_time_deltas_min = df.select(time_deltas_min.abs().max()).collect().item() + if max_time_deltas_min > 1: + raise ValueError( + f"Max number of minutes between {pseudotime_col.name} and {given_24htime_col} is " + f"{max_time_deltas_min}. Should be <= 1." ) - .select( - "patientHealthSystemStayID", - "gender", - "age", - "ethnicity", - # Unit stay parameters - "patientUnitStayID", # The unit stay ID - pl.col("healthSystemAdmitTimestamp") - "hospitalID", - "admissionHeight", - # "apacheAdmissionDx", This we grab from `admissiondx` later instead. +def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the patient table and converts it to a form that includes timestamps. + As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true + timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME + DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES. + The output of this process is ultimately converted to events via the `patient` key in the + `configs/event_configs.yaml` file. + """ + hospital_discharge_pseudotime = pl.datetime(year=pl.col("hospitalDischargeYear"), **END_OF_YEAR) + pl.col( + "hospitalDischargeTime24" + ).str.strptime(pl.Time, "%H:%M:%S") + unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( + minutes=pl.col("hospitalDischargeOffset") + ) -##### MIMIC STUFF --- OLD ##### + unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitDischargeOffset")) -def add_discharge_time_by_hadm_id( - df: pl.LazyFrame, discharge_time_df: pl.LazyFrame, out_column_name: str = "hadm_discharge_time" -) -> pl.LazyFrame: - """Joins the two dataframes by ``"hadm_id"`` and adds the discharge time to the original dataframe.""" + hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitalAdmitOffset")) - discharge_time_df = discharge_time_df.select("hadm_id", pl.col("dischtime").alias(out_column_name)) - return df.join(discharge_time_df, on="hadm_id", how="left") + age_in_years = pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16)) + age_in_days = age_in_years * 365.25 + # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate + pseudo_date_of_birth = unit_admit_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) + # Check the times + start = datetime.now() + logger.info( + "Checking that the 24h times are consistent. If this is extremely slow, consider refactoring to have " + "only one `.collect()` call." + ) + check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitalDischargeTime24") + check_timestamps_agree(df, hospital_admit_pseudotime, "hospitalAdmitTime24") + check_timestamps_agree(df, unit_admit_pseudotime, "unitAdmitTime24") + check_timestamps_agree(df, unit_discharge_pseudotime, "unitDischargeTime24") + logger.info(f"Validated 24h times in {datetime.now() - start}") + + logger.warning("NOT validating the `unitVisitNumber` column as that isn't implemented yet.") + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING. Check with the eICU team:\n" + " - `apacheAdmissionDx` is not selected from the patients table as we grab it from `admissiondx`. " + "Is this right?\n" + " - `admissionHeight` and `admissionWeight` are interpreted as **unit** admission height/weight, " + "not hospital admission height/weight. Is this right?\n" + " - `age` is interpreted as the age at the time of the unit stay, not the hospital stay. 
" + "Is this right?\n" + " - `What is the actual mean age for those > 89? Here we assume 90.\n" + ) -def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) -> pl.LazyFrame: - """Fixes the static data by adding the death time to the static data and fixes the DOB nonsense. + return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( + # 1. Static variables + "uniquepid", + "gender", + pseudo_date_of_birth.alias("dateOfBirth"), + "ethnicity", + # 2. Health system stay parameters + "patientHealthSystemStayID", + "hospitalID", + pl.col("numbedscategory").alias("hospitalNumBedsCategory"), + pl.col("teachingstatus").alias("hospitalTeachingStatus"), + pl.col("region").alias("hospitalRegion"), + # 2.1 Admission parameters + hospital_admit_pseudotime.alias("hospitalAdmitTimestamp"), + "hospitalAdmitSource", + # 2.2 Discharge parameters + hospital_discharge_pseudotime.alias("hospitalDischargeTimestamp"), + "hospitalDischargeLocation", + "hospitalDischargeStatus", + # 3. Unit stay parameters + "patientUnitStayID", # The unit stay ID + "wardID", + # 3.1 Admission parameters + unit_admit_pseudotime.alias("unitAdmitTimestamp"), + "unitAdmitSource", + "unitStayType", + pl.col("admissionHeight").alias("unitAdmissionHeight"), + pl.col("admissionWeight").alias("unitAdmissionWeight"), + # 3.2 Discharge parameters + unit_discharge_pseudotime.alias("unitDischargeTimestamp"), + "unitDischargeLocation", + "unitDischargeStatus", + pl.col("dischargeWeight").alias("unitDischargeWeight"), + ) - Args: - raw_static_df: The raw static data. - death_times_df: The death times data. - Returns: - The fixed static data. - """ +class PreProcessor(NamedTuple): + """A preprocessor function and its dependencies. - death_times_df = death_times_df.group_by("subject_id").agg(pl.col("deathtime").min()) + Args: + function: TODO + dependencies: A two-element tuple containing the prefix of the dependent dataframe and a list of + columns needed from that dataframe. + """ - return raw_static_df.join(death_times_df, on="subject_id", how="left").select( - "subject_id", - pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"), - (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"), - "gender", - ) + function: Callable[[Sequence[pl.LazyFrame]], pl.LazyFrame] + dependencies: tuple[str, list[str]] -FUNCTIONS = { - "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), - "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), - "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), +FUNCTIONS: dict[str, PreProcessor] = { + "patient": PreProcessor( + process_patient_table, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + ), } +# From MIMIC +# "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), +# "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), +# "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), + @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): """Performs pre-MEDS data wrangling for eICU. - Inputs are the raw MIMIC files, read from the `raw_cohort_dir` config parameter. Output files are either + Inputs are the raw eICU files, read from the `raw_cohort_dir` config parameter. 
Output files are either symlinked (if they are not modified) or written in processed form to the `MEDS_input_dir` config parameter. Hydra is used to manage configuration parameters and logging. + + Note that eICU has only a tentative ability to identify true relative admission times for even the same + patient, as health system stay IDs are only temporally ordered at the *year* level. As such, to properly + parse this dataset in a longitudinal form, you must do one of the following: + 1. Not operate at the level of patients at all, but instead at the level of health system stays, as + individual events within a health system stay can be well ordered. + 2. Restrict the analysis to only patients who do not have multiple health system stays within a single + year (as health system stays across years can be well ordered, provided we assume to distinct stays + within a single health system cannot overlap). + + In this pipeline, we choose to operate at the level of health system stays, as this is the most general + approach. The only downside is that we lose the ability to track individual patients across health system + stays, and thus can only explore questions of limited longitudinal scope. + + We ignore the following tables for the given reasons: + 1. `admissiondrug`: This table is noted in the + [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) as being "Extremely infrequently + used". + + Args (all as part of the config file): + raw_cohort_dir: The directory containing the raw eICU files. + output_dir: The directory to write the processed files to. """ + raise NotImplementedError("This script is not yet implemented for eICU.") + hydra_loguru_init() raw_cohort_dir = Path(cfg.raw_cohort_dir) From 4c7e2cb34a2fe89d5653b227c06884bf6b863526 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 09:17:06 -0400 Subject: [PATCH 17/47] Added (again untested) allergy table --- eICU_Example/configs/event_configs.yaml | 18 ++++++- eICU_Example/pre_MEDS.py | 69 +++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 9d467f6..9d7177b 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -36,6 +36,7 @@ patient: - col("unitStayType") timestamp: "unitAdmitTimestamp" ward_id: "wardID" + unit_stay_id: "patientUnitStayID" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" @@ -58,4 +59,19 @@ patient: timestamp: "unitDischargeTimestamp" numerical_value: "unitDischargeWeight" -timestamp_format: "%Y-%m-%d %H:%M:%S" +admissiondx: + admission_diagnosis: + code: + - "ADMISSION_DX" + - col("admitDxName") + timestamp: "admitDxEnteredTimestamp" + admission_dx_id: "admitDxID" + unit_stay_id: "patientUnitStayID" + +allergy: + allergy: + code: + - "ALLERGY" + - col("allergyType") + - col("allergyName") + timestamp: "allergyEnteredTimestamp" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index c78d506..38a56d8 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -66,7 +66,7 @@ def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24ht ) -def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: +def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: """Takes the patient table and converts it to a form that includes timestamps. 
As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true @@ -121,12 +121,12 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( # 1. Static variables - "uniquepid", + PATIENT_ID, "gender", pseudo_date_of_birth.alias("dateOfBirth"), "ethnicity", # 2. Health system stay parameters - "patientHealthSystemStayID", + HEALTH_SYSTEM_STAY_ID, "hospitalID", pl.col("numbedscategory").alias("hospitalNumBedsCategory"), pl.col("teachingstatus").alias("hospitalTeachingStatus"), @@ -139,7 +139,7 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz "hospitalDischargeLocation", "hospitalDischargeStatus", # 3. Unit stay parameters - "patientUnitStayID", # The unit stay ID + UNIT_STAY_ID, "wardID", # 3.1 Admission parameters unit_admit_pseudotime.alias("unitAdmitTimestamp"), @@ -155,6 +155,59 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz ) +def process_admissiondx(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the admissiondx table and converts it to a form that includes timestamps. + + The output of this process is ultimately converted to events via the `admissiondx` key in the + `configs/event_configs.yaml` file. + """ + + admission_dx_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration( + minutes=pl.col("admitDxEnteredOffset") + ) + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING for admissiondx table. Check with the eICU team:\n" + " - How should we use `admitDxTest`? It's not used here.\n" + " - How should we use `admitDxPath`? It's not used here.\n" + ) + + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + admission_dx_pseudotime.alias("admitDxEnteredTimestamp"), + "admitDxName", + "admitDxID", + ) + + +def process_allergy(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the allergy table and converts it to a form that includes timestamps. + + The output of this process is ultimately converted to events via the `allergy` key in the + `configs/event_configs.yaml` file. + """ + + allergy_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset")) + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING for allergy table. Check with the eICU team:\n" + " - How should we use `allergyNoteType`? It's not used here.\n" + " - How should we use `specialtyType`? It's not used here.\n" + " - How should we use `userType`? It's not used here.\n" + " - Is `drugName` the name of the drug to which the patient is allergic or the drug given to the " + "patient (docs say 'name of the selected admission drug')?\n" + ) + + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + allergy_pseudotime.alias("allergyEnteredTimestamp"), + "allergyType", + "allergyName", + ) + + class PreProcessor(NamedTuple): """A preprocessor function and its dependencies. 
@@ -170,7 +223,13 @@ class PreProcessor(NamedTuple): FUNCTIONS: dict[str, PreProcessor] = { "patient": PreProcessor( - process_patient_table, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + ), + "admissiondx": PreProcessor( + process_admissiondx, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) + ), + "allergy": PreProcessor( + process_allergy, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) ), } From f3463f5f8f4715c0ba8c97a16816310a59a91c8e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 12:12:23 -0400 Subject: [PATCH 18/47] Improved the structure of the pipeline and added a bunch more tables. Still untested. --- eICU_Example/pre_MEDS.py | 125 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 38a56d8..0b34356 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -17,7 +17,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.utils import ( get_shard_prefix, @@ -155,57 +155,47 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame ) -def process_admissiondx(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: - """Takes the admissiondx table and converts it to a form that includes timestamps. +def join_and_get_pseudotime_fntr( + table_name: str, + offset_col: str, + pseudotime_col: str, + output_data_cols: list[str] | None = None, + warning_items: list[str] | None = None, +) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: + """Returns a function that joins a dataframe to the `patient` table and adds pseudotimes. - The output of this process is ultimately converted to events via the `admissiondx` key in the - `configs/event_configs.yaml` file. + Also raises specified warning strings via the logger for uncertain columns. + + TODO """ - admission_dx_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration( - minutes=pl.col("admitDxEnteredOffset") - ) + if output_data_cols is None: + output_data_cols = [] - logger.warning( - "NOT SURE ABOUT THE FOLLOWING for admissiondx table. Check with the eICU team:\n" - " - How should we use `admitDxTest`? It's not used here.\n" - " - How should we use `admitDxPath`? It's not used here.\n" - ) + def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. - return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( - HEALTH_SYSTEM_STAY_ID, - UNIT_STAY_ID, - admission_dx_pseudotime.alias("admitDxEnteredTimestamp"), - "admitDxName", - "admitDxID", - ) - - -def process_allergy(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: - """Takes the allergy table and converts it to a form that includes timestamps. + The output of this process is ultimately converted to events via the `{table_name}` key in the + `configs/event_configs.yaml` file. + """ - The output of this process is ultimately converted to events via the `allergy` key in the - `configs/event_configs.yaml` file. 
- """ + pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset_col)) - allergy_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset")) + if warning_items: + warning_lines = [ + f"NOT SURE ABOUT THE FOLLOWING for {table_name} table. Check with the eICU team:", + *(f" - {item}" for item in warning_items), + ] + logger.warning("\n".join(warning_lines)) - logger.warning( - "NOT SURE ABOUT THE FOLLOWING for allergy table. Check with the eICU team:\n" - " - How should we use `allergyNoteType`? It's not used here.\n" - " - How should we use `specialtyType`? It's not used here.\n" - " - How should we use `userType`? It's not used here.\n" - " - Is `drugName` the name of the drug to which the patient is allergic or the drug given to the " - "patient (docs say 'name of the selected admission drug')?\n" - ) + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + pseudotime.alias(pseudotime_col), + *output_data_cols, + ) - return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( - HEALTH_SYSTEM_STAY_ID, - UNIT_STAY_ID, - allergy_pseudotime.alias("allergyEnteredTimestamp"), - "allergyType", - "allergyName", - ) + return fn class PreProcessor(NamedTuple): @@ -221,23 +211,17 @@ class PreProcessor(NamedTuple): dependencies: tuple[str, list[str]] -FUNCTIONS: dict[str, PreProcessor] = { +NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] +PATIENT_DEPENDENCY = ("patient", NEEDED_PATIENT_COLS) + +# Generic "copy from patients" functions are stored in `configs/table_preprocessors.yaml` and loaded in +# `main`. +SPECIALTY_FUNCTIONS: dict[str, PreProcessor] = { "patient": PreProcessor( process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) ), - "admissiondx": PreProcessor( - process_admissiondx, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) - ), - "allergy": PreProcessor( - process_allergy, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) - ), } -# From MIMIC -# "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), -# "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), -# "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), - @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): @@ -264,6 +248,20 @@ def main(cfg: DictConfig): 1. `admissiondrug`: This table is noted in the [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) as being "Extremely infrequently used". + 2. `apacheApsVar`: This table is a sort of "meta-table" that contains variables used to compute the + APACHE score; we won't use these raw variables from this table, but instead will use the raw data. + 3. `apachePatientResult`: This table has pre-computed APACHE score variables; we won't use these and + will use the raw data directly. + 4. `apachePredVar`: This table contains variables used to compute the APACHE score; we won't use these + in favor of the raw data directly. + 5. `carePlanCareProvider`: This table contains information about the provider for given care-plan + entries; however, as we can't link this table to the particular care-plan entries, we don't use it + here. 
It also is not clear (to the author of this script; the eICU team may know more) how reliable
+         the time-offsets are for this table as they merely denote when a provider was entered into the
+         care plan.
+      6. `customLab`: The documentation for this table is very sparse, so we skip it.
+      7. `intakeOutput`: There are a number of significant warnings about duplicates, cumulative values,
+         and more in the documentation for this table, so for now we skip it.

     Args (all as part of the config file):
         raw_cohort_dir: The directory containing the raw eICU files.
         output_dir: The directory to write the processed files to.
     """

     hydra_loguru_init()

+    functions = {**SPECIALTY_FUNCTIONS}
+
+    logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...")
+    preprocessors = OmegaConf.load("configs/table_preprocessors.yaml")
+    for table_name, preprocessor_cfg in preprocessors.items():
+        logger.info(f"  Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}")
+        functions[table_name] = PreProcessor(
+            join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg),
+            PATIENT_DEPENDENCY,
+        )
+
     raw_cohort_dir = Path(cfg.raw_cohort_dir)
     MEDS_input_dir = Path(cfg.output_dir)

@@ -292,7 +301,7 @@

         out_fp.parent.mkdir(parents=True, exist_ok=True)

-        if pfx not in FUNCTIONS:
+        if pfx not in functions:
             logger.info(
                 f"No function needed for {pfx}: "
                 f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}"
@@ -306,7 +315,7 @@

             print(f"Done with {pfx}. Continuing")
             continue

-        fn, need_df = FUNCTIONS[pfx]
+        fn, need_df = functions[pfx]
         if not need_df:
             st = datetime.now()
             logger.info(f"Processing {pfx}...")
@@ -340,7 +349,7 @@

             logger.info(f"  Processing dependent df @ {pfx}...")
-            fn, _ = FUNCTIONS[pfx]
+            fn, _ = functions[pfx]

From 542b7cb4ec3ea3ef806984ec39aa33032ed22d9d Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 12:12:55 -0400
Subject: [PATCH 19/47] Forgot table configs -- likely currently malformed.

---
 eICU_Example/configs/table_preprocessors.yaml | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 eICU_Example/configs/table_preprocessors.yaml

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
new file mode 100644
index 0000000..c7d7a80
--- /dev/null
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -0,0 +1,94 @@
+
+admissiondx:
+  offset_col: "admitDxEnteredOffset"
+  pseudotime_col: "admitDxEnteredTimestamp"
+  output_data_cols: ["admitDxName", "admitDxID"]
+  warning_items: ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"]
+
+allergy:
+  offset_col: "allergyEnteredOffset"
+  pseudotime_col: "allergyEnteredTimestamp"
+  output_data_cols: ["allergyType", "allergyName"]
+  warning_items:
+    - "How should we use `allergyNoteType`?"
+    - "How should we use `specialtyType`?"
+    - "How should we use `userType`?"
+    - |-
+      Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient
+      (docs say 'name of the selected admission drug')?
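+# For orientation: each entry in this file is expanded into a polars transformation by
+# `join_and_get_pseudotime_fntr` in `eICU_Example/pre_MEDS.py`. Roughly (a sketch, not generated code),
+# the `allergy` entry above behaves like:
+#
+#   pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset"))
+#   df.join(patient_df, on="patientUnitStayID", how="inner").select(
+#       "patientHealthSystemStayID", "patientUnitStayID",
+#       pseudotime.alias("allergyEnteredTimestamp"), "allergyType", "allergyName",
+#   )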
+
+carePlanGeneral:
+  offset_col: "cplItemOffset"
+  pseudotime_col: "carePlanGeneralItemEnteredTimestamp"
+  output_data_cols: ["cplGroup", "cplItemValue"]
+
+carePlanEOL:
+  offset_col: "cplEolDiscussionOffset"
+  pseudotime_col: "carePlanEolDiscussionOccurredTimestamp"
+  warning_items:
+    - "Is the DiscussionOffset time actually reliable? Should we fall back on the SaveOffset time?"
+
+carePlanGoal:
+  offset_col: "cplGoalOffset"
+  pseudotime_col: "carePlanGoalEnteredTimestamp"
+  output_data_cols: ["cplGoalCategory", "cplGoalValue", "cplGoalStatus"]
+
+carePlanInfectiousDisease:
+  offset_col: "cplInfectDiseaseOffset"
+  pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp"
+  output_data_cols: ["infectDiseaseSite", "infectDiseaseAssessment", "responseToTherapy", "treatment"]
+
+diagnosis:
+  offset_col: "diagnosisOffset"
+  pseudotime_col: "diagnosisEnteredTimestamp"
+  output_data_cols: ["ICD9Code", "diagnosisPriority", "diagnosisString"]
+  warning_items:
+    - "Though we use it, the `diagnosisString` field documentation is unclear -- by what is it separated?"
+
+infusionDrug:
+  offset_col: "infusionOffset"
+  pseudotime_col: "infusionEnteredTimestamp"
+  output_data_cols:
+    - "infusionDrugID"
+    - "drugName"
+    - "drugRate"
+    - "infusionRate"
+    - "drugAmount"
+    - "volumeOfFluid"
+    - "patientWeight"
+
+lab:
+  offset_col: "labResultOffset"
+  pseudotime_col: "labResultDrawnTimestamp"
+  output_data_cols:
+    - "labName"
+    - "labResult"
+    - "labResultText"
+    - "labMeasureNameSystem"
+    - "labMeasureNameInterface"
+    - "labTypeID"
+  warning_items:
+    - "Is this the time the lab was drawn? Entered? The time the result came in?"
+    - "We **IGNORE** the `labResultRevisedOffset` column -- this may be a mistake!"
+
+medication:
+  offset_col:
+    - "drugOrderOffset"
+    - "drugStartOffset"
+    - "drugStopOffset"
+  pseudotime_col:
+    - "drugOrderTimestamp"
+    - "drugStartedTimestamp"
+    - "drugStoppedTimestamp"
+  output_data_cols:
+    - "medicationID"
+    - "drugIVAdmixture"
+    - "drugName"
+    - "drugHiclSeqno"
+    - "dosage"
+    - "routeAdmin"
+    - "frequency"
+    - "loadingDose"
+    - "PRN"
+    - "GTC"
+  warning_items:
+    - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!"

From 167acb07788439a9cac141e10d88ef9131925578 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 12:18:24 -0400
Subject: [PATCH 20/47] Added soon to be deleted microlab table

---
 eICU_Example/configs/table_preprocessors.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
index c7d7a80..a4e1cc8 100644
--- a/eICU_Example/configs/table_preprocessors.yaml
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -92,3 +92,14 @@ medication:
     - "GTC"
   warning_items:
     - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!"
+
+# We don't use this because the culture taken time != culture result time, so seeing this data would give a
+# model an advantage over any possible real-world implementation. But, I'm including its data here as it would
+# be easy to fit into this paradigm.
+#microLab: +# offset_col: "cultureTakenOffset" +# pseudotime_col: "cultureTakenTimestamp" +# output_data_cols: +# - "cultureSite" +# - "organism" +# - "antibiotic" From 15815f3c0314727e06fbcf2ce0e04c07bb974f4e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 12:19:05 -0400 Subject: [PATCH 21/47] docs update --- eICU_Example/configs/table_preprocessors.yaml | 11 ----------- eICU_Example/pre_MEDS.py | 3 +++ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index a4e1cc8..c7d7a80 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -92,14 +92,3 @@ medication: - "GTC" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" - -# We don't use this because the culture taken time != culture result time, so seeing this data would give a -# model an advantage over any possible real-world implementation. But, I'm including its data here as it would -# be easy to fit into this paradigm. -#microLab: -# offset_col: "cultureTakenOffset" -# pseudotime_col: "cultureTakenTimestamp" -# output_data_cols: -# - "cultureSite" -# - "organism" -# - "antibiotic" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 0b34356..f02c0e8 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -262,6 +262,9 @@ def main(cfg: DictConfig): 6. `customLab`: The documentation for this table is very sparse, so we skip it. 7. `intakeOutput`: There are a number of significant warnings about duplicates, cumulative values, and more in the documentation for this table, so for now we skip it. + 8. `microLab`: We don't use this because the culture taken time != culture result time, so seeing this + data would give a model an advantage over any possible real-world implementation. Plus, the docs say + it is not well populated. Args (all as part of the config file): raw_cohort_dir: The directory containing the raw eICU files. From bda16e83933f537f8a608b6c2edc265bd22c7762 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 14:36:03 -0400 Subject: [PATCH 22/47] Added partial event configs for all tables. 
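
Each top-level key in `event_configs.yaml` names a pre-MEDS table, and each named entry under that key is
meant to yield one MEDS event per row of the table. As an illustrative sketch only (the exact event
construction semantics live in the extraction pipeline), a `vitalPeriodic` row with
`observationEnteredTimestamp = t` and `temperature = 37.2` would, under the `temperature` entry below,
become an event with code components ["VITALS", "PERIODIC", "TEMPERATURE"], timestamp `t`, and
`numeric_value` 37.2.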
--- eICU_Example/configs/event_configs.yaml | 540 ++++++++++++++++++ eICU_Example/configs/table_preprocessors.yaml | 185 +++++- eICU_Example/pre_MEDS.py | 30 +- 3 files changed, 750 insertions(+), 5 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 9d7177b..7ac4225 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -75,3 +75,543 @@ allergy: - col("allergyType") - col("allergyName") timestamp: "allergyEnteredTimestamp" + +carePlanGeneral: + cplItem: + code: + - "CAREPLAN_GENERAL" + - col("cplGroup") + - col("cplItemValue") + timestamp: "carePlanGeneralItemEnteredTimestamp" + +carePlanEOL: + cplEolDiscussion: + code: + - "CAREPLAN_EOL" + timestamp: "carePlanEolDiscussionOccurredTimestamp" + +carePlanGoal: + cplGoal: + code: + - "CAREPLAN_GOAL" + - col("cplGoalCategory") + - col("cplGoalValue") + - col("cplGoalStatus") + timestamp: "carePlanGoalEnteredTimestamp" + +carePlanInfectiousDisease: + cplInfectDisease: + code: + - "CAREPLAN_INFECTIOUS_DISEASE" + - col("infectDiseaseSite") + - col("infectDiseaseAssessment") + - col("treatment") + - col("responseToTherapy") + timestamp: "carePlanInfectDiseaseEnteredTimestamp" + +diagnosis: + diagnosis: + code: + - "ICD9CM" + - col("ICD9Code") + - col("diagnosisPriority") + timestamp: "diagnosisEnteredTimestamp" + diagnosis_string: "diagnosisString" + +infusionDrug: + infusion: + code: + - "INFUSION" + - col("infusionDrugID") + - col("drugName") + timestamp: "infusionEnteredTimestamp" + drug_rate: "drugRate" + infusion_rate: "infusionRate" + drug_amount: "drugAmount" + volume_of_fluid: "volumeOfFluid" + patient_weight: + code: + - "INFUSION_PATIENT_WEIGHT" + timestamp: "infusionEnteredTimestamp" + numerical_value: "patientWeight" + +lab: + lab: + code: + - "LAB" + - col("labMeasureNameSystem") + - col("labMeasureNameInterface") + - col("labName") + timestamp: "labResultDrawnTimestamp" + numerical_value: "labResult" + text_value: "labResultText" + lab_type_id: "labTypeID" + +medication: + drug_ordered: + code: + - "MEDICATION" + - "ORDERED" + - col(drugName) + timestamp: "drugOrderTimestamp" + medication_id: "medicationID" + drug_iv_admixture: "drugIVAdmixture" + dosage: "dosage" + route_admin: "routeAdmin" + frequency: "frequency" + loading_dose: "loadingDose" + prn: "PRN" + gtc: "GTC" + drug_started: + code: + - "MEDICATION" + - "STARTED" + - col(drugName) + timestamp: "drugStartedTimestamp" + medication_id: "medicationID" + drug_stopped: + code: + - "MEDICATION" + - "STOPPED" + - col(drugName) + timestamp: "drugStoppedTimestamp" + medication_id: "medicationID" + +nurseAssessment: + nurse_assessment_performed: + code: + - "NURSE_ASSESSMENT" + - "PERFORMED" + - NOT YET DONE + timestamp: "nurseAssessPerformedTimestamp" + nurse_assessment_id: "nurseAssessID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + + nurse_assessment_entered: + code: + - "NURSE_ASSESSMENT" + - "ENTERED" + - NOT YET DONE + timestamp: "nurseAssessEnteredTimestamp" + nurse_assessment_id: "nurseAssessID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + +nurseCare: + nurse_care_performed: + code: + - "NURSE_CARE" + - "PERFORMED" + - NOT YET DONE + timestamp: "nurseCarePerformedTimestamp" + nurse_care_id: "nurseCareID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + + nurse_care_entered: + code: + - 
"NURSE_CARE" + - "ENTERED" + - NOT YET DONE + timestamp: "nurseCareEnteredTimestamp" + nurse_care_id: "nurseCareID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + +nurseCharting: + nurse_charting_performed: + code: + - "NURSE_CHARTING" + - "PERFORMED" + - NOT YET DONE + timestamp: "nursingChartPerformedTimestamp" + nurse_charting_id: "nursingChartID" + cell_type_cat: "nursingChartCellTypeCat" + cell_type_val_name: "nursingChartCellTypeValName" + cell_type_val_label: "nursingChartCellTypeValLabel" + cell_value: "nursingChartValue" + + nurse_charting_entered: + code: + - "NURSE_CHARTING" + - "ENTERED" + - NOT YET DONE + timestamp: "nursingChartEnteredTimestamp" + nurse_charting_id: "nursingChartID" + cell_type_cat: "nursingChartCellTypeCat" + cell_type_val_name: "nursingChartCellTypeValName" + cell_type_val_label: "nursingChartCellTypeValLabel" + cell_value: "nursingChartValue" + +pastHistory: + past_history_taken: + code: + - "PAST_HISTORY" + - "TAKEN" + - NOT YET DONE + timestamp: "pastHistoryTakenTimestamp" + past_history_id: "pastHistoryID" + note_type: "pastHistoryNoteType" + path: "pastHistoryPath" + value: "pastHistoryValue" + value_text: "pastHistoryValueText" + + past_history_entered: + code: + - "PAST_HISTORY" + - "ENTERED" + - NOT YET DONE + timestamp: "pastHistoryEnteredTimestamp" + past_history_id: "pastHistoryID" + note_type: "pastHistoryNoteType" + path: "pastHistoryPath" + value: "pastHistoryValue" + value_text: "pastHistoryValueText" + + +physicalExam: + physical_exam_entered: + code: + - "PHYSICAL_EXAM" + - "ENTERED" + - NOT YET DONE + timestamp: "physicalExamEnteredTimestamp" + physical_exam_id: "physicalExamID" + text: "physicalExamText" + path: "physicalExamPath" + value: "physicalExamValue" + + +respiratoryCare: + resp_care_status: + code: + - "RESP_CARE" + - "STATUS" + - NOT YET DONE + timestamp: "respCareStatusEnteredTimestamp" + resp_care_id: "respCareID" + + airwayType: "airwayType" + airwaySize: "airwaySize" + airwayPosition: "airwayPosition" + cuffPressure: "cuffPressure" + apneaParams: "apneaParams" + lowExhMVLimit: "lowExhMVLimit" + hiExhMVLimit: "hiExhMVLimit" + lowExhTVLimit: "lowExhTVLimit" + hiPeakPresLimit: "hiPeakPresLimit" + lowPeakPresLimit: "lowPeakPresLimit" + hiRespRateLimit: "hiRespRateLimit" + lowRespRateLimit: "lowRespRateLimit" + sighPresLimit: "sighPresLimit" + lowIronOxLimit: "lowIronOxLimit" + highIronOxLimit: "highIronOxLimit" + meanAirwayPresLimit: "meanAirwayPresLimit" + PEEPLimit: "PEEPLimit" + CPAPLimit: "CPAPLimit" + setApneaInterval: "setApneaInterval" + setApneaTV: "setApneaTV" + setApneaIPPEEPHigh: "setApneaIPPEEPHigh" + setApneaRR: "setApneaRR" + setApneaPeakFlow: "setApneaPeakFlow" + setApneaInspTime: "setApneaInspTime" + setApneaIE: "setApneaIE" + setApneaFIO2: "setApneaFIO2" + + vent_start: + code: + - "VENT" + - "START" + - NOT YET DONE + timestamp: "ventStartTimestamp" + resp_care_id: "respCareID" + + vent_end: + code: + - "VENT" + - "END" + - NOT YET DONE + timestamp: "ventEndTimestamp" + resp_care_id: "respCareID" + + +respiratoryCharting: + resp_charting_performed: + code: + - "RESP_CHARTING" + - "PERFORMED" + - NOT YET DONE + timestamp: "respChartPerformedTimestamp" + resp_chart_id: "respChartID" + type_cat: "respChartTypeCat" + value_label: "respChartValueLabel" + value: "respChartValue" + + resp_charting_entered: + code: + - "RESP_CHARTING" + - "ENTERED" + - NOT YET DONE + timestamp: "respChartEnteredTimestamp" + resp_chart_id: "respChartID" + type_cat: 
"respChartTypeCat" + value_label: "respChartValueLabel" + value: "respChartValue" + +treatment: + treatment: + code: + - "TREATMENT" + - "ENTERED" + - col("treatmentString") + timestamp: "treatmentEnteredTimestamp" + treatment_id: "treatmentID" + +vitalAperiodic: + non_invasive_systolic: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveSystolic" + non_invasive_diastolic: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveDiastolic" + + non_invasive_mean: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_MEAN" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveMean" + + paop: + code: + - "VITALS" + - "APERIODIC" + - "PAOP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "paop" + + cardiac_output: + code: + - "VITALS" + - "APERIODIC" + - "CARDIAC_OUTPUT" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "cardiacOutput" + + cardiac_input: + code: + - "VITALS" + - "APERIODIC" + - "CARDIAC_INPUT" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "cardiacInput" + + svr: + code: + - "VITALS" + - "APERIODIC" + - "SVR" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "svr" + + svri: + code: + - "VITALS" + - "APERIODIC" + - "SVRI" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "svri" + + pvr: + code: + - "VITALS" + - "APERIODIC" + - "PVR" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "pvr" + + pvri: + code: + - "VITALS" + - "APERIODIC" + - "PVRI" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "pvri" + +vitalPeriodic: + temperature: + code: + - "VITALS" + - "PERIODIC" + - "TEMPERATURE" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "temperature" + + saO2: + code: + - "VITALS" + - "PERIODIC" + - "SAO2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "saO2" + + heartRate: + code: + - "VITALS" + - "PERIODIC" + - "HEARTRATE" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "heartRate" + + respiration: + code: + - "VITALS" + - "PERIODIC" + - "RESPIRATION" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "respiration" + + cvp: + code: + - "VITALS" + - "PERIODIC" + - "CVP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "cvp" + + etCo2: + code: + - "VITALS" + - "PERIODIC" + - "ETCO2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "etCo2" + + systemic_systolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicSystolic" + + systemic_diastolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicDiastolic" + + systemic_mean: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_MEAN" + timestamp: 
"observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicMean" + + pa_systolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paSystolic" + + pa_diastolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paDiastolic" + + pa_mean: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_MEAN" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paMean" + + st1: + code: + - "VITALS" + - "PERIODIC" + - "ST1" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st1" + + st2: + code: + - "VITALS" + - "PERIODIC" + - "ST2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st2" + + st3: + code: + - "VITALS" + - "PERIODIC" + - "ST3" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st3" + + ICP: + code: + - "VITALS" + - "PERIODIC" + - "ICP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "ICP" diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index c7d7a80..e7da494 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,4 +1,3 @@ - admissiondx: offset_col: "admitDxEnteredOffset" pseudotime_col: "admitDxEnteredTimestamp" @@ -92,3 +91,187 @@ medication: - "GTC" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" + +nurseAssessment: + offset_col: + - "nurseAssessOffset" + - "nurseAssessEntryOffset" + pseudotime_col: + - "nurseAssessPerformedTimestamp" + - "nurseAssessEnteredTimestamp" + output_data_cols: + - "nurseAssessID" + - "cellLabel" + - "cellAttribute" + - "cellAttributeValue" + warning_items: + - "Should we be using `cellAttributePath` instead of `cellAttribute`?" + - "SOME MAY BE LISTS" + +nurseCare: + offset_col: + - "nurseCareOffset" + - "nurseCareEntryOffset" + pseudotime_col: + - "nurseCarePerformedTimestamp" + - "nurseCareEnteredTimestamp" + output_data_cols: + - "nurseCareID" + - "cellLabel" + - "cellAttribute" + - "cellAttributeValue" + warning_items: + - "Should we be using `cellAttributePath` instead of `cellAttribute`?" + - "SOME MAY BE LISTS" + +nurseCharting: + offset_col: + - "nursingChartOffset" + - "nursingChartEntryOffset" + pseudotime_col: + - "nursingChartPerformedTimestamp" + - "nursingChartEnteredTimestamp" + output_data_cols: + - "nursingChartID" + - "nursingChartCellTypeCat" + - "nursingChartCellTypeValName" + - "nursingChartCellTypeValLabel" + - "nursingChartValue" + warning_items: + - "SOME MAY BE LISTS" + +pastHistory: + offset_col: + - "pastHistoryOffset" + - "pastHistoryEnteredOffset" + pseudotime_col: + - "pastHistoryTakenTimestamp" + - "pastHistoryEnteredTimestamp" + output_data_cols: + - "pastHistoryID" + - "pastHistoryNoteType" + - "pastHistoryPath" + - "pastHistoryValue" + - "pastHistoryValueText" + warning_items: + - "SOME MAY BE LISTS" + - "How should we use `pastHistoryPath` vs. `pastHistoryNoteType`?" + - "How should we use `pastHistoryValue` vs. `pastHistoryValueText`?" 
+ +physicalExam: + offset_col: "physicalExamOffset" + pseudotime_col: "physicalExamEnteredTimestamp" + output_data_cols: + - "physicalExamID" + - "physicalExamText" + - "physicalExamPath" + - "physicalExamValue" + warning_items: + - "How should we use `physicalExamValue` vs. `physicalExamText`?" + - "I believe the `physicalExamValue` is a **LIST**. This must be processed specially." + +respiratoryCare: + offset_col: + - "respCareStatusOffset" + - "ventStartOffset" + - "ventEndOffset" + pseudotime_col: + - "respCareStatusEnteredTimestamp" + - "ventStartTimestamp" + - "ventEndTimestamp" + output_data_cols: + - "respCareID" + - "airwayType" + - "airwaySize" + - "airwayPosition" + - "cuffPressure" + - "apneaParams" + - "lowExhMVLimit" + - "hiExhMVLimit" + - "lowExhTVLimit" + - "hiPeakPresLimit" + - "lowPeakPresLimit" + - "hiRespRateLimit" + - "lowRespRateLimit" + - "sighPresLimit" + - "lowIronOxLimit" + - "highIronOxLimit" + - "meanAirwayPresLimit" + - "PEEPLimit" + - "CPAPLimit" + - "setApneaInterval" + - "setApneaTV" + - "setApneaIPPEEPHigh" + - "setApneaRR" + - "setApneaPeakFlow" + - "setApneaInspTime" + - "setApneaIE" + - "setApneaFIO2" + warning_items: + - "We ignore the `priorVent*` columns -- this may be a mistake!" + - "There is a lot of data in this table -- what should be incorporated into the event structure?" + - "We might be able to use `priorVent` timestamps to further refine true season of unit admission." + +respiratoryCharting: + offset_col: + - "respChartOffset" + - "respChartEntryOffset" + pseudotime_col: + - "respChartPerformedTimestamp" + - "respChartEnteredTimestamp" + output_data_cols: + - "respChartID" + - "respChartTypeCat" + - "respChartValueLabel" + - "respChartValue" + warning_items: + - "SOME MAY BE LISTS" + +treatment: + offset_col: "treatmentOffset" + pseudotime_col: "treatmentEnteredTimestamp" + output_data_cols: + - "treatmentID" + - "treatmentString" + warning_items: + - "Absence of entries in table do not indicate absence of treatments" + +vitalAperiodic: + offset_col: "observationOffset" + pseudotime_col: "observationEnteredTimestamp" + output_data_cols: + - "vitalAperiodicID" + - "nonInvasiveSystolic" + - "nonInvasiveDiastolic" + - "nonInvasiveMean" + - "paop" + - "cardiacOutput" + - "cardiacInput" + - "svr" + - "svri" + - "pvr" + - "pvri" + +vitalPeriodic: + offset_col: "observationOffset" + pseudotime_col: "observationEnteredTimestamp" + output_data_cols: + - "vitalPeriodicID" + - "temperature" + - "saO2" + - "heartRate" + - "respiration" + - "cvp" + - "etCo2" + - "systemicSystolic" + - "systemicDiastolic" + - "systemicMean" + - "paSystolic" + - "paDiastolic" + - "paMean" + - "st1" + - "st2" + - "st3" + - "ICP" + warning_items: + - "These are 5-minute median values. There are going to be a *lot* of events." 
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index f02c0e8..1f7cab9 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -157,8 +157,8 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame def join_and_get_pseudotime_fntr( table_name: str, - offset_col: str, - pseudotime_col: str, + offset_col: str | list[str], + pseudotime_col: str | list[str], output_data_cols: list[str] | None = None, warning_items: list[str] | None = None, ) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: @@ -172,6 +172,18 @@ def join_and_get_pseudotime_fntr( if output_data_cols is None: output_data_cols = [] + if isinstance(offset_col, str): + offset_col = [offset_col] + if isinstance(pseudotime_col, str): + pseudotime_col = [pseudotime_col] + + if len(offset_col) != len(pseudotime_col): + raise ValueError( + "There must be the same number of `offset_col`s and `pseudotime_col`s specified. Got " + f"{len(offset_col)} and {len(pseudotime_col)}, respectively." + ) + + def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. @@ -179,7 +191,11 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: `configs/event_configs.yaml` file. """ - pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset_col)) + + pseudotimes = [ + (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) + for pseudotime, offset in zip(pseudotime_col, offset_col) + ] if warning_items: warning_lines = [ @@ -191,7 +207,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( HEALTH_SYSTEM_STAY_ID, UNIT_STAY_ID, - pseudotime.alias(pseudotime_col), + *pseudotimes, *output_data_cols, ) @@ -265,6 +281,12 @@ def main(cfg: DictConfig): 8. `microLab`: We don't use this because the culture taken time != culture result time, so seeing this data would give a model an advantage over any possible real-world implementation. Plus, the docs say it is not well populated. + 9. `note`: This table is largely duplicated with structured data due to the fact that primarily + narrative notes were removed due to PHI constraints (see the docs). + + There are other notes for this pipeline: + 1. Many fields here are, I believe, **lists**, not simple categoricals, and should be split and + processed accordingly. This is not yet done. Args (all as part of the config file): raw_cohort_dir: The directory containing the raw eICU files. From 26a386b31929100c60b863d8fbf44db2b7f59c97 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 15:47:02 -0400 Subject: [PATCH 23/47] Revised main script --- eICU_Example/pre_MEDS.py | 138 +++++++++++++-------------------------- 1 file changed, 46 insertions(+), 92 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 1f7cab9..73e774e 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -9,10 +9,9 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import gzip -from collections.abc import Callable, Sequence +from collections.abc import Callable from datetime import datetime from pathlib import Path -from typing import NamedTuple import hydra import polars as pl @@ -183,7 +182,6 @@ def join_and_get_pseudotime_fntr( f"{len(offset_col)} and {len(pseudotime_col)}, respectively." 
) - def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. @@ -191,7 +189,6 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: `configs/event_configs.yaml` file. """ - pseudotimes = [ (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) for pseudotime, offset in zip(pseudotime_col, offset_col) @@ -214,29 +211,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return fn -class PreProcessor(NamedTuple): - """A preprocessor function and its dependencies. - - Args: - function: TODO - dependencies: A two-element tuple containing the prefix of the dependent dataframe and a list of - columns needed from that dataframe. - """ - - function: Callable[[Sequence[pl.LazyFrame]], pl.LazyFrame] - dependencies: tuple[str, list[str]] - - NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] -PATIENT_DEPENDENCY = ("patient", NEEDED_PATIENT_COLS) - -# Generic "copy from patients" functions are stored in `configs/table_preprocessors.yaml` and loaded in -# `main`. -SPECIALTY_FUNCTIONS: dict[str, PreProcessor] = { - "patient": PreProcessor( - process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) - ), -} @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") @@ -293,32 +268,59 @@ def main(cfg: DictConfig): output_dir: The directory to write the processed files to. """ - raise NotImplementedError("This script is not yet implemented for eICU.") - hydra_loguru_init() - functions = {**SPECIALTY_FUNCTIONS} - logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...") preprocessors = OmegaConf.load("configs/table_preprocessors.yaml") + functions = {} for table_name, preprocessor_cfg in preprocessors.items(): logger.info(f" Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}") - functions[table_name] = PreProcessor( - join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg), - PATIENT_DEPENDENCY, - ) + functions[table_name] = join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg) raw_cohort_dir = Path(cfg.raw_cohort_dir) MEDS_input_dir = Path(cfg.output_dir) - all_fps = list(raw_cohort_dir.glob("**/*.csv.gz")) + logger.info("Processing patient table first...") - dfs_to_load = {} + hospital_fp = raw_cohort_dir / "hospital.csv.gz" + patient_fp = raw_cohort_dir / "patient.csv.gz" + logger.info(f"Loading {str(hospital_fp.resolve())}...") + hospital_df = load_raw_eicu_file( + hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] + ) + logger.info(f"Loading {str(patient_fp.resolve())}...") + raw_patient_df = load_raw_eicu_file(patient_fp) + + logger.info("Processing patient table...") + patient_df = process_patient(raw_patient_df, hospital_df) + write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") + + all_fps = [ + fp for fp in raw_cohort_dir.glob("*/.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} + ] + + unused_tables = { + "admissiondrug", + "apacheApsVar", + "apachePatientResult", + "apachePredVar", + "carePlanCareProvider", + "customLab", + "intakeOutput", + "microLab", + "note", + } for in_fp in all_fps: pfx = get_shard_prefix(raw_cohort_dir, in_fp) + if pfx in unused_tables: + logger.warning(f"Skipping {pfx} as it is not supported in this pipeline.") + continue + elif pfx not in functions: + logger.warning(f"No 
function needed for {pfx}. For eICU, THIS IS UNEXPECTED") + continue - out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir) + out_fp = MEDS_input_dir / f"{pfx}.parquet" if out_fp.is_file(): print(f"Done with {pfx}. Continuing") @@ -326,63 +328,15 @@ def main(cfg: DictConfig): out_fp.parent.mkdir(parents=True, exist_ok=True) - if pfx not in functions: - logger.info( - f"No function needed for {pfx}: " - f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}" - ) - relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True) - out_fp.symlink_to(relative_in_fp) - continue - else: - out_fp = MEDS_input_dir / f"{pfx}.parquet" - if out_fp.is_file(): - print(f"Done with {pfx}. Continuing") - continue - - fn, need_df = functions[pfx] - if not need_df: - st = datetime.now() - logger.info(f"Processing {pfx}...") - df = load_raw_eicu_file(in_fp) - logger.info(f" Loaded raw {in_fp} in {datetime.now() - st}") - processed_df = fn(df) - write_lazyframe(processed_df, out_fp) - logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") - else: - needed_pfx, needed_cols = need_df - if needed_pfx not in dfs_to_load: - dfs_to_load[needed_pfx] = {"fps": set(), "cols": set()} - - dfs_to_load[needed_pfx]["fps"].add(in_fp) - dfs_to_load[needed_pfx]["cols"].update(needed_cols) - - for df_to_load_pfx, fps_and_cols in dfs_to_load.items(): - fps = fps_and_cols["fps"] - cols = list(fps_and_cols["cols"]) - - df_to_load_fp = raw_cohort_dir / f"{df_to_load_pfx}.csv.gz" + fn = functions[pfx] st = datetime.now() - - logger.info(f"Loading {str(df_to_load_fp.resolve())} for manipulating other dataframes...") - df = load_raw_eicu_file(df_to_load_fp, columns=cols) - logger.info(f" Loaded in {datetime.now() - st}") - - for fp in fps: - pfx = get_shard_prefix(raw_cohort_dir, fp) - out_fp = MEDS_input_dir / f"{pfx}.parquet" - - logger.info(f" Processing dependent df @ {pfx}...") - fn, _ = functions[pfx] - - fp_st = datetime.now() - logger.info(f" Loading {str(fp.resolve())}...") - fp_df = load_raw_eicu_file(fp) - logger.info(f" Loaded in {datetime.now() - fp_st}") - processed_df = fn(fp_df, df) - write_lazyframe(processed_df, out_fp) - logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - fp_st}") + logger.info(f"Processing {pfx}...") + df = load_raw_eicu_file(in_fp) + logger.info(f" * Loaded raw {in_fp} in {datetime.now() - st}") + processed_df = fn(df, patient_df) + write_lazyframe(processed_df, out_fp) + logger.info(f" * Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") logger.info(f"Done! 
All dataframes processed and written to {str(MEDS_input_dir.resolve())}") From e9000964f7f0a5a039bb10a8d9fece7e6debb087 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 15:56:01 -0400 Subject: [PATCH 24/47] Fixed a variety of lint errors --- MIMIC-IV_Example/README.md | 6 +- MIMIC-IV_Example/joint_script.sh | 18 +-- MIMIC-IV_Example/joint_script_slurm.sh | 102 ++++++++-------- eICU_Example/README.md | 9 +- eICU_Example/configs/event_configs.yaml | 51 ++++---- eICU_Example/configs/table_preprocessors.yaml | 20 +++- eICU_Example/joint_script.sh | 32 ++--- eICU_Example/joint_script_slurm.sh | 110 +++++++++--------- eICU_Example/sbatch_joint_script.sh | 24 ---- scripts/extraction/shard_events.py | 4 +- 10 files changed, 179 insertions(+), 197 deletions(-) delete mode 100644 eICU_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 4056319..406f1f2 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -71,6 +71,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`. @@ -127,11 +128,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index 9d7ae69..eb58e89 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -8,21 +8,21 @@ N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
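+# Quoting "$MIMICIV_RAW_DIR" etc. below guards against word-splitting and globbing if the
+# supplied paths contain spaces or special characters.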
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir="$MIMICIV_RAW_DIR" output_dir="$MIMICIV_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" @@ -30,8 +30,8 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" @@ -39,6 +39,6 @@ echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 8ce85fb..3948e87 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$MIMICIV_RAW_DIR" \ + output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/README.md b/eICU_Example/README.md index b23ae9e..2715613 100644 --- a/eICU_Example/README.md +++ b/eICU_Example/README.md @@ -69,6 +69,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this 
step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`. @@ -125,11 +126,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs @@ -195,8 +197,7 @@ Currently, some tables are ignored, including: 1. `admissiondrug`: The [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) notes that this is extremely infrequently used, so we skip it. -2. - +2. Lots of questions remain about how to appropriately handle timestamps of the data -- e.g., things like HCPCS events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 7ac4225..50c8eb0 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -273,7 +273,6 @@ pastHistory: value: "pastHistoryValue" value_text: "pastHistoryValueText" - physicalExam: physical_exam_entered: code: @@ -286,7 +285,6 @@ physicalExam: path: "physicalExamPath" value: "physicalExamValue" - respiratoryCare: resp_care_status: code: @@ -296,32 +294,32 @@ respiratoryCare: timestamp: "respCareStatusEnteredTimestamp" resp_care_id: "respCareID" - airwayType: "airwayType" - airwaySize: "airwaySize" - airwayPosition: "airwayPosition" - cuffPressure: "cuffPressure" - apneaParams: "apneaParams" - lowExhMVLimit: "lowExhMVLimit" - hiExhMVLimit: "hiExhMVLimit" - lowExhTVLimit: "lowExhTVLimit" - hiPeakPresLimit: "hiPeakPresLimit" - lowPeakPresLimit: "lowPeakPresLimit" - hiRespRateLimit: "hiRespRateLimit" - lowRespRateLimit: "lowRespRateLimit" - sighPresLimit: "sighPresLimit" - lowIronOxLimit: "lowIronOxLimit" - highIronOxLimit: "highIronOxLimit" + airwayType: "airwayType" + airwaySize: "airwaySize" + airwayPosition: "airwayPosition" + cuffPressure: "cuffPressure" + apneaParams: "apneaParams" + lowExhMVLimit: "lowExhMVLimit" + hiExhMVLimit: "hiExhMVLimit" + lowExhTVLimit: "lowExhTVLimit" + hiPeakPresLimit: "hiPeakPresLimit" + lowPeakPresLimit: "lowPeakPresLimit" + hiRespRateLimit: "hiRespRateLimit" + lowRespRateLimit: "lowRespRateLimit" + sighPresLimit: "sighPresLimit" + lowIronOxLimit: "lowIronOxLimit" + highIronOxLimit: "highIronOxLimit" meanAirwayPresLimit: "meanAirwayPresLimit" - PEEPLimit: "PEEPLimit" - CPAPLimit: "CPAPLimit" - setApneaInterval: "setApneaInterval" - setApneaTV: "setApneaTV" + PEEPLimit: "PEEPLimit" + CPAPLimit: "CPAPLimit" + setApneaInterval: "setApneaInterval" + setApneaTV: "setApneaTV" 
setApneaIPPEEPHigh: "setApneaIPPEEPHigh" - setApneaRR: "setApneaRR" - setApneaPeakFlow: "setApneaPeakFlow" - setApneaInspTime: "setApneaInspTime" - setApneaIE: "setApneaIE" - setApneaFIO2: "setApneaFIO2" + setApneaRR: "setApneaRR" + setApneaPeakFlow: "setApneaPeakFlow" + setApneaInspTime: "setApneaInspTime" + setApneaIE: "setApneaIE" + setApneaFIO2: "setApneaFIO2" vent_start: code: @@ -339,7 +337,6 @@ respiratoryCare: timestamp: "ventEndTimestamp" resp_care_id: "respCareID" - respiratoryCharting: resp_charting_performed: code: diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index e7da494..3c3a9ca 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -2,7 +2,8 @@ admissiondx: offset_col: "admitDxEnteredOffset" pseudotime_col: "admitDxEnteredTimestamp" output_data_cols: ["admitDxName", "admitDxID"] - warning_items: ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] + warning_items: + ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] allergy: offset_col: "allergyEnteredOffset" @@ -12,8 +13,9 @@ allergy: - "How should we use `allergyNoteType`?" - "How should we use `specialtyType`?" - "How should we use `userType`?" - - |- Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient - (docs say \'name of the selected admission drug\')? + - >- + Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient + (docs say 'name of the selected admission drug')? carePlanGeneral: offset_col: "cplItemOffset" @@ -34,7 +36,13 @@ carePlanGoal: carePlanInfectiousDisease: offset_col: "cplInfectDiseaseOffset" pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp" - output_data_cols: ["infectDiseaseSite", "infectDiseaseAssessment", "responseToTherapy", "treatment"] + output_data_cols: + [ + "infectDiseaseSite", + "infectDiseaseAssessment", + "responseToTherapy", + "treatment", + ] diagonosis: offset_col: "diagnosisOffset" @@ -93,7 +101,7 @@ medication: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" nurseAssessment: - offset_col: + offset_col: - "nurseAssessOffset" - "nurseAssessEntryOffset" pseudotime_col: @@ -175,7 +183,7 @@ respiratoryCare: - "respCareStatusOffset" - "ventStartOffset" - "ventEndOffset" - pseudotime_col: + pseudotime_col: - "respCareStatusEnteredTimestamp" - "ventStartTimestamp" - "ventEndTimestamp" diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh index 9d7ae69..48cec46 100755 --- a/eICU_Example/joint_script.sh +++ b/eICU_Example/joint_script.sh @@ -1,44 +1,44 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" +EICU_RAW_DIR="$1" +EICU_PREMEDS_DIR="$2" +EICU_MEDS_DIR="$3" N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
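+# NOTE: this script was originally copied from the MIMIC-IV example; the changes below rename
+# the MIMICIV_* variables and MIMIC-IV paths to their eICU counterparts (and quote them).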
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/convert_to_sharded_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh index 8ce85fb..6b36ef0 100755 --- a/eICU_Example/joint_script_slurm.sh +++ b/eICU_Example/joint_script_slurm.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" +EICU_RAW_DIR="$1" +EICU_PREMEDS_DIR="$2" +EICU_MEDS_DIR="$3" N_PARALLEL_WORKERS="$4" shift 4 @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./eICU_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$EICU_RAW_DIR" \ + output_dir="$EICU_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/sbatch_joint_script.sh b/eICU_Example/sbatch_joint_script.sh deleted file mode 100644 index e031363..0000000 --- a/eICU_Example/sbatch_joint_script.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash 
-#SBATCH -c 10 # Request one core
-#SBATCH -t 0-03:00 # Runtime in D-HH:MM format
-#SBATCH -p short # Partition to run in
-#SBATCH --mem=300GB # Memory total in MiB (for all cores)
-#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j)
-#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j)
-
-cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions
-
-MIMICIV_RAW_DIR="$1"
-MIMICIV_PREMEDS_DIR="$2"
-MIMICIV_MEDS_DIR="$3"
-N_PARALLEL_WORKERS="$4"
-
-LOG_DIR="$MIMICIV_MEDS_DIR/.logs"
-
-echo "Running with saving to $LOG_DIR"
-
-mkdir -p $LOG_DIR
-
-PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \
-    time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \
-    ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt
diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py
index d0533e3..9ce0ac9 100755
--- a/scripts/extraction/shard_events.py
+++ b/scripts/extraction/shard_events.py
@@ -223,9 +223,7 @@ def main(cfg: DictConfig):
             seen_files.add(get_shard_prefix(raw_cohort_dir, f))

     if not input_files_to_subshard:
-        raise FileNotFoundError(
-            f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!"
-        )
+        raise FileNotFoundError(f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!")

     random.shuffle(input_files_to_subshard)

From 2f92036ed0177ac5886cde3d46d35b27091c1bd9 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 15:56:37 -0400
Subject: [PATCH 25/47] Adjusted a tiny thing in the yaml

---
 eICU_Example/configs/table_preprocessors.yaml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
index 3c3a9ca..da94e96 100644
--- a/eICU_Example/configs/table_preprocessors.yaml
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -37,12 +37,10 @@ carePlanInfectiousDisease:
   offset_col: "cplInfectDiseaseOffset"
   pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp"
   output_data_cols:
-    [
-      "infectDiseaseSite",
-      "infectDiseaseAssessment",
-      "responseToTherapy",
-      "treatment",
-    ]
+    - "infectDiseaseSite"
+    - "infectDiseaseAssessment"
+    - "responseToTherapy"
+    - "treatment"

 diagonosis:
   offset_col: "diagnosisOffset"

From c482a7823518b6c5942a7fc9488b26f57d9ebe3b Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 16:08:11 -0400
Subject: [PATCH 26/47] Updated scripts to have help messages and to error if any internal piece errors.

---
 MIMIC-IV_Example/joint_script.sh       | 32 +++++++++++++++++++++++++
 MIMIC-IV_Example/joint_script_slurm.sh | 33 ++++++++++++++++++++++++++
 eICU_Example/joint_script.sh           | 32 +++++++++++++++++++++++++
 eICU_Example/joint_script_slurm.sh     | 33 ++++++++++++++++++++++++++
 eICU_Example/pre_MEDS.py               |  5 ++--
 5 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh
index eb58e89..d3e067f 100755
--- a/MIMIC-IV_Example/joint_script.sh
+++ b/MIMIC-IV_Example/joint_script.sh
@@ -1,5 +1,37 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <MIMICIV_RAW_DIR> <MIMICIV_PREMEDS_DIR> <MIMICIV_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_RAW_DIR        Directory containing raw MIMIC-IV data files."
+    echo "  MIMICIV_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  MIMICIV_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS     Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 MIMICIV_RAW_DIR="$1"
 MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh
index 3948e87..9d9ec0a 100755
--- a/MIMIC-IV_Example/joint_script_slurm.sh
+++ b/MIMIC-IV_Example/joint_script_slurm.sh
@@ -1,5 +1,38 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <MIMICIV_RAW_DIR> <MIMICIV_PREMEDS_DIR> <MIMICIV_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo "This script uses slurm to process the data in parallel via the 'submitit' Hydra launcher."
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_RAW_DIR        Directory containing raw MIMIC-IV data files."
+    echo "  MIMICIV_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  MIMICIV_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS     Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 MIMICIV_RAW_DIR="$1"
 MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 48cec46..4445f49 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -1,5 +1,37 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes eICU data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
+    echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 EICU_RAW_DIR="$1"
 EICU_PREMEDS_DIR="$2"
 EICU_MEDS_DIR="$3"
diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh
index 6b36ef0..7880286 100755
--- a/eICU_Example/joint_script_slurm.sh
+++ b/eICU_Example/joint_script_slurm.sh
@@ -1,5 +1,38 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes eICU data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo "This script uses slurm to process the data in parallel via the 'submitit' Hydra launcher."
+    echo
+    echo "Arguments:"
+    echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
+    echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 EICU_RAW_DIR="$1"
 EICU_PREMEDS_DIR="$2"
 EICU_MEDS_DIR="$3"
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py
index 73e774e..694f1dd 100755
--- a/eICU_Example/pre_MEDS.py
+++ b/eICU_Example/pre_MEDS.py
@@ -270,8 +270,9 @@ def main(cfg: DictConfig):

     hydra_loguru_init()

-    logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...")
-    preprocessors = OmegaConf.load("configs/table_preprocessors.yaml")
+    table_preprocessors_config_fp = Path("./eICU_Example/configs/table_preprocessors.yaml")
+    logger.info(f"Loading table preprocessors from {str(table_preprocessors_config_fp.resolve())}...")
+    preprocessors = OmegaConf.load(table_preprocessors_config_fp)
     functions = {}
     for table_name, preprocessor_cfg in preprocessors.items():
         logger.info(f"  Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}")

From e80be1fde249d492a4d91d138102c71775b125f9 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 16:28:20 -0400
Subject: [PATCH 27/47] Every column in the raw files should apparently be lowercase...
also other typos --- eICU_Example/configs/table_preprocessors.yaml | 268 +++++++++--------- eICU_Example/pre_MEDS.py | 125 ++++---- 2 files changed, 208 insertions(+), 185 deletions(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index da94e96..7c3316f 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,14 +1,14 @@ admissiondx: - offset_col: "admitDxEnteredOffset" + offset_col: "admitdxenteredoffset" pseudotime_col: "admitDxEnteredTimestamp" - output_data_cols: ["admitDxName", "admitDxID"] + output_data_cols: ["admitdxname", "admitdxid"] warning_items: - ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] + ["How should we use `admitdxtest`?", "How should we use `admitdxpath`?"] allergy: - offset_col: "allergyEnteredOffset" + offset_col: "allergyenteredoffset" pseudotime_col: "allergyEnteredTimestamp" - output_data_cols: ["allergyType", "allergyName"] + output_data_cols: ["allergytype", "allergyname"] warning_items: - "How should we use `allergyNoteType`?" - "How should we use `specialtyType`?" @@ -18,201 +18,201 @@ allergy: (docs say 'name of the selected admission drug')? carePlanGeneral: - offset_col: "cplItemOffset" + offset_col: "cplitemoffset" pseudotime_col: "carePlanGeneralItemEnteredTimestamp" - output_data_cols: ["cplGroup", "cplItemValue"] + output_data_cols: ["cplgroup", "cplitemvalue"] carePlanEOL: - offset_col: "cplEolDiscussionOffset" + offset_col: "cpleoldiscussionoffset" pseudotime_col: "carePlanEolDiscussionOccurredTimestamp" warning_items: - "Is the DiscussionOffset time actually reliable? Should we fall back on the SaveOffset time?" carePlanGoal: - offset_col: "cplGoalOffset" + offset_col: "cplgoaloffset" pseudotime_col: "carePlanGoalEnteredTimestamp" - output_data_cols: ["cplGoalCategory", "cplGoalValue", "cplGoalStatus"] + output_data_cols: ["cplgoalcategory", "cplgoalvalue", "cplgoalstatus"] carePlanInfectiousDisease: - offset_col: "cplInfectDiseaseOffset" + offset_col: "cplinfectdiseaseoffset" pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp" output_data_cols: - - "infectDiseaseSite" - - "infectDiseaseAssessment" - - "responseToTherapy" + - "infectdiseasesite" + - "infectdiseaseassessment" + - "responsetotherapy" - "treatment" diagonosis: - offset_col: "diagnosisOffset" + offset_col: "diagnosisoffset" pseudotime_col: "diagnosisEnteredTimestamp" - output_data_cols: ["ICD9Code", "diagnosisPriority", "diagnosisString"] + output_data_cols: ["icd9code", "diagnosispriority", "diagnosisstring"] warning_items: - "Though we use it, the `diagnosisString` field documentation is unclear -- by what is it separated?" infusionDrug: - offset_col: "infusionOffset" + offset_col: "infusionoffset" pseudotime_col: "infusionEnteredTimestamp" output_data_cols: - - "infusionDrugID" - - "drugName" - - "drugRate" - - "infusionRate" - - "drugAmount" - - "volumeOfFluid" - - "patientWeight" + - "infusiondrugid" + - "drugname" + - "drugrate" + - "infusionrate" + - "drugamount" + - "volumeoffluid" + - "patientweight" lab: - offset_col: "labResultOffset" + offset_col: "labresultoffset" pseudotime_col: "labResultDrawnTimestamp" output_data_cols: - - "labName" - - "labResult" - - "labResultText" - - "labMeasureNameSystem" - - "labMeasureNameInterface" - - "labTypeID" + - "labname" + - "labresult" + - "labresulttext" + - "labmeasurenamesystem" + - "labmeasurenameinterface" + - "labtypeid" warning_items: - "Is this the time the lab was drawn? 
Entered? The time the result came in?" - "We **IGNORE** the `labResultRevisedOffset` column -- this may be a mistake!" medication: offset_col: - - "drugOrderOffset" - - "drugStartOffset" - - "drugStopOffset" + - "drugorderoffset" + - "drugstartoffset" + - "drugstopoffset" pseudotime_col: - - "drugOrderTimestamp" - - "drugStartTimestamp" - - "drugStopTimestamp" + - "drugordertimestamp" + - "drugstarttimestamp" + - "drugstoptimestamp" output_data_cols: - - "medicationID" - - "drugIVAdmixture" - - "drugName" - - "drugHiclSeqno" + - "medicationid" + - "drugivadmixture" + - "drugname" + - "drughiclseqno" - "dosage" - - "routeAdmin" + - "routeadmin" - "frequency" - - "loadingDose" - - "PRN" - - "GTC" + - "loadingdose" + - "prn" + - "gtc" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" nurseAssessment: offset_col: - - "nurseAssessOffset" - - "nurseAssessEntryOffset" + - "nurseassessoffset" + - "nurseassessentryoffset" pseudotime_col: - "nurseAssessPerformedTimestamp" - "nurseAssessEnteredTimestamp" output_data_cols: - - "nurseAssessID" - - "cellLabel" - - "cellAttribute" - - "cellAttributeValue" + - "nurseassessid" + - "celllabel" + - "cellattribute" + - "cellattributevalue" warning_items: - "Should we be using `cellAttributePath` instead of `cellAttribute`?" - "SOME MAY BE LISTS" nurseCare: offset_col: - - "nurseCareOffset" - - "nurseCareEntryOffset" + - "nursecareoffset" + - "nursecareentryoffset" pseudotime_col: - "nurseCarePerformedTimestamp" - "nurseCareEnteredTimestamp" output_data_cols: - - "nurseCareID" - - "cellLabel" - - "cellAttribute" - - "cellAttributeValue" + - "nursecareid" + - "celllabel" + - "cellattribute" + - "cellattributevalue" warning_items: - "Should we be using `cellAttributePath` instead of `cellAttribute`?" - "SOME MAY BE LISTS" nurseCharting: offset_col: - - "nursingChartOffset" - - "nursingChartEntryOffset" + - "nursingchartoffset" + - "nursingchartentryoffset" pseudotime_col: - "nursingChartPerformedTimestamp" - "nursingChartEnteredTimestamp" output_data_cols: - - "nursingChartID" - - "nursingChartCellTypeCat" - - "nursingChartCellTypeValName" - - "nursingChartCellTypeValLabel" - - "nursingChartValue" + - "nursingchartid" + - "nursingchartcelltypecat" + - "nursingchartcelltypevalname" + - "nursingchartcelltypevallabel" + - "nursingchartvalue" warning_items: - "SOME MAY BE LISTS" pastHistory: offset_col: - - "pastHistoryOffset" - - "pastHistoryEnteredOffset" + - "pasthistoryoffset" + - "pasthistoryenteredoffset" pseudotime_col: - "pastHistoryTakenTimestamp" - "pastHistoryEnteredTimestamp" output_data_cols: - - "pastHistoryID" - - "pastHistoryNoteType" - - "pastHistoryPath" - - "pastHistoryValue" - - "pastHistoryValueText" + - "pasthistoryid" + - "pasthistorynotetype" + - "pasthistorypath" + - "pasthistoryvalue" + - "pasthistoryvaluetext" warning_items: - "SOME MAY BE LISTS" - "How should we use `pastHistoryPath` vs. `pastHistoryNoteType`?" - "How should we use `pastHistoryValue` vs. `pastHistoryValueText`?" physicalExam: - offset_col: "physicalExamOffset" + offset_col: "physicalexamoffset" pseudotime_col: "physicalExamEnteredTimestamp" output_data_cols: - - "physicalExamID" - - "physicalExamText" - - "physicalExamPath" - - "physicalExamValue" + - "physicalexamid" + - "physicalexamtext" + - "physicalexampath" + - "physicalexamvalue" warning_items: - "How should we use `physicalExamValue` vs. `physicalExamText`?" - "I believe the `physicalExamValue` is a **LIST**. This must be processed specially." 
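+# NOTE: only *input* column names are lowercased in this change -- the raw eICU CSVs use
+# all-lowercase headers, unlike the docs. The `pseudotime_col` names are new columns created
+# by the ETL itself, so they keep their mixed-case spelling.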
respiratoryCare: offset_col: - - "respCareStatusOffset" - - "ventStartOffset" - - "ventEndOffset" + - "respcarestatusoffset" + - "ventstartoffset" + - "ventendoffset" pseudotime_col: - "respCareStatusEnteredTimestamp" - "ventStartTimestamp" - "ventEndTimestamp" output_data_cols: - - "respCareID" - - "airwayType" - - "airwaySize" - - "airwayPosition" - - "cuffPressure" - - "apneaParams" - - "lowExhMVLimit" - - "hiExhMVLimit" - - "lowExhTVLimit" - - "hiPeakPresLimit" - - "lowPeakPresLimit" - - "hiRespRateLimit" - - "lowRespRateLimit" - - "sighPresLimit" - - "lowIronOxLimit" - - "highIronOxLimit" - - "meanAirwayPresLimit" - - "PEEPLimit" - - "CPAPLimit" - - "setApneaInterval" - - "setApneaTV" - - "setApneaIPPEEPHigh" - - "setApneaRR" - - "setApneaPeakFlow" - - "setApneaInspTime" - - "setApneaIE" - - "setApneaFIO2" + - "respcareid" + - "airwaytype" + - "airwaysize" + - "airwayposition" + - "cuffpressure" + - "apneaparams" + - "lowexhmvlimit" + - "hiexhmvlimit" + - "lowexhtvlimit" + - "hipeakpreslimit" + - "lowpeakpreslimit" + - "hirespratelimit" + - "lowrespratelimit" + - "sighpreslimit" + - "lowironoxlimit" + - "highironoxlimit" + - "meanairwaypreslimit" + - "peeplimit" + - "cpaplimit" + - "setapneainterval" + - "setapneatv" + - "setapneaippeephigh" + - "setapnearr" + - "setapneapeakflow" + - "setapneainsptime" + - "setapneaie" + - "setapneafio2" warning_items: - "We ignore the `priorVent*` columns -- this may be a mistake!" - "There is a lot of data in this table -- what should be incorporated into the event structure?" @@ -220,64 +220,64 @@ respiratoryCare: respiratoryCharting: offset_col: - - "respChartOffset" - - "respChartEntryOffset" + - "respchartoffset" + - "respchartentryoffset" pseudotime_col: - "respChartPerformedTimestamp" - "respChartEnteredTimestamp" output_data_cols: - - "respChartID" - - "respChartTypeCat" - - "respChartValueLabel" - - "respChartValue" + - "respchartid" + - "respcharttypecat" + - "respchartvaluelabel" + - "respchartvalue" warning_items: - "SOME MAY BE LISTS" treatment: - offset_col: "treatmentOffset" + offset_col: "treatmentoffset" pseudotime_col: "treatmentEnteredTimestamp" output_data_cols: - - "treatmentID" - - "treatmentString" + - "treatmentid" + - "treatmentstring" warning_items: - "Absence of entries in table do not indicate absence of treatments" vitalAperiodic: - offset_col: "observationOffset" + offset_col: "observationoffset" pseudotime_col: "observationEnteredTimestamp" output_data_cols: - - "vitalAperiodicID" - - "nonInvasiveSystolic" - - "nonInvasiveDiastolic" - - "nonInvasiveMean" + - "vitalaperiodicid" + - "noninvasivesystolic" + - "noninvasivediastolic" + - "noninvasivemean" - "paop" - - "cardiacOutput" - - "cardiacInput" + - "cardiacoutput" + - "cardiacinput" - "svr" - "svri" - "pvr" - "pvri" vitalPeriodic: - offset_col: "observationOffset" + offset_col: "observationoffset" pseudotime_col: "observationEnteredTimestamp" output_data_cols: - - "vitalPeriodicID" + - "vitalperiodicid" - "temperature" - - "saO2" - - "heartRate" + - "sao2" + - "heartrate" - "respiration" - "cvp" - - "etCo2" - - "systemicSystolic" - - "systemicDiastolic" - - "systemicMean" - - "paSystolic" - - "paDiastolic" - - "paMean" + - "etco2" + - "systemicsystolic" + - "systemicdiastolic" + - "systemicmean" + - "pasystolic" + - "padiastolic" + - "pamean" - "st1" - "st2" - "st3" - - "ICP" + - "icp" warning_items: - "These are 5-minute median values. There are going to be a *lot* of events." 
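Rewriting every config string by hand works, but the same normalization could in principle be done once
at load time instead. A hypothetical sketch (not what this patch does) that lowercases all column names
of a scanned table, assuming a plain CSV path:

```python
import polars as pl


def scan_with_lowercase_columns(fp: str) -> pl.LazyFrame:
    """Hypothetical helper: scan a CSV and normalize all of its column names to lowercase,
    so that downstream configs need not care about the casing of the raw headers."""
    lf = pl.scan_csv(fp, infer_schema_length=10000)
    return lf.rename({c: c.lower() for c in lf.columns})
```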
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 694f1dd..a14d36e 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -24,8 +24,8 @@ write_lazyframe, ) -HEALTH_SYSTEM_STAY_ID = "patientHealthSystemStayID" -UNIT_STAY_ID = "patientUnitStayID" +HEALTH_SYSTEM_STAY_ID = "patienthealthsystemstayid" +UNIT_STAY_ID = "patientunitstayid" PATIENT_ID = "uniquepid" # The end of year date, used for year-only timestamps in eICU. The time is set to midnight as we'll add a @@ -48,9 +48,19 @@ def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str): + """Checks that the time-of-day portions agree between the pseudotime and given columns. + + Raises a `ValueError` if the times don't match within a minute. + + Args: + TODO + """ expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S") - time_deltas_min = (pseudotime_col.dt.time() - expected_time).dt.total_minutes() + # The use of `.dt.combine` here re-sets the "time-of-day" of the pseudotime_col column + time_deltas_min = ( + pseudotime_col - pseudotime_col.dt.combine(expected_time) + ).dt.total_minutes() # Check that the time deltas are all within 1 minute logger.info( @@ -76,19 +86,25 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame `configs/event_configs.yaml` file. """ - hospital_discharge_pseudotime = pl.datetime(year=pl.col("hospitalDischargeYear"), **END_OF_YEAR) + pl.col( - "hospitalDischargeTime24" - ).str.strptime(pl.Time, "%H:%M:%S") + hospital_discharge_pseudotime = ( + pl.datetime(year=pl.col("hospitaldischargeyear"), **END_OF_YEAR).dt.combine( + pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S") + ) + ) - unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( - minutes=pl.col("hospitalDischargeOffset") + unit_admit_pseudotime = ( + hospital_discharge_pseudotime - pl.duration(minutes=pl.col("hospitaldischargeoffset")) ) - unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitDischargeOffset")) + unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitdischargeoffset")) - hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitalAdmitOffset")) + hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitaladmitoffset")) - age_in_years = pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16)) + age_in_years = ( + pl.when(pl.col("age") == "> 89") + .then(90) + .otherwise(pl.col("age").cast(pl.UInt16, strict=False)) + ) age_in_days = age_in_years * 365.25 # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate pseudo_date_of_birth = unit_admit_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) @@ -99,10 +115,10 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame "Checking that the 24h times are consistent. If this is extremely slow, consider refactoring to have " "only one `.collect()` call." 
) - check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitalDischargeTime24") - check_timestamps_agree(df, hospital_admit_pseudotime, "hospitalAdmitTime24") - check_timestamps_agree(df, unit_admit_pseudotime, "unitAdmitTime24") - check_timestamps_agree(df, unit_discharge_pseudotime, "unitDischargeTime24") + check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitaldischargetime24") + check_timestamps_agree(df, hospital_admit_pseudotime, "hospitaladmittime24") + check_timestamps_agree(df, unit_admit_pseudotime, "unitadmittime24") + check_timestamps_agree(df, unit_discharge_pseudotime, "unitdischargetime24") logger.info(f"Validated 24h times in {datetime.now() - start}") logger.warning("NOT validating the `unitVisitNumber` column as that isn't implemented yet.") @@ -116,41 +132,42 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame " - `age` is interpreted as the age at the time of the unit stay, not the hospital stay. " "Is this right?\n" " - `What is the actual mean age for those > 89? Here we assume 90.\n" + " - Note that all the column names appear to be all in lowercase for the csv versions, vs. the docs" ) - return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( + return df.join(hospital_df, left_on="hospitalid", right_on="hospitalid", how="left").select( # 1. Static variables PATIENT_ID, "gender", - pseudo_date_of_birth.alias("dateOfBirth"), + pseudo_date_of_birth.alias("dateofbirth"), "ethnicity", # 2. Health system stay parameters HEALTH_SYSTEM_STAY_ID, - "hospitalID", - pl.col("numbedscategory").alias("hospitalNumBedsCategory"), - pl.col("teachingstatus").alias("hospitalTeachingStatus"), - pl.col("region").alias("hospitalRegion"), + "hospitalid", + pl.col("numbedscategory").alias("hospitalnumbedscategory"), + pl.col("teachingstatus").alias("hospitalteachingstatus"), + pl.col("region").alias("hospitalregion"), # 2.1 Admission parameters - hospital_admit_pseudotime.alias("hospitalAdmitTimestamp"), - "hospitalAdmitSource", + hospital_admit_pseudotime.alias("hospitaladmittimestamp"), + "hospitaladmitsource", # 2.2 Discharge parameters - hospital_discharge_pseudotime.alias("hospitalDischargeTimestamp"), - "hospitalDischargeLocation", - "hospitalDischargeStatus", + hospital_discharge_pseudotime.alias("hospitaldischargetimestamp"), + "hospitaldischargelocation", + "hospitaldischargestatus", # 3. 
Unit stay parameters UNIT_STAY_ID, - "wardID", + "wardid", # 3.1 Admission parameters - unit_admit_pseudotime.alias("unitAdmitTimestamp"), - "unitAdmitSource", - "unitStayType", - pl.col("admissionHeight").alias("unitAdmissionHeight"), - pl.col("admissionWeight").alias("unitAdmissionWeight"), + unit_admit_pseudotime.alias("unitadmittimestamp"), + "unitadmitsource", + "unitstaytype", + pl.col("admissionheight").alias("unitadmissionheight"), + pl.col("admissionweight").alias("unitadmissionweight"), # 3.2 Discharge parameters - unit_discharge_pseudotime.alias("unitDischargeTimestamp"), - "unitDischargeLocation", - "unitDischargeStatus", - pl.col("dischargeWeight").alias("unitDischargeWeight"), + unit_discharge_pseudotime.alias("unitdischargetimestamp"), + "unitdischargelocation", + "unitdischargestatus", + pl.col("dischargeweight").alias("unitdischargeweight"), ) @@ -190,7 +207,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: """ pseudotimes = [ - (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) + (pl.col("unitadmittimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) for pseudotime, offset in zip(pseudotime_col, offset_col) ] @@ -211,7 +228,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return fn -NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] +NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitadmittimestamp"] @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") @@ -281,23 +298,29 @@ def main(cfg: DictConfig): raw_cohort_dir = Path(cfg.raw_cohort_dir) MEDS_input_dir = Path(cfg.output_dir) - logger.info("Processing patient table first...") + patient_out_fp = MEDS_input_dir / "patient.parquet" - hospital_fp = raw_cohort_dir / "hospital.csv.gz" - patient_fp = raw_cohort_dir / "patient.csv.gz" - logger.info(f"Loading {str(hospital_fp.resolve())}...") - hospital_df = load_raw_eicu_file( - hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] - ) - logger.info(f"Loading {str(patient_fp.resolve())}...") - raw_patient_df = load_raw_eicu_file(patient_fp) + if patient_out_fp.is_file(): + logger.info(f"Reloading processed patient df from {str(patient_out_fp.resolve())}") + patient_df = pl.read_parquet(patient_out_fp, columns=NEEDED_PATIENT_COLS, use_pyarrow=True).lazy() + else: + logger.info("Processing patient table first...") + + hospital_fp = raw_cohort_dir / "hospital.csv.gz" + patient_fp = raw_cohort_dir / "patient.csv.gz" + logger.info(f"Loading {str(hospital_fp.resolve())}...") + hospital_df = load_raw_eicu_file( + hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] + ) + logger.info(f"Loading {str(patient_fp.resolve())}...") + raw_patient_df = load_raw_eicu_file(patient_fp) - logger.info("Processing patient table...") - patient_df = process_patient(raw_patient_df, hospital_df) - write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") + logger.info("Processing patient table...") + patient_df = process_patient(raw_patient_df, hospital_df) + write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") all_fps = [ - fp for fp in raw_cohort_dir.glob("*/.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} + fp for fp in raw_cohort_dir.glob("*.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} ] unused_tables = { From 9ad4b924884c882c15a93931eb2bebdc229d0ccd Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 
16:29:00 -0400 Subject: [PATCH 28/47] Fixing a typo in config for diagnosis --- eICU_Example/configs/table_preprocessors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 7c3316f..3fe62f3 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -42,7 +42,7 @@ carePlanInfectiousDisease: - "responsetotherapy" - "treatment" -diagonosis: +diagnosis: offset_col: "diagnosisoffset" pseudotime_col: "diagnosisEnteredTimestamp" output_data_cols: ["icd9code", "diagnosispriority", "diagnosisstring"] From 9ced80f81e708fb2474d2ff53cd130dd35af6d2e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 16:35:42 -0400 Subject: [PATCH 29/47] Fixed numerous typos and issues. Makes it through much of the files now in the pre-MEDS stage --- eICU_Example/configs/event_configs.yaml | 112 +++++++++--------- eICU_Example/configs/table_preprocessors.yaml | 2 +- eICU_Example/pre_MEDS.py | 2 +- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 50c8eb0..6ac7ab9 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -17,23 +17,23 @@ patient: hosp_admission: code: - "HOSPITAL_ADMISSION" - - col("hospitalAdmitSource") - - col("hospitalRegion") - - col("hospitalTeachingStatus") - - col("hospitalNumBedsCategory") + - col(hospitaladmitsource) + - col(hospitalregion) + - col(hospitalteachingstatus) + - col(hospitalnumbedscategory) timestamp: "hospitalAdmitTimestamp" hospital_id: "hospitalID" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - - col("hospitalDischargeStatus") - - col("hospitalDischargeLocation") + - col(hospitaldischargestatus) + - col(hospitaldischargelocation) timestamp: "hospitalDischargeTimestamp" unit_admission: code: - "UNIT_ADMISSION" - - col("unitAdmitSource") - - col("unitStayType") + - col(unitadmitsource) + - col(unitstaytype) timestamp: "unitAdmitTimestamp" ward_id: "wardID" unit_stay_id: "patientUnitStayID" @@ -50,8 +50,8 @@ patient: unit_discharge: code: - "UNIT_DISCHARGE" - - col("unitDischargeStatus") - - col("unitDischargeLocation") + - col(unitdischargestatus) + - col(unitdischargelocation) timestamp: "unitDischargeTimestamp" unit_discharge_weight: code: @@ -63,7 +63,7 @@ admissiondx: admission_diagnosis: code: - "ADMISSION_DX" - - col("admitDxName") + - col(admitdxname) timestamp: "admitDxEnteredTimestamp" admission_dx_id: "admitDxID" unit_stay_id: "patientUnitStayID" @@ -72,16 +72,16 @@ allergy: allergy: code: - "ALLERGY" - - col("allergyType") - - col("allergyName") + - col(allergytype) + - col(allergyname) timestamp: "allergyEnteredTimestamp" carePlanGeneral: cplItem: code: - "CAREPLAN_GENERAL" - - col("cplGroup") - - col("cplItemValue") + - col(cplgroup) + - col(cplitemvalue) timestamp: "carePlanGeneralItemEnteredTimestamp" carePlanEOL: @@ -94,27 +94,27 @@ carePlanGoal: cplGoal: code: - "CAREPLAN_GOAL" - - col("cplGoalCategory") - - col("cplGoalValue") - - col("cplGoalStatus") + - col(cplgoalcategory) + - col(cplgoalvalue) + - col(cplgoalstatus) timestamp: "carePlanGoalEnteredTimestamp" carePlanInfectiousDisease: cplInfectDisease: code: - "CAREPLAN_INFECTIOUS_DISEASE" - - col("infectDiseaseSite") - - col("infectDiseaseAssessment") - - col("treatment") - - col("responseToTherapy") + - col(infectdiseasesite) + - col(infectdiseaseassessment) + - 
col(treatment) + - col(responsetotherapy) timestamp: "carePlanInfectDiseaseEnteredTimestamp" diagnosis: diagnosis: code: - "ICD9CM" - - col("ICD9Code") - - col("diagnosisPriority") + - col(icd9code) + - col(diagnosispriority) timestamp: "diagnosisEnteredTimestamp" diagnosis_string: "diagnosisString" @@ -122,8 +122,8 @@ infusionDrug: infusion: code: - "INFUSION" - - col("infusionDrugID") - - col("drugName") + - col(infusiondrugid) + - col(drugname) timestamp: "infusionEnteredTimestamp" drug_rate: "drugRate" infusion_rate: "infusionRate" @@ -139,9 +139,9 @@ lab: lab: code: - "LAB" - - col("labMeasureNameSystem") - - col("labMeasureNameInterface") - - col("labName") + - col(labmeasurenamesystem) + - col(labmeasurenameinterface) + - col(labname) timestamp: "labResultDrawnTimestamp" numerical_value: "labResult" text_value: "labResultText" @@ -294,32 +294,32 @@ respiratoryCare: timestamp: "respCareStatusEnteredTimestamp" resp_care_id: "respCareID" - airwayType: "airwayType" - airwaySize: "airwaySize" - airwayPosition: "airwayPosition" - cuffPressure: "cuffPressure" - apneaParams: "apneaParams" - lowExhMVLimit: "lowExhMVLimit" - hiExhMVLimit: "hiExhMVLimit" - lowExhTVLimit: "lowExhTVLimit" - hiPeakPresLimit: "hiPeakPresLimit" - lowPeakPresLimit: "lowPeakPresLimit" - hiRespRateLimit: "hiRespRateLimit" - lowRespRateLimit: "lowRespRateLimit" - sighPresLimit: "sighPresLimit" - lowIronOxLimit: "lowIronOxLimit" - highIronOxLimit: "highIronOxLimit" - meanAirwayPresLimit: "meanAirwayPresLimit" - PEEPLimit: "PEEPLimit" - CPAPLimit: "CPAPLimit" - setApneaInterval: "setApneaInterval" - setApneaTV: "setApneaTV" - setApneaIPPEEPHigh: "setApneaIPPEEPHigh" - setApneaRR: "setApneaRR" - setApneaPeakFlow: "setApneaPeakFlow" - setApneaInspTime: "setApneaInspTime" - setApneaIE: "setApneaIE" - setApneaFIO2: "setApneaFIO2" + airwaytype: "airwaytype" + airwaysize: "airwaysize" + airwayposition: "airwayposition" + cuffpressure: "cuffpressure" + apneaparms: "apneaparms" + lowexhmvlimit: "lowexhmvlimit" + hiexhmvlimit: "hiexhmvlimit" + lowexhtvlimit: "lowexhtvlimit" + hipeakpreslimit: "hipeakpreslimit" + lowpeakpreslimit: "lowpeakpreslimit" + hirespratelimit: "hirespratelimit" + lowrespratelimit: "lowrespratelimit" + sighpreslimit: "sighpreslimit" + lowironoxlimit: "lowironoxlimit" + highironoxlimit: "highironoxlimit" + meanairwaypreslimit: "meanairwaypreslimit" + peeplimit: "peeplimit" + cpaplimit: "cpaplimit" + setapneainterval: "setapneainterval" + setapneatv: "setapneatv" + setapneaippeephigh: "setapneaippeephigh" + setapnearr: "setapnearr" + setapneapeakflow: "setapneapeakflow" + setapneainsptime: "setapneainsptime" + setapneaie: "setapneaie" + setapneafio2: "setapneafio2" vent_start: code: @@ -365,7 +365,7 @@ treatment: code: - "TREATMENT" - "ENTERED" - - col("treatmentString") + - col(treatmentstring) timestamp: "treatmentEnteredTimestamp" treatment_id: "treatmentID" diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 3fe62f3..3faf4aa 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -191,7 +191,7 @@ respiratoryCare: - "airwaysize" - "airwayposition" - "cuffpressure" - - "apneaparams" + - "apneaparms" - "lowexhmvlimit" - "hiexhmvlimit" - "lowexhtvlimit" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index a14d36e..d06bd7b 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -44,7 +44,7 @@ def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: """ with 
gzip.open(fp, mode="rb") as f: - return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy() + return pl.read_csv(f, infer_schema_length=100000000, **kwargs).lazy() def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str): From 1168641495d20ea54e82699486b5b81faa4dd0fa Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 16:56:29 -0400 Subject: [PATCH 30/47] Linted --- eICU_Example/pre_MEDS.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index d06bd7b..e5855f4 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -58,9 +58,7 @@ def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24ht expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S") # The use of `.dt.combine` here re-sets the "time-of-day" of the pseudotime_col column - time_deltas_min = ( - pseudotime_col - pseudotime_col.dt.combine(expected_time) - ).dt.total_minutes() + time_deltas_min = (pseudotime_col - pseudotime_col.dt.combine(expected_time)).dt.total_minutes() # Check that the time deltas are all within 1 minute logger.info( @@ -86,14 +84,12 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame `configs/event_configs.yaml` file. """ - hospital_discharge_pseudotime = ( - pl.datetime(year=pl.col("hospitaldischargeyear"), **END_OF_YEAR).dt.combine( - pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S") - ) - ) + hospital_discharge_pseudotime = pl.datetime( + year=pl.col("hospitaldischargeyear"), **END_OF_YEAR + ).dt.combine(pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S")) - unit_admit_pseudotime = ( - hospital_discharge_pseudotime - pl.duration(minutes=pl.col("hospitaldischargeoffset")) + unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( + minutes=pl.col("hospitaldischargeoffset") ) unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitdischargeoffset")) @@ -101,9 +97,7 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitaladmitoffset")) age_in_years = ( - pl.when(pl.col("age") == "> 89") - .then(90) - .otherwise(pl.col("age").cast(pl.UInt16, strict=False)) + pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16, strict=False)) ) age_in_days = age_in_years * 365.25 # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate From 39cf4649e67d66aca6f7596c0abb40d12ee3d836 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 17:03:25 -0400 Subject: [PATCH 31/47] Corrected more typos --- eICU_Example/configs/event_configs.yaml | 225 ++++++++++++------------ 1 file changed, 112 insertions(+), 113 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 6ac7ab9..c57584f 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -1,12 +1,12 @@ # Note that there is no "patient_id" for eICU -- patients are only differentiable during the course of a # single health system stay. 
Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" -patient_id_col: patientHealthSystemStayID +patient_id_col: patienthealthsystemstayid patient: dob: code: "DOB" - timestamp: "dateOfBirth" + timestamp: "dateofbirth" uniquepid: "uniquepid" gender: code: ["GENDER", "col(gender)"] @@ -116,7 +116,7 @@ diagnosis: - col(icd9code) - col(diagnosispriority) timestamp: "diagnosisEnteredTimestamp" - diagnosis_string: "diagnosisString" + diagnosis_string: "diagnosisstring" infusionDrug: infusion: @@ -125,15 +125,15 @@ infusionDrug: - col(infusiondrugid) - col(drugname) timestamp: "infusionEnteredTimestamp" - drug_rate: "drugRate" - infusion_rate: "infusionRate" - drug_amount: "drugAmount" - volume_of_fluid: "volumeOfFluid" + drug_rate: "drugrate" + infusion_rate: "infusionrate" + drug_amount: "drugamount" + volume_of_fluid: "volumeoffluid" patient_weight: code: - "INFUSION_PATIENT_WEIGHT" timestamp: "infusionEnteredTimestamp" - numerical_value: "patientWeight" + numerical_value: "patientweight" lab: lab: @@ -143,9 +143,9 @@ lab: - col(labmeasurenameinterface) - col(labname) timestamp: "labResultDrawnTimestamp" - numerical_value: "labResult" - text_value: "labResultText" - lab_type_id: "labTypeID" + numerical_value: "labresult" + text_value: "labresulttext" + lab_type_id: "labtypeid" medication: drug_ordered: @@ -154,28 +154,28 @@ medication: - "ORDERED" - col(drugName) timestamp: "drugOrderTimestamp" - medication_id: "medicationID" - drug_iv_admixture: "drugIVAdmixture" + medication_id: "medicationid" + drug_iv_admixture: "drugivadmixture" dosage: "dosage" - route_admin: "routeAdmin" + route_admin: "routeadmin" frequency: "frequency" - loading_dose: "loadingDose" - prn: "PRN" - gtc: "GTC" + loading_dose: "loadingdose" + prn: "prn" + gtc: "gtc" drug_started: code: - "MEDICATION" - "STARTED" - col(drugName) timestamp: "drugStartedTimestamp" - medication_id: "medicationID" + medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - col(drugName) timestamp: "drugStoppedTimestamp" - medication_id: "medicationID" + medication_id: "medicationid" nurseAssessment: nurse_assessment_performed: @@ -184,10 +184,10 @@ nurseAssessment: - "PERFORMED" - NOT YET DONE timestamp: "nurseAssessPerformedTimestamp" - nurse_assessment_id: "nurseAssessID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_assessment_id: "nurseassessid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurse_assessment_entered: code: @@ -195,10 +195,10 @@ nurseAssessment: - "ENTERED" - NOT YET DONE timestamp: "nurseAssessEnteredTimestamp" - nurse_assessment_id: "nurseAssessID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_assessment_id: "nurseassessid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurseCare: nurse_care_performed: @@ -207,10 +207,10 @@ nurseCare: - "PERFORMED" - NOT YET DONE timestamp: "nurseCarePerformedTimestamp" - nurse_care_id: "nurseCareID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_care_id: "nursecareid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurse_care_entered: code: @@ -218,10 +218,10 @@ nurseCare: - "ENTERED" - NOT YET DONE timestamp: "nurseCareEnteredTimestamp" - nurse_care_id: "nurseCareID" - 
cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_care_id: "nursecareid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurseCharting: nurse_charting_performed: @@ -230,11 +230,11 @@ nurseCharting: - "PERFORMED" - NOT YET DONE timestamp: "nursingChartPerformedTimestamp" - nurse_charting_id: "nursingChartID" - cell_type_cat: "nursingChartCellTypeCat" - cell_type_val_name: "nursingChartCellTypeValName" - cell_type_val_label: "nursingChartCellTypeValLabel" - cell_value: "nursingChartValue" + nurse_charting_id: "nursingchartid" + cell_type_cat: "nursingchartcelltypecat" + cell_type_val_name: "nursingchartcelltypevalname" + cell_type_val_label: "nursingchartcelltypevallabel" + cell_value: "nursingchartvalue" nurse_charting_entered: code: @@ -242,11 +242,11 @@ nurseCharting: - "ENTERED" - NOT YET DONE timestamp: "nursingChartEnteredTimestamp" - nurse_charting_id: "nursingChartID" - cell_type_cat: "nursingChartCellTypeCat" - cell_type_val_name: "nursingChartCellTypeValName" - cell_type_val_label: "nursingChartCellTypeValLabel" - cell_value: "nursingChartValue" + nurse_charting_id: "nursingchartid" + cell_type_cat: "nursingchartcelltypecat" + cell_type_val_name: "nursingchartcelltypevalname" + cell_type_val_label: "nursingchartcelltypevallabel" + cell_value: "nursingchartvalue" pastHistory: past_history_taken: @@ -255,11 +255,11 @@ pastHistory: - "TAKEN" - NOT YET DONE timestamp: "pastHistoryTakenTimestamp" - past_history_id: "pastHistoryID" - note_type: "pastHistoryNoteType" - path: "pastHistoryPath" - value: "pastHistoryValue" - value_text: "pastHistoryValueText" + past_history_id: "pasthistoryid" + note_type: "pasthistorynotetype" + path: "pasthistorypath" + value: "pasthistoryvalue" + value_text: "pasthistoryvaluetext" past_history_entered: code: @@ -267,11 +267,11 @@ pastHistory: - "ENTERED" - NOT YET DONE timestamp: "pastHistoryEnteredTimestamp" - past_history_id: "pastHistoryID" - note_type: "pastHistoryNoteType" - path: "pastHistoryPath" - value: "pastHistoryValue" - value_text: "pastHistoryValueText" + past_history_id: "pasthistoryid" + note_type: "pasthistorynotetype" + path: "pasthistorypath" + value: "pasthistoryvalue" + value_text: "pasthistoryvaluetext" physicalExam: physical_exam_entered: @@ -280,10 +280,10 @@ physicalExam: - "ENTERED" - NOT YET DONE timestamp: "physicalExamEnteredTimestamp" - physical_exam_id: "physicalExamID" - text: "physicalExamText" - path: "physicalExamPath" - value: "physicalExamValue" + physical_exam_id: "physicalexamid" + text: "physicalexamtext" + path: "physicalexampath" + value: "physicalexamvalue" respiratoryCare: resp_care_status: @@ -292,8 +292,7 @@ respiratoryCare: - "STATUS" - NOT YET DONE timestamp: "respCareStatusEnteredTimestamp" - resp_care_id: "respCareID" - + resp_care_id: "respcareid" airwaytype: "airwaytype" airwaysize: "airwaysize" airwayposition: "airwayposition" @@ -327,7 +326,7 @@ respiratoryCare: - "START" - NOT YET DONE timestamp: "ventStartTimestamp" - resp_care_id: "respCareID" + resp_care_id: "respcareid" vent_end: code: @@ -335,7 +334,7 @@ respiratoryCare: - "END" - NOT YET DONE timestamp: "ventEndTimestamp" - resp_care_id: "respCareID" + resp_care_id: "respcareid" respiratoryCharting: resp_charting_performed: @@ -344,10 +343,10 @@ respiratoryCharting: - "PERFORMED" - NOT YET DONE timestamp: "respChartPerformedTimestamp" - resp_chart_id: "respChartID" - type_cat: "respChartTypeCat" - value_label: 
"respChartValueLabel" - value: "respChartValue" + resp_chart_id: "respchartid" + type_cat: "respcharttypecat" + value_label: "respchartvaluelabel" + value: "respchartvalue" resp_charting_entered: code: @@ -355,10 +354,10 @@ respiratoryCharting: - "ENTERED" - NOT YET DONE timestamp: "respChartEnteredTimestamp" - resp_chart_id: "respChartID" - type_cat: "respChartTypeCat" - value_label: "respChartValueLabel" - value: "respChartValue" + resp_chart_id: "respchartid" + type_cat: "respcharttypecat" + value_label: "respchartvaluelabel" + value: "respchartvalue" treatment: treatment: @@ -367,7 +366,7 @@ treatment: - "ENTERED" - col(treatmentstring) timestamp: "treatmentEnteredTimestamp" - treatment_id: "treatmentID" + treatment_id: "treatmentid" vitalAperiodic: non_invasive_systolic: @@ -377,8 +376,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveSystolic" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivesystolic" non_invasive_diastolic: code: - "VITALS" @@ -386,8 +385,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveDiastolic" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivediastolic" non_invasive_mean: code: @@ -396,8 +395,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveMean" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivemean" paop: code: @@ -405,7 +404,7 @@ vitalAperiodic: - "APERIODIC" - "PAOP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "paop" cardiac_output: @@ -414,8 +413,8 @@ vitalAperiodic: - "APERIODIC" - "CARDIAC_OUTPUT" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "cardiacOutput" + vital_id: "vitalaperiodicid" + numeric_value: "cardiacoutput" cardiac_input: code: @@ -423,8 +422,8 @@ vitalAperiodic: - "APERIODIC" - "CARDIAC_INPUT" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "cardiacInput" + vital_id: "vitalaperiodicid" + numeric_value: "cardiacinput" svr: code: @@ -432,7 +431,7 @@ vitalAperiodic: - "APERIODIC" - "SVR" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "svr" svri: @@ -441,7 +440,7 @@ vitalAperiodic: - "APERIODIC" - "SVRI" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "svri" pvr: @@ -450,7 +449,7 @@ vitalAperiodic: - "APERIODIC" - "PVR" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "pvr" pvri: @@ -459,7 +458,7 @@ vitalAperiodic: - "APERIODIC" - "PVRI" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "pvri" vitalPeriodic: @@ -469,7 +468,7 @@ vitalPeriodic: - "PERIODIC" - "TEMPERATURE" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "temperature" saO2: @@ -478,8 +477,8 @@ vitalPeriodic: - "PERIODIC" - "SAO2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "saO2" + vital_id: "vitalperiodicid" + numeric_value: "sao2" heartRate: code: @@ -487,8 +486,8 @@ vitalPeriodic: - "PERIODIC" - 
"HEARTRATE" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "heartRate" + vital_id: "vitalperiodicid" + numeric_value: "heartrate" respiration: code: @@ -496,7 +495,7 @@ vitalPeriodic: - "PERIODIC" - "RESPIRATION" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "respiration" cvp: @@ -505,7 +504,7 @@ vitalPeriodic: - "PERIODIC" - "CVP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "cvp" etCo2: @@ -514,8 +513,8 @@ vitalPeriodic: - "PERIODIC" - "ETCO2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "etCo2" + vital_id: "vitalperiodicid" + numeric_value: "etco2" systemic_systolic: code: @@ -524,8 +523,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicSystolic" + vital_id: "vitalperiodicid" + numeric_value: "systemicsystolic" systemic_diastolic: code: @@ -534,8 +533,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicDiastolic" + vital_id: "vitalperiodicid" + numeric_value: "systemicdiastolic" systemic_mean: code: @@ -544,8 +543,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicMean" + vital_id: "vitalperiodicid" + numeric_value: "systemicmean" pa_systolic: code: @@ -554,8 +553,8 @@ vitalPeriodic: - "BP" - "PULM_ART_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paSystolic" + vital_id: "vitalperiodicid" + numeric_value: "pasystolic" pa_diastolic: code: @@ -564,8 +563,8 @@ vitalPeriodic: - "BP" - "PULM_ART_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paDiastolic" + vital_id: "vitalperiodicid" + numeric_value: "padiastolic" pa_mean: code: @@ -574,8 +573,8 @@ vitalPeriodic: - "BP" - "PULM_ART_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paMean" + vital_id: "vitalperiodicid" + numeric_value: "pamean" st1: code: @@ -583,7 +582,7 @@ vitalPeriodic: - "PERIODIC" - "ST1" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st1" st2: @@ -592,7 +591,7 @@ vitalPeriodic: - "PERIODIC" - "ST2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st2" st3: @@ -601,7 +600,7 @@ vitalPeriodic: - "PERIODIC" - "ST3" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st3" ICP: @@ -610,5 +609,5 @@ vitalPeriodic: - "PERIODIC" - "ICP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "ICP" + vital_id: "vitalperiodicid" + numeric_value: "icp" From 74a86244d2d7a822fb82df0f221766d41e77568f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 17:28:33 -0400 Subject: [PATCH 32/47] Working most of the way through. Some error about vitalsaperiodic and floats vs. 
ints occurring during the event conversion currently, though --- eICU_Example/configs/event_configs.yaml | 144 ++++++++++++------------ 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index c57584f..77f4023 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -6,7 +6,7 @@ patient_id_col: patienthealthsystemstayid patient: dob: code: "DOB" - timestamp: "dateofbirth" + timestamp: col(dateofbirth) uniquepid: "uniquepid" gender: code: ["GENDER", "col(gender)"] @@ -21,52 +21,52 @@ patient: - col(hospitalregion) - col(hospitalteachingstatus) - col(hospitalnumbedscategory) - timestamp: "hospitalAdmitTimestamp" - hospital_id: "hospitalID" + timestamp: col(hospitaladmittimestamp) + hospital_id: "hospitalid" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - col(hospitaldischargestatus) - col(hospitaldischargelocation) - timestamp: "hospitalDischargeTimestamp" + timestamp: col(hospitaldischargetimestamp) unit_admission: code: - "UNIT_ADMISSION" - col(unitadmitsource) - col(unitstaytype) - timestamp: "unitAdmitTimestamp" - ward_id: "wardID" - unit_stay_id: "patientUnitStayID" + timestamp: col(unitadmittimestamp) + ward_id: "wardid" + unit_stay_id: "patientunitstayid" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" - timestamp: "unitAdmitTimestamp" - numerical_value: "unitAdmissionWeight" + timestamp: col(unitadmittimestamp) + numerical_value: "unitadmissionweight" unit_admission_height: code: - "UNIT_ADMISSION_HEIGHT" - timestamp: "unitAdmitTimestamp" - numerical_value: "unitAdmissionHeight" + timestamp: col(unitadmittimestamp) + numerical_value: "unitadmissionheight" unit_discharge: code: - "UNIT_DISCHARGE" - col(unitdischargestatus) - col(unitdischargelocation) - timestamp: "unitDischargeTimestamp" + timestamp: col(unitdischargetimestamp) unit_discharge_weight: code: - "UNIT_DISCHARGE_WEIGHT" - timestamp: "unitDischargeTimestamp" - numerical_value: "unitDischargeWeight" + timestamp: col(unitdischargetimestamp) + numerical_value: "unitdischargeweight" admissiondx: admission_diagnosis: code: - "ADMISSION_DX" - col(admitdxname) - timestamp: "admitDxEnteredTimestamp" + timestamp: col(admitDxEnteredTimestamp) admission_dx_id: "admitDxID" - unit_stay_id: "patientUnitStayID" + unit_stay_id: "patientunitstayid" allergy: allergy: @@ -74,7 +74,7 @@ allergy: - "ALLERGY" - col(allergytype) - col(allergyname) - timestamp: "allergyEnteredTimestamp" + timestamp: col(allergyEnteredTimestamp) carePlanGeneral: cplItem: @@ -82,13 +82,13 @@ carePlanGeneral: - "CAREPLAN_GENERAL" - col(cplgroup) - col(cplitemvalue) - timestamp: "carePlanGeneralItemEnteredTimestamp" + timestamp: col(carePlanGeneralItemEnteredTimestamp) carePlanEOL: cplEolDiscussion: code: - "CAREPLAN_EOL" - timestamp: "carePlanEolDiscussionOccurredTimestamp" + timestamp: col(carePlanEolDiscussionOccurredTimestamp) carePlanGoal: cplGoal: @@ -97,7 +97,7 @@ carePlanGoal: - col(cplgoalcategory) - col(cplgoalvalue) - col(cplgoalstatus) - timestamp: "carePlanGoalEnteredTimestamp" + timestamp: col(carePlanGoalEnteredTimestamp) carePlanInfectiousDisease: cplInfectDisease: @@ -107,7 +107,7 @@ carePlanInfectiousDisease: - col(infectdiseaseassessment) - col(treatment) - col(responsetotherapy) - timestamp: "carePlanInfectDiseaseEnteredTimestamp" + timestamp: col(carePlanInfectDiseaseEnteredTimestamp) diagnosis: diagnosis: @@ -115,7 +115,7 @@ diagnosis: - "ICD9CM" - col(icd9code) - col(diagnosispriority) - timestamp: 
"diagnosisEnteredTimestamp" + timestamp: col(diagnosisEnteredTimestamp) diagnosis_string: "diagnosisstring" infusionDrug: @@ -124,7 +124,7 @@ infusionDrug: - "INFUSION" - col(infusiondrugid) - col(drugname) - timestamp: "infusionEnteredTimestamp" + timestamp: col(infusionEnteredTimestamp) drug_rate: "drugrate" infusion_rate: "infusionrate" drug_amount: "drugamount" @@ -132,7 +132,7 @@ infusionDrug: patient_weight: code: - "INFUSION_PATIENT_WEIGHT" - timestamp: "infusionEnteredTimestamp" + timestamp: col(infusionEnteredTimestamp) numerical_value: "patientweight" lab: @@ -142,7 +142,7 @@ lab: - col(labmeasurenamesystem) - col(labmeasurenameinterface) - col(labname) - timestamp: "labResultDrawnTimestamp" + timestamp: col(labResultDrawnTimestamp) numerical_value: "labresult" text_value: "labresulttext" lab_type_id: "labtypeid" @@ -152,8 +152,8 @@ medication: code: - "MEDICATION" - "ORDERED" - - col(drugName) - timestamp: "drugOrderTimestamp" + - col(drugname) + timestamp: col(drugordertimestamp) medication_id: "medicationid" drug_iv_admixture: "drugivadmixture" dosage: "dosage" @@ -166,15 +166,15 @@ medication: code: - "MEDICATION" - "STARTED" - - col(drugName) - timestamp: "drugStartedTimestamp" + - col(drugname) + timestamp: col(drugstarttimestamp) medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - - col(drugName) - timestamp: "drugStoppedTimestamp" + - col(drugname) + timestamp: col(drugstoptimestamp) medication_id: "medicationid" nurseAssessment: @@ -183,7 +183,7 @@ nurseAssessment: - "NURSE_ASSESSMENT" - "PERFORMED" - NOT YET DONE - timestamp: "nurseAssessPerformedTimestamp" + timestamp: col(nurseAssessPerformedTimestamp) nurse_assessment_id: "nurseassessid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -194,7 +194,7 @@ nurseAssessment: - "NURSE_ASSESSMENT" - "ENTERED" - NOT YET DONE - timestamp: "nurseAssessEnteredTimestamp" + timestamp: col(nurseAssessEnteredTimestamp) nurse_assessment_id: "nurseassessid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -206,7 +206,7 @@ nurseCare: - "NURSE_CARE" - "PERFORMED" - NOT YET DONE - timestamp: "nurseCarePerformedTimestamp" + timestamp: col(nurseCarePerformedTimestamp) nurse_care_id: "nursecareid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -217,7 +217,7 @@ nurseCare: - "NURSE_CARE" - "ENTERED" - NOT YET DONE - timestamp: "nurseCareEnteredTimestamp" + timestamp: col(nurseCareEnteredTimestamp) nurse_care_id: "nursecareid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -229,7 +229,7 @@ nurseCharting: - "NURSE_CHARTING" - "PERFORMED" - NOT YET DONE - timestamp: "nursingChartPerformedTimestamp" + timestamp: col(nursingChartPerformedTimestamp) nurse_charting_id: "nursingchartid" cell_type_cat: "nursingchartcelltypecat" cell_type_val_name: "nursingchartcelltypevalname" @@ -241,7 +241,7 @@ nurseCharting: - "NURSE_CHARTING" - "ENTERED" - NOT YET DONE - timestamp: "nursingChartEnteredTimestamp" + timestamp: col(nursingChartEnteredTimestamp) nurse_charting_id: "nursingchartid" cell_type_cat: "nursingchartcelltypecat" cell_type_val_name: "nursingchartcelltypevalname" @@ -254,7 +254,7 @@ pastHistory: - "PAST_HISTORY" - "TAKEN" - NOT YET DONE - timestamp: "pastHistoryTakenTimestamp" + timestamp: col(pastHistoryTakenTimestamp) past_history_id: "pasthistoryid" note_type: "pasthistorynotetype" path: "pasthistorypath" @@ -266,7 +266,7 @@ pastHistory: - "PAST_HISTORY" - "ENTERED" - NOT YET DONE - timestamp: "pastHistoryEnteredTimestamp" + timestamp: col(pastHistoryEnteredTimestamp) 
past_history_id: "pasthistoryid" note_type: "pasthistorynotetype" path: "pasthistorypath" @@ -279,7 +279,7 @@ physicalExam: - "PHYSICAL_EXAM" - "ENTERED" - NOT YET DONE - timestamp: "physicalExamEnteredTimestamp" + timestamp: col(physicalExamEnteredTimestamp) physical_exam_id: "physicalexamid" text: "physicalexamtext" path: "physicalexampath" @@ -291,7 +291,7 @@ respiratoryCare: - "RESP_CARE" - "STATUS" - NOT YET DONE - timestamp: "respCareStatusEnteredTimestamp" + timestamp: col(respCareStatusEnteredTimestamp) resp_care_id: "respcareid" airwaytype: "airwaytype" airwaysize: "airwaysize" @@ -325,7 +325,7 @@ respiratoryCare: - "VENT" - "START" - NOT YET DONE - timestamp: "ventStartTimestamp" + timestamp: col(ventStartTimestamp) resp_care_id: "respcareid" vent_end: @@ -333,7 +333,7 @@ respiratoryCare: - "VENT" - "END" - NOT YET DONE - timestamp: "ventEndTimestamp" + timestamp: col(ventEndTimestamp) resp_care_id: "respcareid" respiratoryCharting: @@ -342,7 +342,7 @@ respiratoryCharting: - "RESP_CHARTING" - "PERFORMED" - NOT YET DONE - timestamp: "respChartPerformedTimestamp" + timestamp: col(respChartPerformedTimestamp) resp_chart_id: "respchartid" type_cat: "respcharttypecat" value_label: "respchartvaluelabel" @@ -353,7 +353,7 @@ respiratoryCharting: - "RESP_CHARTING" - "ENTERED" - NOT YET DONE - timestamp: "respChartEnteredTimestamp" + timestamp: col(respChartEnteredTimestamp) resp_chart_id: "respchartid" type_cat: "respcharttypecat" value_label: "respchartvaluelabel" @@ -365,7 +365,7 @@ treatment: - "TREATMENT" - "ENTERED" - col(treatmentstring) - timestamp: "treatmentEnteredTimestamp" + timestamp: col(treatmentEnteredTimestamp) treatment_id: "treatmentid" vitalAperiodic: @@ -375,7 +375,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivesystolic" non_invasive_diastolic: @@ -384,7 +384,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivediastolic" @@ -394,7 +394,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivemean" @@ -403,7 +403,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "PAOP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "paop" @@ -412,7 +412,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "CARDIAC_OUTPUT" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "cardiacoutput" @@ -421,7 +421,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "CARDIAC_INPUT" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "cardiacinput" @@ -430,7 +430,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "SVR" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "svr" @@ -439,7 +439,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "SVRI" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "svri" @@ -448,7 +448,7 @@ vitalAperiodic: - 
"VITALS" - "APERIODIC" - "PVR" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "pvr" @@ -457,7 +457,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "PVRI" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "pvri" @@ -467,7 +467,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "TEMPERATURE" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "temperature" @@ -476,7 +476,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "SAO2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "sao2" @@ -485,7 +485,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "HEARTRATE" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "heartrate" @@ -494,7 +494,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "RESPIRATION" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "respiration" @@ -503,7 +503,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "CVP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "cvp" @@ -512,7 +512,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ETCO2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "etco2" @@ -522,7 +522,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicsystolic" @@ -532,7 +532,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicdiastolic" @@ -542,7 +542,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicmean" @@ -552,7 +552,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "pasystolic" @@ -562,7 +562,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "padiastolic" @@ -572,7 +572,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "pamean" @@ -581,7 +581,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST1" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st1" @@ -590,7 +590,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st2" @@ -599,7 +599,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST3" - timestamp: "observationEnteredTimestamp" + timestamp: 
col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st3" @@ -608,6 +608,6 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ICP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "icp" From f979ea46416b9b8f307eedac9efaf31146b746d8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 13:16:39 -0400 Subject: [PATCH 33/47] Incorporating fixes from #8 -- thanks @prenc! --- MIMIC-IV_Example/pre_MEDS.py | 4 ++-- scripts/extraction/merge_to_MEDS_cohort.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/MIMIC-IV_Example/pre_MEDS.py b/MIMIC-IV_Example/pre_MEDS.py index 789b882..bf99f3a 100755 --- a/MIMIC-IV_Example/pre_MEDS.py +++ b/MIMIC-IV_Example/pre_MEDS.py @@ -59,7 +59,7 @@ def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) - return raw_static_df.join(death_times_df, on="subject_id", how="left").select( "subject_id", - pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"), + pl.coalesce(pl.col("deathtime"), pl.col("dod")).alias("dod"), (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"), "gender", ) @@ -106,7 +106,7 @@ def main(cfg: DictConfig): f"No function needed for {pfx}: " f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}" ) - relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True) + relative_in_fp = in_fp.relative_to(out_fp.resolve().parent, walk_up=True) out_fp.symlink_to(relative_in_fp) continue else: diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py index 1c7271d..e7f8bdf 100755 --- a/scripts/extraction/merge_to_MEDS_cohort.py +++ b/scripts/extraction/merge_to_MEDS_cohort.py @@ -25,7 +25,11 @@ def read_fn(sp_dir: Path) -> pl.LazyFrame: logger.info(f"Reading {len(files_to_read)} files:\n{file_strs}") dfs = [pl.scan_parquet(fp, glob=False) for fp in files_to_read] - return pl.concat(dfs, how="diagonal").unique(maintain_order=False).sort(by=["patient_id", "timestamp"]) + return ( + pl.concat(dfs, how="diagonal_relaxed") + .unique(maintain_order=False) + .sort(by=["patient_id", "timestamp"]) + ) def write_fn(df: pl.LazyFrame, out_fp: Path) -> None: From 21dfc19f2361f5d10d3d8f665aa832b585de8fb8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 14:19:28 -0400 Subject: [PATCH 34/47] Make log dir stage dependent --- configs/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 5694e25..be99f84 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,7 +2,7 @@ input_dir: ??? cohort_dir: ??? 
-log_dir: "${cohort_dir}/.logs" +log_dir: "${cohort_dir}/.logs/${stage}" # General pipeline variables do_overwrite: False From d9501a74ec6a803bcb884b368cc169e56e58ebda Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:23:58 -0400 Subject: [PATCH 35/47] Made submitit launcher script work --- MIMIC-IV_Example/joint_script_slurm.sh | 109 +++++++++++++------------ 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 3948e87..0e0af8e 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" -N_PARALLEL_WORKERS="$4" +export MIMICIV_RAW_DIR="$1" +export MIMICIV_PREMEDS_DIR="$2" +export MIMICIV_MEDS_DIR="$3" +export N_PARALLEL_WORKERS="$4" shift 4 @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -echo "Running pre-MEDS conversion on one worker." -./MIMIC-IV_Example/pre_MEDS.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - raw_cohort_dir="$MIMICIV_RAW_DIR" \ - output_dir="$MIMICIV_PREMEDS_DIR" +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir="$MIMICIV_RAW_DIR" \ +# output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -36,43 +36,44 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
"hydra.job.env_copy=[PATH]" \ input_dir="$MIMICIV_PREMEDS_DIR" \ cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml \ + stage=shard_events -echo "Splitting patients on one worker" -./scripts/extraction/split_and_shard_patients.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# echo "Splitting patients on one worker" +# ./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +# echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +# ./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +# echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +# ./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" From 6878bf20a11ce44f4e45b48d64082e4910f0df17 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:24:36 -0400 Subject: [PATCH 36/47] Added singleton sbatch script --- MIMIC-IV_Example/sbatch_joint_script.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 
insertions(+) create mode 100644 MIMIC-IV_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/sbatch_joint_script.sh b/MIMIC-IV_Example/sbatch_joint_script.sh new file mode 100644 index 0000000..e031363 --- /dev/null +++ b/MIMIC-IV_Example/sbatch_joint_script.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +#SBATCH -c 10 # Request one core +#SBATCH -t 0-03:00 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=300GB # Memory total in MiB (for all cores) +#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j) +#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j) + +cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +LOG_DIR="$MIMICIV_MEDS_DIR/.logs" + +echo "Running with saving to $LOG_DIR" + +mkdir -p $LOG_DIR + +PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \ + time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt From dced00b83641e35e4faa17a3413a5b4f5861090e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:34:52 -0400 Subject: [PATCH 37/47] Adding inits to make tests pass despite shared 'pre_MEDS.py' name --- MIMIC-IV_Example/__init__.py | 0 eICU_Example/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 MIMIC-IV_Example/__init__.py create mode 100644 eICU_Example/__init__.py diff --git a/MIMIC-IV_Example/__init__.py b/MIMIC-IV_Example/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eICU_Example/__init__.py b/eICU_Example/__init__.py new file mode 100644 index 0000000..e69de29 From 7d74d60156f96791f45766bdd242bdf113e61f69 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:45:05 -0400 Subject: [PATCH 38/47] Make it always retype numerical values --- src/MEDS_polars_functions/event_conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 15f1e9a..163bf10 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -381,7 +381,8 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy # if numerical_value column is not numeric, convert it to float if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): logger.warning(f"Converting numerical_value to float for codes {codes}") - df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) + + df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 3ec1436414c683ef36bd5e875dee9fa763fe5e7b Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:46:21 -0400 Subject: [PATCH 39/47] typo fix --- src/MEDS_polars_functions/event_conversion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 163bf10..56c90d8 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -379,10 +379,10 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy df = df.select(**event_exprs).unique(maintain_order=True) # if numerical_value 
column is not numeric, convert it to float - if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): - logger.warning(f"Converting numerical_value to float for codes {codes}") - - df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) + if "numerical_value" in df.columns: + if not df.schema["numerical_value"].is_numeric(): + logger.warning(f"Converting numerical_value to float for codes {codes}") + df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 1169cc9b62da9315b463d2c0bb249ba9ca2b5eb0 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:47:49 -0400 Subject: [PATCH 40/47] Undoing recent changes as they don't help --- src/MEDS_polars_functions/event_conversion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 56c90d8..15f1e9a 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -379,9 +379,8 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy df = df.select(**event_exprs).unique(maintain_order=True) # if numerical_value column is not numeric, convert it to float - if "numerical_value" in df.columns: - if not df.schema["numerical_value"].is_numeric(): - logger.warning(f"Converting numerical_value to float for codes {codes}") + if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): + logger.warning(f"Converting numerical_value to float for codes {codes}") df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 637e4bda515e89f9ee58d28ae2d95d36994fde69 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 10:02:01 -0400 Subject: [PATCH 41/47] Use diagonal relaxed to combine the event subshards --- src/MEDS_polars_functions/event_conversion.py | 3 ++- src/MEDS_polars_functions/utils.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 15f1e9a..eae9505 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -278,6 +278,7 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy ... ValueError: Source column 'discharge_time' for event column foobar is not numeric or categorical! Cannot be used as an event col. 
""" # noqa: E501 + df = df event_exprs = {"patient_id": pl.col("patient_id")} if "code" not in event_cfg: @@ -550,5 +551,5 @@ def convert_to_events( except Exception as e: raise ValueError(f"Error extracting event {event_name}: {e}") from e - df = pl.concat(event_dfs, how="diagonal") + df = pl.concat(event_dfs, how="diagonal_relaxed") return df diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..d1e6e09 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -157,7 +157,10 @@ def hydra_loguru_init() -> None: def write_lazyframe(df: pl.LazyFrame, out_fp: Path) -> None: - df.collect().write_parquet(out_fp, use_pyarrow=True) + if isinstance(df, pl.LazyFrame): + df = df.collect() + + df.write_parquet(out_fp, use_pyarrow=True) def get_shard_prefix(base_path: Path, fp: Path) -> str: From eb94a1d961e6208ce91243c83f0bcc3a1a0dd834 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 16:52:54 -0400 Subject: [PATCH 42/47] fixed error in joint script help message for eICU. should apply to MIMIC as well. --- eICU_Example/joint_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh index 4445f49..26e2b57 100755 --- a/eICU_Example/joint_script.sh +++ b/eICU_Example/joint_script.sh @@ -27,7 +27,7 @@ if [[ "$1" == "-h" || "$1" == "--help" ]]; then fi # Check for mandatory parameters -if [ "$#" -ne 4 ]; then +if [ "$#" -lt 4 ]; then echo "Error: Incorrect number of arguments provided." display_help fi From 0af21c7e4c2cb062c54e8035312fa0172e99b33d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 7 Jun 2024 10:49:03 -0400 Subject: [PATCH 43/47] Fixed up sbatch script --- MIMIC-IV_Example/joint_script.sh | 2 +- MIMIC-IV_Example/sbatch_joint_script.sh | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index d3e067f..bf3438e 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -27,7 +27,7 @@ if [[ "$1" == "-h" || "$1" == "--help" ]]; then fi # Check for mandatory parameters -if [ "$#" -ne 4 ]; then +if [ "$#" -lt 4 ]; then echo "Error: Incorrect number of arguments provided." 
    display_help
fi

diff --git a/MIMIC-IV_Example/sbatch_joint_script.sh b/MIMIC-IV_Example/sbatch_joint_script.sh
index e031363..75d3281 100644
--- a/MIMIC-IV_Example/sbatch_joint_script.sh
+++ b/MIMIC-IV_Example/sbatch_joint_script.sh
@@ -6,19 +6,16 @@
 #SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j)
 #SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j)
 
-cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions
+cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions || exit
 
-MIMICIV_RAW_DIR="$1"
-MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
-N_PARALLEL_WORKERS="$4"
 
 LOG_DIR="$MIMICIV_MEDS_DIR/.logs"
 
 echo "Running with saving to $LOG_DIR"
 
-mkdir -p $LOG_DIR
+mkdir -p "$LOG_DIR"
 
 PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \
     time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \
-    ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt
+    ./MIMIC-IV_Example/joint_script.sh "$@" 2> "$LOG_DIR/timings.txt"

From 5cebbfa6100eb84dc9efa359a2600a8628fbde34 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 16:32:26 -0400
Subject: [PATCH 44/47] Allowing for skipping the unique-by in the merge stage.

---
 configs/extraction.yaml                    |  1 +
 scripts/extraction/merge_to_MEDS_cohort.py | 31 +++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/configs/extraction.yaml b/configs/extraction.yaml
index e1e985a..c351951 100644
--- a/configs/extraction.yaml
+++ b/configs/extraction.yaml
@@ -25,3 +25,4 @@ stage_configs:
       held_out: 0.1
   merge_to_MEDS_cohort:
     output_dir: ${cohort_dir}/final_cohort
+    unique_by: "*"
diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py
index e7f8bdf..ade8d50 100755
--- a/scripts/extraction/merge_to_MEDS_cohort.py
+++ b/scripts/extraction/merge_to_MEDS_cohort.py
@@ -2,6 +2,7 @@
 
 import json
 import random
+from functools import partial
 from pathlib import Path
 
 import hydra
@@ -15,7 +16,7 @@
 pl.enable_string_cache()
 
 
-def read_fn(sp_dir: Path) -> pl.LazyFrame:
+def read_fn(sp_dir: Path, unique_by: list[str] | str | None) -> pl.LazyFrame:
     files_to_read = list(sp_dir.glob("**/*.parquet"))
 
     if not files_to_read:
@@ -25,11 +26,25 @@ def read_fn(sp_dir: Path) -> pl.LazyFrame:
     logger.info(f"Reading {len(files_to_read)} files:\n{file_strs}")
 
     dfs = [pl.scan_parquet(fp, glob=False) for fp in files_to_read]
-    return (
-        pl.concat(dfs, how="diagonal_relaxed")
-        .unique(maintain_order=False)
-        .sort(by=["patient_id", "timestamp"])
-    )
+    df = pl.concat(dfs, how="diagonal_relaxed")
+
+    match unique_by:
+        case None:
+            pass
+        case "*":
+            df = df.unique(maintain_order=False)
+        case list() if len(unique_by) > 0 and all(isinstance(u, str) for u in unique_by):
+            subset = []
+            for u in unique_by:
+                if u in df.columns:
+                    subset.append(u)
+                else:
+                    logger.warning(f"Column {u} not found in dataframe. 
Omitting from unique-by subset.")
+            df = df.unique(maintain_order=False, subset=subset)
+        case _:
+            raise ValueError(f"Invalid unique_by value: {unique_by}")
+
+    return df.sort(by=["patient_id", "timestamp"], multithreaded=False)
 
 
 def write_fn(df: pl.LazyFrame, out_fp: Path) -> None:
@@ -63,6 +78,8 @@ def main(cfg: DictConfig):
     patient_splits = list(shards.keys())
     random.shuffle(patient_splits)
 
+    reader = partial(read_fn, unique_by=cfg.stage_cfg.get("unique_by", None))
+
     for sp in patient_splits:
         in_dir = patient_subsharded_dir / sp
         out_fp = Path(cfg.stage_cfg.output_dir) / f"{sp}.parquet"
@@ -70,7 +87,7 @@ def main(cfg: DictConfig):
         shard_fps = sorted(list(in_dir.glob("**/*.parquet")))
         shard_fp_strs = [f"  * {str(fp.resolve())}" for fp in shard_fps]
         logger.info(f"Merging {len(shard_fp_strs)} shards into {out_fp}:\n" + "\n".join(shard_fp_strs))
-        rwlock_wrap(in_dir, out_fp, read_fn, write_fn, identity_fn, do_return=False)
+        rwlock_wrap(in_dir, out_fp, reader, write_fn, identity_fn, do_return=False)
 
     logger.info("Output cohort written.")

From f48ddb72a645c0b9cec2fc107a911be801373c97 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 16:46:00 -0400
Subject: [PATCH 45/47] Added a note to eICU example

---
 eICU_Example/joint_script.sh | 55 ++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 26e2b57..97515f4 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -39,32 +39,37 @@ N_PARALLEL_WORKERS="$4"
 
 shift 4
 
-echo "Running pre-MEDS conversion."
-./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
-
-echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
-./scripts/extraction/shard_events.py \
-    --multirun \
-    worker="range(0,$N_PARALLEL_WORKERS)" \
-    hydra/launcher=joblib \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
-
-echo "Splitting patients in serial"
-./scripts/extraction/split_and_shard_patients.py \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
-
-echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
-./scripts/extraction/convert_to_sharded_events.py \
-    --multirun \
-    worker="range(0,$N_PARALLEL_WORKERS)" \
-    hydra/launcher=joblib \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+echo "Note that eICU is expensive (in memory) in some final stages, as each MEDS shard will end up being "
+echo "large in # of rows (e.g., ~175M) given the frequency of periodic vitals signs. We recommend setting "
+echo "stage_configs.merge_to_MEDS_cohort.unique_by=null in order to mitigate the cost of the unique "
+echo "operation and avoid OOM issues."
+
+#echo "Running pre-MEDS conversion."
+#./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
+#
+#echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
+#./scripts/extraction/shard_events.py \
+#    --multirun \
+#    worker="range(0,$N_PARALLEL_WORKERS)" \
+#    hydra/launcher=joblib \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+#
+#echo "Splitting patients in serial"
+#./scripts/extraction/split_and_shard_patients.py \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+#
+#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
+#./scripts/extraction/convert_to_sharded_events.py \
+#    --multirun \
+#    worker="range(0,$N_PARALLEL_WORKERS)" \
+#    hydra/launcher=joblib \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
 
 echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel"
 ./scripts/extraction/merge_to_MEDS_cohort.py \

From e152a17c2e747b1f86e836fca75b8d5533cbf896 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 17:06:45 -0400
Subject: [PATCH 46/47] Updated scripts and added note to README.md for eICU

---
 eICU_Example/README.md       |  8 +++++
 eICU_Example/joint_script.sh | 63 +++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/eICU_Example/README.md b/eICU_Example/README.md
index 2715613..0984b99 100644
--- a/eICU_Example/README.md
+++ b/eICU_Example/README.md
@@ -70,6 +70,14 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less
 
 ## Step 3: Run the MEDS extraction ETL
 
+Note that eICU has a lot more observations per patient than does MIMIC-IV, so to keep to a reasonable memory
+burden (e.g., \< 150GB per worker), you will want a smaller shard size, as well as to turn off the final unique
+check (which should not be necessary given the structure of eICU and is expensive) in the merge stage. You can
+do this by setting the following parameters at the end of the mandatory args when running this script:
+
+- `stage_configs.split_and_shard_patients.n_patients_per_shard=10000`
+- `stage_configs.merge_to_MEDS_cohort.unique_by=null`
+
 ### Running locally, serially
 
 We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`.
diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 97515f4..fd76ee2 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -39,37 +39,40 @@ N_PARALLEL_WORKERS="$4"
 
 shift 4
 
-echo "Note that eICU is expensive (in memory) in some final stages, as each MEDS shard will end up being "
-echo "large in # of rows (e.g., ~175M) given the frequency of periodic vitals signs. We recommend setting "
-echo "stage_configs.merge_to_MEDS_cohort.unique_by=null in order to mitigate the cost of the unique "
-echo "operation and avoid OOM issues."
+echo "Note that eICU has a lot more observations per patient than does MIMIC-IV, so to keep to a reasonable "
+echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off "
+echo "the final unique check (which should not be necessary given the structure of eICU and is expensive) "
+echo "in the merge stage. 
You can do this by setting the following parameters at the end of the mandatory " +echo "args when running this script:" +echo " * stage_configs.split_and_shard_patients.n_patients_per_shard=10000" +echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" -#echo "Running pre-MEDS conversion." -#./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" -# -#echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/shard_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=joblib \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" -# -#echo "Splitting patients in serial" -#./scripts/extraction/split_and_shard_patients.py \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=joblib \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" +echo "Running pre-MEDS conversion." +./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ From f7415559e3f34a1b370af558bec6501886ad051c Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 11 Jun 2024 09:06:20 -0400 Subject: [PATCH 47/47] Updated some docstrings --- configs/extraction.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 41a0f3a..1a1c0dd 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -29,7 +29,7 @@ stage_configs: This stage shards the raw input events into smaller files for easier processing. Arguments: - `row_chunksize`: The number of rows to read in at a time. - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source - files are pdfs) + files are csvs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: @@ -41,6 +41,11 @@ stage_configs: held-out test sets beyond the IID held out set that will be produced (e.g., for prospective datasets, etc.). 
      - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets.
+      Split fractions for the default split names can be changed by adding a Hydra-style command-line
+      argument for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1
+      split_fracs.held_out=0.2`. A split can be removed with Hydra's `~` override syntax, and a new split
+      name can be added with the standard Hydra `+` override option; e.g., `~split_fracs.held_out
+      +split_fracs.test=0.1`. It is the user's responsibility to ensure that the split fractions sum to 1.
   is_metadata: True
   output_dir: ${cohort_dir}
   n_patients_per_shard: 50000
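
As a worked illustration of the split-fraction overrides documented in [PATCH 47/47] above, the invocation
below follows the style of the example scripts earlier in this series. It is a sketch only: `$PREMEDS_DIR`
and `$MEDS_DIR` are placeholder directories, and other required arguments (e.g., the event conversion
config file) are elided for brevity.

```bash
# Change the default split fractions, remove the default held_out split, and
# add a new `test` split in its place; the fractions must still sum to 1.
# The `~`/`+` prefixes are standard Hydra syntax for deleting/adding keys.
./scripts/extraction/split_and_shard_patients.py \
    input_dir="$PREMEDS_DIR" \
    cohort_dir="$MEDS_DIR" \
    stage_configs.split_and_shard_patients.split_fracs.train=0.8 \
    stage_configs.split_and_shard_patients.split_fracs.tuning=0.1 \
    '~stage_configs.split_and_shard_patients.split_fracs.held_out' \
    '+stage_configs.split_and_shard_patients.split_fracs.test=0.1'
```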
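
Similarly, a sketch of the three `unique_by` modes for the merge stage introduced in [PATCH 44/47]:
`"*"` (the configured default) deduplicates full rows, `null` skips deduplication entirely (the
recommendation above for eICU), and a list of column names deduplicates on just those columns, warning
about and dropping any that are not present in the dataframe. Directories are again placeholders.

```bash
# Default behavior (unique_by: "*" in configs/extraction.yaml): full-row dedup.
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR"

# Skip the unique pass entirely (cheapest; recommended above for eICU):
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR" \
    stage_configs.merge_to_MEDS_cohort.unique_by=null

# Deduplicate on a subset of columns only:
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR" \
    'stage_configs.merge_to_MEDS_cohort.unique_by=[patient_id,timestamp,code]'
```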