From 4f18745c80a096141d619c4f38d06c8b56e6dfc1 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 12:09:55 -0400 Subject: [PATCH 01/47] partial thoughts -- not working --- README.md | 18 +++++++++- configs/extraction.yaml | 25 +++++-------- configs/pipeline.yaml | 27 ++++++++++++++ configs/preprocess.yaml | 56 ++++++++++++++++++------------ pyproject.toml | 2 ++ src/MEDS_polars_functions/utils.py | 34 ++++++++++++++++++ 6 files changed, 123 insertions(+), 39 deletions(-) create mode 100644 configs/pipeline.yaml diff --git a/README.md b/README.md index e03dda6..101f1f8 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,11 @@ more information. This package provides three things: 1. A working, scalable, simple example of how to extract and pre-process MEDS data for downstream modeling. + These examples are provided in the form of: + - A set of integration tests that are run over synthetic data to verify correctness of the ETL pipeline. + See `tests/test_extraction.py` for the ETL tests with the in-built synthetic source data. + - A working MIMIC-IV MEDS ETL pipeline that can be run over MIMIC-IV v2.2 in approximately 1 hour in serial + mode (and much faster if parallelized). See `MIMIC-IV_Example` for more details. 2. A flexible ETL for extracting MEDS data from a variety of source formats. 3. A pre-processing pipeline that can be used for models that require: - Filtering data to only include patients with a certain number of events @@ -27,7 +32,8 @@ This package provides three things: ## Installation -For now, clone this repository and run `pip install -e .` from the repository root. +For now, clone this repository and run `pip install -e .` from the repository root. To use the MIMIC-IV +example, install the optional MIMIC dependencies as well with `pip install -e .[mimic]`. ## MEDS ETL / Extraction Pipeline @@ -197,6 +203,16 @@ running multiple copies of the same script on independent workers to process the steps again need to happen in a single-threaded manner, but these steps are generally very fast and should not be a bottleneck. +## Running the Pipeline in Parallel via Hydra Multirun +We support two (optional) hydra multirun job launchers for parallelizing ETL and pre-processing pipeline +steps: [`joblib`](https://hydra.cc/docs/plugins/joblib_launcher/) (for local parallelism) and +[`submitit`](https://hydra.cc/docs/plugins/submitit_launcher/) to launch things with slurm for cluster +parallelism. + +To use either of these, you need to install additional optional dependencies: + 1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or + 2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. + ## TODOs: 1. We need to have a vehicle to cleanly separate dataset-specific variables from the general configuration diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 54708d0..c46c0af 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,6 +1,12 @@ -# Raw data -raw_cohort_dir: ??? -MEDS_cohort_dir: ??? +defaults: + - pipeline + +# Pipeline Structure +stages: + - shard_by_event + - generate_patient_shards + - convert_to_MEDS_and_subshard + - merge_subshards # Event Conversion event_conversion_config_fp: ??? 
@@ -16,16 +22,3 @@ split_fracs: row_chunksize: 200000000 n_patients_per_shard: 50000 infer_schema_length: 10000 - -# Misc -do_overwrite: False -seed: 1 - -# Hydra -hydra: - job: - name: MEDS_ETL_step_${now:%Y-%m-%d_%H-%M-%S} - run: - dir: ${MEDS_cohort_dir}/.logs/etl/${hydra.job.name} - sweep: - dir: ${MEDS_cohort_dir}/.logs/etl/${hydra.job.name} diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml new file mode 100644 index 0000000..c25ba0a --- /dev/null +++ b/configs/pipeline.yaml @@ -0,0 +1,27 @@ + +# Global IO +input_dir: ??? +cohort_dir: ??? + +log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" + +# General pipeline variables +do_overwrite: False +seed: 1 +stages: ??? # The list of stages to this overall pipeline + +# Worker / Stage information +stage: ??? +worker: 1 +polling_time: 300 # wait time in seconds before beginning reduction steps + +# Stage-specific IO +stage_output_dir: "${cohort_dir}/${stage}" +stage_input_dir: "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}" + +# Hydra +hydra: + run: + dir: "${log_dir}/${hydra.job.name}" + sweep: + dir: "${log_dir}/${hydra.job.name}" diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 397ff93..b1a5517 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,12 +1,36 @@ -# Raw data -MEDS_cohort_dir: ??? -output_data_dir: ??? -log_dir: "${output_data_dir}/.logs" +defaults: + - pipeline + +# Pipeline Structure +stages: + - filter_patients_by_length + - add_time_derived_measurements + - preliminary_counts + - filter_codes + - fit_outlier_detection + - filter_outliers + - fit_normalization + - normalization + - tokenization + - tensorization + +stages: + filter_patients_by_length: + input_dir: ??? + output_dir: ??? + min_events_per_patient: null + min_measurements_per_patient: null + + add_time_derived_measurements + preliminary_counts + filter_codes + fit_outlier_detection + filter_outliers + fit_normalization + normalization + tokenization + tensorization -# Worker / Stage information -stage: ??? 
-worker: 1 -polling_time: 300 # wait time in seconds before beginning reduction steps # Filtering parameters min_code_occurrences: null @@ -32,11 +56,11 @@ code_processing_stages: preliminary_counts: - "code/n_occurrences" - "code/n_patients" - outlier_detection: + fit_outlier_detection: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" - normalization: + fit_normalization: - "code/n_occurrences" - "code/n_patients" - "values/n_occurrences" @@ -45,15 +69,3 @@ code_processing_stages: # Outlier detection outlier_stddev_cutoff: 4.5 - -# Misc -do_overwrite: False - -# Hydra -hydra: - job: - name: "MEDS_Preprocessor/stage_${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" - run: - dir: "${log_dir}/${hydra.job.name}" - sweep: - dir: "${log_dir}/${hydra.job.name}" diff --git a/pyproject.toml b/pyproject.toml index bbf8dee..29bba91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ dependencies = ["polars", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-c mimic = ["rootutils"] dev = ["pre-commit"] tests = ["pytest", "pytest-cov[toml]", "rootutils"] +local_parallelism = ["hydra-joblib-launcher"] +slurm_parallelism = ["hydra-submitit-launcher"] [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_polars_functions" diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index 7899653..f389d49 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -3,10 +3,44 @@ import os from pathlib import Path +from omegaconf import OmegaConf import hydra import polars as pl from loguru import logger as log +def get_stage_input_dir( + raw_input_dir: str, cohort_dir: str, stages: list[str], stage: str +) -> str: + """Resolves the input directory for a stage in a MEDS pipeline. + + Args: + raw_input_dir: The raw input directory (used as the input when the stage is the 1st stage). + cohort_dir: The cohort (output) directory; used as the source for the default stage output. + stages: The stages in the pipeline. + stage: The current stage. + + Returns: + The input directory for the current stage. + + Examples: + >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage1") + '/a/b' + >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage2") + '/c/d/stage1' + """ + if stage == stages[0]: + return raw_input_dir + elif stage not in stages: + raise ValueError( + f"Can't impute input directory for {stage} as it is not in the stages list! " + f"Stages: {stages}. " + "If this is intentional, please provide the input directory explicitly or remove the " + "attempted interpolation from your config by overwriting the `stage_input_dir` parameter." + ) + return os.path.join(cohort_dir, stages[stages.index(stage) - 1]) + +# We actually call this here that way it is registered in every script when the module is imported. +OmegaConf.register_new_resolver("stage_input_idr", get_stage_input_dir, replace=True) def hydra_loguru_init() -> None: """Adds loguru output to the logs that hydra scrapes. 
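Note on the patch above: the `${stage_input_dir:...}` interpolation it adds to `configs/pipeline.yaml` relies on OmegaConf's custom resolver mechanism, but the resolver is registered under the misspelled name `stage_input_idr`, which would leave that interpolation unresolvable (plausibly part of why the subject line flags this commit as not working). A minimal, self-contained sketch of the intended mechanism, with the resolver name spelled consistently and hypothetical directory values:

```python
from omegaconf import OmegaConf


def stage_input_dir(raw_input_dir: str, cohort_dir: str, stages, stage: str) -> str:
    # The first stage reads the raw input; every later stage reads the prior stage's output.
    if stage == stages[0]:
        return raw_input_dir
    return f"{cohort_dir}/{stages[stages.index(stage) - 1]}"


# The registered name must exactly match the name used inside the ${...} interpolation.
OmegaConf.register_new_resolver("stage_input_dir", stage_input_dir, replace=True)

cfg = OmegaConf.create(
    {
        "input_dir": "/raw",
        "cohort_dir": "/cohort",
        "stages": ["shard_by_event", "generate_patient_shards"],
        "stage": "generate_patient_shards",
        "stage_input_dir": "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}",
    }
)
print(cfg.stage_input_dir)  # /cohort/shard_by_event -- resolved lazily, on first access
```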
From 84eaf5f9a6b0699a846ae99692b8770871516c70 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 13:00:59 -0400 Subject: [PATCH 02/47] New structure based on conversation with Nassim --- configs/preprocess.yaml | 103 +++++++++++++++------------------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index b1a5517..0b7e71d 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,71 +1,46 @@ defaults: - pipeline -# Pipeline Structure -stages: - - filter_patients_by_length - - add_time_derived_measurements - - preliminary_counts - - filter_codes - - fit_outlier_detection - - filter_outliers - - fit_normalization - - normalization - - tokenization - - tensorization +# Global pipeline parameters: +# 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual +# tokenization. +code_modifier_columns: ??? +# Pipeline Structure stages: - filter_patients_by_length: - input_dir: ??? - output_dir: ??? + - name: filter_patients_by_length: min_events_per_patient: null min_measurements_per_patient: null - - add_time_derived_measurements - preliminary_counts - filter_codes - fit_outlier_detection - filter_outliers - fit_normalization - normalization - tokenization - tensorization - - -# Filtering parameters -min_code_occurrences: null -min_events_per_patient: null -min_measurements_per_patient: null - -# Time-derived measurements -time_derived_measurements: - age: - dob_code: ??? - age_code: "AGE" - age_unit: "years" - time_of_day: - bin_endpoints: [6, 12, 18, 24] - -# Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual -# tokenization. -code_modifier_columns: ??? - -# Code metadata extraction. These may contain duplicates because the data may be filtered between different -# stages, depending on the pipeline in question. -code_processing_stages: - preliminary_counts: - - "code/n_occurrences" - - "code/n_patients" - fit_outlier_detection: - - "values/n_occurrences" - - "values/sum" - - "values/sum_sqd" - fit_normalization: - - "code/n_occurrences" - - "code/n_patients" - - "values/n_occurrences" - - "values/sum" - - "values/sum_sqd" - -# Outlier detection -outlier_stddev_cutoff: 4.5 + - name: add_time_derived_measurements: + age: + dob_code: ??? + age_code: "AGE" + age_unit: "years" + time_of_day: + bin_endpoints: [6, 12, 18, 24] + - name: preliminary_counts + obs_aggregations: + - "code/n_occurrences" + - "code/n_patients" + - name: filter_codes + min_code_occurrences: null + - name: fit_outlier_detection + aggregations: + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" + - name: filter_outliers + stddev_cutoff: 4.5 + - name: fit_normalization + aggregations: + - "code/n_occurrences" + - "code/n_patients" + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" + - name: normalization + - name: tokenization + - name: tensorization + +stage: ??? +stage_cfg: ${populate_stage:${stage}} From 2cde29a4be5af2d36b67001603adb4260d8131dc Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 13:21:30 -0400 Subject: [PATCH 03/47] Updated configs further and started README documentation for this. 
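Annotation: among the changes below, `configs/pipeline.yaml` gains `stage: ${current_script_name:}`, so the stage name is inferred from the running script rather than passed explicitly. The resolver itself is only registered in a later commit in this series; a minimal sketch of the intended behavior, mirroring the `current_script_name` helper that lands there:

```python
import sys
from pathlib import Path

from omegaconf import OmegaConf


def current_script_name() -> str:
    """Return the stem of the running script, e.g. "shard_events" for shard_events.py."""
    return Path(sys.argv[0]).stem


OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=True)

cfg = OmegaConf.create({"stage": "${current_script_name:}"})
print(cfg.stage)  # "shard_events" when invoked as ./scripts/extraction/shard_events.py
```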
--- README.md | 29 ++++++++++++++++++++++++++--- configs/extraction.yaml | 33 ++++++++++++++------------------- configs/pipeline.yaml | 12 ++++++------ configs/preprocess.yaml | 14 +++++++++----- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 101f1f8..9669c48 100644 --- a/README.md +++ b/README.md @@ -203,15 +203,38 @@ running multiple copies of the same script on independent workers to process the steps again need to happen in a single-threaded manner, but these steps are generally very fast and should not be a bottleneck. -## Running the Pipeline in Parallel via Hydra Multirun +## Overview of configuration manipulation + +### Pipeline configuration: Stages and OmegaConf Resolvers + +The pipeline configuration file for both the provided extraction and pre-processing pipelines are structured +to permit both ease of understanding, flexibility for user-derived modifications, and ease of use in the +simple, file-in/file-out scripts that this repository promotes. How this works is that each pipeline +(extraction and pre-processing) defines one global configuration file which is used as the Hydra specification +for all scripts in that pipeline. This file leverages some generic pipeline configuration options, specified +in `pipeline.yaml` and imported via the Hydra `defaults:` list, but also defines a list of stages with +stage-specific configurations. + +The user can specify the stage in question on the command line either manually (e.g., `stage=stage_name`) or +allow the stage name to be inferred automatically from the script name. Each script receives both the global +configuration file but also a sub-configuration (within the `stage_cfg` node in the received global +configuration) which is pre-populated with the stage-specific configuration for the stage in question and +automatically inferred input and output file paths (if not overwritten in the config file) based on the stage +name and its position in the overall pipeline. This makes it easy to leverage transformations and scripts +defined here in new configuration pipelines, simply by placing them as a stage in a broader pipeline in a +different configuration or order relative to other stages. + +### Running the Pipeline in Parallel via Hydra Multirun + We support two (optional) hydra multirun job launchers for parallelizing ETL and pre-processing pipeline steps: [`joblib`](https://hydra.cc/docs/plugins/joblib_launcher/) (for local parallelism) and [`submitit`](https://hydra.cc/docs/plugins/submitit_launcher/) to launch things with slurm for cluster parallelism. To use either of these, you need to install additional optional dependencies: - 1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or - 2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. + +1. `pip install -e .[local_parallelism]` for joblib local parallelism support, or +2. `pip install -e .[slurm_parallelism]` for submitit cluster parallelism support. ## TODOs: diff --git a/configs/extraction.yaml b/configs/extraction.yaml index c46c0af..c454d31 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,24 +1,19 @@ defaults: - pipeline -# Pipeline Structure -stages: - - shard_by_event - - generate_patient_shards - - convert_to_MEDS_and_subshard - - merge_subshards - -# Event Conversion +# The event conversion configuration file is used throughout the pipeline to define the events to extract. event_conversion_config_fp: ??? 
-# Splits -external_splits_json_fp: null -split_fracs: - train: 0.8 - tuning: 0.1 - held_out: 0.1 - -# Sharding -row_chunksize: 200000000 -n_patients_per_shard: 50000 -infer_schema_length: 10000 +stages: + - name: shard_events + row_chunksize: 200000000 + infer_schema_length: 10000 + - name: split_and_shard_patients + n_patients_per_shard: 50000 + external_splits_json_fp: null + split_fracs: + train: 0.8 + tuning: 0.1 + held_out: 0.1 + - name: convert_to_sharded_events + - name: merge_to_MEDS_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index c25ba0a..a477667 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -1,4 +1,3 @@ - # Global IO input_dir: ??? cohort_dir: ??? @@ -10,17 +9,18 @@ do_overwrite: False seed: 1 stages: ??? # The list of stages to this overall pipeline -# Worker / Stage information -stage: ??? +# Mapreduce information worker: 1 polling_time: 300 # wait time in seconds before beginning reduction steps -# Stage-specific IO -stage_output_dir: "${cohort_dir}/${stage}" -stage_input_dir: "${stage_input_dir:${input_dir},${cohort_dir},${stages},${stage}}" +# Filling in the current stage +stage: ${current_script_name:} +stage_cfg: ${oc.create:${populate_stage:${stage}}} # Hydra hydra: + job: + name: "${stage}" run: dir: "${log_dir}/${hydra.job.name}" sweep: diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 0b7e71d..9b60579 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -8,29 +8,35 @@ code_modifier_columns: ??? # Pipeline Structure stages: - - name: filter_patients_by_length: + - name: filter_patients min_events_per_patient: null min_measurements_per_patient: null - - name: add_time_derived_measurements: + + - name: add_time_derived_measurements age: dob_code: ??? age_code: "AGE" age_unit: "years" time_of_day: bin_endpoints: [6, 12, 18, 24] + - name: preliminary_counts obs_aggregations: - "code/n_occurrences" - "code/n_patients" + - name: filter_codes min_code_occurrences: null + - name: fit_outlier_detection aggregations: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" + - name: filter_outliers stddev_cutoff: 4.5 + - name: fit_normalization aggregations: - "code/n_occurrences" @@ -38,9 +44,7 @@ stages: - "values/n_occurrences" - "values/sum" - "values/sum_sqd" + - name: normalization - name: tokenization - name: tensorization - -stage: ??? -stage_cfg: ${populate_stage:${stage}} From aaec6f3b291a418ba6ca11b986eb9ef259584296 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 16:41:19 -0400 Subject: [PATCH 04/47] Got the custom OmegaConf resolvers working for populating the stage config. 
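Annotation: the core pattern this commit lands is a composition of two resolvers inside `pipeline.yaml`: `populate_stage` returns a plain dictionary of inferred stage parameters, and OmegaConf's built-in `oc.create` resolver wraps that dictionary into a `DictConfig` node, so downstream scripts can read keys like `cfg.stage_cfg.output_dir` directly. A minimal runnable sketch using a hypothetical stand-in for `populate_stage`:

```python
from omegaconf import OmegaConf

# Hypothetical stand-in for populate_stage: returns a plain dict of inferred parameters.
OmegaConf.register_new_resolver(
    "demo_populate_stage",
    lambda stage: {"is_metadata": False, "output_dir": f"/cohort/{stage}"},
    replace=True,
)

cfg = OmegaConf.create(
    {
        "stage": "shard_events",
        # oc.create converts the dict returned by the inner resolver into a DictConfig.
        "stage_cfg": "${oc.create:${demo_populate_stage:${stage}}}",
    }
)
print(cfg.stage_cfg.output_dir)  # /cohort/shard_events
```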
--- .pre-commit-config.yaml | 1 + configs/pipeline.yaml | 2 +- .../extraction/convert_to_sharded_events.py | 6 + src/MEDS_polars_functions/utils.py | 145 +++++++++++++++--- 4 files changed, 129 insertions(+), 25 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7540f52..1533f74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,6 +38,7 @@ repos: rev: v2.2.0 hooks: - id: autoflake + args: [--in-place, --remove-all-unused-imports] # python upgrading syntax to newer version - repo: https://github.com/asottile/pyupgrade diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index a477667..12feaea 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -15,7 +15,7 @@ polling_time: 300 # wait time in seconds before beginning reduction steps # Filling in the current stage stage: ${current_script_name:} -stage_cfg: ${oc.create:${populate_stage:${stage}}} +stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}}} # Hydra hydra: diff --git a/scripts/extraction/convert_to_sharded_events.py b/scripts/extraction/convert_to_sharded_events.py index 50fcab2..07eb7a2 100755 --- a/scripts/extraction/convert_to_sharded_events.py +++ b/scripts/extraction/convert_to_sharded_events.py @@ -22,6 +22,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + Path(cfg.raw_cohort_dir) MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index f389d49..e61f21d 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,46 +1,143 @@ """Core utilities for MEDS pipelines built with these tools.""" import os +import sys from pathlib import Path -from omegaconf import OmegaConf import hydra import polars as pl from loguru import logger as log +from omegaconf import OmegaConf + -def get_stage_input_dir( - raw_input_dir: str, cohort_dir: str, stages: list[str], stage: str -) -> str: - """Resolves the input directory for a stage in a MEDS pipeline. +def current_script_name() -> str: + """Returns the name of the script that called this function. + + Returns: + str: The name of the script that called this function. + """ + return Path(sys.argv[0]).stem + + +def populate_stage( + stage_name: str, + input_dir: str, + cohort_dir: str, + stages: list[dict], + pre_parsed_stages: list[dict] | None = None, +) -> dict: + """Populates a stage in the stages configuration with inferred stage parameters. + + Infers and adds (unless already present, in which case the provided value is used) the following + parameters to the stage configuration: + - `is_metadata`: Whether the stage is a metadata stage, which is determined to be `False` if the stage + does not have an `aggregations` parameter. + - `data_input_dir`: The input directory for the stage (either the global input directory or the previous + data stage's output directory). + - `metadata_input_dir`: The input directory for the stage (either the global input directory or the + previous metadata stage's output directory). + - `output_dir`: The output directory for the stage (the cohort directory with the stage name appended). Args: - raw_input_dir: The raw input directory (used as the input when the stage is the 1st stage). - cohort_dir: The cohort (output) directory; used as the source for the default stage output. 
- stages: The stages in the pipeline. - stage: The current stage. + stage_name: The name of the stage to populate. + input_dir: The global input directory. + cohort_dir: The cohort directory into which this overall pipeline is writing data. + stages: The stages configuration dictionaries (unresolved). + pre_parsed_stages: The stages configuration dictionaries (resolved). If specified, the function will + not re-resolve the stages in this list. Returns: - The input directory for the current stage. + dict: The populated stage configuration. + + Raises: + ValueError: If the stage is not present in the stages configuration. Examples: - >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage1") - '/a/b' - >>> get_stage_input_dir("/a/b", "/c/d", ["stage1", "stage2"], "stage2") - '/c/d/stage1' + >>> root_config = DictConfig({ + ... "input_dir": "/a/b", + ... "cohort_dir": "/c/d", + ... "stages": [ + ... {"name": "stage1"}, + ... {"name": "stage2", "is_metadata": True}, + ... {"name": "stage3", "is_metadata": None}, + ... {"name": "stage4", "data_input_dir": "/e/f", "output_dir": "/g/h"}, + ... {"name": "stage5", "aggregations": ["foo"]}, + ... {"name": "stage6"}, + ... ], + ... }) + >>> args = (root_config["input_dir"], root_config["cohort_dir"], root_config["stages"]) + >>> populate_stage("stage1", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage1', 'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', + 'output_dir': '/c/d/stage1'} + >>> populate_stage("stage2", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage2', 'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', + 'output_dir': '/c/d/stage2'} + >>> populate_stage("stage3", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage3', 'is_metadata': False, 'data_input_dir': '/c/d/stage1', + 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage3'} + >>> populate_stage("stage4", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage4', 'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, + 'metadata_input_dir': '/c/d/stage2'} + >>> populate_stage("stage5", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage5', 'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', + 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage5'} + >>> populate_stage("stage6", *args) # doctest: +NORMALIZE_WHITESPACE + {'name': 'stage6', 'is_metadata': False, 'data_input_dir': '/g/h', + 'metadata_input_dir': '/c/d/stage5', 'output_dir': '/c/d/stage6'} + >>> populate_stage("stage7", *args) # doctest: +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... + ValueError: 'stage7' is not a valid stage name. Options are: + ['stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'stage6'] """ - if stage == stages[0]: - return raw_input_dir - elif stage not in stages: + + if stage_name not in {s["name"] for s in stages}: raise ValueError( - f"Can't impute input directory for {stage} as it is not in the stages list! " - f"Stages: {stages}. " - "If this is intentional, please provide the input directory explicitly or remove the " - "attempted interpolation from your config by overwriting the `stage_input_dir` parameter." + f"'{stage_name}' is not a valid stage name. Options are: {list(s['name'] for s in stages)}" ) - return os.path.join(cohort_dir, stages[stages.index(stage) - 1]) -# We actually call this here that way it is registered in every script when the module is imported. 
-OmegaConf.register_new_resolver("stage_input_idr", get_stage_input_dir, replace=True) + pre_pop_stages_by_name = {s["name"]: s for s in pre_parsed_stages} if pre_parsed_stages else {} + pre_parsed_stages = pre_parsed_stages or [] + + prior_stages = [] + stage = None + prior_data_stage = None + prior_metadata_stage = None + for s in stages: + if s["name"] == stage_name: + stage = s + break + elif s["name"] in pre_pop_stages_by_name: + s_resolved = pre_pop_stages_by_name[s["name"]] + else: + s_resolved = populate_stage(s["name"], input_dir, cohort_dir, stages, prior_stages) + + if s_resolved["is_metadata"]: + prior_metadata_stage = s_resolved + else: + prior_data_stage = s_resolved + prior_stages.append(s_resolved) + + inferred_keys = { + "is_metadata": "aggregations" in stage, + "data_input_dir": input_dir if prior_data_stage is None else prior_data_stage["output_dir"], + "metadata_input_dir": ( + input_dir if prior_metadata_stage is None else prior_metadata_stage["output_dir"] + ), + "output_dir": os.path.join(cohort_dir, stage_name), + } + + out = {**stage} + for key, val in inferred_keys.items(): + if key not in out or out[key] is None: + out[key] = val + + return out + + +OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) +OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) + def hydra_loguru_init() -> None: """Adds loguru output to the logs that hydra scrapes. From 732a0002ad002b4595121ae59d1c7b0179005765 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 27 May 2024 17:56:36 -0400 Subject: [PATCH 05/47] Got tests to pass (including integration) on the extraction scripts with the new hydra setup and custom resolvers --- configs/extraction.yaml | 16 +++- configs/pipeline.yaml | 5 +- .../extraction/convert_to_sharded_events.py | 9 +- scripts/extraction/merge_to_MEDS_cohort.py | 14 +-- scripts/extraction/shard_events.py | 23 +++-- .../extraction/split_and_shard_patients.py | 18 ++-- .../add_time_derived_measurements.py | 13 ++- .../preprocessing/collect_code_metadata.py | 8 +- scripts/preprocessing/filter_patients.py | 8 +- src/MEDS_polars_functions/utils.py | 85 +++++++++++-------- tests/test_extraction.py | 52 +++++++----- 11 files changed, 155 insertions(+), 96 deletions(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index c454d31..e1e985a 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -5,15 +5,23 @@ defaults: event_conversion_config_fp: ??? stages: - - name: shard_events + - shard_events + - split_and_shard_patients + - convert_to_sharded_events + - merge_to_MEDS_cohort + +stage_configs: + shard_events: row_chunksize: 200000000 infer_schema_length: 10000 - - name: split_and_shard_patients + split_and_shard_patients: + is_metadata: True + output_dir: ${cohort_dir} n_patients_per_shard: 50000 external_splits_json_fp: null split_fracs: train: 0.8 tuning: 0.1 held_out: 0.1 - - name: convert_to_sharded_events - - name: merge_to_MEDS_cohort + merge_to_MEDS_cohort: + output_dir: ${cohort_dir}/final_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 12feaea..851afd7 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -7,7 +7,8 @@ log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S} # General pipeline variables do_overwrite: False seed: 1 -stages: ??? # The list of stages to this overall pipeline +stages: ??? # The list of stages to this overall pipeline (in order) +stage_configs: ??? 
# The configurations for each stage, keyed by stage name # Mapreduce information worker: 1 @@ -15,7 +16,7 @@ polling_time: 300 # wait time in seconds before beginning reduction steps # Filling in the current stage stage: ${current_script_name:} -stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}}} +stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, ${stages}, ${stage_configs}}} # Hydra hydra: diff --git a/scripts/extraction/convert_to_sharded_events.py b/scripts/extraction/convert_to_sharded_events.py index 07eb7a2..bc1eff3 100755 --- a/scripts/extraction/convert_to_sharded_events.py +++ b/scripts/extraction/convert_to_sharded_events.py @@ -28,10 +28,7 @@ def main(cfg: DictConfig): f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" ) - Path(cfg.raw_cohort_dir) - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -45,7 +42,7 @@ def main(cfg: DictConfig): default_patient_id_col = event_conversion_cfg.pop("patient_id_col", "patient_id") - patient_subsharded_dir = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_dir = Path(cfg.stage_cfg.output_dir) patient_subsharded_dir.mkdir(parents=True, exist_ok=True) OmegaConf.save(event_conversion_cfg, patient_subsharded_dir / "event_conversion_config.yaml") @@ -63,7 +60,7 @@ def main(cfg: DictConfig): event_cfgs = copy.deepcopy(event_cfgs) input_patient_id_column = event_cfgs.pop("patient_id_col", default_patient_id_col) - event_shards = list((MEDS_cohort_dir / "sub_sharded" / input_prefix).glob("*.parquet")) + event_shards = list((Path(cfg.stage_cfg.data_input_dir) / input_prefix).glob("*.parquet")) random.shuffle(event_shards) for shard_fp in event_shards: diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py index cc69d2f..1c7271d 100755 --- a/scripts/extraction/merge_to_MEDS_cohort.py +++ b/scripts/extraction/merge_to_MEDS_cohort.py @@ -7,7 +7,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.mapper import wrap as rwlock_wrap from MEDS_polars_functions.utils import hydra_loguru_init @@ -42,13 +42,17 @@ def main(cfg: DictConfig): hydra_loguru_init() - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) logger.info("Starting patient shard merging.") - patient_subsharded_dir = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_dir = Path(cfg.stage_cfg.data_input_dir) if not patient_subsharded_dir.is_dir(): raise FileNotFoundError(f"Patient sub-sharded directory not found: {patient_subsharded_dir}") @@ -57,7 +61,7 @@ def main(cfg: DictConfig): for sp in patient_splits: in_dir = patient_subsharded_dir / sp - out_fp = MEDS_cohort_dir / "final_cohort" / f"{sp}.parquet" + out_fp = Path(cfg.stage_cfg.output_dir) / f"{sp}.parquet" shard_fps = sorted(list(in_dir.glob("**/*.parquet"))) shard_fp_strs = [f" * {str(fp.resolve())}" for fp 
in shard_fps] diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index 15737c1..5ccc36f 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -190,9 +190,14 @@ def main(cfg: DictConfig): """ hydra_loguru_init() - raw_cohort_dir = Path(cfg.raw_cohort_dir) - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - row_chunksize = cfg.row_chunksize + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + + raw_cohort_dir = Path(cfg.stage_cfg.data_input_dir) + row_chunksize = cfg.stage_cfg.row_chunksize event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -226,19 +231,21 @@ def main(cfg: DictConfig): ) logger.info( f"Will read raw data from {str(raw_cohort_dir.resolve())}/$IN_FILE.parquet and write sub-sharded " - f"data to {str(MEDS_cohort_dir.resolve())}/sub_sharded/$IN_FILE/$ROW_START-$ROW_END.parquet" + f"data to {cfg.stage_cfg.output_dir}/$IN_FILE/$ROW_START-$ROW_END.parquet" ) start = datetime.now() for input_file in input_files_to_subshard: columns = prefix_to_columns[get_shard_prefix(raw_cohort_dir, input_file)] - out_dir = MEDS_cohort_dir / "sub_sharded" / get_shard_prefix(raw_cohort_dir, input_file) + out_dir = Path(cfg.stage_cfg.output_dir) / get_shard_prefix(raw_cohort_dir, input_file) out_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Processing {input_file} to {out_dir}.") logger.info(f"Performing preliminary read of {str(input_file.resolve())} to determine row count.") - df = scan_with_row_idx(input_file, columns=columns, infer_schema_length=cfg["infer_schema_length"]) + df = scan_with_row_idx( + input_file, columns=columns, infer_schema_length=cfg.stage_cfg.infer_schema_length + ) row_count = df.select(pl.len()).collect().item() @@ -272,7 +279,9 @@ def main(cfg: DictConfig): rwlock_wrap( input_file, out_fp, - partial(scan_with_row_idx, columns=columns, infer_schema_length=cfg["infer_schema_length"]), + partial( + scan_with_row_idx, columns=columns, infer_schema_length=cfg.stage_cfg.infer_schema_length + ), write_lazyframe, compute_fn, do_overwrite=cfg.do_overwrite, diff --git a/scripts/extraction/split_and_shard_patients.py b/scripts/extraction/split_and_shard_patients.py index fa5c1c2..f618da5 100755 --- a/scripts/extraction/split_and_shard_patients.py +++ b/scripts/extraction/split_and_shard_patients.py @@ -18,10 +18,16 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + logger.info("Starting patient splitting and sharding") - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - subsharded_dir = MEDS_cohort_dir / "sub_sharded" + MEDS_cohort_dir = Path(cfg.stage_cfg.output_dir) + subsharded_dir = Path(cfg.stage_cfg.data_input_dir) event_conversion_cfg_fp = Path(cfg.event_conversion_config_fp) if not event_conversion_cfg_fp.exists(): @@ -61,8 +67,8 @@ def main(cfg: DictConfig): logger.info(f"Found {len(patient_ids)} unique patient IDs of type {patient_ids.dtype}") - if cfg.external_splits_json_fp: - external_splits_json_fp = Path(cfg.external_splits_json_fp) + if cfg.stage_cfg.external_splits_json_fp: + external_splits_json_fp = Path(cfg.stage_cfg.external_splits_json_fp) if not external_splits_json_fp.exists(): raise FileNotFoundError(f"External splits JSON file not found at {external_splits_json_fp}") @@ -79,8 
+85,8 @@ def main(cfg: DictConfig): sharded_patients = shard_patients( patients=patient_ids, external_splits=external_splits, - split_fracs_dict=cfg.split_fracs, - n_patients_per_shard=cfg.n_patients_per_shard, + split_fracs_dict=cfg.stage_cfg.split_fracs, + n_patients_per_shard=cfg.stage_cfg.n_patients_per_shard, seed=cfg.seed, ) diff --git a/scripts/preprocessing/add_time_derived_measurements.py b/scripts/preprocessing/add_time_derived_measurements.py index e5cae0d..1e01067 100644 --- a/scripts/preprocessing/add_time_derived_measurements.py +++ b/scripts/preprocessing/add_time_derived_measurements.py @@ -24,12 +24,17 @@ def main(cfg: DictConfig): hydra_loguru_init() - MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) - output_dir = Path(cfg.output_data_dir) + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) - shards = json.loads((MEDS_cohort_dir / "splits.json").read_text()) + output_dir = Path(cfg.stage_dfg.output_dir) - final_cohort_dir = MEDS_cohort_dir / "final_cohort" + shards = json.loads((Path(cfg.stage_cfg.metadata_input_dir) / "splits.json").read_text()) + + final_cohort_dir = cfg.stage_cfg.data_input_dir / "final_cohort" filtered_patients_dir = output_dir / "patients_above_length_threshold" with_time_derived_dir = output_dir / "with_time_derived_measurements" diff --git a/scripts/preprocessing/collect_code_metadata.py b/scripts/preprocessing/collect_code_metadata.py index 36f4b77..fa25bcb 100644 --- a/scripts/preprocessing/collect_code_metadata.py +++ b/scripts/preprocessing/collect_code_metadata.py @@ -9,7 +9,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.code_metadata import mapper_fntr, reducer_fntr from MEDS_polars_functions.mapper import wrap as rwlock_wrap @@ -22,6 +22,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) output_dir = Path(cfg.output_data_dir) diff --git a/scripts/preprocessing/filter_patients.py b/scripts/preprocessing/filter_patients.py index f926401..a2b6308 100644 --- a/scripts/preprocessing/filter_patients.py +++ b/scripts/preprocessing/filter_patients.py @@ -8,7 +8,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.filter_patients_by_length import ( filter_patients_by_num_events, @@ -24,6 +24,12 @@ def main(cfg: DictConfig): hydra_loguru_init() + logger.info( + f"Running with config:\n{OmegaConf.to_yaml(cfg)}\n" + f"Stage: {cfg.stage}\n\n" + f"Stage config:\n{OmegaConf.to_yaml(cfg.stage_cfg)}" + ) + MEDS_cohort_dir = Path(cfg.MEDS_cohort_dir) output_dir = Path(cfg.output_data_dir) diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index e61f21d..996673a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -6,7 +6,7 @@ import hydra import polars as pl -from loguru import logger as log +from loguru import logger from omegaconf import OmegaConf @@ -23,8 +23,9 @@ def populate_stage( stage_name: str, input_dir: str, cohort_dir: str, - stages: list[dict], - pre_parsed_stages: list[dict] | None = None, + stages: list[str], + stage_configs: dict[str, 
dict], + pre_parsed_stages: dict[str, dict] | None = None, ) -> dict: """Populates a stage in the stages configuration with inferred stage parameters. @@ -42,9 +43,11 @@ def populate_stage( stage_name: The name of the stage to populate. input_dir: The global input directory. cohort_dir: The cohort directory into which this overall pipeline is writing data. - stages: The stages configuration dictionaries (unresolved). - pre_parsed_stages: The stages configuration dictionaries (resolved). If specified, the function will - not re-resolve the stages in this list. + stages: The names of the stages processed by this pipeline in order. + stage_configs: The raw, unresolved stage configuration dictionaries for any stages with specific + arguments, keyed by stage name. + pre_parsed_stages: The stages configuration dictionaries (resolved), keyed by stage name. If + specified, the function will not re-resolve the stages in this list. Returns: dict: The populated stage configuration. @@ -53,70 +56,78 @@ def populate_stage( ValueError: If the stage is not present in the stages configuration. Examples: + >>> from omegaconf import DictConfig >>> root_config = DictConfig({ ... "input_dir": "/a/b", ... "cohort_dir": "/c/d", - ... "stages": [ - ... {"name": "stage1"}, - ... {"name": "stage2", "is_metadata": True}, - ... {"name": "stage3", "is_metadata": None}, - ... {"name": "stage4", "data_input_dir": "/e/f", "output_dir": "/g/h"}, - ... {"name": "stage5", "aggregations": ["foo"]}, - ... {"name": "stage6"}, - ... ], + ... "stages": ["stage1", "stage2", "stage3", "stage4", "stage5", "stage6"], + ... "stage_configs": { + ... "stage2": {"is_metadata": True}, + ... "stage3": {"is_metadata": None}, + ... "stage4": {"data_input_dir": "/e/f", "output_dir": "/g/h"}, + ... "stage5": {"aggregations": ["foo"]}, + ... }, ... 
}) - >>> args = (root_config["input_dir"], root_config["cohort_dir"], root_config["stages"]) + >>> args = [root_config[k] for k in ["input_dir", "cohort_dir", "stages", "stage_configs"]] >>> populate_stage("stage1", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage1', 'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', + {'is_metadata': False, 'data_input_dir': '/a/b', 'metadata_input_dir': '/a/b', 'output_dir': '/c/d/stage1'} >>> populate_stage("stage2", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage2', 'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', + {'is_metadata': True, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/a/b', 'output_dir': '/c/d/stage2'} >>> populate_stage("stage3", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage3', 'is_metadata': False, 'data_input_dir': '/c/d/stage1', + {'is_metadata': False, 'data_input_dir': '/c/d/stage1', 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage3'} >>> populate_stage("stage4", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage4', 'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, + {'data_input_dir': '/e/f', 'output_dir': '/g/h', 'is_metadata': False, 'metadata_input_dir': '/c/d/stage2'} >>> populate_stage("stage5", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage5', 'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', + {'aggregations': ['foo'], 'is_metadata': True, 'data_input_dir': '/g/h', 'metadata_input_dir': '/c/d/stage2', 'output_dir': '/c/d/stage5'} >>> populate_stage("stage6", *args) # doctest: +NORMALIZE_WHITESPACE - {'name': 'stage6', 'is_metadata': False, 'data_input_dir': '/g/h', + {'is_metadata': False, 'data_input_dir': '/g/h', 'metadata_input_dir': '/c/d/stage5', 'output_dir': '/c/d/stage6'} >>> populate_stage("stage7", *args) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... - ValueError: 'stage7' is not a valid stage name. Options are: - ['stage1', 'stage2', 'stage3', 'stage4', 'stage5', 'stage6'] + ValueError: 'stage7' is not a valid stage name. Options are: stage1, stage2, stage3, stage4, stage5, + stage6 """ - if stage_name not in {s["name"] for s in stages}: - raise ValueError( - f"'{stage_name}' is not a valid stage name. Options are: {list(s['name'] for s in stages)}" - ) + for s in stage_configs.keys(): + if s not in stages: + raise ValueError( + f"stage config key '{s}' is not a valid stage name. Options are: {list(stages.keys())}" + ) - pre_pop_stages_by_name = {s["name"]: s for s in pre_parsed_stages} if pre_parsed_stages else {} - pre_parsed_stages = pre_parsed_stages or [] + if stage_name not in stages: + raise ValueError(f"'{stage_name}' is not a valid stage name. 
Options are: {', '.join(stages)}") + + if pre_parsed_stages is None: + pre_parsed_stages = {} - prior_stages = [] stage = None prior_data_stage = None prior_metadata_stage = None for s in stages: - if s["name"] == stage_name: - stage = s + if s == stage_name: + stage = stage_configs.get(s, {}) break - elif s["name"] in pre_pop_stages_by_name: - s_resolved = pre_pop_stages_by_name[s["name"]] + elif s in pre_parsed_stages: + s_resolved = pre_parsed_stages[s] else: - s_resolved = populate_stage(s["name"], input_dir, cohort_dir, stages, prior_stages) + s_resolved = populate_stage(s, input_dir, cohort_dir, stages, stage_configs, pre_parsed_stages) + pre_parsed_stages[s] = s_resolved if s_resolved["is_metadata"]: prior_metadata_stage = s_resolved else: prior_data_stage = s_resolved - prior_stages.append(s_resolved) + + logger.debug( + f"Parsing stage {stage_name}:\nResolved prior data stage: {prior_data_stage}\n" + f"Resolved prior metadata stage: {prior_metadata_stage}" + ) inferred_keys = { "is_metadata": "aggregations" in stage, @@ -145,7 +156,7 @@ def hydra_loguru_init() -> None: Must be called from a hydra main! """ hydra_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir - log.add(os.path.join(hydra_path, "main.log")) + logger.add(os.path.join(hydra_path, "main.log")) def write_lazyframe(df: pl.LazyFrame, out_fp: Path) -> None: diff --git a/tests/test_extraction.py b/tests/test_extraction.py index a256864..9343d17 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -245,14 +245,14 @@ def test_extraction(): # 4. Merge to the final output. extraction_config_kwargs = { - "raw_cohort_dir": str(raw_cohort_dir.resolve()), - "MEDS_cohort_dir": str(MEDS_cohort_dir.resolve()), + "input_dir": str(raw_cohort_dir.resolve()), + "cohort_dir": str(MEDS_cohort_dir.resolve()), "event_conversion_config_fp": str(event_cfgs_yaml.resolve()), - "split_fracs.train": 4 / 6, - "split_fracs.tuning": 1 / 6, - "split_fracs.held_out": 1 / 6, - "row_chunksize": 10, - "n_patients_per_shard": 2, + "stage_configs.split_and_shard_patients.split_fracs.train": 4 / 6, + "stage_configs.split_and_shard_patients.split_fracs.tuning": 1 / 6, + "stage_configs.split_and_shard_patients.split_fracs.held_out": 1 / 6, + "stage_configs.shard_events.row_chunksize": 10, + "stage_configs.split_and_shard_patients.n_patients_per_shard": 2, "hydra.verbose": True, } @@ -269,7 +269,7 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - subsharded_dir = MEDS_cohort_dir / "sub_sharded" + subsharded_dir = MEDS_cohort_dir / "shard_events" try: out_files = list(subsharded_dir.glob("**/*.parquet")) @@ -319,24 +319,30 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - splits_fp = MEDS_cohort_dir / "splits.json" - assert splits_fp.is_file(), f"Expected splits @ {str(splits_fp.resolve())} to exist." + try: + splits_fp = MEDS_cohort_dir / "splits.json" + assert splits_fp.is_file(), f"Expected splits @ {str(splits_fp.resolve())} to exist." 
- splits = json.loads(splits_fp.read_text()) - expected_keys = ["train/0", "train/1", "tuning/0", "held_out/0"] + splits = json.loads(splits_fp.read_text()) + expected_keys = ["train/0", "train/1", "tuning/0", "held_out/0"] - expected_keys_str = ", ".join(f"'{k}'" for k in expected_keys) - got_keys_str = ", ".join(f"'{k}'" for k in splits.keys()) + expected_keys_str = ", ".join(f"'{k}'" for k in expected_keys) + got_keys_str = ", ".join(f"'{k}'" for k in splits.keys()) - assert set(splits.keys()) == set(expected_keys), ( - f"Expected splits to have keys {expected_keys_str}.\n" f"Got keys: {got_keys_str}" - ) + assert set(splits.keys()) == set(expected_keys), ( + f"Expected splits to have keys {expected_keys_str}.\n" f"Got keys: {got_keys_str}" + ) - assert splits == EXPECTED_SPLITS, ( - f"Expected splits to be {EXPECTED_SPLITS}, got {splits}. NOTE THIS MAY CHANGE IF THE SEED OR " - "DATA CHANGES -- FAILURE HERE MAY BE JUST DUE TO A NON-DETERMINISTIC SPLIT AND THE TEST NEEDING " - "TO BE UPDATED." - ) + assert splits == EXPECTED_SPLITS, ( + f"Expected splits to be {EXPECTED_SPLITS}, got {splits}. NOTE THIS MAY CHANGE IF THE SEED OR " + "DATA CHANGES -- FAILURE HERE MAY BE JUST DUE TO A NON-DETERMINISTIC SPLIT AND THE TEST " + "NEEDING TO BE UPDATED." + ) + except AssertionError as e: + print("Failed to split patients") + print(f"stderr:\n{stderr}") + print(f"stdout:\n{stdout}") + raise e # Step 3: Extract the events and sub-shard by patient stderr, stdout = run_command( @@ -347,7 +353,7 @@ def test_extraction(): all_stderrs.append(stderr) all_stdouts.append(stdout) - patient_subsharded_folder = MEDS_cohort_dir / "patient_sub_sharded_events" + patient_subsharded_folder = MEDS_cohort_dir / "convert_to_sharded_events" assert patient_subsharded_folder.is_dir(), f"Expected {patient_subsharded_folder} to be a directory." for split, expected_outputs in SUB_SHARDED_OUTPUTS.items(): From 841f6617ae0fef2859cf935c011530f11d3155b4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 28 May 2024 10:56:14 -0400 Subject: [PATCH 06/47] Updated MIMIC examples --- MIMIC-IV_Example/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 535aa56..54ec4ce 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -84,8 +84,8 @@ This is a step in 4 parts: ```bash ./scripts/extraction/shard_events.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -95,8 +95,8 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes appro ```bash ./scripts/extraction/split_and_shard_patients.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -106,8 +106,8 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less ```bash ./scripts/extraction/convert_to_sharded_events.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` @@ -121,8 +121,8 @@ and performance is not necessary; however, for larger datasets, it can be. 
```bash ./scripts/extraction/merge_to_MEDS_cohort.py \ - raw_cohort_dir=$MIMICIV_PREMEDS_DIR \ - MEDS_cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml ``` From bbd673d558da55ab45fb674cda66f5b7ec55c3f9 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 17:04:12 -0400 Subject: [PATCH 07/47] Added some content to README that still needs to be re-worked a bit --- README.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9669c48..d57b82b 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,92 @@ This package provides three things: ## Installation -For now, clone this repository and run `pip install -e .` from the repository root. To use the MIMIC-IV -example, install the optional MIMIC dependencies as well with `pip install -e .[mimic]`. +- For a base installation, clone this repository and run `pip install .` from the repository root. +- For running the MIMIC-IV example, install the optional MIMIC dependencies as well with `pip install .[mimic]`. +- To support same-machine, process-based parallelism, install the optional joblib dependencies with `pip install .[local_parallelism]`. +- To support cluster-based parallelism, install the optional submitit dependencies with `pip install .[slurm_parallelism]`. +- For working on development, install the optional development dependencies with `pip install .[dev,tests]`. +- Optional dependencies can be mutually installed by combining the optional dependency names with commas in + the square brackets, e.g., `pip install .[mimic,local_parallelism]`. + +## Usage -- High Level + +The MEDS ETL and pre-processing pipelines are designed to be run in a modular, stage-based manner, with each +stage of the pipeline being run as a separate script. For a single pipeline, all scripts will take the same +arguments by leveraging the same Hydra configuration file, and to run multiple workers on a single stage in +parallel, the user can launch the same script multiple times _without changing the arguments or configuration +file_, and the scripts will automatically handle the parallelism and avoid duplicative work. This permits +tremendous flexibility in how these pipelines can be run. + +- The user can run the entire pipeline in serial, through a single shell script simply by calling each + stage's script in sequence. +- The user can leverage arbitrary scheduling systems (e.g., Slurm, LSF, Kubernetes, etc.) to run each stage + in parallel on a cluster, by constructing the appropriate worker scripts to run each stage's script and + simply launching as many worker jobs as is desired (note this will typically required a distributed file + system to work correctly, as these scripts use manually created file locks to avoid duplicative work). +- The user can run each stage in parallel on a single machine by launching multiple copies of the same + script in different terminal sessions. This can result in a significant speedup depending on the machine + configuration as it ensures that parallelism can be used with minimal file read contention. 
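+
+As a concrete, minimal illustration of the "multiple copies of the same script" pattern in the list
+above (a sketch; the stage script and its arguments mirror the MIMIC-IV example in this repository):
+
+```bash
+# Launch three workers for one stage; the file locks described above ensure each shard
+# is claimed and processed exactly once. Each line could equally run in its own terminal.
+for i in 1 2 3; do
+    ./scripts/extraction/shard_events.py \
+        input_dir=$MIMICIV_PREMEDS_DIR \
+        cohort_dir=$MIMICIV_MEDS_DIR \
+        event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml &
+done
+wait
+```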
+ +Two of these methods of parallelism, in particular local-machine parallelism and slurm-based cluster +parallelism, are supported explicitly by this package through the use of the `joblib` and `submitit` Hydra +plugins and Hydra's multirun capabilities, which will be discussed in more detail below. + +By following this design convention, each individual stage of the pipeline can be kept extremely simple (often +each stage corresponds simply to a single short "dataframe" function), can be rigorously tested, can be cached +after completion to permit easy re-suming or re-running of the pipeline, and permits extremely flexible and +efficient (through parallelization) use of the pipeline in a variety of environments, all without imposing +significant complexity, overhead, or computational dependencies on the user. + +Below we walk through usage of this mechanism for both the ETL and the model-specific pre-processing +pipelines in more detail. + +### Scripts for the ETL Pipeline + +The ETL pipeline (which is more complete, and likely to be viable for a wider range of input datasets out of +the box) relies on the following configuration files and scripts: + +Configuration: `configs/extraction.yaml` -## MEDS ETL / Extraction Pipeline +```yaml +# The event conversion configuration file is used throughout the pipeline to define the events to extract. +event_conversion_config_fp: ??? + +stages: + - shard_events + - split_and_shard_patients + - convert_to_sharded_events + - merge_to_MEDS_cohort + +stage_configs: + shard_events: + row_chunksize: 200000000 + infer_schema_length: 10000 + split_and_shard_patients: + is_metadata: true + output_dir: ${cohort_dir} + n_patients_per_shard: 50000 + external_splits_json_fp: + split_fracs: + train: 0.8 + tuning: 0.1 + held_out: 0.1 + merge_to_MEDS_cohort: + output_dir: ${cohort_dir}/final_cohort +``` + +Scripts: + +1. `shard_events.py`: Shards the input data into smaller, event-level shards. +2. `split_and_shard_patients.py`: Splits the patient population into ML splits and shards these splits into + patient-level shards. +3. `convert_to_sharded_events.py`: Converts the input, event-level shards into the MEDS event format and + sub-shards them into patient-level sub-shards. +4. `merge_to_MEDS_cohort.py`: Merges the patient-level, event-level shards into full patient-level shards. + +See the `MIMIC-IV_Example` directory for a full, worked example of the ETL on MIMIC-IV v2.2. + +## MEDS ETL / Extraction Pipeline Details ### Overview From 67f8b6c2f8f87ece5929c10155ed47639c7076d8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 18:21:01 -0400 Subject: [PATCH 08/47] Added joint script demonstrating joblib launcher --- MIMIC-IV_Example/joint_script.sh | 44 ++++++++++++++++++++++++++++++ MIMIC-IV_Example/pre_MEDS.py | 9 ++++++ scripts/extraction/shard_events.py | 5 ++++ 3 files changed, 58 insertions(+) create mode 100755 MIMIC-IV_Example/joint_script.sh diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh new file mode 100755 index 0000000..ebd397b --- /dev/null +++ b/MIMIC-IV_Example/joint_script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +echo "Running pre-MEDS conversion." 
+./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(1,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/pre_MEDS.py b/MIMIC-IV_Example/pre_MEDS.py index 1f2f223..789b882 100755 --- a/MIMIC-IV_Example/pre_MEDS.py +++ b/MIMIC-IV_Example/pre_MEDS.py @@ -94,6 +94,11 @@ def main(cfg: DictConfig): pfx = get_shard_prefix(raw_cohort_dir, in_fp) out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir) + + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + out_fp.parent.mkdir(parents=True, exist_ok=True) if pfx not in FUNCTIONS: @@ -106,6 +111,10 @@ def main(cfg: DictConfig): continue else: out_fp = MEDS_input_dir / f"{pfx}.parquet" + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + fn, need_df = FUNCTIONS[pfx] if not need_df: st = datetime.now() diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index 5ccc36f..d0533e3 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -222,6 +222,11 @@ def main(cfg: DictConfig): input_files_to_subshard.append(f) seen_files.add(get_shard_prefix(raw_cohort_dir, f)) + if not input_files_to_subshard: + raise FileNotFoundError( + f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!" + ) + random.shuffle(input_files_to_subshard) subsharding_files_strs = "\n".join([f" * {str(fp.resolve())}" for fp in input_files_to_subshard]) From 7b585819ed27bc56433d8bc967e6ee70477a6fdf Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 30 May 2024 21:00:22 -0400 Subject: [PATCH 09/47] Minor changes mostly to joint script --- MIMIC-IV_Example/joint_script.sh | 6 +++--- src/MEDS_polars_functions/utils.py | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index ebd397b..9d7ae69 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -13,7 +13,7 @@ echo "Running pre-MEDS conversion." 
echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ @@ -28,7 +28,7 @@ echo "Splitting patients in serial" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/convert_to_sharded_events.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ @@ -37,7 +37,7 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ --multirun \ - worker="range(1,$N_PARALLEL_WORKERS)" \ + worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index 996673a..b2fbbb7 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -9,6 +9,8 @@ from loguru import logger from omegaconf import OmegaConf +pl.enable_string_cache() + def current_script_name() -> str: """Returns the name of the script that called this function. @@ -124,11 +126,6 @@ def populate_stage( else: prior_data_stage = s_resolved - logger.debug( - f"Parsing stage {stage_name}:\nResolved prior data stage: {prior_data_stage}\n" - f"Resolved prior metadata stage: {prior_metadata_stage}" - ) - inferred_keys = { "is_metadata": "aggregations" in stage, "data_input_dir": input_dir if prior_data_stage is None else prior_data_stage["output_dir"], From 8aa1db7824623cef16cb209fdd8112383914f505 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 08:49:33 -0400 Subject: [PATCH 10/47] Made the locking process more robust --- src/MEDS_polars_functions/mapper.py | 109 +++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 19 deletions(-) diff --git a/src/MEDS_polars_functions/mapper.py b/src/MEDS_polars_functions/mapper.py index deefd0d..34275b8 100644 --- a/src/MEDS_polars_functions/mapper.py +++ b/src/MEDS_polars_functions/mapper.py @@ -8,6 +8,79 @@ from loguru import logger +LOCK_TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" + + +def get_earliest_lock(cache_directory: Path) -> datetime | None: + """Returns the earliest start time of any lock file present in a cache directory, or None if none exist. + + Args: + cache_directory: The cache directory to check for the presence of a lock file. 
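+
+    Returns:
+        The earliest recorded lock start time across all lock files in the cache directory's "locks"
+        subdirectory, or None if no lock files exist.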
+ + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> empty_directory = root / "cache_empty" + >>> empty_directory.mkdir(exist_ok=True, parents=True) + >>> cache_directory = root / "cache_with_locks" + >>> locks_directory = cache_directory / "locks" + >>> locks_directory.mkdir(exist_ok=True, parents=True) + >>> time_1 = datetime(2021, 1, 1) + >>> time_1_str = time_1.strftime(LOCK_TIME_FMT) # "2021-01-01T00:00:00.000000" + >>> lock_fp_1 = locks_directory / f"{time_1_str}.json" + >>> _ = lock_fp_1.write_text(json.dumps({"start": time_1_str})) + >>> time_2 = datetime(2021, 1, 2, 3, 4, 5) + >>> time_2_str = time_2.strftime(LOCK_TIME_FMT) # "2021-01-02T03:04:05.000000" + >>> lock_fp_2 = locks_directory / f"{time_2_str}.json" + >>> _ = lock_fp_2.write_text(json.dumps({"start": time_2_str})) + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 1, 0, 0) + >>> get_earliest_lock(empty_directory) is None + True + >>> lock_fp_1.unlink() + >>> get_earliest_lock(cache_directory) + datetime.datetime(2021, 1, 2, 3, 4, 5) + >>> directory.cleanup() + """ + locks_directory = cache_directory / "locks" + + lock_times = [ + datetime.strptime(json.loads(lock_fp.read_text())["start"], LOCK_TIME_FMT) + for lock_fp in locks_directory.glob("*.json") + ] + + return min(lock_times) if lock_times else None + + +def register_lock(cache_directory: Path) -> tuple[datetime, Path]: + """Register a lock file in a cache directory. + + Args: + cache_directory: The cache directory to register a lock file in. + + Examples: + >>> import tempfile + >>> directory = tempfile.TemporaryDirectory() + >>> root = Path(directory.name) + >>> cache_directory = root / "cache_with_locks" + >>> lock_time, lock_fp = register_lock(cache_directory) + >>> assert (datetime.now() - lock_time).total_seconds() < 1, "Lock time should be ~ now." + >>> lock_fp.is_file() + True + >>> lock_fp.read_text() == f'{{"start": "{lock_time.strftime(LOCK_TIME_FMT)}"}}' + True + >>> directory.cleanup() + """ + + lock_directory = cache_directory / "locks" + lock_directory.mkdir(exist_ok=True, parents=True) + + lock_time = datetime.now() + lock_fp = lock_directory / f"{lock_time.strftime(LOCK_TIME_FMT)}.json" + lock_fp.write_text(json.dumps({"start": lock_time.strftime(LOCK_TIME_FMT)})) + return lock_time, lock_fp + def wrap[ DF_T @@ -108,15 +181,15 @@ def wrap[ │ 3 ┆ 5 ┆ 12 │ └─────┴─────┴─────┘ >>> shutil.rmtree(cache_directory) - >>> lock_fp = cache_directory / "lock.json" - >>> assert not lock_fp.is_file() - >>> def lock_fp_checker_fn(df: pl.DataFrame) -> pl.DataFrame: - ... print(f"Lock fp exists? {lock_fp.is_file()}") + >>> lock_dir = cache_directory / "locks" + >>> assert not lock_dir.exists() + >>> def lock_dir_checker_fn(df: pl.DataFrame) -> pl.DataFrame: + ... print(f"Lock dir exists? {lock_dir.exists()}") ... return df >>> result_computed, out_df = wrap( - ... in_fp, out_fp, read_fn, write_fn, lock_fp_checker_fn, do_return=True + ... in_fp, out_fp, read_fn, write_fn, lock_dir_checker_fn, do_return=True ... ) - Lock fp exists? True + Lock dir exists? True >>> assert result_computed >>> out_df shape: (3, 3) @@ -146,21 +219,19 @@ def wrap[ cache_directory = out_fp.parent / f".{out_fp.stem}_cache" cache_directory.mkdir(exist_ok=True, parents=True) - st_time = datetime.now() - runtime_info = {"start": str(st_time)} + earliest_lock_time = get_earliest_lock(cache_directory) + if earliest_lock_time is not None: + logger.info(f"{out_fp} is in progress as of {earliest_lock_time}. 
Returning.")
+        return (False, None) if do_return else False
 
-    lock_fp = cache_directory / "lock.json"
-    if lock_fp.is_file():
-        started_at = json.loads(lock_fp.read_text())["start"]
-        logger.info(
-            f"{out_fp} is under construction as of {started_at} as {lock_fp} exists. " "Returning None."
-        )
-        if do_return:
-            return False, None
-        else:
-            return False
+    st_time, lock_fp = register_lock(cache_directory)
 
-    lock_fp.write_text(json.dumps(runtime_info))
+    logger.info(f"Registered lock at {st_time}. Double checking no earlier locks have been registered.")
+    earliest_lock_time = get_earliest_lock(cache_directory)
+    if earliest_lock_time < st_time:
+        logger.info(f"Earlier lock found at {earliest_lock_time}. Deleting current lock and returning.")
+        lock_fp.unlink()
+        return (False, None) if do_return else False
 
     logger.info(f"Reading input dataframe from {in_fp}")
     df = read_fn(in_fp)

From 42bc74e5b4ed9accea802c3d2ea68103c4ccac46 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Fri, 31 May 2024 09:53:22 -0400
Subject: [PATCH 11/47] Added a slurm script -- yet untested

---
 MIMIC-IV_Example/README.md             | 68 ++++++++++++++++++++++-
 MIMIC-IV_Example/joint_script_slurm.sh | 77 ++++++++++++++++++++++++++
 configs/pipeline.yaml                  |  4 +-
 3 files changed, 146 insertions(+), 3 deletions(-)
 create mode 100755 MIMIC-IV_Example/joint_script_slurm.sh

diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md
index 54ec4ce..f72f9b2 100644
--- a/MIMIC-IV_Example/README.md
+++ b/MIMIC-IV_Example/README.md
@@ -72,6 +72,7 @@ root directory of this repository):
 In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total.
 
 ## Step 3: Run the MEDS extraction ETL
+### Running locally, serially
 
 We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`.
 Note this is a different directory than the pre-MEDS directory (though, of course, they can both be
@@ -80,7 +81,7 @@ subdirectories of the same root directory). This is a step in 4 parts:
 
 1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers
-   performing this sub-sharding step.
+   performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers.
 
 ```bash
 ./scripts/extraction/shard_events.py \
@@ -126,6 +127,71 @@ and performance is not necessary; however, for larger datasets, it can be.
     event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml
 ```
 
+### Running Locally, in Parallel.
+This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib`
+launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e
+.[local_parallelism]`) and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args.
+
+### Running Each Step over Slurm
+To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the
+`submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for
+modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs
+to finish before moving on to the next stage. Let `$N_PARALLEL_WORKERS` be the number of desired workers.
+
+1. Sub-shard the raw files. 
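+   The `worker="range(0,$N_PARALLEL_WORKERS)"` sweep in the command below launches one slurm job per
+   worker.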
+ +```bash +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.name="${hydra.job.name}_${worker}" \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For MIMIC-IV, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. + +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml +``` + ## Limitations / TO-DOs: Currently, some tables are ignored, including: diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh new file mode 100755 index 0000000..d6db681 --- /dev/null +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +# Note we use `--multirun` throughout here due to ensure the submitit launcher is used throughout, so that +# this doesn't fall back on running anything locally in a setting where only slurm worker nodes have +# sufficient computational resources to run the actual jobs. + +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir=$MIMICIV_RAW_DIR \ + output_dir=$MIMICIV_PREMEDS_DIR + +echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
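+
+# The hydra.launcher.* overrides below control the slurm job resources for each worker (time limit in
+# minutes, CPUs per task, memory in GB, and partition); adjust these to match your cluster.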
+ +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 851afd7..29a2dfb 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,7 +2,7 @@ input_dir: ??? cohort_dir: ??? -log_dir: "${cohort_dir}/.logs/${stage}/worker_${worker}/${now:%Y-%m-%d_%H-%M-%S}" +log_dir: "${cohort_dir}/.logs" # General pipeline variables do_overwrite: False @@ -21,7 +21,7 @@ stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, $ # Hydra hydra: job: - name: "${stage}" + name: "${stage}/${worker}/${now:%Y-%m-%d_%H-%M-%S}" run: dir: "${log_dir}/${hydra.job.name}" sweep: From f8441684823971c5e08424e0ceddd1ae9133caee Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 15:46:27 -0400 Subject: [PATCH 12/47] Updates to pipeline.yaml --- MIMIC-IV_Example/joint_script_slurm.sh | 99 +++++++++++++------------- configs/pipeline.yaml | 6 +- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index d6db681..8ce85fb 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -echo "Running pre-MEDS conversion on one worker." 
-./MIMIC-IV_Example/pre_MEDS.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - raw_cohort_dir=$MIMICIV_RAW_DIR \ - output_dir=$MIMICIV_PREMEDS_DIR +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir=$MIMICIV_RAW_DIR \ +# output_dir=$MIMICIV_PREMEDS_DIR echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -33,45 +33,46 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." hydra.launcher.cpus_per_task=10 \ hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ + "hydra.job.env_copy=[PATH]" \ input_dir=$MIMICIV_PREMEDS_DIR \ cohort_dir=$MIMICIV_MEDS_DIR \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -echo "Splitting patients on one worker" -./scripts/extraction/split_and_shard_patients.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +#echo "Splitting patients on one worker" +#./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" 
+# +#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 29a2dfb..5694e25 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -21,8 +21,8 @@ stage_cfg: ${oc.create:${populate_stage:${stage}, ${input_dir}, ${cohort_dir}, $ # Hydra hydra: job: - name: "${stage}/${worker}/${now:%Y-%m-%d_%H-%M-%S}" + name: "${stage}_${worker}_${now:%Y-%m-%d_%H-%M-%S}" run: - dir: "${log_dir}/${hydra.job.name}" + dir: "${log_dir}" sweep: - dir: "${log_dir}/${hydra.job.name}" + dir: "${log_dir}" From 6f910b9eb5504df196dea819101f69687a698991 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 15:49:18 -0400 Subject: [PATCH 13/47] cleaned files --- MIMIC-IV_Example/README.md | 6 +- MIMIC-IV_Example/joint_script.sh | 18 ++--- MIMIC-IV_Example/joint_script_slurm.sh | 102 ++++++++++++------------- scripts/extraction/shard_events.py | 4 +- 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index f72f9b2..16e4431 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -72,6 +72,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`. @@ -128,11 +129,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index 9d7ae69..eb58e89 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -8,21 +8,21 @@ N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir="$MIMICIV_RAW_DIR" output_dir="$MIMICIV_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" @@ -30,8 +30,8 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" @@ -39,6 +39,6 @@ echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 8ce85fb..3948e87 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$MIMICIV_RAW_DIR" \ + output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py index d0533e3..9ce0ac9 100755 --- a/scripts/extraction/shard_events.py +++ b/scripts/extraction/shard_events.py @@ -223,9 +223,7 @@ def main(cfg: DictConfig): 
seen_files.add(get_shard_prefix(raw_cohort_dir, f)) if not input_files_to_subshard: - raise FileNotFoundError( - f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!" - ) + raise FileNotFoundError(f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!") random.shuffle(input_files_to_subshard) From 4eadda50ed118b35ecc2804d176ed1c09d504404 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 18:49:12 -0400 Subject: [PATCH 14/47] Not remotely working; moving to local for dev --- MIMIC-IV_Example/README.md | 3 +- eICU_Example/README.md | 216 +++++++++++++++++++++++ eICU_Example/configs/event_configs.yaml | 219 ++++++++++++++++++++++++ eICU_Example/configs/pre_MEDS.yaml | 11 ++ eICU_Example/joint_script.sh | 44 +++++ eICU_Example/joint_script_slurm.sh | 78 +++++++++ eICU_Example/pre_MEDS.py | 200 ++++++++++++++++++++++ eICU_Example/sbatch_joint_script.sh | 24 +++ pyproject.toml | 2 +- 9 files changed, 794 insertions(+), 3 deletions(-) create mode 100644 eICU_Example/README.md create mode 100644 eICU_Example/configs/event_configs.yaml create mode 100644 eICU_Example/configs/pre_MEDS.yaml create mode 100755 eICU_Example/joint_script.sh create mode 100755 eICU_Example/joint_script_slurm.sh create mode 100755 eICU_Example/pre_MEDS.py create mode 100644 eICU_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index f72f9b2..4056319 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -33,10 +33,9 @@ Download this repository and install the requirements: ```bash git clone git@github.com:mmcdermott/MEDS_polars_functions.git cd MEDS_polars_functions -git checkout MIMIC_IV conda create -n MEDS python=3.12 conda activate MEDS -pip install .[mimic] +pip install .[examples] ``` ## Step 1: Download MIMIC-IV diff --git a/eICU_Example/README.md b/eICU_Example/README.md new file mode 100644 index 0000000..b23ae9e --- /dev/null +++ b/eICU_Example/README.md @@ -0,0 +1,216 @@ +# eICU-CRD Example + +This is an example of how to extract a MEDS dataset from [eICU-CRD +v2.0](https://physionet.org/content/eicu-crd/2.0/). All scripts in this README are assumed to +be run **not** from this directory but from the root directory of this entire repository (e.g., one directory +up from this one). + +**Status**: This is a work in progress. The code is not yet functional. Remaining work includes: + +- [ ] Implementing the pre-MEDS processing step. + - [ ] Identifying the pre-MEDS steps for eICU +- [ ] Testing the pre-MEDS processing step on live eICU-CRD. + - [ ] Test that it runs at all. + - [ ] Test that the output is as expected. +- [ ] Check the installation instructions on a fresh client. +- [ ] Testing the `configs/event_configs.yaml` configuration on eICU-CRD +- [ ] Testing the MEDS extraction ETL runs on eICU-CRD (this should be expected to work, but needs + live testing). + - [ ] Sub-sharding + - [ ] Patient split gathering + - [ ] Event extraction + - [ ] Merging +- [ ] Validating the output MEDS cohort + - [ ] Basic validation + - [ ] Detailed validation + +## Step 0: Installation + +Download this repository and install the requirements: + +```bash +git clone git@github.com:mmcdermott/MEDS_polars_functions.git +cd MEDS_polars_functions +conda create -n MEDS python=3.12 +conda activate MEDS +pip install .[examples] +``` + +## Step 1: Download eICU + +Download the eICU-CRD dataset (version 2.0) from https://physionet.org/content/eicu-crd/2.0/ following the +instructions on that page. 
You will need the raw `.csv.gz` files for this example. We will use +`$EICU_RAW_DIR` to denote the root directory of where the resulting _core data files_ are stored -- e.g., +there should be a `hosp` and `icu` subdirectory of `$EICU_RAW_DIR`. + +## Step 2: Get the data ready for base MEDS extraction + +This is a step in a few parts: + +1. Join a few tables by `hadm_id` to get the right timestamps in the right rows for processing. In + particular, we need to join: + - TODO +2. Convert the patient's static data to a more parseable form. This entails: + - Get the patient's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and + `anchor_offset` fields. + - Merge the patient's `dod` with the `deathtime` from the `admissions` table. + +After these steps, modified files or symlinks to the original files will be written in a new directory which +will be used as the input to the actual MEDS extraction ETL. We'll use `$EICU_PREMEDS_DIR` to denote this +directory. + +To run this step, you can use the following script (assumed to be run **not** from this directory but from the +root directory of this repository): + +```bash +./eICU_Example/pre_MEDS.py raw_cohort_dir=$EICU_RAW_DIR output_dir=$EICU_PREMEDS_DIR +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +## Step 3: Run the MEDS extraction ETL +### Running locally, serially + +We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`. +Note this is a different directory than the pre-MEDS directory (though, of course, they can both be +subdirectories of the same root directory). + +This is a step in 4 parts: + +1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers + performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers. + +```bash +./scripts/extraction/shard_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For eICU, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. 
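+   Per the `merge_to_MEDS_cohort` stage configuration in `configs/extraction.yaml`, the merged output is
+   written to the `final_cohort` subdirectory of `$EICU_MEDS_DIR`.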
+ +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +### Running Locally, in Parallel. +This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e +.[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. + +### Running Each Step over Slurm +To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the +`submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for +modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs +to finish before moving on to the next stage. Let `$N_PARALLEL_WORKERS` be the number of desired workers + +1. Sub-shard the raw files. + +```bash +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.name="${hydra.job.name}_${worker}" \ + hydra.launcher.partition="short" \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes approximately 20 minutes in total. + +2. Extract and form the patient splits and sub-shards. + +```bash +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. + +3. Extract patient sub-shards and convert to MEDS events. + +```bash +./scripts/extraction/convert_to_sharded_events.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +In practice, serially, this also takes around 20 minutes or more. However, it can be trivially parallelized to +cut the time down by a factor of the number of workers processing the data by simply running the command +multiple times (though this will, of course, consume more resources). If your filesystem is distributed, these +commands can also be launched as separate slurm jobs, for example. For eICU, this level of parallelization +and performance is not necessary; however, for larger datasets, it can be. + +4. Merge the MEDS events into a single file per patient sub-shard. + +```bash +./scripts/extraction/merge_to_MEDS_cohort.py \ + input_dir=$EICU_PREMEDS_DIR \ + cohort_dir=$EICU_MEDS_DIR \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml +``` + +## Limitations / TO-DOs: + +Currently, some tables are ignored, including: + +1. `admissiondrug`: The [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) notes that this is + extremely infrequently used, so we skip it. +2. + + +Lots of questions remain about how to appropriately handle timestamps of the data -- e.g., things like HCPCS +events are stored at the level of the _date_, not the _datetime_. 
How should those be slotted into the +timeline which is otherwise stored at the _datetime_ resolution? + +Other questions: + +1. How to handle merging the deathtimes between the hosp table and the patients table? +2. How to handle the dob nonsense MIMIC has? + +## Future Work + +### Pre-MEDS Processing + +If you wanted, some other processing could also be done here, such as: + +1. Converting the patient's dynamically recorded race into a static, most commonly recorded race field. diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml new file mode 100644 index 0000000..af626d1 --- /dev/null +++ b/eICU_Example/configs/event_configs.yaml @@ -0,0 +1,219 @@ +# Note that there is no "patient_id" for eICU -- patients are only differentiable during the course of a +# single health system stay. Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" + +patient_id_col: patientHealthSystemStayID + +hosp/admissions: + ed_registration: + code: ED_REGISTRATION + timestamp: col(edregtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + ed_out: + code: ED_OUT + timestamp: col(edouttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + admission: + code: + - HOSPITAL_ADMISSION + - col(admission_type) + - col(admission_location) + timestamp: col(admittime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + insurance: insurance + language: language + marital_status: marital_status + race: race + hadm_id: hadm_id + discharge: + code: + - HOSPITAL_DISCHARGE + - col(discharge_location) + timestamp: col(dischtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + # We omit the death event here as it is joined to the data in the patients table in the pre-MEDS step. + #death: + # code: DEATH + # timestamp: col(deathtime) + # timestamp_format: "%Y-%m-%d %H:%M:%S" + # death_location: death_location + # death_type: death_type + +hosp/diagnoses_icd: + diagnosis: + code: + - DIAGNOSIS + - ICD + - col(icd_version) + - col(icd_code) + hadm_id: hadm_id + timestamp: col(hadm_discharge_time) + timestamp_format: "%Y-%m-%d %H:%M:%S" + +hosp/drgcodes: + drg: + code: + - DRG + - col(drg_type) + - col(drg_code) + - col(description) + hadm_id: hadm_id + timestamp: col(hadm_discharge_time) + timestamp_format: "%Y-%m-%d %H:%M:%S" + drg_severity: drg_severity + drg_mortality: drg_mortality + +hosp/emar: + medication: + code: + - MEDICATION + - col(medication) + - col(event_txt) + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + emar_id: emar_id + emar_seq: emar_seq + +hosp/hcpcsevents: + hcpcs: + code: + - HCPCS + - col(short_description) + hadm_id: hadm_id + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/labevents: + lab: + code: + - LAB + - col(itemid) + - col(valueuom) + hadm_id: hadm_id + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + numerical_value: valuenum + text_value: value + priority: priority + +hosp/omr: + omr: + code: col(result_name) + text_value: col(result_value) + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/patients: + gender: + code: + - GENDER + - col(gender) + timestamp: null + dob: + code: DOB + timestamp: col(year_of_birth) + timestamp_format: "%Y" + death: + code: DEATH + timestamp: col(dod) + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + +hosp/pharmacy: + medication_start: + code: + - MEDICATION + - START + - col(medication) + timestamp: col(starttime) + route: route + frequency: frequency + doses_per_24_hrs: doses_per_24_hrs + poe_id: poe_id + 
timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + medication_stop: + code: + - MEDICATION + - STOP + - col(medication) + timestamp: col(stoptime) + poe_id: poe_id + timestamp_format: + - "%Y-%m-%d %H:%M:%S" + - "%Y-%m-%d" + +hosp/procedures_icd: + procedure: + code: + - PROCEDURE + - ICD + - col(icd_version) + - col(icd_code) + hadm_id: hadm_id + timestamp: col(chartdate) + timestamp_format: "%Y-%m-%d" + +hosp/transfers: + transfer: + code: + - TRANSFER_TO + - col(eventtype) + - col(careunit) + timestamp: col(intime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + +icu/icustays: + icu_admission: + code: + - ICU_ADMISSION + - col(first_careunit) + timestamp: col(intime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + icu_discharge: + code: + - ICU_DISCHARGE + - col(last_careunit) + timestamp: col(outtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + +icu/chartevents: + event: + code: + - LAB + - col(itemid) + - col(valueuom) + timestamp: col(charttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + numerical_value: valuenum + text_value: value + hadm_id: hadm_id + icustay_id: stay_id + +icu/procedureevents: + start: + code: + - PROCEDURE + - START + - col(itemid) + timestamp: col(starttime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id + end: + code: + - PROCEDURE + - END + - col(itemid) + timestamp: col(endtime) + timestamp_format: "%Y-%m-%d %H:%M:%S" + hadm_id: hadm_id + icustay_id: stay_id diff --git a/eICU_Example/configs/pre_MEDS.yaml b/eICU_Example/configs/pre_MEDS.yaml new file mode 100644 index 0000000..b5cfa4c --- /dev/null +++ b/eICU_Example/configs/pre_MEDS.yaml @@ -0,0 +1,11 @@ +raw_cohort_dir: ??? +output_dir: ??? + +# Hydra +hydra: + job: + name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S} + run: + dir: ${output_dir}/.logs/${hydra.job.name} + sweep: + dir: ${output_dir}/.logs/${hydra.job.name} diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh new file mode 100755 index 0000000..9d7ae69 --- /dev/null +++ b/eICU_Example/joint_script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +echo "Running pre-MEDS conversion." 
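+# TODO: this script is currently a verbatim copy of the MIMIC-IV joint script; it still invokes the
+# MIMIC-IV pre-MEDS script, variable names, and event config, and has not yet been adapted to eICU.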
+./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh new file mode 100755 index 0000000..8ce85fb --- /dev/null +++ b/eICU_Example/joint_script_slurm.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +shift 4 + +# Note we use `--multirun` throughout here due to ensure the submitit launcher is used throughout, so that +# this doesn't fall back on running anything locally in a setting where only slurm worker nodes have +# sufficient computational resources to run the actual jobs. + +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir=$MIMICIV_RAW_DIR \ +# output_dir=$MIMICIV_PREMEDS_DIR + +echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
+ +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + "hydra.job.env_copy=[PATH]" \ + input_dir=$MIMICIV_PREMEDS_DIR \ + cohort_dir=$MIMICIV_MEDS_DIR \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + +#echo "Splitting patients on one worker" +#./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +#./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir=$MIMICIV_PREMEDS_DIR \ +# cohort_dir=$MIMICIV_MEDS_DIR \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py new file mode 100755 index 0000000..bf0204c --- /dev/null +++ b/eICU_Example/pre_MEDS.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python + +"""Performs pre-MEDS data wrangling for eICU.""" +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +import gzip +from datetime import datetime +from pathlib import Path + +import hydra +import polars as pl +from loguru import logger +from omegaconf import DictConfig + +from MEDS_polars_functions.utils import ( + get_shard_prefix, + hydra_loguru_init, + write_lazyframe, +) + +GLOBAL_TIME_ROOT = datetime(2024, 1, 1) + + +def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: + """Load a raw MIMIC file into a Polars DataFrame. + + Args: + fp: The path to the MIMIC file. + + Returns: + The Polars DataFrame containing the MIMIC data. + """ + + with gzip.open(fp, mode="rb") as f: + return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy() + +def process_patients_table(df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the patients table and converts it to a form that includes timestamps + + As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true + timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME + DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES. 
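+
+    Args:
+        df: The raw eICU `patient` table, as a Polars LazyFrame.
+
+    Returns:
+        The table with a constant `healthSystemAdmitTimestamp` column (set to `GLOBAL_TIME_ROOT`) added and
+        the relevant patient- and unit-stay-level columns selected.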
+    """
+
+    return (
+        df
+        .with_columns(
+            pl.lit(GLOBAL_TIME_ROOT, dtype=pl.Datetime).alias("healthSystemAdmitTimestamp"),
+        )
+        .select(
+            "patientHealthSystemStayID",
+            "gender",
+            "age",
+            "ethnicity",
+            # Unit stay parameters
+            "patientUnitStayID",  # The unit stay ID
+            pl.col("healthSystemAdmitTimestamp"),
+            "hospitalID",
+            "admissionHeight",
+            # "apacheAdmissionDx", This we grab from `admissiondx` later instead.
+        )
+    )
+
+
+##### MIMIC STUFF --- OLD #####
+
+def add_discharge_time_by_hadm_id(
+    df: pl.LazyFrame, discharge_time_df: pl.LazyFrame, out_column_name: str = "hadm_discharge_time"
+) -> pl.LazyFrame:
+    """Joins the two dataframes by ``"hadm_id"`` and adds the discharge time to the original dataframe."""
+
+    discharge_time_df = discharge_time_df.select("hadm_id", pl.col("dischtime").alias(out_column_name))
+    return df.join(discharge_time_df, on="hadm_id", how="left")
+
+
+def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) -> pl.LazyFrame:
+    """Fixes the static data by adding the death time to the static data and fixes the DOB nonsense.
+
+    Args:
+        raw_static_df: The raw static data.
+        death_times_df: The death times data.
+
+    Returns:
+        The fixed static data.
+    """
+
+    death_times_df = death_times_df.group_by("subject_id").agg(pl.col("deathtime").min())
+
+    return raw_static_df.join(death_times_df, on="subject_id", how="left").select(
+        "subject_id",
+        pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"),
+        (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"),
+        "gender",
+    )
+
+
+FUNCTIONS = {
+    "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])),
+    "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])),
+    "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])),
+}
+
+
+@hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS")
+def main(cfg: DictConfig):
+    """Performs pre-MEDS data wrangling for eICU.
+
+    Inputs are the raw eICU files, read from the `raw_cohort_dir` config parameter. Output files are either
+    symlinked (if they are not modified) or written in processed form to the `output_dir` config
+    parameter. Hydra is used to manage configuration parameters and logging.
+    """
+
+    hydra_loguru_init()
+
+    raw_cohort_dir = Path(cfg.raw_cohort_dir)
+    MEDS_input_dir = Path(cfg.output_dir)
+
+    all_fps = list(raw_cohort_dir.glob("**/*.csv.gz"))
+
+    dfs_to_load = {}
+
+    for in_fp in all_fps:
+        pfx = get_shard_prefix(raw_cohort_dir, in_fp)
+
+        out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir)
+
+        if out_fp.is_file():
+            print(f"Done with {pfx}. Continuing")
+            continue
+
+        out_fp.parent.mkdir(parents=True, exist_ok=True)
+
+        if pfx not in FUNCTIONS:
+            logger.info(
+                f"No function needed for {pfx}: "
+                f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}"
+            )
+            relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True)
+            out_fp.symlink_to(relative_in_fp)
+            continue
+        else:
+            out_fp = MEDS_input_dir / f"{pfx}.parquet"
+            if out_fp.is_file():
+                print(f"Done with {pfx}. 
Continuing") + continue + + fn, need_df = FUNCTIONS[pfx] + if not need_df: + st = datetime.now() + logger.info(f"Processing {pfx}...") + df = load_raw_eicu_file(in_fp) + logger.info(f" Loaded raw {in_fp} in {datetime.now() - st}") + processed_df = fn(df) + write_lazyframe(processed_df, out_fp) + logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") + else: + needed_pfx, needed_cols = need_df + if needed_pfx not in dfs_to_load: + dfs_to_load[needed_pfx] = {"fps": set(), "cols": set()} + + dfs_to_load[needed_pfx]["fps"].add(in_fp) + dfs_to_load[needed_pfx]["cols"].update(needed_cols) + + for df_to_load_pfx, fps_and_cols in dfs_to_load.items(): + fps = fps_and_cols["fps"] + cols = list(fps_and_cols["cols"]) + + df_to_load_fp = raw_cohort_dir / f"{df_to_load_pfx}.csv.gz" + + st = datetime.now() + + logger.info(f"Loading {str(df_to_load_fp.resolve())} for manipulating other dataframes...") + df = load_raw_eicu_file(df_to_load_fp, columns=cols) + logger.info(f" Loaded in {datetime.now() - st}") + + for fp in fps: + pfx = get_shard_prefix(raw_cohort_dir, fp) + out_fp = MEDS_input_dir / f"{pfx}.parquet" + + logger.info(f" Processing dependent df @ {pfx}...") + fn, _ = FUNCTIONS[pfx] + + fp_st = datetime.now() + logger.info(f" Loading {str(fp.resolve())}...") + fp_df = load_raw_eicu_file(fp) + logger.info(f" Loaded in {datetime.now() - fp_st}") + processed_df = fn(fp_df, df) + write_lazyframe(processed_df, out_fp) + logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - fp_st}") + + logger.info(f"Done! All dataframes processed and written to {str(MEDS_input_dir.resolve())}") + + +if __name__ == "__main__": + main() diff --git a/eICU_Example/sbatch_joint_script.sh b/eICU_Example/sbatch_joint_script.sh new file mode 100644 index 0000000..e031363 --- /dev/null +++ b/eICU_Example/sbatch_joint_script.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +#SBATCH -c 10 # Request one core +#SBATCH -t 0-03:00 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=300GB # Memory total in MiB (for all cores) +#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j) +#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j) + +cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +LOG_DIR="$MIMICIV_MEDS_DIR/.logs" + +echo "Running with saving to $LOG_DIR" + +mkdir -p $LOG_DIR + +PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \ + time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt diff --git a/pyproject.toml b/pyproject.toml index 29bba91..25b9527 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ dependencies = ["polars", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy"] [project.optional-dependencies] -mimic = ["rootutils"] +examples = ["rootutils"] dev = ["pre-commit"] tests = ["pytest", "pytest-cov[toml]", "rootutils"] local_parallelism = ["hydra-joblib-launcher"] From 7c2e7677c09289b4cf358346b9223d8934d7f98f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 20:04:22 -0400 Subject: [PATCH 15/47] Updated configs and added a resolver to get informative help messages from the right sources --- configs/extraction.yaml | 34 ++++++++++++++++++++++++++++++ configs/pipeline.yaml | 
15 +++++++++++++
 configs/preprocess.yaml            |  1 +
 src/MEDS_polars_functions/utils.py | 16 ++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/configs/extraction.yaml b/configs/extraction.yaml
index e1e985a..b762894 100644
--- a/configs/extraction.yaml
+++ b/configs/extraction.yaml
@@ -1,5 +1,18 @@
 defaults:
   - pipeline
+  - _self_
+
+description: |-
+  This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select
+  criteria and converts them to the flattened, MEDS format. It can be run in its entirety, with controllable
+  levels of parallelism, or in stages. Arguments:
+  - `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines
+    the events to extract from the various rows of the various input files encountered in the global input
+    directory.
+  - `input_dir`: The path to the directory containing the raw input files.
+  - `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in
+    various subfolders of this dir depending on the stage, as intermediate stages cache their output during
+    computation for efficiency of re-running and distributing.

 # The event conversion configuration file is used throughout the pipeline to define the events to extract.
 event_conversion_config_fp: ???
@@ -12,9 +25,22 @@ stages:

 stage_configs:
   shard_events:
+    description: |-
+      This stage shards the raw input events into smaller files for easier processing. Arguments:
+      - `row_chunksize`: The number of rows to read in at a time.
+      - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source
+        files are csvs).
     row_chunksize: 200000000
     infer_schema_length: 10000
   split_and_shard_patients:
+    description: |-
+      This stage splits the patients into training, tuning, and held-out sets, and further splits those sets
+      into shards. Arguments:
+      - `n_patients_per_shard`: The number of patients to include in a shard.
+      - `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially
+        held-out test sets beyond the IID held out set that will be produced (e.g., for prospective
+        datasets, etc.).
+      - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets.
     is_metadata: True
     output_dir: ${cohort_dir}
     n_patients_per_shard: 50000
@@ -24,4 +50,12 @@ stage_configs:
       tuning: 0.1
       held_out: 0.1
   merge_to_MEDS_cohort:
+    description: |-
+      This stage merges the intermediate, subsharded MEDS files produced by the conversion stage into the
+      final MEDS cohort shards. Arguments:
+      - `output_dir`: The directory in which the final, merged MEDS cohort will be written (here, the
+        `final_cohort` subdirectory of the global `cohort_dir`).
     output_dir: ${cohort_dir}/final_cohort
diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml
index 5694e25..857785f 100644
--- a/configs/pipeline.yaml
+++ b/configs/pipeline.yaml
@@ -2,6 +2,10 @@
 input_dir: ???
 cohort_dir: ???

+_default_description: |-
+  This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline
+  configuration file.
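+# NOTE: `_default_description` is consumed as a fallback by the `hydra.help` template added later in this
+# file, via `${oc.select:description, ${_default_description}}`.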
+ log_dir: "${cohort_dir}/.logs" # General pipeline variables @@ -26,3 +30,14 @@ hydra: dir: "${log_dir}" sweep: dir: "${log_dir}" + help: + app_name: "MEDS/${stage}" + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines. + + **Pipeline description:** + ${oc.select:description, ${_default_description}} + + **Stage description:** + ${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}} diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 9b60579..d65150b 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,5 +1,6 @@ defaults: - pipeline + - _self_ # Global pipeline parameters: # 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..11d738a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import inspect import os import sys from pathlib import Path @@ -12,6 +13,20 @@ pl.enable_string_cache() +def get_script_docstring() -> str: + """Returns the docstring of the main function of the script that was called. + + Returns: + str: TODO + """ + + main_module = sys.modules["__main__"] + func = getattr(main_module, "main", None) + if func and callable(func): + return inspect.getdoc(func) or "" + return "" + + def current_script_name() -> str: """Returns the name of the script that called this function. @@ -143,6 +158,7 @@ def populate_stage( return out +OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False) OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) From bc78cd448e95738413ad8888d563dbd3c926b8e5 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 09:03:55 -0400 Subject: [PATCH 16/47] Starting eICU scripts and configs --- eICU_Example/configs/event_configs.yaml | 266 +++++------------------- eICU_Example/pre_MEDS.py | 198 +++++++++++++----- 2 files changed, 202 insertions(+), 262 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index af626d1..9d467f6 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -3,217 +3,59 @@ patient_id_col: patientHealthSystemStayID -hosp/admissions: - ed_registration: - code: ED_REGISTRATION - timestamp: col(edregtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - ed_out: - code: ED_OUT - timestamp: col(edouttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - admission: - code: - - HOSPITAL_ADMISSION - - col(admission_type) - - col(admission_location) - timestamp: col(admittime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - insurance: insurance - language: language - marital_status: marital_status - race: race - hadm_id: hadm_id - discharge: - code: - - HOSPITAL_DISCHARGE - - col(discharge_location) - timestamp: col(dischtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - # We omit the death event here as it is joined to the data in the patients table in the pre-MEDS step. 
- #death: - # code: DEATH - # timestamp: col(deathtime) - # timestamp_format: "%Y-%m-%d %H:%M:%S" - # death_location: death_location - # death_type: death_type - -hosp/diagnoses_icd: - diagnosis: - code: - - DIAGNOSIS - - ICD - - col(icd_version) - - col(icd_code) - hadm_id: hadm_id - timestamp: col(hadm_discharge_time) - timestamp_format: "%Y-%m-%d %H:%M:%S" - -hosp/drgcodes: - drg: - code: - - DRG - - col(drg_type) - - col(drg_code) - - col(description) - hadm_id: hadm_id - timestamp: col(hadm_discharge_time) - timestamp_format: "%Y-%m-%d %H:%M:%S" - drg_severity: drg_severity - drg_mortality: drg_mortality - -hosp/emar: - medication: - code: - - MEDICATION - - col(medication) - - col(event_txt) - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - emar_id: emar_id - emar_seq: emar_seq - -hosp/hcpcsevents: - hcpcs: - code: - - HCPCS - - col(short_description) - hadm_id: hadm_id - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/labevents: - lab: - code: - - LAB - - col(itemid) - - col(valueuom) - hadm_id: hadm_id - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - numerical_value: valuenum - text_value: value - priority: priority - -hosp/omr: - omr: - code: col(result_name) - text_value: col(result_value) - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/patients: +patient: + dob: + code: "DOB" + timestamp: "dateOfBirth" + uniquepid: "uniquepid" gender: - code: - - GENDER - - col(gender) + code: ["GENDER", "col(gender)"] timestamp: null - dob: - code: DOB - timestamp: col(year_of_birth) - timestamp_format: "%Y" - death: - code: DEATH - timestamp: col(dod) - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - -hosp/pharmacy: - medication_start: - code: - - MEDICATION - - START - - col(medication) - timestamp: col(starttime) - route: route - frequency: frequency - doses_per_24_hrs: doses_per_24_hrs - poe_id: poe_id - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - medication_stop: - code: - - MEDICATION - - STOP - - col(medication) - timestamp: col(stoptime) - poe_id: poe_id - timestamp_format: - - "%Y-%m-%d %H:%M:%S" - - "%Y-%m-%d" - -hosp/procedures_icd: - procedure: - code: - - PROCEDURE - - ICD - - col(icd_version) - - col(icd_code) - hadm_id: hadm_id - timestamp: col(chartdate) - timestamp_format: "%Y-%m-%d" - -hosp/transfers: - transfer: - code: - - TRANSFER_TO - - col(eventtype) - - col(careunit) - timestamp: col(intime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - -icu/icustays: - icu_admission: - code: - - ICU_ADMISSION - - col(first_careunit) - timestamp: col(intime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - icu_discharge: - code: - - ICU_DISCHARGE - - col(last_careunit) - timestamp: col(outtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - -icu/chartevents: - event: - code: - - LAB - - col(itemid) - - col(valueuom) - timestamp: col(charttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - numerical_value: valuenum - text_value: value - hadm_id: hadm_id - icustay_id: stay_id - -icu/procedureevents: - start: - code: - - PROCEDURE - - START - - col(itemid) - timestamp: col(starttime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id - end: - code: - - PROCEDURE - - END - - col(itemid) - timestamp: col(endtime) - timestamp_format: "%Y-%m-%d %H:%M:%S" - hadm_id: hadm_id - icustay_id: stay_id + ethnicity: + code: ["ETHNICITY", "col(ethnicity)"] + timestamp: null + 
hosp_admission:
+    code:
+      - "HOSPITAL_ADMISSION"
+      - col("hospitalAdmitSource")
+      - col("hospitalRegion")
+      - col("hospitalTeachingStatus")
+      - col("hospitalNumBedsCategory")
+    timestamp: "hospitalAdmitTimestamp"
+    hospital_id: "hospitalID"
+  hosp_discharge:
+    code:
+      - "HOSPITAL_DISCHARGE"
+      - col("hospitalDischargeStatus")
+      - col("hospitalDischargeLocation")
+    timestamp: "hospitalDischargeTimestamp"
+  unit_admission:
+    code:
+      - "UNIT_ADMISSION"
+      - col("unitAdmitSource")
+      - col("unitStayType")
+    timestamp: "unitAdmitTimestamp"
+    ward_id: "wardID"
+  unit_admission_weight:
+    code:
+      - "UNIT_ADMISSION_WEIGHT"
+    timestamp: "unitAdmitTimestamp"
+    numerical_value: "unitAdmissionWeight"
+  unit_admission_height:
+    code:
+      - "UNIT_ADMISSION_HEIGHT"
+    timestamp: "unitAdmitTimestamp"
+    numerical_value: "unitAdmissionHeight"
+  unit_discharge:
+    code:
+      - "UNIT_DISCHARGE"
+      - col("unitDischargeStatus")
+      - col("unitDischargeLocation")
+    timestamp: "unitDischargeTimestamp"
+  unit_discharge_weight:
+    code:
+      - "UNIT_DISCHARGE_WEIGHT"
+    timestamp: "unitDischargeTimestamp"
+    numerical_value: "unitDischargeWeight"
+
+timestamp_format: "%Y-%m-%d %H:%M:%S"
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py
index bf0204c..c78d506 100755
--- a/eICU_Example/pre_MEDS.py
+++ b/eICU_Example/pre_MEDS.py
@@ -1,13 +1,18 @@
 #!/usr/bin/env python
-"""Performs pre-MEDS data wrangling for eICU."""
+"""Performs pre-MEDS data wrangling for eICU.
+
+See the docstring of `main` for more information.
+"""

 import rootutils

 root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True)

 import gzip
+from collections.abc import Callable, Sequence
 from datetime import datetime
 from pathlib import Path
+from typing import NamedTuple

 import hydra
 import polars as pl
@@ -20,7 +25,13 @@
     write_lazyframe,
 )

-GLOBAL_TIME_ROOT = datetime(2024, 1, 1)
+HEALTH_SYSTEM_STAY_ID = "patientHealthSystemStayID"
+UNIT_STAY_ID = "patientUnitStayID"
+PATIENT_ID = "uniquepid"
+
+# The end of year date, used for year-only timestamps in eICU. The time is set to midnight as we'll add a
+# 24-hour time component from other columns in the data.
+END_OF_YEAR = {"month": 12, "day": 31, "hour": 0, "minute": 0, "second": 0}


 def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame:
@@ -36,85 +47,172 @@
     with gzip.open(fp, mode="rb") as f:
         return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy()

-def process_patients_table(df: pl.LazyFrame) -> pl.LazyFrame:
-    """Takes the patients table and converts it to a form that includes timestamps
-
-    As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true
-    timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME
-    DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES.
-    """

+def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str):
+    expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S")
+
+    time_deltas_min = (pseudotime_col.dt.time() - expected_time).dt.total_minutes()

-    return (
-        df
-        .with_columns(
-            pl.lit(GLOBAL_TIME_ROOT, dtype=pl.Datetime).alias("healthSystemAdmitTimestamp"),
+    # Check that the time deltas are all within 1 minute
+    logger.info(
+        f"Checking that stated 24h times are consistent given offsets between {pseudotime_col.name} and "
+        f"{given_24htime_col}..."
+ ) + max_time_deltas_min = df.select(time_deltas_min.abs().max()).collect().item() + if max_time_deltas_min > 1: + raise ValueError( + f"Max number of minutes between {pseudotime_col.name} and {given_24htime_col} is " + f"{max_time_deltas_min}. Should be <= 1." ) - .select( - "patientHealthSystemStayID", - "gender", - "age", - "ethnicity", - # Unit stay parameters - "patientUnitStayID", # The unit stay ID - pl.col("healthSystemAdmitTimestamp") - "hospitalID", - "admissionHeight", - # "apacheAdmissionDx", This we grab from `admissiondx` later instead. +def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the patient table and converts it to a form that includes timestamps. + As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true + timestamp of their health system admission. This is acceptable because in eICU ONLY RELATIVE TIME + DIFFERENCES ARE MEANINGFUL, NOT ABSOLUTE TIMES. + The output of this process is ultimately converted to events via the `patient` key in the + `configs/event_configs.yaml` file. + """ + hospital_discharge_pseudotime = pl.datetime(year=pl.col("hospitalDischargeYear"), **END_OF_YEAR) + pl.col( + "hospitalDischargeTime24" + ).str.strptime(pl.Time, "%H:%M:%S") + unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( + minutes=pl.col("hospitalDischargeOffset") + ) -##### MIMIC STUFF --- OLD ##### + unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitDischargeOffset")) -def add_discharge_time_by_hadm_id( - df: pl.LazyFrame, discharge_time_df: pl.LazyFrame, out_column_name: str = "hadm_discharge_time" -) -> pl.LazyFrame: - """Joins the two dataframes by ``"hadm_id"`` and adds the discharge time to the original dataframe.""" + hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitalAdmitOffset")) - discharge_time_df = discharge_time_df.select("hadm_id", pl.col("dischtime").alias(out_column_name)) - return df.join(discharge_time_df, on="hadm_id", how="left") + age_in_years = pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16)) + age_in_days = age_in_years * 365.25 + # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate + pseudo_date_of_birth = unit_admit_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) + # Check the times + start = datetime.now() + logger.info( + "Checking that the 24h times are consistent. If this is extremely slow, consider refactoring to have " + "only one `.collect()` call." + ) + check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitalDischargeTime24") + check_timestamps_agree(df, hospital_admit_pseudotime, "hospitalAdmitTime24") + check_timestamps_agree(df, unit_admit_pseudotime, "unitAdmitTime24") + check_timestamps_agree(df, unit_discharge_pseudotime, "unitDischargeTime24") + logger.info(f"Validated 24h times in {datetime.now() - start}") + + logger.warning("NOT validating the `unitVisitNumber` column as that isn't implemented yet.") + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING. Check with the eICU team:\n" + " - `apacheAdmissionDx` is not selected from the patients table as we grab it from `admissiondx`. " + "Is this right?\n" + " - `admissionHeight` and `admissionWeight` are interpreted as **unit** admission height/weight, " + "not hospital admission height/weight. Is this right?\n" + " - `age` is interpreted as the age at the time of the unit stay, not the hospital stay. 
" + "Is this right?\n" + " - `What is the actual mean age for those > 89? Here we assume 90.\n" + ) -def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) -> pl.LazyFrame: - """Fixes the static data by adding the death time to the static data and fixes the DOB nonsense. + return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( + # 1. Static variables + "uniquepid", + "gender", + pseudo_date_of_birth.alias("dateOfBirth"), + "ethnicity", + # 2. Health system stay parameters + "patientHealthSystemStayID", + "hospitalID", + pl.col("numbedscategory").alias("hospitalNumBedsCategory"), + pl.col("teachingstatus").alias("hospitalTeachingStatus"), + pl.col("region").alias("hospitalRegion"), + # 2.1 Admission parameters + hospital_admit_pseudotime.alias("hospitalAdmitTimestamp"), + "hospitalAdmitSource", + # 2.2 Discharge parameters + hospital_discharge_pseudotime.alias("hospitalDischargeTimestamp"), + "hospitalDischargeLocation", + "hospitalDischargeStatus", + # 3. Unit stay parameters + "patientUnitStayID", # The unit stay ID + "wardID", + # 3.1 Admission parameters + unit_admit_pseudotime.alias("unitAdmitTimestamp"), + "unitAdmitSource", + "unitStayType", + pl.col("admissionHeight").alias("unitAdmissionHeight"), + pl.col("admissionWeight").alias("unitAdmissionWeight"), + # 3.2 Discharge parameters + unit_discharge_pseudotime.alias("unitDischargeTimestamp"), + "unitDischargeLocation", + "unitDischargeStatus", + pl.col("dischargeWeight").alias("unitDischargeWeight"), + ) - Args: - raw_static_df: The raw static data. - death_times_df: The death times data. - Returns: - The fixed static data. - """ +class PreProcessor(NamedTuple): + """A preprocessor function and its dependencies. - death_times_df = death_times_df.group_by("subject_id").agg(pl.col("deathtime").min()) + Args: + function: TODO + dependencies: A two-element tuple containing the prefix of the dependent dataframe and a list of + columns needed from that dataframe. + """ - return raw_static_df.join(death_times_df, on="subject_id", how="left").select( - "subject_id", - pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"), - (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"), - "gender", - ) + function: Callable[[Sequence[pl.LazyFrame]], pl.LazyFrame] + dependencies: tuple[str, list[str]] -FUNCTIONS = { - "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), - "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), - "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), +FUNCTIONS: dict[str, PreProcessor] = { + "patient": PreProcessor( + process_patient_table, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + ), } +# From MIMIC +# "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), +# "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), +# "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), + @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): """Performs pre-MEDS data wrangling for eICU. - Inputs are the raw MIMIC files, read from the `raw_cohort_dir` config parameter. Output files are either + Inputs are the raw eICU files, read from the `raw_cohort_dir` config parameter. 
Output files are either symlinked (if they are not modified) or written in processed form to the `MEDS_input_dir` config parameter. Hydra is used to manage configuration parameters and logging. + + Note that eICU has only a tentative ability to identify true relative admission times for even the same + patient, as health system stay IDs are only temporally ordered at the *year* level. As such, to properly + parse this dataset in a longitudinal form, you must do one of the following: + 1. Not operate at the level of patients at all, but instead at the level of health system stays, as + individual events within a health system stay can be well ordered. + 2. Restrict the analysis to only patients who do not have multiple health system stays within a single + year (as health system stays across years can be well ordered, provided we assume to distinct stays + within a single health system cannot overlap). + + In this pipeline, we choose to operate at the level of health system stays, as this is the most general + approach. The only downside is that we lose the ability to track individual patients across health system + stays, and thus can only explore questions of limited longitudinal scope. + + We ignore the following tables for the given reasons: + 1. `admissiondrug`: This table is noted in the + [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) as being "Extremely infrequently + used". + + Args (all as part of the config file): + raw_cohort_dir: The directory containing the raw eICU files. + output_dir: The directory to write the processed files to. """ + raise NotImplementedError("This script is not yet implemented for eICU.") + hydra_loguru_init() raw_cohort_dir = Path(cfg.raw_cohort_dir) From 4c7e2cb34a2fe89d5653b227c06884bf6b863526 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 09:17:06 -0400 Subject: [PATCH 17/47] Added (again untested) allergy table --- eICU_Example/configs/event_configs.yaml | 18 ++++++- eICU_Example/pre_MEDS.py | 69 +++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 9d467f6..9d7177b 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -36,6 +36,7 @@ patient: - col("unitStayType") timestamp: "unitAdmitTimestamp" ward_id: "wardID" + unit_stay_id: "patientUnitStayID" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" @@ -58,4 +59,19 @@ patient: timestamp: "unitDischargeTimestamp" numerical_value: "unitDischargeWeight" -timestamp_format: "%Y-%m-%d %H:%M:%S" +admissiondx: + admission_diagnosis: + code: + - "ADMISSION_DX" + - col("admitDxName") + timestamp: "admitDxEnteredTimestamp" + admission_dx_id: "admitDxID" + unit_stay_id: "patientUnitStayID" + +allergy: + allergy: + code: + - "ALLERGY" + - col("allergyType") + - col("allergyName") + timestamp: "allergyEnteredTimestamp" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index c78d506..38a56d8 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -66,7 +66,7 @@ def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24ht ) -def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: +def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame: """Takes the patient table and converts it to a form that includes timestamps. 
As eICU stores only offset times, note here that we add a CONSTANT TIME ACROSS ALL PATIENTS for the true @@ -121,12 +121,12 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( # 1. Static variables - "uniquepid", + PATIENT_ID, "gender", pseudo_date_of_birth.alias("dateOfBirth"), "ethnicity", # 2. Health system stay parameters - "patientHealthSystemStayID", + HEALTH_SYSTEM_STAY_ID, "hospitalID", pl.col("numbedscategory").alias("hospitalNumBedsCategory"), pl.col("teachingstatus").alias("hospitalTeachingStatus"), @@ -139,7 +139,7 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz "hospitalDischargeLocation", "hospitalDischargeStatus", # 3. Unit stay parameters - "patientUnitStayID", # The unit stay ID + UNIT_STAY_ID, "wardID", # 3.1 Admission parameters unit_admit_pseudotime.alias("unitAdmitTimestamp"), @@ -155,6 +155,59 @@ def process_patient_table(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.Laz ) +def process_admissiondx(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the admissiondx table and converts it to a form that includes timestamps. + + The output of this process is ultimately converted to events via the `admissiondx` key in the + `configs/event_configs.yaml` file. + """ + + admission_dx_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration( + minutes=pl.col("admitDxEnteredOffset") + ) + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING for admissiondx table. Check with the eICU team:\n" + " - How should we use `admitDxTest`? It's not used here.\n" + " - How should we use `admitDxPath`? It's not used here.\n" + ) + + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + admission_dx_pseudotime.alias("admitDxEnteredTimestamp"), + "admitDxName", + "admitDxID", + ) + + +def process_allergy(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + """Takes the allergy table and converts it to a form that includes timestamps. + + The output of this process is ultimately converted to events via the `allergy` key in the + `configs/event_configs.yaml` file. + """ + + allergy_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset")) + + logger.warning( + "NOT SURE ABOUT THE FOLLOWING for allergy table. Check with the eICU team:\n" + " - How should we use `allergyNoteType`? It's not used here.\n" + " - How should we use `specialtyType`? It's not used here.\n" + " - How should we use `userType`? It's not used here.\n" + " - Is `drugName` the name of the drug to which the patient is allergic or the drug given to the " + "patient (docs say 'name of the selected admission drug')?\n" + ) + + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + allergy_pseudotime.alias("allergyEnteredTimestamp"), + "allergyType", + "allergyName", + ) + + class PreProcessor(NamedTuple): """A preprocessor function and its dependencies. 
@@ -170,7 +223,13 @@ class PreProcessor(NamedTuple): FUNCTIONS: dict[str, PreProcessor] = { "patient": PreProcessor( - process_patient_table, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) + ), + "admissiondx": PreProcessor( + process_admissiondx, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) + ), + "allergy": PreProcessor( + process_allergy, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) ), } From f3463f5f8f4715c0ba8c97a16816310a59a91c8e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 12:12:23 -0400 Subject: [PATCH 18/47] Improved the structure of the pipeline and added a bunch more tables. Still untested. --- eICU_Example/pre_MEDS.py | 125 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 38a56d8..0b34356 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -17,7 +17,7 @@ import hydra import polars as pl from loguru import logger -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from MEDS_polars_functions.utils import ( get_shard_prefix, @@ -155,57 +155,47 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame ) -def process_admissiondx(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: - """Takes the admissiondx table and converts it to a form that includes timestamps. +def join_and_get_pseudotime_fntr( + table_name: str, + offset_col: str, + pseudotime_col: str, + output_data_cols: list[str] | None = None, + warning_items: list[str] | None = None, +) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: + """Returns a function that joins a dataframe to the `patient` table and adds pseudotimes. - The output of this process is ultimately converted to events via the `admissiondx` key in the - `configs/event_configs.yaml` file. + Also raises specified warning strings via the logger for uncertain columns. + + TODO """ - admission_dx_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration( - minutes=pl.col("admitDxEnteredOffset") - ) + if output_data_cols is None: + output_data_cols = [] - logger.warning( - "NOT SURE ABOUT THE FOLLOWING for admissiondx table. Check with the eICU team:\n" - " - How should we use `admitDxTest`? It's not used here.\n" - " - How should we use `admitDxPath`? It's not used here.\n" - ) + def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: + f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. - return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( - HEALTH_SYSTEM_STAY_ID, - UNIT_STAY_ID, - admission_dx_pseudotime.alias("admitDxEnteredTimestamp"), - "admitDxName", - "admitDxID", - ) - - -def process_allergy(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: - """Takes the allergy table and converts it to a form that includes timestamps. + The output of this process is ultimately converted to events via the `{table_name}` key in the + `configs/event_configs.yaml` file. + """ - The output of this process is ultimately converted to events via the `allergy` key in the - `configs/event_configs.yaml` file. 
- """ + pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset_col)) - allergy_pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset")) + if warning_items: + warning_lines = [ + f"NOT SURE ABOUT THE FOLLOWING for {table_name} table. Check with the eICU team:", + *(f" - {item}" for item in warning_items), + ] + logger.warning("\n".join(warning_lines)) - logger.warning( - "NOT SURE ABOUT THE FOLLOWING for allergy table. Check with the eICU team:\n" - " - How should we use `allergyNoteType`? It's not used here.\n" - " - How should we use `specialtyType`? It's not used here.\n" - " - How should we use `userType`? It's not used here.\n" - " - Is `drugName` the name of the drug to which the patient is allergic or the drug given to the " - "patient (docs say 'name of the selected admission drug')?\n" - ) + return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( + HEALTH_SYSTEM_STAY_ID, + UNIT_STAY_ID, + pseudotime.alias(pseudotime_col), + *output_data_cols, + ) - return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( - HEALTH_SYSTEM_STAY_ID, - UNIT_STAY_ID, - allergy_pseudotime.alias("allergyEnteredTimestamp"), - "allergyType", - "allergyName", - ) + return fn class PreProcessor(NamedTuple): @@ -221,23 +211,17 @@ class PreProcessor(NamedTuple): dependencies: tuple[str, list[str]] -FUNCTIONS: dict[str, PreProcessor] = { +NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] +PATIENT_DEPENDENCY = ("patient", NEEDED_PATIENT_COLS) + +# Generic "copy from patients" functions are stored in `configs/table_preprocessors.yaml` and loaded in +# `main`. +SPECIALTY_FUNCTIONS: dict[str, PreProcessor] = { "patient": PreProcessor( process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) ), - "admissiondx": PreProcessor( - process_admissiondx, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) - ), - "allergy": PreProcessor( - process_allergy, ("patient", [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"]) - ), } -# From MIMIC -# "hosp/diagnoses_icd": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), -# "hosp/drgcodes": (add_discharge_time_by_hadm_id, ("hosp/admissions", ["hadm_id", "dischtime"])), -# "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), - @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): @@ -264,6 +248,20 @@ def main(cfg: DictConfig): 1. `admissiondrug`: This table is noted in the [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) as being "Extremely infrequently used". + 2. `apacheApsVar`: This table is a sort of "meta-table" that contains variables used to compute the + APACHE score; we won't use these raw variables from this table, but instead will use the raw data. + 3. `apachePatientResult`: This table has pre-computed APACHE score variables; we won't use these and + will use the raw data directly. + 4. `apachePredVar`: This table contains variables used to compute the APACHE score; we won't use these + in favor of the raw data directly. + 5. `carePlanCareProvider`: This table contains information about the provider for given care-plan + entries; however, as we can't link this table to the particular care-plan entries, we don't use it + here. 
It also is not clear (to the author of this script; the eICU team may know more) how reliable
+         the time-offsets are for this table as they merely denote when a provider was entered into the
+         care plan.
+      6. `customLab`: The documentation for this table is very sparse, so we skip it.
+      7. `intakeOutput`: There are a number of significant warnings about duplicates, cumulative values,
+         and more in the documentation for this table, so for now we skip it.

     Args (all as part of the config file):
         raw_cohort_dir: The directory containing the raw eICU files.
         output_dir: The directory to write the processed files to.
     """

     hydra_loguru_init()

+    functions = {**SPECIALTY_FUNCTIONS}
+
+    logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...")
+    preprocessors = OmegaConf.load("configs/table_preprocessors.yaml")
+    for table_name, preprocessor_cfg in preprocessors.items():
+        logger.info(f"  Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}")
+        functions[table_name] = PreProcessor(
+            join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg),
+            PATIENT_DEPENDENCY,
+        )
+
     raw_cohort_dir = Path(cfg.raw_cohort_dir)
     MEDS_input_dir = Path(cfg.output_dir)

@@ -292,7 +301,7 @@

         out_fp.parent.mkdir(parents=True, exist_ok=True)

-        if pfx not in FUNCTIONS:
+        if pfx not in functions:
             logger.info(
                 f"No function needed for {pfx}: "
                 f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}"
@@ -306,7 +315,7 @@

             print(f"Done with {pfx}. Continuing")
             continue

-        fn, need_df = FUNCTIONS[pfx]
+        fn, need_df = functions[pfx]
         if not need_df:
             st = datetime.now()
             logger.info(f"Processing {pfx}...")
@@ -340,7 +349,7 @@

             logger.info(f"  Processing dependent df @ {pfx}...")
-            fn, _ = FUNCTIONS[pfx]
+            fn, _ = functions[pfx]

From 542b7cb4ec3ea3ef806984ec39aa33032ed22d9d Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 12:12:55 -0400
Subject: [PATCH 19/47] Forgot table configs -- likely currently malformed.

---
 eICU_Example/configs/table_preprocessors.yaml | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 eICU_Example/configs/table_preprocessors.yaml

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
new file mode 100644
index 0000000..c7d7a80
--- /dev/null
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -0,0 +1,94 @@
+
+admissiondx:
+  offset_col: "admitDxEnteredOffset"
+  pseudotime_col: "admitDxEnteredTimestamp"
+  output_data_cols: ["admitDxName", "admitDxID"]
+  warning_items: ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"]
+
+allergy:
+  offset_col: "allergyEnteredOffset"
+  pseudotime_col: "allergyEnteredTimestamp"
+  output_data_cols: ["allergyType", "allergyName"]
+  warning_items:
+    - "How should we use `allergyNoteType`?"
+    - "How should we use `specialtyType`?"
+    - "How should we use `userType`?"
+    - |-
+      Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient
+      (docs say 'name of the selected admission drug')?
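+# For orientation: each entry in this file is expanded into a polars transformation by
+# `join_and_get_pseudotime_fntr` in `eICU_Example/pre_MEDS.py`. Roughly (a sketch, not generated code),
+# the `allergy` entry above behaves like:
+#
+#   pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col("allergyEnteredOffset"))
+#   df.join(patient_df, on="patientUnitStayID", how="inner").select(
+#       "patientHealthSystemStayID", "patientUnitStayID",
+#       pseudotime.alias("allergyEnteredTimestamp"), "allergyType", "allergyName",
+#   )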
+
+carePlanGeneral:
+  offset_col: "cplItemOffset"
+  pseudotime_col: "carePlanGeneralItemEnteredTimestamp"
+  output_data_cols: ["cplGroup", "cplItemValue"]
+
+carePlanEOL:
+  offset_col: "cplEolDiscussionOffset"
+  pseudotime_col: "carePlanEolDiscussionOccurredTimestamp"
+  warning_items:
+    - "Is the DiscussionOffset time actually reliable? Should we fall back on the SaveOffset time?"
+
+carePlanGoal:
+  offset_col: "cplGoalOffset"
+  pseudotime_col: "carePlanGoalEnteredTimestamp"
+  output_data_cols: ["cplGoalCategory", "cplGoalValue", "cplGoalStatus"]
+
+carePlanInfectiousDisease:
+  offset_col: "cplInfectDiseaseOffset"
+  pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp"
+  output_data_cols: ["infectDiseaseSite", "infectDiseaseAssessment", "responseToTherapy", "treatment"]
+
+diagnosis:
+  offset_col: "diagnosisOffset"
+  pseudotime_col: "diagnosisEnteredTimestamp"
+  output_data_cols: ["ICD9Code", "diagnosisPriority", "diagnosisString"]
+  warning_items:
+    - "Though we use it, the `diagnosisString` field documentation is unclear -- by what is it separated?"
+
+infusionDrug:
+  offset_col: "infusionOffset"
+  pseudotime_col: "infusionEnteredTimestamp"
+  output_data_cols:
+    - "infusionDrugID"
+    - "drugName"
+    - "drugRate"
+    - "infusionRate"
+    - "drugAmount"
+    - "volumeOfFluid"
+    - "patientWeight"
+
+lab:
+  offset_col: "labResultOffset"
+  pseudotime_col: "labResultDrawnTimestamp"
+  output_data_cols:
+    - "labName"
+    - "labResult"
+    - "labResultText"
+    - "labMeasureNameSystem"
+    - "labMeasureNameInterface"
+    - "labTypeID"
+  warning_items:
+    - "Is this the time the lab was drawn? Entered? The time the result came in?"
+    - "We **IGNORE** the `labResultRevisedOffset` column -- this may be a mistake!"
+
+medication:
+  offset_col:
+    - "drugOrderOffset"
+    - "drugStartOffset"
+    - "drugStopOffset"
+  pseudotime_col:
+    - "drugOrderTimestamp"
+    - "drugStartedTimestamp"
+    - "drugStoppedTimestamp"
+  output_data_cols:
+    - "medicationID"
+    - "drugIVAdmixture"
+    - "drugName"
+    - "drugHiclSeqno"
+    - "dosage"
+    - "routeAdmin"
+    - "frequency"
+    - "loadingDose"
+    - "PRN"
+    - "GTC"
+  warning_items:
+    - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!"

From 167acb07788439a9cac141e10d88ef9131925578 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 12:18:24 -0400
Subject: [PATCH 20/47] Added soon to be deleted microlab table

---
 eICU_Example/configs/table_preprocessors.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
index c7d7a80..a4e1cc8 100644
--- a/eICU_Example/configs/table_preprocessors.yaml
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -92,3 +92,14 @@ medication:
     - "GTC"
   warning_items:
     - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!"
+
+# We don't use this because the culture taken time != culture result time, so seeing this data would give a
+# model an advantage over any possible real-world implementation. But, I'm including its data here as it would
+# be easy to fit into this paradigm.
+#microLab: +# offset_col: "cultureTakenOffset" +# pseudotime_col: "cultureTakenTimestamp" +# output_data_cols: +# - "cultureSite" +# - "organism" +# - "antibiotic" From 15815f3c0314727e06fbcf2ce0e04c07bb974f4e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 12:19:05 -0400 Subject: [PATCH 21/47] docs update --- eICU_Example/configs/table_preprocessors.yaml | 11 ----------- eICU_Example/pre_MEDS.py | 3 +++ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index a4e1cc8..c7d7a80 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -92,14 +92,3 @@ medication: - "GTC" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" - -# We don't use this because the culture taken time != culture result time, so seeing this data would give a -# model an advantage over any possible real-world implementation. But, I'm including its data here as it would -# be easy to fit into this paradigm. -#microLab: -# offset_col: "cultureTakenOffset" -# pseudotime_col: "cultureTakenTimestamp" -# output_data_cols: -# - "cultureSite" -# - "organism" -# - "antibiotic" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 0b34356..f02c0e8 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -262,6 +262,9 @@ def main(cfg: DictConfig): 6. `customLab`: The documentation for this table is very sparse, so we skip it. 7. `intakeOutput`: There are a number of significant warnings about duplicates, cumulative values, and more in the documentation for this table, so for now we skip it. + 8. `microLab`: We don't use this because the culture taken time != culture result time, so seeing this + data would give a model an advantage over any possible real-world implementation. Plus, the docs say + it is not well populated. Args (all as part of the config file): raw_cohort_dir: The directory containing the raw eICU files. From bda16e83933f537f8a608b6c2edc265bd22c7762 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 14:36:03 -0400 Subject: [PATCH 22/47] Added partial event configs for all tables. 
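
Each top-level key in `event_configs.yaml` names a pre-MEDS table, and each named entry under that key is
meant to yield one MEDS event per row of the table. As an illustrative sketch only (the exact event
construction semantics live in the extraction pipeline), a `vitalPeriodic` row with
`observationEnteredTimestamp = t` and `temperature = 37.2` would, under the `temperature` entry below,
become an event with code components ["VITALS", "PERIODIC", "TEMPERATURE"], timestamp `t`, and
`numeric_value` 37.2.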
--- eICU_Example/configs/event_configs.yaml | 540 ++++++++++++++++++ eICU_Example/configs/table_preprocessors.yaml | 185 +++++- eICU_Example/pre_MEDS.py | 30 +- 3 files changed, 750 insertions(+), 5 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 9d7177b..7ac4225 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -75,3 +75,543 @@ allergy: - col("allergyType") - col("allergyName") timestamp: "allergyEnteredTimestamp" + +carePlanGeneral: + cplItem: + code: + - "CAREPLAN_GENERAL" + - col("cplGroup") + - col("cplItemValue") + timestamp: "carePlanGeneralItemEnteredTimestamp" + +carePlanEOL: + cplEolDiscussion: + code: + - "CAREPLAN_EOL" + timestamp: "carePlanEolDiscussionOccurredTimestamp" + +carePlanGoal: + cplGoal: + code: + - "CAREPLAN_GOAL" + - col("cplGoalCategory") + - col("cplGoalValue") + - col("cplGoalStatus") + timestamp: "carePlanGoalEnteredTimestamp" + +carePlanInfectiousDisease: + cplInfectDisease: + code: + - "CAREPLAN_INFECTIOUS_DISEASE" + - col("infectDiseaseSite") + - col("infectDiseaseAssessment") + - col("treatment") + - col("responseToTherapy") + timestamp: "carePlanInfectDiseaseEnteredTimestamp" + +diagnosis: + diagnosis: + code: + - "ICD9CM" + - col("ICD9Code") + - col("diagnosisPriority") + timestamp: "diagnosisEnteredTimestamp" + diagnosis_string: "diagnosisString" + +infusionDrug: + infusion: + code: + - "INFUSION" + - col("infusionDrugID") + - col("drugName") + timestamp: "infusionEnteredTimestamp" + drug_rate: "drugRate" + infusion_rate: "infusionRate" + drug_amount: "drugAmount" + volume_of_fluid: "volumeOfFluid" + patient_weight: + code: + - "INFUSION_PATIENT_WEIGHT" + timestamp: "infusionEnteredTimestamp" + numerical_value: "patientWeight" + +lab: + lab: + code: + - "LAB" + - col("labMeasureNameSystem") + - col("labMeasureNameInterface") + - col("labName") + timestamp: "labResultDrawnTimestamp" + numerical_value: "labResult" + text_value: "labResultText" + lab_type_id: "labTypeID" + +medication: + drug_ordered: + code: + - "MEDICATION" + - "ORDERED" + - col(drugName) + timestamp: "drugOrderTimestamp" + medication_id: "medicationID" + drug_iv_admixture: "drugIVAdmixture" + dosage: "dosage" + route_admin: "routeAdmin" + frequency: "frequency" + loading_dose: "loadingDose" + prn: "PRN" + gtc: "GTC" + drug_started: + code: + - "MEDICATION" + - "STARTED" + - col(drugName) + timestamp: "drugStartedTimestamp" + medication_id: "medicationID" + drug_stopped: + code: + - "MEDICATION" + - "STOPPED" + - col(drugName) + timestamp: "drugStoppedTimestamp" + medication_id: "medicationID" + +nurseAssessment: + nurse_assessment_performed: + code: + - "NURSE_ASSESSMENT" + - "PERFORMED" + - NOT YET DONE + timestamp: "nurseAssessPerformedTimestamp" + nurse_assessment_id: "nurseAssessID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + + nurse_assessment_entered: + code: + - "NURSE_ASSESSMENT" + - "ENTERED" + - NOT YET DONE + timestamp: "nurseAssessEnteredTimestamp" + nurse_assessment_id: "nurseAssessID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + +nurseCare: + nurse_care_performed: + code: + - "NURSE_CARE" + - "PERFORMED" + - NOT YET DONE + timestamp: "nurseCarePerformedTimestamp" + nurse_care_id: "nurseCareID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + + nurse_care_entered: + code: + - 
"NURSE_CARE" + - "ENTERED" + - NOT YET DONE + timestamp: "nurseCareEnteredTimestamp" + nurse_care_id: "nurseCareID" + cell_label: "cellLabel" + cell_attribute: "cellAttribute" + cell_attribute_value: "cellAttributeValue" + +nurseCharting: + nurse_charting_performed: + code: + - "NURSE_CHARTING" + - "PERFORMED" + - NOT YET DONE + timestamp: "nursingChartPerformedTimestamp" + nurse_charting_id: "nursingChartID" + cell_type_cat: "nursingChartCellTypeCat" + cell_type_val_name: "nursingChartCellTypeValName" + cell_type_val_label: "nursingChartCellTypeValLabel" + cell_value: "nursingChartValue" + + nurse_charting_entered: + code: + - "NURSE_CHARTING" + - "ENTERED" + - NOT YET DONE + timestamp: "nursingChartEnteredTimestamp" + nurse_charting_id: "nursingChartID" + cell_type_cat: "nursingChartCellTypeCat" + cell_type_val_name: "nursingChartCellTypeValName" + cell_type_val_label: "nursingChartCellTypeValLabel" + cell_value: "nursingChartValue" + +pastHistory: + past_history_taken: + code: + - "PAST_HISTORY" + - "TAKEN" + - NOT YET DONE + timestamp: "pastHistoryTakenTimestamp" + past_history_id: "pastHistoryID" + note_type: "pastHistoryNoteType" + path: "pastHistoryPath" + value: "pastHistoryValue" + value_text: "pastHistoryValueText" + + past_history_entered: + code: + - "PAST_HISTORY" + - "ENTERED" + - NOT YET DONE + timestamp: "pastHistoryEnteredTimestamp" + past_history_id: "pastHistoryID" + note_type: "pastHistoryNoteType" + path: "pastHistoryPath" + value: "pastHistoryValue" + value_text: "pastHistoryValueText" + + +physicalExam: + physical_exam_entered: + code: + - "PHYSICAL_EXAM" + - "ENTERED" + - NOT YET DONE + timestamp: "physicalExamEnteredTimestamp" + physical_exam_id: "physicalExamID" + text: "physicalExamText" + path: "physicalExamPath" + value: "physicalExamValue" + + +respiratoryCare: + resp_care_status: + code: + - "RESP_CARE" + - "STATUS" + - NOT YET DONE + timestamp: "respCareStatusEnteredTimestamp" + resp_care_id: "respCareID" + + airwayType: "airwayType" + airwaySize: "airwaySize" + airwayPosition: "airwayPosition" + cuffPressure: "cuffPressure" + apneaParams: "apneaParams" + lowExhMVLimit: "lowExhMVLimit" + hiExhMVLimit: "hiExhMVLimit" + lowExhTVLimit: "lowExhTVLimit" + hiPeakPresLimit: "hiPeakPresLimit" + lowPeakPresLimit: "lowPeakPresLimit" + hiRespRateLimit: "hiRespRateLimit" + lowRespRateLimit: "lowRespRateLimit" + sighPresLimit: "sighPresLimit" + lowIronOxLimit: "lowIronOxLimit" + highIronOxLimit: "highIronOxLimit" + meanAirwayPresLimit: "meanAirwayPresLimit" + PEEPLimit: "PEEPLimit" + CPAPLimit: "CPAPLimit" + setApneaInterval: "setApneaInterval" + setApneaTV: "setApneaTV" + setApneaIPPEEPHigh: "setApneaIPPEEPHigh" + setApneaRR: "setApneaRR" + setApneaPeakFlow: "setApneaPeakFlow" + setApneaInspTime: "setApneaInspTime" + setApneaIE: "setApneaIE" + setApneaFIO2: "setApneaFIO2" + + vent_start: + code: + - "VENT" + - "START" + - NOT YET DONE + timestamp: "ventStartTimestamp" + resp_care_id: "respCareID" + + vent_end: + code: + - "VENT" + - "END" + - NOT YET DONE + timestamp: "ventEndTimestamp" + resp_care_id: "respCareID" + + +respiratoryCharting: + resp_charting_performed: + code: + - "RESP_CHARTING" + - "PERFORMED" + - NOT YET DONE + timestamp: "respChartPerformedTimestamp" + resp_chart_id: "respChartID" + type_cat: "respChartTypeCat" + value_label: "respChartValueLabel" + value: "respChartValue" + + resp_charting_entered: + code: + - "RESP_CHARTING" + - "ENTERED" + - NOT YET DONE + timestamp: "respChartEnteredTimestamp" + resp_chart_id: "respChartID" + type_cat: 
"respChartTypeCat" + value_label: "respChartValueLabel" + value: "respChartValue" + +treatment: + treatment: + code: + - "TREATMENT" + - "ENTERED" + - col("treatmentString") + timestamp: "treatmentEnteredTimestamp" + treatment_id: "treatmentID" + +vitalAperiodic: + non_invasive_systolic: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveSystolic" + non_invasive_diastolic: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveDiastolic" + + non_invasive_mean: + code: + - "VITALS" + - "APERIODIC" + - "BP" + - "NONINVASIVE_MEAN" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "nonInvasiveMean" + + paop: + code: + - "VITALS" + - "APERIODIC" + - "PAOP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "paop" + + cardiac_output: + code: + - "VITALS" + - "APERIODIC" + - "CARDIAC_OUTPUT" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "cardiacOutput" + + cardiac_input: + code: + - "VITALS" + - "APERIODIC" + - "CARDIAC_INPUT" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "cardiacInput" + + svr: + code: + - "VITALS" + - "APERIODIC" + - "SVR" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "svr" + + svri: + code: + - "VITALS" + - "APERIODIC" + - "SVRI" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "svri" + + pvr: + code: + - "VITALS" + - "APERIODIC" + - "PVR" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "pvr" + + pvri: + code: + - "VITALS" + - "APERIODIC" + - "PVRI" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalAperiodicID" + numeric_value: "pvri" + +vitalPeriodic: + temperature: + code: + - "VITALS" + - "PERIODIC" + - "TEMPERATURE" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "temperature" + + saO2: + code: + - "VITALS" + - "PERIODIC" + - "SAO2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "saO2" + + heartRate: + code: + - "VITALS" + - "PERIODIC" + - "HEARTRATE" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "heartRate" + + respiration: + code: + - "VITALS" + - "PERIODIC" + - "RESPIRATION" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "respiration" + + cvp: + code: + - "VITALS" + - "PERIODIC" + - "CVP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "cvp" + + etCo2: + code: + - "VITALS" + - "PERIODIC" + - "ETCO2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "etCo2" + + systemic_systolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicSystolic" + + systemic_diastolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicDiastolic" + + systemic_mean: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "SYSTEMIC_MEAN" + timestamp: 
"observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "systemicMean" + + pa_systolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_SYSTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paSystolic" + + pa_diastolic: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_DIASTOLIC" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paDiastolic" + + pa_mean: + code: + - "VITALS" + - "PERIODIC" + - "BP" + - "PULM_ART_MEAN" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "paMean" + + st1: + code: + - "VITALS" + - "PERIODIC" + - "ST1" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st1" + + st2: + code: + - "VITALS" + - "PERIODIC" + - "ST2" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st2" + + st3: + code: + - "VITALS" + - "PERIODIC" + - "ST3" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "st3" + + ICP: + code: + - "VITALS" + - "PERIODIC" + - "ICP" + timestamp: "observationEnteredTimestamp" + vital_id: "vitalPeriodicID" + numeric_value: "ICP" diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index c7d7a80..e7da494 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,4 +1,3 @@ - admissiondx: offset_col: "admitDxEnteredOffset" pseudotime_col: "admitDxEnteredTimestamp" @@ -92,3 +91,187 @@ medication: - "GTC" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" + +nurseAssessment: + offset_col: + - "nurseAssessOffset" + - "nurseAssessEntryOffset" + pseudotime_col: + - "nurseAssessPerformedTimestamp" + - "nurseAssessEnteredTimestamp" + output_data_cols: + - "nurseAssessID" + - "cellLabel" + - "cellAttribute" + - "cellAttributeValue" + warning_items: + - "Should we be using `cellAttributePath` instead of `cellAttribute`?" + - "SOME MAY BE LISTS" + +nurseCare: + offset_col: + - "nurseCareOffset" + - "nurseCareEntryOffset" + pseudotime_col: + - "nurseCarePerformedTimestamp" + - "nurseCareEnteredTimestamp" + output_data_cols: + - "nurseCareID" + - "cellLabel" + - "cellAttribute" + - "cellAttributeValue" + warning_items: + - "Should we be using `cellAttributePath` instead of `cellAttribute`?" + - "SOME MAY BE LISTS" + +nurseCharting: + offset_col: + - "nursingChartOffset" + - "nursingChartEntryOffset" + pseudotime_col: + - "nursingChartPerformedTimestamp" + - "nursingChartEnteredTimestamp" + output_data_cols: + - "nursingChartID" + - "nursingChartCellTypeCat" + - "nursingChartCellTypeValName" + - "nursingChartCellTypeValLabel" + - "nursingChartValue" + warning_items: + - "SOME MAY BE LISTS" + +pastHistory: + offset_col: + - "pastHistoryOffset" + - "pastHistoryEnteredOffset" + pseudotime_col: + - "pastHistoryTakenTimestamp" + - "pastHistoryEnteredTimestamp" + output_data_cols: + - "pastHistoryID" + - "pastHistoryNoteType" + - "pastHistoryPath" + - "pastHistoryValue" + - "pastHistoryValueText" + warning_items: + - "SOME MAY BE LISTS" + - "How should we use `pastHistoryPath` vs. `pastHistoryNoteType`?" + - "How should we use `pastHistoryValue` vs. `pastHistoryValueText`?" 
+ +physicalExam: + offset_col: "physicalExamOffset" + pseudotime_col: "physicalExamEnteredTimestamp" + output_data_cols: + - "physicalExamID" + - "physicalExamText" + - "physicalExamPath" + - "physicalExamValue" + warning_items: + - "How should we use `physicalExamValue` vs. `physicalExamText`?" + - "I believe the `physicalExamValue` is a **LIST**. This must be processed specially." + +respiratoryCare: + offset_col: + - "respCareStatusOffset" + - "ventStartOffset" + - "ventEndOffset" + pseudotime_col: + - "respCareStatusEnteredTimestamp" + - "ventStartTimestamp" + - "ventEndTimestamp" + output_data_cols: + - "respCareID" + - "airwayType" + - "airwaySize" + - "airwayPosition" + - "cuffPressure" + - "apneaParams" + - "lowExhMVLimit" + - "hiExhMVLimit" + - "lowExhTVLimit" + - "hiPeakPresLimit" + - "lowPeakPresLimit" + - "hiRespRateLimit" + - "lowRespRateLimit" + - "sighPresLimit" + - "lowIronOxLimit" + - "highIronOxLimit" + - "meanAirwayPresLimit" + - "PEEPLimit" + - "CPAPLimit" + - "setApneaInterval" + - "setApneaTV" + - "setApneaIPPEEPHigh" + - "setApneaRR" + - "setApneaPeakFlow" + - "setApneaInspTime" + - "setApneaIE" + - "setApneaFIO2" + warning_items: + - "We ignore the `priorVent*` columns -- this may be a mistake!" + - "There is a lot of data in this table -- what should be incorporated into the event structure?" + - "We might be able to use `priorVent` timestamps to further refine true season of unit admission." + +respiratoryCharting: + offset_col: + - "respChartOffset" + - "respChartEntryOffset" + pseudotime_col: + - "respChartPerformedTimestamp" + - "respChartEnteredTimestamp" + output_data_cols: + - "respChartID" + - "respChartTypeCat" + - "respChartValueLabel" + - "respChartValue" + warning_items: + - "SOME MAY BE LISTS" + +treatment: + offset_col: "treatmentOffset" + pseudotime_col: "treatmentEnteredTimestamp" + output_data_cols: + - "treatmentID" + - "treatmentString" + warning_items: + - "Absence of entries in table do not indicate absence of treatments" + +vitalAperiodic: + offset_col: "observationOffset" + pseudotime_col: "observationEnteredTimestamp" + output_data_cols: + - "vitalAperiodicID" + - "nonInvasiveSystolic" + - "nonInvasiveDiastolic" + - "nonInvasiveMean" + - "paop" + - "cardiacOutput" + - "cardiacInput" + - "svr" + - "svri" + - "pvr" + - "pvri" + +vitalPeriodic: + offset_col: "observationOffset" + pseudotime_col: "observationEnteredTimestamp" + output_data_cols: + - "vitalPeriodicID" + - "temperature" + - "saO2" + - "heartRate" + - "respiration" + - "cvp" + - "etCo2" + - "systemicSystolic" + - "systemicDiastolic" + - "systemicMean" + - "paSystolic" + - "paDiastolic" + - "paMean" + - "st1" + - "st2" + - "st3" + - "ICP" + warning_items: + - "These are 5-minute median values. There are going to be a *lot* of events." 
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index f02c0e8..1f7cab9 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -157,8 +157,8 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame def join_and_get_pseudotime_fntr( table_name: str, - offset_col: str, - pseudotime_col: str, + offset_col: str | list[str], + pseudotime_col: str | list[str], output_data_cols: list[str] | None = None, warning_items: list[str] | None = None, ) -> Callable[[pl.LazyFrame, pl.LazyFrame], pl.LazyFrame]: @@ -172,6 +172,18 @@ def join_and_get_pseudotime_fntr( if output_data_cols is None: output_data_cols = [] + if isinstance(offset_col, str): + offset_col = [offset_col] + if isinstance(pseudotime_col, str): + pseudotime_col = [pseudotime_col] + + if len(offset_col) != len(pseudotime_col): + raise ValueError( + "There must be the same number of `offset_col`s and `pseudotime_col`s specified. Got " + f"{len(offset_col)} and {len(pseudotime_col)}, respectively." + ) + + def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. @@ -179,7 +191,11 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: `configs/event_configs.yaml` file. """ - pseudotime = pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset_col)) + + pseudotimes = [ + (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) + for pseudotime, offset in zip(pseudotime_col, offset_col) + ] if warning_items: warning_lines = [ @@ -191,7 +207,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return df.join(patient_df, on=UNIT_STAY_ID, how="inner").select( HEALTH_SYSTEM_STAY_ID, UNIT_STAY_ID, - pseudotime.alias(pseudotime_col), + *pseudotimes, *output_data_cols, ) @@ -265,6 +281,12 @@ def main(cfg: DictConfig): 8. `microLab`: We don't use this because the culture taken time != culture result time, so seeing this data would give a model an advantage over any possible real-world implementation. Plus, the docs say it is not well populated. + 9. `note`: This table is largely duplicated with structured data due to the fact that primarily + narrative notes were removed due to PHI constraints (see the docs). + + There are other notes for this pipeline: + 1. Many fields here are, I believe, **lists**, not simple categoricals, and should be split and + processed accordingly. This is not yet done. Args (all as part of the config file): raw_cohort_dir: The directory containing the raw eICU files. From 26a386b31929100c60b863d8fbf44db2b7f59c97 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 15:47:02 -0400 Subject: [PATCH 23/47] Revised main script --- eICU_Example/pre_MEDS.py | 138 +++++++++++++-------------------------- 1 file changed, 46 insertions(+), 92 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 1f7cab9..73e774e 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -9,10 +9,9 @@ root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) import gzip -from collections.abc import Callable, Sequence +from collections.abc import Callable from datetime import datetime from pathlib import Path -from typing import NamedTuple import hydra import polars as pl @@ -183,7 +182,6 @@ def join_and_get_pseudotime_fntr( f"{len(offset_col)} and {len(pseudotime_col)}, respectively." 
) - def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: f"""Takes the {table_name} table and converts it to a form that includes pseudo-timestamps. @@ -191,7 +189,6 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: `configs/event_configs.yaml` file. """ - pseudotimes = [ (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) for pseudotime, offset in zip(pseudotime_col, offset_col) @@ -214,29 +211,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return fn -class PreProcessor(NamedTuple): - """A preprocessor function and its dependencies. - - Args: - function: TODO - dependencies: A two-element tuple containing the prefix of the dependent dataframe and a list of - columns needed from that dataframe. - """ - - function: Callable[[Sequence[pl.LazyFrame]], pl.LazyFrame] - dependencies: tuple[str, list[str]] - - NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] -PATIENT_DEPENDENCY = ("patient", NEEDED_PATIENT_COLS) - -# Generic "copy from patients" functions are stored in `configs/table_preprocessors.yaml` and loaded in -# `main`. -SPECIALTY_FUNCTIONS: dict[str, PreProcessor] = { - "patient": PreProcessor( - process_patient, ("hospital", ["hospitalid", "numbedscategory", "teachingstatus", "region"]) - ), -} @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") @@ -293,32 +268,59 @@ def main(cfg: DictConfig): output_dir: The directory to write the processed files to. """ - raise NotImplementedError("This script is not yet implemented for eICU.") - hydra_loguru_init() - functions = {**SPECIALTY_FUNCTIONS} - logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...") preprocessors = OmegaConf.load("configs/table_preprocessors.yaml") + functions = {} for table_name, preprocessor_cfg in preprocessors.items(): logger.info(f" Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}") - functions[table_name] = PreProcessor( - join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg), - PATIENT_DEPENDENCY, - ) + functions[table_name] = join_and_get_pseudotime_fntr(table_name=table_name, **preprocessor_cfg) raw_cohort_dir = Path(cfg.raw_cohort_dir) MEDS_input_dir = Path(cfg.output_dir) - all_fps = list(raw_cohort_dir.glob("**/*.csv.gz")) + logger.info("Processing patient table first...") - dfs_to_load = {} + hospital_fp = raw_cohort_dir / "hospital.csv.gz" + patient_fp = raw_cohort_dir / "patient.csv.gz" + logger.info(f"Loading {str(hospital_fp.resolve())}...") + hospital_df = load_raw_eicu_file( + hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] + ) + logger.info(f"Loading {str(patient_fp.resolve())}...") + raw_patient_df = load_raw_eicu_file(patient_fp) + + logger.info("Processing patient table...") + patient_df = process_patient(raw_patient_df, hospital_df) + write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") + + all_fps = [ + fp for fp in raw_cohort_dir.glob("*/.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} + ] + + unused_tables = { + "admissiondrug", + "apacheApsVar", + "apachePatientResult", + "apachePredVar", + "carePlanCareProvider", + "customLab", + "intakeOutput", + "microLab", + "note", + } for in_fp in all_fps: pfx = get_shard_prefix(raw_cohort_dir, in_fp) + if pfx in unused_tables: + logger.warning(f"Skipping {pfx} as it is not supported in this pipeline.") + continue + elif pfx not in functions: + logger.warning(f"No 
function needed for {pfx}. For eICU, THIS IS UNEXPECTED") + continue - out_fp = MEDS_input_dir / in_fp.relative_to(raw_cohort_dir) + out_fp = MEDS_input_dir / f"{pfx}.parquet" if out_fp.is_file(): print(f"Done with {pfx}. Continuing") @@ -326,63 +328,15 @@ def main(cfg: DictConfig): out_fp.parent.mkdir(parents=True, exist_ok=True) - if pfx not in functions: - logger.info( - f"No function needed for {pfx}: " - f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}" - ) - relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True) - out_fp.symlink_to(relative_in_fp) - continue - else: - out_fp = MEDS_input_dir / f"{pfx}.parquet" - if out_fp.is_file(): - print(f"Done with {pfx}. Continuing") - continue - - fn, need_df = functions[pfx] - if not need_df: - st = datetime.now() - logger.info(f"Processing {pfx}...") - df = load_raw_eicu_file(in_fp) - logger.info(f" Loaded raw {in_fp} in {datetime.now() - st}") - processed_df = fn(df) - write_lazyframe(processed_df, out_fp) - logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") - else: - needed_pfx, needed_cols = need_df - if needed_pfx not in dfs_to_load: - dfs_to_load[needed_pfx] = {"fps": set(), "cols": set()} - - dfs_to_load[needed_pfx]["fps"].add(in_fp) - dfs_to_load[needed_pfx]["cols"].update(needed_cols) - - for df_to_load_pfx, fps_and_cols in dfs_to_load.items(): - fps = fps_and_cols["fps"] - cols = list(fps_and_cols["cols"]) - - df_to_load_fp = raw_cohort_dir / f"{df_to_load_pfx}.csv.gz" + fn = functions[pfx] st = datetime.now() - - logger.info(f"Loading {str(df_to_load_fp.resolve())} for manipulating other dataframes...") - df = load_raw_eicu_file(df_to_load_fp, columns=cols) - logger.info(f" Loaded in {datetime.now() - st}") - - for fp in fps: - pfx = get_shard_prefix(raw_cohort_dir, fp) - out_fp = MEDS_input_dir / f"{pfx}.parquet" - - logger.info(f" Processing dependent df @ {pfx}...") - fn, _ = functions[pfx] - - fp_st = datetime.now() - logger.info(f" Loading {str(fp.resolve())}...") - fp_df = load_raw_eicu_file(fp) - logger.info(f" Loaded in {datetime.now() - fp_st}") - processed_df = fn(fp_df, df) - write_lazyframe(processed_df, out_fp) - logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - fp_st}") + logger.info(f"Processing {pfx}...") + df = load_raw_eicu_file(in_fp) + logger.info(f" * Loaded raw {in_fp} in {datetime.now() - st}") + processed_df = fn(df, patient_df) + write_lazyframe(processed_df, out_fp) + logger.info(f" * Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") logger.info(f"Done! 
All dataframes processed and written to {str(MEDS_input_dir.resolve())}") From e9000964f7f0a5a039bb10a8d9fece7e6debb087 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 15:56:01 -0400 Subject: [PATCH 24/47] Fixed a variety of lint errors --- MIMIC-IV_Example/README.md | 6 +- MIMIC-IV_Example/joint_script.sh | 18 +-- MIMIC-IV_Example/joint_script_slurm.sh | 102 ++++++++-------- eICU_Example/README.md | 9 +- eICU_Example/configs/event_configs.yaml | 51 ++++---- eICU_Example/configs/table_preprocessors.yaml | 20 +++- eICU_Example/joint_script.sh | 32 ++--- eICU_Example/joint_script_slurm.sh | 110 +++++++++--------- eICU_Example/sbatch_joint_script.sh | 24 ---- scripts/extraction/shard_events.py | 4 +- 10 files changed, 179 insertions(+), 197 deletions(-) delete mode 100644 eICU_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 4056319..406f1f2 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -71,6 +71,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`. @@ -127,11 +128,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./MIMIC-IV_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index 9d7ae69..eb58e89 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -8,21 +8,21 @@ N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
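+# Quoting "$MIMICIV_RAW_DIR" etc. below guards against word-splitting and globbing if the
+# supplied paths contain spaces or special characters.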
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir="$MIMICIV_RAW_DIR" output_dir="$MIMICIV_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" @@ -30,8 +30,8 @@ echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" @@ -39,6 +39,6 @@ echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 8ce85fb..3948e87 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./MIMIC-IV_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$MIMICIV_RAW_DIR" \ + output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$MIMICIV_PREMEDS_DIR" \ + cohort_dir="$MIMICIV_MEDS_DIR" \ + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/README.md b/eICU_Example/README.md index b23ae9e..2715613 100644 --- a/eICU_Example/README.md +++ b/eICU_Example/README.md @@ -69,6 +69,7 @@ root directory of this repository): In practice, on a machine with 150 GB of RAM and 10 cores, this 
step takes less than 5 minutes in total. ## Step 3: Run the MEDS extraction ETL + ### Running locally, serially We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`. @@ -125,11 +126,12 @@ and performance is not necessary; however, for larger datasets, it can be. ``` ### Running Locally, in Parallel. + This step is the exact same commands as above, but leverages Hydra's multirun capabilities with the `joblib` -launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e -.[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. +launcher. Install this package with the optional `local_parallelism` option (e.g., `pip install -e .[local_parallelism]` and run `./eICU_Example/joint_script.sh`. See that script for expected args. ### Running Each Step over Slurm + To use slurm, run each command with the number of workers desired using Hydra's multirun capabilities with the `submitit_slurm` launcher. Install this package with the optional `slurm_parallelism` option. See below for modified commands. Note these can't be chained in a single script as the jobs will not wait for all slurm jobs @@ -195,8 +197,7 @@ Currently, some tables are ignored, including: 1. `admissiondrug`: The [documentation](https://eicu-crd.mit.edu/eicutables/admissiondrug/) notes that this is extremely infrequently used, so we skip it. -2. - +2. Lots of questions remain about how to appropriately handle timestamps of the data -- e.g., things like HCPCS events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 7ac4225..50c8eb0 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -273,7 +273,6 @@ pastHistory: value: "pastHistoryValue" value_text: "pastHistoryValueText" - physicalExam: physical_exam_entered: code: @@ -286,7 +285,6 @@ physicalExam: path: "physicalExamPath" value: "physicalExamValue" - respiratoryCare: resp_care_status: code: @@ -296,32 +294,32 @@ respiratoryCare: timestamp: "respCareStatusEnteredTimestamp" resp_care_id: "respCareID" - airwayType: "airwayType" - airwaySize: "airwaySize" - airwayPosition: "airwayPosition" - cuffPressure: "cuffPressure" - apneaParams: "apneaParams" - lowExhMVLimit: "lowExhMVLimit" - hiExhMVLimit: "hiExhMVLimit" - lowExhTVLimit: "lowExhTVLimit" - hiPeakPresLimit: "hiPeakPresLimit" - lowPeakPresLimit: "lowPeakPresLimit" - hiRespRateLimit: "hiRespRateLimit" - lowRespRateLimit: "lowRespRateLimit" - sighPresLimit: "sighPresLimit" - lowIronOxLimit: "lowIronOxLimit" - highIronOxLimit: "highIronOxLimit" + airwayType: "airwayType" + airwaySize: "airwaySize" + airwayPosition: "airwayPosition" + cuffPressure: "cuffPressure" + apneaParams: "apneaParams" + lowExhMVLimit: "lowExhMVLimit" + hiExhMVLimit: "hiExhMVLimit" + lowExhTVLimit: "lowExhTVLimit" + hiPeakPresLimit: "hiPeakPresLimit" + lowPeakPresLimit: "lowPeakPresLimit" + hiRespRateLimit: "hiRespRateLimit" + lowRespRateLimit: "lowRespRateLimit" + sighPresLimit: "sighPresLimit" + lowIronOxLimit: "lowIronOxLimit" + highIronOxLimit: "highIronOxLimit" meanAirwayPresLimit: "meanAirwayPresLimit" - PEEPLimit: "PEEPLimit" - CPAPLimit: "CPAPLimit" - setApneaInterval: "setApneaInterval" - setApneaTV: "setApneaTV" + PEEPLimit: "PEEPLimit" + CPAPLimit: "CPAPLimit" + setApneaInterval: "setApneaInterval" + setApneaTV: "setApneaTV" 
setApneaIPPEEPHigh: "setApneaIPPEEPHigh" - setApneaRR: "setApneaRR" - setApneaPeakFlow: "setApneaPeakFlow" - setApneaInspTime: "setApneaInspTime" - setApneaIE: "setApneaIE" - setApneaFIO2: "setApneaFIO2" + setApneaRR: "setApneaRR" + setApneaPeakFlow: "setApneaPeakFlow" + setApneaInspTime: "setApneaInspTime" + setApneaIE: "setApneaIE" + setApneaFIO2: "setApneaFIO2" vent_start: code: @@ -339,7 +337,6 @@ respiratoryCare: timestamp: "ventEndTimestamp" resp_care_id: "respCareID" - respiratoryCharting: resp_charting_performed: code: diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index e7da494..3c3a9ca 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -2,7 +2,8 @@ admissiondx: offset_col: "admitDxEnteredOffset" pseudotime_col: "admitDxEnteredTimestamp" output_data_cols: ["admitDxName", "admitDxID"] - warning_items: ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] + warning_items: + ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] allergy: offset_col: "allergyEnteredOffset" @@ -12,8 +13,9 @@ allergy: - "How should we use `allergyNoteType`?" - "How should we use `specialtyType`?" - "How should we use `userType`?" - - |- Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient - (docs say \'name of the selected admission drug\')? + - >- + Is `drugName` the name of the drug to which the patient is allergic or the drug given to the patient + (docs say 'name of the selected admission drug')? carePlanGeneral: offset_col: "cplItemOffset" @@ -34,7 +36,13 @@ carePlanGoal: carePlanInfectiousDisease: offset_col: "cplInfectDiseaseOffset" pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp" - output_data_cols: ["infectDiseaseSite", "infectDiseaseAssessment", "responseToTherapy", "treatment"] + output_data_cols: + [ + "infectDiseaseSite", + "infectDiseaseAssessment", + "responseToTherapy", + "treatment", + ] diagonosis: offset_col: "diagnosisOffset" @@ -93,7 +101,7 @@ medication: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" nurseAssessment: - offset_col: + offset_col: - "nurseAssessOffset" - "nurseAssessEntryOffset" pseudotime_col: @@ -175,7 +183,7 @@ respiratoryCare: - "respCareStatusOffset" - "ventStartOffset" - "ventEndOffset" - pseudotime_col: + pseudotime_col: - "respCareStatusEnteredTimestamp" - "ventStartTimestamp" - "ventEndTimestamp" diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh index 9d7ae69..48cec46 100755 --- a/eICU_Example/joint_script.sh +++ b/eICU_Example/joint_script.sh @@ -1,44 +1,44 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" +EICU_RAW_DIR="$1" +EICU_PREMEDS_DIR="$2" +EICU_MEDS_DIR="$3" N_PARALLEL_WORKERS="$4" shift 4 echo "Running pre-MEDS conversion." 
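+# NOTE: this script was originally copied from the MIMIC-IV example; the changes below rename
+# the MIMICIV_* variables and MIMIC-IV paths to their eICU counterparts (and quote them).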
-./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/shard_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Splitting patients in serial" ./scripts/extraction/split_and_shard_patients.py \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/convert_to_sharded_events.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ --multirun \ worker="range(0,$N_PARALLEL_WORKERS)" \ hydra/launcher=joblib \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh index 8ce85fb..6b36ef0 100755 --- a/eICU_Example/joint_script_slurm.sh +++ b/eICU_Example/joint_script_slurm.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" +EICU_RAW_DIR="$1" +EICU_PREMEDS_DIR="$2" +EICU_MEDS_DIR="$3" N_PARALLEL_WORKERS="$4" shift 4 @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -# echo "Running pre-MEDS conversion on one worker." -# ./MIMIC-IV_Example/pre_MEDS.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# raw_cohort_dir=$MIMICIV_RAW_DIR \ -# output_dir=$MIMICIV_PREMEDS_DIR +echo "Running pre-MEDS conversion on one worker." +./eICU_Example/pre_MEDS.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + raw_cohort_dir="$EICU_RAW_DIR" \ + output_dir="$EICU_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -34,45 +34,45 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
hydra.launcher.mem_gb=50 \ hydra.launcher.partition="short" \ "hydra.job.env_copy=[PATH]" \ - input_dir=$MIMICIV_PREMEDS_DIR \ - cohort_dir=$MIMICIV_MEDS_DIR \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml -#echo "Splitting patients on one worker" -#./scripts/extraction/split_and_shard_patients.py \ -# --multirun \ -# worker="range(0,1)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" -# -#echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/merge_to_MEDS_cohort.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=submitit_slurm \ -# hydra.launcher.timeout_min=60 \ -# hydra.launcher.cpus_per_task=10 \ -# hydra.launcher.mem_gb=50 \ -# hydra.launcher.partition="short" \ -# input_dir=$MIMICIV_PREMEDS_DIR \ -# cohort_dir=$MIMICIV_MEDS_DIR \ -# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +echo "Splitting patients on one worker" +./scripts/extraction/split_and_shard_patients.py \ + --multirun \ + worker="range(0,1)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/merge_to_MEDS_cohort.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=submitit_slurm \ + hydra.launcher.timeout_min=60 \ + hydra.launcher.cpus_per_task=10 \ + hydra.launcher.mem_gb=50 \ + hydra.launcher.partition="short" \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" diff --git a/eICU_Example/sbatch_joint_script.sh b/eICU_Example/sbatch_joint_script.sh deleted file mode 100644 index e031363..0000000 --- a/eICU_Example/sbatch_joint_script.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash 
-#SBATCH -c 10 # Request one core
-#SBATCH -t 0-03:00 # Runtime in D-HH:MM format
-#SBATCH -p short # Partition to run in
-#SBATCH --mem=300GB # Memory total in MiB (for all cores)
-#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j)
-#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j)
-
-cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions
-
-MIMICIV_RAW_DIR="$1"
-MIMICIV_PREMEDS_DIR="$2"
-MIMICIV_MEDS_DIR="$3"
-N_PARALLEL_WORKERS="$4"
-
-LOG_DIR="$MIMICIV_MEDS_DIR/.logs"
-
-echo "Running with saving to $LOG_DIR"
-
-mkdir -p $LOG_DIR
-
-PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \
-    time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \
-    ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt
diff --git a/scripts/extraction/shard_events.py b/scripts/extraction/shard_events.py
index d0533e3..9ce0ac9 100755
--- a/scripts/extraction/shard_events.py
+++ b/scripts/extraction/shard_events.py
@@ -223,9 +223,7 @@ def main(cfg: DictConfig):
             seen_files.add(get_shard_prefix(raw_cohort_dir, f))

     if not input_files_to_subshard:
-        raise FileNotFoundError(
-            f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!"
-        )
+        raise FileNotFoundError(f"Can't find any files in {str(raw_cohort_dir.resolve())} to sub-shard!")

     random.shuffle(input_files_to_subshard)

From 2f92036ed0177ac5886cde3d46d35b27091c1bd9 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 15:56:37 -0400
Subject: [PATCH 25/47] Adjusted a tiny thing in the yaml

---
 eICU_Example/configs/table_preprocessors.yaml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml
index 3c3a9ca..da94e96 100644
--- a/eICU_Example/configs/table_preprocessors.yaml
+++ b/eICU_Example/configs/table_preprocessors.yaml
@@ -37,12 +37,10 @@ carePlanInfectiousDisease:
   offset_col: "cplInfectDiseaseOffset"
   pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp"
   output_data_cols:
-    [
-      "infectDiseaseSite",
-      "infectDiseaseAssessment",
-      "responseToTherapy",
-      "treatment",
-    ]
+    - "infectDiseaseSite"
+    - "infectDiseaseAssessment"
+    - "responseToTherapy"
+    - "treatment"

 diagonosis:
   offset_col: "diagnosisOffset"

From c482a7823518b6c5942a7fc9488b26f57d9ebe3b Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 16:08:11 -0400
Subject: [PATCH 26/47] Updated scripts to have help messages and to error if any internal piece errors.

---
 MIMIC-IV_Example/joint_script.sh       | 32 +++++++++++++++++++++++++
 MIMIC-IV_Example/joint_script_slurm.sh | 33 ++++++++++++++++++++++++++
 eICU_Example/joint_script.sh           | 32 +++++++++++++++++++++++++
 eICU_Example/joint_script_slurm.sh     | 33 ++++++++++++++++++++++++++
 eICU_Example/pre_MEDS.py               |  5 ++--
 5 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh
index eb58e89..d3e067f 100755
--- a/MIMIC-IV_Example/joint_script.sh
+++ b/MIMIC-IV_Example/joint_script.sh
@@ -1,5 +1,37 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <MIMICIV_RAW_DIR> <MIMICIV_PREMEDS_DIR> <MIMICIV_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_RAW_DIR        Directory containing raw MIMIC-IV data files."
+    echo "  MIMICIV_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  MIMICIV_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS     Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 MIMICIV_RAW_DIR="$1"
 MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh
index 3948e87..9d9ec0a 100755
--- a/MIMIC-IV_Example/joint_script_slurm.sh
+++ b/MIMIC-IV_Example/joint_script_slurm.sh
@@ -1,5 +1,38 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <MIMICIV_RAW_DIR> <MIMICIV_PREMEDS_DIR> <MIMICIV_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo "This script uses slurm to process the data in parallel via the 'submitit' Hydra launcher."
+    echo
+    echo "Arguments:"
+    echo "  MIMICIV_RAW_DIR        Directory containing raw MIMIC-IV data files."
+    echo "  MIMICIV_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  MIMICIV_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS     Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 MIMICIV_RAW_DIR="$1"
 MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 48cec46..4445f49 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -1,5 +1,37 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes eICU data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
+    echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 EICU_RAW_DIR="$1"
 EICU_PREMEDS_DIR="$2"
 EICU_MEDS_DIR="$3"
diff --git a/eICU_Example/joint_script_slurm.sh b/eICU_Example/joint_script_slurm.sh
index 6b36ef0..7880286 100755
--- a/eICU_Example/joint_script_slurm.sh
+++ b/eICU_Example/joint_script_slurm.sh
@@ -1,5 +1,38 @@
 #!/usr/bin/env bash

+# This makes the script fail if any internal script fails
+set -e
+
+# Function to display help message
+function display_help() {
+    echo "Usage: $0 <EICU_RAW_DIR> <EICU_PREMEDS_DIR> <EICU_MEDS_DIR> <N_PARALLEL_WORKERS>"
+    echo
+    echo "This script processes eICU data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting patients, converting to sharded events, and merging into a MEDS cohort."
+    echo "This script uses slurm to process the data in parallel via the 'submitit' Hydra launcher."
+    echo
+    echo "Arguments:"
+    echo "  EICU_RAW_DIR        Directory containing raw eICU data files."
+    echo "  EICU_PREMEDS_DIR    Output directory for pre-MEDS data."
+    echo "  EICU_MEDS_DIR       Output directory for processed MEDS data."
+    echo "  N_PARALLEL_WORKERS  Number of parallel workers for processing."
+    echo
+    echo "Options:"
+    echo "  -h, --help          Display this help message and exit."
+    exit 1
+}
+
+# Check if the first parameter is '-h' or '--help'
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help
+fi
+
+# Check for mandatory parameters
+if [ "$#" -ne 4 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help
+fi
+
 EICU_RAW_DIR="$1"
 EICU_PREMEDS_DIR="$2"
 EICU_MEDS_DIR="$3"
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py
index 73e774e..694f1dd 100755
--- a/eICU_Example/pre_MEDS.py
+++ b/eICU_Example/pre_MEDS.py
@@ -270,8 +270,9 @@ def main(cfg: DictConfig):

     hydra_loguru_init()

-    logger.info("Loading table preprocessors from configs/table_preprocessors.yaml...")
-    preprocessors = OmegaConf.load("configs/table_preprocessors.yaml")
+    table_preprocessors_config_fp = Path("./eICU_Example/configs/table_preprocessors.yaml")
+    logger.info(f"Loading table preprocessors from {str(table_preprocessors_config_fp.resolve())}...")
+    preprocessors = OmegaConf.load(table_preprocessors_config_fp)
     functions = {}
     for table_name, preprocessor_cfg in preprocessors.items():
         logger.info(f"  Adding preprocessor for {table_name}:\n{OmegaConf.to_yaml(preprocessor_cfg)}")

From e80be1fde249d492a4d91d138102c71775b125f9 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 1 Jun 2024 16:28:20 -0400
Subject: [PATCH 27/47] Every column in the raw files should apparently be lowercase...
also other typos --- eICU_Example/configs/table_preprocessors.yaml | 268 +++++++++--------- eICU_Example/pre_MEDS.py | 125 ++++---- 2 files changed, 208 insertions(+), 185 deletions(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index da94e96..7c3316f 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -1,14 +1,14 @@ admissiondx: - offset_col: "admitDxEnteredOffset" + offset_col: "admitdxenteredoffset" pseudotime_col: "admitDxEnteredTimestamp" - output_data_cols: ["admitDxName", "admitDxID"] + output_data_cols: ["admitdxname", "admitdxid"] warning_items: - ["How should we use `admitDxTest`?", "How should we use `admitDxPath`?"] + ["How should we use `admitdxtest`?", "How should we use `admitdxpath`?"] allergy: - offset_col: "allergyEnteredOffset" + offset_col: "allergyenteredoffset" pseudotime_col: "allergyEnteredTimestamp" - output_data_cols: ["allergyType", "allergyName"] + output_data_cols: ["allergytype", "allergyname"] warning_items: - "How should we use `allergyNoteType`?" - "How should we use `specialtyType`?" @@ -18,201 +18,201 @@ allergy: (docs say 'name of the selected admission drug')? carePlanGeneral: - offset_col: "cplItemOffset" + offset_col: "cplitemoffset" pseudotime_col: "carePlanGeneralItemEnteredTimestamp" - output_data_cols: ["cplGroup", "cplItemValue"] + output_data_cols: ["cplgroup", "cplitemvalue"] carePlanEOL: - offset_col: "cplEolDiscussionOffset" + offset_col: "cpleoldiscussionoffset" pseudotime_col: "carePlanEolDiscussionOccurredTimestamp" warning_items: - "Is the DiscussionOffset time actually reliable? Should we fall back on the SaveOffset time?" carePlanGoal: - offset_col: "cplGoalOffset" + offset_col: "cplgoaloffset" pseudotime_col: "carePlanGoalEnteredTimestamp" - output_data_cols: ["cplGoalCategory", "cplGoalValue", "cplGoalStatus"] + output_data_cols: ["cplgoalcategory", "cplgoalvalue", "cplgoalstatus"] carePlanInfectiousDisease: - offset_col: "cplInfectDiseaseOffset" + offset_col: "cplinfectdiseaseoffset" pseudotime_col: "carePlanInfectDiseaseEnteredTimestamp" output_data_cols: - - "infectDiseaseSite" - - "infectDiseaseAssessment" - - "responseToTherapy" + - "infectdiseasesite" + - "infectdiseaseassessment" + - "responsetotherapy" - "treatment" diagonosis: - offset_col: "diagnosisOffset" + offset_col: "diagnosisoffset" pseudotime_col: "diagnosisEnteredTimestamp" - output_data_cols: ["ICD9Code", "diagnosisPriority", "diagnosisString"] + output_data_cols: ["icd9code", "diagnosispriority", "diagnosisstring"] warning_items: - "Though we use it, the `diagnosisString` field documentation is unclear -- by what is it separated?" infusionDrug: - offset_col: "infusionOffset" + offset_col: "infusionoffset" pseudotime_col: "infusionEnteredTimestamp" output_data_cols: - - "infusionDrugID" - - "drugName" - - "drugRate" - - "infusionRate" - - "drugAmount" - - "volumeOfFluid" - - "patientWeight" + - "infusiondrugid" + - "drugname" + - "drugrate" + - "infusionrate" + - "drugamount" + - "volumeoffluid" + - "patientweight" lab: - offset_col: "labResultOffset" + offset_col: "labresultoffset" pseudotime_col: "labResultDrawnTimestamp" output_data_cols: - - "labName" - - "labResult" - - "labResultText" - - "labMeasureNameSystem" - - "labMeasureNameInterface" - - "labTypeID" + - "labname" + - "labresult" + - "labresulttext" + - "labmeasurenamesystem" + - "labmeasurenameinterface" + - "labtypeid" warning_items: - "Is this the time the lab was drawn? 
Entered? The time the result came in?" - "We **IGNORE** the `labResultRevisedOffset` column -- this may be a mistake!" medication: offset_col: - - "drugOrderOffset" - - "drugStartOffset" - - "drugStopOffset" + - "drugorderoffset" + - "drugstartoffset" + - "drugstopoffset" pseudotime_col: - - "drugOrderTimestamp" - - "drugStartTimestamp" - - "drugStopTimestamp" + - "drugordertimestamp" + - "drugstarttimestamp" + - "drugstoptimestamp" output_data_cols: - - "medicationID" - - "drugIVAdmixture" - - "drugName" - - "drugHiclSeqno" + - "medicationid" + - "drugivadmixture" + - "drugname" + - "drughiclseqno" - "dosage" - - "routeAdmin" + - "routeadmin" - "frequency" - - "loadingDose" - - "PRN" - - "GTC" + - "loadingdose" + - "prn" + - "gtc" warning_items: - "We **IGNORE** the `drugOrderCancelled` column -- this may be a mistake!" nurseAssessment: offset_col: - - "nurseAssessOffset" - - "nurseAssessEntryOffset" + - "nurseassessoffset" + - "nurseassessentryoffset" pseudotime_col: - "nurseAssessPerformedTimestamp" - "nurseAssessEnteredTimestamp" output_data_cols: - - "nurseAssessID" - - "cellLabel" - - "cellAttribute" - - "cellAttributeValue" + - "nurseassessid" + - "celllabel" + - "cellattribute" + - "cellattributevalue" warning_items: - "Should we be using `cellAttributePath` instead of `cellAttribute`?" - "SOME MAY BE LISTS" nurseCare: offset_col: - - "nurseCareOffset" - - "nurseCareEntryOffset" + - "nursecareoffset" + - "nursecareentryoffset" pseudotime_col: - "nurseCarePerformedTimestamp" - "nurseCareEnteredTimestamp" output_data_cols: - - "nurseCareID" - - "cellLabel" - - "cellAttribute" - - "cellAttributeValue" + - "nursecareid" + - "celllabel" + - "cellattribute" + - "cellattributevalue" warning_items: - "Should we be using `cellAttributePath` instead of `cellAttribute`?" - "SOME MAY BE LISTS" nurseCharting: offset_col: - - "nursingChartOffset" - - "nursingChartEntryOffset" + - "nursingchartoffset" + - "nursingchartentryoffset" pseudotime_col: - "nursingChartPerformedTimestamp" - "nursingChartEnteredTimestamp" output_data_cols: - - "nursingChartID" - - "nursingChartCellTypeCat" - - "nursingChartCellTypeValName" - - "nursingChartCellTypeValLabel" - - "nursingChartValue" + - "nursingchartid" + - "nursingchartcelltypecat" + - "nursingchartcelltypevalname" + - "nursingchartcelltypevallabel" + - "nursingchartvalue" warning_items: - "SOME MAY BE LISTS" pastHistory: offset_col: - - "pastHistoryOffset" - - "pastHistoryEnteredOffset" + - "pasthistoryoffset" + - "pasthistoryenteredoffset" pseudotime_col: - "pastHistoryTakenTimestamp" - "pastHistoryEnteredTimestamp" output_data_cols: - - "pastHistoryID" - - "pastHistoryNoteType" - - "pastHistoryPath" - - "pastHistoryValue" - - "pastHistoryValueText" + - "pasthistoryid" + - "pasthistorynotetype" + - "pasthistorypath" + - "pasthistoryvalue" + - "pasthistoryvaluetext" warning_items: - "SOME MAY BE LISTS" - "How should we use `pastHistoryPath` vs. `pastHistoryNoteType`?" - "How should we use `pastHistoryValue` vs. `pastHistoryValueText`?" physicalExam: - offset_col: "physicalExamOffset" + offset_col: "physicalexamoffset" pseudotime_col: "physicalExamEnteredTimestamp" output_data_cols: - - "physicalExamID" - - "physicalExamText" - - "physicalExamPath" - - "physicalExamValue" + - "physicalexamid" + - "physicalexamtext" + - "physicalexampath" + - "physicalexamvalue" warning_items: - "How should we use `physicalExamValue` vs. `physicalExamText`?" - "I believe the `physicalExamValue` is a **LIST**. This must be processed specially." 
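+# NOTE: only *input* column names are lowercased in this change -- the raw eICU CSVs use
+# all-lowercase headers, unlike the docs. The `pseudotime_col` names are new columns created
+# by the ETL itself, so they keep their mixed-case spelling.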
respiratoryCare: offset_col: - - "respCareStatusOffset" - - "ventStartOffset" - - "ventEndOffset" + - "respcarestatusoffset" + - "ventstartoffset" + - "ventendoffset" pseudotime_col: - "respCareStatusEnteredTimestamp" - "ventStartTimestamp" - "ventEndTimestamp" output_data_cols: - - "respCareID" - - "airwayType" - - "airwaySize" - - "airwayPosition" - - "cuffPressure" - - "apneaParams" - - "lowExhMVLimit" - - "hiExhMVLimit" - - "lowExhTVLimit" - - "hiPeakPresLimit" - - "lowPeakPresLimit" - - "hiRespRateLimit" - - "lowRespRateLimit" - - "sighPresLimit" - - "lowIronOxLimit" - - "highIronOxLimit" - - "meanAirwayPresLimit" - - "PEEPLimit" - - "CPAPLimit" - - "setApneaInterval" - - "setApneaTV" - - "setApneaIPPEEPHigh" - - "setApneaRR" - - "setApneaPeakFlow" - - "setApneaInspTime" - - "setApneaIE" - - "setApneaFIO2" + - "respcareid" + - "airwaytype" + - "airwaysize" + - "airwayposition" + - "cuffpressure" + - "apneaparams" + - "lowexhmvlimit" + - "hiexhmvlimit" + - "lowexhtvlimit" + - "hipeakpreslimit" + - "lowpeakpreslimit" + - "hirespratelimit" + - "lowrespratelimit" + - "sighpreslimit" + - "lowironoxlimit" + - "highironoxlimit" + - "meanairwaypreslimit" + - "peeplimit" + - "cpaplimit" + - "setapneainterval" + - "setapneatv" + - "setapneaippeephigh" + - "setapnearr" + - "setapneapeakflow" + - "setapneainsptime" + - "setapneaie" + - "setapneafio2" warning_items: - "We ignore the `priorVent*` columns -- this may be a mistake!" - "There is a lot of data in this table -- what should be incorporated into the event structure?" @@ -220,64 +220,64 @@ respiratoryCare: respiratoryCharting: offset_col: - - "respChartOffset" - - "respChartEntryOffset" + - "respchartoffset" + - "respchartentryoffset" pseudotime_col: - "respChartPerformedTimestamp" - "respChartEnteredTimestamp" output_data_cols: - - "respChartID" - - "respChartTypeCat" - - "respChartValueLabel" - - "respChartValue" + - "respchartid" + - "respcharttypecat" + - "respchartvaluelabel" + - "respchartvalue" warning_items: - "SOME MAY BE LISTS" treatment: - offset_col: "treatmentOffset" + offset_col: "treatmentoffset" pseudotime_col: "treatmentEnteredTimestamp" output_data_cols: - - "treatmentID" - - "treatmentString" + - "treatmentid" + - "treatmentstring" warning_items: - "Absence of entries in table do not indicate absence of treatments" vitalAperiodic: - offset_col: "observationOffset" + offset_col: "observationoffset" pseudotime_col: "observationEnteredTimestamp" output_data_cols: - - "vitalAperiodicID" - - "nonInvasiveSystolic" - - "nonInvasiveDiastolic" - - "nonInvasiveMean" + - "vitalaperiodicid" + - "noninvasivesystolic" + - "noninvasivediastolic" + - "noninvasivemean" - "paop" - - "cardiacOutput" - - "cardiacInput" + - "cardiacoutput" + - "cardiacinput" - "svr" - "svri" - "pvr" - "pvri" vitalPeriodic: - offset_col: "observationOffset" + offset_col: "observationoffset" pseudotime_col: "observationEnteredTimestamp" output_data_cols: - - "vitalPeriodicID" + - "vitalperiodicid" - "temperature" - - "saO2" - - "heartRate" + - "sao2" + - "heartrate" - "respiration" - "cvp" - - "etCo2" - - "systemicSystolic" - - "systemicDiastolic" - - "systemicMean" - - "paSystolic" - - "paDiastolic" - - "paMean" + - "etco2" + - "systemicsystolic" + - "systemicdiastolic" + - "systemicmean" + - "pasystolic" + - "padiastolic" + - "pamean" - "st1" - "st2" - "st3" - - "ICP" + - "icp" warning_items: - "These are 5-minute median values. There are going to be a *lot* of events." 
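Rewriting every config string by hand works, but the same normalization could in principle be done once
at load time instead. A hypothetical sketch (not what this patch does) that lowercases all column names
of a scanned table, assuming a plain CSV path:

```python
import polars as pl


def scan_with_lowercase_columns(fp: str) -> pl.LazyFrame:
    """Hypothetical helper: scan a CSV and normalize all of its column names to lowercase,
    so that downstream configs need not care about the casing of the raw headers."""
    lf = pl.scan_csv(fp, infer_schema_length=10000)
    return lf.rename({c: c.lower() for c in lf.columns})
```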
diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index 694f1dd..a14d36e 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -24,8 +24,8 @@ write_lazyframe, ) -HEALTH_SYSTEM_STAY_ID = "patientHealthSystemStayID" -UNIT_STAY_ID = "patientUnitStayID" +HEALTH_SYSTEM_STAY_ID = "patienthealthsystemstayid" +UNIT_STAY_ID = "patientunitstayid" PATIENT_ID = "uniquepid" # The end of year date, used for year-only timestamps in eICU. The time is set to midnight as we'll add a @@ -48,9 +48,19 @@ def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str): + """Checks that the time-of-day portions agree between the pseudotime and given columns. + + Raises a `ValueError` if the times don't match within a minute. + + Args: + TODO + """ expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S") - time_deltas_min = (pseudotime_col.dt.time() - expected_time).dt.total_minutes() + # The use of `.dt.combine` here re-sets the "time-of-day" of the pseudotime_col column + time_deltas_min = ( + pseudotime_col - pseudotime_col.dt.combine(expected_time) + ).dt.total_minutes() # Check that the time deltas are all within 1 minute logger.info( @@ -76,19 +86,25 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame `configs/event_configs.yaml` file. """ - hospital_discharge_pseudotime = pl.datetime(year=pl.col("hospitalDischargeYear"), **END_OF_YEAR) + pl.col( - "hospitalDischargeTime24" - ).str.strptime(pl.Time, "%H:%M:%S") + hospital_discharge_pseudotime = ( + pl.datetime(year=pl.col("hospitaldischargeyear"), **END_OF_YEAR).dt.combine( + pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S") + ) + ) - unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( - minutes=pl.col("hospitalDischargeOffset") + unit_admit_pseudotime = ( + hospital_discharge_pseudotime - pl.duration(minutes=pl.col("hospitaldischargeoffset")) ) - unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitDischargeOffset")) + unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitdischargeoffset")) - hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitalAdmitOffset")) + hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitaladmitoffset")) - age_in_years = pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16)) + age_in_years = ( + pl.when(pl.col("age") == "> 89") + .then(90) + .otherwise(pl.col("age").cast(pl.UInt16, strict=False)) + ) age_in_days = age_in_years * 365.25 # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate pseudo_date_of_birth = unit_admit_pseudotime - pl.duration(days=(age_in_days - 365.25 / 2)) @@ -99,10 +115,10 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame "Checking that the 24h times are consistent. If this is extremely slow, consider refactoring to have " "only one `.collect()` call." 
) - check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitalDischargeTime24") - check_timestamps_agree(df, hospital_admit_pseudotime, "hospitalAdmitTime24") - check_timestamps_agree(df, unit_admit_pseudotime, "unitAdmitTime24") - check_timestamps_agree(df, unit_discharge_pseudotime, "unitDischargeTime24") + check_timestamps_agree(df, hospital_discharge_pseudotime, "hospitaldischargetime24") + check_timestamps_agree(df, hospital_admit_pseudotime, "hospitaladmittime24") + check_timestamps_agree(df, unit_admit_pseudotime, "unitadmittime24") + check_timestamps_agree(df, unit_discharge_pseudotime, "unitdischargetime24") logger.info(f"Validated 24h times in {datetime.now() - start}") logger.warning("NOT validating the `unitVisitNumber` column as that isn't implemented yet.") @@ -116,41 +132,42 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame " - `age` is interpreted as the age at the time of the unit stay, not the hospital stay. " "Is this right?\n" " - `What is the actual mean age for those > 89? Here we assume 90.\n" + " - Note that all the column names appear to be all in lowercase for the csv versions, vs. the docs" ) - return df.join(hospital_df, left_on="hospitalID", right_on="hospitalid", how="left").select( + return df.join(hospital_df, left_on="hospitalid", right_on="hospitalid", how="left").select( # 1. Static variables PATIENT_ID, "gender", - pseudo_date_of_birth.alias("dateOfBirth"), + pseudo_date_of_birth.alias("dateofbirth"), "ethnicity", # 2. Health system stay parameters HEALTH_SYSTEM_STAY_ID, - "hospitalID", - pl.col("numbedscategory").alias("hospitalNumBedsCategory"), - pl.col("teachingstatus").alias("hospitalTeachingStatus"), - pl.col("region").alias("hospitalRegion"), + "hospitalid", + pl.col("numbedscategory").alias("hospitalnumbedscategory"), + pl.col("teachingstatus").alias("hospitalteachingstatus"), + pl.col("region").alias("hospitalregion"), # 2.1 Admission parameters - hospital_admit_pseudotime.alias("hospitalAdmitTimestamp"), - "hospitalAdmitSource", + hospital_admit_pseudotime.alias("hospitaladmittimestamp"), + "hospitaladmitsource", # 2.2 Discharge parameters - hospital_discharge_pseudotime.alias("hospitalDischargeTimestamp"), - "hospitalDischargeLocation", - "hospitalDischargeStatus", + hospital_discharge_pseudotime.alias("hospitaldischargetimestamp"), + "hospitaldischargelocation", + "hospitaldischargestatus", # 3. 
Unit stay parameters UNIT_STAY_ID, - "wardID", + "wardid", # 3.1 Admission parameters - unit_admit_pseudotime.alias("unitAdmitTimestamp"), - "unitAdmitSource", - "unitStayType", - pl.col("admissionHeight").alias("unitAdmissionHeight"), - pl.col("admissionWeight").alias("unitAdmissionWeight"), + unit_admit_pseudotime.alias("unitadmittimestamp"), + "unitadmitsource", + "unitstaytype", + pl.col("admissionheight").alias("unitadmissionheight"), + pl.col("admissionweight").alias("unitadmissionweight"), # 3.2 Discharge parameters - unit_discharge_pseudotime.alias("unitDischargeTimestamp"), - "unitDischargeLocation", - "unitDischargeStatus", - pl.col("dischargeWeight").alias("unitDischargeWeight"), + unit_discharge_pseudotime.alias("unitdischargetimestamp"), + "unitdischargelocation", + "unitdischargestatus", + pl.col("dischargeweight").alias("unitdischargeweight"), ) @@ -190,7 +207,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: """ pseudotimes = [ - (pl.col("unitAdmitTimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) + (pl.col("unitadmittimestamp") + pl.duration(minutes=pl.col(offset))).alias(pseudotime) for pseudotime, offset in zip(pseudotime_col, offset_col) ] @@ -211,7 +228,7 @@ def fn(df: pl.LazyFrame, patient_df: pl.LazyFrame) -> pl.LazyFrame: return fn -NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitAdmitTimestamp"] +NEEDED_PATIENT_COLS = [UNIT_STAY_ID, HEALTH_SYSTEM_STAY_ID, "unitadmittimestamp"] @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") @@ -281,23 +298,29 @@ def main(cfg: DictConfig): raw_cohort_dir = Path(cfg.raw_cohort_dir) MEDS_input_dir = Path(cfg.output_dir) - logger.info("Processing patient table first...") + patient_out_fp = MEDS_input_dir / "patient.parquet" - hospital_fp = raw_cohort_dir / "hospital.csv.gz" - patient_fp = raw_cohort_dir / "patient.csv.gz" - logger.info(f"Loading {str(hospital_fp.resolve())}...") - hospital_df = load_raw_eicu_file( - hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] - ) - logger.info(f"Loading {str(patient_fp.resolve())}...") - raw_patient_df = load_raw_eicu_file(patient_fp) + if patient_out_fp.is_file(): + logger.info(f"Reloading processed patient df from {str(patient_out_fp.resolve())}") + patient_df = pl.read_parquet(patient_out_fp, columns=NEEDED_PATIENT_COLS, use_pyarrow=True).lazy() + else: + logger.info("Processing patient table first...") + + hospital_fp = raw_cohort_dir / "hospital.csv.gz" + patient_fp = raw_cohort_dir / "patient.csv.gz" + logger.info(f"Loading {str(hospital_fp.resolve())}...") + hospital_df = load_raw_eicu_file( + hospital_fp, columns=["hospitalid", "numbedscategory", "teachingstatus", "region"] + ) + logger.info(f"Loading {str(patient_fp.resolve())}...") + raw_patient_df = load_raw_eicu_file(patient_fp) - logger.info("Processing patient table...") - patient_df = process_patient(raw_patient_df, hospital_df) - write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") + logger.info("Processing patient table...") + patient_df = process_patient(raw_patient_df, hospital_df) + write_lazyframe(patient_df, MEDS_input_dir / "patient.parquet") all_fps = [ - fp for fp in raw_cohort_dir.glob("*/.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} + fp for fp in raw_cohort_dir.glob("*.csv.gz") if fp.name not in {"hospital.csv.gz", "patient.csv.gz"} ] unused_tables = { From 9ad4b924884c882c15a93931eb2bebdc229d0ccd Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 
16:29:00 -0400 Subject: [PATCH 28/47] Fixing a typo in config for diagnosis --- eICU_Example/configs/table_preprocessors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 7c3316f..3fe62f3 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -42,7 +42,7 @@ carePlanInfectiousDisease: - "responsetotherapy" - "treatment" -diagonosis: +diagnosis: offset_col: "diagnosisoffset" pseudotime_col: "diagnosisEnteredTimestamp" output_data_cols: ["icd9code", "diagnosispriority", "diagnosisstring"] From 9ced80f81e708fb2474d2ff53cd130dd35af6d2e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 16:35:42 -0400 Subject: [PATCH 29/47] Fixed numerous typos and issues. Makes it through much of the files now in the pre-MEDS stage --- eICU_Example/configs/event_configs.yaml | 112 +++++++++--------- eICU_Example/configs/table_preprocessors.yaml | 2 +- eICU_Example/pre_MEDS.py | 2 +- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 50c8eb0..6ac7ab9 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -17,23 +17,23 @@ patient: hosp_admission: code: - "HOSPITAL_ADMISSION" - - col("hospitalAdmitSource") - - col("hospitalRegion") - - col("hospitalTeachingStatus") - - col("hospitalNumBedsCategory") + - col(hospitaladmitsource) + - col(hospitalregion) + - col(hospitalteachingstatus) + - col(hospitalnumbedscategory) timestamp: "hospitalAdmitTimestamp" hospital_id: "hospitalID" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - - col("hospitalDischargeStatus") - - col("hospitalDischargeLocation") + - col(hospitaldischargestatus) + - col(hospitaldischargelocation) timestamp: "hospitalDischargeTimestamp" unit_admission: code: - "UNIT_ADMISSION" - - col("unitAdmitSource") - - col("unitStayType") + - col(unitadmitsource) + - col(unitstaytype) timestamp: "unitAdmitTimestamp" ward_id: "wardID" unit_stay_id: "patientUnitStayID" @@ -50,8 +50,8 @@ patient: unit_discharge: code: - "UNIT_DISCHARGE" - - col("unitDischargeStatus") - - col("unitDischargeLocation") + - col(unitdischargestatus) + - col(unitdischargelocation) timestamp: "unitDischargeTimestamp" unit_discharge_weight: code: @@ -63,7 +63,7 @@ admissiondx: admission_diagnosis: code: - "ADMISSION_DX" - - col("admitDxName") + - col(admitdxname) timestamp: "admitDxEnteredTimestamp" admission_dx_id: "admitDxID" unit_stay_id: "patientUnitStayID" @@ -72,16 +72,16 @@ allergy: allergy: code: - "ALLERGY" - - col("allergyType") - - col("allergyName") + - col(allergytype) + - col(allergyname) timestamp: "allergyEnteredTimestamp" carePlanGeneral: cplItem: code: - "CAREPLAN_GENERAL" - - col("cplGroup") - - col("cplItemValue") + - col(cplgroup) + - col(cplitemvalue) timestamp: "carePlanGeneralItemEnteredTimestamp" carePlanEOL: @@ -94,27 +94,27 @@ carePlanGoal: cplGoal: code: - "CAREPLAN_GOAL" - - col("cplGoalCategory") - - col("cplGoalValue") - - col("cplGoalStatus") + - col(cplgoalcategory) + - col(cplgoalvalue) + - col(cplgoalstatus) timestamp: "carePlanGoalEnteredTimestamp" carePlanInfectiousDisease: cplInfectDisease: code: - "CAREPLAN_INFECTIOUS_DISEASE" - - col("infectDiseaseSite") - - col("infectDiseaseAssessment") - - col("treatment") - - col("responseToTherapy") + - col(infectdiseasesite) + - col(infectdiseaseassessment) + - 
col(treatment) + - col(responsetotherapy) timestamp: "carePlanInfectDiseaseEnteredTimestamp" diagnosis: diagnosis: code: - "ICD9CM" - - col("ICD9Code") - - col("diagnosisPriority") + - col(icd9code) + - col(diagnosispriority) timestamp: "diagnosisEnteredTimestamp" diagnosis_string: "diagnosisString" @@ -122,8 +122,8 @@ infusionDrug: infusion: code: - "INFUSION" - - col("infusionDrugID") - - col("drugName") + - col(infusiondrugid) + - col(drugname) timestamp: "infusionEnteredTimestamp" drug_rate: "drugRate" infusion_rate: "infusionRate" @@ -139,9 +139,9 @@ lab: lab: code: - "LAB" - - col("labMeasureNameSystem") - - col("labMeasureNameInterface") - - col("labName") + - col(labmeasurenamesystem) + - col(labmeasurenameinterface) + - col(labname) timestamp: "labResultDrawnTimestamp" numerical_value: "labResult" text_value: "labResultText" @@ -294,32 +294,32 @@ respiratoryCare: timestamp: "respCareStatusEnteredTimestamp" resp_care_id: "respCareID" - airwayType: "airwayType" - airwaySize: "airwaySize" - airwayPosition: "airwayPosition" - cuffPressure: "cuffPressure" - apneaParams: "apneaParams" - lowExhMVLimit: "lowExhMVLimit" - hiExhMVLimit: "hiExhMVLimit" - lowExhTVLimit: "lowExhTVLimit" - hiPeakPresLimit: "hiPeakPresLimit" - lowPeakPresLimit: "lowPeakPresLimit" - hiRespRateLimit: "hiRespRateLimit" - lowRespRateLimit: "lowRespRateLimit" - sighPresLimit: "sighPresLimit" - lowIronOxLimit: "lowIronOxLimit" - highIronOxLimit: "highIronOxLimit" - meanAirwayPresLimit: "meanAirwayPresLimit" - PEEPLimit: "PEEPLimit" - CPAPLimit: "CPAPLimit" - setApneaInterval: "setApneaInterval" - setApneaTV: "setApneaTV" - setApneaIPPEEPHigh: "setApneaIPPEEPHigh" - setApneaRR: "setApneaRR" - setApneaPeakFlow: "setApneaPeakFlow" - setApneaInspTime: "setApneaInspTime" - setApneaIE: "setApneaIE" - setApneaFIO2: "setApneaFIO2" + airwaytype: "airwaytype" + airwaysize: "airwaysize" + airwayposition: "airwayposition" + cuffpressure: "cuffpressure" + apneaparms: "apneaparms" + lowexhmvlimit: "lowexhmvlimit" + hiexhmvlimit: "hiexhmvlimit" + lowexhtvlimit: "lowexhtvlimit" + hipeakpreslimit: "hipeakpreslimit" + lowpeakpreslimit: "lowpeakpreslimit" + hirespratelimit: "hirespratelimit" + lowrespratelimit: "lowrespratelimit" + sighpreslimit: "sighpreslimit" + lowironoxlimit: "lowironoxlimit" + highironoxlimit: "highironoxlimit" + meanairwaypreslimit: "meanairwaypreslimit" + peeplimit: "peeplimit" + cpaplimit: "cpaplimit" + setapneainterval: "setapneainterval" + setapneatv: "setapneatv" + setapneaippeephigh: "setapneaippeephigh" + setapnearr: "setapnearr" + setapneapeakflow: "setapneapeakflow" + setapneainsptime: "setapneainsptime" + setapneaie: "setapneaie" + setapneafio2: "setapneafio2" vent_start: code: @@ -365,7 +365,7 @@ treatment: code: - "TREATMENT" - "ENTERED" - - col("treatmentString") + - col(treatmentstring) timestamp: "treatmentEnteredTimestamp" treatment_id: "treatmentID" diff --git a/eICU_Example/configs/table_preprocessors.yaml b/eICU_Example/configs/table_preprocessors.yaml index 3fe62f3..3faf4aa 100644 --- a/eICU_Example/configs/table_preprocessors.yaml +++ b/eICU_Example/configs/table_preprocessors.yaml @@ -191,7 +191,7 @@ respiratoryCare: - "airwaysize" - "airwayposition" - "cuffpressure" - - "apneaparams" + - "apneaparms" - "lowexhmvlimit" - "hiexhmvlimit" - "lowexhtvlimit" diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index a14d36e..d06bd7b 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -44,7 +44,7 @@ def load_raw_eicu_file(fp: Path, **kwargs) -> pl.LazyFrame: """ with 
gzip.open(fp, mode="rb") as f: - return pl.read_csv(f, infer_schema_length=100000, **kwargs).lazy() + return pl.read_csv(f, infer_schema_length=100000000, **kwargs).lazy() def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24htime_col: str): From 1168641495d20ea54e82699486b5b81faa4dd0fa Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 16:56:29 -0400 Subject: [PATCH 30/47] Linted --- eICU_Example/pre_MEDS.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/eICU_Example/pre_MEDS.py b/eICU_Example/pre_MEDS.py index d06bd7b..e5855f4 100755 --- a/eICU_Example/pre_MEDS.py +++ b/eICU_Example/pre_MEDS.py @@ -58,9 +58,7 @@ def check_timestamps_agree(df: pl.LazyFrame, pseudotime_col: pl.Expr, given_24ht expected_time = pl.col(given_24htime_col).str.strptime(pl.Time, "%H:%M:%S") # The use of `.dt.combine` here re-sets the "time-of-day" of the pseudotime_col column - time_deltas_min = ( - pseudotime_col - pseudotime_col.dt.combine(expected_time) - ).dt.total_minutes() + time_deltas_min = (pseudotime_col - pseudotime_col.dt.combine(expected_time)).dt.total_minutes() # Check that the time deltas are all within 1 minute logger.info( @@ -86,14 +84,12 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame `configs/event_configs.yaml` file. """ - hospital_discharge_pseudotime = ( - pl.datetime(year=pl.col("hospitaldischargeyear"), **END_OF_YEAR).dt.combine( - pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S") - ) - ) + hospital_discharge_pseudotime = pl.datetime( + year=pl.col("hospitaldischargeyear"), **END_OF_YEAR + ).dt.combine(pl.col("hospitaldischargetime24").str.strptime(pl.Time, "%H:%M:%S")) - unit_admit_pseudotime = ( - hospital_discharge_pseudotime - pl.duration(minutes=pl.col("hospitaldischargeoffset")) + unit_admit_pseudotime = hospital_discharge_pseudotime - pl.duration( + minutes=pl.col("hospitaldischargeoffset") ) unit_discharge_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("unitdischargeoffset")) @@ -101,9 +97,7 @@ def process_patient(df: pl.LazyFrame, hospital_df: pl.LazyFrame) -> pl.LazyFrame hospital_admit_pseudotime = unit_admit_pseudotime + pl.duration(minutes=pl.col("hospitaladmitoffset")) age_in_years = ( - pl.when(pl.col("age") == "> 89") - .then(90) - .otherwise(pl.col("age").cast(pl.UInt16, strict=False)) + pl.when(pl.col("age") == "> 89").then(90).otherwise(pl.col("age").cast(pl.UInt16, strict=False)) ) age_in_days = age_in_years * 365.25 # We assume that the patient was born at the midpoint of the year as we don't know the actual birthdate From 39cf4649e67d66aca6f7596c0abb40d12ee3d836 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 17:03:25 -0400 Subject: [PATCH 31/47] Corrected more typos --- eICU_Example/configs/event_configs.yaml | 225 ++++++++++++------------ 1 file changed, 112 insertions(+), 113 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index 6ac7ab9..c57584f 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -1,12 +1,12 @@ # Note that there is no "patient_id" for eICU -- patients are only differentiable during the course of a # single health system stay. 
Accordingly, we set the "patient" id here as the "patientHealthSystemStayID" -patient_id_col: patientHealthSystemStayID +patient_id_col: patienthealthsystemstayid patient: dob: code: "DOB" - timestamp: "dateOfBirth" + timestamp: "dateofbirth" uniquepid: "uniquepid" gender: code: ["GENDER", "col(gender)"] @@ -116,7 +116,7 @@ diagnosis: - col(icd9code) - col(diagnosispriority) timestamp: "diagnosisEnteredTimestamp" - diagnosis_string: "diagnosisString" + diagnosis_string: "diagnosisstring" infusionDrug: infusion: @@ -125,15 +125,15 @@ infusionDrug: - col(infusiondrugid) - col(drugname) timestamp: "infusionEnteredTimestamp" - drug_rate: "drugRate" - infusion_rate: "infusionRate" - drug_amount: "drugAmount" - volume_of_fluid: "volumeOfFluid" + drug_rate: "drugrate" + infusion_rate: "infusionrate" + drug_amount: "drugamount" + volume_of_fluid: "volumeoffluid" patient_weight: code: - "INFUSION_PATIENT_WEIGHT" timestamp: "infusionEnteredTimestamp" - numerical_value: "patientWeight" + numerical_value: "patientweight" lab: lab: @@ -143,9 +143,9 @@ lab: - col(labmeasurenameinterface) - col(labname) timestamp: "labResultDrawnTimestamp" - numerical_value: "labResult" - text_value: "labResultText" - lab_type_id: "labTypeID" + numerical_value: "labresult" + text_value: "labresulttext" + lab_type_id: "labtypeid" medication: drug_ordered: @@ -154,28 +154,28 @@ medication: - "ORDERED" - col(drugName) timestamp: "drugOrderTimestamp" - medication_id: "medicationID" - drug_iv_admixture: "drugIVAdmixture" + medication_id: "medicationid" + drug_iv_admixture: "drugivadmixture" dosage: "dosage" - route_admin: "routeAdmin" + route_admin: "routeadmin" frequency: "frequency" - loading_dose: "loadingDose" - prn: "PRN" - gtc: "GTC" + loading_dose: "loadingdose" + prn: "prn" + gtc: "gtc" drug_started: code: - "MEDICATION" - "STARTED" - col(drugName) timestamp: "drugStartedTimestamp" - medication_id: "medicationID" + medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - col(drugName) timestamp: "drugStoppedTimestamp" - medication_id: "medicationID" + medication_id: "medicationid" nurseAssessment: nurse_assessment_performed: @@ -184,10 +184,10 @@ nurseAssessment: - "PERFORMED" - NOT YET DONE timestamp: "nurseAssessPerformedTimestamp" - nurse_assessment_id: "nurseAssessID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_assessment_id: "nurseassessid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurse_assessment_entered: code: @@ -195,10 +195,10 @@ nurseAssessment: - "ENTERED" - NOT YET DONE timestamp: "nurseAssessEnteredTimestamp" - nurse_assessment_id: "nurseAssessID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_assessment_id: "nurseassessid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurseCare: nurse_care_performed: @@ -207,10 +207,10 @@ nurseCare: - "PERFORMED" - NOT YET DONE timestamp: "nurseCarePerformedTimestamp" - nurse_care_id: "nurseCareID" - cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_care_id: "nursecareid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurse_care_entered: code: @@ -218,10 +218,10 @@ nurseCare: - "ENTERED" - NOT YET DONE timestamp: "nurseCareEnteredTimestamp" - nurse_care_id: "nurseCareID" - 
cell_label: "cellLabel" - cell_attribute: "cellAttribute" - cell_attribute_value: "cellAttributeValue" + nurse_care_id: "nursecareid" + cell_label: "celllabel" + cell_attribute: "cellattribute" + cell_attribute_value: "cellattributevalue" nurseCharting: nurse_charting_performed: @@ -230,11 +230,11 @@ nurseCharting: - "PERFORMED" - NOT YET DONE timestamp: "nursingChartPerformedTimestamp" - nurse_charting_id: "nursingChartID" - cell_type_cat: "nursingChartCellTypeCat" - cell_type_val_name: "nursingChartCellTypeValName" - cell_type_val_label: "nursingChartCellTypeValLabel" - cell_value: "nursingChartValue" + nurse_charting_id: "nursingchartid" + cell_type_cat: "nursingchartcelltypecat" + cell_type_val_name: "nursingchartcelltypevalname" + cell_type_val_label: "nursingchartcelltypevallabel" + cell_value: "nursingchartvalue" nurse_charting_entered: code: @@ -242,11 +242,11 @@ nurseCharting: - "ENTERED" - NOT YET DONE timestamp: "nursingChartEnteredTimestamp" - nurse_charting_id: "nursingChartID" - cell_type_cat: "nursingChartCellTypeCat" - cell_type_val_name: "nursingChartCellTypeValName" - cell_type_val_label: "nursingChartCellTypeValLabel" - cell_value: "nursingChartValue" + nurse_charting_id: "nursingchartid" + cell_type_cat: "nursingchartcelltypecat" + cell_type_val_name: "nursingchartcelltypevalname" + cell_type_val_label: "nursingchartcelltypevallabel" + cell_value: "nursingchartvalue" pastHistory: past_history_taken: @@ -255,11 +255,11 @@ pastHistory: - "TAKEN" - NOT YET DONE timestamp: "pastHistoryTakenTimestamp" - past_history_id: "pastHistoryID" - note_type: "pastHistoryNoteType" - path: "pastHistoryPath" - value: "pastHistoryValue" - value_text: "pastHistoryValueText" + past_history_id: "pasthistoryid" + note_type: "pasthistorynotetype" + path: "pasthistorypath" + value: "pasthistoryvalue" + value_text: "pasthistoryvaluetext" past_history_entered: code: @@ -267,11 +267,11 @@ pastHistory: - "ENTERED" - NOT YET DONE timestamp: "pastHistoryEnteredTimestamp" - past_history_id: "pastHistoryID" - note_type: "pastHistoryNoteType" - path: "pastHistoryPath" - value: "pastHistoryValue" - value_text: "pastHistoryValueText" + past_history_id: "pasthistoryid" + note_type: "pasthistorynotetype" + path: "pasthistorypath" + value: "pasthistoryvalue" + value_text: "pasthistoryvaluetext" physicalExam: physical_exam_entered: @@ -280,10 +280,10 @@ physicalExam: - "ENTERED" - NOT YET DONE timestamp: "physicalExamEnteredTimestamp" - physical_exam_id: "physicalExamID" - text: "physicalExamText" - path: "physicalExamPath" - value: "physicalExamValue" + physical_exam_id: "physicalexamid" + text: "physicalexamtext" + path: "physicalexampath" + value: "physicalexamvalue" respiratoryCare: resp_care_status: @@ -292,8 +292,7 @@ respiratoryCare: - "STATUS" - NOT YET DONE timestamp: "respCareStatusEnteredTimestamp" - resp_care_id: "respCareID" - + resp_care_id: "respcareid" airwaytype: "airwaytype" airwaysize: "airwaysize" airwayposition: "airwayposition" @@ -327,7 +326,7 @@ respiratoryCare: - "START" - NOT YET DONE timestamp: "ventStartTimestamp" - resp_care_id: "respCareID" + resp_care_id: "respcareid" vent_end: code: @@ -335,7 +334,7 @@ respiratoryCare: - "END" - NOT YET DONE timestamp: "ventEndTimestamp" - resp_care_id: "respCareID" + resp_care_id: "respcareid" respiratoryCharting: resp_charting_performed: @@ -344,10 +343,10 @@ respiratoryCharting: - "PERFORMED" - NOT YET DONE timestamp: "respChartPerformedTimestamp" - resp_chart_id: "respChartID" - type_cat: "respChartTypeCat" - value_label: 
"respChartValueLabel" - value: "respChartValue" + resp_chart_id: "respchartid" + type_cat: "respcharttypecat" + value_label: "respchartvaluelabel" + value: "respchartvalue" resp_charting_entered: code: @@ -355,10 +354,10 @@ respiratoryCharting: - "ENTERED" - NOT YET DONE timestamp: "respChartEnteredTimestamp" - resp_chart_id: "respChartID" - type_cat: "respChartTypeCat" - value_label: "respChartValueLabel" - value: "respChartValue" + resp_chart_id: "respchartid" + type_cat: "respcharttypecat" + value_label: "respchartvaluelabel" + value: "respchartvalue" treatment: treatment: @@ -367,7 +366,7 @@ treatment: - "ENTERED" - col(treatmentstring) timestamp: "treatmentEnteredTimestamp" - treatment_id: "treatmentID" + treatment_id: "treatmentid" vitalAperiodic: non_invasive_systolic: @@ -377,8 +376,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveSystolic" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivesystolic" non_invasive_diastolic: code: - "VITALS" @@ -386,8 +385,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveDiastolic" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivediastolic" non_invasive_mean: code: @@ -396,8 +395,8 @@ vitalAperiodic: - "BP" - "NONINVASIVE_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "nonInvasiveMean" + vital_id: "vitalaperiodicid" + numeric_value: "noninvasivemean" paop: code: @@ -405,7 +404,7 @@ vitalAperiodic: - "APERIODIC" - "PAOP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "paop" cardiac_output: @@ -414,8 +413,8 @@ vitalAperiodic: - "APERIODIC" - "CARDIAC_OUTPUT" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "cardiacOutput" + vital_id: "vitalaperiodicid" + numeric_value: "cardiacoutput" cardiac_input: code: @@ -423,8 +422,8 @@ vitalAperiodic: - "APERIODIC" - "CARDIAC_INPUT" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" - numeric_value: "cardiacInput" + vital_id: "vitalaperiodicid" + numeric_value: "cardiacinput" svr: code: @@ -432,7 +431,7 @@ vitalAperiodic: - "APERIODIC" - "SVR" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "svr" svri: @@ -441,7 +440,7 @@ vitalAperiodic: - "APERIODIC" - "SVRI" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "svri" pvr: @@ -450,7 +449,7 @@ vitalAperiodic: - "APERIODIC" - "PVR" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "pvr" pvri: @@ -459,7 +458,7 @@ vitalAperiodic: - "APERIODIC" - "PVRI" timestamp: "observationEnteredTimestamp" - vital_id: "vitalAperiodicID" + vital_id: "vitalaperiodicid" numeric_value: "pvri" vitalPeriodic: @@ -469,7 +468,7 @@ vitalPeriodic: - "PERIODIC" - "TEMPERATURE" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "temperature" saO2: @@ -478,8 +477,8 @@ vitalPeriodic: - "PERIODIC" - "SAO2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "saO2" + vital_id: "vitalperiodicid" + numeric_value: "sao2" heartRate: code: @@ -487,8 +486,8 @@ vitalPeriodic: - "PERIODIC" - 
"HEARTRATE" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "heartRate" + vital_id: "vitalperiodicid" + numeric_value: "heartrate" respiration: code: @@ -496,7 +495,7 @@ vitalPeriodic: - "PERIODIC" - "RESPIRATION" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "respiration" cvp: @@ -505,7 +504,7 @@ vitalPeriodic: - "PERIODIC" - "CVP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "cvp" etCo2: @@ -514,8 +513,8 @@ vitalPeriodic: - "PERIODIC" - "ETCO2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "etCo2" + vital_id: "vitalperiodicid" + numeric_value: "etco2" systemic_systolic: code: @@ -524,8 +523,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicSystolic" + vital_id: "vitalperiodicid" + numeric_value: "systemicsystolic" systemic_diastolic: code: @@ -534,8 +533,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicDiastolic" + vital_id: "vitalperiodicid" + numeric_value: "systemicdiastolic" systemic_mean: code: @@ -544,8 +543,8 @@ vitalPeriodic: - "BP" - "SYSTEMIC_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "systemicMean" + vital_id: "vitalperiodicid" + numeric_value: "systemicmean" pa_systolic: code: @@ -554,8 +553,8 @@ vitalPeriodic: - "BP" - "PULM_ART_SYSTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paSystolic" + vital_id: "vitalperiodicid" + numeric_value: "pasystolic" pa_diastolic: code: @@ -564,8 +563,8 @@ vitalPeriodic: - "BP" - "PULM_ART_DIASTOLIC" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paDiastolic" + vital_id: "vitalperiodicid" + numeric_value: "padiastolic" pa_mean: code: @@ -574,8 +573,8 @@ vitalPeriodic: - "BP" - "PULM_ART_MEAN" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "paMean" + vital_id: "vitalperiodicid" + numeric_value: "pamean" st1: code: @@ -583,7 +582,7 @@ vitalPeriodic: - "PERIODIC" - "ST1" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st1" st2: @@ -592,7 +591,7 @@ vitalPeriodic: - "PERIODIC" - "ST2" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st2" st3: @@ -601,7 +600,7 @@ vitalPeriodic: - "PERIODIC" - "ST3" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" + vital_id: "vitalperiodicid" numeric_value: "st3" ICP: @@ -610,5 +609,5 @@ vitalPeriodic: - "PERIODIC" - "ICP" timestamp: "observationEnteredTimestamp" - vital_id: "vitalPeriodicID" - numeric_value: "ICP" + vital_id: "vitalperiodicid" + numeric_value: "icp" From 74a86244d2d7a822fb82df0f221766d41e77568f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sat, 1 Jun 2024 17:28:33 -0400 Subject: [PATCH 32/47] Working most of the way through. Some error about vitalsaperiodic and floats vs. 
ints occurring during the event conversion currently, though --- eICU_Example/configs/event_configs.yaml | 144 ++++++++++++------------ 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/eICU_Example/configs/event_configs.yaml b/eICU_Example/configs/event_configs.yaml index c57584f..77f4023 100644 --- a/eICU_Example/configs/event_configs.yaml +++ b/eICU_Example/configs/event_configs.yaml @@ -6,7 +6,7 @@ patient_id_col: patienthealthsystemstayid patient: dob: code: "DOB" - timestamp: "dateofbirth" + timestamp: col(dateofbirth) uniquepid: "uniquepid" gender: code: ["GENDER", "col(gender)"] @@ -21,52 +21,52 @@ patient: - col(hospitalregion) - col(hospitalteachingstatus) - col(hospitalnumbedscategory) - timestamp: "hospitalAdmitTimestamp" - hospital_id: "hospitalID" + timestamp: col(hospitaladmittimestamp) + hospital_id: "hospitalid" hosp_discharge: code: - "HOSPITAL_DISCHARGE" - col(hospitaldischargestatus) - col(hospitaldischargelocation) - timestamp: "hospitalDischargeTimestamp" + timestamp: col(hospitaldischargetimestamp) unit_admission: code: - "UNIT_ADMISSION" - col(unitadmitsource) - col(unitstaytype) - timestamp: "unitAdmitTimestamp" - ward_id: "wardID" - unit_stay_id: "patientUnitStayID" + timestamp: col(unitadmittimestamp) + ward_id: "wardid" + unit_stay_id: "patientunitstayid" unit_admission_weight: code: - "UNIT_ADMISSION_WEIGHT" - timestamp: "unitAdmitTimestamp" - numerical_value: "unitAdmissionWeight" + timestamp: col(unitadmittimestamp) + numerical_value: "unitadmissionweight" unit_admission_height: code: - "UNIT_ADMISSION_HEIGHT" - timestamp: "unitAdmitTimestamp" - numerical_value: "unitAdmissionHeight" + timestamp: col(unitadmittimestamp) + numerical_value: "unitadmissionheight" unit_discharge: code: - "UNIT_DISCHARGE" - col(unitdischargestatus) - col(unitdischargelocation) - timestamp: "unitDischargeTimestamp" + timestamp: col(unitdischargetimestamp) unit_discharge_weight: code: - "UNIT_DISCHARGE_WEIGHT" - timestamp: "unitDischargeTimestamp" - numerical_value: "unitDischargeWeight" + timestamp: col(unitdischargetimestamp) + numerical_value: "unitdischargeweight" admissiondx: admission_diagnosis: code: - "ADMISSION_DX" - col(admitdxname) - timestamp: "admitDxEnteredTimestamp" + timestamp: col(admitDxEnteredTimestamp) admission_dx_id: "admitDxID" - unit_stay_id: "patientUnitStayID" + unit_stay_id: "patientunitstayid" allergy: allergy: @@ -74,7 +74,7 @@ allergy: - "ALLERGY" - col(allergytype) - col(allergyname) - timestamp: "allergyEnteredTimestamp" + timestamp: col(allergyEnteredTimestamp) carePlanGeneral: cplItem: @@ -82,13 +82,13 @@ carePlanGeneral: - "CAREPLAN_GENERAL" - col(cplgroup) - col(cplitemvalue) - timestamp: "carePlanGeneralItemEnteredTimestamp" + timestamp: col(carePlanGeneralItemEnteredTimestamp) carePlanEOL: cplEolDiscussion: code: - "CAREPLAN_EOL" - timestamp: "carePlanEolDiscussionOccurredTimestamp" + timestamp: col(carePlanEolDiscussionOccurredTimestamp) carePlanGoal: cplGoal: @@ -97,7 +97,7 @@ carePlanGoal: - col(cplgoalcategory) - col(cplgoalvalue) - col(cplgoalstatus) - timestamp: "carePlanGoalEnteredTimestamp" + timestamp: col(carePlanGoalEnteredTimestamp) carePlanInfectiousDisease: cplInfectDisease: @@ -107,7 +107,7 @@ carePlanInfectiousDisease: - col(infectdiseaseassessment) - col(treatment) - col(responsetotherapy) - timestamp: "carePlanInfectDiseaseEnteredTimestamp" + timestamp: col(carePlanInfectDiseaseEnteredTimestamp) diagnosis: diagnosis: @@ -115,7 +115,7 @@ diagnosis: - "ICD9CM" - col(icd9code) - col(diagnosispriority) - timestamp: 
"diagnosisEnteredTimestamp" + timestamp: col(diagnosisEnteredTimestamp) diagnosis_string: "diagnosisstring" infusionDrug: @@ -124,7 +124,7 @@ infusionDrug: - "INFUSION" - col(infusiondrugid) - col(drugname) - timestamp: "infusionEnteredTimestamp" + timestamp: col(infusionEnteredTimestamp) drug_rate: "drugrate" infusion_rate: "infusionrate" drug_amount: "drugamount" @@ -132,7 +132,7 @@ infusionDrug: patient_weight: code: - "INFUSION_PATIENT_WEIGHT" - timestamp: "infusionEnteredTimestamp" + timestamp: col(infusionEnteredTimestamp) numerical_value: "patientweight" lab: @@ -142,7 +142,7 @@ lab: - col(labmeasurenamesystem) - col(labmeasurenameinterface) - col(labname) - timestamp: "labResultDrawnTimestamp" + timestamp: col(labResultDrawnTimestamp) numerical_value: "labresult" text_value: "labresulttext" lab_type_id: "labtypeid" @@ -152,8 +152,8 @@ medication: code: - "MEDICATION" - "ORDERED" - - col(drugName) - timestamp: "drugOrderTimestamp" + - col(drugname) + timestamp: col(drugordertimestamp) medication_id: "medicationid" drug_iv_admixture: "drugivadmixture" dosage: "dosage" @@ -166,15 +166,15 @@ medication: code: - "MEDICATION" - "STARTED" - - col(drugName) - timestamp: "drugStartedTimestamp" + - col(drugname) + timestamp: col(drugstarttimestamp) medication_id: "medicationid" drug_stopped: code: - "MEDICATION" - "STOPPED" - - col(drugName) - timestamp: "drugStoppedTimestamp" + - col(drugname) + timestamp: col(drugstoptimestamp) medication_id: "medicationid" nurseAssessment: @@ -183,7 +183,7 @@ nurseAssessment: - "NURSE_ASSESSMENT" - "PERFORMED" - NOT YET DONE - timestamp: "nurseAssessPerformedTimestamp" + timestamp: col(nurseAssessPerformedTimestamp) nurse_assessment_id: "nurseassessid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -194,7 +194,7 @@ nurseAssessment: - "NURSE_ASSESSMENT" - "ENTERED" - NOT YET DONE - timestamp: "nurseAssessEnteredTimestamp" + timestamp: col(nurseAssessEnteredTimestamp) nurse_assessment_id: "nurseassessid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -206,7 +206,7 @@ nurseCare: - "NURSE_CARE" - "PERFORMED" - NOT YET DONE - timestamp: "nurseCarePerformedTimestamp" + timestamp: col(nurseCarePerformedTimestamp) nurse_care_id: "nursecareid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -217,7 +217,7 @@ nurseCare: - "NURSE_CARE" - "ENTERED" - NOT YET DONE - timestamp: "nurseCareEnteredTimestamp" + timestamp: col(nurseCareEnteredTimestamp) nurse_care_id: "nursecareid" cell_label: "celllabel" cell_attribute: "cellattribute" @@ -229,7 +229,7 @@ nurseCharting: - "NURSE_CHARTING" - "PERFORMED" - NOT YET DONE - timestamp: "nursingChartPerformedTimestamp" + timestamp: col(nursingChartPerformedTimestamp) nurse_charting_id: "nursingchartid" cell_type_cat: "nursingchartcelltypecat" cell_type_val_name: "nursingchartcelltypevalname" @@ -241,7 +241,7 @@ nurseCharting: - "NURSE_CHARTING" - "ENTERED" - NOT YET DONE - timestamp: "nursingChartEnteredTimestamp" + timestamp: col(nursingChartEnteredTimestamp) nurse_charting_id: "nursingchartid" cell_type_cat: "nursingchartcelltypecat" cell_type_val_name: "nursingchartcelltypevalname" @@ -254,7 +254,7 @@ pastHistory: - "PAST_HISTORY" - "TAKEN" - NOT YET DONE - timestamp: "pastHistoryTakenTimestamp" + timestamp: col(pastHistoryTakenTimestamp) past_history_id: "pasthistoryid" note_type: "pasthistorynotetype" path: "pasthistorypath" @@ -266,7 +266,7 @@ pastHistory: - "PAST_HISTORY" - "ENTERED" - NOT YET DONE - timestamp: "pastHistoryEnteredTimestamp" + timestamp: col(pastHistoryEnteredTimestamp) 
past_history_id: "pasthistoryid" note_type: "pasthistorynotetype" path: "pasthistorypath" @@ -279,7 +279,7 @@ physicalExam: - "PHYSICAL_EXAM" - "ENTERED" - NOT YET DONE - timestamp: "physicalExamEnteredTimestamp" + timestamp: col(physicalExamEnteredTimestamp) physical_exam_id: "physicalexamid" text: "physicalexamtext" path: "physicalexampath" @@ -291,7 +291,7 @@ respiratoryCare: - "RESP_CARE" - "STATUS" - NOT YET DONE - timestamp: "respCareStatusEnteredTimestamp" + timestamp: col(respCareStatusEnteredTimestamp) resp_care_id: "respcareid" airwaytype: "airwaytype" airwaysize: "airwaysize" @@ -325,7 +325,7 @@ respiratoryCare: - "VENT" - "START" - NOT YET DONE - timestamp: "ventStartTimestamp" + timestamp: col(ventStartTimestamp) resp_care_id: "respcareid" vent_end: @@ -333,7 +333,7 @@ respiratoryCare: - "VENT" - "END" - NOT YET DONE - timestamp: "ventEndTimestamp" + timestamp: col(ventEndTimestamp) resp_care_id: "respcareid" respiratoryCharting: @@ -342,7 +342,7 @@ respiratoryCharting: - "RESP_CHARTING" - "PERFORMED" - NOT YET DONE - timestamp: "respChartPerformedTimestamp" + timestamp: col(respChartPerformedTimestamp) resp_chart_id: "respchartid" type_cat: "respcharttypecat" value_label: "respchartvaluelabel" @@ -353,7 +353,7 @@ respiratoryCharting: - "RESP_CHARTING" - "ENTERED" - NOT YET DONE - timestamp: "respChartEnteredTimestamp" + timestamp: col(respChartEnteredTimestamp) resp_chart_id: "respchartid" type_cat: "respcharttypecat" value_label: "respchartvaluelabel" @@ -365,7 +365,7 @@ treatment: - "TREATMENT" - "ENTERED" - col(treatmentstring) - timestamp: "treatmentEnteredTimestamp" + timestamp: col(treatmentEnteredTimestamp) treatment_id: "treatmentid" vitalAperiodic: @@ -375,7 +375,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivesystolic" non_invasive_diastolic: @@ -384,7 +384,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivediastolic" @@ -394,7 +394,7 @@ vitalAperiodic: - "APERIODIC" - "BP" - "NONINVASIVE_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "noninvasivemean" @@ -403,7 +403,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "PAOP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "paop" @@ -412,7 +412,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "CARDIAC_OUTPUT" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "cardiacoutput" @@ -421,7 +421,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "CARDIAC_INPUT" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "cardiacinput" @@ -430,7 +430,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "SVR" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "svr" @@ -439,7 +439,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "SVRI" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "svri" @@ -448,7 +448,7 @@ vitalAperiodic: - 
"VITALS" - "APERIODIC" - "PVR" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "pvr" @@ -457,7 +457,7 @@ vitalAperiodic: - "VITALS" - "APERIODIC" - "PVRI" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalaperiodicid" numeric_value: "pvri" @@ -467,7 +467,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "TEMPERATURE" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "temperature" @@ -476,7 +476,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "SAO2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "sao2" @@ -485,7 +485,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "HEARTRATE" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "heartrate" @@ -494,7 +494,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "RESPIRATION" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "respiration" @@ -503,7 +503,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "CVP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "cvp" @@ -512,7 +512,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ETCO2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "etco2" @@ -522,7 +522,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicsystolic" @@ -532,7 +532,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicdiastolic" @@ -542,7 +542,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "SYSTEMIC_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "systemicmean" @@ -552,7 +552,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_SYSTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "pasystolic" @@ -562,7 +562,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_DIASTOLIC" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "padiastolic" @@ -572,7 +572,7 @@ vitalPeriodic: - "PERIODIC" - "BP" - "PULM_ART_MEAN" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "pamean" @@ -581,7 +581,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST1" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st1" @@ -590,7 +590,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST2" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st2" @@ -599,7 +599,7 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ST3" - timestamp: "observationEnteredTimestamp" + timestamp: 
col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "st3" @@ -608,6 +608,6 @@ vitalPeriodic: - "VITALS" - "PERIODIC" - "ICP" - timestamp: "observationEnteredTimestamp" + timestamp: col(observationEnteredTimestamp) vital_id: "vitalperiodicid" numeric_value: "icp" From f979ea46416b9b8f307eedac9efaf31146b746d8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 13:16:39 -0400 Subject: [PATCH 33/47] Incorporating fixes from #8 -- thanks @prenc! --- MIMIC-IV_Example/pre_MEDS.py | 4 ++-- scripts/extraction/merge_to_MEDS_cohort.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/MIMIC-IV_Example/pre_MEDS.py b/MIMIC-IV_Example/pre_MEDS.py index 789b882..bf99f3a 100755 --- a/MIMIC-IV_Example/pre_MEDS.py +++ b/MIMIC-IV_Example/pre_MEDS.py @@ -59,7 +59,7 @@ def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) - return raw_static_df.join(death_times_df, on="subject_id", how="left").select( "subject_id", - pl.coalesce(pl.col("dod"), pl.col("deathtime")).alias("dod"), + pl.coalesce(pl.col("deathtime"), pl.col("dod")).alias("dod"), (pl.col("anchor_year") - pl.col("anchor_age")).cast(str).alias("year_of_birth"), "gender", ) @@ -106,7 +106,7 @@ def main(cfg: DictConfig): f"No function needed for {pfx}: " f"Symlinking {str(in_fp.resolve())} to {str(out_fp.resolve())}" ) - relative_in_fp = in_fp.relative_to(out_fp.parent, walk_up=True) + relative_in_fp = in_fp.relative_to(out_fp.resolve().parent, walk_up=True) out_fp.symlink_to(relative_in_fp) continue else: diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py index 1c7271d..e7f8bdf 100755 --- a/scripts/extraction/merge_to_MEDS_cohort.py +++ b/scripts/extraction/merge_to_MEDS_cohort.py @@ -25,7 +25,11 @@ def read_fn(sp_dir: Path) -> pl.LazyFrame: logger.info(f"Reading {len(files_to_read)} files:\n{file_strs}") dfs = [pl.scan_parquet(fp, glob=False) for fp in files_to_read] - return pl.concat(dfs, how="diagonal").unique(maintain_order=False).sort(by=["patient_id", "timestamp"]) + return ( + pl.concat(dfs, how="diagonal_relaxed") + .unique(maintain_order=False) + .sort(by=["patient_id", "timestamp"]) + ) def write_fn(df: pl.LazyFrame, out_fp: Path) -> None: From 21dfc19f2361f5d10d3d8f665aa832b585de8fb8 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Sun, 2 Jun 2024 14:19:28 -0400 Subject: [PATCH 34/47] Make log dir stage dependent --- configs/pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 5694e25..be99f84 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,7 +2,7 @@ input_dir: ??? cohort_dir: ??? 
-log_dir: "${cohort_dir}/.logs" +log_dir: "${cohort_dir}/.logs/${stage}" # General pipeline variables do_overwrite: False From d9501a74ec6a803bcb884b368cc169e56e58ebda Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:23:58 -0400 Subject: [PATCH 35/47] Made submitit launcher script work --- MIMIC-IV_Example/joint_script_slurm.sh | 109 +++++++++++++------------ 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh index 3948e87..0e0af8e 100755 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ b/MIMIC-IV_Example/joint_script_slurm.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" -N_PARALLEL_WORKERS="$4" +export MIMICIV_RAW_DIR="$1" +export MIMICIV_PREMEDS_DIR="$2" +export MIMICIV_MEDS_DIR="$3" +export N_PARALLEL_WORKERS="$4" shift 4 @@ -11,17 +11,17 @@ shift 4 # this doesn't fall back on running anything locally in a setting where only slurm worker nodes have # sufficient computational resources to run the actual jobs. -echo "Running pre-MEDS conversion on one worker." -./MIMIC-IV_Example/pre_MEDS.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - raw_cohort_dir="$MIMICIV_RAW_DIR" \ - output_dir="$MIMICIV_PREMEDS_DIR" +# echo "Running pre-MEDS conversion on one worker." +# ./MIMIC-IV_Example/pre_MEDS.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# raw_cohort_dir="$MIMICIV_RAW_DIR" \ +# output_dir="$MIMICIV_PREMEDS_DIR" echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." @@ -36,43 +36,44 @@ echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
"hydra.job.env_copy=[PATH]" \ input_dir="$MIMICIV_PREMEDS_DIR" \ cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml + event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml \ + stage=shard_events -echo "Splitting patients on one worker" -./scripts/extraction/split_and_shard_patients.py \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/convert_to_sharded_events.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -./scripts/extraction/merge_to_MEDS_cohort.py \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# echo "Splitting patients on one worker" +# ./scripts/extraction/split_and_shard_patients.py \ +# --multirun \ +# worker="range(0,1)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +# echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +# ./scripts/extraction/convert_to_sharded_events.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" +# +# echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" +# ./scripts/extraction/merge_to_MEDS_cohort.py \ +# --multirun \ +# worker="range(0,$N_PARALLEL_WORKERS)" \ +# hydra/launcher=submitit_slurm \ +# hydra.launcher.timeout_min=60 \ +# hydra.launcher.cpus_per_task=10 \ +# hydra.launcher.mem_gb=50 \ +# hydra.launcher.partition="short" \ +# input_dir="$MIMICIV_PREMEDS_DIR" \ +# cohort_dir="$MIMICIV_MEDS_DIR" \ +# event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" From 6878bf20a11ce44f4e45b48d64082e4910f0df17 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:24:36 -0400 Subject: [PATCH 36/47] Added singleton sbatch script --- MIMIC-IV_Example/sbatch_joint_script.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 
insertions(+) create mode 100644 MIMIC-IV_Example/sbatch_joint_script.sh diff --git a/MIMIC-IV_Example/sbatch_joint_script.sh b/MIMIC-IV_Example/sbatch_joint_script.sh new file mode 100644 index 0000000..e031363 --- /dev/null +++ b/MIMIC-IV_Example/sbatch_joint_script.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +#SBATCH -c 10 # Request one core +#SBATCH -t 0-03:00 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=300GB # Memory total in MiB (for all cores) +#SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j) +#SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j) + +cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions + +MIMICIV_RAW_DIR="$1" +MIMICIV_PREMEDS_DIR="$2" +MIMICIV_MEDS_DIR="$3" +N_PARALLEL_WORKERS="$4" + +LOG_DIR="$MIMICIV_MEDS_DIR/.logs" + +echo "Running with saving to $LOG_DIR" + +mkdir -p $LOG_DIR + +PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \ + time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \ + ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt From dced00b83641e35e4faa17a3413a5b4f5861090e Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:34:52 -0400 Subject: [PATCH 37/47] Adding inits to make tests pass despite shared 'pre_MEDS.py' name --- MIMIC-IV_Example/__init__.py | 0 eICU_Example/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 MIMIC-IV_Example/__init__.py create mode 100644 eICU_Example/__init__.py diff --git a/MIMIC-IV_Example/__init__.py b/MIMIC-IV_Example/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eICU_Example/__init__.py b/eICU_Example/__init__.py new file mode 100644 index 0000000..e69de29 From 7d74d60156f96791f45766bdd242bdf113e61f69 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:45:05 -0400 Subject: [PATCH 38/47] Make it always retype numerical values --- src/MEDS_polars_functions/event_conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 15f1e9a..163bf10 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -381,7 +381,8 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy # if numerical_value column is not numeric, convert it to float if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): logger.warning(f"Converting numerical_value to float for codes {codes}") - df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) + + df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 3ec1436414c683ef36bd5e875dee9fa763fe5e7b Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:46:21 -0400 Subject: [PATCH 39/47] typo fix --- src/MEDS_polars_functions/event_conversion.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 163bf10..56c90d8 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -379,10 +379,10 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy df = df.select(**event_exprs).unique(maintain_order=True) # if numerical_value 
column is not numeric, convert it to float - if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): - logger.warning(f"Converting numerical_value to float for codes {codes}") - - df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) + if "numerical_value" in df.columns: + if not df.schema["numerical_value"].is_numeric(): + logger.warning(f"Converting numerical_value to float for codes {codes}") + df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 1169cc9b62da9315b463d2c0bb249ba9ca2b5eb0 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 09:47:49 -0400 Subject: [PATCH 40/47] Undoing recent changes as they don't help --- src/MEDS_polars_functions/event_conversion.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 56c90d8..15f1e9a 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -379,9 +379,8 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy df = df.select(**event_exprs).unique(maintain_order=True) # if numerical_value column is not numeric, convert it to float - if "numerical_value" in df.columns: - if not df.schema["numerical_value"].is_numeric(): - logger.warning(f"Converting numerical_value to float for codes {codes}") + if "numerical_value" in df.columns and not df.schema["numerical_value"].is_numeric(): + logger.warning(f"Converting numerical_value to float for codes {codes}") df = df.with_columns(pl.col("numerical_value").cast(pl.Float64, strict=False)) return df From 637e4bda515e89f9ee58d28ae2d95d36994fde69 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 10:02:01 -0400 Subject: [PATCH 41/47] Use diagonal relaxed to combine the event subshards --- src/MEDS_polars_functions/event_conversion.py | 3 ++- src/MEDS_polars_functions/utils.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/MEDS_polars_functions/event_conversion.py b/src/MEDS_polars_functions/event_conversion.py index 15f1e9a..eae9505 100644 --- a/src/MEDS_polars_functions/event_conversion.py +++ b/src/MEDS_polars_functions/event_conversion.py @@ -278,6 +278,7 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy ... ValueError: Source column 'discharge_time' for event column foobar is not numeric or categorical! Cannot be used as an event col. 
""" # noqa: E501 + df = df event_exprs = {"patient_id": pl.col("patient_id")} if "code" not in event_cfg: @@ -550,5 +551,5 @@ def convert_to_events( except Exception as e: raise ValueError(f"Error extracting event {event_name}: {e}") from e - df = pl.concat(event_dfs, how="diagonal") + df = pl.concat(event_dfs, how="diagonal_relaxed") return df diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..d1e6e09 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -157,7 +157,10 @@ def hydra_loguru_init() -> None: def write_lazyframe(df: pl.LazyFrame, out_fp: Path) -> None: - df.collect().write_parquet(out_fp, use_pyarrow=True) + if isinstance(df, pl.LazyFrame): + df = df.collect() + + df.write_parquet(out_fp, use_pyarrow=True) def get_shard_prefix(base_path: Path, fp: Path) -> str: From eb94a1d961e6208ce91243c83f0bcc3a1a0dd834 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Thu, 6 Jun 2024 16:52:54 -0400 Subject: [PATCH 42/47] fixed error in joint script help message for eICU. should apply to MIMIC as well. --- eICU_Example/joint_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh index 4445f49..26e2b57 100755 --- a/eICU_Example/joint_script.sh +++ b/eICU_Example/joint_script.sh @@ -27,7 +27,7 @@ if [[ "$1" == "-h" || "$1" == "--help" ]]; then fi # Check for mandatory parameters -if [ "$#" -ne 4 ]; then +if [ "$#" -lt 4 ]; then echo "Error: Incorrect number of arguments provided." display_help fi From 0af21c7e4c2cb062c54e8035312fa0172e99b33d Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 7 Jun 2024 10:49:03 -0400 Subject: [PATCH 43/47] Fixed up sbatch script --- MIMIC-IV_Example/joint_script.sh | 2 +- MIMIC-IV_Example/sbatch_joint_script.sh | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh index d3e067f..bf3438e 100755 --- a/MIMIC-IV_Example/joint_script.sh +++ b/MIMIC-IV_Example/joint_script.sh @@ -27,7 +27,7 @@ if [[ "$1" == "-h" || "$1" == "--help" ]]; then fi # Check for mandatory parameters -if [ "$#" -ne 4 ]; then +if [ "$#" -lt 4 ]; then echo "Error: Incorrect number of arguments provided." 
    display_help
fi

diff --git a/MIMIC-IV_Example/sbatch_joint_script.sh b/MIMIC-IV_Example/sbatch_joint_script.sh
index e031363..75d3281 100644
--- a/MIMIC-IV_Example/sbatch_joint_script.sh
+++ b/MIMIC-IV_Example/sbatch_joint_script.sh
@@ -6,19 +6,16 @@
 #SBATCH -o MIMIC_IV_MEDS_%j_sbatch.out # File to which STDOUT will be written, including job ID (%j)
 #SBATCH -e MIMIC_IV_MEDS_%j_sbatch.err # File to which STDERR will be written, including job ID (%j)
 
-cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions
+cd /n/data1/hms/dbmi/zaklab/mmd/MEDS_polars_functions || exit
 
-MIMICIV_RAW_DIR="$1"
-MIMICIV_PREMEDS_DIR="$2"
 MIMICIV_MEDS_DIR="$3"
-N_PARALLEL_WORKERS="$4"
 
 LOG_DIR="$MIMICIV_MEDS_DIR/.logs"
 
 echo "Running with saving to $LOG_DIR"
 
-mkdir -p $LOG_DIR
+mkdir -p "$LOG_DIR"
 
 PATH="/home/mbm47/.conda/envs/MEDS_pipelines/bin:$PATH" \
     time mprof run --include-children --exit-code --output "$LOG_DIR/mprofile.dat" \
-    ./MIMIC-IV_Example/joint_script.sh "$@" 2> $LOG_DIR/timings.txt
+    ./MIMIC-IV_Example/joint_script.sh "$@" 2> "$LOG_DIR/timings.txt"

From 5cebbfa6100eb84dc9efa359a2600a8628fbde34 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 16:32:26 -0400
Subject: [PATCH 44/47] Allowing for skipping the unique-by in the merge stage.

---
 configs/extraction.yaml                    |  1 +
 scripts/extraction/merge_to_MEDS_cohort.py | 31 +++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/configs/extraction.yaml b/configs/extraction.yaml
index e1e985a..c351951 100644
--- a/configs/extraction.yaml
+++ b/configs/extraction.yaml
@@ -25,3 +25,4 @@ stage_configs:
       held_out: 0.1
   merge_to_MEDS_cohort:
     output_dir: ${cohort_dir}/final_cohort
+    unique_by: "*"
diff --git a/scripts/extraction/merge_to_MEDS_cohort.py b/scripts/extraction/merge_to_MEDS_cohort.py
index e7f8bdf..ade8d50 100755
--- a/scripts/extraction/merge_to_MEDS_cohort.py
+++ b/scripts/extraction/merge_to_MEDS_cohort.py
@@ -2,6 +2,7 @@
 
 import json
 import random
+from functools import partial
 from pathlib import Path
 
 import hydra
@@ -15,7 +16,7 @@
 pl.enable_string_cache()
 
 
-def read_fn(sp_dir: Path) -> pl.LazyFrame:
+def read_fn(sp_dir: Path, unique_by: list[str] | str | None) -> pl.LazyFrame:
     files_to_read = list(sp_dir.glob("**/*.parquet"))
 
     if not files_to_read:
@@ -25,11 +26,25 @@ def read_fn(sp_dir: Path) -> pl.LazyFrame:
     logger.info(f"Reading {len(files_to_read)} files:\n{file_strs}")
 
     dfs = [pl.scan_parquet(fp, glob=False) for fp in files_to_read]
-    return (
-        pl.concat(dfs, how="diagonal_relaxed")
-        .unique(maintain_order=False)
-        .sort(by=["patient_id", "timestamp"])
-    )
+    df = pl.concat(dfs, how="diagonal_relaxed")
+
+    match unique_by:
+        case None:
+            pass
+        case "*":
+            df = df.unique(maintain_order=False)
+        case list() if len(unique_by) > 0 and all(isinstance(u, str) for u in unique_by):
+            subset = []
+            for u in unique_by:
+                if u in df.columns:
+                    subset.append(u)
+                else:
+                    logger.warning(f"Column {u} not found in dataframe. 
Omitting from unique-by subset.")
+            df = df.unique(maintain_order=False, subset=subset)
+        case _:
+            raise ValueError(f"Invalid unique_by value: {unique_by}")
+
+    return df.sort(by=["patient_id", "timestamp"], multithreaded=False)
 
 
 def write_fn(df: pl.LazyFrame, out_fp: Path) -> None:
@@ -63,6 +78,8 @@ def main(cfg: DictConfig):
     patient_splits = list(shards.keys())
     random.shuffle(patient_splits)
 
+    reader = partial(read_fn, unique_by=cfg.stage_cfg.get("unique_by", None))
+
     for sp in patient_splits:
         in_dir = patient_subsharded_dir / sp
         out_fp = Path(cfg.stage_cfg.output_dir) / f"{sp}.parquet"
@@ -70,7 +87,7 @@ def main(cfg: DictConfig):
         shard_fps = sorted(list(in_dir.glob("**/*.parquet")))
         shard_fp_strs = [f"  * {str(fp.resolve())}" for fp in shard_fps]
         logger.info(f"Merging {len(shard_fp_strs)} shards into {out_fp}:\n" + "\n".join(shard_fp_strs))
-        rwlock_wrap(in_dir, out_fp, read_fn, write_fn, identity_fn, do_return=False)
+        rwlock_wrap(in_dir, out_fp, reader, write_fn, identity_fn, do_return=False)
 
     logger.info("Output cohort written.")

From f48ddb72a645c0b9cec2fc107a911be801373c97 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 16:46:00 -0400
Subject: [PATCH 45/47] Added a note to eICU example

---
 eICU_Example/joint_script.sh | 55 ++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 26e2b57..97515f4 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -39,32 +39,37 @@ N_PARALLEL_WORKERS="$4"
 
 shift 4
 
-echo "Running pre-MEDS conversion."
-./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
-
-echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
-./scripts/extraction/shard_events.py \
-    --multirun \
-    worker="range(0,$N_PARALLEL_WORKERS)" \
-    hydra/launcher=joblib \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
-
-echo "Splitting patients in serial"
-./scripts/extraction/split_and_shard_patients.py \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
-
-echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
-./scripts/extraction/convert_to_sharded_events.py \
-    --multirun \
-    worker="range(0,$N_PARALLEL_WORKERS)" \
-    hydra/launcher=joblib \
-    input_dir="$EICU_PREMEDS_DIR" \
-    cohort_dir="$EICU_MEDS_DIR" \
-    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+echo "Note that eICU is expensive (in memory) in some final stages, as each MEDS shard will end up being "
+echo "large in # of rows (e.g., ~175M) given the frequency of periodic vitals signs. We recommend setting "
+echo "stage_configs.merge_to_MEDS_cohort.unique_by=null in order to mitigate the cost of the unique "
+echo "operation and avoid OOM issues."
+
+#echo "Running pre-MEDS conversion."
+#./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
+#
+#echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
+#./scripts/extraction/shard_events.py \
+#    --multirun \
+#    worker="range(0,$N_PARALLEL_WORKERS)" \
+#    hydra/launcher=joblib \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+#
+#echo "Splitting patients in serial"
+#./scripts/extraction/split_and_shard_patients.py \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
+#
+#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
+#./scripts/extraction/convert_to_sharded_events.py \
+#    --multirun \
+#    worker="range(0,$N_PARALLEL_WORKERS)" \
+#    hydra/launcher=joblib \
+#    input_dir="$EICU_PREMEDS_DIR" \
+#    cohort_dir="$EICU_MEDS_DIR" \
+#    event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
 
 echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel"
 ./scripts/extraction/merge_to_MEDS_cohort.py \

From e152a17c2e747b1f86e836fca75b8d5533cbf896 Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Sat, 8 Jun 2024 17:06:45 -0400
Subject: [PATCH 46/47] Updated scripts and added note to README.md for eICU

---
 eICU_Example/README.md       |  8 +++++
 eICU_Example/joint_script.sh | 63 +++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/eICU_Example/README.md b/eICU_Example/README.md
index 2715613..0984b99 100644
--- a/eICU_Example/README.md
+++ b/eICU_Example/README.md
@@ -70,6 +70,14 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less
 
 ## Step 3: Run the MEDS extraction ETL
 
+Note that eICU has a lot more observations per patient than does MIMIC-IV, so to keep to a reasonable memory
+burden (e.g., \< 150GB per worker), you will want a smaller shard size, as well as to turn off the final unique
+check (which should not be necessary given the structure of eICU and is expensive) in the merge stage. You can
+do this by setting the following parameters at the end of the mandatory args when running this script:
+
+- `stage_configs.split_and_shard_patients.n_patients_per_shard=10000`
+- `stage_configs.merge_to_MEDS_cohort.unique_by=null`
+
 ### Running locally, serially
 
 We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`.
diff --git a/eICU_Example/joint_script.sh b/eICU_Example/joint_script.sh
index 97515f4..fd76ee2 100755
--- a/eICU_Example/joint_script.sh
+++ b/eICU_Example/joint_script.sh
@@ -39,37 +39,40 @@ N_PARALLEL_WORKERS="$4"
 
 shift 4
 
-echo "Note that eICU is expensive (in memory) in some final stages, as each MEDS shard will end up being "
-echo "large in # of rows (e.g., ~175M) given the frequency of periodic vitals signs. We recommend setting "
-echo "stage_configs.merge_to_MEDS_cohort.unique_by=null in order to mitigate the cost of the unique "
-echo "operation and avoid OOM issues."
+echo "Note that eICU has a lot more observations per patient than does MIMIC-IV, so to keep to a reasonable "
+echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off "
+echo "the final unique check (which should not be necessary given the structure of eICU and is expensive) "
+echo "in the merge stage. 
You can do this by setting the following parameters at the end of the mandatory " +echo "args when running this script:" +echo " * stage_configs.split_and_shard_patients.n_patients_per_shard=10000" +echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null" -#echo "Running pre-MEDS conversion." -#./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" -# -#echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/shard_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=joblib \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" -# -#echo "Splitting patients in serial" -#./scripts/extraction/split_and_shard_patients.py \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" -# -#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -#./scripts/extraction/convert_to_sharded_events.py \ -# --multirun \ -# worker="range(0,$N_PARALLEL_WORKERS)" \ -# hydra/launcher=joblib \ -# input_dir="$EICU_PREMEDS_DIR" \ -# cohort_dir="$EICU_MEDS_DIR" \ -# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" +echo "Running pre-MEDS conversion." +./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR" + +echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/shard_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Splitting patients in serial" +./scripts/extraction/split_and_shard_patients.py \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" + +echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" +./scripts/extraction/convert_to_sharded_events.py \ + --multirun \ + worker="range(0,$N_PARALLEL_WORKERS)" \ + hydra/launcher=joblib \ + input_dir="$EICU_PREMEDS_DIR" \ + cohort_dir="$EICU_MEDS_DIR" \ + event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@" echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" ./scripts/extraction/merge_to_MEDS_cohort.py \ From f7415559e3f34a1b370af558bec6501886ad051c Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 11 Jun 2024 09:06:20 -0400 Subject: [PATCH 47/47] Updated some docstrings --- configs/extraction.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 41a0f3a..1a1c0dd 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -29,7 +29,7 @@ stage_configs: This stage shards the raw input events into smaller files for easier processing. Arguments: - `row_chunksize`: The number of rows to read in at a time. - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source - files are pdfs) + files are csvs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: @@ -41,6 +41,11 @@ stage_configs: held-out test sets beyond the IID held out set that will be produced (e.g., for prospective datasets, etc.). 
      - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets.
+      Split fractions for the default split names can be changed by adding a Hydra-style command-line
+      argument for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1
+      split_fracs.held_out=0.2`. A split can be removed with Hydra's `~` override syntax, and a new split
+      name can be added with the standard Hydra `+` override option; e.g., `~split_fracs.held_out
+      +split_fracs.test=0.1`. It is the user's responsibility to ensure that the split fractions sum to 1.
   is_metadata: True
   output_dir: ${cohort_dir}
   n_patients_per_shard: 50000
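
As a worked illustration of the split-fraction overrides documented in [PATCH 47/47] above, the invocation
below follows the style of the example scripts earlier in this series. It is a sketch only: `$PREMEDS_DIR`
and `$MEDS_DIR` are placeholder directories, and other required arguments (e.g., the event conversion
config file) are elided for brevity.

```bash
# Change the default split fractions, remove the default held_out split, and
# add a new `test` split in its place; the fractions must still sum to 1.
# The `~`/`+` prefixes are standard Hydra syntax for deleting/adding keys.
./scripts/extraction/split_and_shard_patients.py \
    input_dir="$PREMEDS_DIR" \
    cohort_dir="$MEDS_DIR" \
    stage_configs.split_and_shard_patients.split_fracs.train=0.8 \
    stage_configs.split_and_shard_patients.split_fracs.tuning=0.1 \
    '~stage_configs.split_and_shard_patients.split_fracs.held_out' \
    '+stage_configs.split_and_shard_patients.split_fracs.test=0.1'
```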
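
Similarly, a sketch of the three `unique_by` modes for the merge stage introduced in [PATCH 44/47]:
`"*"` (the configured default) deduplicates full rows, `null` skips deduplication entirely (the
recommendation above for eICU), and a list of column names deduplicates on just those columns, warning
about and dropping any that are not present in the dataframe. Directories are again placeholders.

```bash
# Default behavior (unique_by: "*" in configs/extraction.yaml): full-row dedup.
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR"

# Skip the unique pass entirely (cheapest; recommended above for eICU):
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR" \
    stage_configs.merge_to_MEDS_cohort.unique_by=null

# Deduplicate on a subset of columns only:
./scripts/extraction/merge_to_MEDS_cohort.py \
    input_dir="$PREMEDS_DIR" cohort_dir="$MEDS_DIR" \
    'stage_configs.merge_to_MEDS_cohort.unique_by=[patient_id,timestamp,code]'
```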