Updated scripts and added note to README.md for eICU
mmcdermott committed Jun 8, 2024
1 parent f48ddb7 commit e152a17
Showing 2 changed files with 41 additions and 30 deletions.
8 changes: 8 additions & 0 deletions eICU_Example/README.md
@@ -70,6 +70,14 @@ In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less

## Step 3: Run the MEDS extraction ETL

Note that eICU has many more observations per patient than MIMIC-IV, so to keep the memory burden
reasonable (e.g., \< 150GB per worker), you will want a smaller shard size and to disable the final
uniqueness check in the merge stage (it is expensive and should not be necessary given the structure
of eICU). You can do this by appending the following parameters after the mandatory arguments when
running this script (see the sketch after the list):

- `stage_configs.split_and_shard_patients.n_patients_per_shard=10000`
- `stage_configs.merge_to_MEDS_cohort.unique_by=null`
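
For example, here is a minimal sketch of such an invocation, assuming the script in question is
`eICU_Example/joint_script.sh` and that its first three positional arguments are the raw eICU,
pre-MEDS, and output MEDS directories (only the fourth, the worker count, is read explicitly as
`$4` in the script shown below):

```bash
# Sketch only: the argument order for the three directories is an assumption;
# the parameter overrides are simply appended after the mandatory arguments,
# which the script forwards to each stage via "$@".
./eICU_Example/joint_script.sh \
    "$EICU_RAW_DIR" "$EICU_PREMEDS_DIR" "$EICU_MEDS_DIR" "$N_PARALLEL_WORKERS" \
    stage_configs.split_and_shard_patients.n_patients_per_shard=10000 \
    stage_configs.merge_to_MEDS_cohort.unique_by=null
```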

### Running locally, serially

We will assume you want to output the final MEDS dataset into a directory we'll denote as `$EICU_MEDS_DIR`.
63 changes: 33 additions & 30 deletions eICU_Example/joint_script.sh
@@ -39,37 +39,40 @@ N_PARALLEL_WORKERS="$4"

shift 4

echo "Note that eICU is expensive (in memory) in some final stages as each MEDS shards will end up being "
echo "large in # of rows (e.g., ~175M) given the frequency of periodic vitals signs. We recommend setting "
echo "stage_configs.merge_to_MEDS_cohort.unique_by=null in order to mitigate the cost of the unique "
echo "operation at to avoid OOM issues."
echo "Note that eICU has a lot more observations per patient than does MIMIC-IV, so to keep to a reasonable "
echo "memory burden (e.g., < 150GB per worker), you will want a smaller shard size, as well as to turn off "
echo "the final unique check (which should not be necessary given the structure of eICU and is expensive) "
echo "in the merge stage. You can do this by setting the following parameters at the end of the mandatory "
echo "args when running this script:"
echo " * stage_configs.split_and_shard_patients.n_patients_per_shard=10000"
echo " * stage_configs.merge_to_MEDS_cohort.unique_by=null"

#echo "Running pre-MEDS conversion."
#./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"
#
#echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
#./scripts/extraction/shard_events.py \
# --multirun \
# worker="range(0,$N_PARALLEL_WORKERS)" \
# hydra/launcher=joblib \
# input_dir="$EICU_PREMEDS_DIR" \
# cohort_dir="$EICU_MEDS_DIR" \
# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
#
#echo "Splitting patients in serial"
#./scripts/extraction/split_and_shard_patients.py \
# input_dir="$EICU_PREMEDS_DIR" \
# cohort_dir="$EICU_MEDS_DIR" \
# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
#
#echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
#./scripts/extraction/convert_to_sharded_events.py \
# --multirun \
# worker="range(0,$N_PARALLEL_WORKERS)" \
# hydra/launcher=joblib \
# input_dir="$EICU_PREMEDS_DIR" \
# cohort_dir="$EICU_MEDS_DIR" \
# event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"
echo "Running pre-MEDS conversion."
./eICU_Example/pre_MEDS.py raw_cohort_dir="$EICU_RAW_DIR" output_dir="$EICU_PREMEDS_DIR"

echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel"
./scripts/extraction/shard_events.py \
--multirun \
worker="range(0,$N_PARALLEL_WORKERS)" \
hydra/launcher=joblib \
input_dir="$EICU_PREMEDS_DIR" \
cohort_dir="$EICU_MEDS_DIR" \
event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"

echo "Splitting patients in serial"
./scripts/extraction/split_and_shard_patients.py \
input_dir="$EICU_PREMEDS_DIR" \
cohort_dir="$EICU_MEDS_DIR" \
event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"

echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel"
./scripts/extraction/convert_to_sharded_events.py \
--multirun \
worker="range(0,$N_PARALLEL_WORKERS)" \
hydra/launcher=joblib \
input_dir="$EICU_PREMEDS_DIR" \
cohort_dir="$EICU_MEDS_DIR" \
event_conversion_config_fp=./eICU_Example/configs/event_configs.yaml "$@"

echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel"
./scripts/extraction/merge_to_MEDS_cohort.py \
