From 7c2e7677c09289b4cf358346b9223d8934d7f98f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 20:04:22 -0400 Subject: [PATCH 1/2] Updated configs and added a resolver to get informative help messages from the right sources --- configs/extraction.yaml | 34 ++++++++++++++++++++++++++++++ configs/pipeline.yaml | 15 +++++++++++++ configs/preprocess.yaml | 1 + src/MEDS_polars_functions/utils.py | 16 ++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index e1e985a..b762894 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,5 +1,18 @@ defaults: - pipeline + - _self_ + +description: |- + This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select + criteria and converts them to the flattened, MEDS format. It can be run in its entirety, with controllable + levels of parallelism, or in stages. Arguments: + - `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines + the events to extract from the various rows of the various input files encountered in the global input + directory. + - `input_dir`: The path to the directory containing the raw input files. + - `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in + various subfolders of this dir depending on the stage, as intermediate stages cache their output during + computation for efficiency of re-running and distributing. # The event conversion configuration file is used throughout the pipeline to define the events to extract. event_conversion_config_fp: ??? @@ -12,9 +25,22 @@ stages: stage_configs: shard_events: + description: |- + This stage shards the raw input events into smaller files for easier processing. Arguments: + - `row_chunksize`: The number of rows to read in at a time. 
+ - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source + files are pdfs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: + description: |- + This stage splits the patients into training, tuning, and held-out sets, and further splits those sets + into shards. Arguments: + - `n_patients_per_shard`: The number of patients to include in a shard. + - `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially + held-out test sets beyond the IID held out set that will be produced (e.g., for prospective + datasets, etc.). + - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets. is_metadata: True output_dir: ${cohort_dir} n_patients_per_shard: 50000 @@ -24,4 +50,12 @@ stage_configs: tuning: 0.1 held_out: 0.1 merge_to_MEDS_cohort: + description: |- + This stage merges the per-input-file, per-patient-shard event files produced by the preceding + stages into a single file per patient shard, yielding the final MEDS cohort. This is the last + stage of the extraction pipeline, and its output constitutes the extracted dataset. Arguments: + - `output_dir`: The directory in which the final, merged MEDS cohort files will be written. This + defaults to the `final_cohort` subdirectory of the global `cohort_dir`, so that the completed + cohort is cleanly separated from the intermediate outputs that earlier stages cache during + computation. output_dir: ${cohort_dir}/final_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 5694e25..857785f 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,6 +2,10 @@ input_dir: ??? cohort_dir: ??? +_default_description: |- + This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline + configuration file.
+ log_dir: "${cohort_dir}/.logs" # General pipeline variables @@ -26,3 +30,14 @@ hydra: dir: "${log_dir}" sweep: dir: "${log_dir}" + help: + app_name: "MEDS/${stage}" + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines. + + **Pipeline description:** + ${oc.select:description, ${_default_description}} + + **Stage description:** + ${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}} diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 9b60579..d65150b 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,5 +1,6 @@ defaults: - pipeline + - _self_ # Global pipeline parameters: # 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..11d738a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import inspect import os import sys from pathlib import Path @@ -12,6 +13,20 @@ pl.enable_string_cache() +def get_script_docstring() -> str: + """Returns the docstring of the main function of the script that was called. + + Returns: + str: The docstring of the `main` function of the `__main__` module, or "" if no such function or docstring exists. + """ + + main_module = sys.modules["__main__"] + func = getattr(main_module, "main", None) + if func and callable(func): + return inspect.getdoc(func) or "" + return "" + + def current_script_name() -> str: """Returns the name of the script that called this function.
@@ -143,6 +158,7 @@ def populate_stage( return out +OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False) OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) From f7415559e3f34a1b370af558bec6501886ad051c Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 11 Jun 2024 09:06:20 -0400 Subject: [PATCH 2/2] Updated some docstrings --- configs/extraction.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 41a0f3a..1a1c0dd 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -29,7 +29,7 @@ stage_configs: This stage shards the raw input events into smaller files for easier processing. Arguments: - `row_chunksize`: The number of rows to read in at a time. - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source - files are pdfs) + files are csvs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: @@ -41,6 +41,11 @@ stage_configs: held-out test sets beyond the IID held out set that will be produced (e.g., for prospective datasets, etc.). - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets. + Split fractions can be changed for the default names by adding a hydra-syntax command line argument + for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1 split_fracs.held_out=0.2`. + A split can be removed with the `~` override Hydra syntax. Similarly, a new split name can be added + with the standard Hydra `+` override option. E.g., `~split_fracs.held_out +split_fracs.test=0.1`. It + is the user's responsibility to ensure that split fractions sum to 1. is_metadata: True output_dir: ${cohort_dir} n_patients_per_shard: 50000