Merge pull request #7 from mmcdermott/docstrings
Updated configs and added a resolver to get informative help messages from either the config or the script docstrings.
mmcdermott authored Jun 11, 2024
2 parents e152a17 + f741555 commit 1423051
Showing 4 changed files with 71 additions and 0 deletions.
39 changes: 39 additions & 0 deletions configs/extraction.yaml
@@ -1,5 +1,18 @@
defaults:
- pipeline
- _self_

description: |-
This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select
criteria and converts them to the flattened MEDS format. It can be run in its entirety, with controllable
levels of parallelism, or in stages. Arguments:
- `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines
the events to extract from the various rows of the various input files encountered in the global input
directory.
- `input_dir`: The path to the directory containing the raw input files.
- `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in
various subfolders of this dir depending on the stage, as intermediate stages cache their output during
computation for efficiency of re-running and distributing.
# The event conversion configuration file is used throughout the pipeline to define the events to extract.
event_conversion_config_fp: ???
@@ -12,9 +25,27 @@ stages:

stage_configs:
shard_events:
description: |-
This stage shards the raw input events into smaller files for easier processing. Arguments:
- `row_chunksize`: The number of rows to read in at a time.
- `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source
files are CSVs).
row_chunksize: 200000000
infer_schema_length: 10000
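For intuition, the `row_chunksize` behavior described above can be sketched with a stdlib-only snippet. This is illustrative only: the actual stage reads files with polars, and `iter_row_chunks` is a hypothetical helper, not part of this repository.

```python
import csv
import io


def iter_row_chunks(f, row_chunksize):
    """Yield lists of at most `row_chunksize` parsed rows from a CSV file object."""
    reader = csv.reader(f)
    chunk = []
    for row in reader:
        chunk.append(row)
        if len(chunk) == row_chunksize:
            yield chunk
            chunk = []
    if chunk:  # flush the final, possibly short, chunk
        yield chunk


data = io.StringIO("a,b\n1,2\n3,4\n5,6\n")
print([len(c) for c in iter_row_chunks(data, 2)])  # [2, 2]
```

Reading in bounded chunks keeps memory flat regardless of input file size, which is why the config exposes `row_chunksize` as a tunable.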
split_and_shard_patients:
description: |-
This stage splits the patients into training, tuning, and held-out sets, and further splits those sets
into shards. Arguments:
- `n_patients_per_shard`: The number of patients to include in a shard.
- `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially
held-out test sets beyond the IID held out set that will be produced (e.g., for prospective
datasets, etc.).
- `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets.
Split fractions can be changed for the default names by adding a hydra-syntax command line argument
for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1 split_fracs.held_out=0.2`.
A split can be removed with the `~` override Hydra syntax. Similarly, a new split name can be added
with the standard Hydra `+` override option. E.g., `~split_fracs.held_out +split_fracs.test=0.1`. It
is the user's responsibility to ensure that split fractions sum to 1.
is_metadata: True
output_dir: ${cohort_dir}
n_patients_per_shard: 50000
@@ -24,5 +55,13 @@ stage_configs:
tuning: 0.1
held_out: 0.1
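The split-then-shard behavior that this stage's arguments describe can be sketched as follows. This is an illustration under assumed semantics, not the library's implementation; `split_and_shard` is a hypothetical name.

```python
import random


def split_and_shard(patient_ids, split_fracs, n_patients_per_shard, seed=0):
    """Partition patient IDs into named splits by fraction, then chunk each split into shards."""
    assert abs(sum(split_fracs.values()) - 1.0) < 1e-6, "split fractions must sum to 1"
    rng = random.Random(seed)
    ids = list(patient_ids)
    rng.shuffle(ids)
    out, start = {}, 0
    for i, (name, frac) in enumerate(split_fracs.items()):
        # The last split absorbs any rounding remainder.
        end = len(ids) if i == len(split_fracs) - 1 else start + round(frac * len(ids))
        split_ids = ids[start:end]
        start = end
        out[name] = [
            split_ids[j : j + n_patients_per_shard]
            for j in range(0, len(split_ids), n_patients_per_shard)
        ]
    return out


shards = split_and_shard(range(100), {"train": 0.8, "tuning": 0.1, "held_out": 0.1}, 25)
print({k: [len(s) for s in v] for k, v in shards.items()})
# {'train': [25, 25, 25, 5], 'tuning': [10], 'held_out': [10]}
```

Note the assertion up front: as the description says, ensuring the fractions sum to 1 is the user's responsibility, so a sketch like this fails fast on bad overrides.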
merge_to_MEDS_cohort:
description: |-
This stage merges the patient sub-shards produced by the preceding stages into the final MEDS
cohort. Arguments:
- `output_dir`: The directory to which the final, merged cohort will be written.
- `unique_by`: The column set used to deduplicate rows during the merge; the default `"*"`
deduplicates over all columns.
output_dir: ${cohort_dir}/final_cohort
unique_by: "*"
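The `unique_by: "*"` setting plausibly means rows are deduplicated across all columns when sub-shards are merged. A tiny stdlib illustration of that reading (an assumption; the real stage operates on polars frames):

```python
# Rows as tuples; two sub-shards may contribute identical rows after merging.
rows = [(1, "A", 10), (1, "A", 10), (2, "B", 20)]

# dict.fromkeys de-duplicates while preserving first-seen order, analogous to
# a "unique over all columns" pass.
merged = list(dict.fromkeys(rows))
print(merged)  # [(1, 'A', 10), (2, 'B', 20)]
```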
15 changes: 15 additions & 0 deletions configs/pipeline.yaml
@@ -2,6 +2,10 @@
input_dir: ???
cohort_dir: ???

_default_description: |-
This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline
configuration file.
log_dir: "${cohort_dir}/.logs/${stage}"

# General pipeline variables
@@ -26,3 +30,14 @@ hydra:
dir: "${log_dir}"
sweep:
dir: "${log_dir}"
help:
app_name: "MEDS/${stage}"
template: |-
== ${hydra.help.app_name} ==
${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines.
**Pipeline description:**
${oc.select:description, ${_default_description}}
**Stage description:**
${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}}
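The `${oc.select:...}` interpolations above fall back to a default when the keyed node is missing, which is how the help message prefers a stage-specific description but degrades to the script docstring. A stdlib stand-in for that fallback behavior (the `select` helper here is hypothetical, not OmegaConf's API):

```python
def select(cfg: dict, dotted_key: str, default: str) -> str:
    """Walk a dotted key through nested dicts, returning `default` if any hop is missing."""
    node = cfg
    for part in dotted_key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node if isinstance(node, str) else default


cfg = {"stage_configs": {"shard_events": {"description": "Shards raw events."}}}
print(select(cfg, "stage_configs.shard_events.description", "fallback"))  # Shards raw events.
print(select(cfg, "stage_configs.merge.description", "fallback"))         # fallback
```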
1 change: 1 addition & 0 deletions configs/preprocess.yaml
@@ -1,5 +1,6 @@
defaults:
- pipeline
- _self_

# Global pipeline parameters:
# 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual
16 changes: 16 additions & 0 deletions src/MEDS_polars_functions/utils.py
@@ -1,5 +1,6 @@
"""Core utilities for MEDS pipelines built with these tools."""

import inspect
import os
import sys
from pathlib import Path
@@ -12,6 +13,20 @@
pl.enable_string_cache()


def get_script_docstring() -> str:
"""Returns the docstring of the main function of the script that was called.
Returns:
str: TODO
"""

main_module = sys.modules["__main__"]
func = getattr(main_module, "main", None)
if func and callable(func):
return inspect.getdoc(func) or ""
return ""


def current_script_name() -> str:
"""Returns the name of the script that called this function.
@@ -143,6 +158,7 @@ def populate_stage(
return out


OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False)
OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False)
OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False)

