diff --git a/configs/extraction.yaml b/configs/extraction.yaml index c351951..1a1c0dd 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,5 +1,18 @@ defaults: - pipeline + - _self_ + +description: |- + This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select + criteria and converts them to the flattened, MEDS format. It can be run in its entirety, with controllable + levels of parallelism, or in stages. Arguments: + - `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines + the events to extract from the various rows of the various input files encountered in the global input + directory. + - `input_dir`: The path to the directory containing the raw input files. + - `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in + various subfolders of this dir depending on the stage, as intermediate stages cache their output during + computation for efficiency of re-running and distributing. # The event conversion configuration file is used throughout the pipeline to define the events to extract. event_conversion_config_fp: ??? @@ -12,9 +25,27 @@ stages: stage_configs: shard_events: + description: |- + This stage shards the raw input events into smaller files for easier processing. Arguments: + - `row_chunksize`: The number of rows to read in at a time. + - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source + files are csvs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: + description: |- + This stage splits the patients into training, tuning, and held-out sets, and further splits those sets + into shards. Arguments: + - `n_patients_per_shard`: The number of patients to include in a shard. 
+ - `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially + held-out test sets beyond the IID held out set that will be produced (e.g., for prospective + datasets, etc.). + - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets. + Split fractions can be changed for the default names by adding a hydra-syntax command line argument + for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1 split_fracs.held_out=0.2`. + A split can be removed with the `~` override Hydra syntax. Similarly, a new split name can be added + with the standard Hydra `+` override option. E.g., `~split_fracs.held_out +split_fracs.test=0.1`. It + is the user's responsibility to ensure that split fractions sum to 1. is_metadata: True output_dir: ${cohort_dir} n_patients_per_shard: 50000 @@ -24,5 +55,13 @@ stage_configs: tuning: 0.1 held_out: 0.1 merge_to_MEDS_cohort: + description: |- + This stage merges the per-split, per-shard MEDS event files produced by the preceding stages into + the final MEDS cohort, written to this stage's output directory. Arguments: + - `output_dir`: The directory to which the final, merged MEDS cohort will be written; this defaults + to the `final_cohort` subdirectory of the global cohort directory. + - `unique_by`: The set of columns used to deduplicate rows while merging. The default value, `"*"`, + deduplicates over all columns, so only fully identical rows are collapsed into one. + This stage takes no patient splitting or sharding arguments; those are handled by earlier stages. output_dir: ${cohort_dir}/final_cohort unique_by: "*" diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index be99f84..229ea26 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,6 +2,10 @@ input_dir: ??? cohort_dir: ??? +_default_description: |- + This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline + configuration file. 
+ log_dir: "${cohort_dir}/.logs/${stage}" # General pipeline variables @@ -26,3 +30,14 @@ hydra: dir: "${log_dir}" sweep: dir: "${log_dir}" + help: + app_name: "MEDS/${stage}" + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines. + + **Pipeline description:** + ${oc.select:description, ${_default_description}} + + **Stage description:** + ${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}} diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 9b60579..d65150b 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,5 +1,6 @@ defaults: - pipeline + - _self_ # Global pipeline parameters: # 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index d1e6e09..a307bae 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import inspect import os import sys from pathlib import Path @@ -12,6 +13,20 @@ pl.enable_string_cache() +def get_script_docstring() -> str: + """Returns the docstring of the main function of the script that was called. + + Returns: + str: The docstring of the `main` function of the `__main__` module, or "" if `main` is missing, not callable, or has no docstring. + """ + + main_module = sys.modules["__main__"] + func = getattr(main_module, "main", None) + if func and callable(func): + return inspect.getdoc(func) or "" + return "" + + + def current_script_name() -> str: + """Returns the name of the script that called this function. + @@ -143,6 +158,7 @@ return out +OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False) OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False)