From 7c2e7677c09289b4cf358346b9223d8934d7f98f Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Fri, 31 May 2024 20:04:22 -0400 Subject: [PATCH 1/2] Updated configs and added a resolver to get informative help messages from the right sources --- configs/extraction.yaml | 34 ++++++++++++++++++++++++++++++ configs/pipeline.yaml | 15 +++++++++++++ configs/preprocess.yaml | 1 + src/MEDS_polars_functions/utils.py | 16 ++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index e1e985a..b762894 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -1,5 +1,18 @@ defaults: - pipeline + - _self_ + +description: |- + This pipeline extracts raw MEDS events in longitudinal, sparse form from an input dataset meeting select + criteria and converts them to the flattened, MEDS format. It can be run in its entirety, with controllable + levels of parallelism, or in stages. Arguments: + - `event_conversion_config_fp`: The path to the event conversion configuration file. This file defines + the events to extract from the various rows of the various input files encountered in the global input + directory. + - `input_dir`: The path to the directory containing the raw input files. + - `cohort_dir`: The path to the directory where the output cohort will be written. It will be written in + various subfolders of this dir depending on the stage, as intermediate stages cache their output during + computation for efficiency of re-running and distributing. # The event conversion configuration file is used throughout the pipeline to define the events to extract. event_conversion_config_fp: ??? @@ -12,9 +25,22 @@ stages: stage_configs: shard_events: + description: |- + This stage shards the raw input events into smaller files for easier processing. Arguments: + - `row_chunksize`: The number of rows to read in at a time. 
+ - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source + files are pdfs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: + description: |- + This stage splits the patients into training, tuning, and held-out sets, and further splits those sets + into shards. Arguments: + - `n_patients_per_shard`: The number of patients to include in a shard. + - `external_splits_json_fp`: The path to a json file containing any pre-defined splits for specially + held-out test sets beyond the IID held out set that will be produced (e.g., for prospective + datasets, etc.). + - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets. is_metadata: True output_dir: ${cohort_dir} n_patients_per_shard: 50000 @@ -24,4 +50,12 @@ stage_configs: tuning: 0.1 held_out: 0.1 merge_to_MEDS_cohort: + description: |- + This stage merges the per-input-file, per-patient-shard event files produced by the preceding + stages into a single file per patient shard, yielding the final MEDS cohort. This is the last + stage of the extraction pipeline, and its output constitutes the extracted dataset. Arguments: + - `output_dir`: The directory in which the final, merged MEDS cohort files will be written. This + defaults to the `final_cohort` subdirectory of the global `cohort_dir`, so that the completed + cohort is cleanly separated from the intermediate outputs that earlier stages cache during + computation. output_dir: ${cohort_dir}/final_cohort diff --git a/configs/pipeline.yaml b/configs/pipeline.yaml index 5694e25..857785f 100644 --- a/configs/pipeline.yaml +++ b/configs/pipeline.yaml @@ -2,6 +2,10 @@ input_dir: ??? cohort_dir: ??? +_default_description: |- + This is a MEDS pipeline ETL. Please set a more detailed description at the top of your specific pipeline + configuration file.
+ log_dir: "${cohort_dir}/.logs" # General pipeline variables @@ -26,3 +30,14 @@ hydra: dir: "${log_dir}" sweep: dir: "${log_dir}" + help: + app_name: "MEDS/${stage}" + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool that provides an interface for running MEDS pipelines. + + **Pipeline description:** + ${oc.select:description, ${_default_description}} + + **Stage description:** + ${oc.select:stage_configs.${stage}.description, ${get_script_docstring:}} diff --git a/configs/preprocess.yaml b/configs/preprocess.yaml index 9b60579..d65150b 100644 --- a/configs/preprocess.yaml +++ b/configs/preprocess.yaml @@ -1,5 +1,6 @@ defaults: - pipeline + - _self_ # Global pipeline parameters: # 1. Code modifiers will be used as adjoining parts of the `code` columns during group-bys and eventual diff --git a/src/MEDS_polars_functions/utils.py b/src/MEDS_polars_functions/utils.py index b2fbbb7..11d738a 100644 --- a/src/MEDS_polars_functions/utils.py +++ b/src/MEDS_polars_functions/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import inspect import os import sys from pathlib import Path @@ -12,6 +13,20 @@ pl.enable_string_cache() +def get_script_docstring() -> str: + """Returns the docstring of the main function of the script that was called. + + Returns: + str: The docstring of the `main` function of the `__main__` module, or "" if no such function or docstring exists. + """ + + main_module = sys.modules["__main__"] + func = getattr(main_module, "main", None) + if func and callable(func): + return inspect.getdoc(func) or "" + return "" + + def current_script_name() -> str: """Returns the name of the script that called this function.
@@ -143,6 +158,7 @@ def populate_stage( return out +OmegaConf.register_new_resolver("get_script_docstring", get_script_docstring, replace=False) OmegaConf.register_new_resolver("current_script_name", current_script_name, replace=False) OmegaConf.register_new_resolver("populate_stage", populate_stage, replace=False) From f7415559e3f34a1b370af558bec6501886ad051c Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 11 Jun 2024 09:06:20 -0400 Subject: [PATCH 2/2] Updated some docstrings --- configs/extraction.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configs/extraction.yaml b/configs/extraction.yaml index 41a0f3a..1a1c0dd 100644 --- a/configs/extraction.yaml +++ b/configs/extraction.yaml @@ -29,7 +29,7 @@ stage_configs: This stage shards the raw input events into smaller files for easier processing. Arguments: - `row_chunksize`: The number of rows to read in at a time. - `infer_schema_length`: The number of rows to read in to infer the schema (only used if the source - files are pdfs) + files are csvs) row_chunksize: 200000000 infer_schema_length: 10000 split_and_shard_patients: @@ -41,6 +41,11 @@ stage_configs: held-out test sets beyond the IID held out set that will be produced (e.g., for prospective datasets, etc.). - `split_fracs`: The fraction of patients to include in the IID training, tuning, and held-out sets. + Split fractions can be changed for the default names by adding a hydra-syntax command line argument + for the nested name; e.g., `split_fracs.train=0.7 split_fracs.tuning=0.1 split_fracs.held_out=0.2`. + A split can be removed with the `~` override Hydra syntax. Similarly, a new split name can be added + with the standard Hydra `+` override option. E.g., `~split_fracs.held_out +split_fracs.test=0.1`. It + is the user's responsibility to ensure that split fractions sum to 1. is_metadata: True output_dir: ${cohort_dir} n_patients_per_shard: 50000