diff --git a/MIMIC-IV_Example/README.md b/MIMIC-IV_Example/README.md index 6bf348d0..dbfebf9e 100644 --- a/MIMIC-IV_Example/README.md +++ b/MIMIC-IV_Example/README.md @@ -6,33 +6,34 @@ up from this one). ## Step 0: Installation -Download this repository and install the requirements: -If you want to install via pypi, (note that for now, you still need to copy some files locally even with a -pypi installation, which is covered below, so make sure you are in a suitable directory) use: - ```bash conda create -n MEDS python=3.12 conda activate MEDS -pip install "MEDS_transforms[local_parallelism]" -mkdir MIMIC-IV_Example -cd MIMIC-IV_Example -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/MIMIC-IV_Example/joint_script.sh -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/MIMIC-IV_Example/joint_script_slurm.sh -wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/MIMIC-IV_Example/pre_MEDS.py -chmod +x joint_script.sh -chmod +x joint_script_slurm.sh -chmod +x pre_MEDS.py -cd .. +pip install "MEDS_transforms[local_parallelism,slurm_parallelism]" ``` -If you want to install locally, use: +If you want to profile the time and memory costs of your ETL, also install: `pip install hydra-profiler`. +## Step 0.5: Set-up +Set some environment variables and download the necessary files: ```bash -git clone git@github.com:mmcdermott/MEDS_transforms.git -cd MEDS_transforms -conda create -n MEDS python=3.12 -conda activate MEDS -pip install .[local_parallelism] +export MIMICIV_RAW_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data +export MIMICIV_PRE_MEDS_DIR=??? # set to the directory in which you want to store the pre-MEDS (intermediate) data +export MIMICIV_MEDS_COHORT_DIR=???
# set to the directory in which you want to store the final MEDS cohort + +export VERSION=0.0.6 # or whatever version you want +export URL="https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/$VERSION/MIMIC-IV_Example" + +wget $URL/run.sh +wget $URL/pre_MEDS.py +wget $URL/local_parallelism_runner.yaml +wget $URL/slurm_runner.yaml +mkdir configs +cd configs +wget $URL/configs/extract_MIMIC.yaml +cd .. +chmod +x run.sh +chmod +x pre_MEDS.py ``` ## Step 1: Download MIMIC-IV @@ -46,101 +47,51 @@ the root directory of where the resulting _core data files_ are stored -- e.g., ```bash cd $MIMIC_RAW_DIR -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/d_labitems_to_loinc.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/inputevents_to_rxnorm.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/lab_itemid_to_loinc.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_main.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/meas_chartevents_value.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/numerics-summary.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/outputevents_to_loinc.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_datetimeevents.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/proc_itemid.csv -wget https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map/waveforms-summary.csv +export MIMIC_URL=https://raw.githubusercontent.com/MIT-LCP/mimic-code/v2.4.0/mimic-iv/concepts/concept_map +wget $MIMIC_URL/d_labitems_to_loinc.csv +wget $MIMIC_URL/inputevents_to_rxnorm.csv
+wget $MIMIC_URL/lab_itemid_to_loinc.csv +wget $MIMIC_URL/meas_chartevents_main.csv +wget $MIMIC_URL/meas_chartevents_value.csv +wget $MIMIC_URL/numerics-summary.csv +wget $MIMIC_URL/outputevents_to_loinc.csv +wget $MIMIC_URL/proc_datetimeevents.csv +wget $MIMIC_URL/proc_itemid.csv +wget $MIMIC_URL/waveforms-summary.csv ``` -## Step 2: Run the basic MEDS ETL - -This step contains several sub-steps; luckily, all these substeps can be run via a single script, with the -`joint_script.sh` script which uses the Hydra `joblib` launcher to run things with local parallelism (make -sure you enable this feature by including the `[local_parallelism]` option during installation) or via -`joint_script_slurm.sh` which uses the Hydra `submitit` launcher to run things through slurm (make sure you -enable this feature by including the `[slurm_parallelism]` option during installation). This script entails -several steps: - -### Step 2.1: Get the data ready for base MEDS extraction - -This is a step in a few parts: - -1. Join a few tables by `hadm_id` to get the right times in the right rows for processing. In - particular, we need to join: - - the `hosp/diagnoses_icd` table with the `hosp/admissions` table to get the `dischtime` for each - `hadm_id`. - - the `hosp/drgcodes` table with the `hosp/admissions` table to get the `dischtime` for each `hadm_id`. -2. Convert the subject's static data to a more parseable form. This entails: - - Get the subject's DOB in a format that is usable for MEDS, rather than the integral `anchor_year` and - `anchor_offset` fields. - - Merge the subject's `dod` with the `deathtime` from the `admissions` table. - -After these steps, modified files or symlinks to the original files will be written in a new directory which -will be used as the input to the actual MEDS extraction ETL. We'll use `$MIMICIV_PREMEDS_DIR` to denote this -directory. 
+## Step 2: Run the MEDS ETL -This step is run in the `joint_script.sh` script or the `joint_script_slurm.sh` script, but in either case the -base command that is run is as follows (assumed to be run **not** from this directory but from the -root directory of this repository): +To run the MEDS ETL, run the following command: ```bash -./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir=$MIMICIV_RAW_DIR output_dir=$MIMICIV_PREMEDS_DIR +./run.sh $MIMICIV_RAW_DIR $MIMICIV_PRE_MEDS_DIR $MIMICIV_MEDS_COHORT_DIR do_unzip=true ``` -In practice, on a machine with 150 GB of RAM and 10 cores, this step takes less than 5 minutes in total. +To not unzip the `.csv.gz` files, set `do_unzip=false` instead of `do_unzip=true`. -### Step 2.2: Run the MEDS extraction ETL +To use a specific stage runner file (e.g., to set different parallelism options), you can specify it as an +additional argument: -We will assume you want to output the final MEDS dataset into a directory we'll denote as `$MIMICIV_MEDS_DIR`. -Note this is a different directory than the pre-MEDS directory (though, of course, they can both be -subdirectories of the same root directory). - -This is a step in 4 parts: - -1. Sub-shard the raw files. Run this command as many times simultaneously as you would like to have workers - performing this sub-sharding step. See below for how to automate this parallelism using hydra launchers. - - This step uses the `./scripts/extraction/shard_events.py` script. See `joint_script*.sh` for the expected - format of the command. - -2. Extract and form the subject splits and sub-shards. The `./scripts/extraction/split_and_shard_subjects.py` - script is used for this step. See `joint_script*.sh` for the expected format of the command. - -3. Extract subject sub-shards and convert to MEDS events. The - `./scripts/extraction/convert_to_sharded_events.py` script is used for this step. See `joint_script*.sh` for - the expected format of the command. - -4.
Merge the MEDS events into a single file per subject sub-shard. The - `./scripts/extraction/merge_to_MEDS_cohort.py` script is used for this step. See `joint_script*.sh` for the - expected format of the command. - -5. (Optional) Generate preliminary code statistics and merge to external metadata. This is not performed - currently in the `joint_script*.sh` scripts. - -## Limitations / TO-DOs: - -Currently, some tables are ignored, including: +```bash +export N_WORKERS=5 +./run.sh $MIMICIV_RAW_DIR $MIMICIV_PRE_MEDS_DIR $MIMICIV_MEDS_COHORT_DIR do_unzip=true \ + stage_runner_fp=slurm_runner.yaml +``` -1. `hosp/emar_detail` -2. `hosp/microbiologyevents` -3. `hosp/services` -4. `icu/datetimeevents` -5. `icu/ingredientevents` +The `N_WORKERS` environment variable set before the command controls how many parallel workers should be used +at maximum. -Lots of questions remain about how to appropriately handle times of the data -- e.g., things like HCPCS -events are stored at the level of the _date_, not the _datetime_. How should those be slotted into the -timeline which is otherwise stored at the _datetime_ resolution? +The `slurm_runner.yaml` file (downloaded above) runs each stage across several workers on separate slurm +worker nodes using the `submitit` launcher. _**You will need to customize this file to your own slurm system +so that the partition names are correct before use.**_ The memory and time costs are viable in the current +configuration, but if your nodes are sufficiently different you may need to adjust those as well. -Other questions: +The `local_parallelism_runner.yaml` file (downloaded above) runs each stage via separate processes on the +launching machine. There are no additional arguments needed for this stage beyond the `N_WORKERS` environment +variable and there is nothing to customize in this file. -1. How to handle merging the deathtimes between the hosp table and the subjects table? -2. How to handle the dob nonsense MIMIC has?
+To profile the time and memory costs of your ETL, add the `do_profile=true` flag at the end. ## Notes diff --git a/MIMIC-IV_Example/configs/event_configs.yaml b/MIMIC-IV_Example/configs/event_configs.yaml index 2986a958..0d67a6c6 100644 --- a/MIMIC-IV_Example/configs/event_configs.yaml +++ b/MIMIC-IV_Example/configs/event_configs.yaml @@ -42,7 +42,7 @@ hosp/diagnoses_icd: _metadata: hosp/d_icd_diagnoses: description: "long_title" - parent_codes: "ICD{icd_version}CM/{icd_code}" # Single strings are templates of columns. + parent_codes: "ICD{icd_version}CM/{norm_icd_code}" # Single strings are templates of columns. hosp/drgcodes: drg: @@ -109,7 +109,7 @@ hosp/omr: time: col(chartdate) time_format: "%Y-%m-%d" -hosp/subjects: +hosp/patients: gender: code: - GENDER @@ -165,8 +165,8 @@ hosp/procedures_icd: hosp/d_icd_procedures: description: "long_title" parent_codes: # List of objects are string labels mapping to filters to be evaluated. - - "ICD{icd_version}Proc/{icd_code}": { icd_version: 9 } - - "ICD{icd_version}PCS/{icd_code}": { icd_version: 10 } + - "ICD{icd_version}Proc/{norm_icd_code}": { icd_version: "9" } + - "ICD{icd_version}PCS/{norm_icd_code}": { icd_version: "10" } hosp/transfers: transfer: @@ -303,7 +303,7 @@ icu/inputevents: - KG time: col(starttime) time_format: "%Y-%m-%d %H:%M:%S" - numeric_value: subjectweight + numeric_value: patientweight icu/outputevents: output: diff --git a/MIMIC-IV_Example/configs/extract_MIMIC.yaml b/MIMIC-IV_Example/configs/extract_MIMIC.yaml new file mode 100644 index 00000000..eb9b32ee --- /dev/null +++ b/MIMIC-IV_Example/configs/extract_MIMIC.yaml @@ -0,0 +1,36 @@ +defaults: + - _extract + - _self_ + +description: |- + This pipeline extracts the MIMIC-IV dataset in longitudinal, sparse form from an input dataset meeting + select criteria and converts them to the flattened, MEDS format. 
You can control the key arguments to this + pipeline by setting environment variables: + ```bash + export EVENT_CONVERSION_CONFIG_FP=# Path to your event conversion config + export MIMICIV_PRE_MEDS_DIR=# Path to the output dir of the pre-MEDS step + export MIMICIV_MEDS_COHORT_DIR=# Path to where you want the dataset to live + ``` + +# The event conversion configuration file is used throughout the pipeline to define the events to extract. +event_conversion_config_fp: ${oc.env:EVENT_CONVERSION_CONFIG_FP} + +input_dir: ${oc.env:MIMICIV_PRE_MEDS_DIR} +cohort_dir: ${oc.env:MIMICIV_MEDS_COHORT_DIR} + +etl_metadata: + dataset_name: MIMIC-IV + dataset_version: 2.2 + +stage_configs: + shard_events: + infer_schema_length: 999999999 + +stages: + - shard_events + - split_and_shard_subjects + - convert_to_sharded_events + - merge_to_MEDS_cohort + - extract_code_metadata + - finalize_MEDS_metadata + - finalize_MEDS_data diff --git a/MIMIC-IV_Example/configs/pre_MEDS.yaml b/MIMIC-IV_Example/configs/pre_MEDS.yaml index b5cfa4cb..325903e0 100644 --- a/MIMIC-IV_Example/configs/pre_MEDS.yaml +++ b/MIMIC-IV_Example/configs/pre_MEDS.yaml @@ -1,11 +1,15 @@ -raw_cohort_dir: ??? -output_dir: ??? 
+input_dir: ${oc.env:MIMICIV_RAW_DIR} +cohort_dir: ${oc.env:MIMICIV_PRE_MEDS_DIR} + +do_overwrite: false + +log_dir: ${cohort_dir}/.logs # Hydra hydra: job: name: pre_MEDS_${now:%Y-%m-%d_%H-%M-%S} run: - dir: ${output_dir}/.logs/${hydra.job.name} + dir: ${log_dir} sweep: - dir: ${output_dir}/.logs/${hydra.job.name} + dir: ${log_dir} diff --git a/MIMIC-IV_Example/joint_script.sh b/MIMIC-IV_Example/joint_script.sh deleted file mode 100755 index dd1459c4..00000000 --- a/MIMIC-IV_Example/joint_script.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env bash - -# This makes the script fail if any internal script fails -set -e - -# Function to display help message -function display_help() { - echo "Usage: $0 " - echo - echo "This script processes MIMIC-IV data through several steps, handling raw data conversion," - echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort." - echo - echo "Arguments:" - echo " MIMICIV_RAW_DIR Directory containing raw MIMIC-IV data files." - echo " MIMICIV_PREMEDS_DIR Output directory for pre-MEDS data." - echo " MIMICIV_MEDS_DIR Output directory for processed MEDS data." - echo " N_PARALLEL_WORKERS Number of parallel workers for processing." - echo " (OPTIONAL) do_unzip=true OR do_unzip=false Optional flag to unzip csv files before processing." - echo - echo "Options:" - echo " -h, --help Display this help message and exit." - exit 1 -} - -# Check if the first parameter is '-h' or '--help' -if [[ "$1" == "-h" || "$1" == "--help" ]]; then - display_help -fi - -# Check for mandatory parameters -if [ "$#" -lt 4 ]; then - echo "Error: Incorrect number of arguments provided." 
- display_help -fi - -MIMICIV_RAW_DIR="$1" -MIMICIV_PREMEDS_DIR="$2" -MIMICIV_MEDS_DIR="$3" -N_PARALLEL_WORKERS="$4" - -# Default do_unzip value -DO_UNZIP="false" - -# Check if the 5th argument is either do_unzip=true or do_unzip=false -if [ $# -ge 5 ]; then - case "$5" in - do_unzip=true) - DO_UNZIP="true" - shift 5 - ;; - do_unzip=false) - DO_UNZIP="false" - shift 5 - ;; - do_unzip=*) - echo "Error: Invalid do_unzip value. Use 'do_unzip=true' or 'do_unzip=false'." - exit 1 - ;; - *) - # If the 5th argument is not related to do_unzip, leave it for other_args - shift 4 - ;; - esac -else - shift 4 -fi - -if [ "$DO_UNZIP" == "true" ]; then - echo "Unzipping csv files." - for file in "${MIMICIV_RAW_DIR}"/*/*.csv.gz; do gzip -d --force "$file"; done -else - echo "Skipping unzipping." -fi - -echo "Running pre-MEDS conversion." -./MIMIC-IV_Example/pre_MEDS.py raw_cohort_dir="$MIMICIV_RAW_DIR" output_dir="$MIMICIV_PREMEDS_DIR" - -echo "Running shard_events.py with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-shard_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="shard_events" \ - stage_configs.shard_events.infer_schema_length=999999999 \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Splitting subjects in serial" -MEDS_extract-split_and_shard_subjects \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="split_and_shard_subjects" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-convert_to_sharded_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib 
\ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="convert_to_sharded_events" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-merge_to_MEDS_cohort \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="merge_to_MEDS_cohort" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Aggregating initial code stats with $N_PARALLEL_WORKERS workers in parallel" -MEDS_transform-aggregate_code_metadata \ - --config-name="extract" \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="aggregate_code_metadata" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -# TODO -- make this the pre-meds dir and have the pre-meds script symlink -echo "Collecting code metadata in serial." 
-MEDS_extract-extract_code_metadata \ - input_dir="$MIMICIV_RAW_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="extract_code_metadata" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Finalizing MEDS data with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-finalize_MEDS_data \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=joblib \ - input_dir="$MIMICIV_RAW_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="finalize_MEDS_data" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Finalizing MEDS metadata in serial." -MEDS_extract-finalize_MEDS_metadata \ - input_dir="$MIMICIV_RAW_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="finalize_MEDS_metadata" \ - etl_metadata.dataset_name="MIMIC-IV" \ - etl_metadata.dataset_version="2.2" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/joint_script_slurm.sh b/MIMIC-IV_Example/joint_script_slurm.sh deleted file mode 100755 index e13fb7e9..00000000 --- a/MIMIC-IV_Example/joint_script_slurm.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env bash - -# This makes the script fail if any internal script fails -set -e - -# Function to display help message -function display_help() { - echo "Usage: $0 " - echo - echo "This script processes MIMIC-IV data through several steps, handling raw data conversion," - echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort." - echo "This script uses slurm to process the data in parallel via the 'submitit' Hydra launcher." - echo - echo "Arguments:" - echo " MIMICIV_RAW_DIR Directory containing raw MIMIC-IV data files." - echo " MIMICIV_PREMEDS_DIR Output directory for pre-MEDS data." 
- echo " MIMICIV_MEDS_DIR Output directory for processed MEDS data." - echo " N_PARALLEL_WORKERS Number of parallel workers for processing." - echo - echo "Options:" - echo " -h, --help Display this help message and exit." - exit 1 -} - -# Check if the first parameter is '-h' or '--help' -if [[ "$1" == "-h" || "$1" == "--help" ]]; then - display_help -fi - -# Check for mandatory parameters -if [ "$#" -ne 4 ]; then - echo "Error: Incorrect number of arguments provided." - display_help -fi - -export MIMICIV_RAW_DIR="$1" -export MIMICIV_PREMEDS_DIR="$2" -export MIMICIV_MEDS_DIR="$3" -export N_PARALLEL_WORKERS="$4" - -shift 4 - -# Note we use `--multirun` throughout here due to ensure the submitit launcher is used throughout, so that -# this doesn't fall back on running anything locally in a setting where only slurm worker nodes have -# sufficient computational resources to run the actual jobs. - -echo "Running pre-MEDS conversion on one worker." -./MIMIC-IV_Example/pre_MEDS.py \ - --multirun \ - +worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - raw_cohort_dir="$MIMICIV_RAW_DIR" \ - output_dir="$MIMICIV_PREMEDS_DIR" - -echo "Trying submitit launching with $N_PARALLEL_WORKERS jobs." 
- -MEDS_extract-shard_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - "hydra.job.env_copy=[PATH]" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml \ - stage=shard_events - -echo "Splitting subjects on one worker" -MEDS_extract-split_and_shard_subjects \ - --multirun \ - worker="range(0,1)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Converting to sharded events with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-convert_to_sharded_events \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Merging to a MEDS cohort with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-merge_to_MEDS_cohort \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -echo "Aggregating initial code stats with $N_PARALLEL_WORKERS workers in parallel" 
-MEDS_transform-aggregate_code_metadata \ - --config-name="extract" \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_PREMEDS_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - stage="aggregate_code_metadata" - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" - -# TODO -- make this the pre-meds dir and have the pre-meds script symlink -echo "Collecting code metadata with $N_PARALLEL_WORKERS workers in parallel" -MEDS_extract-extract_code_metadata \ - --multirun \ - worker="range(0,$N_PARALLEL_WORKERS)" \ - hydra/launcher=submitit_slurm \ - hydra.launcher.timeout_min=60 \ - hydra.launcher.cpus_per_task=10 \ - hydra.launcher.mem_gb=50 \ - hydra.launcher.partition="short" \ - input_dir="$MIMICIV_RAW_DIR" \ - cohort_dir="$MIMICIV_MEDS_DIR" \ - event_conversion_config_fp=./MIMIC-IV_Example/configs/event_configs.yaml "$@" diff --git a/MIMIC-IV_Example/local_parallelism_runner.yaml b/MIMIC-IV_Example/local_parallelism_runner.yaml new file mode 100644 index 00000000..a1d9a6c1 --- /dev/null +++ b/MIMIC-IV_Example/local_parallelism_runner.yaml @@ -0,0 +1,3 @@ +parallelize: + n_workers: ${oc.env:N_WORKERS} + launcher: "joblib" diff --git a/MIMIC-IV_Example/pre_MEDS.py b/MIMIC-IV_Example/pre_MEDS.py index b40bb925..846c3a9d 100755 --- a/MIMIC-IV_Example/pre_MEDS.py +++ b/MIMIC-IV_Example/pre_MEDS.py @@ -15,6 +15,156 @@ from MEDS_transforms.utils import get_shard_prefix, hydra_loguru_init, write_lazyframe +def add_dot(code: pl.Expr, position: int) -> pl.Expr: + """Adds a dot to the code expression at the specified position. + + Args: + code: The code expression. + position: The position to add the dot. 
+ + Returns: + The expression which would yield the code string with a dot added at the specified position + + Example: + >>> pl.select(add_dot(pl.lit("12345"), 3)) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 123.45 │ + └─────────┘ + >>> pl.select(add_dot(pl.lit("12345"), 1)) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 1.2345 │ + └─────────┘ + >>> pl.select(add_dot(pl.lit("12345"), 6)) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 12345 │ + └─────────┘ + """ + return ( + pl.when(code.str.len_chars() > position) + .then(code.str.slice(0, position) + "." + code.str.slice(position)) + .otherwise(code) + ) + + +def add_icd_diagnosis_dot(icd_version: pl.Expr, icd_code: pl.Expr) -> pl.Expr: + """Adds the appropriate dot to the ICD diagnosis code based on the version. + + Args: + icd_version: The ICD version. + icd_code: The ICD code. + + Returns: + The ICD code with appropriate dot syntax based on the version.
+ + Examples: + >>> pl.select(add_icd_diagnosis_dot(pl.lit("9"), pl.lit("12345"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 123.45 │ + └─────────┘ + >>> pl.select(add_icd_diagnosis_dot(pl.lit("9"), pl.lit("E1234"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ E123.4 │ + └─────────┘ + >>> pl.select(add_icd_diagnosis_dot(pl.lit("9"), pl.lit("F1234"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ F12.34 │ + └─────────┘ + >>> pl.select(add_icd_diagnosis_dot(pl.lit("10"), pl.lit("12345"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 123.45 │ + └─────────┘ + >>> pl.select(add_icd_diagnosis_dot(pl.lit("10"), pl.lit("E1234"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ E12.34 │ + └─────────┘ + """ + + icd9_code = ( + pl.when(icd_code.str.starts_with("E")).then(add_dot(icd_code, 4)).otherwise(add_dot(icd_code, 3)) + ) + + icd10_code = add_dot(icd_code, 3) + + return pl.when(icd_version == "9").then(icd9_code).otherwise(icd10_code) + + +def add_icd_procedure_dot(icd_version: pl.Expr, icd_code: pl.Expr) -> pl.Expr: + """Adds the appropriate dot to the ICD procedure code based on the version. + + Args: + icd_version: The ICD version. + icd_code: The ICD code. + + Returns: + The ICD code with appropriate dot syntax based on the version. 
+ + Examples: + >>> pl.select(add_icd_procedure_dot(pl.lit("9"), pl.lit("12345"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 12.345 │ + └─────────┘ + >>> pl.select(add_icd_procedure_dot(pl.lit("10"), pl.lit("12345"))) + shape: (1, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ str │ + ╞═════════╡ + │ 12345 │ + └─────────┘ + """ + + icd9_code = add_dot(icd_code, 2) + icd10_code = icd_code + + return pl.when(icd_version == "9").then(icd9_code).otherwise(icd10_code) + + def add_discharge_time_by_hadm_id( df: pl.LazyFrame, discharge_time_df: pl.LazyFrame, out_column_name: str = "hadm_discharge_time" ) -> pl.LazyFrame: @@ -51,31 +201,44 @@ def fix_static_data(raw_static_df: pl.LazyFrame, death_times_df: pl.LazyFrame) - "hosp/patients": (fix_static_data, ("hosp/admissions", ["subject_id", "deathtime"])), } +ICD_DFS_TO_FIX = [ + ("hosp/d_icd_diagnoses", add_icd_diagnosis_dot), + ("hosp/d_icd_procedures", add_icd_procedure_dot), +] + @hydra.main(version_base=None, config_path="configs", config_name="pre_MEDS") def main(cfg: DictConfig): """Performs pre-MEDS data wrangling for MIMIC-IV. - Inputs are the raw MIMIC files, read from the `raw_cohort_dir` config parameter. Output files are either + Inputs are the raw MIMIC files, read from the `input_dir` config parameter. Output files are either symlinked (if they are not modified) or written in processed form to the `MEDS_input_dir` config parameter. Hydra is used to manage configuration parameters and logging. """ hydra_loguru_init() - raw_cohort_dir = Path(cfg.raw_cohort_dir) - MEDS_input_dir = Path(cfg.output_dir) + input_dir = Path(cfg.input_dir) + MEDS_input_dir = Path(cfg.cohort_dir) + + done_fp = MEDS_input_dir / ".done" + if done_fp.is_file() and not cfg.do_overwrite: + logger.info( + f"Pre-MEDS transformation already complete as {done_fp} exists and " + f"do_overwrite={cfg.do_overwrite}. Returning." 
+ ) + exit(0) - all_fps = list(raw_cohort_dir.glob("**/*.*")) + all_fps = list(input_dir.glob("**/*.*")) dfs_to_load = {} seen_fps = {} for in_fp in all_fps: - pfx = get_shard_prefix(raw_cohort_dir, in_fp) + pfx = get_shard_prefix(input_dir, in_fp) try: - fp, read_fn = get_supported_fp(raw_cohort_dir, pfx) + fp, read_fn = get_supported_fp(input_dir, pfx) except FileNotFoundError: logger.info(f"Skipping {pfx} @ {str(in_fp.resolve())} as no compatible dataframe file was found.") continue @@ -88,7 +251,7 @@ def main(cfg: DictConfig): else: seen_fps[str(fp.resolve())] = read_fn - out_fp = MEDS_input_dir / fp.relative_to(raw_cohort_dir) + out_fp = MEDS_input_dir / fp.relative_to(input_dir) if out_fp.is_file(): print(f"Done with {pfx}. Continuing") @@ -96,14 +259,14 @@ def main(cfg: DictConfig): out_fp.parent.mkdir(parents=True, exist_ok=True) - if pfx not in FUNCTIONS: + if pfx not in FUNCTIONS and pfx not in [p for p, _ in ICD_DFS_TO_FIX]: logger.info( f"No function needed for {pfx}: " f"Symlinking {str(fp.resolve())} to {str(out_fp.resolve())}" ) relative_in_fp = fp.relative_to(out_fp.resolve().parent, walk_up=True) out_fp.symlink_to(relative_in_fp) continue - else: + elif pfx in FUNCTIONS: out_fp = MEDS_input_dir / f"{pfx}.parquet" if out_fp.is_file(): print(f"Done with {pfx}. 
Continuing") @@ -130,7 +293,7 @@ def main(cfg: DictConfig): fps = fps_and_cols["fps"] cols = list(fps_and_cols["cols"]) - df_to_load_fp, df_to_load_read_fn = get_supported_fp(raw_cohort_dir, df_to_load_pfx) + df_to_load_fp, df_to_load_read_fn = get_supported_fp(input_dir, df_to_load_pfx) st = datetime.now() @@ -142,7 +305,7 @@ def main(cfg: DictConfig): logger.info(f" Loaded in {datetime.now() - st}") for fp in fps: - pfx = get_shard_prefix(raw_cohort_dir, fp) + pfx = get_shard_prefix(input_dir, fp) out_fp = MEDS_input_dir / f"{pfx}.parquet" logger.info(f" Processing dependent df @ {pfx}...") @@ -156,7 +319,33 @@ def main(cfg: DictConfig): write_lazyframe(processed_df, out_fp) logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - fp_st}") + for pfx, fn in ICD_DFS_TO_FIX: + fp, read_fn = get_supported_fp(input_dir, pfx) + out_fp = MEDS_input_dir / f"{pfx}.parquet" + + if out_fp.is_file(): + print(f"Done with {pfx}. Continuing") + continue + + if fp.suffix != ".parquet": + read_fn = partial(read_fn, infer_schema=False) + + st = datetime.now() + logger.info(f"Processing {pfx}...") + processed_df = ( + read_fn(fp) + .collect() + .with_columns( + fn(pl.col("icd_version").cast(pl.String), pl.col("icd_code").cast(pl.String)).alias( + "norm_icd_code" + ) + ) + ) + processed_df.write_parquet(out_fp, use_pyarrow=True) + logger.info(f" Processed and wrote to {str(out_fp.resolve())} in {datetime.now() - st}") + logger.info(f"Done! 
All dataframes processed and written to {str(MEDS_input_dir.resolve())}") + done_fp.write_text(f"Finished at {datetime.now()}") if __name__ == "__main__":
diff --git a/MIMIC-IV_Example/run.sh b/MIMIC-IV_Example/run.sh
new file mode 100755
index 00000000..9c06c7e9
--- /dev/null
+++ b/MIMIC-IV_Example/run.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# This makes the script fail if any internal script fails
+set -e
+
+# Print the help message, then exit with status $1 (defaults to 1, i.e. a usage error).
+function display_help() {
+    echo "Usage: $0 MIMICIV_RAW_DIR MIMICIV_PRE_MEDS_DIR MIMICIV_MEDS_COHORT_DIR [do_unzip=true|false] [RUNNER_ARGS...]"
+    echo
+    echo "This script processes MIMIC-IV data through several steps, handling raw data conversion,"
+    echo "sharding events, splitting subjects, converting to sharded events, and merging into a MEDS cohort."
+    echo
+    echo "Arguments:"
+    echo " MIMICIV_RAW_DIR Directory containing raw MIMIC-IV data files."
+    echo " MIMICIV_PRE_MEDS_DIR Output directory for pre-MEDS data."
+    echo " MIMICIV_MEDS_COHORT_DIR Output directory for processed MEDS data."
+    echo " (OPTIONAL) do_unzip=true OR do_unzip=false Optional flag to unzip files before processing."
+    echo
+    echo "Options:"
+    echo " -h, --help Display this help message and exit."
+    exit "${1:-1}"
+}
+
+echo "Unsetting SLURM_CPU_BIND in case you're running this on a slurm interactive node with slurm parallelism"
+unset SLURM_CPU_BIND
+
+# An explicit help request ('-h' or '--help') is not an error, so exit 0 after printing the help.
+if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+    display_help 0
+fi
+
+# Check for mandatory parameters
+if [ "$#" -lt 3 ]; then
+    echo "Error: Incorrect number of arguments provided."
+    display_help 1
+fi
+
+export MIMICIV_RAW_DIR=$1
+export MIMICIV_PRE_MEDS_DIR=$2
+export MIMICIV_MEDS_COHORT_DIR=$3
+shift 3
+
+# Defaults
+_DO_UNZIP_ARG_STR=""
+
+if [ $# -ge 1 ]; then
+    case "$1" in
+        do_unzip=*)
+            _DO_UNZIP_ARG_STR="$1"
+            shift 1
+            ;;
+    esac
+fi
+
+DO_UNZIP="false"
+
+if [ -n "$_DO_UNZIP_ARG_STR" ]; then
+    case "$_DO_UNZIP_ARG_STR" in
+        do_unzip=true)
+            DO_UNZIP="true"
+            ;;
+        do_unzip=false)
+            DO_UNZIP="false"
+            ;;
+        *)
+            echo "Error: Invalid do_unzip value. Use 'do_unzip=true' or 'do_unzip=false'."
+            exit 1
+            ;;
+    esac
+    echo "Setting DO_UNZIP=$DO_UNZIP"
+fi
+
+# TODO: Add wget blocks once testing is validated.
+
+EVENT_CONVERSION_CONFIG_FP="$(pwd)/configs/event_configs.yaml"
+PIPELINE_CONFIG_FP="$(pwd)/configs/extract_MIMIC.yaml"
+PRE_MEDS_PY_FP="$(pwd)/pre_MEDS.py"
+
+# We export these variables separately from their assignment so that any errors during assignment are caught.
+export EVENT_CONVERSION_CONFIG_FP
+export PIPELINE_CONFIG_FP
+export PRE_MEDS_PY_FP
+
+if [ "$DO_UNZIP" == "true" ]; then
+    GZ_FILES="${MIMICIV_RAW_DIR}/*/*.csv.gz"
+    if compgen -G "$GZ_FILES" > /dev/null; then
+        echo "Unzipping csv.gz files matching $GZ_FILES."
+        for file in $GZ_FILES; do gzip -d --force "$file"; done  # unquoted on purpose: the glob must expand here
+    else
+        echo "No csv.gz files to unzip at $GZ_FILES."
+    fi
+else
+    echo "Skipping unzipping."
+fi
+
+echo "Running pre-MEDS conversion."
+python "$PRE_MEDS_PY_FP" input_dir="$MIMICIV_RAW_DIR" cohort_dir="$MIMICIV_PRE_MEDS_DIR"
+
+if [ -z "$N_WORKERS" ]; then
+    echo "Setting N_WORKERS to 1 to avoid issues with the runners."
+    export N_WORKERS="1"
+fi
+
+echo "Running extraction pipeline."
+MEDS_transform-runner "pipeline_config_fp=$PIPELINE_CONFIG_FP" "$@" diff --git a/MIMIC-IV_Example/slurm_runner.yaml b/MIMIC-IV_Example/slurm_runner.yaml new file mode 100644 index 00000000..4dbed261 --- /dev/null +++ b/MIMIC-IV_Example/slurm_runner.yaml @@ -0,0 +1,61 @@ +parallelize: + n_workers: ${oc.env:N_WORKERS} + launcher: "submitit_slurm" + +shard_events: + parallelize: + launcher_params: + timeout_min: 50 + cpus_per_task: 10 + mem_gb: 40 + partition: "short" + +split_and_shard_subjects: + parallelize: + n_workers: 1 + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 7 + partition: "short" + +convert_to_sharded_events: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 25 + partition: "short" + +merge_to_MEDS_cohort: + parallelize: + launcher_params: + timeout_min: 15 + cpus_per_task: 10 + mem_gb: 85 + partition: "short" + +extract_code_metadata: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 25 + partition: "short" + +finalize_MEDS_metadata: + parallelize: + n_workers: 1 + launcher_params: + timeout_min: 10 + cpus_per_task: 5 + mem_gb: 10 + partition: "short" + +finalize_MEDS_data: + parallelize: + launcher_params: + timeout_min: 10 + cpus_per_task: 10 + mem_gb: 70 + partition: "short" diff --git a/pyproject.toml b/pyproject.toml index c9f49069..ef352990 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,14 +17,14 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - "polars~=1.1.0", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy", "meds==0.3.2", + "polars~=1.6.0", "pyarrow", "nested_ragged_tensors", "loguru", "hydra-core", "numpy", "meds==0.3.3", ] [tool.setuptools_scm] [project.optional-dependencies] dev = ["pre-commit"] -tests = ["pytest", "pytest-cov", "rootutils"] +tests = ["pytest", "pytest-cov", "rootutils", "hydra-joblib-launcher"] local_parallelism = ["hydra-joblib-launcher"] slurm_parallelism = 
["hydra-submitit-launcher"] docs = [ @@ -60,6 +60,9 @@ MEDS_transform-occlude_outliers = "MEDS_transforms.transforms.occlude_outliers:m MEDS_transform-tensorization = "MEDS_transforms.transforms.tensorization:main" MEDS_transform-tokenization = "MEDS_transforms.transforms.tokenization:main" +# Runner +MEDS_transform-runner = "MEDS_transforms.runner:main" + [project.urls] Homepage = "https://github.com/mmcdermott/MEDS_transforms" Issues = "https://github.com/mmcdermott/MEDS_transforms/issues" diff --git a/src/MEDS_transforms/__init__.py b/src/MEDS_transforms/__init__.py index 8d0ffd6c..2d62ae87 100644 --- a/src/MEDS_transforms/__init__.py +++ b/src/MEDS_transforms/__init__.py @@ -10,11 +10,14 @@ except PackageNotFoundError: # pragma: no cover __version__ = "unknown" -PREPROCESS_CONFIG_YAML = files(__package_name__).joinpath("configs/preprocess.yaml") -EXTRACT_CONFIG_YAML = files(__package_name__).joinpath("configs/extract.yaml") +PREPROCESS_CONFIG_YAML = files(__package_name__).joinpath("configs/_preprocess.yaml") +EXTRACT_CONFIG_YAML = files(__package_name__).joinpath("configs/_extract.yaml") +RUNNER_CONFIG_YAML = files(__package_name__).joinpath("configs/_runner.yaml") MANDATORY_COLUMNS = [subject_id_field, time_field, code_field, "numeric_value"] +RESERVED_CONFIG_NAMES = {c.stem for c in (PREPROCESS_CONFIG_YAML, EXTRACT_CONFIG_YAML, RUNNER_CONFIG_YAML)} + MANDATORY_TYPES = { subject_id_field: pl.Int64, time_field: pl.Datetime("us"), diff --git a/src/MEDS_transforms/configs/extract.yaml b/src/MEDS_transforms/configs/_extract.yaml similarity index 97% rename from src/MEDS_transforms/configs/extract.yaml rename to src/MEDS_transforms/configs/_extract.yaml index 3abd498e..2ee757cd 100644 --- a/src/MEDS_transforms/configs/extract.yaml +++ b/src/MEDS_transforms/configs/_extract.yaml @@ -1,8 +1,9 @@ defaults: - - pipeline + - _pipeline - stage_configs: - shard_events - split_and_shard_subjects + - convert_to_sharded_events - merge_to_MEDS_cohort - extract_code_metadata 
- finalize_MEDS_metadata diff --git a/src/MEDS_transforms/configs/pipeline.yaml b/src/MEDS_transforms/configs/_pipeline.yaml similarity index 100% rename from src/MEDS_transforms/configs/pipeline.yaml rename to src/MEDS_transforms/configs/_pipeline.yaml diff --git a/src/MEDS_transforms/configs/preprocess.yaml b/src/MEDS_transforms/configs/_preprocess.yaml similarity index 98% rename from src/MEDS_transforms/configs/preprocess.yaml rename to src/MEDS_transforms/configs/_preprocess.yaml index dab87a9a..6ebafdc3 100644 --- a/src/MEDS_transforms/configs/preprocess.yaml +++ b/src/MEDS_transforms/configs/_preprocess.yaml @@ -1,5 +1,5 @@ defaults: - - pipeline + - _pipeline - stage_configs: - reshard_to_split - filter_subjects diff --git a/src/MEDS_transforms/configs/_runner.yaml b/src/MEDS_transforms/configs/_runner.yaml new file mode 100644 index 00000000..f8266788 --- /dev/null +++ b/src/MEDS_transforms/configs/_runner.yaml @@ -0,0 +1,31 @@ +# Global IO +pipeline_config_fp: ??? +stage_runner_fp: null + +_local_pipeline_config: ${oc.create:${load_yaml_file:${oc.select:pipeline_config_fp,null}}} +_stage_runners: ${oc.create:${load_yaml_file:${stage_runner_fp}}} + +log_dir: "${_local_pipeline_config.cohort_dir}/.logs" + +_pipeline_description: ${oc.select:_local_pipeline_config.description,"No description provided."} + +do_profile: False + +hydra: + job: + name: "MEDS-transforms_runner_${now:%Y-%m-%d_%H-%M-%S}" + run: + dir: "${log_dir}" + help: + app_name: "MEDS-Transforms Pipeline Runner" + + template: |- + == ${hydra.help.app_name} == + ${hydra.help.app_name} is a command line tool for running entire MEDS-transform pipelines in a single + command. 
+ + ${get_script_docstring:runner} + + **MEDS-transforms Pipeline description:** + + ${_pipeline_description} diff --git a/src/MEDS_transforms/configs/stage_configs/convert_to_sharded_events.yaml b/src/MEDS_transforms/configs/stage_configs/convert_to_sharded_events.yaml new file mode 100644 index 00000000..7ab5c1b8 --- /dev/null +++ b/src/MEDS_transforms/configs/stage_configs/convert_to_sharded_events.yaml @@ -0,0 +1,2 @@ +convert_to_sharded_events: + do_dedup_text_and_numeric: True diff --git a/src/MEDS_transforms/extract/convert_to_sharded_events.py b/src/MEDS_transforms/extract/convert_to_sharded_events.py index 8ac66ac0..39aea54f 100755 --- a/src/MEDS_transforms/extract/convert_to_sharded_events.py +++ b/src/MEDS_transforms/extract/convert_to_sharded_events.py @@ -93,7 +93,11 @@ def get_code_expr(code_field: str | list | ListConfig) -> tuple[pl.Expr, pl.Expr return code_expr, code_null_filter_expr, needed_cols -def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.LazyFrame: +def extract_event( + df: pl.LazyFrame, + event_cfg: dict[str, str | None], + do_dedup_text_and_numeric: bool = False, +) -> pl.LazyFrame: """Extracts a single event dataframe from the raw data. Args: @@ -123,6 +127,8 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy possible, these additional columns should conform to the conventions of the MEDS data schema --- e.g., primary numeric values associated with the event should be named `"numeric_value"` in the output MEDS data (and thus have the key `"numeric_value"` in the `event_cfg` dictionary). + do_dedup_text_and_numeric: If true, the result will ensure that the `text_value` column is dropped if + it is simply a string version of the `numeric_value` column. Returns: A DataFrame containing the event data extracted from the raw data, containing only unique rows across @@ -150,25 +156,27 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy ... 
"code_modifier": ["1", "2", "3", "4"], ... "time": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04"], ... "numeric_value": [1, 2, 3, 4], + ... "woo_text": ["1", "2", "3/10", "4.24"], ... }) >>> event_cfg = { ... "code": ["FOO", "col(code)", "col(code_modifier)"], ... "time": "col(time)", ... "time_format": "%Y-%m-%d", ... "numeric_value": "numeric_value", + ... "text_value": "woo_text", ... } - >>> extract_event(raw_data, event_cfg) - shape: (4, 4) - ┌────────────┬───────────┬─────────────────────┬───────────────┐ - │ subject_id ┆ code ┆ time ┆ numeric_value │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ datetime[μs] ┆ i64 │ - ╞════════════╪═══════════╪═════════════════════╪═══════════════╡ - │ 1 ┆ FOO//A//1 ┆ 2021-01-01 00:00:00 ┆ 1 │ - │ 1 ┆ FOO//B//2 ┆ 2021-01-02 00:00:00 ┆ 2 │ - │ 2 ┆ FOO//C//3 ┆ 2021-01-03 00:00:00 ┆ 3 │ - │ 2 ┆ FOO//D//4 ┆ 2021-01-04 00:00:00 ┆ 4 │ - └────────────┴───────────┴─────────────────────┴───────────────┘ + >>> extract_event(raw_data, event_cfg, do_dedup_text_and_numeric=True) + shape: (4, 5) + ┌────────────┬───────────┬─────────────────────┬───────────────┬────────────┐ + │ subject_id ┆ code ┆ time ┆ numeric_value ┆ text_value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ datetime[μs] ┆ i64 ┆ str │ + ╞════════════╪═══════════╪═════════════════════╪═══════════════╪════════════╡ + │ 1 ┆ FOO//A//1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ null │ + │ 1 ┆ FOO//B//2 ┆ 2021-01-02 00:00:00 ┆ 2 ┆ null │ + │ 2 ┆ FOO//C//3 ┆ 2021-01-03 00:00:00 ┆ 3 ┆ 3/10 │ + │ 2 ┆ FOO//D//4 ┆ 2021-01-04 00:00:00 ┆ 4 ┆ 4.24 │ + └────────────┴───────────┴─────────────────────┴───────────────┴────────────┘ >>> data_with_nulls = pl.DataFrame({ ... "subject_id": [1, 1, 2, 2], ... 
"code": ["A", None, "C", "D"], @@ -484,6 +492,18 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy event_exprs[k] = col + has_numeric = "numeric_value" in event_exprs + has_text = "text_value" in event_exprs + + if do_dedup_text_and_numeric and has_numeric and has_text: + text_expr = event_exprs["text_value"] + num_expr = event_exprs["numeric_value"] + event_exprs["text_value"] = ( + pl.when(text_expr.cast(pl.Float32, strict=False) == num_expr.cast(pl.Float32)) + .then(pl.lit(None, pl.String)) + .otherwise(text_expr) + ) + if code_null_filter_expr is not None: logger.info(f"Filtering out rows with null codes via {code_null_filter_expr}") df = df.filter(code_null_filter_expr) @@ -497,7 +517,9 @@ def extract_event(df: pl.LazyFrame, event_cfg: dict[str, str | None]) -> pl.Lazy def convert_to_events( - df: pl.LazyFrame, event_cfgs: dict[str, dict[str, str | None | Sequence[str]]] + df: pl.LazyFrame, + event_cfgs: dict[str, dict[str, str | None | Sequence[str]]], + do_dedup_text_and_numeric: bool = False, ) -> pl.LazyFrame: """Converts a DataFrame of raw data into a DataFrame of events. 
@@ -656,7 +678,13 @@ def convert_to_events( for event_name, event_cfg in event_cfgs.items(): try: logger.info(f"Building computational graph for extracting {event_name}") - event_dfs.append(extract_event(df, event_cfg)) + event_dfs.append( + extract_event( + df, + event_cfg, + do_dedup_text_and_numeric=do_dedup_text_and_numeric, + ) + ) except Exception as e: raise ValueError(f"Error extracting event {event_name}: {e}") from e @@ -731,6 +759,7 @@ def compute_fn(df: pl.LazyFrame) -> pl.LazyFrame: return convert_to_events( df.filter(pl.col("subject_id").is_in(typed_subjects)), event_cfgs=copy.deepcopy(event_cfgs), + do_dedup_text_and_numeric=cfg.stage_cfg.get("do_dedup_text_and_numeric", False), ) except Exception as e: raise ValueError( diff --git a/src/MEDS_transforms/extract/extract_code_metadata.py b/src/MEDS_transforms/extract/extract_code_metadata.py index 31d883c4..3460cfbd 100644 --- a/src/MEDS_transforms/extract/extract_code_metadata.py +++ b/src/MEDS_transforms/extract/extract_code_metadata.py @@ -386,7 +386,7 @@ def main(cfg: DictConfig): metadata_fp, read_fn = get_supported_fp(raw_input_dir, input_prefix) if metadata_fp.suffix != ".parquet": - read_fn = partial(read_fn, infer_schema_length=999999999) + read_fn = partial(read_fn, infer_schema=False) out_fp = partial_metadata_dir / f"{input_prefix}.parquet" logger.info(f"Extracting metadata from {metadata_fp} and saving to {out_fp}") diff --git a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py index 65549309..a0201803 100755 --- a/src/MEDS_transforms/extract/finalize_MEDS_metadata.py +++ b/src/MEDS_transforms/extract/finalize_MEDS_metadata.py @@ -2,6 +2,7 @@ """Utilities for finalizing the metadata files for extracted MEDS datasets.""" import json +from datetime import datetime from pathlib import Path import hydra @@ -12,11 +13,14 @@ from loguru import logger from meds import __version__ as MEDS_VERSION from meds import ( + 
code_metadata_filepath, code_metadata_schema, + dataset_metadata_filepath, dataset_metadata_schema, held_out_split, subject_id_field, subject_split_schema, + subject_splits_filepath, train_split, tuning_split, ) @@ -150,9 +154,12 @@ def main(cfg: DictConfig): _, _, input_metadata_dir = stage_init(cfg) output_metadata_dir = Path(cfg.stage_cfg.reducer_output_dir) - output_code_metadata_fp = output_metadata_dir / "codes.parquet" - dataset_metadata_fp = output_metadata_dir / "dataset.json" - subject_splits_fp = output_metadata_dir / "subject_splits.parquet" + if output_metadata_dir.parts[-1] != Path(code_metadata_filepath).parts[0]: + raise ValueError(f"Output metadata directory must end in 'metadata'. Got {output_metadata_dir}") + + output_code_metadata_fp = output_metadata_dir.parent / code_metadata_filepath + dataset_metadata_fp = output_metadata_dir.parent / dataset_metadata_filepath + subject_splits_fp = output_metadata_dir.parent / subject_splits_filepath for out_fp in [output_code_metadata_fp, dataset_metadata_fp, subject_splits_fp]: out_fp.parent.mkdir(parents=True, exist_ok=True) @@ -187,6 +194,7 @@ def main(cfg: DictConfig): "etl_name": cfg.etl_metadata.package_name, "etl_version": str(cfg.etl_metadata.package_version), "meds_version": MEDS_VERSION, + "created_at": datetime.now().isoformat(), } jsonschema.validate(instance=dataset_metadata, schema=dataset_metadata_schema) diff --git a/src/MEDS_transforms/mapreduce/mapper.py b/src/MEDS_transforms/mapreduce/mapper.py index 6cc44e85..ade9910a 100644 --- a/src/MEDS_transforms/mapreduce/mapper.py +++ b/src/MEDS_transforms/mapreduce/mapper.py @@ -11,7 +11,7 @@ import hydra import polars as pl from loguru import logger -from meds import subject_id_field +from meds import subject_id_field, subject_splits_filepath from omegaconf import DictConfig, ListConfig from ..parser import is_matcher, matcher_to_expr @@ -621,11 +621,11 @@ def map_over( start = datetime.now() train_only = cfg.stage_cfg.get("train_only", False) - 
split_fp = Path(cfg.input_dir) / "metadata" / "subject_split.parquet" shards, includes_only_train = shard_iterator_fntr(cfg) if train_only: + split_fp = Path(cfg.input_dir) / subject_splits_filepath if includes_only_train: logger.info( f"Processing train split only via shard prefix. Not filtering with {str(split_fp.resolve())}." @@ -636,7 +636,7 @@ def map_over( pl.scan_parquet(split_fp) .filter(pl.col("split") == "train") .select(subject_id_field) - .collect() + .collect()[subject_id_field] .to_list() ) read_fn = read_and_filter_fntr(train_subjects, read_fn) diff --git a/src/MEDS_transforms/mapreduce/utils.py b/src/MEDS_transforms/mapreduce/utils.py index a653d203..716ddc09 100644 --- a/src/MEDS_transforms/mapreduce/utils.py +++ b/src/MEDS_transforms/mapreduce/utils.py @@ -453,10 +453,11 @@ def shard_iterator( >>> includes_only_train False - If it can't find any files, it will return an empty list: + If it can't find any files, it will error: >>> fps, includes_only_train = shard_iterator(cfg) - >>> fps - [] + Traceback (most recent call last): + ... + FileNotFoundError: No shards found in ... with suffix .parquet. Directory contents:... """ input_dir = Path(cfg.stage_cfg.data_input_dir) @@ -474,6 +475,12 @@ def shard_iterator( shard_name = shard_name[: -len(in_suffix)] shards.append(shard_name) + if not shards: + raise FileNotFoundError( + f"No shards found in {input_dir} with suffix {in_suffix}. Directory contents: " + f"{', '.join(str(p.relative_to(input_dir)) for p in input_dir.glob('**/*'))}" + ) + # We initialize this to False and overwrite it if we find dedicated train shards. 
includes_only_train = False diff --git a/src/MEDS_transforms/parser.py b/src/MEDS_transforms/parser.py index 948ca003..3b663f7f 100644 --- a/src/MEDS_transforms/parser.py +++ b/src/MEDS_transforms/parser.py @@ -596,18 +596,6 @@ def cfg_to_expr(cfg: str | ListConfig | DictConfig) -> tuple[pl.Expr, set[str]]: ['34.2', 'bar//2', '34.2'] >>> sorted(cols) ['baz'] - - Note that sometimes coalescing can lead to unexpected results. For example, if the first expression is of - a different type than the second, the second expression may have its type coerced to match the first, - potentially in an unexpected manner. This is also related to some polars, bugs, such as - https://github.com/pola-rs/polars/issues/17773 - >>> cfg = [ - ... {"matcher": {"baz": 2}, "output": {"str": "bar//{baz}"}}, - ... {"literal": 34.8218}, - ... ] - >>> expr, cols = cfg_to_expr(cfg) - >>> data.select(expr.alias("out"))["out"].to_list() - ['34', 'bar//2', '34'] """ structured_expr = parse_col_expr(cfg) return structured_expr_to_pl(structured_expr) diff --git a/src/MEDS_transforms/reshard_to_split.py b/src/MEDS_transforms/reshard_to_split.py index 0fc06fef..deccc49f 100644 --- a/src/MEDS_transforms/reshard_to_split.py +++ b/src/MEDS_transforms/reshard_to_split.py @@ -10,6 +10,7 @@ import hydra import polars as pl from loguru import logger +from meds import subject_id_field, subject_splits_filepath, time_field from omegaconf import DictConfig from MEDS_transforms import PREPROCESS_CONFIG_YAML @@ -60,7 +61,7 @@ def make_new_shards_fn(df: pl.DataFrame, cfg: DictConfig, stage_cfg: DictConfig) splits_map[sp].append(pt_id) return shard_subjects( - subjects=df["subject_id"].to_numpy(), + subjects=df[subject_id_field].to_numpy(), n_subjects_per_shard=stage_cfg.n_subjects_per_shard, external_splits=splits_map, split_fracs_dict=None, @@ -96,7 +97,7 @@ def main(cfg: DictConfig): output_dir = Path(cfg.stage_cfg.output_dir) - splits_file = Path(cfg.input_dir) / "metadata" / "subject_splits.parquet" + 
splits_file = Path(cfg.input_dir) / subject_splits_filepath shards_fp = output_dir / ".shards.json" rwlock_wrap( @@ -139,15 +140,15 @@ def read_fn(input_dir: Path) -> pl.LazyFrame: logger.info(f"Reading shards for {subshard_name} (file names are in the input sharding scheme):") for in_fp, _ in orig_shards_iter: logger.info(f" - {str(in_fp.relative_to(input_dir).resolve())}") - new_df = pl.scan_parquet(in_fp, glob=False).filter(pl.col("subject_id").is_in(subjects)) + new_df = pl.scan_parquet(in_fp, glob=False).filter(pl.col(subject_id_field).is_in(subjects)) if df is None: df = new_df else: - df = df.merge_sorted(new_df, key="subject_id") + df = df.merge_sorted(new_df, key=subject_id_field) return df def compute_fn(df: list[pl.DataFrame]) -> pl.LazyFrame: - return df.sort(by=["subject_id", "time"], maintain_order=True, multithreaded=False) + return df.sort(by=[subject_id_field, time_field], maintain_order=True, multithreaded=False) def write_fn(df: pl.LazyFrame, out_fp: Path) -> None: write_lazyframe(df, out_fp) diff --git a/src/MEDS_transforms/runner.py b/src/MEDS_transforms/runner.py new file mode 100755 index 00000000..e99e014a --- /dev/null +++ b/src/MEDS_transforms/runner.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python +"""This script is a helper utility to run entire pipelines from a single script. + +To do this effectively, this runner functionally takes a "meta configuration" file that contains: + 1. The path to the pipeline configuration file. + 2. Configuration details for how to run each stage of the pipeline, including mappings to the underlying + stage scripts and Hydra launcher configurations for each stage to control parallelism, resources, etc. 
+"""
+
+import importlib
+import subprocess
+from pathlib import Path
+
+import hydra
+import yaml
+from loguru import logger
+from omegaconf import DictConfig, OmegaConf
+
+try:
+    from yaml import CLoader as Loader
+except ImportError:  # pragma: no cover
+    from yaml import Loader
+
+from MEDS_transforms import RESERVED_CONFIG_NAMES, RUNNER_CONFIG_YAML
+from MEDS_transforms.utils import hydra_loguru_init
+
+
+def get_script_from_name(stage_name: str) -> str:
+    """Returns the script name for the given stage name.
+
+    Args:
+        stage_name: The name of the stage.
+
+    Returns:
+        The console-script name (`MEDS_extract-*` or `MEDS_transform-*`); raises ValueError if none is found.
+    """
+
+    try:
+        _ = importlib.import_module(f"MEDS_transforms.extract.{stage_name}")
+        return f"MEDS_extract-{stage_name}"
+    except ImportError:
+        pass
+
+    for pfx in ("MEDS_transforms.transforms", "MEDS_transforms.filters", "MEDS_transforms"):
+        try:
+            _ = importlib.import_module(f"{pfx}.{stage_name}")
+            return f"MEDS_transform-{stage_name}"
+        except ImportError:
+            pass
+
+    raise ValueError(f"Could not find a script for stage {stage_name}.")
+
+
+def get_parallelization_args(
+    parallelization_cfg: dict | DictConfig | None, default_parallelization_cfg: dict | DictConfig
+) -> list[str]:
+    """Builds the Hydra multirun/launcher CLI args for a stage; stage-level keys override the defaults."""
+
+    if parallelization_cfg is None:
+        return []
+
+    if len(parallelization_cfg) == 0 and len(default_parallelization_cfg) == 0:
+        return []
+
+    if "n_workers" in parallelization_cfg:
+        n_workers = parallelization_cfg["n_workers"]
+    elif "n_workers" in default_parallelization_cfg:
+        n_workers = default_parallelization_cfg["n_workers"]
+    else:
+        n_workers = 1
+
+    parallelization_args = [
+        "--multirun",
+        f'worker="range(0,{n_workers})"',
+    ]
+
+    if "launcher" in parallelization_cfg:
+        launcher = parallelization_cfg["launcher"]
+    elif "launcher" in default_parallelization_cfg:
+        launcher = default_parallelization_cfg["launcher"]
+    else:
+        launcher = None
+
+    if launcher is None:  # no launcher anywhere: stage-level launcher_params could never be applied
+        if "launcher_params" in parallelization_cfg:
+            raise ValueError("If launcher_params is provided, launcher must also be provided.")
+
+        return parallelization_args
+
+    parallelization_args.append(f"hydra/launcher={launcher}")
+
+    if "launcher_params" in parallelization_cfg:
+        launcher_params = parallelization_cfg["launcher_params"]
+    elif "launcher_params" in default_parallelization_cfg:
+        launcher_params = default_parallelization_cfg["launcher_params"]
+    else:
+        launcher_params = {}
+
+    for k, v in launcher_params.items():
+        parallelization_args.append(f"hydra.launcher.{k}={v}")
+
+    return parallelization_args
+
+
+def run_stage(cfg: DictConfig, stage_name: str, default_parallelization_cfg: dict | DictConfig | None = None):
+    """Runs a single stage of the pipeline.
+
+    Args:
+        cfg: The configuration for the entire pipeline.
+        stage_name: The name of the stage to run.
+    """
+
+    if default_parallelization_cfg is None:
+        default_parallelization_cfg = {}
+
+    do_profile = cfg.get("do_profile", False)
+    pipeline_config_fp = Path(cfg.pipeline_config_fp)
+    stage_config = cfg._local_pipeline_config.get("stage_configs", {}).get(stage_name, {})
+    stage_runner_config = cfg._stage_runners.get(stage_name, {})
+
+    script = None
+    if "script" in stage_runner_config:
+        script = stage_runner_config.script
+    elif "_script" in stage_config:
+        script = stage_config._script
+    else:
+        script = get_script_from_name(stage_name)
+
+    command_parts = [
+        script,
+        f"--config-dir={str(pipeline_config_fp.parent.resolve())}",
+        f"--config-name={pipeline_config_fp.stem}",
+        "'hydra.searchpath=[pkg://MEDS_transforms.configs]'",
+        f"stage={stage_name}",
+    ]
+
+    parallelization_args = get_parallelization_args(
+        stage_runner_config.get("parallelize", {}), default_parallelization_cfg
+    )
+
+    if parallelization_args:
+        multirun = parallelization_args.pop(0)
+        command_parts = command_parts[:3] + [multirun] + command_parts[3:] + parallelization_args
+
+    if do_profile:
+        
command_parts.append("++hydra.callbacks.profiler._target_=hydra_profiler.profiler.ProfilerCallback") + + full_cmd = " ".join(command_parts) + logger.info(f"Running command: {full_cmd}") + command_out = subprocess.run(full_cmd, shell=True, capture_output=True) + + # https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging + # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.parse + + stderr = command_out.stderr.decode() + stdout = command_out.stdout.decode() + logger.info(f"Command output:\n{stdout}") + logger.info(f"Command error:\n{stderr}") + + if command_out.returncode != 0: + raise ValueError( + f"Stage {stage_name} failed via {full_cmd} with return code {command_out.returncode}." + ) + + +@hydra.main( + version_base=None, config_path=str(RUNNER_CONFIG_YAML.parent), config_name=RUNNER_CONFIG_YAML.stem +) +def main(cfg: DictConfig): + """Runs the entire pipeline, end-to-end, based on the configuration provided. + + This script will launch many subsidiary commands via `subprocess`, one for each stage of the specified + pipeline. + """ + + hydra_loguru_init() + + pipeline_config_fp = Path(cfg.pipeline_config_fp) + if not pipeline_config_fp.exists(): + raise FileNotFoundError(f"Pipeline configuration file {pipeline_config_fp} does not exist.") + if not pipeline_config_fp.suffix == ".yaml": + raise ValueError(f"Pipeline configuration file {pipeline_config_fp} must have a .yaml extension.") + if pipeline_config_fp.stem in RESERVED_CONFIG_NAMES: + raise ValueError( + f"Pipeline configuration file {pipeline_config_fp} must not have a name in " + f"{RESERVED_CONFIG_NAMES}." 
+ ) + + pipeline_config = load_yaml_file(cfg.pipeline_config_fp) + stages = pipeline_config.get("stages", []) + if not stages: + raise ValueError("Pipeline configuration must specify at least one stage.") + + log_dir = Path(cfg.log_dir) + + if cfg.get("do_profile", False): + try: + import hydra_profiler # noqa: F401 + except ImportError as e: + raise ValueError( + "You can't run in profiling mode without installing hydra-profiler. Try installing " + "MEDS-transforms with the 'profiler' optional dependency: " + "`pip install MEDS-transforms[profiler]`." + ) from e + + global_done_file = log_dir / "_all_stages.done" + if global_done_file.exists(): + logger.info("All stages are already complete. Exiting.") + return + + if "parallelize" in cfg._stage_runners: + default_parallelization_cfg = cfg._stage_runners.parallelize + elif "parallelize" in cfg: + default_parallelization_cfg = cfg.parallelize + else: + default_parallelization_cfg = None + + for stage in stages: + done_file = log_dir / f"{stage}.done" + + if done_file.exists(): + logger.info(f"Skipping stage {stage} as it is already complete.") + else: + logger.info(f"Running stage: {stage}") + run_stage(cfg, stage, default_parallelization_cfg=default_parallelization_cfg) + done_file.touch() + + global_done_file.touch() + + +def load_yaml_file(path: str | None) -> dict | DictConfig: + if not path: + return {} + + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"File {path} does not exist.") + + try: + return OmegaConf.load(path) + except Exception as e: + logger.warning(f"Failed to load {path} as an OmegaConf: {e}. 
Trying as a plain YAML file.") + yaml_text = path.read_text() + return yaml.load(yaml_text, Loader=Loader) + + +OmegaConf.register_new_resolver("load_yaml_file", load_yaml_file, replace=True) + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/src/MEDS_transforms/utils.py b/src/MEDS_transforms/utils.py index b62f7d12..871b90a0 100644 --- a/src/MEDS_transforms/utils.py +++ b/src/MEDS_transforms/utils.py @@ -1,5 +1,6 @@ """Core utilities for MEDS pipelines built with these tools.""" +import importlib import inspect import os import sys @@ -108,10 +109,13 @@ def get_package_version() -> str: return package_version -def get_script_docstring() -> str: +def get_script_docstring(filename: str | None = None) -> str: """Returns the docstring of the main function of the script from which this function was called.""" - main_module = sys.modules["__main__"] + if filename is not None: + main_module = importlib.import_module(f"MEDS_transforms.{filename}") + else: + main_module = sys.modules["__main__"] func = getattr(main_module, "main", None) if func and callable(func): return inspect.getdoc(func) or "" diff --git a/tests/MEDS_Extract/test_convert_to_sharded_events.py b/tests/MEDS_Extract/test_convert_to_sharded_events.py index 074e897d..653ce737 100644 --- a/tests/MEDS_Extract/test_convert_to_sharded_events.py +++ b/tests/MEDS_Extract/test_convert_to_sharded_events.py @@ -89,6 +89,7 @@ time: col(vitals_date) time_format: "%m/%d/%Y, %H:%M:%S" numeric_value: temp + text_value: temp _metadata: input_metadata: description: {"title": {"lab_code": "temp"}} @@ -102,109 +103,215 @@ "held_out/0": [1500733], } +WANT_OUTPUTS_NO_DEDUP = parse_shards_yaml( + """ +data/train/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221765 + 239684,"12/28/1980, 00:00:00",DOB, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 
+data/train/1/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 00:00:00",DOB, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + +data/tuning/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + +data/held_out/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + +data/train/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC,, + 239684,"05/11/2010, 17:41:51",HR,102.6, + 239684,"05/11/2010, 17:41:51",TEMP,96.0,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1, + 239684,"05/11/2010, 17:48:48",TEMP,96.2,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4, + 239684,"05/11/2010, 18:25:35",TEMP,95.8,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6, + 239684,"05/11/2010, 18:57:18",TEMP,95.5,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE,, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, + 1195293,"06/20/2010, 19:25:32",HR,114.1, + 1195293,"06/20/2010, 19:25:32",TEMP,100.0,100.0 + 1195293,"06/20/2010, 20:12:31",HR,112.5, + 1195293,"06/20/2010, 20:12:31",TEMP,99.8,99.8 + 1195293,"06/20/2010, 20:50:04",DISCHARGE,, + +data/train/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, + 1195293,"06/20/2010, 19:23:52",HR,109.0, + 1195293,"06/20/2010, 19:23:52",TEMP,100.0,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8, + 1195293,"06/20/2010, 19:45:19",TEMP,99.9,99.9 + 1195293,"06/20/2010, 20:24:44",HR,107.7, + 1195293,"06/20/2010, 20:24:44",TEMP,100.0,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5, + 1195293,"06/20/2010, 
20:41:33",TEMP,100.4,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE,, + +data/train/1/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY,, + 68729,"05/26/2010, 02:30:56",HR,86.0, + 68729,"05/26/2010, 02:30:56",TEMP,97.8,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE,, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC,, + 814703,"02/05/2010, 05:55:39",HR,170.2, + 814703,"02/05/2010, 05:55:39",TEMP,100.1,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE,, + +data/train/1/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + +data/tuning/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY,, + 754281,"01/03/2010, 06:27:59",HR,142.0, + 754281,"01/03/2010, 06:27:59",TEMP,99.8,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE,, + +data/tuning/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + +data/held_out/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, + 1500733,"06/03/2010, 16:20:49",HR,90.1, + 1500733,"06/03/2010, 16:20:49",TEMP,100.1,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE,, + +data/held_out/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, + 1500733,"06/03/2010, 14:54:38",HR,91.4, + 1500733,"06/03/2010, 14:54:38",TEMP,100.0,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4, + 1500733,"06/03/2010, 15:39:49",TEMP,100.3,100.3 + 1500733,"06/03/2010, 16:44:26",DISCHARGE,, + """ +) + WANT_OUTPUTS = parse_shards_yaml( """ - data/train/0/subjects/[0-6).parquet: |-2 - subject_id,time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 
- 1195293,"06/20/1978, 00:00:00",DOB, - - data/train/1/subjects/[0-6).parquet: |-2 - subject_id,time,code,numeric_value - 68729,,EYE_COLOR//HAZEL, - 68729,,HEIGHT,160.3953106166676 - 68729,"03/09/1978, 00:00:00",DOB, - 814703,,EYE_COLOR//HAZEL, - 814703,,HEIGHT,156.48559093209357 - 814703,"03/28/1976, 00:00:00",DOB, - - data/tuning/0/subjects/[0-6).parquet: |-2 - subject_id,time,code,numeric_value - 754281,,EYE_COLOR//BROWN, - 754281,,HEIGHT,166.22261567137025 - 754281,"12/19/1988, 00:00:00",DOB, - - data/held_out/0/subjects/[0-6).parquet: |-2 - subject_id,time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, - - data/train/0/admit_vitals/[0-10).parquet: |-2 - subject_id,time,code,numeric_value - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, - - data/train/0/admit_vitals/[10-16).parquet: |-2 - subject_id,time,code,numeric_value - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 
20:50:04",DISCHARGE, - - data/train/1/admit_vitals/[0-10).parquet: |-2 - subject_id,time,code,numeric_value - 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, - 68729,"05/26/2010, 02:30:56",HR,86.0 - 68729,"05/26/2010, 02:30:56",TEMP,97.8 - 68729,"05/26/2010, 04:51:52",DISCHARGE, - 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, - 814703,"02/05/2010, 05:55:39",HR,170.2 - 814703,"02/05/2010, 05:55:39",TEMP,100.1 - 814703,"02/05/2010, 07:02:30",DISCHARGE, - - data/train/1/admit_vitals/[10-16).parquet: |-2 - subject_id,time,code,numeric_value - - data/tuning/0/admit_vitals/[0-10).parquet: |-2 - subject_id,time,code,numeric_value - 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, - 754281,"01/03/2010, 06:27:59",HR,142.0 - 754281,"01/03/2010, 06:27:59",TEMP,99.8 - 754281,"01/03/2010, 08:22:13",DISCHARGE, - - data/tuning/0/admit_vitals/[10-16).parquet: |-2 - subject_id,time,code,numeric_value - - data/held_out/0/admit_vitals/[0-10).parquet: |-2 - subject_id,time,code,numeric_value - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, - - data/held_out/0/admit_vitals/[10-16).parquet: |-2 - subject_id,time,code,numeric_value - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +data/train/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221765 + 239684,"12/28/1980, 00:00:00",DOB, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + +data/train/1/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 
00:00:00",DOB, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + +data/tuning/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + +data/held_out/0/subjects/[0-6).parquet: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + +data/train/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC,, + 239684,"05/11/2010, 17:41:51",HR,102.6, + 239684,"05/11/2010, 17:41:51",TEMP,96.0, + 239684,"05/11/2010, 17:48:48",HR,105.1, + 239684,"05/11/2010, 17:48:48",TEMP,96.2, + 239684,"05/11/2010, 18:25:35",HR,113.4, + 239684,"05/11/2010, 18:25:35",TEMP,95.8, + 239684,"05/11/2010, 18:57:18",HR,112.6, + 239684,"05/11/2010, 18:57:18",TEMP,95.5, + 239684,"05/11/2010, 19:27:19",DISCHARGE,, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, + 1195293,"06/20/2010, 19:25:32",HR,114.1, + 1195293,"06/20/2010, 19:25:32",TEMP,100.0, + 1195293,"06/20/2010, 20:12:31",HR,112.5, + 1195293,"06/20/2010, 20:12:31",TEMP,99.8, + 1195293,"06/20/2010, 20:50:04",DISCHARGE,, + +data/train/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, + 1195293,"06/20/2010, 19:23:52",HR,109.0, + 1195293,"06/20/2010, 19:23:52",TEMP,100.0, + 1195293,"06/20/2010, 19:45:19",HR,119.8, + 1195293,"06/20/2010, 19:45:19",TEMP,99.9, + 1195293,"06/20/2010, 20:24:44",HR,107.7, + 1195293,"06/20/2010, 20:24:44",TEMP,100.0, + 1195293,"06/20/2010, 20:41:33",HR,107.5, + 1195293,"06/20/2010, 20:41:33",TEMP,100.4, + 1195293,"06/20/2010, 20:50:04",DISCHARGE,, + +data/train/1/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY,, + 
68729,"05/26/2010, 02:30:56",HR,86.0, + 68729,"05/26/2010, 02:30:56",TEMP,97.8, + 68729,"05/26/2010, 04:51:52",DISCHARGE,, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC,, + 814703,"02/05/2010, 05:55:39",HR,170.2, + 814703,"02/05/2010, 05:55:39",TEMP,100.1, + 814703,"02/05/2010, 07:02:30",DISCHARGE,, + +data/train/1/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + +data/tuning/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY,, + 754281,"01/03/2010, 06:27:59",HR,142.0, + 754281,"01/03/2010, 06:27:59",TEMP,99.8, + 754281,"01/03/2010, 08:22:13",DISCHARGE,, + +data/tuning/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + +data/held_out/0/admit_vitals/[0-10).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, + 1500733,"06/03/2010, 16:20:49",HR,90.1, + 1500733,"06/03/2010, 16:20:49",TEMP,100.1, + 1500733,"06/03/2010, 16:44:26",DISCHARGE,, + +data/held_out/0/admit_vitals/[10-16).parquet: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, + 1500733,"06/03/2010, 14:54:38",HR,91.4, + 1500733,"06/03/2010, 14:54:38",TEMP,100.0, + 1500733,"06/03/2010, 15:39:49",HR,84.4, + 1500733,"06/03/2010, 15:39:49",TEMP,100.3, + 1500733,"06/03/2010, 16:44:26",DISCHARGE,, """ ) @@ -213,7 +320,7 @@ def test_convert_to_sharded_events(): single_stage_tester( script=CONVERT_TO_SHARDED_EVENTS_SCRIPT, stage_name="convert_to_sharded_events", - stage_kwargs=None, + stage_kwargs={"do_dedup_text_and_numeric": True}, config_name="extract", input_files={ "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), @@ -225,5 +332,25 @@ def test_convert_to_sharded_events(): event_conversion_config_fp="{input_dir}/event_cfgs.yaml", shards_map_fp="{input_dir}/metadata/.shards.json", want_outputs=WANT_OUTPUTS, + 
test_name="Stage tester: convert_to_sharded_events ; with dedup", + df_check_kwargs={"check_row_order": False, "check_column_order": False, "check_dtypes": False}, + ) + + single_stage_tester( + script=CONVERT_TO_SHARDED_EVENTS_SCRIPT, + stage_name="convert_to_sharded_events", + stage_kwargs={"do_dedup_text_and_numeric": False}, + config_name="extract", + input_files={ + "data/subjects/[0-6).parquet": pl.read_csv(StringIO(SUBJECTS_CSV)), + "data/admit_vitals/[0-10).parquet": pl.read_csv(StringIO(ADMIT_VITALS_0_10_CSV)), + "data/admit_vitals/[10-16).parquet": pl.read_csv(StringIO(ADMIT_VITALS_10_16_CSV)), + "event_cfgs.yaml": EVENT_CFGS_YAML, + "metadata/.shards.json": SHARDS_JSON, + }, + event_conversion_config_fp="{input_dir}/event_cfgs.yaml", + shards_map_fp="{input_dir}/metadata/.shards.json", + want_outputs=WANT_OUTPUTS_NO_DEDUP, + test_name="Stage tester: convert_to_sharded_events ; no dedup", df_check_kwargs={"check_row_order": False, "check_column_order": False, "check_dtypes": False}, ) diff --git a/tests/MEDS_Extract/test_extract.py b/tests/MEDS_Extract/test_extract.py index 954ca21e..96aeba4d 100644 --- a/tests/MEDS_Extract/test_extract.py +++ b/tests/MEDS_Extract/test_extract.py @@ -6,11 +6,13 @@ import json import tempfile +from datetime import datetime from io import StringIO from pathlib import Path import polars as pl from meds import __version__ as MEDS_VERSION +from meds import code_metadata_filepath, dataset_metadata_filepath, subject_splits_filepath from tests.MEDS_Extract import ( CONVERT_TO_SHARDED_EVENTS_SCRIPT, @@ -597,7 +599,7 @@ def test_extraction(): full_stdout = "\n".join(all_stdouts) # Check code metadata - output_file = MEDS_cohort_dir / "metadata" / "codes.parquet" + output_file = MEDS_cohort_dir / code_metadata_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_df = pl.read_parquet(output_file, glob=False, use_pyarrow=True) @@ -620,16 +622,23 @@ def test_extraction(): ) 
# Check dataset metadata - output_file = MEDS_cohort_dir / "metadata" / "dataset.json" + output_file = MEDS_cohort_dir / dataset_metadata_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_json = json.loads(output_file.read_text()) assert "etl_version" in got_json, "Expected 'etl_version' to be in the dataset metadata." got_json.pop("etl_version") # We don't test this as it changes with the commits. + + assert "created_at" in got_json, "Expected 'created_at' to be in the dataset metadata." + created_at_obs = got_json.pop("created_at") + as_dt = datetime.fromisoformat(created_at_obs) + assert as_dt < datetime.now(), f"Expected 'created_at' to be before now, got {created_at_obs}." + created_ago = datetime.now() - as_dt + assert created_ago.total_seconds() < 5 * 60, "Expected 'created_at' to be within 5 minutes of now." + assert got_json == MEDS_OUTPUT_DATASET_METADATA_JSON, f"Dataset metadata differs: {got_json}" - # Check the splits parquet - output_file = MEDS_cohort_dir / "metadata" / "subject_splits.parquet" + output_file = MEDS_cohort_dir / subject_splits_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_df = pl.read_parquet(output_file, glob=False, use_pyarrow=True) diff --git a/tests/MEDS_Extract/test_extract_code_metadata.py b/tests/MEDS_Extract/test_extract_code_metadata.py index e93bc346..7700426f 100644 --- a/tests/MEDS_Extract/test_extract_code_metadata.py +++ b/tests/MEDS_Extract/test_extract_code_metadata.py @@ -12,80 +12,80 @@ INPUT_SHARDS = parse_shards_yaml( """ - data/train/0: |-2 - subject_id,time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR//bpm,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR//bpm,105.1 - 239684,"05/11/2010, 
17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR//bpm,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR//bpm,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00",DOB, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR//bpm,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:25:32",HR//bpm,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR//bpm,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:12:31",HR//bpm,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:24:44",HR//bpm,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR//bpm,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, - - - data/train/1: |-2 - subject_id,time,code,numeric_value - 68729,,EYE_COLOR//HAZEL, - 68729,,HEIGHT,160.3953106166676 - 68729,"03/09/1978, 00:00:00",DOB, - 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, - 68729,"05/26/2010, 02:30:56",HR//bpm,86.0 - 68729,"05/26/2010, 02:30:56",TEMP,97.8 - 68729,"05/26/2010, 04:51:52",DISCHARGE, - 814703,,EYE_COLOR//HAZEL, - 814703,,HEIGHT,156.48559093209357 - 814703,"03/28/1976, 00:00:00",DOB, - 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, - 814703,"02/05/2010, 05:55:39",HR//bpm,170.2 - 814703,"02/05/2010, 05:55:39",TEMP,100.1 - 814703,"02/05/2010, 07:02:30",DISCHARGE, - - data/tuning/0: |-2 - subject_id,time,code,numeric_value - 754281,,EYE_COLOR//BROWN, - 754281,,HEIGHT,166.22261567137025 - 754281,"12/19/1988, 00:00:00",DOB, - 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, - 754281,"01/03/2010, 06:27:59",HR//bpm,142.0 - 754281,"01/03/2010, 06:27:59",TEMP,99.8 - 754281,"01/03/2010, 08:22:13",DISCHARGE, - - data/held_out/0: |-2 - 
subject_id,time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR//bpm,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR//bpm,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:20:49",HR//bpm,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +data/train/0: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR//bpm,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR//bpm,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR//bpm,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR//bpm,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR//bpm,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32",HR//bpm,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR//bpm,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31",HR//bpm,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44",HR//bpm,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR//bpm,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, + + +data/train/1: |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 
00:00:00",DOB, + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, + 68729,"05/26/2010, 02:30:56",HR//bpm,86.0 + 68729,"05/26/2010, 02:30:56",TEMP,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, + 814703,"02/05/2010, 05:55:39",HR//bpm,170.2 + 814703,"02/05/2010, 05:55:39",TEMP,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE, + +data/tuning/0: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, + 754281,"01/03/2010, 06:27:59",HR//bpm,142.0 + 754281,"01/03/2010, 06:27:59",TEMP,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE, + +data/held_out/0: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR//bpm,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR//bpm,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49",HR//bpm,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, """ ) diff --git a/tests/MEDS_Extract/test_extract_no_metadata.py b/tests/MEDS_Extract/test_extract_no_metadata.py index 0fa8eec8..a8d83a01 100644 --- a/tests/MEDS_Extract/test_extract_no_metadata.py +++ b/tests/MEDS_Extract/test_extract_no_metadata.py @@ -6,11 +6,13 @@ import json import tempfile +from datetime import datetime from io import StringIO from pathlib import Path import polars as pl from meds import __version__ as MEDS_VERSION +from meds import code_metadata_filepath, dataset_metadata_filepath, subject_splits_filepath from tests.MEDS_Extract import ( CONVERT_TO_SHARDED_EVENTS_SCRIPT, @@ -569,7 +571,7 @@ def 
test_extraction(): full_stdout = "\n".join(all_stdouts) # Check code metadata - output_file = MEDS_cohort_dir / "metadata" / "codes.parquet" + output_file = MEDS_cohort_dir / code_metadata_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_df = pl.read_parquet(output_file, glob=False, use_pyarrow=True) @@ -592,16 +594,24 @@ def test_extraction(): ) # Check dataset metadata - output_file = MEDS_cohort_dir / "metadata" / "dataset.json" + output_file = MEDS_cohort_dir / dataset_metadata_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_json = json.loads(output_file.read_text()) assert "etl_version" in got_json, "Expected 'etl_version' to be in the dataset metadata." got_json.pop("etl_version") # We don't test this as it changes with the commits. + + assert "created_at" in got_json, "Expected 'created_at' to be in the dataset metadata." + created_at_obs = got_json.pop("created_at") + as_dt = datetime.fromisoformat(created_at_obs) + assert as_dt < datetime.now(), f"Expected 'created_at' to be before now, got {created_at_obs}." + created_ago = datetime.now() - as_dt + assert created_ago.total_seconds() < 5 * 60, "Expected 'created_at' to be within 5 minutes of now." 
+ assert got_json == MEDS_OUTPUT_DATASET_METADATA_JSON, f"Dataset metadata differs: {got_json}" # Check the splits parquet - output_file = MEDS_cohort_dir / "metadata" / "subject_splits.parquet" + output_file = MEDS_cohort_dir / subject_splits_filepath assert output_file.is_file(), f"Expected {output_file} to exist: stderr:\n{stderr}\nstdout:\n{stdout}" got_df = pl.read_parquet(output_file, glob=False, use_pyarrow=True) diff --git a/tests/MEDS_Extract/test_finalize_MEDS_data.py b/tests/MEDS_Extract/test_finalize_MEDS_data.py index d9a3e0ad..3348d121 100644 --- a/tests/MEDS_Extract/test_finalize_MEDS_data.py +++ b/tests/MEDS_Extract/test_finalize_MEDS_data.py @@ -11,81 +11,81 @@ INPUT_SHARDS = parse_shards_yaml( """ - data/train/0: |-2 - subject_id,time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00",DOB, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 
20:50:04",DISCHARGE, +data/train/0: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, - data/train/1: |-2 - subject_id,time,code,numeric_value - 68729,,EYE_COLOR//HAZEL, - 68729,,HEIGHT,160.3953106166676 - 68729,"03/09/1978, 00:00:00",DOB, - 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, - 68729,"05/26/2010, 02:30:56",HR,86.0 - 68729,"05/26/2010, 02:30:56",TEMP,97.8 - 68729,"05/26/2010, 04:51:52",DISCHARGE, - 814703,,EYE_COLOR//HAZEL, - 814703,,HEIGHT,156.48559093209357 - 814703,"03/28/1976, 00:00:00",DOB, - 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, - 814703,"02/05/2010, 05:55:39",HR,170.2 - 814703,"02/05/2010, 05:55:39",TEMP,100.1 - 814703,"02/05/2010, 07:02:30",DISCHARGE, +data/train/1: |-2 + subject_id,time,code,numeric_value + 
68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 00:00:00",DOB, + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, + 68729,"05/26/2010, 02:30:56",HR,86.0 + 68729,"05/26/2010, 02:30:56",TEMP,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, + 814703,"02/05/2010, 05:55:39",HR,170.2 + 814703,"02/05/2010, 05:55:39",TEMP,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE, - data/tuning/0: |-2 - subject_id,time,code,numeric_value - 754281,,EYE_COLOR//BROWN, - 754281,,HEIGHT,166.22261567137025 - 754281,"12/19/1988, 00:00:00",DOB, - 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, - 754281,"01/03/2010, 06:27:59",HR,142.0 - 754281,"01/03/2010, 06:27:59",TEMP,99.8 - 754281,"01/03/2010, 08:22:13",DISCHARGE, +data/tuning/0: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, + 754281,"01/03/2010, 06:27:59",HR,142.0 + 754281,"01/03/2010, 06:27:59",TEMP,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE, - data/held_out/0: |-2 - subject_id,time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, - """ +data/held_out/0: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 
14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, +""" ) WANT_OUTPUTS = { diff --git a/tests/MEDS_Extract/test_finalize_MEDS_metadata.py b/tests/MEDS_Extract/test_finalize_MEDS_metadata.py index 274997f4..fd926055 100644 --- a/tests/MEDS_Extract/test_finalize_MEDS_metadata.py +++ b/tests/MEDS_Extract/test_finalize_MEDS_metadata.py @@ -5,8 +5,11 @@ """ +from datetime import datetime + import polars as pl from meds import __version__ as MEDS_VERSION +from meds import code_metadata_filepath, dataset_metadata_filepath, subject_splits_filepath from MEDS_transforms.utils import get_package_version as get_meds_transform_version from tests.MEDS_Extract import FINALIZE_METADATA_SCRIPT @@ -49,27 +52,41 @@ } ) + +def want_dataset_metadata(got: dict): + want_known = { + "dataset_name": "TEST", + "dataset_version": "1.0", + "etl_name": "MEDS_transforms", + "etl_version": get_meds_transform_version(), + "meds_version": MEDS_VERSION, + } + + assert "created_at" in got, "Expected 'created_at' to be in the dataset metadata." + created_at_obs = got.pop("created_at") + as_dt = datetime.fromisoformat(created_at_obs) + assert as_dt < datetime.now(), f"Expected 'created_at' to be before now, got {created_at_obs}." + created_ago = datetime.now() - as_dt + assert created_ago.total_seconds() < 5 * 60, "Expected 'created_at' to be within 5 minutes of now." + + assert got == want_known, f"Expected dataset metadata (less created at) to be {want_known}, got {got}." 
+ + WANT_OUTPUTS = { - "metadata/codes": ( + code_metadata_filepath: ( METADATA_DF.with_columns( pl.col("code").cast(pl.String), pl.col("description").cast(pl.String), pl.col("parent_codes").cast(pl.List(pl.String)), ).select(["code", "description", "parent_codes"]) ), - "metadata/subject_splits": pl.DataFrame( + subject_splits_filepath: pl.DataFrame( { "subject_id": [239684, 1195293, 68729, 814703, 754281, 1500733], "split": ["train", "train", "train", "train", "tuning", "held_out"], } ), - "metadata/dataset.json": { - "dataset_name": "TEST", - "dataset_version": "1.0", - "etl_name": "MEDS_transforms", - "etl_version": get_meds_transform_version(), - "meds_version": MEDS_VERSION, - }, + dataset_metadata_filepath: want_dataset_metadata, } diff --git a/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py b/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py index b9a21ffb..74688043 100644 --- a/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py +++ b/tests/MEDS_Extract/test_merge_to_MEDS_cohort.py @@ -66,186 +66,186 @@ INPUT_SHARDS = parse_shards_yaml( """ - data/train/0/subjects/[0-6): |-2 - subject_id,time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00",DOB, +data/train/0/subjects/[0-6): |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, - data/train/1/subjects/[0-6): |-2 - subject_id,time,code,numeric_value - 68729,,EYE_COLOR//HAZEL, - 68729,,HEIGHT,160.3953106166676 - 68729,"03/09/1978, 00:00:00",DOB, - 814703,,EYE_COLOR//HAZEL, - 814703,,HEIGHT,156.48559093209357 - 814703,"03/28/1976, 00:00:00",DOB, +data/train/1/subjects/[0-6): |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 
68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 00:00:00",DOB, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, - data/tuning/0/subjects/[0-6): |-2 - subject_id,time,code,numeric_value - 754281,,EYE_COLOR//BROWN, - 754281,,HEIGHT,166.22261567137025 - 754281,"12/19/1988, 00:00:00",DOB, +data/tuning/0/subjects/[0-6): |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, - data/held_out/0/subjects/[0-6): |-2 - subject_id,time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, +data/held_out/0/subjects/[0-6): |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, - data/train/0/admit_vitals/[0-10): |-2 - subject_id,time,code,numeric_value - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, +data/train/0/admit_vitals/[0-10): |-2 + subject_id,time,code,numeric_value + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 
239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, - data/train/0/admit_vitals/[10-16): |-2 - subject_id,time,code,numeric_value - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, +data/train/0/admit_vitals/[10-16): |-2 + subject_id,time,code,numeric_value + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, - data/train/1/admit_vitals/[0-10): |-2 - subject_id,time,code,numeric_value - 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, - 68729,"05/26/2010, 02:30:56",HR,86.0 - 68729,"05/26/2010, 02:30:56",TEMP,97.8 - 68729,"05/26/2010, 04:51:52",DISCHARGE, - 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, - 814703,"02/05/2010, 05:55:39",HR,170.2 - 814703,"02/05/2010, 05:55:39",TEMP,100.1 - 814703,"02/05/2010, 07:02:30",DISCHARGE, +data/train/1/admit_vitals/[0-10): |-2 + subject_id,time,code,numeric_value + 68729,"05/26/2010, 
02:30:56",ADMISSION//PULMONARY, + 68729,"05/26/2010, 02:30:56",HR,86.0 + 68729,"05/26/2010, 02:30:56",TEMP,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, + 814703,"02/05/2010, 05:55:39",HR,170.2 + 814703,"02/05/2010, 05:55:39",TEMP,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE, - data/train/1/admit_vitals/[10-16): |-2 - subject_id,time,code,numeric_value +data/train/1/admit_vitals/[10-16): |-2 + subject_id,time,code,numeric_value - data/tuning/0/admit_vitals/[0-10): |-2 - subject_id,time,code,numeric_value - 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, - 754281,"01/03/2010, 06:27:59",HR,142.0 - 754281,"01/03/2010, 06:27:59",TEMP,99.8 - 754281,"01/03/2010, 08:22:13",DISCHARGE, +data/tuning/0/admit_vitals/[0-10): |-2 + subject_id,time,code,numeric_value + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, + 754281,"01/03/2010, 06:27:59",HR,142.0 + 754281,"01/03/2010, 06:27:59",TEMP,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE, - data/tuning/0/admit_vitals/[10-16): |-2 - subject_id,time,code,numeric_value +data/tuning/0/admit_vitals/[10-16): |-2 + subject_id,time,code,numeric_value - data/held_out/0/admit_vitals/[0-10): |-2 - subject_id,time,code,numeric_value - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +data/held_out/0/admit_vitals/[0-10): |-2 + subject_id,time,code,numeric_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, - data/held_out/0/admit_vitals/[10-16): |-2 - subject_id,time,code,numeric_value - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 
- 1500733,"06/03/2010, 16:44:26",DISCHARGE, +data/held_out/0/admit_vitals/[10-16): |-2 + subject_id,time,code,numeric_value + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, """ ) WANT_OUTPUTS = parse_shards_yaml( """ - data/train/0: |-2 - subject_id,time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00",DOB, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, +data/train/0: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 
239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, - data/train/1: |-2 - subject_id,time,code,numeric_value - 68729,,EYE_COLOR//HAZEL, - 68729,,HEIGHT,160.3953106166676 - 68729,"03/09/1978, 00:00:00",DOB, - 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, - 68729,"05/26/2010, 02:30:56",HR,86.0 - 68729,"05/26/2010, 02:30:56",TEMP,97.8 - 68729,"05/26/2010, 04:51:52",DISCHARGE, - 814703,,EYE_COLOR//HAZEL, - 814703,,HEIGHT,156.48559093209357 - 814703,"03/28/1976, 00:00:00",DOB, - 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, - 814703,"02/05/2010, 05:55:39",HR,170.2 - 814703,"02/05/2010, 05:55:39",TEMP,100.1 - 814703,"02/05/2010, 07:02:30",DISCHARGE, +data/train/1: |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 68729,"03/09/1978, 00:00:00",DOB, + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, + 68729,"05/26/2010, 02:30:56",HR,86.0 + 68729,"05/26/2010, 02:30:56",TEMP,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE, + 
814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, + 814703,"02/05/2010, 05:55:39",HR,170.2 + 814703,"02/05/2010, 05:55:39",TEMP,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE, - data/tuning/0: |-2 - subject_id,time,code,numeric_value - 754281,,EYE_COLOR//BROWN, - 754281,,HEIGHT,166.22261567137025 - 754281,"12/19/1988, 00:00:00",DOB, - 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, - 754281,"01/03/2010, 06:27:59",HR,142.0 - 754281,"01/03/2010, 06:27:59",TEMP,99.8 - 754281,"01/03/2010, 08:22:13",DISCHARGE, +data/tuning/0: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, + 754281,"01/03/2010, 06:27:59",HR,142.0 + 754281,"01/03/2010, 06:27:59",TEMP,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE, - data/held_out/0: |-2 - subject_id,time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +data/held_out/0: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, """ 
) diff --git a/tests/MEDS_Transforms/test_extract_values.py b/tests/MEDS_Transforms/test_extract_values.py index 4114b3ba..83a2aa3d 100644 --- a/tests/MEDS_Transforms/test_extract_values.py +++ b/tests/MEDS_Transforms/test_extract_values.py @@ -9,71 +9,71 @@ INPUT_SHARDS = parse_shards_yaml( """ - train/0: |-2 - subject_id,time,code,numeric_value,text_value - 239684,,EYE_COLOR//BROWN,, - 239684,"12/28/1980, 00:00:00",DOB,, - 239684,"05/11/2010, 17:41:51",BP,,"120/80" - 1195293,,EYE_COLOR//BLUE,, - 1195293,"06/20/1978, 00:00:00",DOB,, - 1195293,"06/20/2010, 19:23:52",BP,,"144/96" - 1195293,"06/20/2010, 19:23:52",HR,80, - 1195293,"06/20/2010, 19:23:52",TEMP,,"100F" - train/1: |-2 - subject_id,time,code,numeric_value,text_value - 68729,,EYE_COLOR//HAZEL,, - 68729,"03/09/1978, 00:00:00",DOB,, - 814703,"02/05/2010, 05:55:39",HR,170.2, - tuning/0: |-2 - subject_id,time,code,numeric_value,text_value - 754281,,EYE_COLOR//BROWN,, - 754281,"12/19/1988, 00:00:00",DOB,, - 754281,"01/03/2010, 06:27:59",HR,142.0, - 754281,"06/20/2010, 20:23:50",BP,,"134/76" - 754281,"06/20/2010, 21:00:02",TEMP,,"36.2C" - held_out/0: |-2 - subject_id,time,code,numeric_value,text_value - 1500733,,EYE_COLOR//BROWN,, - 1500733,"07/20/1986, 00:00:00",DOB,, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",BP,,"123/82" +train/0: |-2 + subject_id,time,code,numeric_value,text_value + 239684,,EYE_COLOR//BROWN,, + 239684,"12/28/1980, 00:00:00",DOB,, + 239684,"05/11/2010, 17:41:51",BP,,"120/80" + 1195293,,EYE_COLOR//BLUE,, + 1195293,"06/20/1978, 00:00:00",DOB,, + 1195293,"06/20/2010, 19:23:52",BP,,"144/96" + 1195293,"06/20/2010, 19:23:52",HR,80, + 1195293,"06/20/2010, 19:23:52",TEMP,,"100F" +train/1: |-2 + subject_id,time,code,numeric_value,text_value + 68729,,EYE_COLOR//HAZEL,, + 68729,"03/09/1978, 00:00:00",DOB,, + 814703,"02/05/2010, 05:55:39",HR,170.2, +tuning/0: |-2 + subject_id,time,code,numeric_value,text_value + 754281,,EYE_COLOR//BROWN,, + 754281,"12/19/1988, 
00:00:00",DOB,, + 754281,"01/03/2010, 06:27:59",HR,142.0, + 754281,"06/20/2010, 20:23:50",BP,,"134/76" + 754281,"06/20/2010, 21:00:02",TEMP,,"36.2C" +held_out/0: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,,EYE_COLOR//BROWN,, + 1500733,"07/20/1986, 00:00:00",DOB,, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",BP,,"123/82" """ ) WANT_SHARDS = parse_shards_yaml( """ - train/0: |-2 - subject_id,time,code,numeric_value,text_value - 239684,,EYE_COLOR//BROWN,, - 239684,"12/28/1980, 00:00:00",DOB,, - 239684,"05/11/2010, 17:41:51",BP//SYSTOLIC,120, - 239684,"05/11/2010, 17:41:51",BP//DIASTOLIC,80, - 1195293,,EYE_COLOR//BLUE,, - 1195293,"06/20/1978, 00:00:00",DOB,, - 1195293,"06/20/2010, 19:23:52",BP//SYSTOLIC,144, - 1195293,"06/20/2010, 19:23:52",BP//DIASTOLIC,96, - 1195293,"06/20/2010, 19:23:52",TEMP//F,100, - 1195293,"06/20/2010, 19:23:52",HR,80, - train/1: |-2 - subject_id,time,code,numeric_value,text_value - 68729,,EYE_COLOR//HAZEL,, - 68729,"03/09/1978, 00:00:00",DOB,, - 814703,"02/05/2010, 05:55:39",HR,170.2, - tuning/0: |-2 - subject_id,time,code,numeric_value,text_value - 754281,,EYE_COLOR//BROWN,, - 754281,"12/19/1988, 00:00:00",DOB,, - 754281,"01/03/2010, 06:27:59",HR,142.0, - 754281,"06/20/2010, 20:23:50",BP//SYSTOLIC,134, - 754281,"06/20/2010, 20:23:50",BP//DIASTOLIC,76, - 754281,"06/20/2010, 21:00:02",TEMP//C,36.2, - held_out/0: |-2 - subject_id,time,code,numeric_value,text_value - 1500733,,EYE_COLOR//BROWN,, - 1500733,"07/20/1986, 00:00:00",DOB,, - 1500733,"06/03/2010, 14:54:38",BP//SYSTOLIC,123, - 1500733,"06/03/2010, 14:54:38",BP//DIASTOLIC,82, - 1500733,"06/03/2010, 14:54:38",HR,91.4, +train/0: |-2 + subject_id,time,code,numeric_value,text_value + 239684,,EYE_COLOR//BROWN,, + 239684,"12/28/1980, 00:00:00",DOB,, + 239684,"05/11/2010, 17:41:51",BP//SYSTOLIC,120, + 239684,"05/11/2010, 17:41:51",BP//DIASTOLIC,80, + 1195293,,EYE_COLOR//BLUE,, + 1195293,"06/20/1978, 00:00:00",DOB,, + 1195293,"06/20/2010, 
19:23:52",BP//SYSTOLIC,144, + 1195293,"06/20/2010, 19:23:52",BP//DIASTOLIC,96, + 1195293,"06/20/2010, 19:23:52",TEMP//F,100, + 1195293,"06/20/2010, 19:23:52",HR,80, +train/1: |-2 + subject_id,time,code,numeric_value,text_value + 68729,,EYE_COLOR//HAZEL,, + 68729,"03/09/1978, 00:00:00",DOB,, + 814703,"02/05/2010, 05:55:39",HR,170.2, +tuning/0: |-2 + subject_id,time,code,numeric_value,text_value + 754281,,EYE_COLOR//BROWN,, + 754281,"12/19/1988, 00:00:00",DOB,, + 754281,"01/03/2010, 06:27:59",HR,142.0, + 754281,"06/20/2010, 20:23:50",BP//SYSTOLIC,134, + 754281,"06/20/2010, 20:23:50",BP//DIASTOLIC,76, + 754281,"06/20/2010, 21:00:02",TEMP//C,36.2, +held_out/0: |-2 + subject_id,time,code,numeric_value,text_value + 1500733,,EYE_COLOR//BROWN,, + 1500733,"07/20/1986, 00:00:00",DOB,, + 1500733,"06/03/2010, 14:54:38",BP//SYSTOLIC,123, + 1500733,"06/03/2010, 14:54:38",BP//DIASTOLIC,82, + 1500733,"06/03/2010, 14:54:38",HR,91.4, """ ) diff --git a/tests/MEDS_Transforms/test_multi_stage_preprocess_pipeline.py b/tests/MEDS_Transforms/test_multi_stage_preprocess_pipeline.py index 6667313f..7a1b1a75 100644 --- a/tests/MEDS_Transforms/test_multi_stage_preprocess_pipeline.py +++ b/tests/MEDS_Transforms/test_multi_stage_preprocess_pipeline.py @@ -81,150 +81,150 @@ # After filtering out subjects with fewer than 5 events: WANT_FILTER = parse_shards_yaml( f""" - "filter_subjects/train/0": |-2 - {subject_id_field},time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00",DOB, - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,,EYE_COLOR//BLUE, - 
1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00",DOB, - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, +"filter_subjects/train/0": |-2 + {subject_id_field},time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, 
- "filter_subjects/train/1": |-2 - {subject_id_field},time,code,numeric_value +"filter_subjects/train/1": |-2 + {subject_id_field},time,code,numeric_value - "filter_subjects/tuning/0": |-2 - {subject_id_field},time,code,numeric_value +"filter_subjects/tuning/0": |-2 + {subject_id_field},time,code,numeric_value - "filter_subjects/held_out/0": |-2 - {subject_id_field},time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00",DOB, - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +"filter_subjects/held_out/0": |-2 + {subject_id_field},time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, """ ) WANT_TIME_DERIVED = parse_shards_yaml( f""" - "add_time_derived_measurements/train/0": |-2 - {subject_id_field},time,code,numeric_value - 239684,,EYE_COLOR//BROWN, - 239684,,HEIGHT,175.271115221764 - 239684,"12/28/1980, 00:00:00","TIME_OF_DAY//[00,06)", - 239684,"12/28/1980, 00:00:00",DOB, - 239684,"05/11/2010, 17:41:51","TIME_OF_DAY//[12,18)", - 239684,"05/11/2010, 17:41:51",AGE,29.36883360091833 - 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, - 239684,"05/11/2010, 17:41:51",HR,102.6 - 239684,"05/11/2010, 17:41:51",TEMP,96.0 - 239684,"05/11/2010, 17:48:48","TIME_OF_DAY//[12,18)", - 
239684,"05/11/2010, 17:48:48",AGE,29.36884681513314 - 239684,"05/11/2010, 17:48:48",HR,105.1 - 239684,"05/11/2010, 17:48:48",TEMP,96.2 - 239684,"05/11/2010, 18:25:35","TIME_OF_DAY//[18,24)", - 239684,"05/11/2010, 18:25:35",AGE,29.36891675223647 - 239684,"05/11/2010, 18:25:35",HR,113.4 - 239684,"05/11/2010, 18:25:35",TEMP,95.8 - 239684,"05/11/2010, 18:57:18","TIME_OF_DAY//[18,24)", - 239684,"05/11/2010, 18:57:18",AGE,29.36897705595538 - 239684,"05/11/2010, 18:57:18",HR,112.6 - 239684,"05/11/2010, 18:57:18",TEMP,95.5 - 239684,"05/11/2010, 19:27:19","TIME_OF_DAY//[18,24)", - 239684,"05/11/2010, 19:27:19",AGE,29.369034127420306 - 239684,"05/11/2010, 19:27:19",DISCHARGE, - 1195293,,EYE_COLOR//BLUE, - 1195293,,HEIGHT,164.6868838269085 - 1195293,"06/20/1978, 00:00:00","TIME_OF_DAY//[00,06)", - 1195293,"06/20/1978, 00:00:00",DOB, - 1195293,"06/20/2010, 19:23:52","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 19:23:52",AGE,32.002896271955265 - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, - 1195293,"06/20/2010, 19:23:52",HR,109.0 - 1195293,"06/20/2010, 19:23:52",TEMP,100.0 - 1195293,"06/20/2010, 19:25:32","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 19:25:32",AGE,32.00289944083172 - 1195293,"06/20/2010, 19:25:32",HR,114.1 - 1195293,"06/20/2010, 19:25:32",TEMP,100.0 - 1195293,"06/20/2010, 19:45:19","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 19:45:19",AGE,32.00293705539522 - 1195293,"06/20/2010, 19:45:19",HR,119.8 - 1195293,"06/20/2010, 19:45:19",TEMP,99.9 - 1195293,"06/20/2010, 20:12:31","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 20:12:31",AGE,32.002988771458945 - 1195293,"06/20/2010, 20:12:31",HR,112.5 - 1195293,"06/20/2010, 20:12:31",TEMP,99.8 - 1195293,"06/20/2010, 20:24:44","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 20:24:44",AGE,32.00301199932335 - 1195293,"06/20/2010, 20:24:44",HR,107.7 - 1195293,"06/20/2010, 20:24:44",TEMP,100.0 - 1195293,"06/20/2010, 20:41:33","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 20:41:33",AGE,32.003043973286765 - 
1195293,"06/20/2010, 20:41:33",HR,107.5 - 1195293,"06/20/2010, 20:41:33",TEMP,100.4 - 1195293,"06/20/2010, 20:50:04","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 20:50:04",AGE,32.00306016624544 - 1195293,"06/20/2010, 20:50:04",DISCHARGE, +"add_time_derived_measurements/train/0": |-2 + {subject_id_field},time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00","TIME_OF_DAY//[00,06)", + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51","TIME_OF_DAY//[12,18)", + 239684,"05/11/2010, 17:41:51",AGE,29.36883360091833 + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48","TIME_OF_DAY//[12,18)", + 239684,"05/11/2010, 17:48:48",AGE,29.36884681513314 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35","TIME_OF_DAY//[18,24)", + 239684,"05/11/2010, 18:25:35",AGE,29.36891675223647 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18","TIME_OF_DAY//[18,24)", + 239684,"05/11/2010, 18:57:18",AGE,29.36897705595538 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19","TIME_OF_DAY//[18,24)", + 239684,"05/11/2010, 19:27:19",AGE,29.369034127420306 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00","TIME_OF_DAY//[00,06)", + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 19:23:52",AGE,32.002896271955265 + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 19:25:32",AGE,32.00289944083172 + 
1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 19:45:19",AGE,32.00293705539522 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 20:12:31",AGE,32.002988771458945 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 20:24:44",AGE,32.00301199932335 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 20:41:33",AGE,32.003043973286765 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 20:50:04",AGE,32.00306016624544 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, - "add_time_derived_measurements/train/1": |-2 - {subject_id_field},time,code,numeric_value +"add_time_derived_measurements/train/1": |-2 + {subject_id_field},time,code,numeric_value - "add_time_derived_measurements/tuning/0": |-2 - {subject_id_field},time,code,numeric_value +"add_time_derived_measurements/tuning/0": |-2 + {subject_id_field},time,code,numeric_value - "add_time_derived_measurements/held_out/0": |-2 - {subject_id_field},time,code,numeric_value - 1500733,,EYE_COLOR//BROWN, - 1500733,,HEIGHT,158.60131573580904 - 1500733,"07/20/1986, 00:00:00","TIME_OF_DAY//[00,06)", - 1500733,"07/20/1986, 00:00:00",DOB, - 1500733,"06/03/2010, 14:54:38","TIME_OF_DAY//[12,18)", - 1500733,"06/03/2010, 14:54:38",AGE,23.873531791091356 - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, - 1500733,"06/03/2010, 14:54:38",HR,91.4 - 1500733,"06/03/2010, 14:54:38",TEMP,100.0 - 1500733,"06/03/2010, 15:39:49","TIME_OF_DAY//[12,18)", - 1500733,"06/03/2010, 
15:39:49",AGE,23.873617699332012 - 1500733,"06/03/2010, 15:39:49",HR,84.4 - 1500733,"06/03/2010, 15:39:49",TEMP,100.3 - 1500733,"06/03/2010, 16:20:49","TIME_OF_DAY//[12,18)", - 1500733,"06/03/2010, 16:20:49",AGE,23.873695653692767 - 1500733,"06/03/2010, 16:20:49",HR,90.1 - 1500733,"06/03/2010, 16:20:49",TEMP,100.1 - 1500733,"06/03/2010, 16:44:26","TIME_OF_DAY//[12,18)", - 1500733,"06/03/2010, 16:44:26",AGE,23.873740556672114 - 1500733,"06/03/2010, 16:44:26",DISCHARGE, +"add_time_derived_measurements/held_out/0": |-2 + {subject_id_field},time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00","TIME_OF_DAY//[00,06)", + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38","TIME_OF_DAY//[12,18)", + 1500733,"06/03/2010, 14:54:38",AGE,23.873531791091356 + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49","TIME_OF_DAY//[12,18)", + 1500733,"06/03/2010, 15:39:49",AGE,23.873617699332012 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49","TIME_OF_DAY//[12,18)", + 1500733,"06/03/2010, 16:20:49",AGE,23.873695653692767 + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26","TIME_OF_DAY//[12,18)", + 1500733,"06/03/2010, 16:44:26",AGE,23.873740556672114 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, """ ) @@ -389,93 +389,93 @@ WANT_OCCLUDE_OUTLIERS = parse_shards_yaml( f""" - "occlude_outliers/train/0": |-2 - {subject_id_field},time,code,numeric_value,numeric_value/is_inlier - 239684,,EYE_COLOR//BROWN,, - 239684,,HEIGHT,,false - 239684,"12/28/1980, 00:00:00","TIME_OF_DAY//[00,06)",, - 239684,"12/28/1980, 00:00:00",DOB,, - 239684,"05/11/2010, 17:41:51","TIME_OF_DAY//[12,18)",, - 239684,"05/11/2010, 17:41:51",AGE,,false - 239684,"05/11/2010, 
17:41:51",ADMISSION//CARDIAC,, - 239684,"05/11/2010, 17:41:51",HR,,false - 239684,"05/11/2010, 17:41:51",TEMP,,false - 239684,"05/11/2010, 17:48:48","TIME_OF_DAY//[12,18)",, - 239684,"05/11/2010, 17:48:48",AGE,,false - 239684,"05/11/2010, 17:48:48",HR,,false - 239684,"05/11/2010, 17:48:48",TEMP,,false - 239684,"05/11/2010, 18:25:35","TIME_OF_DAY//[18,24)",, - 239684,"05/11/2010, 18:25:35",AGE,,false - 239684,"05/11/2010, 18:25:35",HR,113.4,true - 239684,"05/11/2010, 18:25:35",TEMP,,false - 239684,"05/11/2010, 18:57:18","TIME_OF_DAY//[18,24)",, - 239684,"05/11/2010, 18:57:18",AGE,,false - 239684,"05/11/2010, 18:57:18",HR,112.6,true - 239684,"05/11/2010, 18:57:18",TEMP,,false - 239684,"05/11/2010, 19:27:19","TIME_OF_DAY//[18,24)",, - 239684,"05/11/2010, 19:27:19",AGE,,false - 239684,"05/11/2010, 19:27:19",DISCHARGE,, - 1195293,,EYE_COLOR//BLUE,, - 1195293,,HEIGHT,,false - 1195293,"06/20/1978, 00:00:00","TIME_OF_DAY//[00,06)",, - 1195293,"06/20/1978, 00:00:00",DOB,, - 1195293,"06/20/2010, 19:23:52","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 19:23:52",AGE,32.002896271955265,true - 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, - 1195293,"06/20/2010, 19:23:52",HR,109.0,true - 1195293,"06/20/2010, 19:23:52",TEMP,100.0,true - 1195293,"06/20/2010, 19:25:32","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 19:25:32",AGE,32.00289944083172,true - 1195293,"06/20/2010, 19:25:32",HR,114.1,true - 1195293,"06/20/2010, 19:25:32",TEMP,100.0,true - 1195293,"06/20/2010, 19:45:19","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 19:45:19",AGE,32.00293705539522,true - 1195293,"06/20/2010, 19:45:19",HR,,false - 1195293,"06/20/2010, 19:45:19",TEMP,99.9,true - 1195293,"06/20/2010, 20:12:31","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 20:12:31",AGE,32.002988771458945,true - 1195293,"06/20/2010, 20:12:31",HR,112.5,true - 1195293,"06/20/2010, 20:12:31",TEMP,99.8,true - 1195293,"06/20/2010, 20:24:44","TIME_OF_DAY//[18,24)", - 1195293,"06/20/2010, 
20:24:44",AGE,32.00301199932335,true - 1195293,"06/20/2010, 20:24:44",HR,107.7,true - 1195293,"06/20/2010, 20:24:44",TEMP,100.0,true - 1195293,"06/20/2010, 20:41:33","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 20:41:33",AGE,32.003043973286765,true - 1195293,"06/20/2010, 20:41:33",HR,107.5,true - 1195293,"06/20/2010, 20:41:33",TEMP,100.4,true - 1195293,"06/20/2010, 20:50:04","TIME_OF_DAY//[18,24)",, - 1195293,"06/20/2010, 20:50:04",AGE,32.00306016624544,true - 1195293,"06/20/2010, 20:50:04",DISCHARGE,, +"occlude_outliers/train/0": |-2 + {subject_id_field},time,code,numeric_value,numeric_value/is_inlier + 239684,,EYE_COLOR//BROWN,, + 239684,,HEIGHT,,false + 239684,"12/28/1980, 00:00:00","TIME_OF_DAY//[00,06)",, + 239684,"12/28/1980, 00:00:00",DOB,, + 239684,"05/11/2010, 17:41:51","TIME_OF_DAY//[12,18)",, + 239684,"05/11/2010, 17:41:51",AGE,,false + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC,, + 239684,"05/11/2010, 17:41:51",HR,,false + 239684,"05/11/2010, 17:41:51",TEMP,,false + 239684,"05/11/2010, 17:48:48","TIME_OF_DAY//[12,18)",, + 239684,"05/11/2010, 17:48:48",AGE,,false + 239684,"05/11/2010, 17:48:48",HR,,false + 239684,"05/11/2010, 17:48:48",TEMP,,false + 239684,"05/11/2010, 18:25:35","TIME_OF_DAY//[18,24)",, + 239684,"05/11/2010, 18:25:35",AGE,,false + 239684,"05/11/2010, 18:25:35",HR,113.4,true + 239684,"05/11/2010, 18:25:35",TEMP,,false + 239684,"05/11/2010, 18:57:18","TIME_OF_DAY//[18,24)",, + 239684,"05/11/2010, 18:57:18",AGE,,false + 239684,"05/11/2010, 18:57:18",HR,112.6,true + 239684,"05/11/2010, 18:57:18",TEMP,,false + 239684,"05/11/2010, 19:27:19","TIME_OF_DAY//[18,24)",, + 239684,"05/11/2010, 19:27:19",AGE,,false + 239684,"05/11/2010, 19:27:19",DISCHARGE,, + 1195293,,EYE_COLOR//BLUE,, + 1195293,,HEIGHT,,false + 1195293,"06/20/1978, 00:00:00","TIME_OF_DAY//[00,06)",, + 1195293,"06/20/1978, 00:00:00",DOB,, + 1195293,"06/20/2010, 19:23:52","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 19:23:52",AGE,32.002896271955265,true + 
1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC,, + 1195293,"06/20/2010, 19:23:52",HR,109.0,true + 1195293,"06/20/2010, 19:23:52",TEMP,100.0,true + 1195293,"06/20/2010, 19:25:32","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 19:25:32",AGE,32.00289944083172,true + 1195293,"06/20/2010, 19:25:32",HR,114.1,true + 1195293,"06/20/2010, 19:25:32",TEMP,100.0,true + 1195293,"06/20/2010, 19:45:19","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 19:45:19",AGE,32.00293705539522,true + 1195293,"06/20/2010, 19:45:19",HR,,false + 1195293,"06/20/2010, 19:45:19",TEMP,99.9,true + 1195293,"06/20/2010, 20:12:31","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 20:12:31",AGE,32.002988771458945,true + 1195293,"06/20/2010, 20:12:31",HR,112.5,true + 1195293,"06/20/2010, 20:12:31",TEMP,99.8,true + 1195293,"06/20/2010, 20:24:44","TIME_OF_DAY//[18,24)", + 1195293,"06/20/2010, 20:24:44",AGE,32.00301199932335,true + 1195293,"06/20/2010, 20:24:44",HR,107.7,true + 1195293,"06/20/2010, 20:24:44",TEMP,100.0,true + 1195293,"06/20/2010, 20:41:33","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 20:41:33",AGE,32.003043973286765,true + 1195293,"06/20/2010, 20:41:33",HR,107.5,true + 1195293,"06/20/2010, 20:41:33",TEMP,100.4,true + 1195293,"06/20/2010, 20:50:04","TIME_OF_DAY//[18,24)",, + 1195293,"06/20/2010, 20:50:04",AGE,32.00306016624544,true + 1195293,"06/20/2010, 20:50:04",DISCHARGE,, - "occlude_outliers/train/1": |-2 - {subject_id_field},time,code,numeric_value,numeric_value/is_inlier +"occlude_outliers/train/1": |-2 + {subject_id_field},time,code,numeric_value,numeric_value/is_inlier - "occlude_outliers/tuning/0": |-2 - {subject_id_field},time,code,numeric_value,numeric_value/is_inlier +"occlude_outliers/tuning/0": |-2 + {subject_id_field},time,code,numeric_value,numeric_value/is_inlier - "occlude_outliers/held_out/0": |-2 - {subject_id_field},time,code,numeric_value,numeric_value/is_inlier - 1500733,,EYE_COLOR//BROWN,, - 1500733,,HEIGHT,,false - 1500733,"07/20/1986, 
00:00:00","TIME_OF_DAY//[00,06)",, - 1500733,"07/20/1986, 00:00:00",DOB,, - 1500733,"06/03/2010, 14:54:38","TIME_OF_DAY//[12,18)",, - 1500733,"06/03/2010, 14:54:38",AGE,,false - 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, - 1500733,"06/03/2010, 14:54:38",HR,,false - 1500733,"06/03/2010, 14:54:38",TEMP,100.0,true - 1500733,"06/03/2010, 15:39:49","TIME_OF_DAY//[12,18)",, - 1500733,"06/03/2010, 15:39:49",AGE,,false - 1500733,"06/03/2010, 15:39:49",HR,,false - 1500733,"06/03/2010, 15:39:49",TEMP,100.3,true - 1500733,"06/03/2010, 16:20:49","TIME_OF_DAY//[12,18)",, - 1500733,"06/03/2010, 16:20:49",AGE,,false - 1500733,"06/03/2010, 16:20:49",HR,,false - 1500733,"06/03/2010, 16:20:49",TEMP,100.1,true - 1500733,"06/03/2010, 16:44:26","TIME_OF_DAY//[12,18)",, - 1500733,"06/03/2010, 16:44:26",AGE,,false - 1500733,"06/03/2010, 16:44:26",DISCHARGE,, +"occlude_outliers/held_out/0": |-2 + {subject_id_field},time,code,numeric_value,numeric_value/is_inlier + 1500733,,EYE_COLOR//BROWN,, + 1500733,,HEIGHT,,false + 1500733,"07/20/1986, 00:00:00","TIME_OF_DAY//[00,06)",, + 1500733,"07/20/1986, 00:00:00",DOB,, + 1500733,"06/03/2010, 14:54:38","TIME_OF_DAY//[12,18)",, + 1500733,"06/03/2010, 14:54:38",AGE,,false + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC,, + 1500733,"06/03/2010, 14:54:38",HR,,false + 1500733,"06/03/2010, 14:54:38",TEMP,100.0,true + 1500733,"06/03/2010, 15:39:49","TIME_OF_DAY//[12,18)",, + 1500733,"06/03/2010, 15:39:49",AGE,,false + 1500733,"06/03/2010, 15:39:49",HR,,false + 1500733,"06/03/2010, 15:39:49",TEMP,100.3,true + 1500733,"06/03/2010, 16:20:49","TIME_OF_DAY//[12,18)",, + 1500733,"06/03/2010, 16:20:49",AGE,,false + 1500733,"06/03/2010, 16:20:49",HR,,false + 1500733,"06/03/2010, 16:20:49",TEMP,100.1,true + 1500733,"06/03/2010, 16:44:26","TIME_OF_DAY//[12,18)",, + 1500733,"06/03/2010, 16:44:26",AGE,,false + 1500733,"06/03/2010, 16:44:26",DISCHARGE,, """ ) @@ -774,93 +774,93 @@ # Note we have dropped the row in the held out shard that doesn't 
have a code in the vocabulary! WANT_NORMALIZATION = parse_shards_yaml( f""" - "normalization/train/0": |-2 - {subject_id_field},time,code,numeric_value - 239684,,6, - 239684,,7, - 239684,"12/28/1980, 00:00:00",10, - 239684,"12/28/1980, 00:00:00",4, - 239684,"05/11/2010, 17:41:51",11, - 239684,"05/11/2010, 17:41:51",2, - 239684,"05/11/2010, 17:41:51",1, - 239684,"05/11/2010, 17:41:51",8, - 239684,"05/11/2010, 17:41:51",9, - 239684,"05/11/2010, 17:48:48",11, - 239684,"05/11/2010, 17:48:48",2, - 239684,"05/11/2010, 17:48:48",8, - 239684,"05/11/2010, 17:48:48",9, - 239684,"05/11/2010, 18:25:35",12, - 239684,"05/11/2010, 18:25:35",2, - 239684,"05/11/2010, 18:25:35",8,0.9341503977775574 - 239684,"05/11/2010, 18:25:35",9, - 239684,"05/11/2010, 18:57:18",12, - 239684,"05/11/2010, 18:57:18",2, - 239684,"05/11/2010, 18:57:18",8,0.6264293789863586 - 239684,"05/11/2010, 18:57:18",9, - 239684,"05/11/2010, 19:27:19",12, - 239684,"05/11/2010, 19:27:19",2, - 239684,"05/11/2010, 19:27:19",3, - 1195293,,5, - 1195293,,7, - 1195293,"06/20/1978, 00:00:00",10, - 1195293,"06/20/1978, 00:00:00",4, - 1195293,"06/20/2010, 19:23:52",12, - 1195293,"06/20/2010, 19:23:52",2,nan - 1195293,"06/20/2010, 19:23:52",1, - 1195293,"06/20/2010, 19:23:52",8,-0.7583094239234924 - 1195293,"06/20/2010, 19:23:52",9,-0.0889078751206398 - 1195293,"06/20/2010, 19:25:32",12, - 1195293,"06/20/2010, 19:25:32",2,nan - 1195293,"06/20/2010, 19:25:32",8,1.2034040689468384 - 1195293,"06/20/2010, 19:25:32",9,-0.0889078751206398 - 1195293,"06/20/2010, 19:45:19",12, - 1195293,"06/20/2010, 19:45:19",2,nan - 1195293,"06/20/2010, 19:45:19",8, - 1195293,"06/20/2010, 19:45:19",9,-0.6222330927848816 - 1195293,"06/20/2010, 20:12:31",12, - 1195293,"06/20/2010, 20:12:31",2,nan - 1195293,"06/20/2010, 20:12:31",8,0.5879650115966797 - 1195293,"06/20/2010, 20:12:31",9,-1.1555582284927368 - 1195293,"06/20/2010, 20:24:44",12 - 1195293,"06/20/2010, 20:24:44",2,nan - 1195293,"06/20/2010, 20:24:44",8,-1.2583553791046143 - 
1195293,"06/20/2010, 20:24:44",9,-0.0889078751206398 - 1195293,"06/20/2010, 20:41:33",12, - 1195293,"06/20/2010, 20:41:33",2,nan - 1195293,"06/20/2010, 20:41:33",8,-1.3352841138839722 - 1195293,"06/20/2010, 20:41:33",9,2.04443359375 - 1195293,"06/20/2010, 20:50:04",12, - 1195293,"06/20/2010, 20:50:04",2,nan - 1195293,"06/20/2010, 20:50:04",3, +"normalization/train/0": |-2 + {subject_id_field},time,code,numeric_value + 239684,,6, + 239684,,7, + 239684,"12/28/1980, 00:00:00",10, + 239684,"12/28/1980, 00:00:00",4, + 239684,"05/11/2010, 17:41:51",11, + 239684,"05/11/2010, 17:41:51",2, + 239684,"05/11/2010, 17:41:51",1, + 239684,"05/11/2010, 17:41:51",8, + 239684,"05/11/2010, 17:41:51",9, + 239684,"05/11/2010, 17:48:48",11, + 239684,"05/11/2010, 17:48:48",2, + 239684,"05/11/2010, 17:48:48",8, + 239684,"05/11/2010, 17:48:48",9, + 239684,"05/11/2010, 18:25:35",12, + 239684,"05/11/2010, 18:25:35",2, + 239684,"05/11/2010, 18:25:35",8,0.9341503977775574 + 239684,"05/11/2010, 18:25:35",9, + 239684,"05/11/2010, 18:57:18",12, + 239684,"05/11/2010, 18:57:18",2, + 239684,"05/11/2010, 18:57:18",8,0.6264293789863586 + 239684,"05/11/2010, 18:57:18",9, + 239684,"05/11/2010, 19:27:19",12, + 239684,"05/11/2010, 19:27:19",2, + 239684,"05/11/2010, 19:27:19",3, + 1195293,,5, + 1195293,,7, + 1195293,"06/20/1978, 00:00:00",10, + 1195293,"06/20/1978, 00:00:00",4, + 1195293,"06/20/2010, 19:23:52",12, + 1195293,"06/20/2010, 19:23:52",2,nan + 1195293,"06/20/2010, 19:23:52",1, + 1195293,"06/20/2010, 19:23:52",8,-0.7583094239234924 + 1195293,"06/20/2010, 19:23:52",9,-0.0889078751206398 + 1195293,"06/20/2010, 19:25:32",12, + 1195293,"06/20/2010, 19:25:32",2,nan + 1195293,"06/20/2010, 19:25:32",8,1.2034040689468384 + 1195293,"06/20/2010, 19:25:32",9,-0.0889078751206398 + 1195293,"06/20/2010, 19:45:19",12, + 1195293,"06/20/2010, 19:45:19",2,nan + 1195293,"06/20/2010, 19:45:19",8, + 1195293,"06/20/2010, 19:45:19",9,-0.6222330927848816 + 1195293,"06/20/2010, 20:12:31",12, + 1195293,"06/20/2010, 
20:12:31",2,nan + 1195293,"06/20/2010, 20:12:31",8,0.5879650115966797 + 1195293,"06/20/2010, 20:12:31",9,-1.1555582284927368 + 1195293,"06/20/2010, 20:24:44",12 + 1195293,"06/20/2010, 20:24:44",2,nan + 1195293,"06/20/2010, 20:24:44",8,-1.2583553791046143 + 1195293,"06/20/2010, 20:24:44",9,-0.0889078751206398 + 1195293,"06/20/2010, 20:41:33",12, + 1195293,"06/20/2010, 20:41:33",2,nan + 1195293,"06/20/2010, 20:41:33",8,-1.3352841138839722 + 1195293,"06/20/2010, 20:41:33",9,2.04443359375 + 1195293,"06/20/2010, 20:50:04",12, + 1195293,"06/20/2010, 20:50:04",2,nan + 1195293,"06/20/2010, 20:50:04",3, - "normalization/train/1": |-2 - {subject_id_field},time,code,numeric_value +"normalization/train/1": |-2 + {subject_id_field},time,code,numeric_value - "normalization/tuning/0": |-2 - {subject_id_field},time,code,numeric_value +"normalization/tuning/0": |-2 + {subject_id_field},time,code,numeric_value - "normalization/held_out/0": |-2 - {subject_id_field},time,code,numeric_value - 1500733,,6, - 1500733,,7, - 1500733,"07/20/1986, 00:00:00",10, - 1500733,"07/20/1986, 00:00:00",4, - 1500733,"06/03/2010, 14:54:38",11, - 1500733,"06/03/2010, 14:54:38",2, - 1500733,"06/03/2010, 14:54:38",8, - 1500733,"06/03/2010, 14:54:38",9,-0.0889078751206398 - 1500733,"06/03/2010, 15:39:49",11, - 1500733,"06/03/2010, 15:39:49",2, - 1500733,"06/03/2010, 15:39:49",8, - 1500733,"06/03/2010, 15:39:49",9,1.5111083984375 - 1500733,"06/03/2010, 16:20:49",11, - 1500733,"06/03/2010, 16:20:49",2, - 1500733,"06/03/2010, 16:20:49",8, - 1500733,"06/03/2010, 16:20:49",9,0.4444173276424408 - 1500733,"06/03/2010, 16:44:26",11, - 1500733,"06/03/2010, 16:44:26",2, - 1500733,"06/03/2010, 16:44:26",3, - """, +"normalization/held_out/0": |-2 + {subject_id_field},time,code,numeric_value + 1500733,,6, + 1500733,,7, + 1500733,"07/20/1986, 00:00:00",10, + 1500733,"07/20/1986, 00:00:00",4, + 1500733,"06/03/2010, 14:54:38",11, + 1500733,"06/03/2010, 14:54:38",2, + 1500733,"06/03/2010, 14:54:38",8, + 1500733,"06/03/2010, 
14:54:38",9,-0.0889078751206398 + 1500733,"06/03/2010, 15:39:49",11, + 1500733,"06/03/2010, 15:39:49",2, + 1500733,"06/03/2010, 15:39:49",8, + 1500733,"06/03/2010, 15:39:49",9,1.5111083984375 + 1500733,"06/03/2010, 16:20:49",11, + 1500733,"06/03/2010, 16:20:49",2, + 1500733,"06/03/2010, 16:20:49",8, + 1500733,"06/03/2010, 16:20:49",9,0.4444173276424408 + 1500733,"06/03/2010, 16:44:26",11, + 1500733,"06/03/2010, 16:44:26",2, + 1500733,"06/03/2010, 16:44:26",3, + """, code=pl.UInt8, ) diff --git a/tests/MEDS_Transforms/transform_tester_base.py b/tests/MEDS_Transforms/transform_tester_base.py index 6a1d4ab8..7a26c855 100644 --- a/tests/MEDS_Transforms/transform_tester_base.py +++ b/tests/MEDS_Transforms/transform_tester_base.py @@ -5,22 +5,14 @@ """ -try: - pass -except ImportError: - pass - from collections import defaultdict from io import StringIO from pathlib import Path import polars as pl -from meds import subject_id_field - -from tests.utils import FILE_T, multi_stage_tester, parse_meds_csvs, parse_shards_yaml, single_stage_tester +from meds import subject_id_field, subject_splits_filepath -# So it can be imported from here -parse_shards_yaml = parse_shards_yaml +from tests.utils import FILE_T, multi_stage_tester, parse_shards_yaml, single_stage_tester # Test MEDS data (inputs) @@ -31,99 +23,90 @@ "held_out/0": [1500733], } -SPLITS = { - "train": [239684, 1195293, 68729, 814703], - "tuning": [754281], - "held_out": [1500733], -} - -MEDS_TRAIN_0 = """ -subject_id,time,code,numeric_value -239684,,EYE_COLOR//BROWN, -239684,,HEIGHT,175.271115221764 -239684,"12/28/1980, 00:00:00",DOB, -239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, -239684,"05/11/2010, 17:41:51",HR,102.6 -239684,"05/11/2010, 17:41:51",TEMP,96.0 -239684,"05/11/2010, 17:48:48",HR,105.1 -239684,"05/11/2010, 17:48:48",TEMP,96.2 -239684,"05/11/2010, 18:25:35",HR,113.4 -239684,"05/11/2010, 18:25:35",TEMP,95.8 -239684,"05/11/2010, 18:57:18",HR,112.6 -239684,"05/11/2010, 18:57:18",TEMP,95.5 
-239684,"05/11/2010, 19:27:19",DISCHARGE, -1195293,,EYE_COLOR//BLUE, -1195293,,HEIGHT,164.6868838269085 -1195293,"06/20/1978, 00:00:00",DOB, -1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, -1195293,"06/20/2010, 19:23:52",HR,109.0 -1195293,"06/20/2010, 19:23:52",TEMP,100.0 -1195293,"06/20/2010, 19:25:32",HR,114.1 -1195293,"06/20/2010, 19:25:32",TEMP,100.0 -1195293,"06/20/2010, 19:45:19",HR,119.8 -1195293,"06/20/2010, 19:45:19",TEMP,99.9 -1195293,"06/20/2010, 20:12:31",HR,112.5 -1195293,"06/20/2010, 20:12:31",TEMP,99.8 -1195293,"06/20/2010, 20:24:44",HR,107.7 -1195293,"06/20/2010, 20:24:44",TEMP,100.0 -1195293,"06/20/2010, 20:41:33",HR,107.5 -1195293,"06/20/2010, 20:41:33",TEMP,100.4 -1195293,"06/20/2010, 20:50:04",DISCHARGE, -""" - -MEDS_TRAIN_1 = """ -subject_id,time,code,numeric_value -68729,,EYE_COLOR//HAZEL, -68729,,HEIGHT,160.3953106166676 -68729,"03/09/1978, 00:00:00",DOB, -68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, -68729,"05/26/2010, 02:30:56",HR,86.0 -68729,"05/26/2010, 02:30:56",TEMP,97.8 -68729,"05/26/2010, 04:51:52",DISCHARGE, -814703,,EYE_COLOR//HAZEL, -814703,,HEIGHT,156.48559093209357 -814703,"03/28/1976, 00:00:00",DOB, -814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, -814703,"02/05/2010, 05:55:39",HR,170.2 -814703,"02/05/2010, 05:55:39",TEMP,100.1 -814703,"02/05/2010, 07:02:30",DISCHARGE, -""" - -MEDS_TUNING_0 = """ -subject_id,time,code,numeric_value -754281,,EYE_COLOR//BROWN, -754281,,HEIGHT,166.22261567137025 -754281,"12/19/1988, 00:00:00",DOB, -754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, -754281,"01/03/2010, 06:27:59",HR,142.0 -754281,"01/03/2010, 06:27:59",TEMP,99.8 -754281,"01/03/2010, 08:22:13",DISCHARGE, -""" - -MEDS_HELD_OUT_0 = """ -subject_id,time,code,numeric_value -1500733,,EYE_COLOR//BROWN, -1500733,,HEIGHT,158.60131573580904 -1500733,"07/20/1986, 00:00:00",DOB, -1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, -1500733,"06/03/2010, 14:54:38",HR,91.4 -1500733,"06/03/2010, 14:54:38",TEMP,100.0 
-1500733,"06/03/2010, 15:39:49",HR,84.4 -1500733,"06/03/2010, 15:39:49",TEMP,100.3 -1500733,"06/03/2010, 16:20:49",HR,90.1 -1500733,"06/03/2010, 16:20:49",TEMP,100.1 -1500733,"06/03/2010, 16:44:26",DISCHARGE, -""" - -MEDS_SHARDS = parse_meds_csvs( +SPLITS_DF = pl.DataFrame( { - "train/0": MEDS_TRAIN_0, - "train/1": MEDS_TRAIN_1, - "tuning/0": MEDS_TUNING_0, - "held_out/0": MEDS_HELD_OUT_0, + subject_id_field: [239684, 1195293, 68729, 814703, 754281, 1500733], + "split": ["train", "train", "train", "train", "tuning", "held_out"], } ) +MEDS_SHARDS = parse_shards_yaml( + """ +train/0: |-2 + subject_id,time,code,numeric_value + 239684,,EYE_COLOR//BROWN, + 239684,,HEIGHT,175.271115221764 + 239684,"12/28/1980, 00:00:00",DOB, + 239684,"05/11/2010, 17:41:51",ADMISSION//CARDIAC, + 239684,"05/11/2010, 17:41:51",HR,102.6 + 239684,"05/11/2010, 17:41:51",TEMP,96.0 + 239684,"05/11/2010, 17:48:48",HR,105.1 + 239684,"05/11/2010, 17:48:48",TEMP,96.2 + 239684,"05/11/2010, 18:25:35",HR,113.4 + 239684,"05/11/2010, 18:25:35",TEMP,95.8 + 239684,"05/11/2010, 18:57:18",HR,112.6 + 239684,"05/11/2010, 18:57:18",TEMP,95.5 + 239684,"05/11/2010, 19:27:19",DISCHARGE, + 1195293,,EYE_COLOR//BLUE, + 1195293,,HEIGHT,164.6868838269085 + 1195293,"06/20/1978, 00:00:00",DOB, + 1195293,"06/20/2010, 19:23:52",ADMISSION//CARDIAC, + 1195293,"06/20/2010, 19:23:52",HR,109.0 + 1195293,"06/20/2010, 19:23:52",TEMP,100.0 + 1195293,"06/20/2010, 19:25:32",HR,114.1 + 1195293,"06/20/2010, 19:25:32",TEMP,100.0 + 1195293,"06/20/2010, 19:45:19",HR,119.8 + 1195293,"06/20/2010, 19:45:19",TEMP,99.9 + 1195293,"06/20/2010, 20:12:31",HR,112.5 + 1195293,"06/20/2010, 20:12:31",TEMP,99.8 + 1195293,"06/20/2010, 20:24:44",HR,107.7 + 1195293,"06/20/2010, 20:24:44",TEMP,100.0 + 1195293,"06/20/2010, 20:41:33",HR,107.5 + 1195293,"06/20/2010, 20:41:33",TEMP,100.4 + 1195293,"06/20/2010, 20:50:04",DISCHARGE, + +train/1: |-2 + subject_id,time,code,numeric_value + 68729,,EYE_COLOR//HAZEL, + 68729,,HEIGHT,160.3953106166676 + 
68729,"03/09/1978, 00:00:00",DOB, + 68729,"05/26/2010, 02:30:56",ADMISSION//PULMONARY, + 68729,"05/26/2010, 02:30:56",HR,86.0 + 68729,"05/26/2010, 02:30:56",TEMP,97.8 + 68729,"05/26/2010, 04:51:52",DISCHARGE, + 814703,,EYE_COLOR//HAZEL, + 814703,,HEIGHT,156.48559093209357 + 814703,"03/28/1976, 00:00:00",DOB, + 814703,"02/05/2010, 05:55:39",ADMISSION//ORTHOPEDIC, + 814703,"02/05/2010, 05:55:39",HR,170.2 + 814703,"02/05/2010, 05:55:39",TEMP,100.1 + 814703,"02/05/2010, 07:02:30",DISCHARGE, + +tuning/0: |-2 + subject_id,time,code,numeric_value + 754281,,EYE_COLOR//BROWN, + 754281,,HEIGHT,166.22261567137025 + 754281,"12/19/1988, 00:00:00",DOB, + 754281,"01/03/2010, 06:27:59",ADMISSION//PULMONARY, + 754281,"01/03/2010, 06:27:59",HR,142.0 + 754281,"01/03/2010, 06:27:59",TEMP,99.8 + 754281,"01/03/2010, 08:22:13",DISCHARGE, + +held_out/0: |-2 + subject_id,time,code,numeric_value + 1500733,,EYE_COLOR//BROWN, + 1500733,,HEIGHT,158.60131573580904 + 1500733,"07/20/1986, 00:00:00",DOB, + 1500733,"06/03/2010, 14:54:38",ADMISSION//ORTHOPEDIC, + 1500733,"06/03/2010, 14:54:38",HR,91.4 + 1500733,"06/03/2010, 14:54:38",TEMP,100.0 + 1500733,"06/03/2010, 15:39:49",HR,84.4 + 1500733,"06/03/2010, 15:39:49",TEMP,100.3 + 1500733,"06/03/2010, 16:20:49",HR,90.1 + 1500733,"06/03/2010, 16:20:49",TEMP,100.1 + 1500733,"06/03/2010, 16:44:26",DISCHARGE, + """ +) MEDS_CODE_METADATA_CSV = """ code,code/n_occurrences,code/n_subjects,values/n_occurrences,values/sum,values/sum_sqd,description,parent_codes @@ -197,16 +180,19 @@ def remap_inputs_for_transform( unified_inputs["metadata/.shards.json"] = input_shards_map if input_splits_map is None: - input_splits_map = SPLITS + input_splits_map = SPLITS_DF - input_splits_as_df = defaultdict(list) - for split_name, subject_ids in input_splits_map.items(): - input_splits_as_df[subject_id_field].extend(subject_ids) - input_splits_as_df["split"].extend([split_name] * len(subject_ids)) + if isinstance(input_splits_map, pl.DataFrame): + input_splits_df = 
input_splits_map + else: + input_splits_as_df = defaultdict(list) + for split_name, subject_ids in input_splits_map.items(): + input_splits_as_df[subject_id_field].extend(subject_ids) + input_splits_as_df["split"].extend([split_name] * len(subject_ids)) - input_splits_df = pl.DataFrame(input_splits_as_df) + input_splits_df = pl.DataFrame(input_splits_as_df) - unified_inputs["metadata/subject_splits.parquet"] = input_splits_df + unified_inputs[subject_splits_filepath] = input_splits_df return unified_inputs diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..89520379 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,14 @@ +import os + +import rootutils + +root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=True) + +code_root = root / "src" / "MEDS_transforms" + +USE_LOCAL_SCRIPTS = os.environ.get("DO_USE_LOCAL_SCRIPTS", "0") == "1" + +if USE_LOCAL_SCRIPTS: + RUNNER_SCRIPT = code_root / "runner.py" +else: + RUNNER_SCRIPT = "MEDS_transform-runner" diff --git a/tests/test_with_runner.py b/tests/test_with_runner.py new file mode 100644 index 00000000..7913cb13 --- /dev/null +++ b/tests/test_with_runner.py @@ -0,0 +1,275 @@ +"""Tests a multi-stage pre-processing pipeline via the Runner utility. Only checks final outputs. + +Set the bash env variable `DO_USE_LOCAL_SCRIPTS=1` to use the local py files, rather than the installed +scripts. 
+ +In this test, the following stages are run: + - filter_subjects + - add_time_derived_measurements + - fit_outlier_detection + - occlude_outliers + - fit_normalization + - fit_vocabulary_indices + - normalization + - tokenization + - tensorization + +The stage configuration arguments will be as given in the yaml block below: +""" + + +from functools import partial + +from meds import code_metadata_filepath, subject_splits_filepath + +from tests import RUNNER_SCRIPT, USE_LOCAL_SCRIPTS +from tests.MEDS_Transforms import ( + ADD_TIME_DERIVED_MEASUREMENTS_SCRIPT, + AGGREGATE_CODE_METADATA_SCRIPT, + FILTER_SUBJECTS_SCRIPT, + FIT_VOCABULARY_INDICES_SCRIPT, + NORMALIZATION_SCRIPT, + OCCLUDE_OUTLIERS_SCRIPT, + TOKENIZATION_SCRIPT, +) +from tests.MEDS_Transforms.test_multi_stage_preprocess_pipeline import ( + MEDS_CODE_METADATA, + WANT_FILTER, + WANT_FIT_NORMALIZATION, + WANT_FIT_OUTLIERS, + WANT_FIT_VOCABULARY_INDICES, + WANT_NORMALIZATION, + WANT_OCCLUDE_OUTLIERS, + WANT_TIME_DERIVED, + WANT_TOKENIZATION_EVENT_SEQS, + WANT_TOKENIZATION_SCHEMAS, + WANT_NRTs, +) +from tests.MEDS_Transforms.transform_tester_base import MEDS_SHARDS, SPLITS_DF +from tests.utils import add_params, exact_str_regex, single_stage_tester + +# Normally, you wouldn't need to specify all of these scripts, but in testing with local scripts we need to +# specify them all as they need to point to their python paths. 
+if USE_LOCAL_SCRIPTS: + STAGE_RUNNER_YAML = f""" +filter_subjects: + script: "python {FILTER_SUBJECTS_SCRIPT}" + +add_time_derived_measurements: + script: "python {ADD_TIME_DERIVED_MEASUREMENTS_SCRIPT}" + +occlude_outliers: + script: "python {OCCLUDE_OUTLIERS_SCRIPT}" + +fit_normalization: + script: "python {AGGREGATE_CODE_METADATA_SCRIPT}" + +fit_vocabulary_indices: + script: "python {FIT_VOCABULARY_INDICES_SCRIPT}" + +normalization: + script: "python {NORMALIZATION_SCRIPT}" + +tokenization: + script: "python {TOKENIZATION_SCRIPT}" + """ +else: + STAGE_RUNNER_YAML = f""" +fit_normalization: + script: {AGGREGATE_CODE_METADATA_SCRIPT} + """ + +PARALLEL_STAGE_RUNNER_YAML = f""" +parallelize: + n_workers: 2 + launcher: "joblib" + +{STAGE_RUNNER_YAML} +""" + + +PIPELINE_YAML = f""" +defaults: + - _preprocess + - _self_ + +input_dir: {{input_dir}} +cohort_dir: {{cohort_dir}} + +description: "A test pipeline for the MEDS-transforms pipeline runner." + +stages: + - filter_subjects + - add_time_derived_measurements + - fit_outlier_detection + - occlude_outliers + - fit_normalization + - fit_vocabulary_indices + - normalization + - tokenization + - tensorization + +stage_configs: + filter_subjects: + min_events_per_subject: 5 + add_time_derived_measurements: + age: + DOB_code: "DOB" # This is the MEDS official code for BIRTH + age_code: "AGE" + age_unit: "years" + time_of_day: + time_of_day_code: "TIME_OF_DAY" + endpoints: [6, 12, 18, 24] + fit_outlier_detection: + _script: {("python " if USE_LOCAL_SCRIPTS else "") + str(AGGREGATE_CODE_METADATA_SCRIPT)} + aggregations: + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" + occlude_outliers: + stddev_cutoff: 1 + fit_normalization: + aggregations: + - "code/n_occurrences" + - "code/n_subjects" + - "values/n_occurrences" + - "values/sum" + - "values/sum_sqd" +""" + +NO_ARGS_HELP_STR = """ +== MEDS-Transforms Pipeline Runner == +MEDS-Transforms Pipeline Runner is a command line tool for running entire 
MEDS-transform pipelines in a single +command. + +Runs the entire pipeline, end-to-end, based on the configuration provided. + +This script will launch many subsidiary commands via `subprocess`, one for each stage of the specified +pipeline. + +**MEDS-transforms Pipeline description:** + +No description provided. +""" + +WITH_CONFIG_HELP_STR = """ +== MEDS-Transforms Pipeline Runner == +MEDS-Transforms Pipeline Runner is a command line tool for running entire MEDS-transform pipelines in a single +command. + +Runs the entire pipeline, end-to-end, based on the configuration provided. + +This script will launch many subsidiary commands via `subprocess`, one for each stage of the specified +pipeline. + +**MEDS-transforms Pipeline description:** + +A test pipeline for the MEDS-transforms pipeline runner. +""" + + +def test_pipeline(): + single_stage_tester( + script=str(RUNNER_SCRIPT) + " -h", + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={}, + want_outputs={}, + assert_no_other_outputs=True, + should_error=False, + test_name="Runner Help Test", + do_include_dirs=False, + hydra_verbose=False, + stdout_regex=exact_str_regex(NO_ARGS_HELP_STR.strip()), + ) + + single_stage_tester( + script=str(RUNNER_SCRIPT) + " -h", + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={"pipeline.yaml": partial(add_params, PIPELINE_YAML)}, + want_outputs={}, + assert_no_other_outputs=True, + should_error=False, + pipeline_config_fp="{input_dir}/pipeline.yaml", + test_name="Runner Help Test", + do_include_dirs=False, + hydra_verbose=False, + stdout_regex=exact_str_regex(WITH_CONFIG_HELP_STR.strip()), + ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in 
MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + "pipeline.yaml": partial(add_params, PIPELINE_YAML), + "stage_runner.yaml": STAGE_RUNNER_YAML, + }, + want_outputs={ + **WANT_FIT_NORMALIZATION, + **WANT_FIT_OUTLIERS, + **WANT_FIT_VOCABULARY_INDICES, + **WANT_FILTER, + **WANT_TIME_DERIVED, + **WANT_OCCLUDE_OUTLIERS, + **WANT_NORMALIZATION, + **WANT_TOKENIZATION_SCHEMAS, + **WANT_TOKENIZATION_EVENT_SEQS, + **WANT_NRTs, + }, + assert_no_other_outputs=False, + should_error=False, + pipeline_config_fp="{input_dir}/pipeline.yaml", + stage_runner_fp="{input_dir}/stage_runner.yaml", + test_name="Runner Test", + do_include_dirs=False, + df_check_kwargs={"check_column_order": False}, + ) + + single_stage_tester( + script=RUNNER_SCRIPT, + config_name="runner", + stage_name=None, + stage_kwargs=None, + do_pass_stage_name=False, + do_use_config_yaml=False, + input_files={ + **{f"data/{k}": v for k, v in MEDS_SHARDS.items()}, + code_metadata_filepath: MEDS_CODE_METADATA, + subject_splits_filepath: SPLITS_DF, + "pipeline.yaml": partial(add_params, PIPELINE_YAML), + "stage_runner.yaml": PARALLEL_STAGE_RUNNER_YAML, + }, + want_outputs={ + **WANT_FIT_NORMALIZATION, + **WANT_FIT_OUTLIERS, + **WANT_FIT_VOCABULARY_INDICES, + **WANT_FILTER, + **WANT_TIME_DERIVED, + **WANT_OCCLUDE_OUTLIERS, + **WANT_NORMALIZATION, + **WANT_TOKENIZATION_SCHEMAS, + **WANT_TOKENIZATION_EVENT_SEQS, + **WANT_NRTs, + }, + assert_no_other_outputs=False, + should_error=False, + pipeline_config_fp="{input_dir}/pipeline.yaml", + stage_runner_fp="{input_dir}/stage_runner.yaml", + test_name="Runner Test with parallelism", + do_include_dirs=False, + df_check_kwargs={"check_column_order": False}, + ) diff --git a/tests/utils.py b/tests/utils.py index 450a86b4..0e9ae943 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,6 +1,8 @@ import json +import re import subprocess import tempfile +from collections.abc import Callable from contextlib import 
contextmanager from io import StringIO from pathlib import Path @@ -31,6 +33,10 @@ } +def exact_str_regex(s: str) -> str: + return f"^{re.escape(s)}$" + + def parse_meds_csvs( csvs: str | dict[str, str], schema: dict[str, pl.DataType] = MEDS_PL_SCHEMA ) -> pl.DataFrame | dict[str, pl.DataFrame]: @@ -57,7 +63,7 @@ def reader(csv_str: str) -> pl.DataFrame: def parse_shards_yaml(yaml_str: str, **schema_updates) -> pl.DataFrame: schema = {**MEDS_PL_SCHEMA, **schema_updates} - return parse_meds_csvs(load_yaml(yaml_str, Loader=Loader), schema=schema) + return parse_meds_csvs(load_yaml(yaml_str.strip(), Loader=Loader), schema=schema) def dict_to_hydra_kwargs(d: dict[str, str]) -> str: @@ -137,6 +143,9 @@ def run_command( err_cmd_lines = [] + if config_name is not None and not config_name.startswith("_"): + config_name = f"_{config_name}" + if do_use_config_yaml: if config_name is None: raise ValueError("config_name must be provided if do_use_config_yaml is True.") @@ -220,6 +229,19 @@ def assert_df_equal(want: pl.DataFrame, got: pl.DataFrame, msg: str = None, **kw raise AssertionError(f"{msg}:\nWant:\n{want}\nGot:\n{got}\n{e}") from e +def check_json(want: dict | Callable, got: dict, msg: str): + try: + match want: + case dict(): + assert got == want, f"Want:\n{want}\nGot:\n{got}" + case _ if callable(want): + want(got) + case _: + raise ValueError(f"Unknown want type: {type(want)}") + except AssertionError as e: + raise AssertionError(f"{msg}: {e}") from e + + def check_NRT_output( output_fp: Path, want_nrt: JointNestedRaggedTensorDict, @@ -271,6 +293,10 @@ def check_NRT_output( FILE_T = pl.DataFrame | dict[str, Any] | str +def add_params(templ_str: str, **kwargs): + return templ_str.format(**kwargs) + + @contextmanager def input_dataset(input_files: dict[str, FILE_T] | None = None): with tempfile.TemporaryDirectory() as d: @@ -294,6 +320,12 @@ def input_dataset(input_files: dict[str, FILE_T] | None = None): fp.write_text(json.dumps(data)) case str(): 
fp.write_text(data.strip()) + case _ if callable(data): + data_str = data( + input_dir=str(input_dir.resolve()), + cohort_dir=str(cohort_dir.resolve()), + ) + fp.write_text(data_str) case _: raise ValueError(f"Unknown data type {type(data)} for file {fp.relative_to(input_dir)}") @@ -334,13 +366,8 @@ def check_outputs( case ".nrt": check_NRT_output(output_fp, want, msg=msg) case ".json": - with open(output_fp) as f: - got = json.load(f) - assert got == want, ( - f"Expected JSON at {output_fp.relative_to(cohort_dir)} to be equal to the target.\n" - f"Wanted:\n{want}\n" - f"Got:\n{got}" - ) + got = json.loads(output_fp.read_text()) + check_json(want, got, msg=msg) case _: raise ValueError(f"Unknown file suffix: {file_suffix}") @@ -356,7 +383,7 @@ def check_outputs( def single_stage_tester( script: str | Path, - stage_name: str, + stage_name: str | None, stage_kwargs: dict[str, str] | None, do_pass_stage_name: bool = False, do_use_config_yaml: bool = False, @@ -366,8 +393,15 @@ def single_stage_tester( config_name: str = "preprocess", input_files: dict[str, FILE_T] | None = None, df_check_kwargs: dict | None = None, + test_name: str | None = None, + do_include_dirs: bool = True, + hydra_verbose: bool = True, + stdout_regex: str | None = None, **pipeline_kwargs, ): + if test_name is None: + test_name = f"Single stage transform: {stage_name}" + if df_check_kwargs is None: df_check_kwargs = {} @@ -377,20 +411,23 @@ def single_stage_tester( pipeline_kwargs[k] = v.format(input_dir=str(input_dir.resolve())) pipeline_config_kwargs = { - "input_dir": str(input_dir.resolve()), - "cohort_dir": str(cohort_dir.resolve()), - "stages": [stage_name], - "hydra.verbose": True, + "hydra.verbose": hydra_verbose, **pipeline_kwargs, } + if do_include_dirs: + pipeline_config_kwargs["input_dir"] = str(input_dir.resolve()) + pipeline_config_kwargs["cohort_dir"] = str(cohort_dir.resolve()) + + if stage_name is not None: + pipeline_config_kwargs["stages"] = [stage_name] if stage_kwargs: 
pipeline_config_kwargs["stage_configs"] = {stage_name: stage_kwargs} run_command_kwargs = { "script": script, "hydra_kwargs": pipeline_config_kwargs, - "test_name": f"Single stage transform: {stage_name}", + "test_name": test_name, "should_error": should_error, "config_name": config_name, "do_use_config_yaml": do_use_config_yaml, @@ -405,6 +442,12 @@ def single_stage_tester( if should_error: return + if stdout_regex is not None: + regex = re.compile(stdout_regex) + assert regex.search(stdout) is not None, ( + f"Expected stdout to match regex:\n{stdout_regex}\n" f"Got:\n{stdout}" + ) + try: check_outputs( cohort_dir,