From 5952213bc87e2fb272982ad1ebc932d4647c221e Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Thu, 20 Jun 2024 08:27:25 -0700 Subject: [PATCH 01/20] Update schema.py --- src/meds/schema.py | 39 ++++++--------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index edbc9e1..3df6b44 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -21,45 +21,18 @@ birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def patient_schema(per_event_properties_schema=pa.null()): - # Return a patient schema with a particular per event metadata subschema - event = pa.struct( +def events_schema(patient_id_type, custom_per_event_properties=pa.null()): + assert patient_id_type == pa.int64() or patient_id_type == pa.string() + + return pa.schema( [ + ("patient_id", patient_id_type), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), - ("text_value", pa.string()), ("numeric_value", pa.float32()), - ("datetime_value", pa.timestamp("us")), - ("properties", per_event_properties_schema), - ] + ] + custom_per_event_properties ) - patient = pa.schema( - [ - ("patient_id", pa.int64()), - ("events", pa.list_(event)), # Require ordered by time, nulls must be first - ] - ) - - return patient - - -# Python types for the above schema - -Event = TypedDict( - "Event", - { - "time": NotRequired[datetime.datetime], - "code": str, - "text_value": NotRequired[str], - "numeric_value": NotRequired[float], - "datetime_value": NotRequired[datetime.datetime], - "properties": NotRequired[Any], - }, -) - -Patient = TypedDict("Patient", {"patient_id": int, "events": List[Event]}) - ############################################################ # The label schema. 
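As a brief illustration of the flattened layout this patch introduces, the sketch below builds a tiny events table against the new schema. It is illustrative only: the codes and values are invented, and we pass an explicit empty list for the custom properties because the `pa.null()` default cannot be concatenated with a Python list (the next patch changes that default to `[]`).

```python
import datetime

import pyarrow as pa

# Hypothetical usage sketch of the events_schema function defined above.
schema = events_schema(pa.int64(), custom_per_event_properties=[])

events = pa.Table.from_pylist(
    [
        # Static events carry a null time.
        {"patient_id": 123, "time": None, "code": "Gender/F", "numeric_value": None},
        # Dynamic events carry a timestamp and, where applicable, a numeric value.
        {
            "patient_id": 123,
            "time": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "code": "LOINC/8480-6",  # invented example code
            "numeric_value": 120.0,
        },
    ],
    schema=schema,
)
assert events.schema.equals(schema)
```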
From ca6ab08fc24905dee8dc8db329fb1633ad4d43a1 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Thu, 20 Jun 2024 08:29:01 -0700 Subject: [PATCH 02/20] Update schema.py --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 3df6b44..dfa97c7 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -21,7 +21,7 @@ birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def events_schema(patient_id_type, custom_per_event_properties=pa.null()): +def events_schema(patient_id_type, custom_per_event_properties=[]): assert patient_id_type == pa.int64() or patient_id_type == pa.string() return pa.schema( From 4ca8f6a98738a10c969838033c291a9379591a03 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:04:48 -0700 Subject: [PATCH 03/20] Update schema.py --- src/meds/schema.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index dfa97c7..861f3eb 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -22,11 +22,9 @@ death_code = "SNOMED/419620001" def events_schema(patient_id_type, custom_per_event_properties=[]): - assert patient_id_type == pa.int64() or patient_id_type == pa.string() - return pa.schema( [ - ("patient_id", patient_id_type), + ("patient_id", pa.int64()), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), ("numeric_value", pa.float32()), From 52d560f2b03c48493d2a2cddc64b190534f2cdc0 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:19:32 -0700 Subject: [PATCH 04/20] Update schema.py --- src/meds/schema.py | 77 +++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 861f3eb..760beea 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -4,24 +4,42 @@ import pyarrow as pa from typing_extensions import NotRequired, TypedDict -# Medical Event Data Standard consists of three main components: -# 1. A patient data schema +# Medical Event Data Standard consists of four main components: +# 1. A patient event schema # 2. A label schema # 3. A dataset metadata schema. +# 4. A code metadata schema. # -# Patient data and labels are specified using pyarrow. Dataset metadata is specified using JSON. +# Event data, labels, and code metadata is specified using pyarrow. Dataset metadata is specified using JSON. -# We also provide TypedDict Python type signatures for these schemas. +# We also specify a directory structure for how these should be laid out on disk. + +# Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: +# - data/ +# A (possibly) nested folder containing multiple parquet files containing patient event data following the events_schema folder. +# glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. +# - dataset_metadata.json +# Dataset level metadata containing information about the ETL used, data version, etc +# - code_metadata.parquet +# Code level metadata containing information about the code descriptions, standard mappings, etc ############################################################ -# The patient data schema. +# The patient event data schema. +# +# Patient event data also must satisfy two important properties: +# +# 1. Patient event data cannot be split across parquet files. 
If a patient is in a dataset it must be in one and only one parquet file. +# 2. Patient event data must be contiguous within a particular parquet file and sorted by event time. + +# Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), +# which vastly simplifies many data analysis pipelines. # We define some codes for particularly important events birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def events_schema(patient_id_type, custom_per_event_properties=[]): +def patient_events_schema(custom_per_event_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -31,6 +49,9 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): ] + custom_per_event_properties ) +# No python type is provided because Python tools for processing MEDS data will often provide their own types. +# See https://github.com/EthanSteinberg/meds_reader/blob/0.0.6/src/meds_reader/__init__.pyi#L55 for example. + ############################################################ # The label schema. @@ -55,26 +76,14 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "integer_value" : Optional[int], "float_value" : Optional[float], "categorical_value" : Optional[str], -}) +}, total=False) ############################################################ # The dataset metadata schema. # This is a JSON schema. -# This data should be stored in metadata.json within the dataset folder. - -code_metadata_entry = { - "type": "object", - "properties": { - "description": {"type": "string"}, - "parent_codes": {"type": "array", "items": {"type": "string"}}, - }, -} +# This data should be stored in dataset_metadata.json within the dataset folder. -code_metadata = { - "type": "object", - "additionalProperties": code_metadata_entry, -} dataset_metadata = { "type": "object", @@ -83,15 +92,12 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "dataset_version": {"type": "string"}, "etl_name": {"type": "string"}, "etl_version": {"type": "string"}, - "code_metadata": code_metadata, "meds_version": {"type": "string"}, }, } -# Python types for the above schema +# Python type for the above schema -CodeMetadataEntry = TypedDict("CodeMetadataEntry", {"description": str, "parent_codes": List[str]}) -CodeMetadata = Mapping[str, CodeMetadataEntry] DatasetMetadata = TypedDict( "DatasetMetadata", { @@ -99,7 +105,28 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "dataset_version": NotRequired[str], "etl_name": NotRequired[str], "etl_version": NotRequired[str], - "code_metadata": NotRequired[CodeMetadata], "meds_version": NotRequired[str], }, + total=False, ) + +############################################################ + +# The code metadata schema. +# This is a parquet schema. +# This data should be stored in code_metadata.parquet within the dataset folder. 
+ +def code_metadata_schema(custom_per_event_properties=[]): + code_metadata = pa.schema( + [ + ("code", pa.string()), + ("description", pa.string()), + ("parent_codes", pa.list(pa.string()), + ] + custom_per_event_properties + ) + + return code_metadata + +# Python type for the above schema + +CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) From 421df192145a015d4320f03b9e922406c9e7f246 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:22:37 -0700 Subject: [PATCH 05/20] Update schema.py --- src/meds/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 760beea..aea10a3 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -116,13 +116,13 @@ def patient_events_schema(custom_per_event_properties=[]): # This is a parquet schema. # This data should be stored in code_metadata.parquet within the dataset folder. -def code_metadata_schema(custom_per_event_properties=[]): +def code_metadata_schema(custom_per_code_properties=[]): code_metadata = pa.schema( [ ("code", pa.string()), ("description", pa.string()), ("parent_codes", pa.list(pa.string()), - ] + custom_per_event_properties + ] + custom_per_code_properties ) return code_metadata From 28b21b02b98198abe25bac26b5563b7e26622993 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:23:07 -0700 Subject: [PATCH 06/20] Update schema.py --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index aea10a3..49eb418 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -16,7 +16,7 @@ # Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: # - data/ -# A (possibly) nested folder containing multiple parquet files containing patient event data following the events_schema folder. +# A (possibly nested) folder containing multiple parquet files containing patient event data following the events_schema folder. # glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. # - dataset_metadata.json # Dataset level metadata containing information about the ETL used, data version, etc From 6544c3d1dcc7bb545357e644fad2572985f826b8 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Fri, 19 Jul 2024 07:38:35 -0700 Subject: [PATCH 07/20] Update schema.py --- src/meds/schema.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 49eb418..f7a48ca 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -20,8 +20,10 @@ # glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. # - dataset_metadata.json # Dataset level metadata containing information about the ETL used, data version, etc -# - code_metadata.parquet +# - (Optional) code_metadata.parquet # Code level metadata containing information about the code descriptions, standard mappings, etc +# - (Optional) patient_split.csv +# A specification of patient splits that should be used. ############################################################ @@ -78,6 +80,22 @@ def patient_events_schema(custom_per_event_properties=[]): "categorical_value" : Optional[str], }, total=False) + +############################################################ + +# The patient split schema. 
+ +train_split = "train" +tuning_split = "tuning" +test_split = "test" + +patient_split = pa.schema( + [ + ("patient_id", pa.int64()), + ("split", pa.string()), + ] +) + ############################################################ # The dataset metadata schema. From 3f7c441c09d16bc4aafac5238956e1306c7305e0 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Fri, 19 Jul 2024 07:51:47 -0700 Subject: [PATCH 08/20] Update schema.py --- src/meds/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index f7a48ca..68d7fc0 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -38,8 +38,8 @@ # which vastly simplifies many data analysis pipelines. # We define some codes for particularly important events -birth_code = "SNOMED/184099003" -death_code = "SNOMED/419620001" +birth_code = "MEDS_BIRTH" +death_code = "MEDS_DEATH" def patient_events_schema(custom_per_event_properties=[]): return pa.schema( From c922b44ef5cb7515a1f5309a8b20dcfce10e7e9f Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 29 Jul 2024 20:15:30 -0700 Subject: [PATCH 09/20] Update schema.py --- src/meds/schema.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 68d7fc0..5839ce2 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -61,7 +61,14 @@ def patient_events_schema(custom_per_event_properties=[]): label = pa.schema( [ ("patient_id", pa.int64()), - ("prediction_time", pa.timestamp("us")), + # The patient who is being labeled. + + ("prediction_time", pa.timestamp("us")), + # The time the prediction is made. + # Machine learning models are allowed to use features that have timestamps less than or equal + # to this timestamp. + + # Possible values for the label. ("boolean_value", pa.bool_()), ("integer_value", pa.int64()), ("float_value", pa.float64()), From 3ed25abebb126f6d22720ac6b22618f42cbd4497 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 29 Jul 2024 23:55:09 -0400 Subject: [PATCH 10/20] Started updating README --- README.md | 100 ++++++++++++++++++++++++++++++++++++++++++++- src/meds/schema.py | 7 +++- 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b097d77..f33a326 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,104 @@ # Medical Event Data Standard -The Medical Event Data Standard (MEDS) is a draft data schema for storing streams of medical events, often sourced from either Electronic Health Records or claims records. +The Medical Event Data Standard (MEDS) is a data schema for storing streams of medical events, often +sourced from either Electronic Health Records or claims records. Before we define the various schema that make +up MEDS, we will define some key terminology that we use in this standard. + +## Terminology + 1. A _patient_ in a MEDS dataset is the primary entity being described by the sequences of care observations + in the underlying dataset. In most cases, _patients_ will, naturally, be individuals, and the sequences + of care observations will cover all known observations about those individuals in a source health + datasets. 
However, in some cases, data may be organized so that we cannot describe all the data for an
    individual reliably in a dataset, but instead can only describe subsequences of an individual's data,
    such as in datasets that only link an individual's data observations together if they are within the same
    hospital admission, regardless of how many admissions that individual has in the dataset (such as the
    [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _patient_ in the MEDS dataset may refer to
    a hospital admission rather than an individual.
 2. A _measurement_ or _patient measurement_ or _observation_ in a MEDS dataset refers to a single measurable
    quantity observed about the patient during their care. These observations can take on many forms, such as
    observing a diagnostic code being applied to the patient, observing a patient's admission or transfer
    from one unit to another, observing a laboratory test result, but always correspond to a single
    measurable unit about a single patient.
 3. A _code_ is the categorical descriptor of what happened in a patient measurement. In particular, in
    almost all structured, longitudinal datasets, a measurement can be described as consisting of a tuple
    containing a `patient_id` (who this measurement is about); a `timestamp` (when this measurement
    happened); some categorical qualifier describing what was measured, which we will call a `code`; a value
    of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one
    or more additional measurement properties that describe the measurement in a non-standardized manner.
 4. An _event_ or _patient event_ in a MEDS dataset corresponds to all observations about a patient that
    occur at a unique timestamp (within the level of temporal granularity in the MEDS dataset).
 5. A _static_ measurement is one that occurs without a source timestamp being recorded in the raw dataset
    **and** that can be interpreted as being applicable to the patient at any point in time during their
    care. All other measurements observed in the raw dataset will be considered to be _dynamic_ measurements
    that can vary in time in an unknown manner. Note that there is a third class of measurements that may,
    at times, be induced in the dataset, known as _time-derived_ measurements, which correspond to measurements
    that occur in time like _dynamic_ measurements but can be computed deterministically in advance using
    only the timestamp at which a measurement occurs and the patient's static (or, rarely, historical) data,
    such as the patient's age or the season of the year in which a measurement occurs. These are rarely
    recorded in the raw data but may be used during modeling.

## Core MEDS Data Organization

MEDS consists of four main data components/schemas:
 1. A _patient measurement schema_. This schema describes the underlying medical data, organized as sequences
    of patient measurements, in the dataset.
 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient
    at a given timestamp in the patient record.
 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the
    observed measurements in the dataset.
 4. A _dataset metadata schema_. This schema contains metadata about the MEDS dataset itself, such as when it
    was produced, using what version of what code, etc.
 5. A _patient split schema_. This schema contains metadata about how patients in the MEDS dataset are
    assigned to different subpopulations, most commonly used to dictate ML splits.

### Organization on Disk
Given a MEDS dataset stored in the `$MEDS_ROOT` directory, data of the various schemas outlined above can be
found in the following subfolders:
  - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a
    series of possibly nested sharded dataframes, often as `parquet` files. In particular, the file glob
    `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all
    organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by
    timestamp.
  - `$MEDS_ROOT/metadata/codes.csv`: This file contains per-code metadata in the _code metadata schema_
    about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_
    sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but
    these will always appear in subdirectories of `$MEDS_ROOT/metadata/` rather than at the top level, and
    should generally not be used for overall metadata operations. The preferred file format for this dataframe
    is CSV for ease of human inspection and readability.
  - `$MEDS_ROOT/metadata/dataset.json`: This schema contains metadata in the _dataset metadata schema_ about
    the dataset and its production process.
  - `$MEDS_ROOT/metadata/patient_splits.csv`: This schema contains information in the _patient split schema_
    about what splits different patients are in. Unlike the raw data, which should preferably be stored in
    the parquet format for compression and columnar read capabilities, the patient splits are
    preferably stored in a comma separated value (CSV) format for ease of readability and shareability.

Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a
`$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates
different tasks from one another. In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will
retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with
the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file
`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of patients as are
contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir
of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s
in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some
cases, there may be no task labels for a shard of the raw data, if no patient in that shard qualifies for that
task, in which case it may be true that either `$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` is empty or that it
does not exist.

While we give preferred file formats in the list above, the important thing about these data are that they are
stored in the appropriate schemas, not that they use the preferred file formats. Datasets can be stored using
parquet files for splits or CSV files for raw datasets and still be compliant with the MEDS format.

### Schemas

**TODO**: copy here from the schema file and describe.




## Old -- to be deleted.

The core of the standard is that we define a ``patient`` data structure that contains a series of time stamped events, that in turn contain measurements of various sorts.

The Python type signature for the schema is as follows:

```python

Patient = TypedDict('Patient', {
    'patient_id': int,
    'events': List[Event],
})

Event = TypedDict('Event',{
    'time': NotRequired[datetime.datetime], # Static events will have a null timestamp here
    'code': str,
    'text_value': NotRequired[str],
    'numeric_value': NotRequired[float],
    'datetime_value': NotRequired[datetime.datetime],
    'metadata': NotRequired[Mapping[str, Any]],
})
```

We also provide ETLs to convert common data formats to this schema: https://github.com/Medical-Event-Data-Standard/meds_etl

An example patient following this schema

```python

patient_data = {
    "patient_id": 123,
    "events": [
        # Store static events like gender with a null timestamp
        {
            "time": None,
            "code": "Gender/F",
        },

        # It's recommended to record birth using the birth_code
        {
            "time": datetime.datetime(1995, 8, 20),
            "code": meds.birth_code,
        },

        # Arbitrary events with sophisticated data can also be added
        {
            "time": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "code": "some_code",
            "text_value": "Example",
            "numeric_value": 10.0,
            "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "properties": None
        },
    ]
}
```

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 68d7fc0..fff43c7 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -44,7 +44,7 @@ def patient_events_schema(custom_per_event_properties=[]):
 
     return pa.schema(
         [
-            ("patient_id", pa.int64()),
+            ("patient_id", pa.int64()), 
             ("time", pa.timestamp("us")), # Static events will have a null timestamp
             ("code", pa.string()),
             ("numeric_value", pa.float32()),
@@ -96,6 +96,11 @@ def patient_events_schema(custom_per_event_properties=[]):
     ]
 )
 
+PatientSplit = TypedDict("PatientSplit", {
+    "patient_id": int,
+    "split": str,
+}, total=True)
+
 ############################################################
 
 # The dataset metadata schema.

From ed9cb9150b9d2c2c4063e0ff771d625a5785c7cc Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Tue, 30 Jul 2024 09:04:44 -0400
Subject: [PATCH 11/20] Updating to mandatory file formats.

---
 README.md | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index f33a326..3d7b6d7 100644
--- a/README.md
+++ b/README.md
@@ -55,22 +55,19 @@ MEDS consists of four main data components/schemas:
 Given a MEDS dataset stored in the `$MEDS_ROOT` directory, data of the various schemas outlined above can be
 found in the following subfolders:
   - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a
-    series of possibly nested sharded dataframes, often as `parquet` files. In particular, the file glob
+    series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob
     `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all
     organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by
     timestamp.
-  - `$MEDS_ROOT/metadata/codes.csv`: This file contains per-code metadata in the _code metadata schema_
+  - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_
     about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_
     sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but
     these will always appear in subdirectories of `$MEDS_ROOT/metadata/` rather than at the top level, and
-    should generally not be used for overall metadata operations. The preferred file format for this dataframe
-    is CSV for ease of human inspection and readability.
+    should generally not be used for overall metadata operations.
   - `$MEDS_ROOT/metadata/dataset.json`: This schema contains metadata in the _dataset metadata schema_ about
     the dataset and its production process.
-  - `$MEDS_ROOT/metadata/patient_splits.csv`: This schema contains information in the _patient split schema_
-    about what splits different patients are in. Unlike the raw data, which should preferably be stored in
-    the parquet format for compression and columnar read capabilities, the patient splits are
-    preferably stored in a comma separated value (CSV) format for ease of readability and shareability.
+  - `$MEDS_ROOT/metadata/patient_splits.parquet`: This schema contains information in the _patient split
+    schema_ about what splits different patients are in.
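To make this on-disk layout concrete, the sketch below shows how a consumer might open each component. It is illustrative only: the `load_meds_dataset` helper is hypothetical and not part of the standard.

```python
import glob
import json
import os

import pyarrow.parquet as pq


def load_meds_dataset(meds_root):
    # Hypothetical loader following the directory layout described above.
    data_files = sorted(
        glob.glob(os.path.join(meds_root, "data", "**", "*.parquet"), recursive=True)
    )
    code_metadata = pq.read_table(os.path.join(meds_root, "metadata", "codes.parquet"))
    with open(os.path.join(meds_root, "metadata", "dataset.json")) as f:
        dataset_metadata = json.load(f)
    patient_splits = pq.read_table(os.path.join(meds_root, "metadata", "patient_splits.parquet"))
    return data_files, code_metadata, dataset_metadata, patient_splits
```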
Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a `$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates @@ -85,10 +82,6 @@ cases, there may be no task labels for a shard of the raw data, if no patient in task, in which case it may be true that either `$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` is empty or that it does not exist. -While we give preferred file formats in the list above, the important thing about these data are that they are -stored in the appropriate schemas, not that they use the preferred file formats. Datasets can be stored using -parquet files for splits or CSV files for raw datasets and still be compliant with the MEDS format. - ### Schemas **TODO**: copy here from the schema file and describe. From 59856292b93e54f798b8713ebd18ad501feb51d7 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:14:29 -0400 Subject: [PATCH 12/20] Removed controversial or unneeded terms --- README.md | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3d7b6d7..41eca97 100644 --- a/README.md +++ b/README.md @@ -14,34 +14,19 @@ up MEDS, we will define some key terminology that we use in this standard. hospital admission, regardless of how many admissions that individual has in the dataset (such as the [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _patient_ in the MEDS dataset may refer to a hospital admission rather than an individual. - 2. A _measurement_ or _patient measurement_ or _observation_ in a MEDS dataset refers to a single measurable - quantity observed about the patient during their care. These observations can take on many forms, such as - observing a diagnostic code being applied to the patient, observing a patient's admission or transfer - from one unit to another, observing a laboratory test result, but always correspond to a single - measureable unit about a single patient. - 3. A _code_ is the categorical descriptor of what happened in a patient measurement. In particular, in - almost all structured, longitudinal datasets, a measurement can be described as consisting of a tuple - containing a `patient_id` (who this measurement is about); a `timestamp` (when this measurement - happened); some categorical qualifier describing what was measured, which we will call a `code`; a value - of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one - or more additional measurement properties that describe the measurement in a non-standardized manner. - 4. An _event_ or _patient event_ in a MEDS dataset corresponds to all observations about a patient that - occur at a unique timestamp (within the level of temporal granularity in the MEDS dataset). - 5. A _static_ measurement is one that occurs without a source timestamp being recorded in the raw dataset - **and** that can be interpreted as being applicable to the patient at any point in time during their - care. All other measurements observed in the raw dataset will be considered to be _dynamic_ measurements - that can vary in time in an unknown manner. 
Note that there are a third class of measurements that may, - at times, be induced in the dataset known as _time-derived_ measurements which correspond to measurements - that occur in time like _dynamic_ measurements but can be computed deterministically in advance using - only the timestamp at which a measurement occurs and the patient's static (or, rarely, historical) data, - such as the patient's age or the season of the year in which a measurement occurs. These are rarely - recorded in the raw data but may be used during modeling. + 2. A _code_ is the categorical descriptor of what is being observed in any given observation of a patient. + In particular, in almost all structured, longitudinal datasets, a measurement can be described as + consisting of a tuple containing a `patient_id` (who this measurement is about); a `timestamp` (when this + measurement happened); some categorical qualifier describing what was measured, which we will call a + `code`; a value of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; + and possibly one or more additional measurement properties that describe the measurement in a + non-standardized manner. ## Core MEDS Data Organization MEDS consists of four main data components/schemas: - 1. A _patient measurement schema_. This schema describes the underlying medical data, organized as sequences - of patient measurements, in the dataset. + 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of patient + observations, in the dataset. 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient at a given timestamp in the patient record. 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the @@ -54,10 +39,10 @@ MEDS consists of four main data components/schemas: ### Organization on Disk Given a MEDS dataset stored in the `$MEDS_ROOT` directory data of the various schemas outlined above can be found in the following subfolders: - - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a + - `$MEDS_ROOT/data/`: This directory will contain data in the _data schema_, organized as a series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all - organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by + organized into _data schema_ files, sharded by patient and sorted, for each patient, by timestamp. - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_ about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_ From 1da2ec01ebb92fbfafbf33b20d7de51e9810c162 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:25:40 -0400 Subject: [PATCH 13/20] Updated schemas and documentation with consensus terms and deduplicated file path instructions. --- README.md | 162 ++++++++++++++++++++++++++++++++------------- src/meds/schema.py | 49 +++++--------- 2 files changed, 132 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 41eca97..9ab090e 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ up MEDS, we will define some key terminology that we use in this standard. a hospital admission rather than an individual. 2. 
A _code_ is the categorical descriptor of what is being observed in any given observation of a patient. In particular, in almost all structured, longitudinal datasets, a measurement can be described as - consisting of a tuple containing a `patient_id` (who this measurement is about); a `timestamp` (when this + consisting of a tuple containing a `patient_id` (who this measurement is about); a `time` (when this measurement happened); some categorical qualifier describing what was measured, which we will call a `code`; a value of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one or more additional measurement properties that describe the measurement in a @@ -28,7 +28,7 @@ MEDS consists of four main data components/schemas: 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of patient observations, in the dataset. 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient - at a given timestamp in the patient record. + at a given time in the patient record. 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the observed measurements in the dataset. 4. A _dataset metadata schema_. This schema contains metadata about the MEDS dataset itself, such as when it @@ -43,7 +43,7 @@ found in the following subfolders: series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all organized into _data schema_ files, sharded by patient and sorted, for each patient, by - timestamp. + time. - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_ about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_ sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but @@ -69,67 +69,135 @@ does not exist. ### Schemas -**TODO**: copy here from the schema file and describe. - +#### The Data Schema +MEDS data also must satisfy two important properties: + 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be + in one and only one parquet file. + 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. +The data schema has four mandatory fields: + 1. `patient_id`: The ID of the patient this event is about. + 2. `time`: The time of the event. This field is nullable for static events. + 3. `code`: The code of the event. + 4. `numeric_value`: The numeric value of the event. This field is nullable for non-numeric events. +In addition, it can contain any number of custom properties to further enrich observations. The python +function below generates a pyarrow schema for a given set of custom properties. +```python +def data_schema(custom_properties=[]): + return pa.schema( + [ + ("patient_id", pa.int64()), + ("time", pa.timestamp("us")), # Static events will have a null timestamp + ("code", pa.string()), + ("numeric_value", pa.float32()), + ] + custom_properties + ) +``` +#### The label schema. +Models, when predicting this label, are allowed to use all data about a patient up to and including the +prediction time. Exclusive prediction times are not currently supported, but if you have a use case for them +please add a GitHub issue. -## Old -- to be deleted. 
+```python +label = pa.schema( + [ + ("patient_id", pa.int64()), + ("prediction_time", pa.timestamp("us")), + ("boolean_value", pa.bool_()), + ("integer_value", pa.int64()), + ("float_value", pa.float64()), + ("categorical_value", pa.string()), + ] +) + +Label = TypedDict("Label", { + "patient_id": int, + "prediction_time": datetime.datetime, + "boolean_value": Optional[bool], + "integer_value" : Optional[int], + "float_value" : Optional[float], + "categorical_value" : Optional[str], +}, total=False) +``` -The core of the standard is that we define a ``patient`` data structure that contains a series of time stamped events, that in turn contain measurements of various sorts. +#### The patient split schema. -The Python type signature for the schema is as follows: +Three sentinel split names are defined for convenience and shared processing: + 1. A training split, named `train`, used for ML model training. + 2. A tuning split, named `tuning`, used for hyperparameter tuning. This is sometimes also called a + "validation" split or a "dev" split. In many cases, standardizing on a tuning split is not necessary and + models should feel free to merge this split with the training split if desired. + 3. A held-out split, named `held_out`, used for final model evaluation. In many cases, this is also called a + "test" split. When performing benchmarking, this split should not be used at all for model selection, + training, or for any purposes up to final validation. -```python +Additional split names can be used by the user as desired. -Patient = TypedDict('Patient', { - 'patient_id': int, - 'events': List[Event], -}) - -Event = TypedDict('Event',{ - 'time': NotRequired[datetime.datetime], # Static events will have a null timestamp here - 'code': str, - 'text_value': NotRequired[str], - 'numeric_value': NotRequired[float], - 'datetime_value': NotRequired[datetime.datetime], - 'metadata': NotRequired[Mapping[str, Any]], -}) +``` +train_split = "train" +tuning_split = "tuning" +held_out_split = "held_out" + +patient_split = pa.schema( + [ + ("patient_id", pa.int64()), + ("split", pa.string()), + ] +) + +PatientSplit = TypedDict("PatientSplit", { + "patient_id": int, + "split": str, +}, total=True) ``` -We also provide ETLs to convert common data formats to this schema: https://github.com/Medical-Event-Data-Standard/meds_etl - -An example patient following this schema +#### The dataset metadata schema. 
```python - -patient_data = { - "patient_id": 123, - "events": [ - # Store static events like gender with a null timestamp - { - "time": None, - "code": "Gender/F", +dataset_metadata = { + "type": "object", + "properties": { + "dataset_name": {"type": "string"}, + "dataset_version": {"type": "string"}, + "etl_name": {"type": "string"}, + "etl_version": {"type": "string"}, + "meds_version": {"type": "string"}, }, +} - # It's recommended to record birth using the birth_code - { - "time": datetime.datetime(1995, 8, 20), - "code": meds.birth_code, - }, +# Python type for the above schema - # Arbitrary events with sophisticated data can also be added +DatasetMetadata = TypedDict( + "DatasetMetadata", { - "time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "code": "some_code", - "text_value": "Example", - "numeric_value": 10.0, - "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0), - "properties": None + "dataset_name": NotRequired[str], + "dataset_version": NotRequired[str], + "etl_name": NotRequired[str], + "etl_version": NotRequired[str], + "meds_version": NotRequired[str], }, - ] -} + total=False, +) +``` + +#### The code metadata schema. + +```python +def code_metadata_schema(custom_per_code_properties=[]): + code_metadata = pa.schema( + [ + ("code", pa.string()), + ("description", pa.string()), + ("parent_codes", pa.list(pa.string()), + ] + custom_per_code_properties + ) + + return code_metadata + +# Python type for the above schema +CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) ``` diff --git a/src/meds/schema.py b/src/meds/schema.py index fff43c7..4a91f0d 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -1,38 +1,23 @@ +"""The core schemas for the MEDS format. + +Please see the README for more information, including expected file organization on disk, more details on what +each schema should capture, etc. +""" import datetime from typing import Any, List, Mapping, Optional import pyarrow as pa from typing_extensions import NotRequired, TypedDict -# Medical Event Data Standard consists of four main components: -# 1. A patient event schema -# 2. A label schema -# 3. A dataset metadata schema. -# 4. A code metadata schema. -# -# Event data, labels, and code metadata is specified using pyarrow. Dataset metadata is specified using JSON. - -# We also specify a directory structure for how these should be laid out on disk. - -# Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: -# - data/ -# A (possibly nested) folder containing multiple parquet files containing patient event data following the events_schema folder. -# glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. -# - dataset_metadata.json -# Dataset level metadata containing information about the ETL used, data version, etc -# - (Optional) code_metadata.parquet -# Code level metadata containing information about the code descriptions, standard mappings, etc -# - (Optional) patient_split.csv -# A specification of patient splits that should be used. ############################################################ -# The patient event data schema. +# The data schema. # -# Patient event data also must satisfy two important properties: +# MEDS data also must satisfy two important properties: # -# 1. Patient event data cannot be split across parquet files. If a patient is in a dataset it must be in one and only one parquet file. -# 2. 
Patient event data must be contiguous within a particular parquet file and sorted by event time. +# 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be in one and only one parquet file. +# 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. # Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), # which vastly simplifies many data analysis pipelines. @@ -41,14 +26,14 @@ birth_code = "MEDS_BIRTH" death_code = "MEDS_DEATH" -def patient_events_schema(custom_per_event_properties=[]): +def data_schema(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), ("numeric_value", pa.float32()), - ] + custom_per_event_properties + ] + custom_properties ) # No python type is provided because Python tools for processing MEDS data will often provide their own types. @@ -56,7 +41,9 @@ def patient_events_schema(custom_per_event_properties=[]): ############################################################ -# The label schema. +# The label schema. Models, when predicting this label, are allowed to use all data about a patient up to and +# including the prediction time. Exclusive prediction times are not currently supported, but if you have a use +# case for them please add a GitHub issue. label = pa.schema( [ @@ -85,9 +72,9 @@ def patient_events_schema(custom_per_event_properties=[]): # The patient split schema. -train_split = "train" -tuning_split = "tuning" -test_split = "test" +train_split = "train" # For ML training. +tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". +held_out_split = "held_out" # For final ML evaluation. Also often called "test". patient_split = pa.schema( [ @@ -105,7 +92,6 @@ def patient_events_schema(custom_per_event_properties=[]): # The dataset metadata schema. # This is a JSON schema. -# This data should be stored in dataset_metadata.json within the dataset folder. dataset_metadata = { @@ -137,7 +123,6 @@ def patient_events_schema(custom_per_event_properties=[]): # The code metadata schema. # This is a parquet schema. -# This data should be stored in code_metadata.parquet within the dataset folder. def code_metadata_schema(custom_per_code_properties=[]): code_metadata = pa.schema( From e10c22d2cab9213aefe92ae79e454e66e8db261b Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:28:10 -0400 Subject: [PATCH 14/20] Removed unneeded python object format. --- README.md | 5 ----- src/meds/schema.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/README.md b/README.md index 9ab090e..d0c3c82 100644 --- a/README.md +++ b/README.md @@ -147,11 +147,6 @@ patient_split = pa.schema( ("split", pa.string()), ] ) - -PatientSplit = TypedDict("PatientSplit", { - "patient_id": int, - "split": str, -}, total=True) ``` #### The dataset metadata schema. diff --git a/src/meds/schema.py b/src/meds/schema.py index 4a91f0d..919b7e6 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -83,11 +83,6 @@ def data_schema(custom_properties=[]): ] ) -PatientSplit = TypedDict("PatientSplit", { - "patient_id": int, - "split": str, -}, total=True) - ############################################################ # The dataset metadata schema. 
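To illustrate the patient split schema as it stands at this point in the series, the sketch below materializes a split assignment table and writes it to Parquet. It is illustrative only: the patient IDs and output path are invented, and the names refer to the definitions in `src/meds/schema.py` above.

```python
import pyarrow as pa
import pyarrow.parquet as pq

from meds.schema import held_out_split, patient_split, train_split, tuning_split

# Hypothetical split assignment using the sentinel split names.
splits = pa.Table.from_pylist(
    [
        {"patient_id": 1, "split": train_split},
        {"patient_id": 2, "split": tuning_split},
        {"patient_id": 3, "split": held_out_split},
    ],
    schema=patient_split,
)
pq.write_table(splits, "patient_splits.parquet")
```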
From dd4ca79c7ba168adc1f9c0e2948db9e203986adb Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:41:16 -0400 Subject: [PATCH 15/20] Adding missing close paren --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 1fd82de..0c1e73d 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -131,7 +131,7 @@ def code_metadata_schema(custom_per_code_properties=[]): [ ("code", pa.string()), ("description", pa.string()), - ("parent_codes", pa.list(pa.string()), + ("parent_codes", pa.list(pa.string())), ] + custom_per_code_properties ) From 986b2964c613da4be461fae573a4b75a779c6e12 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:59:05 -0400 Subject: [PATCH 16/20] Fixed another test error to do with imports. --- src/meds/__init__.py | 20 ++++++++++---------- src/meds/schema.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/meds/__init__.py b/src/meds/__init__.py index 2c76ef6..eb6ebd6 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -1,26 +1,26 @@ from meds._version import __version__ # noqa -from .schema import (patient_schema, Event, Patient, label, Label, - code_metadata_entry, code_metadata, dataset_metadata, - CodeMetadataEntry, CodeMetadata, DatasetMetadata, birth_code, - death_code) +from .schema import ( + data_schema, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata, + dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code +) # List all objects that we want to export _exported_objects = { - 'patient_schema': patient_schema, - 'Event': Event, - 'Patient': Patient, + 'data_schema': data_schema, 'label': label, 'Label': Label, - 'code_metadata_entry': code_metadata_entry, + 'train_split': train_split, + 'tuning_split': tuning_split, + 'held_out_split': held_out_split, + 'patient_split': patient_split, 'code_metadata': code_metadata, 'dataset_metadata': dataset_metadata, - 'CodeMetadataEntry': CodeMetadataEntry, 'CodeMetadata': CodeMetadata, 'DatasetMetadata': DatasetMetadata, 'birth_code': birth_code, - 'death_code': death_code + 'death_code': death_code, } __all__ = list(_exported_objects.keys()) diff --git a/src/meds/schema.py b/src/meds/schema.py index 0c1e73d..03b4d73 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -49,7 +49,7 @@ def data_schema(custom_properties=[]): [ ("patient_id", pa.int64()), # The patient who is being labeled. - + ("prediction_time", pa.timestamp("us")), # The time the prediction is made. # Machine learning models are allowed to use features that have timestamps less than or equal From 34465fe6a7112919fb2ef3d22745e68a2f85b007 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:01:07 -0400 Subject: [PATCH 17/20] Standardized schema naming convention and fixed another typo. --- README.md | 8 +++----- src/meds/__init__.py | 4 ++-- src/meds/schema.py | 8 +++----- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d0c3c82..ebd5e9c 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ In addition, it can contain any number of custom properties to further enrich ob function below generates a pyarrow schema for a given set of custom properties. ```python -def data_schema(custom_properties=[]): +def data(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -181,8 +181,8 @@ DatasetMetadata = TypedDict( #### The code metadata schema. 

```python
-def code_metadata_schema(custom_per_code_properties=[]):
-    code_metadata = pa.schema(
+def code_metadata(custom_per_code_properties=[]):
+    return pa.schema(
         [
             ("code", pa.string()),
             ("description", pa.string()),
             ("parent_codes", pa.list_(pa.string())),
         ] + custom_per_code_properties
     )
 
-    return code_metadata
-
 # Python type for the above schema
 
 CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)
 ```
diff --git a/src/meds/__init__.py b/src/meds/__init__.py
index eb6ebd6..8853647 100644
--- a/src/meds/__init__.py
+++ b/src/meds/__init__.py
@@ -1,14 +1,14 @@
 from meds._version import __version__  # noqa
 
 from .schema import (
-    data_schema, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata,
+    data, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata,
     dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code
 )
 
 # List all objects that we want to export
 _exported_objects = {
-    'data_schema': data_schema,
+    'data': data,
     'label': label,
     'Label': Label,

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 03b4d73..3756283 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -26,7 +26,7 @@ birth_code = "MEDS_BIRTH"
 death_code = "MEDS_DEATH"
 
-def data_schema(custom_properties=[]):
+def data(custom_properties=[]):
     return pa.schema(
         [
             ("patient_id", pa.int64()),
@@ -126,8 +126,8 @@ def data_schema(custom_properties=[]):
 # The code metadata schema.
 # This is a parquet schema.
 
-def code_metadata_schema(custom_per_code_properties=[]):
-    code_metadata = pa.schema(
+def code_metadata(custom_per_code_properties=[]):
+    return pa.schema(
         [
             ("code", pa.string()),
             ("description", pa.string()),
             ("parent_codes", pa.list(pa.string())),
         ] + custom_per_code_properties
     )
 
-    return code_metadata
-
 # Python type for the above schema
 
 CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)

From 94d4ce9776f3a5bd5cb94f2776ff319c389db3fc Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Tue, 30 Jul 2024 15:09:29 -0400
Subject: [PATCH 18/20] Maybe fixed tests

---
 tests/test_schema.py | 68 ++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 21 deletions(-)

diff --git a/tests/test_schema.py b/tests/test_schema.py
index ccdf31b..27fc192 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -4,30 +4,62 @@
 import pyarrow as pa
 import pytest
 
-from meds import patient_schema, label, dataset_metadata
+from meds import (
+    data, label, dataset_metadata, patient_split, code_metadata, train_split, tuning_split, held_out_split
+)
 
-
-def test_patient_schema():
+def test_data_schema():
     """
-    Test that mock patient data follows the patient_schema schema.
+    Test that mock data follows the data schema.
""" # Each element in the list is a row in the table - patient_data = [ + data = [ { "patient_id": 123, - "events": [{ # Nested list for events - "time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "code": "some_code", - "text_value": "Example", - "numeric_value": 10.0, - "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0), - "properties": None - }] + "time": datetime.datetime(2020, 1, 1, 12, 0, 0), + "code": "some_code", + "text_value": "Example", + "numeric_value": 10.0, } ] - patient_table = pa.Table.from_pylist(patient_data, schema=patient_schema()) - assert patient_table.schema.equals(patient_schema()), "Patient schema does not match" + schema = data([("text_value", pa.string())]) + + table = pa.Table.from_pylist(data, schema=schema) + assert table.schema.equals(schema), "Patient schema does not match" + +def test_code_metadata_schema(): + """ + Test that mock code metadata follows the schema. + """ + # Each element in the list is a row in the table + data = [ + { + "code": "some_code", + "description": "foo", + "parent_code": ["parent_code"], + } + ] + + schema = code_metadata() + + table = pa.Table.from_pylist(data, schema=schema) + assert table.schema.equals(schema), "Code metadata schema does not match" + +def test_patient_split_schema(): + """ + Test that mock data follows the data schema. + """ + # Each element in the list is a row in the table + data = [ + {"patient_id": 123, "split": train_split}, + {"patient_id": 123, "split": tuning_split}, + {"patient_id": 123, "split": held_out_split}, + {"patient_id": 123, "split": "special"}, + ] + + table = pa.Table.from_pylist(data, schema=patient_split) + assert table.schema.equals(patient_split), "Patient split schema does not match" def test_label_schema(): """ @@ -83,12 +115,6 @@ def test_dataset_metadata_schema(): "dataset_version": "1.0", "etl_name": "Test ETL", "etl_version": "1.0", - "code_metadata": { - "test_code": { - "description": "A test code", - "standard_ontology_codes": ["12345"], - } - }, } jsonschema.validate(instance=metadata, schema=dataset_metadata) From d962dac5b950470076b66db49c5dfda37e3151fd Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:13:07 -0400 Subject: [PATCH 19/20] Fixed a typo in the code metadata schema and the tests --- src/meds/schema.py | 2 +- tests/test_schema.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 3756283..a7b67b4 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -131,7 +131,7 @@ def code_metadata(custom_per_code_properties=[]): [ ("code", pa.string()), ("description", pa.string()), - ("parent_codes", pa.list(pa.string())), + ("parent_codes", pa.list_(pa.string())), ] + custom_per_code_properties ) diff --git a/tests/test_schema.py b/tests/test_schema.py index 27fc192..168c4e9 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -13,7 +13,7 @@ def test_data_schema(): Test that mock data follows the data schema. """ # Each element in the list is a row in the table - data = [ + raw_data = [ { "patient_id": 123, "time": datetime.datetime(2020, 1, 1, 12, 0, 0), @@ -25,7 +25,7 @@ def test_data_schema(): schema = data([("text_value", pa.string())]) - table = pa.Table.from_pylist(data, schema=schema) + table = pa.Table.from_pylist(raw_data, schema=schema) assert table.schema.equals(schema), "Patient schema does not match" def test_code_metadata_schema(): @@ -33,7 +33,7 @@ def test_code_metadata_schema(): Test that mock code metadata follows the schema. 
""" # Each element in the list is a row in the table - data = [ + code_metadata = [ { "code": "some_code", "description": "foo", @@ -43,7 +43,7 @@ def test_code_metadata_schema(): schema = code_metadata() - table = pa.Table.from_pylist(data, schema=schema) + table = pa.Table.from_pylist(code_metadata, schema=schema) assert table.schema.equals(schema), "Code metadata schema does not match" def test_patient_split_schema(): @@ -51,14 +51,14 @@ def test_patient_split_schema(): Test that mock data follows the data schema. """ # Each element in the list is a row in the table - data = [ + patient_split_data = [ {"patient_id": 123, "split": train_split}, {"patient_id": 123, "split": tuning_split}, {"patient_id": 123, "split": held_out_split}, {"patient_id": 123, "split": "special"}, ] - table = pa.Table.from_pylist(data, schema=patient_split) + table = pa.Table.from_pylist(patient_split_data, schema=patient_split) assert table.schema.equals(patient_split), "Patient split schema does not match" def test_label_schema(): From ae6d26992620b13e0f50f6adf86929b61753ddf4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:19:17 -0400 Subject: [PATCH 20/20] Adjust naming convention to minimize import and variable name conflicts. --- src/meds/__init__.py | 14 +++++++------- src/meds/schema.py | 10 +++++----- tests/test_schema.py | 29 +++++++++++++++-------------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/meds/__init__.py b/src/meds/__init__.py index 8853647..3d8d36c 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -1,22 +1,22 @@ from meds._version import __version__ # noqa from .schema import ( - data, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata, - dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code + data_schema, label_schema, Label, train_split, tuning_split, held_out_split, patient_split_schema, + code_metadata_schema, dataset_metadata_schema, CodeMetadata, DatasetMetadata, birth_code, death_code ) # List all objects that we want to export _exported_objects = { - 'data': data, - 'label': label, + 'data_schema': data_schema, + 'label_schema': label_schema, 'Label': Label, 'train_split': train_split, 'tuning_split': tuning_split, 'held_out_split': held_out_split, - 'patient_split': patient_split, - 'code_metadata': code_metadata, - 'dataset_metadata': dataset_metadata, + 'patient_split_schema': patient_split_schema, + 'code_metadata_schema': code_metadata_schema, + 'dataset_metadata_schema': dataset_metadata_schema, 'CodeMetadata': CodeMetadata, 'DatasetMetadata': DatasetMetadata, 'birth_code': birth_code, diff --git a/src/meds/schema.py b/src/meds/schema.py index a7b67b4..5b263f4 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -26,7 +26,7 @@ birth_code = "MEDS_BIRTH" death_code = "MEDS_DEATH" -def data(custom_properties=[]): +def data_schema(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -45,7 +45,7 @@ def data(custom_properties=[]): # including the prediction time. Exclusive prediction times are not currently supported, but if you have a use # case for them please add a GitHub issue. -label = pa.schema( +label_schema = pa.schema( [ ("patient_id", pa.int64()), # The patient who is being labeled. @@ -83,7 +83,7 @@ def data(custom_properties=[]): tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". held_out_split = "held_out" # For final ML evaluation. Also often called "test". 
-patient_split = pa.schema( +patient_split_schema = pa.schema( [ ("patient_id", pa.int64()), ("split", pa.string()), @@ -96,7 +96,7 @@ def data(custom_properties=[]): # This is a JSON schema. -dataset_metadata = { +dataset_metadata_schema = { "type": "object", "properties": { "dataset_name": {"type": "string"}, @@ -126,7 +126,7 @@ def data(custom_properties=[]): # The code metadata schema. # This is a parquet schema. -def code_metadata(custom_per_code_properties=[]): +def code_metadata_schema(custom_per_code_properties=[]): return pa.schema( [ ("code", pa.string()), diff --git a/tests/test_schema.py b/tests/test_schema.py index 168c4e9..b945909 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -5,7 +5,8 @@ import pytest from meds import ( - data, label, dataset_metadata, patient_split, code_metadata, train_split, tuning_split, held_out_split + data_schema, label_schema, dataset_metadata_schema, patient_split_schema, code_metadata_schema, + train_split, tuning_split, held_out_split ) def test_data_schema(): @@ -23,7 +24,7 @@ def test_data_schema(): } ] - schema = data([("text_value", pa.string())]) + schema = data_schema([("text_value", pa.string())]) table = pa.Table.from_pylist(raw_data, schema=schema) assert table.schema.equals(schema), "Patient schema does not match" @@ -41,7 +42,7 @@ def test_code_metadata_schema(): } ] - schema = code_metadata() + schema = code_metadata_schema() table = pa.Table.from_pylist(code_metadata, schema=schema) assert table.schema.equals(schema), "Code metadata schema does not match" @@ -58,8 +59,8 @@ def test_patient_split_schema(): {"patient_id": 123, "split": "special"}, ] - table = pa.Table.from_pylist(patient_split_data, schema=patient_split) - assert table.schema.equals(patient_split), "Patient split schema does not match" + table = pa.Table.from_pylist(patient_split_data, schema=patient_split_schema) + assert table.schema.equals(patient_split_schema), "Patient split schema does not match" def test_label_schema(): """ @@ -73,8 +74,8 @@ def test_label_schema(): "boolean_value": True } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -83,8 +84,8 @@ def test_label_schema(): "integer_value": 4 } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -93,8 +94,8 @@ def test_label_schema(): "float_value": 0.4 } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -103,8 +104,8 @@ def test_label_schema(): "categorical_value": "text" } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" def test_dataset_metadata_schema(): """ @@ -117,5 +118,5 @@ def 
test_dataset_metadata_schema(): "etl_version": "1.0", } - jsonschema.validate(instance=metadata, schema=dataset_metadata) + jsonschema.validate(instance=metadata, schema=dataset_metadata_schema) assert True, "Dataset metadata schema validation failed"
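Taken together, the series leaves the package exporting `*_schema` names throughout. The sketch below exercises the final API end to end; it is illustrative only (the codes, IDs, and file paths are invented, and the target directories are assumed to exist).

```python
import datetime

import pyarrow as pa
import pyarrow.parquet as pq

from meds import birth_code, data_schema, label_schema

# Write one data shard against the final data schema.
events = pa.Table.from_pylist(
    [
        {"patient_id": 1, "time": datetime.datetime(1990, 5, 1), "code": birth_code, "numeric_value": None},
        {
            "patient_id": 1,
            "time": datetime.datetime(2020, 1, 1, 9, 30),
            "code": "LOINC/8867-4",  # invented example code
            "numeric_value": 88.0,
        },
    ],
    schema=data_schema(),
)
pq.write_table(events, "data/shard_0.parquet")  # assumes data/ exists

# Write one label shard against the final label schema; unset value columns are null.
labels = pa.Table.from_pylist(
    [{"patient_id": 1, "prediction_time": datetime.datetime(2020, 1, 2), "boolean_value": True}],
    schema=label_schema,
)
pq.write_table(labels, "tasks/mortality/shard_0.parquet")  # assumes tasks/mortality/ exists
```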