From 5952213bc87e2fb272982ad1ebc932d4647c221e Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Thu, 20 Jun 2024 08:27:25 -0700 Subject: [PATCH 01/20] Update schema.py --- src/meds/schema.py | 39 ++++++--------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index edbc9e1..3df6b44 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -21,45 +21,18 @@ birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def patient_schema(per_event_properties_schema=pa.null()): - # Return a patient schema with a particular per event metadata subschema - event = pa.struct( +def events_schema(patient_id_type, custom_per_event_properties=pa.null()): + assert patient_id_type == pa.int64() or patient_id_type == pa.string() + + return pa.schema( [ + ("patient_id", patient_id_type), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), - ("text_value", pa.string()), ("numeric_value", pa.float32()), - ("datetime_value", pa.timestamp("us")), - ("properties", per_event_properties_schema), - ] + ] + custom_per_event_properties ) - patient = pa.schema( - [ - ("patient_id", pa.int64()), - ("events", pa.list_(event)), # Require ordered by time, nulls must be first - ] - ) - - return patient - - -# Python types for the above schema - -Event = TypedDict( - "Event", - { - "time": NotRequired[datetime.datetime], - "code": str, - "text_value": NotRequired[str], - "numeric_value": NotRequired[float], - "datetime_value": NotRequired[datetime.datetime], - "properties": NotRequired[Any], - }, -) - -Patient = TypedDict("Patient", {"patient_id": int, "events": List[Event]}) - ############################################################ # The label schema. 
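As a brief illustration of the flattened layout this patch introduces, the sketch below builds a tiny events table against the new schema. It is illustrative only: the codes and values are invented, and we pass an explicit empty list for the custom properties because the `pa.null()` default cannot be concatenated with a Python list (the next patch changes that default to `[]`).

```python
import datetime

import pyarrow as pa

# Hypothetical usage sketch of the events_schema function defined above.
schema = events_schema(pa.int64(), custom_per_event_properties=[])

events = pa.Table.from_pylist(
    [
        # Static events carry a null time.
        {"patient_id": 123, "time": None, "code": "Gender/F", "numeric_value": None},
        # Dynamic events carry a timestamp and, where applicable, a numeric value.
        {
            "patient_id": 123,
            "time": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "code": "LOINC/8480-6",  # invented example code
            "numeric_value": 120.0,
        },
    ],
    schema=schema,
)
assert events.schema.equals(schema)
```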
From ca6ab08fc24905dee8dc8db329fb1633ad4d43a1 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Thu, 20 Jun 2024 08:29:01 -0700 Subject: [PATCH 02/20] Update schema.py --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 3df6b44..dfa97c7 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -21,7 +21,7 @@ birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def events_schema(patient_id_type, custom_per_event_properties=pa.null()): +def events_schema(patient_id_type, custom_per_event_properties=[]): assert patient_id_type == pa.int64() or patient_id_type == pa.string() return pa.schema( From 4ca8f6a98738a10c969838033c291a9379591a03 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:04:48 -0700 Subject: [PATCH 03/20] Update schema.py --- src/meds/schema.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index dfa97c7..861f3eb 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -22,11 +22,9 @@ death_code = "SNOMED/419620001" def events_schema(patient_id_type, custom_per_event_properties=[]): - assert patient_id_type == pa.int64() or patient_id_type == pa.string() - return pa.schema( [ - ("patient_id", patient_id_type), + ("patient_id", pa.int64()), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), ("numeric_value", pa.float32()), From 52d560f2b03c48493d2a2cddc64b190534f2cdc0 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:19:32 -0700 Subject: [PATCH 04/20] Update schema.py --- src/meds/schema.py | 77 +++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 861f3eb..760beea 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -4,24 +4,42 @@ import pyarrow as pa from typing_extensions import NotRequired, TypedDict -# Medical Event Data Standard consists of three main components: -# 1. A patient data schema +# Medical Event Data Standard consists of four main components: +# 1. A patient event schema # 2. A label schema # 3. A dataset metadata schema. +# 4. A code metadata schema. # -# Patient data and labels are specified using pyarrow. Dataset metadata is specified using JSON. +# Event data, labels, and code metadata is specified using pyarrow. Dataset metadata is specified using JSON. -# We also provide TypedDict Python type signatures for these schemas. +# We also specify a directory structure for how these should be laid out on disk. + +# Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: +# - data/ +# A (possibly) nested folder containing multiple parquet files containing patient event data following the events_schema folder. +# glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. +# - dataset_metadata.json +# Dataset level metadata containing information about the ETL used, data version, etc +# - code_metadata.parquet +# Code level metadata containing information about the code descriptions, standard mappings, etc ############################################################ -# The patient data schema. +# The patient event data schema. +# +# Patient event data also must satisfy two important properties: +# +# 1. Patient event data cannot be split across parquet files. 
If a patient is in a dataset it must be in one and only one parquet file. +# 2. Patient event data must be contiguous within a particular parquet file and sorted by event time. + +# Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), +# which vastly simplifies many data analysis pipelines. # We define some codes for particularly important events birth_code = "SNOMED/184099003" death_code = "SNOMED/419620001" -def events_schema(patient_id_type, custom_per_event_properties=[]): +def patient_events_schema(custom_per_event_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -31,6 +49,9 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): ] + custom_per_event_properties ) +# No python type is provided because Python tools for processing MEDS data will often provide their own types. +# See https://github.com/EthanSteinberg/meds_reader/blob/0.0.6/src/meds_reader/__init__.pyi#L55 for example. + ############################################################ # The label schema. @@ -55,26 +76,14 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "integer_value" : Optional[int], "float_value" : Optional[float], "categorical_value" : Optional[str], -}) +}, total=False) ############################################################ # The dataset metadata schema. # This is a JSON schema. -# This data should be stored in metadata.json within the dataset folder. - -code_metadata_entry = { - "type": "object", - "properties": { - "description": {"type": "string"}, - "parent_codes": {"type": "array", "items": {"type": "string"}}, - }, -} +# This data should be stored in dataset_metadata.json within the dataset folder. -code_metadata = { - "type": "object", - "additionalProperties": code_metadata_entry, -} dataset_metadata = { "type": "object", @@ -83,15 +92,12 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "dataset_version": {"type": "string"}, "etl_name": {"type": "string"}, "etl_version": {"type": "string"}, - "code_metadata": code_metadata, "meds_version": {"type": "string"}, }, } -# Python types for the above schema +# Python type for the above schema -CodeMetadataEntry = TypedDict("CodeMetadataEntry", {"description": str, "parent_codes": List[str]}) -CodeMetadata = Mapping[str, CodeMetadataEntry] DatasetMetadata = TypedDict( "DatasetMetadata", { @@ -99,7 +105,28 @@ def events_schema(patient_id_type, custom_per_event_properties=[]): "dataset_version": NotRequired[str], "etl_name": NotRequired[str], "etl_version": NotRequired[str], - "code_metadata": NotRequired[CodeMetadata], "meds_version": NotRequired[str], }, + total=False, ) + +############################################################ + +# The code metadata schema. +# This is a parquet schema. +# This data should be stored in code_metadata.parquet within the dataset folder. 
+ +def code_metadata_schema(custom_per_event_properties=[]): + code_metadata = pa.schema( + [ + ("code", pa.string()), + ("description", pa.string()), + ("parent_codes", pa.list(pa.string()), + ] + custom_per_event_properties + ) + + return code_metadata + +# Python type for the above schema + +CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) From 421df192145a015d4320f03b9e922406c9e7f246 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:22:37 -0700 Subject: [PATCH 05/20] Update schema.py --- src/meds/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 760beea..aea10a3 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -116,13 +116,13 @@ def patient_events_schema(custom_per_event_properties=[]): # This is a parquet schema. # This data should be stored in code_metadata.parquet within the dataset folder. -def code_metadata_schema(custom_per_event_properties=[]): +def code_metadata_schema(custom_per_code_properties=[]): code_metadata = pa.schema( [ ("code", pa.string()), ("description", pa.string()), ("parent_codes", pa.list(pa.string()), - ] + custom_per_event_properties + ] + custom_per_code_properties ) return code_metadata From 28b21b02b98198abe25bac26b5563b7e26622993 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 15 Jul 2024 06:23:07 -0700 Subject: [PATCH 06/20] Update schema.py --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index aea10a3..49eb418 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -16,7 +16,7 @@ # Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: # - data/ -# A (possibly) nested folder containing multiple parquet files containing patient event data following the events_schema folder. +# A (possibly nested) folder containing multiple parquet files containing patient event data following the events_schema folder. # glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. # - dataset_metadata.json # Dataset level metadata containing information about the ETL used, data version, etc From 6544c3d1dcc7bb545357e644fad2572985f826b8 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Fri, 19 Jul 2024 07:38:35 -0700 Subject: [PATCH 07/20] Update schema.py --- src/meds/schema.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 49eb418..f7a48ca 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -20,8 +20,10 @@ # glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. # - dataset_metadata.json # Dataset level metadata containing information about the ETL used, data version, etc -# - code_metadata.parquet +# - (Optional) code_metadata.parquet # Code level metadata containing information about the code descriptions, standard mappings, etc +# - (Optional) patient_split.csv +# A specification of patient splits that should be used. ############################################################ @@ -78,6 +80,22 @@ def patient_events_schema(custom_per_event_properties=[]): "categorical_value" : Optional[str], }, total=False) + +############################################################ + +# The patient split schema. 
+ +train_split = "train" +tuning_split = "tuning" +test_split = "test" + +patient_split = pa.schema( + [ + ("patient_id", pa.int64()), + ("split", pa.string()), + ] +) + ############################################################ # The dataset metadata schema. From 3f7c441c09d16bc4aafac5238956e1306c7305e0 Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Fri, 19 Jul 2024 07:51:47 -0700 Subject: [PATCH 08/20] Update schema.py --- src/meds/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index f7a48ca..68d7fc0 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -38,8 +38,8 @@ # which vastly simplifies many data analysis pipelines. # We define some codes for particularly important events -birth_code = "SNOMED/184099003" -death_code = "SNOMED/419620001" +birth_code = "MEDS_BIRTH" +death_code = "MEDS_DEATH" def patient_events_schema(custom_per_event_properties=[]): return pa.schema( From c922b44ef5cb7515a1f5309a8b20dcfce10e7e9f Mon Sep 17 00:00:00 2001 From: Ethan Steinberg Date: Mon, 29 Jul 2024 20:15:30 -0700 Subject: [PATCH 09/20] Update schema.py --- src/meds/schema.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 68d7fc0..5839ce2 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -61,7 +61,14 @@ def patient_events_schema(custom_per_event_properties=[]): label = pa.schema( [ ("patient_id", pa.int64()), - ("prediction_time", pa.timestamp("us")), + # The patient who is being labeled. + + ("prediction_time", pa.timestamp("us")), + # The time the prediction is made. + # Machine learning models are allowed to use features that have timestamps less than or equal + # to this timestamp. + + # Possible values for the label. ("boolean_value", pa.bool_()), ("integer_value", pa.int64()), ("float_value", pa.float64()), From 3ed25abebb126f6d22720ac6b22618f42cbd4497 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Mon, 29 Jul 2024 23:55:09 -0400 Subject: [PATCH 10/20] Started updating README --- README.md | 100 ++++++++++++++++++++++++++++++++++++++++++++- src/meds/schema.py | 7 +++- 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b097d77..f33a326 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,104 @@ # Medical Event Data Standard -The Medical Event Data Standard (MEDS) is a draft data schema for storing streams of medical events, often sourced from either Electronic Health Records or claims records. +The Medical Event Data Standard (MEDS) is a data schema for storing streams of medical events, often +sourced from either Electronic Health Records or claims records. Before we define the various schema that make +up MEDS, we will define some key terminology that we use in this standard. + +## Terminology + 1. A _patient_ in a MEDS dataset is the primary entity being described by the sequences of care observations + in the underlying dataset. In most cases, _patients_ will, naturally, be individuals, and the sequences + of care observations will cover all known observations about those individuals in a source health + datasets. 
However, in some cases, data may be organized so that we cannot describe all the data for an
    individual reliably in a dataset, but instead can only describe subsequences of an individual's data,
    such as in datasets that only link an individual's data observations together if they are within the same
    hospital admission, regardless of how many admissions that individual has in the dataset (such as the
    [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _patient_ in the MEDS dataset may refer to
    a hospital admission rather than an individual.
 2. A _measurement_ or _patient measurement_ or _observation_ in a MEDS dataset refers to a single measurable
    quantity observed about the patient during their care. These observations can take on many forms, such as
    observing a diagnostic code being applied to the patient, observing a patient's admission or transfer
    from one unit to another, observing a laboratory test result, but always correspond to a single
    measurable unit about a single patient.
 3. A _code_ is the categorical descriptor of what happened in a patient measurement. In particular, in
    almost all structured, longitudinal datasets, a measurement can be described as consisting of a tuple
    containing a `patient_id` (who this measurement is about); a `timestamp` (when this measurement
    happened); some categorical qualifier describing what was measured, which we will call a `code`; a value
    of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one
    or more additional measurement properties that describe the measurement in a non-standardized manner.
 4. An _event_ or _patient event_ in a MEDS dataset corresponds to all observations about a patient that
    occur at a unique timestamp (within the level of temporal granularity in the MEDS dataset).
 5. A _static_ measurement is one that occurs without a source timestamp being recorded in the raw dataset
    **and** that can be interpreted as being applicable to the patient at any point in time during their
    care. All other measurements observed in the raw dataset will be considered to be _dynamic_ measurements
    that can vary in time in an unknown manner. Note that there is a third class of measurements that may,
    at times, be induced in the dataset, known as _time-derived_ measurements, which correspond to measurements
    that occur in time like _dynamic_ measurements but can be computed deterministically in advance using
    only the timestamp at which a measurement occurs and the patient's static (or, rarely, historical) data,
    such as the patient's age or the season of the year in which a measurement occurs. These are rarely
    recorded in the raw data but may be used during modeling.

## Core MEDS Data Organization

MEDS consists of four main data components/schemas:
 1. A _patient measurement schema_. This schema describes the underlying medical data, organized as sequences
    of patient measurements, in the dataset.
 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient
    at a given timestamp in the patient record.
 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the
    observed measurements in the dataset.
 4. A _dataset metadata schema_. This schema contains metadata about the MEDS dataset itself, such as when it
    was produced, using what version of what code, etc.
 5. A _patient split schema_. This schema contains metadata about how patients in the MEDS dataset are
    assigned to different subpopulations, most commonly used to dictate ML splits.

### Organization on Disk
Given a MEDS dataset stored in the `$MEDS_ROOT` directory, data of the various schemas outlined above can be
found in the following subfolders:
  - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a
    series of possibly nested sharded dataframes, often as `parquet` files. In particular, the file glob
    `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all
    organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by
    timestamp.
  - `$MEDS_ROOT/metadata/codes.csv`: This file contains per-code metadata in the _code metadata schema_
    about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_
    sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but
    these will always appear in subdirectories of `$MEDS_ROOT/metadata/` rather than at the top level, and
    should generally not be used for overall metadata operations. The preferred file format for this dataframe
    is CSV for ease of human inspection and readability.
  - `$MEDS_ROOT/metadata/dataset.json`: This schema contains metadata in the _dataset metadata schema_ about
    the dataset and its production process.
  - `$MEDS_ROOT/metadata/patient_splits.csv`: This schema contains information in the _patient split schema_
    about what splits different patients are in. Unlike the raw data, which should preferably be stored in
    the parquet format for compression and columnar read capabilities, the patient splits are
    preferably stored in a comma separated value (CSV) format for ease of readability and shareability.

Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a
`$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates
different tasks from one another. In particular, the file glob `glob($TASK_ROOT/$TASK_NAME/**/*.parquet)` will
retrieve a sharded set of dataframes in the _TODO label_ schema where the sharding matches up precisely with
the sharding used in the raw `$MEDS_ROOT/data/**/*.parquet` files (e.g., the file
`$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` will cover the labels for the same set of patients as are
contained in the raw data file at `$MEDS_ROOT/data/**/*.parquet`). Note that (1) `$TASK_ROOT` may be a subdir
of `$MEDS_ROOT` (e.g., often `$TASK_ROOT` will be set to `$MEDS_ROOT/tasks`), (2) `$TASK_NAME` may have `/`s
in it, thereby rendering the task label directory a deep, nested subdir of `$TASK_ROOT`, and (3) in some
cases, there may be no task labels for a shard of the raw data, if no patient in that shard qualifies for that
task, in which case it may be true that either `$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` is empty or that it
does not exist.

While we give preferred file formats in the list above, the important thing about these data are that they are
stored in the appropriate schemas, not that they use the preferred file formats. Datasets can be stored using
parquet files for splits or CSV files for raw datasets and still be compliant with the MEDS format.

### Schemas

**TODO**: copy here from the schema file and describe.




## Old -- to be deleted.

The core of the standard is that we define a ``patient`` data structure that contains a series of time stamped events, that in turn contain measurements of various sorts.

The Python type signature for the schema is as follows:

```python

Patient = TypedDict('Patient', {
    'patient_id': int,
    'events': List[Event],
})

Event = TypedDict('Event',{
    'time': NotRequired[datetime.datetime], # Static events will have a null timestamp here
    'code': str,
    'text_value': NotRequired[str],
    'numeric_value': NotRequired[float],
    'datetime_value': NotRequired[datetime.datetime],
    'metadata': NotRequired[Mapping[str, Any]],
})
```

We also provide ETLs to convert common data formats to this schema: https://github.com/Medical-Event-Data-Standard/meds_etl

An example patient following this schema

```python

patient_data = {
    "patient_id": 123,
    "events": [
        # Store static events like gender with a null timestamp
        {
            "time": None,
            "code": "Gender/F",
        },

        # It's recommended to record birth using the birth_code
        {
            "time": datetime.datetime(1995, 8, 20),
            "code": meds.birth_code,
        },

        # Arbitrary events with sophisticated data can also be added
        {
            "time": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "code": "some_code",
            "text_value": "Example",
            "numeric_value": 10.0,
            "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0),
            "properties": None
        },
    ]
}
```

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 68d7fc0..fff43c7 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -44,7 +44,7 @@ def patient_events_schema(custom_per_event_properties=[]):
 
     return pa.schema(
         [
-            ("patient_id", pa.int64()),
+            ("patient_id", pa.int64()), 
             ("time", pa.timestamp("us")), # Static events will have a null timestamp
             ("code", pa.string()),
             ("numeric_value", pa.float32()),
@@ -96,6 +96,11 @@ def patient_events_schema(custom_per_event_properties=[]):
     ]
 )
 
+PatientSplit = TypedDict("PatientSplit", {
+    "patient_id": int,
+    "split": str,
+}, total=True)
+
 ############################################################
 
 # The dataset metadata schema.

From ed9cb9150b9d2c2c4063e0ff771d625a5785c7cc Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Tue, 30 Jul 2024 09:04:44 -0400
Subject: [PATCH 11/20] Updating to mandatory file formats.

---
 README.md | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index f33a326..3d7b6d7 100644
--- a/README.md
+++ b/README.md
@@ -55,22 +55,19 @@ MEDS consists of four main data components/schemas:
 Given a MEDS dataset stored in the `$MEDS_ROOT` directory, data of the various schemas outlined above can be
 found in the following subfolders:
   - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a
-    series of possibly nested sharded dataframes, often as `parquet` files. In particular, the file glob
+    series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob
     `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all
     organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by
     timestamp.
-  - `$MEDS_ROOT/metadata/codes.csv`: This file contains per-code metadata in the _code metadata schema_
+  - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_
     about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_
     sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but
     these will always appear in subdirectories of `$MEDS_ROOT/metadata/` rather than at the top level, and
-    should generally not be used for overall metadata operations. The preferred file format for this dataframe
-    is CSV for ease of human inspection and readability.
+    should generally not be used for overall metadata operations.
   - `$MEDS_ROOT/metadata/dataset.json`: This schema contains metadata in the _dataset metadata schema_ about
     the dataset and its production process.
-  - `$MEDS_ROOT/metadata/patient_splits.csv`: This schema contains information in the _patient split schema_
-    about what splits different patients are in. Unlike the raw data, which should preferably be stored in
-    the parquet format for compression and columnar read capabilities, the patient splits are
-    preferably stored in a comma separated value (CSV) format for ease of readability and shareability.
+  - `$MEDS_ROOT/metadata/patient_splits.parquet`: This schema contains information in the _patient split
+    schema_ about what splits different patients are in.
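To make this on-disk layout concrete, the sketch below shows how a consumer might open each component. It is illustrative only: the `load_meds_dataset` helper is hypothetical and not part of the standard.

```python
import glob
import json
import os

import pyarrow.parquet as pq


def load_meds_dataset(meds_root):
    # Hypothetical loader following the directory layout described above.
    data_files = sorted(
        glob.glob(os.path.join(meds_root, "data", "**", "*.parquet"), recursive=True)
    )
    code_metadata = pq.read_table(os.path.join(meds_root, "metadata", "codes.parquet"))
    with open(os.path.join(meds_root, "metadata", "dataset.json")) as f:
        dataset_metadata = json.load(f)
    patient_splits = pq.read_table(os.path.join(meds_root, "metadata", "patient_splits.parquet"))
    return data_files, code_metadata, dataset_metadata, patient_splits
```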
Task label dataframes are stored in the _TODO label_ schema, in a file path that depends on both a `$TASK_ROOT` directory where task label dataframes are stored and a `$TASK_NAME` parameter that separates @@ -85,10 +82,6 @@ cases, there may be no task labels for a shard of the raw data, if no patient in task, in which case it may be true that either `$TASK_ROOT/$TASK_NAME/$SHARD_NAME.parquet` is empty or that it does not exist. -While we give preferred file formats in the list above, the important thing about these data are that they are -stored in the appropriate schemas, not that they use the preferred file formats. Datasets can be stored using -parquet files for splits or CSV files for raw datasets and still be compliant with the MEDS format. - ### Schemas **TODO**: copy here from the schema file and describe. From 59856292b93e54f798b8713ebd18ad501feb51d7 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:14:29 -0400 Subject: [PATCH 12/20] Removed controversial or unneeded terms --- README.md | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3d7b6d7..41eca97 100644 --- a/README.md +++ b/README.md @@ -14,34 +14,19 @@ up MEDS, we will define some key terminology that we use in this standard. hospital admission, regardless of how many admissions that individual has in the dataset (such as the [eICU](https://eicu-crd.mit.edu/) dataset). In these cases, a _patient_ in the MEDS dataset may refer to a hospital admission rather than an individual. - 2. A _measurement_ or _patient measurement_ or _observation_ in a MEDS dataset refers to a single measurable - quantity observed about the patient during their care. These observations can take on many forms, such as - observing a diagnostic code being applied to the patient, observing a patient's admission or transfer - from one unit to another, observing a laboratory test result, but always correspond to a single - measureable unit about a single patient. - 3. A _code_ is the categorical descriptor of what happened in a patient measurement. In particular, in - almost all structured, longitudinal datasets, a measurement can be described as consisting of a tuple - containing a `patient_id` (who this measurement is about); a `timestamp` (when this measurement - happened); some categorical qualifier describing what was measured, which we will call a `code`; a value - of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one - or more additional measurement properties that describe the measurement in a non-standardized manner. - 4. An _event_ or _patient event_ in a MEDS dataset corresponds to all observations about a patient that - occur at a unique timestamp (within the level of temporal granularity in the MEDS dataset). - 5. A _static_ measurement is one that occurs without a source timestamp being recorded in the raw dataset - **and** that can be interpreted as being applicable to the patient at any point in time during their - care. All other measurements observed in the raw dataset will be considered to be _dynamic_ measurements - that can vary in time in an unknown manner. 
Note that there are a third class of measurements that may, - at times, be induced in the dataset known as _time-derived_ measurements which correspond to measurements - that occur in time like _dynamic_ measurements but can be computed deterministically in advance using - only the timestamp at which a measurement occurs and the patient's static (or, rarely, historical) data, - such as the patient's age or the season of the year in which a measurement occurs. These are rarely - recorded in the raw data but may be used during modeling. + 2. A _code_ is the categorical descriptor of what is being observed in any given observation of a patient. + In particular, in almost all structured, longitudinal datasets, a measurement can be described as + consisting of a tuple containing a `patient_id` (who this measurement is about); a `timestamp` (when this + measurement happened); some categorical qualifier describing what was measured, which we will call a + `code`; a value of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; + and possibly one or more additional measurement properties that describe the measurement in a + non-standardized manner. ## Core MEDS Data Organization MEDS consists of four main data components/schemas: - 1. A _patient measurement schema_. This schema describes the underlying medical data, organized as sequences - of patient measurements, in the dataset. + 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of patient + observations, in the dataset. 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient at a given timestamp in the patient record. 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the @@ -54,10 +39,10 @@ MEDS consists of four main data components/schemas: ### Organization on Disk Given a MEDS dataset stored in the `$MEDS_ROOT` directory data of the various schemas outlined above can be found in the following subfolders: - - `$MEDS_ROOT/data/`: This directory will contain data in the _patient measurement schema_, organized as a + - `$MEDS_ROOT/data/`: This directory will contain data in the _data schema_, organized as a series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all - organized into _patient measurement schema_ files, sharded by patient and sorted, for each patient, by + organized into _data schema_ files, sharded by patient and sorted, for each patient, by timestamp. - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_ about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_ From 1da2ec01ebb92fbfafbf33b20d7de51e9810c162 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:25:40 -0400 Subject: [PATCH 13/20] Updated schemas and documentation with consensus terms and deduplicated file path instructions. --- README.md | 162 ++++++++++++++++++++++++++++++++------------- src/meds/schema.py | 49 +++++--------- 2 files changed, 132 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 41eca97..9ab090e 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ up MEDS, we will define some key terminology that we use in this standard. a hospital admission rather than an individual. 2. 
A _code_ is the categorical descriptor of what is being observed in any given observation of a patient. In particular, in almost all structured, longitudinal datasets, a measurement can be described as - consisting of a tuple containing a `patient_id` (who this measurement is about); a `timestamp` (when this + consisting of a tuple containing a `patient_id` (who this measurement is about); a `time` (when this measurement happened); some categorical qualifier describing what was measured, which we will call a `code`; a value of a given type, such as a `numerical_value`, a `text_value`, or a `categorical_value`; and possibly one or more additional measurement properties that describe the measurement in a @@ -28,7 +28,7 @@ MEDS consists of four main data components/schemas: 1. A _data schema_. This schema describes the underlying medical data, organized as sequences of patient observations, in the dataset. 2. A _patient subsequence label schema_. This schema describes labels that may be predicted about a patient - at a given timestamp in the patient record. + at a given time in the patient record. 3. A _code metadata schema_. This schema contains metadata describing the codes used to categorize the observed measurements in the dataset. 4. A _dataset metadata schema_. This schema contains metadata about the MEDS dataset itself, such as when it @@ -43,7 +43,7 @@ found in the following subfolders: series of possibly nested sharded dataframes stored in `parquet` files. In particular, the file glob `glob("$MEDS_ROOT/data/**/*.parquet)` will capture all sharded data files of the raw MEDS data, all organized into _data schema_ files, sharded by patient and sorted, for each patient, by - timestamp. + time. - `$MEDS_ROOT/metadata/codes.parquet`: This file contains per-code metadata in the _code metadata schema_ about the MEDS dataset. As this dataset describes all codes observed in the full MEDS dataset, it is _not_ sharded. Note that some pre-processing operations may, at times, produce sharded code metadata files, but @@ -69,67 +69,135 @@ does not exist. ### Schemas -**TODO**: copy here from the schema file and describe. - +#### The Data Schema +MEDS data also must satisfy two important properties: + 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be + in one and only one parquet file. + 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. +The data schema has four mandatory fields: + 1. `patient_id`: The ID of the patient this event is about. + 2. `time`: The time of the event. This field is nullable for static events. + 3. `code`: The code of the event. + 4. `numeric_value`: The numeric value of the event. This field is nullable for non-numeric events. +In addition, it can contain any number of custom properties to further enrich observations. The python +function below generates a pyarrow schema for a given set of custom properties. +```python +def data_schema(custom_properties=[]): + return pa.schema( + [ + ("patient_id", pa.int64()), + ("time", pa.timestamp("us")), # Static events will have a null timestamp + ("code", pa.string()), + ("numeric_value", pa.float32()), + ] + custom_properties + ) +``` +#### The label schema. +Models, when predicting this label, are allowed to use all data about a patient up to and including the +prediction time. Exclusive prediction times are not currently supported, but if you have a use case for them +please add a GitHub issue. -## Old -- to be deleted. 
+```python +label = pa.schema( + [ + ("patient_id", pa.int64()), + ("prediction_time", pa.timestamp("us")), + ("boolean_value", pa.bool_()), + ("integer_value", pa.int64()), + ("float_value", pa.float64()), + ("categorical_value", pa.string()), + ] +) + +Label = TypedDict("Label", { + "patient_id": int, + "prediction_time": datetime.datetime, + "boolean_value": Optional[bool], + "integer_value" : Optional[int], + "float_value" : Optional[float], + "categorical_value" : Optional[str], +}, total=False) +``` -The core of the standard is that we define a ``patient`` data structure that contains a series of time stamped events, that in turn contain measurements of various sorts. +#### The patient split schema. -The Python type signature for the schema is as follows: +Three sentinel split names are defined for convenience and shared processing: + 1. A training split, named `train`, used for ML model training. + 2. A tuning split, named `tuning`, used for hyperparameter tuning. This is sometimes also called a + "validation" split or a "dev" split. In many cases, standardizing on a tuning split is not necessary and + models should feel free to merge this split with the training split if desired. + 3. A held-out split, named `held_out`, used for final model evaluation. In many cases, this is also called a + "test" split. When performing benchmarking, this split should not be used at all for model selection, + training, or for any purposes up to final validation. -```python +Additional split names can be used by the user as desired. -Patient = TypedDict('Patient', { - 'patient_id': int, - 'events': List[Event], -}) - -Event = TypedDict('Event',{ - 'time': NotRequired[datetime.datetime], # Static events will have a null timestamp here - 'code': str, - 'text_value': NotRequired[str], - 'numeric_value': NotRequired[float], - 'datetime_value': NotRequired[datetime.datetime], - 'metadata': NotRequired[Mapping[str, Any]], -}) +``` +train_split = "train" +tuning_split = "tuning" +held_out_split = "held_out" + +patient_split = pa.schema( + [ + ("patient_id", pa.int64()), + ("split", pa.string()), + ] +) + +PatientSplit = TypedDict("PatientSplit", { + "patient_id": int, + "split": str, +}, total=True) ``` -We also provide ETLs to convert common data formats to this schema: https://github.com/Medical-Event-Data-Standard/meds_etl - -An example patient following this schema +#### The dataset metadata schema. 
```python - -patient_data = { - "patient_id": 123, - "events": [ - # Store static events like gender with a null timestamp - { - "time": None, - "code": "Gender/F", +dataset_metadata = { + "type": "object", + "properties": { + "dataset_name": {"type": "string"}, + "dataset_version": {"type": "string"}, + "etl_name": {"type": "string"}, + "etl_version": {"type": "string"}, + "meds_version": {"type": "string"}, }, +} - # It's recommended to record birth using the birth_code - { - "time": datetime.datetime(1995, 8, 20), - "code": meds.birth_code, - }, +# Python type for the above schema - # Arbitrary events with sophisticated data can also be added +DatasetMetadata = TypedDict( + "DatasetMetadata", { - "time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "code": "some_code", - "text_value": "Example", - "numeric_value": 10.0, - "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0), - "properties": None + "dataset_name": NotRequired[str], + "dataset_version": NotRequired[str], + "etl_name": NotRequired[str], + "etl_version": NotRequired[str], + "meds_version": NotRequired[str], }, - ] -} + total=False, +) +``` + +#### The code metadata schema. + +```python +def code_metadata_schema(custom_per_code_properties=[]): + code_metadata = pa.schema( + [ + ("code", pa.string()), + ("description", pa.string()), + ("parent_codes", pa.list(pa.string()), + ] + custom_per_code_properties + ) + + return code_metadata + +# Python type for the above schema +CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False) ``` diff --git a/src/meds/schema.py b/src/meds/schema.py index fff43c7..4a91f0d 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -1,38 +1,23 @@ +"""The core schemas for the MEDS format. + +Please see the README for more information, including expected file organization on disk, more details on what +each schema should capture, etc. +""" import datetime from typing import Any, List, Mapping, Optional import pyarrow as pa from typing_extensions import NotRequired, TypedDict -# Medical Event Data Standard consists of four main components: -# 1. A patient event schema -# 2. A label schema -# 3. A dataset metadata schema. -# 4. A code metadata schema. -# -# Event data, labels, and code metadata is specified using pyarrow. Dataset metadata is specified using JSON. - -# We also specify a directory structure for how these should be laid out on disk. - -# Every MEDS extract consists of a folder that contains both metadata and patient data with the following structure: -# - data/ -# A (possibly nested) folder containing multiple parquet files containing patient event data following the events_schema folder. -# glob("data/**/*.parquet") is the recommended way for obtaining all patient event files. -# - dataset_metadata.json -# Dataset level metadata containing information about the ETL used, data version, etc -# - (Optional) code_metadata.parquet -# Code level metadata containing information about the code descriptions, standard mappings, etc -# - (Optional) patient_split.csv -# A specification of patient splits that should be used. ############################################################ -# The patient event data schema. +# The data schema. # -# Patient event data also must satisfy two important properties: +# MEDS data also must satisfy two important properties: # -# 1. Patient event data cannot be split across parquet files. If a patient is in a dataset it must be in one and only one parquet file. -# 2. 
Patient event data must be contiguous within a particular parquet file and sorted by event time. +# 1. Data about a single patient cannot be split across parquet files. If a patient is in a dataset it must be in one and only one parquet file. +# 2. Data about a single patient must be contiguous within a particular parquet file and sorted by time. # Both of these restrictions allow the stream rolling processing (see https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html), # which vastly simplifies many data analysis pipelines. @@ -41,14 +26,14 @@ birth_code = "MEDS_BIRTH" death_code = "MEDS_DEATH" -def patient_events_schema(custom_per_event_properties=[]): +def data_schema(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), ("time", pa.timestamp("us")), # Static events will have a null timestamp ("code", pa.string()), ("numeric_value", pa.float32()), - ] + custom_per_event_properties + ] + custom_properties ) # No python type is provided because Python tools for processing MEDS data will often provide their own types. @@ -56,7 +41,9 @@ def patient_events_schema(custom_per_event_properties=[]): ############################################################ -# The label schema. +# The label schema. Models, when predicting this label, are allowed to use all data about a patient up to and +# including the prediction time. Exclusive prediction times are not currently supported, but if you have a use +# case for them please add a GitHub issue. label = pa.schema( [ @@ -85,9 +72,9 @@ def patient_events_schema(custom_per_event_properties=[]): # The patient split schema. -train_split = "train" -tuning_split = "tuning" -test_split = "test" +train_split = "train" # For ML training. +tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". +held_out_split = "held_out" # For final ML evaluation. Also often called "test". patient_split = pa.schema( [ @@ -105,7 +92,6 @@ def patient_events_schema(custom_per_event_properties=[]): # The dataset metadata schema. # This is a JSON schema. -# This data should be stored in dataset_metadata.json within the dataset folder. dataset_metadata = { @@ -137,7 +123,6 @@ def patient_events_schema(custom_per_event_properties=[]): # The code metadata schema. # This is a parquet schema. -# This data should be stored in code_metadata.parquet within the dataset folder. def code_metadata_schema(custom_per_code_properties=[]): code_metadata = pa.schema( From e10c22d2cab9213aefe92ae79e454e66e8db261b Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:28:10 -0400 Subject: [PATCH 14/20] Removed unneeded python object format. --- README.md | 5 ----- src/meds/schema.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/README.md b/README.md index 9ab090e..d0c3c82 100644 --- a/README.md +++ b/README.md @@ -147,11 +147,6 @@ patient_split = pa.schema( ("split", pa.string()), ] ) - -PatientSplit = TypedDict("PatientSplit", { - "patient_id": int, - "split": str, -}, total=True) ``` #### The dataset metadata schema. diff --git a/src/meds/schema.py b/src/meds/schema.py index 4a91f0d..919b7e6 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -83,11 +83,6 @@ def data_schema(custom_properties=[]): ] ) -PatientSplit = TypedDict("PatientSplit", { - "patient_id": int, - "split": str, -}, total=True) - ############################################################ # The dataset metadata schema. 
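To illustrate the patient split schema as it stands at this point in the series, the sketch below materializes a split assignment table and writes it to Parquet. It is illustrative only: the patient IDs and output path are invented, and the names refer to the definitions in `src/meds/schema.py` above.

```python
import pyarrow as pa
import pyarrow.parquet as pq

from meds.schema import held_out_split, patient_split, train_split, tuning_split

# Hypothetical split assignment using the sentinel split names.
splits = pa.Table.from_pylist(
    [
        {"patient_id": 1, "split": train_split},
        {"patient_id": 2, "split": tuning_split},
        {"patient_id": 3, "split": held_out_split},
    ],
    schema=patient_split,
)
pq.write_table(splits, "patient_splits.parquet")
```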
From dd4ca79c7ba168adc1f9c0e2948db9e203986adb Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:41:16 -0400 Subject: [PATCH 15/20] Adding missing close paren --- src/meds/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 1fd82de..0c1e73d 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -131,7 +131,7 @@ def code_metadata_schema(custom_per_code_properties=[]): [ ("code", pa.string()), ("description", pa.string()), - ("parent_codes", pa.list(pa.string()), + ("parent_codes", pa.list(pa.string())), ] + custom_per_code_properties ) From 986b2964c613da4be461fae573a4b75a779c6e12 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 14:59:05 -0400 Subject: [PATCH 16/20] Fixed another test error to do with imports. --- src/meds/__init__.py | 20 ++++++++++---------- src/meds/schema.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/meds/__init__.py b/src/meds/__init__.py index 2c76ef6..eb6ebd6 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -1,26 +1,26 @@ from meds._version import __version__ # noqa -from .schema import (patient_schema, Event, Patient, label, Label, - code_metadata_entry, code_metadata, dataset_metadata, - CodeMetadataEntry, CodeMetadata, DatasetMetadata, birth_code, - death_code) +from .schema import ( + data_schema, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata, + dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code +) # List all objects that we want to export _exported_objects = { - 'patient_schema': patient_schema, - 'Event': Event, - 'Patient': Patient, + 'data_schema': data_schema, 'label': label, 'Label': Label, - 'code_metadata_entry': code_metadata_entry, + 'train_split': train_split, + 'tuning_split': tuning_split, + 'held_out_split': held_out_split, + 'patient_split': patient_split, 'code_metadata': code_metadata, 'dataset_metadata': dataset_metadata, - 'CodeMetadataEntry': CodeMetadataEntry, 'CodeMetadata': CodeMetadata, 'DatasetMetadata': DatasetMetadata, 'birth_code': birth_code, - 'death_code': death_code + 'death_code': death_code, } __all__ = list(_exported_objects.keys()) diff --git a/src/meds/schema.py b/src/meds/schema.py index 0c1e73d..03b4d73 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -49,7 +49,7 @@ def data_schema(custom_properties=[]): [ ("patient_id", pa.int64()), # The patient who is being labeled. - + ("prediction_time", pa.timestamp("us")), # The time the prediction is made. # Machine learning models are allowed to use features that have timestamps less than or equal From 34465fe6a7112919fb2ef3d22745e68a2f85b007 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:01:07 -0400 Subject: [PATCH 17/20] Standardized schema naming convention and fixed another typo. --- README.md | 8 +++----- src/meds/__init__.py | 4 ++-- src/meds/schema.py | 8 +++----- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d0c3c82..ebd5e9c 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ In addition, it can contain any number of custom properties to further enrich ob function below generates a pyarrow schema for a given set of custom properties. ```python -def data_schema(custom_properties=[]): +def data(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -181,8 +181,8 @@ DatasetMetadata = TypedDict( #### The code metadata schema. 

```python
-def code_metadata_schema(custom_per_code_properties=[]):
-    code_metadata = pa.schema(
+def code_metadata(custom_per_code_properties=[]):
+    return pa.schema(
         [
             ("code", pa.string()),
             ("description", pa.string()),
             ("parent_codes", pa.list_(pa.string())),
         ] + custom_per_code_properties
     )
 
-    return code_metadata
-
 # Python type for the above schema
 
 CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)
 ```
diff --git a/src/meds/__init__.py b/src/meds/__init__.py
index eb6ebd6..8853647 100644
--- a/src/meds/__init__.py
+++ b/src/meds/__init__.py
@@ -1,14 +1,14 @@
 from meds._version import __version__  # noqa
 
 from .schema import (
-    data_schema, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata,
+    data, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata,
     dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code
 )
 
 # List all objects that we want to export
 _exported_objects = {
-    'data_schema': data_schema,
+    'data': data,
     'label': label,
     'Label': Label,

diff --git a/src/meds/schema.py b/src/meds/schema.py
index 03b4d73..3756283 100644
--- a/src/meds/schema.py
+++ b/src/meds/schema.py
@@ -26,7 +26,7 @@ birth_code = "MEDS_BIRTH"
 death_code = "MEDS_DEATH"
 
-def data_schema(custom_properties=[]):
+def data(custom_properties=[]):
     return pa.schema(
         [
             ("patient_id", pa.int64()),
@@ -126,8 +126,8 @@ def data_schema(custom_properties=[]):
 # The code metadata schema.
 # This is a parquet schema.
 
-def code_metadata_schema(custom_per_code_properties=[]):
-    code_metadata = pa.schema(
+def code_metadata(custom_per_code_properties=[]):
+    return pa.schema(
         [
             ("code", pa.string()),
             ("description", pa.string()),
             ("parent_codes", pa.list(pa.string())),
         ] + custom_per_code_properties
     )
 
-    return code_metadata
-
 # Python type for the above schema
 
 CodeMetadata = TypedDict("CodeMetadata", {"code": str, "description": str, "parent_codes": List[str]}, total=False)

From 94d4ce9776f3a5bd5cb94f2776ff319c389db3fc Mon Sep 17 00:00:00 2001
From: Matthew McDermott
Date: Tue, 30 Jul 2024 15:09:29 -0400
Subject: [PATCH 18/20] Maybe fixed tests

---
 tests/test_schema.py | 68 ++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 21 deletions(-)

diff --git a/tests/test_schema.py b/tests/test_schema.py
index ccdf31b..27fc192 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -4,30 +4,62 @@
 import pyarrow as pa
 import pytest
 
-from meds import patient_schema, label, dataset_metadata
+from meds import (
+    data, label, dataset_metadata, patient_split, code_metadata, train_split, tuning_split, held_out_split
+)
 
-
-def test_patient_schema():
+def test_data_schema():
     """
-    Test that mock patient data follows the patient_schema schema.
+    Test that mock data follows the data schema.
""" # Each element in the list is a row in the table - patient_data = [ + data = [ { "patient_id": 123, - "events": [{ # Nested list for events - "time": datetime.datetime(2020, 1, 1, 12, 0, 0), - "code": "some_code", - "text_value": "Example", - "numeric_value": 10.0, - "datetime_value": datetime.datetime(2020, 1, 1, 12, 0, 0), - "properties": None - }] + "time": datetime.datetime(2020, 1, 1, 12, 0, 0), + "code": "some_code", + "text_value": "Example", + "numeric_value": 10.0, } ] - patient_table = pa.Table.from_pylist(patient_data, schema=patient_schema()) - assert patient_table.schema.equals(patient_schema()), "Patient schema does not match" + schema = data([("text_value", pa.string())]) + + table = pa.Table.from_pylist(data, schema=schema) + assert table.schema.equals(schema), "Patient schema does not match" + +def test_code_metadata_schema(): + """ + Test that mock code metadata follows the schema. + """ + # Each element in the list is a row in the table + data = [ + { + "code": "some_code", + "description": "foo", + "parent_code": ["parent_code"], + } + ] + + schema = code_metadata() + + table = pa.Table.from_pylist(data, schema=schema) + assert table.schema.equals(schema), "Code metadata schema does not match" + +def test_patient_split_schema(): + """ + Test that mock data follows the data schema. + """ + # Each element in the list is a row in the table + data = [ + {"patient_id": 123, "split": train_split}, + {"patient_id": 123, "split": tuning_split}, + {"patient_id": 123, "split": held_out_split}, + {"patient_id": 123, "split": "special"}, + ] + + table = pa.Table.from_pylist(data, schema=patient_split) + assert table.schema.equals(patient_split), "Patient split schema does not match" def test_label_schema(): """ @@ -83,12 +115,6 @@ def test_dataset_metadata_schema(): "dataset_version": "1.0", "etl_name": "Test ETL", "etl_version": "1.0", - "code_metadata": { - "test_code": { - "description": "A test code", - "standard_ontology_codes": ["12345"], - } - }, } jsonschema.validate(instance=metadata, schema=dataset_metadata) From d962dac5b950470076b66db49c5dfda37e3151fd Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:13:07 -0400 Subject: [PATCH 19/20] Fixed a typo in the code metadata schema and the tests --- src/meds/schema.py | 2 +- tests/test_schema.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/meds/schema.py b/src/meds/schema.py index 3756283..a7b67b4 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -131,7 +131,7 @@ def code_metadata(custom_per_code_properties=[]): [ ("code", pa.string()), ("description", pa.string()), - ("parent_codes", pa.list(pa.string())), + ("parent_codes", pa.list_(pa.string())), ] + custom_per_code_properties ) diff --git a/tests/test_schema.py b/tests/test_schema.py index 27fc192..168c4e9 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -13,7 +13,7 @@ def test_data_schema(): Test that mock data follows the data schema. """ # Each element in the list is a row in the table - data = [ + raw_data = [ { "patient_id": 123, "time": datetime.datetime(2020, 1, 1, 12, 0, 0), @@ -25,7 +25,7 @@ def test_data_schema(): schema = data([("text_value", pa.string())]) - table = pa.Table.from_pylist(data, schema=schema) + table = pa.Table.from_pylist(raw_data, schema=schema) assert table.schema.equals(schema), "Patient schema does not match" def test_code_metadata_schema(): @@ -33,7 +33,7 @@ def test_code_metadata_schema(): Test that mock code metadata follows the schema. 
""" # Each element in the list is a row in the table - data = [ + code_metadata = [ { "code": "some_code", "description": "foo", @@ -43,7 +43,7 @@ def test_code_metadata_schema(): schema = code_metadata() - table = pa.Table.from_pylist(data, schema=schema) + table = pa.Table.from_pylist(code_metadata, schema=schema) assert table.schema.equals(schema), "Code metadata schema does not match" def test_patient_split_schema(): @@ -51,14 +51,14 @@ def test_patient_split_schema(): Test that mock data follows the data schema. """ # Each element in the list is a row in the table - data = [ + patient_split_data = [ {"patient_id": 123, "split": train_split}, {"patient_id": 123, "split": tuning_split}, {"patient_id": 123, "split": held_out_split}, {"patient_id": 123, "split": "special"}, ] - table = pa.Table.from_pylist(data, schema=patient_split) + table = pa.Table.from_pylist(patient_split_data, schema=patient_split) assert table.schema.equals(patient_split), "Patient split schema does not match" def test_label_schema(): From ae6d26992620b13e0f50f6adf86929b61753ddf4 Mon Sep 17 00:00:00 2001 From: Matthew McDermott Date: Tue, 30 Jul 2024 15:19:17 -0400 Subject: [PATCH 20/20] Adjust naming convention to minimize import and variable name conflicts. --- src/meds/__init__.py | 14 +++++++------- src/meds/schema.py | 10 +++++----- tests/test_schema.py | 29 +++++++++++++++-------------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/meds/__init__.py b/src/meds/__init__.py index 8853647..3d8d36c 100644 --- a/src/meds/__init__.py +++ b/src/meds/__init__.py @@ -1,22 +1,22 @@ from meds._version import __version__ # noqa from .schema import ( - data, label, Label, train_split, tuning_split, held_out_split, patient_split, code_metadata, - dataset_metadata, CodeMetadata, DatasetMetadata, birth_code, death_code + data_schema, label_schema, Label, train_split, tuning_split, held_out_split, patient_split_schema, + code_metadata_schema, dataset_metadata_schema, CodeMetadata, DatasetMetadata, birth_code, death_code ) # List all objects that we want to export _exported_objects = { - 'data': data, - 'label': label, + 'data_schema': data_schema, + 'label_schema': label_schema, 'Label': Label, 'train_split': train_split, 'tuning_split': tuning_split, 'held_out_split': held_out_split, - 'patient_split': patient_split, - 'code_metadata': code_metadata, - 'dataset_metadata': dataset_metadata, + 'patient_split_schema': patient_split_schema, + 'code_metadata_schema': code_metadata_schema, + 'dataset_metadata_schema': dataset_metadata_schema, 'CodeMetadata': CodeMetadata, 'DatasetMetadata': DatasetMetadata, 'birth_code': birth_code, diff --git a/src/meds/schema.py b/src/meds/schema.py index a7b67b4..5b263f4 100644 --- a/src/meds/schema.py +++ b/src/meds/schema.py @@ -26,7 +26,7 @@ birth_code = "MEDS_BIRTH" death_code = "MEDS_DEATH" -def data(custom_properties=[]): +def data_schema(custom_properties=[]): return pa.schema( [ ("patient_id", pa.int64()), @@ -45,7 +45,7 @@ def data(custom_properties=[]): # including the prediction time. Exclusive prediction times are not currently supported, but if you have a use # case for them please add a GitHub issue. -label = pa.schema( +label_schema = pa.schema( [ ("patient_id", pa.int64()), # The patient who is being labeled. @@ -83,7 +83,7 @@ def data(custom_properties=[]): tuning_split = "tuning" # For ML hyperparameter tuning. Also often called "validation" or "dev". held_out_split = "held_out" # For final ML evaluation. Also often called "test". 
-patient_split = pa.schema( +patient_split_schema = pa.schema( [ ("patient_id", pa.int64()), ("split", pa.string()), @@ -96,7 +96,7 @@ def data(custom_properties=[]): # This is a JSON schema. -dataset_metadata = { +dataset_metadata_schema = { "type": "object", "properties": { "dataset_name": {"type": "string"}, @@ -126,7 +126,7 @@ def data(custom_properties=[]): # The code metadata schema. # This is a parquet schema. -def code_metadata(custom_per_code_properties=[]): +def code_metadata_schema(custom_per_code_properties=[]): return pa.schema( [ ("code", pa.string()), diff --git a/tests/test_schema.py b/tests/test_schema.py index 168c4e9..b945909 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -5,7 +5,8 @@ import pytest from meds import ( - data, label, dataset_metadata, patient_split, code_metadata, train_split, tuning_split, held_out_split + data_schema, label_schema, dataset_metadata_schema, patient_split_schema, code_metadata_schema, + train_split, tuning_split, held_out_split ) def test_data_schema(): @@ -23,7 +24,7 @@ def test_data_schema(): } ] - schema = data([("text_value", pa.string())]) + schema = data_schema([("text_value", pa.string())]) table = pa.Table.from_pylist(raw_data, schema=schema) assert table.schema.equals(schema), "Patient schema does not match" @@ -41,7 +42,7 @@ def test_code_metadata_schema(): } ] - schema = code_metadata() + schema = code_metadata_schema() table = pa.Table.from_pylist(code_metadata, schema=schema) assert table.schema.equals(schema), "Code metadata schema does not match" @@ -58,8 +59,8 @@ def test_patient_split_schema(): {"patient_id": 123, "split": "special"}, ] - table = pa.Table.from_pylist(patient_split_data, schema=patient_split) - assert table.schema.equals(patient_split), "Patient split schema does not match" + table = pa.Table.from_pylist(patient_split_data, schema=patient_split_schema) + assert table.schema.equals(patient_split_schema), "Patient split schema does not match" def test_label_schema(): """ @@ -73,8 +74,8 @@ def test_label_schema(): "boolean_value": True } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -83,8 +84,8 @@ def test_label_schema(): "integer_value": 4 } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -93,8 +94,8 @@ def test_label_schema(): "float_value": 0.4 } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" label_data = [ { @@ -103,8 +104,8 @@ def test_label_schema(): "categorical_value": "text" } ] - label_table = pa.Table.from_pylist(label_data, schema=label) - assert label_table.schema.equals(label), "Label schema does not match" + label_table = pa.Table.from_pylist(label_data, schema=label_schema) + assert label_table.schema.equals(label_schema), "Label schema does not match" def test_dataset_metadata_schema(): """ @@ -117,5 +118,5 @@ def 
test_dataset_metadata_schema(): "etl_version": "1.0", } - jsonschema.validate(instance=metadata, schema=dataset_metadata) + jsonschema.validate(instance=metadata, schema=dataset_metadata_schema) assert True, "Dataset metadata schema validation failed"
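Taken together, the series leaves the package exporting `*_schema` names throughout. The sketch below exercises the final API end to end; it is illustrative only (the codes, IDs, and file paths are invented, and the target directories are assumed to exist).

```python
import datetime

import pyarrow as pa
import pyarrow.parquet as pq

from meds import birth_code, data_schema, label_schema

# Write one data shard against the final data schema.
events = pa.Table.from_pylist(
    [
        {"patient_id": 1, "time": datetime.datetime(1990, 5, 1), "code": birth_code, "numeric_value": None},
        {
            "patient_id": 1,
            "time": datetime.datetime(2020, 1, 1, 9, 30),
            "code": "LOINC/8867-4",  # invented example code
            "numeric_value": 88.0,
        },
    ],
    schema=data_schema(),
)
pq.write_table(events, "data/shard_0.parquet")  # assumes data/ exists

# Write one label shard against the final label schema; unset value columns are null.
labels = pa.Table.from_pylist(
    [{"patient_id": 1, "prediction_time": datetime.datetime(2020, 1, 2), "boolean_value": True}],
    schema=label_schema,
)
pq.write_table(labels, "tasks/mortality/shard_0.parquet")  # assumes tasks/mortality/ exists
```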