
Commit

split data model file up into one file per class
frehburg committed Sep 24, 2024
1 parent 2af6dea commit c8164ca
Showing 11 changed files with 454 additions and 524 deletions.
7 changes: 3 additions & 4 deletions src/phenopacket_mapper/data_standards/__init__.py
@@ -3,14 +3,13 @@
 from .date import Date
 from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC
 from .code import Coding, CodeableConcept
-from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet
-from . import data_models
+from phenopacket_mapper.data_standards.data_model import (DataModel, DataField, DataModelInstance, DataFieldValue,
+                                                           DataSet)
 from .value_set import ValueSet
 
 __all__ = [
     "Coding", "CodeableConcept",
-    "DataModel", "DataField", "DataModelInstance", "DataFieldValue",
-    "data_models",
+    "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet",
     "CodeSystem",
     "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC",
     "Date",
418 changes: 0 additions & 418 deletions src/phenopacket_mapper/data_standards/data_model.py

This file was deleted.

23 changes: 23 additions & 0 deletions src/phenopacket_mapper/data_standards/data_model/__init__.py
@@ -0,0 +1,23 @@
"""
This module defines the `DataModel` class, which is used to define a data model for medical data. A `DataModel` is a
collection of `DataField` objects, which define the fields of the data model. Each `DataField` has a name, a value set,
a description, a section, a required flag, a specification, and an ordinal. The `DataModel` class also has a list of
`CodeSystem` objects, which are used as resources in the data model.
The `DataFieldValue` class is used to define the value of a `DataField` in a `DataModelInstance`. The
`DataModelInstance` class is used to define an instance of a `DataModel`, i.e. a record in a dataset.
"""

from .data_field import DataField
from .data_field_value import DataFieldValue
from .data_model import DataModel
from .data_model_instance import DataModelInstance
from .data_set import DataSet

__all__ = [
"DataField",
"DataFieldValue",
"DataModel",
"DataModelInstance",
"DataSet",
]
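
To illustrate the re-exports defined above, a minimal sketch (not part of the commit; it only uses imports that the diff itself introduces):

```python
# Both import paths resolve to the same classes after this commit: the
# data_standards package re-exports everything from the new data_model package.
from phenopacket_mapper.data_standards import DataModel, DataField, DataSet
from phenopacket_mapper.data_standards.data_model import DataModelInstance, DataFieldValue
```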
66 changes: 66 additions & 0 deletions src/phenopacket_mapper/data_standards/data_model/data_field.py
@@ -0,0 +1,66 @@
from dataclasses import dataclass, field
from typing import Union

from phenopacket_mapper.data_standards.value_set import ValueSet


@dataclass(slots=True, frozen=True)
class DataField:
"""This class defines fields used in the definition of a `DataModel`
    A data field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a
required flag, a specification, and an ordinal.
The string for the `id` field is generated from the `name` field using the `str_to_valid_id` function from the
`phenopacket_mapper.utils` module. This attempts to convert the `name` field. Sometimes this might not work as
desired, in which case the `id` field can be set manually.
Naming rules for the `id` field:
- The `id` field must be a valid Python identifier
- The `id` field must start with a letter or the underscore character
    - The `id` field cannot start with a number
- The `id` field can only contain lowercase alpha-numeric characters and underscores (a-z, 0-9, and _ )
- The `id` field cannot be any of the Python keywords (e.g. `in`, `is`, `not`, `class`, etc.).
- The `id` field must be unique within a `DataModel`
If the `value_set` is a single type, it can be passed directly as the `value_set` parameter.
e.g.:
>>> DataField(name="Field 1", value_set=int)
DataField(name='Field 1', value_set=ValueSet(elements=[<class 'int'>], name='', description=''), id='field_1', description='', section='', required=True, specification='', ordinal='')
:ivar name: Name of the field
:ivar value_set: Value set of the field, if the value set is only one type, can also pass that type directly
:ivar id: Id of the field, adhering to the naming rules stated above
:ivar description: Description of the field
:ivar section: Section of the field (Only applicable if the data model is divided into sections)
:ivar required: Required flag of the field
:ivar specification: Text specification of the field (a description of the value set and field)
:ivar ordinal: Ordinal of the field (E.g. 1.1, 1.2, 2.1, etc.)
"""
name: str = field()
value_set: Union[ValueSet, type] = field()
id: str = field(default=None)
description: str = field(default='')
section: str = field(default='')
required: bool = field(default=True)
specification: str = field(default='')
ordinal: str = field(default='')

def __post_init__(self):
if not self.id:
from phenopacket_mapper.utils import str_to_valid_id
object.__setattr__(self, 'id', str_to_valid_id(self.name))

if isinstance(self.value_set, type):
object.__setattr__(self, 'value_set', ValueSet(elements=[self.value_set]))

def __str__(self):
ret = "DataField(\n"
ret += f"\t\tid: {self.id},\n"
ret += f"\t\tsection: {self.section},\n"
ret += f"\t\tordinal, name: ({self.ordinal}, {self.name}),\n"
ret += f"\t\tvalue_set: {self.value_set}, required: {self.required},\n"
ret += f"\t\tspecification: {self.specification}\n"
ret += "\t)"
return ret
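
As a quick sketch of the id rules documented in the docstring above (illustrative only; the explicit `date_of_birth` override is a hypothetical example, not code from this commit):

```python
from phenopacket_mapper.data_standards import DataField

# A bare type is wrapped into a ValueSet by __post_init__, and the id is
# derived from the name via str_to_valid_id ("Field 1" -> "field_1", as in
# the doctest above).
f = DataField(name="Field 1", value_set=int)
assert f.id == "field_1"

# When the automatic conversion is not what you want, set the id explicitly
# (hypothetical field, shown only to illustrate the override).
dob = DataField(name="Date of Birth (YYYY-MM-DD)", value_set=str, id="date_of_birth")
```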
54 changes: 54 additions & 0 deletions src/phenopacket_mapper/data_standards/data_model/data_field_value.py
@@ -0,0 +1,54 @@
from dataclasses import dataclass
from typing import Union, Any
import warnings

from . import DataField
from phenopacket_mapper.data_standards import CodeSystem
from phenopacket_mapper.data_standards.date import Date


@dataclass(slots=True, frozen=True)
class DataFieldValue:
"""This class defines the value of a `DataField` in a `DataModelInstance`
Equivalent to a cell value in a table.
:ivar row_no: The id of the value, i.e. the row number
:ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field.
:ivar value: The value of the field.
"""
row_no: Union[str, int]
field: DataField
value: Union[int, float, str, bool, Date, CodeSystem]

def validate(self) -> bool:
"""Validates the data model instance based on data model definition
This method checks if the instance is valid based on the data model definition. It checks if all required fields
are present, if the values are in the value set, etc.
:return: True if the instance is valid, False otherwise
"""
if self.field.required and self.value is None: # no value
warnings.warn(f"Field {self.field.name} is required but has no value")
return False
elif self.value is not None and self.field.value_set:
if Any in self.field.value_set: # value set allows any
return True
elif self.value in self.field.value_set: # raw value (likely a primitive) is in the value set
return True
else: # check if the value matches one of the types in the value set
for e in self.field.value_set:
if isinstance(e, type):
cur_type = e
if cur_type is type(self.value):
return True
elif isinstance(e, CodeSystem):
cs = e
from phenopacket_mapper.data_standards import Coding
if isinstance(self.value, Coding) and self.value.system == cs:
return True

warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field "
f"{self.field.name} (row {self.row_no})")
return False
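
A minimal usage sketch of `validate` (not part of the commit; it assumes `ValueSet` supports iteration and membership tests, as the method above already requires):

```python
from phenopacket_mapper.data_standards import DataField, DataFieldValue

# A field whose value set is simply the int type (wrapped automatically).
age_field = DataField(name="Age", value_set=int)

ok = DataFieldValue(row_no=0, field=age_field, value=42)
bad = DataFieldValue(row_no=1, field=age_field, value="forty-two")

print(ok.validate())   # True  -- 42 matches the int type in the value set
print(bad.validate())  # False -- warns that the value is not in the value set
```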
183 changes: 183 additions & 0 deletions src/phenopacket_mapper/data_standards/data_model/data_model.py
@@ -0,0 +1,183 @@
from dataclasses import dataclass, field
from pathlib import Path
from types import MappingProxyType
from typing import Union, List, Literal, Dict, Optional

from phenopacket_mapper.data_standards import CodeSystem
from . import DataSet, DataField
from phenopacket_mapper.data_standards.value_set import ValueSet


@dataclass(slots=True, frozen=True)
class DataModel:
"""This class defines a data model for medical data using `DataField`
    A data model can be used to import data and map it to the Phenopacket schema. It is made up of a list of `DataField` objects.
Given that all `DataField` objects in a `DataModel` have unique names, the `id` field is generated from the `name`.
E.g.: `DataField(name='Date of Birth', ...)` will have an `id` of `'date_of_birth'`. The `DataField` objects can
be accessed using the `id` as an attribute of the `DataModel` object. E.g.: `data_model.date_of_birth`. This is
useful in the data reading and mapping processes.
>>> data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
>>> data_model.field_1
DataField(name='Field 1', value_set=ValueSet(elements=[], name='', description=''), id='field_1', description='', section='', required=True, specification='', ordinal='')
:ivar data_model_name: Name of the data model
:ivar fields: List of `DataField` objects
:ivar resources: List of `CodeSystem` objects
"""
data_model_name: str = field()
fields: List[DataField] = field()
resources: List[CodeSystem] = field(default_factory=list)

def __post_init__(self):
if len(self.fields) != len(set([f.id for f in self.fields])):
raise ValueError("All fields in a DataModel must have unique identifiers")

def __getattr__(self, var_name: str) -> DataField:
for f in self.fields:
if f.id == var_name:
return f
raise AttributeError(f"'DataModel' object has no attribute '{var_name}'")

def __str__(self):
ret = f"DataModel(name={self.data_model_name}\n"
for field in self.fields:
ret += f"\t{str(field)}\n"
ret += "---\n"
for res in self.resources:
ret += f"\t{str(res)}\n"
ret += ")"
return ret

def __iter__(self):
return iter(self.fields)

def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]:
"""Returns a DataField object by its id
:param field_id: The id of the field
:param default: The default value to return if the field is not found
:return: The DataField object
"""
for f in self.fields:
if f.id == field_id:
return f
if default or default is None:
return default
raise ValueError(f"Field with id {field_id} not found in DataModel")

def get_field_ids(self) -> List[str]:
"""Returns a list of the ids of the DataFields in the DataModel"""
return [f.id for f in self.fields]

def load_data(
self,
path: Union[str, Path],
compliance: Literal['soft', 'hard'] = 'soft',
**kwargs
) -> DataSet:
"""Loads data from a file using a DataModel definition
To call this method, pass the column name for each field in the DataModel as a keyword argument. This is done
by passing the field id followed by '_column'. E.g. if the DataModel has a field with id 'date_of_birth', the
column name in the file should be passed as 'date_of_birth_column'. The method will raise an error if any of
the fields are missing.
E.g.:
```python
data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
data_model.load_data("data.csv", field_1_column="column_name_in_file")
```
:param path: Path to the file containing the data
:param compliance: Compliance level to use when loading the data.
:param kwargs: Dynamically passed parameters that match {id}_column for each item
:return: A list of `DataModelInstance` objects
"""
column_names = dict()
for f in self.fields:
column_param = f"{f.id}_column"
if column_param not in kwargs:
raise TypeError(f"load_data() missing 1 required argument: '{column_param}'")
else:
column_names[f.id] = kwargs[column_param]

from phenopacket_mapper.pipeline import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=self,
column_names=column_names,
compliance=compliance
)

@staticmethod
def from_file(
data_model_name: str,
resources: List[CodeSystem],
path: Union[str, Path],
file_type: Literal['csv', 'excel', 'unknown'] = 'unknown',
column_names: Dict[str, str] = MappingProxyType({
DataField.name.__name__: 'data_field_name',
DataField.section.__name__: 'data_model_section',
DataField.description.__name__: 'description',
DataField.value_set.__name__: 'value_set',
DataField.required.__name__: 'required',
DataField.specification.__name__: 'specification',
DataField.ordinal.__name__: 'ordinal'
}),
parse_value_sets: bool = False,
remove_line_breaks: bool = False,
parse_ordinals: bool = True,
) -> 'DataModel':
"""Reads a Data Model from a file
:param data_model_name: Name to be given to the `DataModel` object
:param resources: List of `CodeSystem` objects to be used as resources in the `DataModel`
:param path: Path to Data Model file
:param file_type: Type of file to read, either 'csv' or 'excel'
:param column_names: A dictionary mapping from each field of the `DataField` (key) class to a column of the file
(value). Leaving a value empty (`''`) will leave the field in the `DataModel` definition empty.
:param parse_value_sets: If True, parses the string to a ValueSet object, can later be used to check
validity of the data. Optional, but highly recommended.
:param remove_line_breaks: Whether to remove line breaks from string values
:param parse_ordinals: Whether to extract the ordinal number from the field name. Warning: this can overwrite values
Ordinals could look like: "1.1.", "1.", "I.a.", or "ii.", etc.
"""
from phenopacket_mapper.pipeline import read_data_model
return read_data_model(
data_model_name,
resources,
path,
file_type,
column_names,
parse_value_sets,
remove_line_breaks,
parse_ordinals
)

@staticmethod
def load_data_using_data_model(
path: Union[str, Path],
data_model: 'DataModel',
column_names: Dict[str, str],
compliance: Literal['soft', 'hard'] = 'soft',
) -> DataSet:
"""Loads data from a file using a DataModel definition
:param path: Path to formatted csv or excel file
:param data_model: DataModel to use for reading the file
:param column_names: A dictionary mapping from the id of each field of the `DataField` to the name of a
column in the file
:param compliance: Compliance level to enforce when reading the file. If 'soft', the file can have extra fields
that are not in the DataModel. If 'hard', the file must have all fields in the DataModel.
:return: List of DataModelInstances
"""
from phenopacket_mapper.pipeline import load_data_using_data_model
return load_data_using_data_model(
path=path,
data_model=data_model,
column_names=column_names,
compliance=compliance
)
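
Putting the pieces together, a sketch of the attribute access and the `{id}_column` keyword convention described in the docstrings above (not part of the commit; the file and column names are placeholders):

```python
from phenopacket_mapper.data_standards import DataField, DataModel
from phenopacket_mapper.data_standards.value_set import ValueSet

model = DataModel(
    data_model_name="Example model",
    fields=[
        DataField(name="Field 1", value_set=ValueSet()),
        DataField(name="Field 2", value_set=ValueSet()),
    ],
)

# Fields are reachable as attributes via their generated ids (__getattr__).
print(model.field_1.name)      # 'Field 1'
print(model.get_field_ids())   # ['field_1', 'field_2']

# load_data expects one '<field id>_column' keyword per field; the file and
# column names below are placeholders.
data_set = model.load_data(
    "data.csv",
    field_1_column="col_a",
    field_2_column="col_b",
)
```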