generated from frehburg/TemplateForPythonProjects
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
split data model file up into one file per class
- Loading branch information
Showing
11 changed files
with
454 additions
and
524 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
23 changes: 23 additions & 0 deletions
23
src/phenopacket_mapper/data_standards/data_model/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
""" | ||
This module defines the `DataModel` class, which is used to define a data model for medical data. A `DataModel` is a | ||
collection of `DataField` objects, which define the fields of the data model. Each `DataField` has a name, a value set, | ||
a description, a section, a required flag, a specification, and an ordinal. The `DataModel` class also has a list of | ||
`CodeSystem` objects, which are used as resources in the data model. | ||
The `DataFieldValue` class is used to define the value of a `DataField` in a `DataModelInstance`. The | ||
`DataModelInstance` class is used to define an instance of a `DataModel`, i.e. a record in a dataset. | ||
""" | ||
|
||
from .data_field import DataField | ||
from .data_field_value import DataFieldValue | ||
from .data_model import DataModel | ||
from .data_model_instance import DataModelInstance | ||
from .data_set import DataSet | ||
|
||
__all__ = [ | ||
"DataField", | ||
"DataFieldValue", | ||
"DataModel", | ||
"DataModelInstance", | ||
"DataSet", | ||
] |
66 changes: 66 additions & 0 deletions
66
src/phenopacket_mapper/data_standards/data_model/data_field.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from dataclasses import dataclass, field | ||
from typing import Union | ||
|
||
from phenopacket_mapper.data_standards.value_set import ValueSet | ||
|
||
|
||
@dataclass(slots=True, frozen=True)
class DataField:
    """Definition of one field of a `DataModel`.

    A data field is the equivalent of a column in a table. It has a name, a value set, a description, a section, a
    required flag, a specification, and an ordinal.

    When no `id` is passed, it is generated from `name` using the `str_to_valid_id` function from the
    `phenopacket_mapper.utils` module. Sometimes this conversion might not give the desired result, in which case the
    `id` can be set manually.

    Naming rules for the `id` field:
    - The `id` field must be a valid Python identifier
    - The `id` field must start with a letter or the underscore character
    - The `id` field cannot start with a number
    - The `id` field can only contain lowercase alpha-numeric characters and underscores (a-z, 0-9, and _ )
    - The `id` field cannot be any of the Python keywords (e.g. `in`, `is`, `not`, `class`, etc.).
    - The `id` field must be unique within a `DataModel`

    If the `value_set` is a single type, it can be passed directly as the `value_set` parameter; it is then wrapped
    in a `ValueSet` whose only element is that type.

    e.g.:
    >>> DataField(name="Field 1", value_set=int)
    DataField(name='Field 1', value_set=ValueSet(elements=[<class 'int'>], name='', description=''), id='field_1', description='', section='', required=True, specification='', ordinal='')

    :ivar name: Name of the field
    :ivar value_set: Value set of the field, if the value set is only one type, can also pass that type directly
    :ivar id: Id of the field, adhering to the naming rules stated above; derived from `name` when left empty
    :ivar description: Description of the field
    :ivar section: Section of the field (Only applicable if the data model is divided into sections)
    :ivar required: Required flag of the field
    :ivar specification: Text specification of the field (a description of the value set and field)
    :ivar ordinal: Ordinal of the field (E.g. 1.1, 1.2, 2.1, etc.)
    """
    name: str = field()
    value_set: Union[ValueSet, type] = field()
    # `None` (or any empty value) means "derive the id from `name`" in __post_init__.
    id: Union[str, None] = field(default=None)
    description: str = field(default='')
    section: str = field(default='')
    required: bool = field(default=True)
    specification: str = field(default='')
    ordinal: str = field(default='')

    def __post_init__(self):
        """Fill in a missing `id` and wrap a bare type passed as `value_set` in a `ValueSet`."""
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the way to finish initialisation.
        if not self.id:
            # Imported here rather than at module level, as in the original code.
            from phenopacket_mapper.utils import str_to_valid_id
            object.__setattr__(self, 'id', str_to_valid_id(self.name))

        if isinstance(self.value_set, type):
            object.__setattr__(self, 'value_set', ValueSet(elements=[self.value_set]))

    def __str__(self):
        """Return a multi-line, tab-indented summary of the field (the description is not included)."""
        return "".join((
            "DataField(\n",
            f"\t\tid: {self.id},\n",
            f"\t\tsection: {self.section},\n",
            f"\t\tordinal, name: ({self.ordinal}, {self.name}),\n",
            f"\t\tvalue_set: {self.value_set}, required: {self.required},\n",
            f"\t\tspecification: {self.specification}\n",
            "\t)",
        ))
54 changes: 54 additions & 0 deletions
54
src/phenopacket_mapper/data_standards/data_model/data_field_value.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from dataclasses import dataclass | ||
from typing import Union, Any | ||
import warnings | ||
|
||
from . import DataField | ||
from phenopacket_mapper.data_standards import CodeSystem | ||
from phenopacket_mapper.data_standards.date import Date | ||
|
||
|
||
@dataclass(slots=True, frozen=True)
class DataFieldValue:
    """This class defines the value of a `DataField` in a `DataModelInstance`

    Equivalent to a cell value in a table.

    :ivar row_no: The id of the value, i.e. the row number
    :ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field.
    :ivar value: The value of the field, or `None` when the cell has no value.
    """
    row_no: Union[str, int]
    field: DataField
    value: Union[int, float, str, bool, Date, CodeSystem, None]

    def validate(self) -> bool:
        """Validate this value against the definition of its `DataField`.

        The checks are, in order:

        1. A missing value (`None`) is invalid for a required field and valid for an optional one.
        2. A present value is valid if the field's value set contains `typing.Any`, contains the value itself,
           contains the exact type of the value, or contains a `CodeSystem` equal to the `system` of a `Coding` value.

        A warning is emitted whenever the value is found to be invalid. A present value whose field has an empty
        (falsy) value set is reported as invalid.

        :return: True if the value is valid, False otherwise
        """
        if self.value is None:
            if self.field.required:
                warnings.warn(f"Field {self.field.name} is required but has no value")
                return False
            # An optional field that was left empty is not an error; without this branch it would
            # fall through to the "not in the value set" warning below.
            return True

        allowed = self.field.value_set
        if allowed:
            if Any in allowed:  # value set allows any
                return True
            if self.value in allowed:  # raw value (likely a primitive) is in the value set
                return True

            # Otherwise check whether the value matches one of the types / code systems in the value set.
            for element in allowed:
                if isinstance(element, type):
                    # Exact type match on purpose (not isinstance), so e.g. a bool does not pass as an int.
                    if element is type(self.value):
                        return True
                elif isinstance(element, CodeSystem):
                    # Imported here rather than at module level, as in the original code.
                    from phenopacket_mapper.data_standards import Coding
                    if isinstance(self.value, Coding) and self.value.system == element:
                        return True

        warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field "
                      f"{self.field.name} (row {self.row_no})")
        return False
183 changes: 183 additions & 0 deletions
183
src/phenopacket_mapper/data_standards/data_model/data_model.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
from dataclasses import dataclass, field | ||
from pathlib import Path | ||
from types import MappingProxyType | ||
from typing import Union, List, Literal, Dict, Optional | ||
|
||
from phenopacket_mapper.data_standards import CodeSystem | ||
from . import DataSet, DataField | ||
from phenopacket_mapper.data_standards.value_set import ValueSet | ||
|
||
|
||
@dataclass(slots=True, frozen=True)
class DataModel:
    """This class defines a data model for medical data using `DataField`

    A data model can be used to import data and map it to the Phenopacket schema. It is made up of a list of
    `DataField` objects whose `id`s must be unique within the model (checked on construction).

    E.g.: `DataField(name='Date of Birth', ...)` will have an `id` of `'date_of_birth'`. The `DataField` objects can
    be accessed using the `id` as an attribute of the `DataModel` object. E.g.: `data_model.date_of_birth`. This is
    useful in the data reading and mapping processes.

    >>> data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
    >>> data_model.field_1
    DataField(name='Field 1', value_set=ValueSet(elements=[], name='', description=''), id='field_1', description='', section='', required=True, specification='', ordinal='')

    :ivar data_model_name: Name of the data model
    :ivar fields: List of `DataField` objects
    :ivar resources: List of `CodeSystem` objects
    """
    data_model_name: str = field()
    fields: List[DataField] = field()
    resources: List[CodeSystem] = field(default_factory=list)

    def __post_init__(self):
        """Reject models in which two fields share the same `id`.

        :raises ValueError: if the field ids are not unique
        """
        field_ids = [f.id for f in self.fields]
        if len(field_ids) != len(set(field_ids)):
            raise ValueError("All fields in a DataModel must have unique identifiers")

    def __getattr__(self, var_name: str) -> DataField:
        """Resolve `data_model.<field id>` to the matching `DataField`.

        Python only calls this hook after normal attribute lookup has failed.

        :param var_name: name of the attribute being looked up, interpreted as a field id
        :return: the `DataField` whose `id` equals `var_name`
        :raises AttributeError: if no field has that id
        """
        # Read `fields` without re-entering this hook: if the slot is not populated (e.g. an instance
        # created via `__new__` only), `self.fields` would call __getattr__('fields') again and recurse
        # until RecursionError instead of raising a clean AttributeError.
        try:
            data_fields = object.__getattribute__(self, 'fields')
        except AttributeError:
            raise AttributeError(f"'DataModel' object has no attribute '{var_name}'") from None

        for f in data_fields:
            if f.id == var_name:
                return f
        raise AttributeError(f"'DataModel' object has no attribute '{var_name}'")

    def __str__(self):
        """Return a multi-line listing of the model's fields, a `---` separator, then its resources."""
        parts = [f"DataModel(name={self.data_model_name}\n"]
        parts.extend(f"\t{data_field!s}\n" for data_field in self.fields)
        parts.append("---\n")
        parts.extend(f"\t{res!s}\n" for res in self.resources)
        parts.append(")")
        return "".join(parts)

    def __iter__(self):
        """Iterate over the `DataField` objects of the model, in definition order."""
        return iter(self.fields)

    def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]:
        """Returns a DataField object by its id

        :param field_id: The id of the field
        :param default: The value to return if the field is not found (any value, including falsy ones such as
            `''`, `0` or `False`)
        :return: The DataField object, or `default` if no field has the given id
        """
        for f in self.fields:
            if f.id == field_id:
                return f
        # Always honour the caller's default. Previously a falsy, non-None default (e.g. '' or 0)
        # raised ValueError instead of being returned, contradicting the documented contract.
        return default

    def get_field_ids(self) -> List[str]:
        """Returns a list of the ids of the DataFields in the DataModel"""
        return [f.id for f in self.fields]

    def load_data(
            self,
            path: Union[str, Path],
            compliance: Literal['soft', 'hard'] = 'soft',
            **kwargs
    ) -> DataSet:
        """Loads data from a file using a DataModel definition

        To call this method, pass the column name for each field in the DataModel as a keyword argument. This is done
        by passing the field id followed by '_column'. E.g. if the DataModel has a field with id 'date_of_birth', the
        column name in the file should be passed as 'date_of_birth_column'. The method will raise an error if any of
        the fields are missing. Keyword arguments that do not correspond to a field are not used.

        E.g.:
        ```python
        data_model = DataModel("Test data model", [DataField(name="Field 1", value_set=ValueSet())])
        data_model.load_data("data.csv", field_1_column="column_name_in_file")
        ```

        :param path: Path to the file containing the data
        :param compliance: Compliance level to use when loading the data.
        :param kwargs: Dynamically passed parameters that match {id}_column for each item
        :return: The `DataSet` produced by `phenopacket_mapper.pipeline.load_data_using_data_model`
        :raises TypeError: if the `{id}_column` keyword argument of any field is missing
        """
        column_names = {}
        for f in self.fields:
            column_param = f"{f.id}_column"
            if column_param not in kwargs:
                # Mirrors the error Python raises for a missing required argument.
                raise TypeError(f"load_data() missing 1 required argument: '{column_param}'")
            column_names[f.id] = kwargs[column_param]

        # Imported here rather than at module level, as in the original code.
        from phenopacket_mapper.pipeline import load_data_using_data_model
        return load_data_using_data_model(
            path=path,
            data_model=self,
            column_names=column_names,
            compliance=compliance
        )

    @staticmethod
    def from_file(
            data_model_name: str,
            resources: List[CodeSystem],
            path: Union[str, Path],
            file_type: Literal['csv', 'excel', 'unknown'] = 'unknown',
            # Read-only mapping, so the shared default cannot be mutated between calls.
            # NOTE(review): `DataField.<attr>.__name__` presumably works because `DataField` is a slots
            # dataclass, whose class-level attributes are descriptors carrying the attribute name - confirm.
            column_names: Dict[str, str] = MappingProxyType({
                DataField.name.__name__: 'data_field_name',
                DataField.section.__name__: 'data_model_section',
                DataField.description.__name__: 'description',
                DataField.value_set.__name__: 'value_set',
                DataField.required.__name__: 'required',
                DataField.specification.__name__: 'specification',
                DataField.ordinal.__name__: 'ordinal'
            }),
            parse_value_sets: bool = False,
            remove_line_breaks: bool = False,
            parse_ordinals: bool = True,
    ) -> 'DataModel':
        """Reads a Data Model from a file

        Thin wrapper that forwards all arguments, in order, to `phenopacket_mapper.pipeline.read_data_model`.

        :param data_model_name: Name to be given to the `DataModel` object
        :param resources: List of `CodeSystem` objects to be used as resources in the `DataModel`
        :param path: Path to Data Model file
        :param file_type: Type of file to read: 'csv', 'excel' or 'unknown' (the default)
        :param column_names: A dictionary mapping from each field of the `DataField` (key) class to a column of the file
                            (value). Leaving a value empty (`''`) will leave the field in the `DataModel` definition empty.
        :param parse_value_sets: If True, parses the string to a ValueSet object, can later be used to check
                            validity of the data. Optional, but highly recommended.
        :param remove_line_breaks: Whether to remove line breaks from string values
        :param parse_ordinals: Whether to extract the ordinal number from the field name. Warning: this can overwrite values
                             Ordinals could look like: "1.1.", "1.", "I.a.", or "ii.", etc.
        :return: The `DataModel` produced by `read_data_model`
        """
        # Imported here rather than at module level, as in the original code.
        from phenopacket_mapper.pipeline import read_data_model
        return read_data_model(
            data_model_name,
            resources,
            path,
            file_type,
            column_names,
            parse_value_sets,
            remove_line_breaks,
            parse_ordinals
        )

    @staticmethod
    def load_data_using_data_model(
            path: Union[str, Path],
            data_model: 'DataModel',
            column_names: Dict[str, str],
            compliance: Literal['soft', 'hard'] = 'soft',
    ) -> DataSet:
        """Loads data from a file using a DataModel definition

        Thin wrapper around `phenopacket_mapper.pipeline.load_data_using_data_model`.

        :param path: Path to formatted csv or excel file
        :param data_model: DataModel to use for reading the file
        :param column_names: A dictionary mapping from the id of each field of the `DataField` to the name of a
                            column in the file
        :param compliance: Compliance level to enforce when reading the file. If 'soft', the file can have extra fields
                            that are not in the DataModel. If 'hard', the file must have all fields in the DataModel.
        :return: The `DataSet` produced by the pipeline function
        """
        # Imported here rather than at module level, as in the original code.
        from phenopacket_mapper.pipeline import load_data_using_data_model
        return load_data_using_data_model(
            path=path,
            data_model=data_model,
            column_names=column_names,
            compliance=compliance
        )
Oops, something went wrong.