Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

110 overhaul mapper #111

Merged
merged 31 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
74d1018
overhauled init
frehburg Sep 22, 2024
30b690a
removed method
frehburg Sep 22, 2024
6cca6b8
renamed map element to mapfield
frehburg Sep 22, 2024
b84747b
improved docstring of mapfield
frehburg Sep 22, 2024
76a34b5
wrote method signature of map for phenopacket element
frehburg Sep 22, 2024
9748ee3
implemented
frehburg Sep 22, 2024
e0ba497
wrote pydoc including doctest
frehburg Sep 22, 2024
74af3ed
mini typo in doctest
frehburg Sep 22, 2024
a9a7f5a
.
frehburg Sep 22, 2024
c5668d6
corrected doctest output
frehburg Sep 22, 2024
a551ddf
doctest hell
frehburg Sep 22, 2024
20aca79
update
frehburg Sep 24, 2024
9ff8f36
changed from list to dict
frehburg Sep 24, 2024
677da00
removed write method in mapper
frehburg Sep 24, 2024
a813c9e
changed write definition in output
frehburg Sep 24, 2024
52437ae
implemented new constructor for phenopacket element
frehburg Sep 24, 2024
b0931bd
implemented new map method in phenopacket element
frehburg Sep 24, 2024
efde206
updated doctest
frehburg Sep 24, 2024
b41738b
fixed some bugs in the setup of the mapping
frehburg Sep 24, 2024
52824b8
added iter to datamodel
frehburg Sep 24, 2024
8304460
added resources to mapper
frehburg Sep 24, 2024
79ba89c
renamed var and bug fix
frehburg Sep 24, 2024
d9fc76c
renamed element and bug fix
frehburg Sep 24, 2024
813db83
typo
frehburg Sep 24, 2024
22e2e5d
removed print statement
frehburg Sep 24, 2024
c6114e0
tab for doctest
frehburg Sep 24, 2024
bd41e31
trying to fix doctest indent
frehburg Sep 24, 2024
ce92fa6
added ellipsis
frehburg Sep 24, 2024
e584eae
removed unnecessary flags
frehburg Sep 24, 2024
2cd2c84
new doc test
frehburg Sep 24, 2024
967feb0
single quotes in docstrings always single quotes '''''''
frehburg Sep 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ python_files = _test*.py test*.py

; Also test the documentation tests and the tutorial scripts.
addopts = --doctest-modules --doctest-glob *.rst

# Exclude certain directories from recursion when discovering tests.
norecursedirs = docs submodules
5 changes: 2 additions & 3 deletions src/phenopacket_mapper/cli/mapping_command.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from pathlib import Path

from phenopacket_mapper.mapping.mapper import mapping


def main(args):
"""Mapping command: Executes the pipeline mapping a dataset in the format to the Phenopacket schema
Expand Down Expand Up @@ -30,4 +28,5 @@ def main(args):
else:
validate_ = False

mapping(path, output, validate_)
# mapping(path, output, validate_)
raise NotImplementedError
3 changes: 3 additions & 0 deletions src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ def __str__(self):
ret += ")"
return ret

def __iter__(self):
"""Return an iterator over ``self.fields``."""
return iter(self.fields)

def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]:
"""Returns a DataField object by its id

Expand Down
2 changes: 0 additions & 2 deletions src/phenopacket_mapper/mapping/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
"""This module facilitates the mapping from a local data model to the phenopacket schema"""

from .map_element import MapElement
from .phenopacket_element import PhenopacketElement
from .mapper import PhenopacketMapper

__all__ = [
'MapElement',
'PhenopacketElement',
'PhenopacketMapper',

Expand Down
10 changes: 0 additions & 10 deletions src/phenopacket_mapper/mapping/map_element.py

This file was deleted.

1 change: 1 addition & 0 deletions src/phenopacket_mapper/mapping/map_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

130 changes: 68 additions & 62 deletions src/phenopacket_mapper/mapping/mapper.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,86 @@
from pathlib import Path
from typing import List, Union
from typing import List, Union, Dict

from phenopackets import Phenopacket

from phenopacket_mapper.data_standards.DataModel2PhenopacketSchema import DataModel2PhenopacketSchema
from phenopacket_mapper.data_standards.data_model import DataModel, DataModelInstance, DataSet
from phenopacket_mapper.data_standards.data_models import ERDRI_CDS
from phenopacket_mapper.pipeline import validate
from phenopacket_mapper.data_standards import CodeSystem
from phenopacket_mapper.data_standards.data_model import DataModel, DataSet, DataField, DataFieldValue
from phenopacket_mapper.mapping import PhenopacketElement


class PhenopacketMapper:
"""Class to map data using a DataModel to Phenopackets

This class is central to the pipeline for mapping data from a DataModel to Phenopackets.
A dataset can be mapped from its tabular format to the Phenopacket schema in a few simple steps:
1. Define the DataModel for the dataset, if it does not exist yet
2. Load the data from the dataset
3. Define the mapping from the DataModel to the Phenopacket schema
4. Perform the mapping
5. Write the Phenopackets to a file
6. Optionally validate the Phenopackets
"""Class to map data to Phenopackets

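:ivar data_model: The data model that the data to be mapped follows
:ivar elements: Dict from Phenopacket constructor field names to the PhenopacketElements or DataFields mapped to them
:ivar resources: List of CodeSystems passed to the constructor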
"""
def __init__(self, datamodel: DataModel):
self.data_model = datamodel

def load_data(self, path: Union[str, Path]) -> DataSet:
"""Load data from a file using the DataModel

Will raise an error if the file type is not recognized or the file does not follow the DataModel
def __init__(self, data_model: DataModel, resources: List[CodeSystem], **kwargs):
"""Create a PhenopacketMapper, this method is equivalent to the constructor of the ´Phenopacket´ for the mapping

:param path: Path to the file to load
:return: List of DataModelInstances
"""
raise NotImplementedError
List the fields of the `Phenopacket` constructor as keyword arguments to define how the data is mapped to Phenopackets.

def map(self, mapping_: DataModel2PhenopacketSchema, data: DataSet) -> List[Phenopacket]:
:param data_model: The data model to map to Phenopackets
:param kwargs: The elements to map the data to Phenopackets
"""
self.data_model = data_model
self.elements: Dict[str, Union[PhenopacketElement, DataField]] = {}
self.resources = resources
for k, v in kwargs.items():
setattr(self, k, v)
self.elements[k] = v

self.__post_init__()

def __post_init__(self):
# Check if the fields in the mapping are in the data model
for e in self.elements.values():
self.check_data_fields_in_model(e)

def check_data_fields_in_model(self, element: Union[PhenopacketElement, DataField]):
    """Check that every DataField reachable from ``element`` is part of the data model.

    A ``DataField`` is tested for membership in ``self.data_model``. A
    ``PhenopacketElement`` is checked by recursing into each of its nested
    elements. Objects of any other type are ignored.

    :param element: the mapping element to validate
    :raises AttributeError: if a ``DataField`` is not contained in ``self.data_model``
    """
    if isinstance(element, DataField):
        if element not in self.data_model:
            raise AttributeError(f"The mapping definition contains an invalid field. "
                                 f"{element} is not in the data model underlying the passed data set."
                                 f" (The data model includes the fields: {self.data_model.get_field_ids()})")
    elif isinstance(element, PhenopacketElement):
        # Only the nested elements matter here; the keyword names are irrelevant for validation.
        for nested in element.elements.values():
            self.check_data_fields_in_model(nested)

def map(self, data: DataSet) -> List[Phenopacket]:
"""Map data from the DataModel to Phenopackets

The mapping is based on the definition of the DataModel and the DataModel2PhenopacketSchema mapping.
The mapping is based on the definition of the DataModel and the parameters passed to the constructor.

If successful, a list of Phenopackets will be returned

:param mapping_: Mapping from the DataModel to the Phenopacket schema, defined in DataModel2PhenopacketSchema
:param data: List of DataModelInstances created from the data using the DataModel
:return: List of Phenopackets
"""
# TODO: Implement the mapping logic
raise NotImplementedError

def write(self, phenopackets: List[Phenopacket], output_path: Union[str, Path]) -> bool:
"""Write Phenopackets to a file

:param phenopackets: List of Phenopackets to write
:param output_path: Path to write the Phenopackets to
:return: True if successful, False otherwise
"""
raise NotImplementedError


def mapping(path: Path, output: Path, validate_: bool, datamodel: DataModel = ERDRI_CDS):
"""Executes the pipeline mapping a dataset in the format to the Phenopacket schema

:param path: Path to formatted csv or excel file
:param output: Path to write Phenopackets to
:param validate_: Validate phenopackets using phenopacket-tools after creation
:param datamodel: DataModel to use for the mapping, defaults to
"""
print(f"{path=}, {output=}, {validate_=}")
mapper = PhenopacketMapper(datamodel=datamodel)
data = mapper.load_data(path=path)
# TODO: Define the mapping from the data model to the Phenopacket schema
phenopackets = mapper.map(data)
if mapper.write(phenopackets, output):
print('Phenopackets written successfully')
else:
print('Error writing phenopackets')
if validate_:
validate(phenopackets)
raise NotImplementedError("The function mapping has not been implemented yet")
phenopackets_list = []
for instance in data:
kwargs = {}
for key, e in self.elements.items():
if isinstance(e, DataField):
data_field = e
try:
value: DataFieldValue = getattr(instance, data_field.id).value
kwargs[key] = value
except AttributeError:
continue
elif isinstance(e, PhenopacketElement):
phenopacket_element = e
kwargs[key] = phenopacket_element.map(instance)
# TODO: Add the resources to the phenopacket
try:
phenopackets_list.append(
Phenopacket(
**kwargs
)
)
except TypeError as e:
raise TypeError(f"Error in mapping: {e}")
except Exception as e:
raise e

return phenopackets_list
58 changes: 46 additions & 12 deletions src/phenopacket_mapper/mapping/phenopacket_element.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,50 @@
from dataclasses import dataclass, field
from typing import Any, List
from typing import Union, Dict

from phenopacket_mapper.mapping import MapElement
from phenopacket_mapper.data_standards import DataModelInstance, DataField, DataFieldValue


@dataclass(frozen=True, slots=True)
class PhenopacketElement:
phenopacket_element: Any = field()
fields: List[MapElement] = field()

def __post_init__(self):
for f in self.fields:
if not hasattr(self.phenopacket_element, f.to_field):
raise AttributeError(f"The class: {self.phenopacket_element} has no attribute {f.to_field}")


def __init__(self, phenopacket_element, **kwargs):
"""Mapping equivalent to the constructor of a Phenopacket element (e.g., Individual) for the mapping

List fields of the Phenopacket element constructor in the kwargs to map the data to Phenopackets.

:param phenopacket_element: The phenopacket element to map to (e.g., `phenopackets.Individual`)
:param kwargs: The elements to map the data to Phenopackets
"""
self.phenopacket_element = phenopacket_element
self.elements: Dict[str, Union[PhenopacketElement, DataField]] = {}
for k, v in kwargs.items():
setattr(self, k, v)
self.elements[k] = v

def map(self, instance: DataModelInstance):
"""Creates the phenopacket element by the mapping specified in fields

>>> import phenopackets
>>> from phenopacket_mapper.data_standards import DataModelInstance, DataModel, DataField, DataFieldValue
>>> data_field = DataField("pseudonym", str)
>>> data_model = DataModel("Example data model", [data_field], [])
>>> inst = DataModelInstance(0, data_model, [DataFieldValue(0, data_field, "example_pseudonym")])
>>> individual = PhenopacketElement(phenopackets.Individual, id=data_field).map(inst)
>>> individual.id
'example_pseudonym'

:param instance: the ´DataModelInstance´ from which to map to a Phenopacket schema element
:return: the resulting Phenopacket schema element
"""
kwargs = {}
for key, e in self.elements.items():
if isinstance(e, DataField):
df = e
try:
value: DataFieldValue = getattr(instance, df.id).value
kwargs[key] = value
except AttributeError:
continue
elif isinstance(e, PhenopacketElement):
phenopacket_element = e
kwargs[key] = phenopacket_element.map(instance)

return self.phenopacket_element(**kwargs)
8 changes: 4 additions & 4 deletions src/phenopacket_mapper/pipeline/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

def write(
phenopackets_list: List[Phenopacket], out_dir: Union[str, Path]
) -> None:
):
"""Writes a list of phenopackets to JSON files.

:param phenopackets_list: The list of phenopackets.
Expand All @@ -25,8 +25,8 @@ def write(

def _write_single_phenopacket(
phenopacket: Phenopacket,
out_dr: Union[str, Path]
) -> None:
out_dir: Union[str, Path]
):
"""Writes a phenopacket to a JSON file.

:param phenopacket: The phenopacket.
Expand All @@ -35,6 +35,6 @@ def _write_single_phenopacket(
:type out_dir: Union[str, Path]
"""
json_str = MessageToJson(phenopacket) # Convert phenopacket to JSON string.
out_path = os.path.join(out_dr, (phenopacket.id + '.json'))
out_path = os.path.join(out_dir, (phenopacket.id + '.json'))
with open(out_path, 'w') as fh:
fh.write(json_str)
Loading