diff --git a/pytest.ini b/pytest.ini index bc7e9e1e..e1ac026e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,3 +6,6 @@ python_files = _test*.py test*.py ; Also test the documentation tests and the tutorial scripts. addopts = --doctest-modules --doctest-glob *.rst + +# Exclude certain directories from recursion when discovering tests. +norecursedirs = docs submodules \ No newline at end of file diff --git a/src/phenopacket_mapper/cli/mapping_command.py b/src/phenopacket_mapper/cli/mapping_command.py index 5d28f27e..7221abd2 100644 --- a/src/phenopacket_mapper/cli/mapping_command.py +++ b/src/phenopacket_mapper/cli/mapping_command.py @@ -1,7 +1,5 @@ from pathlib import Path -from phenopacket_mapper.mapping.mapper import mapping - def main(args): """Mapping command: Executes the pipeline mapping a dataset in the format to the Phenopacket schema @@ -30,4 +28,5 @@ def main(args): else: validate_ = False - mapping(path, output, validate_) + # mapping(path, output, validate_) + raise NotImplementedError diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 646e1f7e..bc5426ff 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -173,6 +173,9 @@ def __str__(self): ret += ")" return ret + def __iter__(self): + return iter(self.fields) + def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]: """Returns a DataField object by its id diff --git a/src/phenopacket_mapper/mapping/__init__.py b/src/phenopacket_mapper/mapping/__init__.py index b5ed5f2c..65960ca3 100644 --- a/src/phenopacket_mapper/mapping/__init__.py +++ b/src/phenopacket_mapper/mapping/__init__.py @@ -1,11 +1,9 @@ """This module facilitates the mapping from a local data model to the phenopacket schema""" -from .map_element import MapElement from .phenopacket_element import PhenopacketElement from .mapper import PhenopacketMapper __all__ = [ - 'MapElement', 'PhenopacketElement', 'PhenopacketMapper', diff --git a/src/phenopacket_mapper/mapping/map_element.py b/src/phenopacket_mapper/mapping/map_element.py deleted file mode 100644 index d9cfcc35..00000000 --- a/src/phenopacket_mapper/mapping/map_element.py +++ /dev/null @@ -1,10 +0,0 @@ -from dataclasses import dataclass, field - -from phenopacket_mapper.data_standards import DataField - - -@dataclass(frozen=True, slots=True) -class MapElement: - """This class represents the mapping from an element of the ´DataModel´ to a field in the Phenopacket schema""" - from_field: DataField = field(init=True, repr=True) - to_field: str = field(init=True, repr=True) diff --git a/src/phenopacket_mapper/mapping/map_field.py b/src/phenopacket_mapper/mapping/map_field.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/phenopacket_mapper/mapping/map_field.py @@ -0,0 +1 @@ + diff --git a/src/phenopacket_mapper/mapping/mapper.py b/src/phenopacket_mapper/mapping/mapper.py index 3355740e..b4b3d939 100644 --- a/src/phenopacket_mapper/mapping/mapper.py +++ b/src/phenopacket_mapper/mapping/mapper.py @@ -1,80 +1,86 @@ -from pathlib import Path -from typing import List, Union +from typing import List, Union, Dict from phenopackets import Phenopacket -from phenopacket_mapper.data_standards.DataModel2PhenopacketSchema import DataModel2PhenopacketSchema -from phenopacket_mapper.data_standards.data_model import DataModel, DataModelInstance, DataSet -from phenopacket_mapper.data_standards.data_models import ERDRI_CDS -from phenopacket_mapper.pipeline import validate +from phenopacket_mapper.data_standards import CodeSystem +from phenopacket_mapper.data_standards.data_model import DataModel, DataSet, DataField, DataFieldValue +from phenopacket_mapper.mapping import PhenopacketElement class PhenopacketMapper: - """Class to map data using a DataModel to Phenopackets - - This class is central to the pipeline for mapping data from a DataModel to Phenopackets. - A dataset can be mapped from its tabular format to the Phenopacket schema in a few simple steps: - 1. Define the DataModel for the dataset, if it does not exist yet - 2. Load the data from the dataset - 3. Define the mapping from the DataModel to the Phenopacket schema - 4. Perform the mapping - 5. Write the Phenopackets to a file - 6. Optionally validate the Phenopackets + """Class to map data to Phenopackets + + :ivar data_set: The data set to map to Phenopackets + :ivar elements: List of PhenopacketElements to map the data to Phenopackets """ - def __init__(self, datamodel: DataModel): - self.data_model = datamodel - def load_data(self, path: Union[str, Path]) -> DataSet: - """Load data from a file using the DataModel - - Will raise an error if the file type is not recognized or the file does not follow the DataModel + def __init__(self, data_model: DataModel, resources: List[CodeSystem], **kwargs): + """Create a PhenopacketMapper, this method is equivalent to the constructor of the ´Phenopacket´ for the mapping - :param path: Path to the file to load - :return: List of DataModelInstances - """ - raise NotImplementedError + List fields of the ´Phenopacket´ constructor in the kwargs to map the data to Phenopackets. - def map(self, mapping_: DataModel2PhenopacketSchema, data: DataSet) -> List[Phenopacket]: + :param data_model: The data model to map to Phenopackets + :param kwargs: The elements to map the data to Phenopackets + """ + self.data_model = data_model + self.elements: Dict[str, Union[PhenopacketElement, DataField]] = {} + self.resources = resources + for k, v in kwargs.items(): + setattr(self, k, v) + self.elements[k] = v + + self.__post_init__() + + def __post_init__(self): + # Check if the fields in the mapping are in the data model + for e in self.elements.values(): + self.check_data_fields_in_model(e) + + def check_data_fields_in_model(self, element: Union[PhenopacketElement, DataField]): + if isinstance(element, DataField): + field = element + if field not in self.data_model: + raise AttributeError(f"The mapping definition contains an invalid field. " + f"{field} is not in the data model underlying the passed data set." + f" (The data model includes the fields: {self.data_model.get_field_ids()})") + elif isinstance(element, PhenopacketElement): + for key, ee in element.elements.items(): + self.check_data_fields_in_model(ee) + + def map(self, data: DataSet) -> List[Phenopacket]: """Map data from the DataModel to Phenopackets - The mapping is based on the definition of the DataModel and the DataModel2PhenopacketSchema mapping. + The mapping is based on the definition of the DataModel and the parameters passed to the constructor. If successful, a list of Phenopackets will be returned - :param mapping_: Mapping from the DataModel to the Phenopacket schema, defined in DataModel2PhenopacketSchema :param data: List of DataModelInstances created from the data using the DataModel :return: List of Phenopackets """ - # TODO: Implement the mapping logic - raise NotImplementedError - - def write(self, phenopackets: List[Phenopacket], output_path: Union[str, Path]) -> bool: - """Write Phenopackets to a file - - :param phenopackets: List of Phenopackets to write - :param output_path: Path to write the Phenopackets to - :return: True if successful, False otherwise - """ - raise NotImplementedError - - -def mapping(path: Path, output: Path, validate_: bool, datamodel: DataModel = ERDRI_CDS): - """Executes the pipeline mapping a dataset in the format to the Phenopacket schema - - :param path: Path to formatted csv or excel file - :param output: Path to write Phenopackets to - :param validate_: Validate phenopackets using phenopacket-tools after creation - :param datamodel: DataModel to use for the mapping, defaults to - """ - print(f"{path=}, {output=}, {validate_=}") - mapper = PhenopacketMapper(datamodel=datamodel) - data = mapper.load_data(path=path) - # TODO: Define the mapping from the data model to the Phenopacket schema - phenopackets = mapper.map(data) - if mapper.write(phenopackets, output): - print('Phenopackets written successfully') - else: - print('Error writing phenopackets') - if validate_: - validate(phenopackets) - raise NotImplementedError("The function mapping has not been implemented yet") + phenopackets_list = [] + for instance in data: + kwargs = {} + for key, e in self.elements.items(): + if isinstance(e, DataField): + data_field = e + try: + value: DataFieldValue = getattr(instance, data_field.id).value + kwargs[key] = value + except AttributeError: + continue + elif isinstance(e, PhenopacketElement): + phenopacket_element = e + kwargs[key] = phenopacket_element.map(instance) + # TODO: Add the resources to the phenopacket + try: + phenopackets_list.append( + Phenopacket( + **kwargs + ) + ) + except TypeError as e: + raise TypeError(f"Error in mapping: {e}") + except Exception as e: + raise e + + return phenopackets_list diff --git a/src/phenopacket_mapper/mapping/phenopacket_element.py b/src/phenopacket_mapper/mapping/phenopacket_element.py index ca0ae9fe..05034f72 100644 --- a/src/phenopacket_mapper/mapping/phenopacket_element.py +++ b/src/phenopacket_mapper/mapping/phenopacket_element.py @@ -1,16 +1,50 @@ -from dataclasses import dataclass, field -from typing import Any, List +from typing import Union, Dict -from phenopacket_mapper.mapping import MapElement +from phenopacket_mapper.data_standards import DataModelInstance, DataField, DataFieldValue -@dataclass(frozen=True, slots=True) class PhenopacketElement: - phenopacket_element: Any = field() - fields: List[MapElement] = field() - - def __post_init__(self): - for f in self.fields: - if not hasattr(self.phenopacket_element, f.to_field): - raise AttributeError(f"The class: {self.phenopacket_element} has no attribute {f.to_field}") - \ No newline at end of file + + def __init__(self, phenopacket_element, **kwargs): + """Mapping equivalent to the constructor of a Phenopacket element (e.g., Individual) for the mapping + + List fields of the Phenopacket element constructor in the kwargs to map the data to Phenopackets. + + :param phenopacket_element: The phenopacket element to map to (e.g., `phenopackets.Individual`) + :param kwargs: The elements to map the data to Phenopackets + """ + self.phenopacket_element = phenopacket_element + self.elements: Dict[str, Union[PhenopacketElement, DataField]] = {} + for k, v in kwargs.items(): + setattr(self, k, v) + self.elements[k] = v + + def map(self, instance: DataModelInstance): + """Creates the phenopacket element by the mapping specified in fields + + >>> import phenopackets + >>> from phenopacket_mapper.data_standards import DataModelInstance, DataModel, DataField, DataFieldValue + >>> data_field = DataField("pseudonym", str) + >>> data_model = DataModel("Example data model", [data_field], []) + >>> inst = DataModelInstance(0, data_model, [DataFieldValue(0, data_field, "example_pseudonym")]) + >>> individual = PhenopacketElement(phenopackets.Individual, id=data_field).map(inst) + >>> individual.id + 'example_pseudonym' + + :param instance: the ´DataModelInstance´ from which to map to a Phenopacket schema element + :return: the resulting Phenopacket schema element + """ + kwargs = {} + for key, e in self.elements.items(): + if isinstance(e, DataField): + df = e + try: + value: DataFieldValue = getattr(instance, df.id).value + kwargs[key] = value + except AttributeError: + continue + elif isinstance(e, PhenopacketElement): + phenopacket_element = e + kwargs[key] = phenopacket_element.map(instance) + + return self.phenopacket_element(**kwargs) diff --git a/src/phenopacket_mapper/pipeline/output.py b/src/phenopacket_mapper/pipeline/output.py index 58fe47c0..f050f3eb 100644 --- a/src/phenopacket_mapper/pipeline/output.py +++ b/src/phenopacket_mapper/pipeline/output.py @@ -8,7 +8,7 @@ def write( phenopackets_list: List[Phenopacket], out_dir: Union[str, Path] -) -> None: +): """Writes a list of phenopackets to JSON files. :param phenopackets_list: The list of phenopackets. @@ -25,8 +25,8 @@ def write( def _write_single_phenopacket( phenopacket: Phenopacket, - out_dr: Union[str, Path] -) -> None: + out_dir: Union[str, Path] +): """Writes a phenopacket to a JSON file. :param phenopacket: The phenopacket. @@ -35,6 +35,6 @@ def _write_single_phenopacket( :type out_dr: Union[str, Path] """ json_str = MessageToJson(phenopacket) # Convert phenopacket to JSON string. - out_path = os.path.join(out_dr, (phenopacket.id + '.json')) + out_path = os.path.join(out_dir, (phenopacket.id + '.json')) with open(out_path, 'w') as fh: fh.write(json_str)