Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

115 add preprocessing 2 #118

Merged
merged 7 commits into from
Sep 24, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/phenopacket_mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -2,8 +2,8 @@

__version__ = "0.0.1"

from . import cli, data_standards, pipeline
from . import cli, data_standards, pipeline, preprocessing

from .pipeline import PhenopacketMapper

__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper"]
__all__ = ["cli", "data_standards", "pipeline", "PhenopacketMapper", "preprocessing"]
59 changes: 57 additions & 2 deletions src/phenopacket_mapper/data_standards/data_model.py
Original file line number Diff line number Diff line change
@@ -11,14 +11,15 @@
from dataclasses import dataclass, field
from pathlib import Path
from types import MappingProxyType
from typing import Union, List, Literal, Dict, Optional, Any
from typing import Union, List, Literal, Dict, Optional, Any, Callable
import warnings

import pandas as pd

from phenopacket_mapper.data_standards import CodeSystem
from phenopacket_mapper.data_standards.date import Date
from phenopacket_mapper.data_standards.value_set import ValueSet
from phenopacket_mapper.preprocessing import preprocess, preprocess_method


@dataclass(slots=True, frozen=True)
@@ -83,7 +84,7 @@ def __str__(self):
return ret


@dataclass(slots=True, frozen=True)
@dataclass(slots=True)
class DataFieldValue:
"""This class defines the value of a `DataField` in a `DataModelInstance`

@@ -411,6 +412,60 @@ def data_frame(self) -> pd.DataFrame:
def __iter__(self):
    """Return an iterator over the entries held in ``self.data``."""
    return iter(self.data)

def preprocess(
        self,
        fields: Union[str, DataField, List[Union[str, DataField]]],
        mapping: Union[Dict, Callable],
        **kwargs
):
    """Preprocess the values of one or more fields in the dataset.

    Single field: every value in ``self.data`` whose field id matches is replaced in place by
    ``preprocess(value, mapping, **kwargs)``, so `mapping` may be a dictionary or a callable.

    Several fields: `mapping` must be a callable; a dictionary is rejected. For each instance the values of the
    requested fields are gathered into a list (ordered as in `fields`) and that list is passed as the value to
    `mapping`. The callable is invoked as ``mapping(values, **kwargs)``, e.g.:
    ```python
    def combine(values, **kwargs):
        field1, field2 = values
        # do something with values
        return "preprocessed_values" + kwargs["arg1"] + kwargs["arg2"]

    dataset.preprocess(["field_1", "field_2"], combine, arg1="value1", arg2="value2")
    ```
    NOTE(review): in the multi-field case the value returned by `mapping` is not written back to the dataset,
    because there is no single target field to store it in -- confirm the intended destination.

    :param fields: A field id, a `DataField`, or a list of either, identifying the fields to preprocess
    :param mapping: A dictionary or method to use for preprocessing
    :param kwargs: Additional keyword arguments forwarded to `mapping` when it is a method
    :raises ValueError: If an entry of `fields` is neither `str` nor `DataField`, if no fields are given, or if
        a dictionary is supplied as `mapping` for more than one field
    """
    # A bare str / DataField denotes a single field. Without this, a str would be
    # iterated character by character and a DataField would not be iterable at all.
    if isinstance(fields, (str, DataField)):
        fields = [fields]

    field_ids = []
    for f in fields:
        if isinstance(f, str):
            field_ids.append(f)
        elif isinstance(f, DataField):
            field_ids.append(f.id)
        else:
            raise ValueError(f"Field {f} is not of type str or DataField")

    if not field_ids:
        raise ValueError("No fields to preprocess")

    if len(field_ids) == 1:
        field_id = field_ids[0]
        for instance in self.data:
            for v in instance.values:
                if v.field.id == field_id:
                    v.value = preprocess(v.value, mapping, **kwargs)
        return

    if isinstance(mapping, dict):
        raise ValueError("Mapping dictionary cannot be used to preprocess multiple fields")

    if isinstance(mapping, Callable):
        for instance in self.data:
            # Start from a fresh list for every instance so the callable only sees
            # the values of the requested fields for that one instance.
            values = [
                v.value
                for field_id in field_ids
                for v in instance.values
                if v.field.id == field_id
            ]
            preprocess_method(values, mapping, **kwargs)

def head(self, n: int = 5):
if self.data_frame is not None:
return self.data_frame.head(n)
7 changes: 7 additions & 0 deletions src/phenopacket_mapper/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Methods for preprocessing data before mapping to Phenopackets."""

from .preprocess_dict import preprocess_dict
from .preprocess_method import preprocess_method
from .preprocess import preprocess

__all__ = ["preprocess_dict", "preprocess_method", "preprocess"]
23 changes: 23 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import warnings
from typing import Any, Union, Dict, Callable

from phenopacket_mapper.preprocessing import preprocess_dict, preprocess_method


def preprocess(
        value: Any,
        mapping: Union[Dict, Callable],
        **kwargs
) -> Any:
    """Apply a preprocessing `mapping` to a single value.

    Dispatches on the type of `mapping`: a dictionary is handed to `preprocess_dict` (keyword arguments are not
    forwarded), a callable is handed to `preprocess_method` together with `kwargs`. See those functions for the
    details of each strategy.

    :param value: The value to preprocess.
    :param mapping: A dictionary or a callable describing the preprocessing.
    :param kwargs: Extra keyword arguments forwarded to a callable `mapping`.
    :return: The preprocessed value; for an unsupported `mapping` type a warning is issued and `value` is
        returned unchanged.
    """
    if isinstance(mapping, dict):
        result = preprocess_dict(value, mapping)
    elif isinstance(mapping, Callable):
        result = preprocess_method(value, mapping, **kwargs)
    else:
        warnings.warn(f"Mapping type {type(mapping)} in preprocessing not supported. Returning original value.")
        result = value

    return result
21 changes: 21 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import warnings
from typing import Any, Dict


def preprocess_dict(value: Any, mapping_dict: Dict) -> Any:
    """Takes a value and uses a mapping dictionary to preprocess it.

    If the value is a key of the mapping dictionary, the corresponding value is returned.
    If the value is not a key of the mapping dictionary, a warning is issued and the original value is returned.
    The same applies to values that cannot be dictionary keys at all because they are unhashable (e.g. lists).

    :param value: The value to preprocess.
    :param mapping_dict: A dictionary containing the mapping rules.
    :return: The preprocessed value, or the original value if no mapping rule applies.
    """
    try:
        return mapping_dict[value]
    except KeyError:
        warnings.warn(f"Value {value} not found in mapping dictionary.")
    except TypeError:
        # Looking up an unhashable value raises TypeError rather than KeyError; such a
        # value can never be in the dictionary, so treat it like any other miss.
        warnings.warn(f"Value {value} is not hashable and cannot be looked up in mapping dictionary.")

    return value
25 changes: 25 additions & 0 deletions src/phenopacket_mapper/preprocessing/preprocess_method.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import Any, Callable


def preprocess_method(value: Any, method: Callable, **kwargs) -> Any:
    """Preprocess a value by passing it through a user supplied method.

    The method is invoked as `method(value, **kwargs)`, so it must accept the value as its first positional
    argument; anything else it needs can be supplied through `kwargs`.

    This is best effort: if the method raises an exception, the error is printed and the original value is
    returned unchanged.

    :param value: The value to preprocess.
    :param method: The method to use for preprocessing.
    :param kwargs: Additional arguments for the method.
    :return: The result of the method, or the original value if the method raised.
    """
    try:
        return method(value, **kwargs)
    except Exception as e:
        print(f"Error while preprocessing value {value} with method {method}. Error message: {e}")
        return value
Loading