Skip to content

Commit

Permalink
Solve bug in ValueReplacerTransformer
Browse files Browse the repository at this point in the history
  • Loading branch information
chrislemke committed Sep 26, 2022
1 parent ccd8b74 commit 5220bf9
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 17 deletions.
38 changes: 29 additions & 9 deletions feature_reviser/transformer/generic_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class ValueIndicatorTransformer(BaseTransformer):
Example:
>>> X = pd.DataFrame({"foo": [1, -999, 3], "bar": ["a", "-999", "c"]})
>>> transformer = ValueIndicatorTransformer([("foo", -999), ("bar", "-999")])
>>> transformer.fit_transform(X).to_dict()
>>> print(transformer.fit_transform(X).to_dict())
{
'foo': {0: 1, 1: -999, 2: 3},
'bar': {0: 'a', 1: '-999', 2: 'c'},
Expand Down Expand Up @@ -148,18 +148,39 @@ def transform(self, Xy: pd.DataFrame) -> pd.DataFrame:


class ValueReplacerTransformer(BaseTransformer):
"""
r"""
Uses the pandas `replace` method to replace values in a column. This transformer loops over `features` and applies
`replace` to the corresponding columns. If a column is not of string type but a valid regular expression is provided,
the column is temporarily cast to string and, after `replace` has run, cast back to its
original type. This cast back can fail if the modified values are not compatible with the original type.
Example:
>>> X = pd.DataFrame({"foo": ["0000-01-01", "2022/01/08", "bar", "1982-12-7", "28-09-2022"]})
>>> transformer = (
... ValueReplacerTransformer(
... [
... (
... ["foo"],
... r"^(?!(19|20)\d\d[-\/.](0[1-9]|1[012]|[1-9])[-\/.](0[1-9]|[12][0-9]|3[01]|[1-9])$).*",
... "1900-01-01",
... )
... ]
... )
... )
>>> print(transformer.fit_transform(X).values)
[['1900-01-01']
 ['2022/01/08']
 ['1900-01-01']
 ['1982-12-7']
 ['1900-01-01']]
Args:
features (List[Tuple[List[str], Any, Any]]): List of tuples containing the column names as a list,
the value to replace (a literal value of any type, or a regex string), and the replacement value.
"""

def __init__(self, features: List[Tuple[List[str], str, Any]]) -> None:
def __init__(self, features: List[Tuple[List[str], Any, Any]]) -> None:
self.features = features

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -175,27 +196,26 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

for (columns, value, replacement) in self.features:
for column in columns:
print(column)
is_regex = ValueReplacerTransformer.__check_for_regex(value)
column_dtype = X[column].dtype

if column_dtype is not str and is_regex:
X[column] = X[column].astype(str)

X[column] = X[column].replace(value, replacement, regex=True)
X[column] = X[column].replace(value, replacement, regex=is_regex)

if X[column].dtype != column_dtype:
X[column] = X[column].astype(column_dtype)

return X

@staticmethod
def __check_for_regex(string: str) -> bool:
if not isinstance(string, str):
def __check_for_regex(value: Any) -> bool:
if not isinstance(value, str):
return False
try:
re.compile(string)
re.compile(value)
is_valid = True
except re.error: # pylint: disable=W0702
except re.error:
is_valid = False
return is_valid
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
name = "feature-reviser"
version = "0.3.7"
description = "Library of various transformers for all different kinds of data preprocessing 🛠"
version = "0.3.72"
description = "Library of various transformers for all different kinds of data preprocessing"
authors = ["Christopher Lemke <chris@syhbl.mozmail.com>"]
license = "MIT"
homepage = "https://chrislemke.github.io/feature-reviser/"
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def X_time_values() -> pd.DataFrame:
"10000-01-01",
],
"e": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
"f": ["2", "4", "6", "8", "\\N", "12", "14", "16", "18", "20"],
}
)

Expand Down
5 changes: 1 addition & 4 deletions tests/test_transformer/test_datetime_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
import pytest
from sklearn.pipeline import make_pipeline

from feature_reviser.transformer.datetime_transformer import (
DurationCalculatorTransformer,
TimestampTransformer,
)
from feature_reviser import DurationCalculatorTransformer, TimestampTransformer

# pylint: disable=missing-function-docstring, missing-class-docstring

Expand Down
4 changes: 4 additions & 0 deletions tests/test_transformer/test_generic_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ def test_value_replacer_transformer_in_pipeline(X_time_values) -> None:
r"^(?!(19|20)\d\d[-\/.](0[1-9]|1[012]|[1-9])[-\/.](0[1-9]|[12][0-9]|3[01]|[1-9])$).*",
"1900-01-01",
),
(["f"], "\\N", "-999"),
]

pipeline = make_pipeline(ValueReplacerTransformer(values))
result = pipeline.fit_transform(X_time_values)
expected_a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 99, 99])
Expand All @@ -79,10 +81,12 @@ def test_value_replacer_transformer_in_pipeline(X_time_values) -> None:
]
)
expected_e = np.array([2, 4, 6, 8, 99, 99, 99, 99, 99, 99])
expected_f = np.array(["2", "4", "6", "8", "-999", "12", "14", "16", "18", "20"])

assert np.array_equal(result["a"].values, expected_a)
assert np.array_equal(result["dd"].values, expected_dd)
assert np.array_equal(result["e"].values, expected_e)
assert np.array_equal(result["f"].values, expected_f)
assert pipeline.steps[0][0] == "valuereplacertransformer"


Expand Down
2 changes: 1 addition & 1 deletion tests/test_transformer/test_number_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from sklearn.pipeline import make_pipeline

from feature_reviser.transformer.number_transformer import MathExpressionTransformer
from feature_reviser import MathExpressionTransformer

# pylint: disable=missing-function-docstring, missing-class-docstring

Expand Down
2 changes: 1 addition & 1 deletion tests/test_transformer/test_string_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from sklearn.pipeline import make_pipeline

from feature_reviser.transformer.string_transformer import (
from feature_reviser import (
EmailTransformer,
IPAddressEncoderTransformer,
PhoneTransformer,
Expand Down

0 comments on commit 5220bf9

Please sign in to comment.