Skip to content

Commit

Permalink
refactor code to work with pandas 2.0 (#660)
Browse files Browse the repository at this point in the history
* Transform positional argument into keyword argument

From pandas 2.0, `any` only accepts keyword arguments
ref pandas-dev/pandas#44896

* Change how reciprocal is computed

I have not fully understood why this solves the problem, but splitting
the operation into 2 lines does not seem to work

* Catch warnings from pandas.to_datetime

Now pandas.to_datetime raises a warning when the column cannot be converted

* check_dtype=False in tests datetime features

Pandas dataframes created from Python integers have columns of type
`int64`, but the operation tested returns `int32`, which
caused issues

* Use droplevel before merging

Merging dfs with different column levels has been disallowed
ref pandas-dev/pandas#34862

* Change expected values for months

I am not sure why this caused an issue, maybe due to type casting?

* run black

* run black on tests

* isort _variable_type_checks.py

* Fix datetime_subtraction

---------

Co-authored-by: Claudio Salvatore Arcidiacono <claudio.arcidiacono@mollie.com>
  • Loading branch information
1 parent e73772d commit feddb06
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 14 deletions.
2 changes: 1 addition & 1 deletion feature_engine/datetime/datetime_subtraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def _sub(self, dt_df: pd.DataFrame):
new_df[new_varnames] = (
dt_df[self.variables_]
.sub(dt_df[reference], axis=0)
.apply(lambda s: s / np.timedelta64(1, self.output_unit))
.div(np.timedelta64(1, self.output_unit).astype("timedelta64[ns]"))
)

if self.new_variables_names is not None:
Expand Down
2 changes: 1 addition & 1 deletion feature_engine/imputation/drop_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame:
idx = pd.isnull(X[self.variables_]).mean(axis=1) >= self.threshold
idx = idx[idx]
else:
idx = pd.isnull(X[self.variables_]).any(1)
idx = pd.isnull(X[self.variables_]).any(axis=1)
idx = idx[idx]

return X.loc[idx.index, :]
Expand Down
6 changes: 3 additions & 3 deletions feature_engine/transformation/reciprocal.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ class ReciprocalTransformer(BaseNumericalTransformer):
def __init__(
self, variables: Union[None, int, str, List[Union[str, int]]] = None
) -> None:

self.variables = _check_init_parameter_variables(variables)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
Expand Down Expand Up @@ -152,8 +151,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

# transform
# for some reason reciprocal does not work with integers
X.loc[:, self.variables_] = X.loc[:, self.variables_].astype("float")
X.loc[:, self.variables_] = np.reciprocal(X.loc[:, self.variables_])
X.loc[:, self.variables_] = np.reciprocal(
X.loc[:, self.variables_].astype("float")
)

return X

Expand Down
8 changes: 5 additions & 3 deletions feature_engine/variable_handling/_variable_type_checks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import pandas as pd
from pandas.core.dtypes.common import is_categorical_dtype as is_categorical
from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
Expand All @@ -6,7 +8,6 @@


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:

# check for datetime only if object cannot be cast as numeric because
# if it could pd.to_datetime would convert it to datetime regardless
if is_object(column):
Expand All @@ -25,15 +26,16 @@ def _is_categories_num(column: pd.Series) -> bool:


def _is_convertible_to_dt(column: pd.Series) -> bool:
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


def _is_convertible_to_num(column: pd.Series) -> bool:
return is_numeric(pd.to_numeric(column, errors="ignore"))


def _is_categorical_and_is_datetime(column: pd.Series) -> bool:

# check for datetime only if object cannot be cast as numeric because
# if it could pd.to_datetime would convert it to datetime regardless
if is_object(column):
Expand Down
26 changes: 20 additions & 6 deletions tests/test_datetime/test_datetime_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def test_extract_datetime_features_with_default_options(
df_datetime_transformed[
vars_non_dt + [var + feat for var in vars_dt for feat in feat_names_default]
],
check_dtype=False,
)


Expand All @@ -198,6 +199,7 @@ def test_extract_datetime_features_from_specified_variables(
+ ["datetime_range", "date_obj2", "time_obj"]
+ ["date_obj1" + feat for feat in feat_names_default]
],
check_dtype=False,
)

# multiple datetime variables
Expand All @@ -215,6 +217,7 @@ def test_extract_datetime_features_from_specified_variables(
for feat in feat_names_default
]
],
check_dtype=False,
)

# multiple datetime variables in different order than they appear in the df
Expand All @@ -232,6 +235,7 @@ def test_extract_datetime_features_from_specified_variables(
for feat in feat_names_default
]
],
check_dtype=False,
)

# datetime variable is index
Expand All @@ -251,12 +255,15 @@ def test_extract_datetime_features_from_specified_variables(
],
axis=1,
),
check_dtype=False,
)


def test_extract_all_datetime_features(df_datetime, df_datetime_transformed):
X = DatetimeFeatures(features_to_extract="all").fit_transform(df_datetime)
pd.testing.assert_frame_equal(X, df_datetime_transformed.drop(vars_dt, axis=1))
pd.testing.assert_frame_equal(
X, df_datetime_transformed.drop(vars_dt, axis=1), check_dtype=False
)


def test_extract_specified_datetime_features(df_datetime, df_datetime_transformed):
Expand All @@ -269,6 +276,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
vars_non_dt
+ [var + "_" + feat for var in vars_dt for feat in ["semester", "week"]]
],
check_dtype=False,
)

# different order than they appear in the glossary
Expand All @@ -281,6 +289,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
vars_non_dt
+ [var + "_" + feat for var in vars_dt for feat in ["hour", "day_of_week"]]
],
check_dtype=False,
)


Expand All @@ -290,7 +299,9 @@ def test_extract_features_from_categorical_variable(
cat_date = pd.DataFrame({"date_obj1": df_datetime["date_obj1"].astype("category")})
X = DatetimeFeatures(variables="date_obj1").fit_transform(cat_date)
pd.testing.assert_frame_equal(
X, df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]]
X,
df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]],
check_dtype=False,
)


Expand All @@ -311,6 +322,7 @@ def test_extract_features_from_different_timezones(
df_datetime_transformed[["time_obj_hour"]].apply(
lambda x: x.subtract(time_zones)
),
check_dtype=False,
)
exp_err_msg = (
"ValueError: variable(s) time_obj "
Expand Down Expand Up @@ -356,7 +368,7 @@ def test_extract_features_from_localized_tz_variables():
# transform
X = transformer.transform(tz_df)
df_expected = pd.DataFrame({"date_var_hour": [1, 2, 2, 2, 2, 3, 3]})
pd.testing.assert_frame_equal(X, df_expected)
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)

# when utc is True
transformer = DatetimeFeatures(features_to_extract=["hour"], utc=True).fit(tz_df)
Expand All @@ -372,7 +384,7 @@ def test_extract_features_from_localized_tz_variables():
# transform
X = transformer.transform(tz_df)
df_expected = pd.DataFrame({"date_var_hour": [5, 6, 6, 6, 6, 7, 7]})
pd.testing.assert_frame_equal(X, df_expected)
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)


def test_extract_features_without_dropping_original_variables(
Expand All @@ -399,6 +411,7 @@ def test_extract_features_without_dropping_original_variables(
],
axis=1,
),
check_dtype=False,
)


Expand Down Expand Up @@ -435,6 +448,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
pd.testing.assert_frame_equal(
X,
pd.DataFrame({"date_obj2_day_of_month": [10, 31, 30, 17]}),
check_dtype=False,
)

X = DatetimeFeatures(features_to_extract=["year"], yearfirst=True).fit_transform(
Expand All @@ -443,6 +457,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
pd.testing.assert_frame_equal(
X,
pd.DataFrame({"date_obj2_year": [2010, 2009, 1995, 2004]}),
check_dtype=False,
)


Expand All @@ -457,8 +472,7 @@ def test_get_feature_names_out(df_datetime, df_datetime_transformed):
transformer.get_feature_names_out(input_features=vars_dt)

with pytest.raises(ValueError):
transformer.get_feature_names_out(input_features=["date_obj1"])\

transformer.get_feature_names_out(input_features=["date_obj1"])
# default features from 1 variable
transformer = DatetimeFeatures(variables="date_obj1")
X = transformer.fit_transform(df_datetime)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,10 @@ def test_multiple_windows(df_time):
X = df_time.copy()
num_vars = ["ambient_temp", "module_temp", "irradiation"]
tmp = X[num_vars].rolling(2).agg(["sum", "mean"]).shift(periods=15, freq="min")
tmp.columns = tmp.columns.droplevel()
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
tmp = X[num_vars].rolling(3).agg(["sum", "mean"]).shift(periods=15, freq="min")
tmp.columns = tmp.columns.droplevel()
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
X_tr.columns = transformer.get_feature_names_out()

Expand All @@ -404,13 +406,15 @@ def test_multiple_windows(df_time):
.agg(["sum", "mean"])
.shift(freq="30min")
)
tmp.columns = tmp.columns.droplevel()
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
tmp = (
X[["ambient_temp", "irradiation"]]
.rolling(3)
.agg(["sum", "mean"])
.shift(freq="30min")
)
tmp.columns = tmp.columns.droplevel()
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
X_tr.columns = transformer.get_feature_names_out()

Expand Down

0 comments on commit feddb06

Please sign in to comment.