Skip to content

Commit

Permalink
preprocessing module
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Dec 15, 2024
1 parent 8cfa6c7 commit 8052bc4
Show file tree
Hide file tree
Showing 10 changed files with 81 additions and 87 deletions.
28 changes: 12 additions & 16 deletions sklego/preprocessing/columncapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class ColumnCapper(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -123,7 +123,9 @@ def fit(self, X, y=None):
"""
self._check_quantile_range(self.quantile_range)
self._check_interpolation(self.interpolation)
X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self)

X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=True, ensure_all_finite=False, reset=True)
_check_n_features(self, X, reset=True)

# If X contains infs, we need to replace them by nans before computing quantiles
np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)
Expand All @@ -138,9 +140,6 @@ def fit(self, X, y=None):
q = [quantile_limit / 100 for quantile_limit in self.quantile_range]
self.quantiles_ = np.nanquantile(a=X, q=q, axis=0, overwrite_input=True, method=self.interpolation)

# Saving the number of columns to ensure coherence between fit and transform inputs
self.n_features_in_ = X.shape[1]

return self

def transform(self, X):
Expand All @@ -161,17 +160,9 @@ def transform(self, X):
ValueError
If the number of columns from `X` differs from the number of columns when fitting.
"""
check_is_fitted(self, "quantiles_")
X = check_array(
X,
copy=self.copy,
force_all_finite=False,
dtype=FLOAT_DTYPES,
estimator=self,
)

if X.shape[1] != self.n_features_in_:
raise ValueError("X must have the same number of columns in fit and transform")
check_is_fitted(self, ["quantiles_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=self.copy, ensure_all_finite=False, reset=False)
_check_n_features(self, X, reset=False)

if self.discard_infs:
np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)
Expand Down Expand Up @@ -244,3 +235,8 @@ def n_columns_(self):

def _more_tags(self):
return {"allow_nan": True}

def __sklearn_tags__(self):
    """Return the parent class's tags with `input_tags.allow_nan` set to True."""
    estimator_tags = super().__sklearn_tags__()
    # Override only the NaN flag; every other tag keeps the inherited value.
    estimator_tags.input_tags.allow_nan = True
    return estimator_tags
34 changes: 13 additions & 21 deletions sklego/preprocessing/dictmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class DictMapper(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -74,15 +74,9 @@ def fit(self, X, y=None):
self : DictMapper
The fitted transformer.
"""
X = check_array(
X,
copy=True,
estimator=self,
force_all_finite=False,
dtype=None,
ensure_2d=True,
)
self.n_features_in_ = X.shape[1]
X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=True)
_check_n_features(self, X, reset=True)

return self

def transform(self, X):
Expand All @@ -104,17 +98,8 @@ def transform(self, X):
If the number of columns from `X` differs from the number of columns when fitting.
"""
check_is_fitted(self, ["n_features_in_"])
X = check_array(
X,
copy=True,
estimator=self,
force_all_finite=False,
dtype=None,
ensure_2d=True,
)

if X.shape[1] != self.n_features_in_:
raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=False)
_check_n_features(self, X, reset=False)
return np.vectorize(self.mapper.get, otypes=[int])(X, self.default)

@property
Expand All @@ -127,3 +112,10 @@ def dim_(self):

def _more_tags(self):
return {"preserves_dtype": None, "allow_nan": True, "no_validation": True}

def __sklearn_tags__(self):
    """Return the parent class's tags with this estimator's overrides applied.

    Sets `transformer_tags.preserves_dtype` to an empty list,
    `input_tags.allow_nan` to True and `no_validation` to True.
    """
    estimator_tags = super().__sklearn_tags__()
    estimator_tags.transformer_tags.preserves_dtype = []
    estimator_tags.input_tags.allow_nan = True
    estimator_tags.no_validation = True
    return estimator_tags
15 changes: 7 additions & 8 deletions sklego/preprocessing/identitytransformer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class IdentityTransformer(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -68,7 +68,8 @@ def fit(self, X, y=None):
The fitted transformer.
"""
if self.check_X:
X = check_array(X, copy=True, estimator=self)
X = validate_data(self, X=X, copy=True, reset=True)
_check_n_features(self, X, reset=True)
self.n_samples_, self.n_features_in_ = X.shape
return self

Expand All @@ -90,13 +91,11 @@ def transform(self, X):
ValueError
If the number of columns from `X` differs from the number of columns when fitting.
"""
if self.check_X:
X = check_array(X, copy=True, estimator=self)
check_is_fitted(self, "n_features_in_")
if X.shape[1] != self.n_features_in_:
raise ValueError(
f"Wrong shape is passed to transform. Trained on {self.n_features_in_} cols got {X.shape[1]}"
)

if self.check_X:
X = validate_data(self, X=X, copy=True, reset=False)
_check_n_features(self, X, reset=False)
return X

@property
Expand Down
12 changes: 7 additions & 5 deletions sklego/preprocessing/intervalencoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


def _mk_monotonic_average(xs, ys, intervals, method="increasing", **kwargs):
Expand Down Expand Up @@ -156,7 +156,9 @@ def fit(self, X, y):

# these two matrices will have shape (columns, quantiles)
# quantiles indicate where the interval split occurs
X, y = check_X_y(X, y, estimator=self)
X, y = validate_data(self, X=X, y=y, reset=True)
_check_n_features(self, X, reset=True)

self.quantiles_ = np.zeros((X.shape[1], self.n_chunks))
# heights indicate what heights these intervals will have
self.heights_ = np.zeros((X.shape[1], self.n_chunks))
Expand Down Expand Up @@ -194,9 +196,9 @@ def transform(self, X):
If the number of columns from `X` differs from the number of columns when fitting.
"""
check_is_fitted(self, ["quantiles_", "heights_", "n_features_in_"])
X = check_array(X, estimator=self)
if X.shape[1] != self.n_features_in_:
raise ValueError(f"fitted on {self.n_features_in_} features but received {X.shape[1]}")
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

transformed = np.zeros(X.shape)
for col in range(transformed.shape[1]):
transformed[:, col] = np.interp(X[:, col], self.quantiles_[col, :], self.heights_[col, :])
Expand Down
18 changes: 5 additions & 13 deletions sklego/preprocessing/monotonicspline.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import SplineTransformer
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class MonotonicSplineTransformer(TransformerMixin, BaseEstimator):
Expand Down Expand Up @@ -52,16 +52,15 @@ def fit(self, X, y=None):
ValueError
If `X` contains non-numeric columns.
"""
X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self)

X = validate_data(self, X=X, copy=True, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)
# Fit one SplineTransformer per input column, keyed by the column index
self.spline_transformer_ = {
col: SplineTransformer(n_knots=self.n_knots, degree=self.degree, knots=self.knots).fit(
X[:, col].reshape(-1, 1)
)
for col in range(X.shape[1])
}
self.n_features_in_ = X.shape[1]
return self

def transform(self, X):
Expand All @@ -82,15 +81,8 @@ def transform(self, X):
If the number of columns from `X` differs from the number of columns when fitting.
"""
check_is_fitted(self, "spline_transformer_")
X = check_array(
X,
force_all_finite=False,
dtype=FLOAT_DTYPES,
estimator=self,
)
if X.shape[1] != self.n_features_in_:
raise ValueError("Number of features going into .transform() do not match number going into .fit().")

X = validate_data(self, X=X, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)
out = []
for col in range(X.shape[1]):
out.append(
Expand Down
9 changes: 7 additions & 2 deletions sklego/preprocessing/outlier_remover.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from sklearn import clone
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data

from sklego.common import TrainOnlyTransformerMixin

Expand Down Expand Up @@ -68,6 +69,7 @@ def fit(self, X, y=None):
if self.refit:
super().fit(X, y)
self.estimator_.fit(X, y)
_check_n_features(self, X, reset=True)
return self

def transform_train(self, X):
Expand All @@ -84,6 +86,9 @@ def transform_train(self, X):
The data with the outliers removed, where `n_not_outliers = n_samples - n_outliers`.
"""
check_is_fitted(self, "estimator_")
_check_n_features(self, X, reset=False)

predictions = self.estimator_.predict(X)
check_array(predictions, estimator=self.outlier_detector, ensure_2d=False)
validate_data(self.outlier_detector, X=predictions, ensure_2d=False, reset=False)

return X[predictions != -1]
20 changes: 12 additions & 8 deletions sklego/preprocessing/projections.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import narwhals.stable.v1 as nw
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data

from sklego.common import as_list

Expand Down Expand Up @@ -66,10 +66,8 @@ def fit(self, X, y=None):
self : OrthogonalTransformer
The fitted transformer.
"""
X = check_array(X, estimator=self)

if not X.shape[0] > 1:
raise ValueError("Orthogonal transformation not valid for one sample")
X = validate_data(self, X=X, ensure_min_samples=2, reset=True)
_check_n_features(self, X, reset=True)

# Q, R such that X = Q*R, with Q orthogonal, from which follows Q = X*inv(R)
Q, R = np.linalg.qr(X)
Expand All @@ -95,12 +93,14 @@ def transform(self, X):
array-like of shape (n_samples, n_features)
The transformed data.
"""

if self.normalize:
check_is_fitted(self, ["inv_R_", "normalization_vector_"])
else:
check_is_fitted(self, ["inv_R_"])

X = check_array(X, estimator=self)
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

return X @ self.inv_R_ / self.normalization_vector_

Expand Down Expand Up @@ -235,7 +235,9 @@ def fit(self, X, y=None):
"""
self._check_coltype(X)
self.col_ids_ = [v if isinstance(v, int) else self._col_idx(X, v) for v in as_list(self.columns)]
X = check_array(X, estimator=self)
X = validate_data(self, X=X, reset=True)
_check_n_features(self, X, reset=True)

X_fair = X.copy()
v_vectors = self._make_v_vectors(X, self.col_ids_)
# Gram-Schmidt process, but only on the sensitive attributes
Expand Down Expand Up @@ -269,7 +271,9 @@ def transform(self, X):
"""
check_is_fitted(self, ["projection_", "col_ids_"])
self._check_coltype(X)
X = check_array(X, estimator=self)
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

# apply the projection and remove the column we won't need
X_fair = X @ self.projection_
X_removed = np.delete(X_fair, self.col_ids_, axis=1)
Expand Down
15 changes: 10 additions & 5 deletions sklego/preprocessing/randomadder.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from warnings import warn

from sklearn.base import BaseEstimator
from sklearn.utils import check_array, check_X_y
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_random_state
from sklearn_compat.utils.validation import _check_n_features, validate_data

from sklego.common import TrainOnlyTransformerMixin

Expand Down Expand Up @@ -69,8 +69,8 @@ def fit(self, X, y):
The fitted transformer.
"""
super().fit(X, y)
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
self.n_features_in_ = X.shape[1]
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

return self

Expand All @@ -89,8 +89,8 @@ def transform_train(self, X):
"""
rs = check_random_state(self.random_state)
check_is_fitted(self, ["n_features_in_"])

X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return X + rs.normal(0, self.noise, size=X.shape)

Expand All @@ -104,3 +104,8 @@ def dim_(self):

def _more_tags(self):
return {"non_deterministic": True}

def __sklearn_tags__(self):
    """Return the parent class's tags with `non_deterministic` set to True."""
    estimator_tags = super().__sklearn_tags__()
    # Only this one flag is overridden; everything else is inherited.
    estimator_tags.non_deterministic = True
    return estimator_tags
Loading

0 comments on commit 8052bc4

Please sign in to comment.