diff --git a/sklego/preprocessing/columncapper.py b/sklego/preprocessing/columncapper.py index 7d867c24..433c921f 100644 --- a/sklego/preprocessing/columncapper.py +++ b/sklego/preprocessing/columncapper.py @@ -2,8 +2,8 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_array from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data class ColumnCapper(TransformerMixin, BaseEstimator): @@ -123,7 +123,9 @@ def fit(self, X, y=None): """ self._check_quantile_range(self.quantile_range) self._check_interpolation(self.interpolation) - X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self) + + X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=True, ensure_all_finite=False, reset=True) + _check_n_features(self, X, reset=True) # If X contains infs, we need to replace them by nans before computing quantiles np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan) @@ -138,9 +140,6 @@ def fit(self, X, y=None): q = [quantile_limit / 100 for quantile_limit in self.quantile_range] self.quantiles_ = np.nanquantile(a=X, q=q, axis=0, overwrite_input=True, method=self.interpolation) - # Saving the number of columns to ensure coherence between fit and transform inputs - self.n_features_in_ = X.shape[1] - return self def transform(self, X): @@ -161,17 +160,9 @@ def transform(self, X): ValueError If the number of columns from `X` differs from the number of columns when fitting. """ - check_is_fitted(self, "quantiles_") - X = check_array( - X, - copy=self.copy, - force_all_finite=False, - dtype=FLOAT_DTYPES, - estimator=self, - ) - - if X.shape[1] != self.n_features_in_: - raise ValueError("X must have the same number of columns in fit and transform") + check_is_fitted(self, ["quantiles_"]) + X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=self.copy, ensure_all_finite=False, reset=False) + _check_n_features(self, X, reset=False) if self.discard_infs: np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan) @@ -244,3 +235,8 @@ def n_columns_(self): def _more_tags(self): return {"allow_nan": True} + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/sklego/preprocessing/dictmapper.py b/sklego/preprocessing/dictmapper.py index d718430a..d8d1a438 100644 --- a/sklego/preprocessing/dictmapper.py +++ b/sklego/preprocessing/dictmapper.py @@ -2,8 +2,8 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data class DictMapper(TransformerMixin, BaseEstimator): @@ -74,15 +74,9 @@ def fit(self, X, y=None): self : DictMapper The fitted transformer. """ - X = check_array( - X, - copy=True, - estimator=self, - force_all_finite=False, - dtype=None, - ensure_2d=True, - ) - self.n_features_in_ = X.shape[1] + X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=True) + _check_n_features(self, X, reset=True) + return self def transform(self, X): @@ -104,17 +98,8 @@ def transform(self, X): If the number of columns from `X` differs from the number of columns when fitting. """ check_is_fitted(self, ["n_features_in_"]) - X = check_array( - X, - copy=True, - estimator=self, - force_all_finite=False, - dtype=None, - ensure_2d=True, - ) - - if X.shape[1] != self.n_features_in_: - raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}") + X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=False) + _check_n_features(self, X, reset=False) return np.vectorize(self.mapper.get, otypes=[int])(X, self.default) @property @@ -127,3 +112,10 @@ def dim_(self): def _more_tags(self): return {"preserves_dtype": None, "allow_nan": True, "no_validation": True} + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [] + tags.input_tags.allow_nan = True + tags.no_validation = True + return tags diff --git a/sklego/preprocessing/identitytransformer.py b/sklego/preprocessing/identitytransformer.py index 33dda462..1f26fb50 100644 --- a/sklego/preprocessing/identitytransformer.py +++ b/sklego/preprocessing/identitytransformer.py @@ -1,6 +1,6 @@ from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data class IdentityTransformer(TransformerMixin, BaseEstimator): @@ -68,7 +68,8 @@ def fit(self, X, y=None): The fitted transformer. """ if self.check_X: - X = check_array(X, copy=True, estimator=self) + X = validate_data(self, X=X, copy=True, reset=True) + _check_n_features(self, X, reset=True) self.n_samples_, self.n_features_in_ = X.shape return self @@ -90,13 +91,11 @@ def transform(self, X): ValueError If the number of columns from `X` differs from the number of columns when fitting. """ - if self.check_X: - X = check_array(X, copy=True, estimator=self) check_is_fitted(self, "n_features_in_") - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"Wrong shape is passed to transform. Trained on {self.n_features_in_} cols got {X.shape[1]}" - ) + + if self.check_X: + X = validate_data(self, X=X, copy=True, reset=False) + _check_n_features(self, X, reset=False) return X @property diff --git a/sklego/preprocessing/intervalencoder.py b/sklego/preprocessing/intervalencoder.py index 429841a6..9935b148 100644 --- a/sklego/preprocessing/intervalencoder.py +++ b/sklego/preprocessing/intervalencoder.py @@ -9,8 +9,8 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_array, check_X_y from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data def _mk_monotonic_average(xs, ys, intervals, method="increasing", **kwargs): @@ -156,7 +156,9 @@ def fit(self, X, y): # these two matrices will have shape (columns, quantiles) # quantiles indicate where the interval split occurs - X, y = check_X_y(X, y, estimator=self) + X, y = validate_data(self, X=X, y=y, reset=True) + _check_n_features(self, X, reset=True) + self.quantiles_ = np.zeros((X.shape[1], self.n_chunks)) # heights indicate what heights these intervals will have self.heights_ = np.zeros((X.shape[1], self.n_chunks)) @@ -194,9 +196,9 @@ def transform(self, X): If the number of columns from `X` differs from the number of columns when fitting. """ check_is_fitted(self, ["quantiles_", "heights_", "n_features_in_"]) - X = check_array(X, estimator=self) - if X.shape[1] != self.n_features_in_: - raise ValueError(f"fitted on {self.n_features_in_} features but received {X.shape[1]}") + X = validate_data(self, X=X, reset=False) + _check_n_features(self, X, reset=False) + transformed = np.zeros(X.shape) for col in range(transformed.shape[1]): transformed[:, col] = np.interp(X[:, col], self.quantiles_[col, :], self.heights_[col, :]) diff --git a/sklego/preprocessing/monotonicspline.py b/sklego/preprocessing/monotonicspline.py index 13087051..1a65d6ef 100644 --- a/sklego/preprocessing/monotonicspline.py +++ b/sklego/preprocessing/monotonicspline.py @@ -1,8 +1,8 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import SplineTransformer -from sklearn.utils import check_array from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data class MonotonicSplineTransformer(TransformerMixin, BaseEstimator): @@ -52,8 +52,8 @@ def fit(self, X, y=None): ValueError If `X` contains non-numeric columns. """ - X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self) - + X = validate_data(self, X=X, copy=True, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=True) + _check_n_features(self, X, reset=True) # If X contains infs, we need to replace them by nans before computing quantiles self.spline_transformer_ = { col: SplineTransformer(n_knots=self.n_knots, degree=self.degree, knots=self.knots).fit( @@ -61,7 +61,6 @@ def fit(self, X, y=None): ) for col in range(X.shape[1]) } - self.n_features_in_ = X.shape[1] return self def transform(self, X): @@ -82,15 +81,8 @@ def transform(self, X): If the number of columns from `X` differs from the number of columns when fitting. """ check_is_fitted(self, "spline_transformer_") - X = check_array( - X, - force_all_finite=False, - dtype=FLOAT_DTYPES, - estimator=self, - ) - if X.shape[1] != self.n_features_in_: - raise ValueError("Number of features going into .transform() do not match number going into .fit().") - + X = validate_data(self, X=X, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=False) + _check_n_features(self, X, reset=False) out = [] for col in range(X.shape[1]): out.append( diff --git a/sklego/preprocessing/outlier_remover.py b/sklego/preprocessing/outlier_remover.py index bbc84327..29ed9809 100644 --- a/sklego/preprocessing/outlier_remover.py +++ b/sklego/preprocessing/outlier_remover.py @@ -1,6 +1,7 @@ from sklearn import clone from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data from sklego.common import TrainOnlyTransformerMixin @@ -68,6 +69,7 @@ def fit(self, X, y=None): if self.refit: super().fit(X, y) self.estimator_.fit(X, y) + _check_n_features(self, X, reset=True) return self def transform_train(self, X): @@ -84,6 +86,9 @@ def transform_train(self, X): The data with the outliers removed, where `n_not_outliers = n_samples - n_outliers`. """ check_is_fitted(self, "estimator_") + _check_n_features(self, X, reset=False) + predictions = self.estimator_.predict(X) - check_array(predictions, estimator=self.outlier_detector, ensure_2d=False) + validate_data(self.outlier_detector, X=predictions, ensure_2d=False, reset=False) + return X[predictions != -1] diff --git a/sklego/preprocessing/projections.py b/sklego/preprocessing/projections.py index cfb41a5d..bcd6332f 100644 --- a/sklego/preprocessing/projections.py +++ b/sklego/preprocessing/projections.py @@ -1,8 +1,8 @@ import narwhals.stable.v1 as nw import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data from sklego.common import as_list @@ -66,10 +66,8 @@ def fit(self, X, y=None): self : OrthogonalTransformer The fitted transformer. """ - X = check_array(X, estimator=self) - - if not X.shape[0] > 1: - raise ValueError("Orthogonal transformation not valid for one sample") + X = validate_data(self, X=X, ensure_min_samples=2, reset=True) + _check_n_features(self, X, reset=True) # Q, R such that X = Q*R, with Q orthogonal, from which follows Q = X*inv(R) Q, R = np.linalg.qr(X) @@ -95,12 +93,14 @@ def transform(self, X): array-like of shape (n_samples, n_features) The transformed data. """ + if self.normalize: check_is_fitted(self, ["inv_R_", "normalization_vector_"]) else: check_is_fitted(self, ["inv_R_"]) - X = check_array(X, estimator=self) + X = validate_data(self, X=X, reset=False) + _check_n_features(self, X, reset=False) return X @ self.inv_R_ / self.normalization_vector_ @@ -235,7 +235,9 @@ def fit(self, X, y=None): """ self._check_coltype(X) self.col_ids_ = [v if isinstance(v, int) else self._col_idx(X, v) for v in as_list(self.columns)] - X = check_array(X, estimator=self) + X = validate_data(self, X=X, reset=True) + _check_n_features(self, X, reset=True) + X_fair = X.copy() v_vectors = self._make_v_vectors(X, self.col_ids_) # gram smidt process but only on sensitive attributes @@ -269,7 +271,9 @@ def transform(self, X): """ check_is_fitted(self, ["projection_", "col_ids_"]) self._check_coltype(X) - X = check_array(X, estimator=self) + X = validate_data(self, X=X, reset=False) + _check_n_features(self, X, reset=False) + # apply the projection and remove the column we won't need X_fair = X @ self.projection_ X_removed = np.delete(X_fair, self.col_ids_, axis=1) diff --git a/sklego/preprocessing/randomadder.py b/sklego/preprocessing/randomadder.py index c1a79f39..f5fb83f0 100644 --- a/sklego/preprocessing/randomadder.py +++ b/sklego/preprocessing/randomadder.py @@ -1,8 +1,8 @@ from warnings import warn from sklearn.base import BaseEstimator -from sklearn.utils import check_array, check_X_y from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_random_state +from sklearn_compat.utils.validation import _check_n_features, validate_data from sklego.common import TrainOnlyTransformerMixin @@ -69,8 +69,8 @@ def fit(self, X, y): The fitted transformer. """ super().fit(X, y) - X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES) - self.n_features_in_ = X.shape[1] + X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True) + _check_n_features(self, X, reset=True) return self @@ -89,8 +89,8 @@ def transform_train(self, X): """ rs = check_random_state(self.random_state) check_is_fitted(self, ["n_features_in_"]) - - X = check_array(X, estimator=self, dtype=FLOAT_DTYPES) + X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False) + _check_n_features(self, X, reset=False) return X + rs.normal(0, self.noise, size=X.shape) @@ -104,3 +104,8 @@ def dim_(self): def _more_tags(self): return {"non_deterministic": True} + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.non_deterministic = True + return tags diff --git a/sklego/preprocessing/repeatingbasis.py b/sklego/preprocessing/repeatingbasis.py index 5bcb1b9f..8ef9fe6b 100644 --- a/sklego/preprocessing/repeatingbasis.py +++ b/sklego/preprocessing/repeatingbasis.py @@ -1,8 +1,8 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin from sklearn.compose import ColumnTransformer -from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted +from sklearn_compat.utils.validation import _check_n_features, validate_data class RepeatingBasisFunction(TransformerMixin, BaseEstimator): @@ -163,7 +163,8 @@ def fit(self, X, y=None): self : _RepeatingBasisFunction The fitted transformer. """ - X = check_array(X, estimator=self) + X = validate_data(self, X=X, ensure_2d=True, reset=True) + _check_n_features(self, X, reset=True) # find min and max for standardization if not given explicitly if self.input_range is None: @@ -195,11 +196,9 @@ def transform(self, X): ValueError If X has more than one column, as this transformer only accepts one feature as input. """ - X = check_array(X, estimator=self, ensure_2d=True) check_is_fitted(self, ["bases_", "width_"]) - # This transformer only accepts one feature as input - if X.shape[1] != 1: - raise ValueError(f"X should have exactly one column, it has: {X.shape[1]}") + X = validate_data(self, X=X, ensure_2d=True, reset=False) + _check_n_features(self, X, reset=False) # MinMax Scale to 0-1 X = (X - self.input_range[0]) / (self.input_range[1] - self.input_range[0]) diff --git a/tests/test_preprocessing/test_columncapper.py b/tests/test_preprocessing/test_columncapper.py index 455a28cc..98faa476 100644 --- a/tests/test_preprocessing/test_columncapper.py +++ b/tests/test_preprocessing/test_columncapper.py @@ -15,11 +15,11 @@ def test_sklearn_compatible_estimator(estimator, check): def test_quantile_range(): def expect_type_error(quantile_range): with pytest.raises(TypeError): - ColumnCapper(quantile_range) + ColumnCapper(quantile_range).fit([]) def expect_value_error(quantile_range): with pytest.raises(ValueError): - ColumnCapper(quantile_range) + ColumnCapper(quantile_range).fit([]) # Testing quantile_range type expect_type_error(quantile_range=1) @@ -49,7 +49,7 @@ def test_interpolation(): for interpolation in invalid_interpolations: with pytest.raises(ValueError): - ColumnCapper(interpolation=interpolation) + ColumnCapper(interpolation=interpolation).fit([]) @pytest.fixture()