preprocessing module

Commit 8052bc4 · koaning · Dec 15, 2024
1 parent: 8cfa6c7
Show file tree

Hide file tree

Showing 10 changed files with 81 additions and 87 deletions.
diff --git a/sklego/preprocessing/columncapper.py b/sklego/preprocessing/columncapper.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class ColumnCapper(TransformerMixin, BaseEstimator):
@@ -123,7 +123,9 @@ def fit(self, X, y=None):
         """
         self._check_quantile_range(self.quantile_range)
         self._check_interpolation(self.interpolation)
-        X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self)
+
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=True, ensure_all_finite=False, reset=True)
+        _check_n_features(self, X, reset=True)
 
         # If X contains infs, we need to replace them by nans before computing quantiles
         np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)
@@ -138,9 +140,6 @@ def fit(self, X, y=None):
         q = [quantile_limit / 100 for quantile_limit in self.quantile_range]
         self.quantiles_ = np.nanquantile(a=X, q=q, axis=0, overwrite_input=True, method=self.interpolation)
 
-        # Saving the number of columns to ensure coherence between fit and transform inputs
-        self.n_features_in_ = X.shape[1]
-
         return self
 
     def transform(self, X):
@@ -161,17 +160,9 @@ def transform(self, X):
         ValueError
             If the number of columns from `X` differs from the number of columns when fitting.
         """
-        check_is_fitted(self, "quantiles_")
-        X = check_array(
-            X,
-            copy=self.copy,
-            force_all_finite=False,
-            dtype=FLOAT_DTYPES,
-            estimator=self,
-        )
-
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError("X must have the same number of columns in fit and transform")
+        check_is_fitted(self, ["quantiles_"])
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, copy=self.copy, ensure_all_finite=False, reset=False)
+        _check_n_features(self, X, reset=False)
 
         if self.discard_infs:
             np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)
@@ -244,3 +235,8 @@ def n_columns_(self):
 
     def _more_tags(self):
         return {"allow_nan": True}
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        return tags
diff --git a/sklego/preprocessing/dictmapper.py b/sklego/preprocessing/dictmapper.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class DictMapper(TransformerMixin, BaseEstimator):
@@ -74,15 +74,9 @@ def fit(self, X, y=None):
         self : DictMapper
             The fitted transformer.
         """
-        X = check_array(
-            X,
-            copy=True,
-            estimator=self,
-            force_all_finite=False,
-            dtype=None,
-            ensure_2d=True,
-        )
-        self.n_features_in_ = X.shape[1]
+        X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=True)
+        _check_n_features(self, X, reset=True)
+
         return self
 
     def transform(self, X):
@@ -104,17 +98,8 @@ def transform(self, X):
             If the number of columns from `X` differs from the number of columns when fitting.
         """
         check_is_fitted(self, ["n_features_in_"])
-        X = check_array(
-            X,
-            copy=True,
-            estimator=self,
-            force_all_finite=False,
-            dtype=None,
-            ensure_2d=True,
-        )
-
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"number of columns {X.shape[1]} does not match fit size {self.n_features_in_}")
+        X = validate_data(self, X=X, copy=True, dtype=None, ensure_2d=True, ensure_all_finite=False, reset=False)
+        _check_n_features(self, X, reset=False)
         return np.vectorize(self.mapper.get, otypes=[int])(X, self.default)
 
     @property
@@ -127,3 +112,10 @@ def dim_(self):
 
     def _more_tags(self):
         return {"preserves_dtype": None, "allow_nan": True, "no_validation": True}
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.transformer_tags.preserves_dtype = []
+        tags.input_tags.allow_nan = True
+        tags.no_validation = True
+        return tags
diff --git a/sklego/preprocessing/identitytransformer.py b/sklego/preprocessing/identitytransformer.py
@@ -1,6 +1,6 @@
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class IdentityTransformer(TransformerMixin, BaseEstimator):
@@ -68,7 +68,8 @@ def fit(self, X, y=None):
             The fitted transformer.
         """
         if self.check_X:
-            X = check_array(X, copy=True, estimator=self)
+            X = validate_data(self, X=X, copy=True, reset=True)
+        _check_n_features(self, X, reset=True)
         self.n_samples_, self.n_features_in_ = X.shape
         return self
 
@@ -90,13 +91,11 @@ def transform(self, X):
         ValueError
             If the number of columns from `X` differs from the number of columns when fitting.
         """
-        if self.check_X:
-            X = check_array(X, copy=True, estimator=self)
         check_is_fitted(self, "n_features_in_")
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(
-                f"Wrong shape is passed to transform. Trained on {self.n_features_in_} cols got {X.shape[1]}"
-            )
+
+        if self.check_X:
+            X = validate_data(self, X=X, copy=True, reset=False)
+        _check_n_features(self, X, reset=False)
         return X
 
     @property

diff --git a/sklego/preprocessing/intervalencoder.py b/sklego/preprocessing/intervalencoder.py
@@ -9,8 +9,8 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 def _mk_monotonic_average(xs, ys, intervals, method="increasing", **kwargs):
@@ -156,7 +156,9 @@ def fit(self, X, y):
 
         # these two matrices will have shape (columns, quantiles)
         # quantiles indicate where the interval split occurs
-        X, y = check_X_y(X, y, estimator=self)
+        X, y = validate_data(self, X=X, y=y, reset=True)
+        _check_n_features(self, X, reset=True)
+
         self.quantiles_ = np.zeros((X.shape[1], self.n_chunks))
         # heights indicate what heights these intervals will have
         self.heights_ = np.zeros((X.shape[1], self.n_chunks))
@@ -194,9 +196,9 @@ def transform(self, X):
             If the number of columns from `X` differs from the number of columns when fitting.
         """
         check_is_fitted(self, ["quantiles_", "heights_", "n_features_in_"])
-        X = check_array(X, estimator=self)
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"fitted on {self.n_features_in_} features but received {X.shape[1]}")
+        X = validate_data(self, X=X, reset=False)
+        _check_n_features(self, X, reset=False)
+
         transformed = np.zeros(X.shape)
         for col in range(transformed.shape[1]):
             transformed[:, col] = np.interp(X[:, col], self.quantiles_[col, :], self.heights_[col, :])

diff --git a/sklego/preprocessing/monotonicspline.py b/sklego/preprocessing/monotonicspline.py
@@ -1,8 +1,8 @@
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import SplineTransformer
-from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class MonotonicSplineTransformer(TransformerMixin, BaseEstimator):
@@ -52,16 +52,15 @@ def fit(self, X, y=None):
         ValueError
             If `X` contains non-numeric columns.
         """
-        X = check_array(X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self)
-
+        X = validate_data(self, X=X, copy=True, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=True)
+        _check_n_features(self, X, reset=True)
         # If X contains infs, we need to replace them by nans before computing quantiles
         self.spline_transformer_ = {
             col: SplineTransformer(n_knots=self.n_knots, degree=self.degree, knots=self.knots).fit(
                 X[:, col].reshape(-1, 1)
             )
             for col in range(X.shape[1])
         }
-        self.n_features_in_ = X.shape[1]
         return self
 
     def transform(self, X):
@@ -82,15 +81,8 @@ def transform(self, X):
             If the number of columns from `X` differs from the number of columns when fitting.
         """
         check_is_fitted(self, "spline_transformer_")
-        X = check_array(
-            X,
-            force_all_finite=False,
-            dtype=FLOAT_DTYPES,
-            estimator=self,
-        )
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError("Number of features going into .transform() do not match number going into .fit().")
-
+        X = validate_data(self, X=X, ensure_all_finite=False, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
         out = []
         for col in range(X.shape[1]):
             out.append(

diff --git a/sklego/preprocessing/outlier_remover.py b/sklego/preprocessing/outlier_remover.py
@@ -1,6 +1,7 @@
 from sklearn import clone
 from sklearn.base import BaseEstimator
-from sklearn.utils.validation import check_array, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 from sklego.common import TrainOnlyTransformerMixin
 
@@ -68,6 +69,7 @@ def fit(self, X, y=None):
         if self.refit:
             super().fit(X, y)
             self.estimator_.fit(X, y)
+        _check_n_features(self, X, reset=True)
         return self
 
     def transform_train(self, X):
@@ -84,6 +86,9 @@ def transform_train(self, X):
             The data with the outliers removed, where `n_not_outliers = n_samples - n_outliers`.
         """
         check_is_fitted(self, "estimator_")
+        _check_n_features(self, X, reset=False)
+
         predictions = self.estimator_.predict(X)
-        check_array(predictions, estimator=self.outlier_detector, ensure_2d=False)
+        validate_data(self.outlier_detector, X=predictions, ensure_2d=False, reset=False)
+
         return X[predictions != -1]
diff --git a/sklego/preprocessing/projections.py b/sklego/preprocessing/projections.py
@@ -1,8 +1,8 @@
 import narwhals.stable.v1 as nw
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 from sklego.common import as_list
 
@@ -66,10 +66,8 @@ def fit(self, X, y=None):
         self : OrthogonalTransformer
             The fitted transformer.
         """
-        X = check_array(X, estimator=self)
-
-        if not X.shape[0] > 1:
-            raise ValueError("Orthogonal transformation not valid for one sample")
+        X = validate_data(self, X=X, ensure_min_samples=2, reset=True)
+        _check_n_features(self, X, reset=True)
 
         # Q, R such that X = Q*R, with Q orthogonal, from which follows Q = X*inv(R)
         Q, R = np.linalg.qr(X)
@@ -95,12 +93,14 @@ def transform(self, X):
         array-like of shape (n_samples, n_features)
             The transformed data.
         """
+
         if self.normalize:
             check_is_fitted(self, ["inv_R_", "normalization_vector_"])
         else:
             check_is_fitted(self, ["inv_R_"])
 
-        X = check_array(X, estimator=self)
+        X = validate_data(self, X=X, reset=False)
+        _check_n_features(self, X, reset=False)
 
         return X @ self.inv_R_ / self.normalization_vector_
 
@@ -235,7 +235,9 @@ def fit(self, X, y=None):
         """
         self._check_coltype(X)
         self.col_ids_ = [v if isinstance(v, int) else self._col_idx(X, v) for v in as_list(self.columns)]
-        X = check_array(X, estimator=self)
+        X = validate_data(self, X=X, reset=True)
+        _check_n_features(self, X, reset=True)
+
         X_fair = X.copy()
         v_vectors = self._make_v_vectors(X, self.col_ids_)
         # gram smidt process but only on sensitive attributes
@@ -269,7 +271,9 @@ def transform(self, X):
         """
         check_is_fitted(self, ["projection_", "col_ids_"])
         self._check_coltype(X)
-        X = check_array(X, estimator=self)
+        X = validate_data(self, X=X, reset=False)
+        _check_n_features(self, X, reset=False)
+
         # apply the projection and remove the column we won't need
         X_fair = X @ self.projection_
         X_removed = np.delete(X_fair, self.col_ids_, axis=1)

diff --git a/sklego/preprocessing/randomadder.py b/sklego/preprocessing/randomadder.py
@@ -1,8 +1,8 @@
 from warnings import warn
 
 from sklearn.base import BaseEstimator
-from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_random_state
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 from sklego.common import TrainOnlyTransformerMixin
 
@@ -69,8 +69,8 @@ def fit(self, X, y):
             The fitted transformer.
         """
         super().fit(X, y)
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
-        self.n_features_in_ = X.shape[1]
+        X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
+        _check_n_features(self, X, reset=True)
 
         return self
 
@@ -89,8 +89,8 @@ def transform_train(self, X):
         """
         rs = check_random_state(self.random_state)
         check_is_fitted(self, ["n_features_in_"])
-
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
 
         return X + rs.normal(0, self.noise, size=X.shape)
 
@@ -104,3 +104,8 @@ def dim_(self):
 
     def _more_tags(self):
         return {"non_deterministic": True}
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.non_deterministic = True
+        return tags