koaning · FBruzzesi · Nov 19, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/sklego/__init__.py b/sklego/__init__.py
@@ -1,9 +1,14 @@
+import re
 import sys
 
 if sys.version_info >= (3, 8):
     from importlib import metadata
 else:
     import importlib_metadata as metadata
 
+
 __title__ = "sklego"
 __version__ = metadata.version("scikit-lego")
+
+# Numeric parts of the installed scikit-learn version; a part without any digit (e.g. in "+local.abc") maps to 0.
+SKLEARN_VERSION = tuple(int(re.sub(r"\D", "", v) or 0) for v in metadata.version("scikit-learn").split("."))
diff --git a/sklego/common.py b/sklego/common.py
@@ -7,6 +7,8 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
+from sklego import SKLEARN_VERSION
+
 
 class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator):
     """Mixin class for transformers that can handle training and test data differently.
@@ -79,9 +81,9 @@ def fit(self, X, y=None):
             The fitted transformer.
         """
         if y is None:
-            check_array(X, estimator=self)
+            validate_data(self, X)
         else:
-            check_X_y(X, y, estimator=self, multi_output=True)
+            validate_data(self, X, y, multi_output=True)
         self.X_hash_ = self._hash(X)
         self.n_features_in_ = X.shape[1]
         return self
@@ -145,7 +147,7 @@ def transform(self, X, y=None):
             If the input dimension does not match the training dimension.
         """
         check_is_fitted(self, ["X_hash_", "n_features_in_"])
-        check_array(X, estimator=self)
+        validate_data(self, X, reset=False)
 
         if X.shape[1] != self.n_features_in_:
             raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
@@ -339,3 +341,75 @@ def sliding_window(sequence, window_size, step_size):
     ```
     """
     return (sequence[pos : pos + window_size] for pos in range(0, len(sequence), step_size))
+
+
+def validate_data(
+    estimator,
+    X="no_validation",
+    y="no_validation",
+    reset=True,
+    validate_separately=False,
+    skip_check_array=False,
+    y_required=False,
+    **check_params,
+):
+    """Validate `X` (and optionally `y`) across scikit-learn versions.
+
+    With scikit-learn >= 1.6 the call is forwarded to `sklearn.utils.validation.validate_data`. With older versions
+    the data is checked with `check_array` (only `X`) or `check_X_y` (both `X` and `y`).
+
+    Parameters
+    ----------
+    estimator : estimator instance
+        Estimator the data is validated for.
+    X : array-like or "no_validation", default="no_validation"
+        Input samples. The default sentinel means that `X` is not provided.
+    y : array-like, None or "no_validation", default="no_validation"
+        Targets. With scikit-learn < 1.6, `None` and the default sentinel both mean that only `X` is checked.
+    reset : bool, default=True
+        Forwarded to scikit-learn >= 1.6, ignored otherwise.
+    validate_separately : default=False
+        Forwarded to scikit-learn >= 1.6, ignored otherwise.
+    skip_check_array : bool, default=False
+        Forwarded to scikit-learn >= 1.6, ignored otherwise.
+    y_required : bool, default=False
+        Only used with scikit-learn < 1.6: if `True`, passing `y=None` raises a `ValueError`.
+    **check_params : dict
+        Extra keyword arguments handed to the underlying validation function.
+
+    Returns
+    -------
+    object
+        Whatever the underlying validation function returns.
+
+    Raises
+    ------
+    ValueError
+        With scikit-learn < 1.6, if `y` is `None` while `y_required=True`, or if `X` is not provided.
+    """
+    if SKLEARN_VERSION >= (1, 6):
+        # Aliased so the import does not shadow this wrapper's own name inside the function.
+        from sklearn.utils.validation import validate_data as _sklearn_validate_data
+
+        return _sklearn_validate_data(
+            estimator,
+            X=X,
+            y=y,
+            reset=reset,
+            validate_separately=validate_separately,
+            skip_check_array=skip_check_array,
+            **check_params,
+        )
+
+    if y is None and y_required:
+        msg = f"This {estimator.__class__.__name__} estimator requires y to be passed, but the target y is None."
+        raise ValueError(msg)
+
+    if isinstance(X, str) and X == "no_validation":
+        # `check_array`/`check_X_y` cannot work without X: fail with an explicit message instead.
+        msg = "Validation without X is only supported with scikit-learn >= 1.6."
+        raise ValueError(msg)
+
+    if y is None or (isinstance(y, str) and y == "no_validation"):
+        return check_array(X, estimator=estimator, **check_params)
+    return check_X_y(X=X, y=y, estimator=estimator, **check_params)
diff --git a/sklego/decomposition/pca_reconstruction.py b/sklego/decomposition/pca_reconstruction.py
@@ -1,7 +1,9 @@
 import numpy as np
 from sklearn.base import BaseEstimator, OutlierMixin
 from sklearn.decomposition import PCA
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+
+from sklego.common import validate_data
 
 
 class PCAOutlierDetection(OutlierMixin, BaseEstimator):
@@ -94,7 +96,7 @@ def fit(self, X, y=None):
         ValueError
             If `threshold` is `None`.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         if not self.threshold:
             raise ValueError("The `threshold` value cannot be `None`.")
 
@@ -157,7 +159,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data. 1 for inliers, -1 for outliers.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)  # predict-time data must not reset fitted state
         check_is_fitted(self, ["pca_", "offset_"])
         result = np.ones(X.shape[0])
         result[self.difference(X) > self.threshold] = -1

diff --git a/sklego/decomposition/umap_reconstruction.py b/sklego/decomposition/umap_reconstruction.py
@@ -8,7 +8,9 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, OutlierMixin
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+
+from sklego.common import validate_data
 
 
 class UMAPOutlierDetection(OutlierMixin, BaseEstimator):
@@ -100,9 +102,9 @@ def fit(self, X, y=None):
             - If `n_components` is less than 2.
             - If `threshold` is `None`.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         if y is not None:
-            y = check_array(y, estimator=self, ensure_2d=False)
+            y = validate_data(self, y, ensure_2d=False)
 
         if not self.threshold:
             raise ValueError("The `threshold` value cannot be `None`.")
@@ -133,6 +135,7 @@ def difference(self, X):
             The calculated difference.
         """
         check_is_fitted(self, ["umap_", "offset_"])
+
         reduced = self.umap_.transform(X)
         diff = np.sum(np.abs(self.umap_.inverse_transform(reduced) - X), axis=1)
         if self.variant == "relative":
@@ -155,7 +158,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data. 1 for inliers, -1 for outliers.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
         check_is_fitted(self, ["umap_", "offset_"])
         result = np.ones(X.shape[0])
         result[self.difference(X) > self.threshold] = -1
@@ -172,3 +175,13 @@ def score_samples(self, X):
 
     def _more_tags(self):
         return {"non_deterministic": True}
+
+    def __sklearn_tags__(self):
+        """Return the parent tags with `non_deterministic` set to `True` (only for scikit-learn >= 1.6)."""
+        from sklego import SKLEARN_VERSION
+
+        if SKLEARN_VERSION < (1, 6):
+            return None  # tags are only assembled for scikit-learn >= 1.6
+        sk_tags = super().__sklearn_tags__()
+        sk_tags.non_deterministic = True
+        return sk_tags
diff --git a/sklego/dummy.py b/sklego/dummy.py
@@ -2,14 +2,14 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, RegressorMixin
-from sklearn.utils import check_X_y
 from sklearn.utils.validation import (
     FLOAT_DTYPES,
-    check_array,
     check_is_fitted,
     check_random_state,
 )
 
+from sklego.common import validate_data
+
 
 class RandomRegressor(RegressorMixin, BaseEstimator):
     """A `RandomRegressor` makes random predictions only based on the `y` value that is seen.
@@ -72,7 +72,7 @@ def fit(self, X: np.array, y: np.array) -> "RandomRegressor":
         """
         if self.strategy not in self._ALLOWED_STRATEGIES:
             raise ValueError(f"strategy {self.strategy} is not in {self._ALLOWED_STRATEGIES}")
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, y_required=True)
         self.n_features_in_ = X.shape[1]
 
         self.min_ = np.min(y)
@@ -99,9 +99,9 @@ def predict(self, X):
         rs = check_random_state(self.random_state)
         check_is_fitted(self, ["n_features_in_", "min_", "max_", "mu_", "sigma_"])
 
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
         if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}")
+            raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
 
         if self.strategy == "normal":
             return rs.normal(self.mu_, self.sigma_, X.shape[0])
@@ -127,3 +127,14 @@ def allowed_strategies(self):
 
     def _more_tags(self):
         return {"poor_score": True, "non_deterministic": True}
+
+    def __sklearn_tags__(self):
+        """Return the parent tags flagged as non-deterministic with a poor score (only for scikit-learn >= 1.6)."""
+        from sklego import SKLEARN_VERSION
+
+        if SKLEARN_VERSION < (1, 6):
+            return None  # tags are only assembled for scikit-learn >= 1.6
+        sk_tags = super().__sklearn_tags__()
+        sk_tags.non_deterministic = True
+        sk_tags.regressor_tags.poor_score = True
+        return sk_tags
diff --git a/sklego/feature_selection/mrmr.py b/sklego/feature_selection/mrmr.py
@@ -4,7 +4,9 @@
 from sklearn.base import BaseEstimator
 from sklearn.feature_selection import f_classif, f_regression
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted
+
+from sklego.common import validate_data
 
 
 def _redundancy_pearson(X, selected, left):
@@ -201,7 +203,8 @@ def fit(self, X, y):
 
                 k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1
         """
-        X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
+        X, y = validate_data(self, X, y, dtype="numeric", y_numeric=True, y_required=True)
+
         self._y_dtype = y.dtype
 
         relevance = self._get_relevance

diff --git a/sklego/linear_model.py b/sklego/linear_model.py
@@ -17,15 +17,17 @@
 from sklearn.linear_model._base import LinearClassifierMixin
 from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
 from sklearn.preprocessing import LabelEncoder
-from sklearn.utils import check_X_y
 from sklearn.utils.validation import (
     FLOAT_DTYPES,
     _check_sample_weight,
     check_array,
     check_is_fitted,
+    check_X_y,
     column_or_1d,
 )
 
+from sklego.common import validate_data
+
 
 class LowessRegression(RegressorMixin, BaseEstimator):
     """`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of
@@ -96,7 +98,7 @@ def fit(self, X, y):
             - If `span` is not between 0 and 1.
             - If `sigma` is negative.
         """
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, y_required=True)
         if self.span is not None:
             if not 0 <= self.span <= 1:
                 raise ValueError(f"Param `span` must be 0 <= span <= 1, got: {self.span}")
@@ -138,7 +140,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted values.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
         check_is_fitted(self, ["X_", "y_"])
 
         try:
@@ -233,7 +235,7 @@ def fit(self, X, y):
         self : ProbWeightRegression
             The fitted estimator.
         """
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, y_required=True)
 
         # Construct the problem.
         betas = cp.Variable(X.shape[1])
@@ -263,7 +265,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
         check_is_fitted(self, ["coef_"])
         return np.dot(X, self.coef_)
 
@@ -381,7 +383,7 @@ def fit(self, X, y):
         ValueError
             If `effect` is not one of "linear", "quadratic" or "constant".
         """
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, y_required=True)
         if self.effect not in self._ALLOWED_EFFECTS:
             raise ValueError(f"effect {self.effect} must be in {self._ALLOWED_EFFECTS}")
 
@@ -458,7 +460,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
         check_is_fitted(self, ["coef_"])
         return np.dot(X, self.coef_)
 
@@ -579,7 +581,7 @@ def fit(self, X, y):
         if isinstance(X, nw.DataFrame):
             self.sensitive_col_idx_ = [i for i, name in enumerate(X.columns) if name in self.sensitive_cols]
 
-        X, y = check_X_y(X, y, accept_large_sparse=False)
+        X, y = check_X_y(X, y, accept_large_sparse=False, estimator=self)
         sensitive = X[:, self.sensitive_col_idx_]
 
         if not self.train_sensitive_cols:
@@ -681,6 +683,16 @@ def decision_function(self, X):
     def _more_tags(self):
         return {"poor_score": True}
 
+    def __sklearn_tags__(self):
+        """Return the parent tags with the classifier `poor_score` flag set (only for scikit-learn >= 1.6)."""
+        from sklego import SKLEARN_VERSION
+
+        if SKLEARN_VERSION < (1, 6):
+            return None  # tags are only assembled for scikit-learn >= 1.6
+        sk_tags = super().__sklearn_tags__()
+        sk_tags.classifier_tags.poor_score = True
+        return sk_tags
+
 
 class DemographicParityClassifier(LinearClassifierMixin, BaseEstimator):
     r"""`DemographicParityClassifier` is a logistic regression classifier which can be constrained on demographic
@@ -970,8 +982,6 @@ def __init__(
         self.fit_intercept = fit_intercept
         self.copy_X = copy_X
         self.positive = positive
-        if method not in ("SLSQP", "TNC", "L-BFGS-B"):
-            raise ValueError(f'method should be one of "SLSQP", "TNC", "L-BFGS-B", ' f"got {method} instead")
         self.method = method
 
     @abstractmethod
@@ -1021,6 +1031,9 @@ def fit(self, X, y, sample_weight=None):
         self : BaseScipyMinimizeRegressor
             Fitted linear model.
         """
+        if self.method not in {"SLSQP", "TNC", "L-BFGS-B"}:
+            msg = f"method should be one of 'SLSQP', 'TNC', 'L-BFGS-B', got {self.method} instead"
+            raise ValueError(msg)
         X_, grad_loss, loss = self._prepare_inputs(X, sample_weight, y)
 
         d = X_.shape[1] - self.n_features_in_  # This is either zero or one.
@@ -1051,7 +1064,7 @@ def _prepare_inputs(self, X, sample_weight, y):
         This method is called by `fit` to prepare the inputs for the optimization problem. It adds an intercept column
         to `X` if `fit_intercept=True`, and returns the loss function and its gradient.
         """
-        X, y = check_X_y(X, y, y_numeric=True)
+        X, y = validate_data(self, X, y, y_numeric=True, y_required=True)
         sample_weight = _check_sample_weight(sample_weight, X)
         self.n_features_in_ = X.shape[1]
 
@@ -1081,7 +1094,7 @@ def predict(self, X):
             The predicted data.
         """
         check_is_fitted(self)
-        X = check_array(X)
+        X = validate_data(self, X, reset=False)
 
         return X @ self.coef_ + self.intercept_