scikit-learn-contrib · rosecers · Jun 1, 2023 · May 30, 2023 · Jun 1, 2023 · Jun 1, 2023
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ __pycache__
 *.egg-info
 *.swp
 *.swo
+*DS_Store
 
 .tox/
 build/

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
@@ -12,9 +12,8 @@
 from scipy.sparse.linalg import eigsh
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils import check_array, check_random_state, safe_mask
-from sklearn.utils._tags import _safe_tags
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils import check_array, check_random_state, check_X_y, safe_mask
+from sklearn.utils.validation import FLOAT_DTYPES, as_float_array, check_is_fitted
 
 from .utils import (
     X_orthogonalizer,
@@ -125,7 +124,6 @@ def fit(self, X, y=None, warm_start=False):
         -------
         self : object
         """
-        tags = self._get_tags()
 
         if self.selection_type == "feature":
             self._axis = 1
@@ -144,28 +142,28 @@ def fit(self, X, y=None, warm_start=False):
         elif self.progress_bar is False:
             self.report_progress_ = no_progress_bar
 
-        params = dict(
-            accept_sparse="csc",
-            force_all_finite=not tags.get("allow_nan", True),
-        )
-        if self._axis == 1:
-            params["ensure_min_features"] = 2
-        else:
-            params["ensure_min_samples"] = 2
+        params = dict(ensure_min_samples=2, ensure_min_features=2, dtype=FLOAT_DTYPES)
 
-        if y is not None:
-            params["multi_output"] = True
+        if hasattr(self, "mixing") or y is not None:
             X, y = self._validate_data(X, y, **params)
+            X, y = check_X_y(X, y, multi_output=True)
 
             if len(y.shape) == 1:
                 # force y to have multi_output 2D format even when it's 1D, since
                 # many functions, most notably PCov routines, assume an array storage
                 # format, most notably to compute (y @ y.T)
                 y = y.reshape((len(y), 1))
+
         else:
             X = check_array(X, **params)
 
+        if self.full and self.score_threshold is not None:
+            raise ValueError(
+                "You cannot specify both `score_threshold` and `full=True`."
+            )
+
         n_to_select_from = X.shape[self._axis]
+        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 
@@ -243,22 +241,27 @@ def transform(self, X, y=None):
             The selected subset of the input.
         """
 
-        if len(X.shape) == 1:
-            X = X.reshape(-1, 1)
+        check_is_fitted(self, ["_axis", "selected_idx_", "n_selected_"])
+
+        if self._axis == 0:
+            raise ValueError(
+                "Transform is not currently supported for sample selection."
+            )
 
         mask = self.get_support()
 
-        # note: we use _safe_tags instead of _get_tags because this is a
-        # public Mixin.
-        X = self._validate_data(
-            X,
-            dtype=None,
-            accept_sparse="csr",
-            force_all_finite=not _safe_tags(self, key="allow_nan"),
-            reset=False,
-            ensure_2d=self._axis,
-        )
+        X = check_array(X)
 
+        if len(X.shape) == 1:
+            if self._axis == 0:
+                X = X.reshape(-1, 1)
+            else:
+                X = X.reshape(1, -1)
+
+        if len(mask) != X.shape[self._axis]:
+            raise ValueError(
+                "X has a different shape than during fitting. Reshape your data."
+            )
         if self._axis == 1:
             return X[:, safe_mask(X, mask)]
         else:
@@ -517,7 +520,7 @@ def _init_greedy_search(self, X, y, n_to_select):
         features and computes their initial importance score.
         """
 
-        self.X_current_ = X.copy()
+        self.X_current_ = as_float_array(X.copy())
         self.pi_ = self._compute_pi(self.X_current_)
 
         super()._init_greedy_search(X, y, n_to_select)

diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py
@@ -130,6 +130,8 @@ class PCovR(_BasePCA, LinearModel):
          Used when the 'arpack' or 'randomized' solvers are used. Pass an int
          for reproducible results across multiple function calls.
 
+    whiten : bool, default=False, deprecated
+
     Attributes
     ----------
 
@@ -202,12 +204,13 @@ def __init__(
         regressor=None,
         iterated_power="auto",
         random_state=None,
+        whiten=False,
     ):
         self.mixing = mixing
         self.n_components = n_components
         self.space = space
 
-        self.whiten = False
+        self.whiten = whiten
         self.svd_solver = svd_solver
         self.tol = tol
         self.iterated_power = iterated_power

diff --git a/src/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py
@@ -2,6 +2,8 @@
 from scipy.linalg import orthogonal_procrustes
 from sklearn.base import MultiOutputMixin, RegressorMixin
 from sklearn.linear_model import LinearRegression
+from sklearn.utils import check_array, check_X_y
+from sklearn.utils.validation import check_is_fitted
 
 
 class OrthogonalRegression(MultiOutputMixin, RegressorMixin):
@@ -61,6 +63,15 @@ def fit(self, X, y):
             and n_targets is the number of target properties.
         """
 
+        X, y = check_X_y(
+            X,
+            y,
+            y_numeric=True,
+            ensure_min_features=1,
+            ensure_min_samples=1,
+            multi_output=True,
+        )
+
         self.n_samples_in_, self.n_features_in_ = X.shape
         if self.use_orthogonal_projector:
             # check estimator
@@ -71,12 +82,15 @@ def fit(self, X, y):
             )
             # compute orthogonal projectors
             linear_estimator.fit(X, y)
-            U, _, Vt = np.linalg.svd(linear_estimator.coef_.T, full_matrices=False)
-            # project X and y to same dimension
-            X = X @ U
-            y = y @ Vt.T
+            coef = np.reshape(linear_estimator.coef_.T, (X.shape[1], -1))
+            U, _, Vt = np.linalg.svd(coef, full_matrices=False)
+
             # compute weights by solving the Procrustes problem
-            self.coef_ = (U @ orthogonal_procrustes(X, y)[0] @ Vt).T
+            self.coef_ = (
+                U
+                @ orthogonal_procrustes(X @ U, y.reshape(X.shape[0], -1) @ Vt.T)[0]
+                @ Vt
+            ).T
         else:
             self.max_components_ = max(X.shape[1], y.shape[1])
             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
@@ -93,6 +107,9 @@ def predict(self, X):
             Training data, where n_samples is the number of samples
             and n_features is the number of features.
         """
+        X = check_array(X, ensure_min_features=1, ensure_min_samples=1)
+        check_is_fitted(self, ["coef_"])
+
         if not (self.use_orthogonal_projector):
             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
         return X @ self.coef_.T
diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py
@@ -1,11 +1,13 @@
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.base import MultiOutputMixin, RegressorMixin
+from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.metrics import check_scoring
 from sklearn.model_selection import KFold
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted
 
 
-class RidgeRegression2FoldCV(MultiOutputMixin, RegressorMixin):
+class RidgeRegression2FoldCV(BaseEstimator, MultiOutputMixin, RegressorMixin):
     r"""Ridge regression with an efficient 2-fold cross-validation method using the SVD
     solver.
 
@@ -110,6 +112,9 @@ def __init__(
         self.shuffle = shuffle
         self.n_jobs = n_jobs
 
+    def _more_tags(self):
+        return {"multioutput_only": True}
+
     def fit(self, X, y):
         """
         Parameters
@@ -138,6 +143,7 @@ def fit(self, X, y):
                 "[0,1)"
             )
 
+        X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         # check_scoring uses estimators scoring function if the scorer is None, this is
@@ -164,6 +170,11 @@ def predict(self, X):
             Training data, where n_samples is the number of samples
             and n_features is the number of features.
         """
+
+        X = check_array(X)
+
+        check_is_fitted(self, ["coef_"])
+
         return X @ self.coef_.T
 
     def _2fold_cv(self, X, y, fold1_idx, fold2_idx, scorer):

diff --git a/src/skmatter/metrics/_reconstruction_measures.py b/src/skmatter/metrics/_reconstruction_measures.py
@@ -445,7 +445,7 @@ def pointwise_local_reconstruction_error(
 
     scaler.fit(X_train)
     X_train = scaler.transform(X_train)
-    X_test = scaler.transform(X_test)
+    X_test = scaler.transform(X_test).astype(X_train.dtype)
     scaler.fit(Y_train)
     Y_train = scaler.transform(Y_train)
     Y_test = scaler.transform(Y_test)

diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
@@ -135,6 +135,13 @@ def fit(self, X, y=None, sample_weight=None):
             Fitted scaler.
         """
 
+        X = self._validate_data(
+            X,
+            copy=self.copy,
+            estimator=self,
+            dtype=FLOAT_DTYPES,
+            ensure_min_samples=2,
+        )
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if sample_weight is not None:
@@ -157,7 +164,7 @@ def fit(self, X, y=None, sample_weight=None):
                 self.scale_ = np.sqrt(var)
             else:
                 var_sum = var.sum()
-                if var_sum < abs(np.mean(X_mean)) * self.rtol + self.atol:
+                if var_sum < abs(np.average(X_mean)) * self.rtol + self.atol:
                     raise ValueError("Cannot normalize a matrix with zero variance")
                 self.scale_ = np.sqrt(var_sum)
 
@@ -187,11 +194,9 @@ def transform(self, X, y=None, copy=None):
         X = self._validate_data(
             X,
             reset=False,
-            accept_sparse="csr",
             copy=copy,
             estimator=self,
             dtype=FLOAT_DTYPES,
-            force_all_finite="allow-nan",
         )
         check_is_fitted(
             self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"]
@@ -288,7 +293,7 @@ def __init__(self, with_center=True, with_trace=True):
         self.with_trace = with_trace
         super().__init__()
 
-    def fit(self, K=None, y=None, sample_weight=None):
+    def fit(self, K, y=None, sample_weight=None):
         """Fit KernelFlexibleCenterer
 
         Parameters
@@ -310,7 +315,7 @@ def fit(self, K=None, y=None, sample_weight=None):
             Fitted transformer.
         """
 
-        Kc = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False)
+        K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False)
 
         if sample_weight is not None:
             self.sample_weight_ = _check_sample_weight(sample_weight, K, dtype=K.dtype)
@@ -327,20 +332,20 @@ def fit(self, K=None, y=None, sample_weight=None):
             else:
                 super().fit(K, y)
 
-            K_pred_cols = np.average(Kc, weights=self.sample_weight_, axis=1)[
+            K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[
                 :, np.newaxis
             ]
         else:
-            self.K_fit_rows_ = np.zeros(Kc.shape[1])
+            self.K_fit_rows_ = np.zeros(K.shape[1])
             self.K_fit_all_ = 0.0
-            K_pred_cols = np.zeros((Kc.shape[0], 1))
+            K_pred_cols = np.zeros((K.shape[0], 1))
 
         if self.with_trace:
-            Kc -= self.K_fit_rows_
-            Kc -= K_pred_cols
-            Kc += self.K_fit_all_
+            K -= self.K_fit_rows_
+            K -= K_pred_cols
+            K += self.K_fit_all_
 
-            self.scale_ = np.trace(Kc) / Kc.shape[0]
+            self.scale_ = np.trace(K) / K.shape[0]
         else:
             self.scale_ = 1.0
 
@@ -408,7 +413,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params):
         return self.transform(K, copy)
 
 
-class SparseKernelCenterer(TransformerMixin, BaseEstimator):
+class SparseKernelCenterer(TransformerMixin):
     r"""Kernel centering method for sparse kernels, similar to
     KernelFlexibleCenterer.
 

diff --git a/src/skmatter/utils/_orthogonalizers.py b/src/skmatter/utils/_orthogonalizers.py
@@ -56,9 +56,9 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
         if np.linalg.norm(col) < tol:
             warnings.warn("Column vector contains only zeros.", stacklevel=1)
         else:
-            col /= np.linalg.norm(col, axis=0)
+            col = np.divide(col, np.linalg.norm(col, axis=0))
 
-        xnew -= col @ (col.T @ xnew)
+        xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype)
 
     return xnew
 

diff --git a/src/skmatter/utils/_pcovr_utils.py b/src/skmatter/utils/_pcovr_utils.py
@@ -186,7 +186,7 @@ def pcovr_covariance(
         C_Y = C_Y.reshape((C.shape[0], -1))
         C_Y = np.real(C_Y)
 
-        C += (1 - mixing) * C_Y @ C_Y.T
+        C += (1 - mixing) * np.array(C_Y @ C_Y.T, dtype=np.float64)
 
     if mixing > 0:
         C += (mixing) * (X.T @ X)

diff --git a/tests/test_check_estimators.py b/tests/test_check_estimators.py
@@ -0,0 +1,26 @@
+from sklearn.utils.estimator_checks import parametrize_with_checks
+
+from skmatter.decomposition import KernelPCovR, PCovR
+from skmatter.feature_selection import CUR as fCUR
+from skmatter.feature_selection import FPS as fFPS
+from skmatter.feature_selection import PCovCUR as fPCovCUR
+from skmatter.feature_selection import PCovFPS as fPCovFPS
+from skmatter.linear_model import RidgeRegression2FoldCV  # OrthogonalRegression,
+from skmatter.preprocessing import KernelNormalizer, StandardFlexibleScaler
+
+
+@parametrize_with_checks(
+    [
+        KernelPCovR(mixing=0.5),
+        PCovR(mixing=0.5),
+        fCUR(),
+        fFPS(),
+        fPCovCUR(),
+        fPCovFPS(),
+        RidgeRegression2FoldCV(),
+        KernelNormalizer(),
+        StandardFlexibleScaler(),
+    ]
+)
+def test_sklearn_compatible_estimator(estimator, check):
+    check(estimator)
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ __pycache__ @@
     *.egg-info
     *.swp
     *.swo
+    *DS_Store
     .tox/
     build/
@@ Expand Down @@