diff --git a/.gitignore b/.gitignore
index 4c2794f1a..15eba83e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 __pycache__
 *.egg-info
 *.swp
 *.swo
+*DS_Store
 .tox/
 build/
diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index ebb25ecb2..867fc7b03 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -12,9 +12,8 @@
 from scipy.sparse.linalg import eigsh
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils import check_array, check_random_state, safe_mask
-from sklearn.utils._tags import _safe_tags
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils import check_array, check_random_state, check_X_y, safe_mask
+from sklearn.utils.validation import FLOAT_DTYPES, as_float_array, check_is_fitted
 
 from .utils import (
     X_orthogonalizer,
@@ -125,7 +124,6 @@ def fit(self, X, y=None, warm_start=False):
         -------
         self : object
         """
-        tags = self._get_tags()
 
         if self.selection_type == "feature":
             self._axis = 1
@@ -144,28 +142,28 @@ def fit(self, X, y=None, warm_start=False):
         elif self.progress_bar is False:
             self.report_progress_ = no_progress_bar
 
-        params = dict(
-            accept_sparse="csc",
-            force_all_finite=not tags.get("allow_nan", True),
-        )
-        if self._axis == 1:
-            params["ensure_min_features"] = 2
-        else:
-            params["ensure_min_samples"] = 2
+        params = dict(ensure_min_samples=2, ensure_min_features=2, dtype=FLOAT_DTYPES)
 
-        if y is not None:
-            params["multi_output"] = True
+        if hasattr(self, "mixing") or y is not None:
             X, y = self._validate_data(X, y, **params)
+            X, y = check_X_y(X, y, multi_output=True)
 
             if len(y.shape) == 1:
                 # force y to have multi_output 2D format even when it's 1D, since
                 # many functions, most notably PCov routines, assume an array storage
                 # format, most notably to compute (y @ y.T)
                 y = y.reshape((len(y), 1))
+
         else:
             X = check_array(X, **params)
 
+        if self.full and self.score_threshold is not None:
+            raise ValueError(
+                "You cannot specify both `score_threshold` and `full=True`."
+            )
+
         n_to_select_from = X.shape[self._axis]
 
         self.n_samples_in_, self.n_features_in_ = X.shape
@@ -243,22 +241,27 @@ def transform(self, X, y=None):
             The selected subset of the input.
         """
-        if len(X.shape) == 1:
-            X = X.reshape(-1, 1)
+        check_is_fitted(self, ["_axis", "selected_idx_", "n_selected_"])
+
+        if self._axis == 0:
+            raise ValueError(
+                "Transform is not currently supported for sample selection."
+            )
 
         mask = self.get_support()
 
-        # note: we use _safe_tags instead of _get_tags because this is a
-        # public Mixin.
-        X = self._validate_data(
-            X,
-            dtype=None,
-            accept_sparse="csr",
-            force_all_finite=not _safe_tags(self, key="allow_nan"),
-            reset=False,
-            ensure_2d=self._axis,
-        )
+        X = check_array(X)
+
+        if len(X.shape) == 1:
+            if self._axis == 0:
+                X = X.reshape(-1, 1)
+            else:
+                X = X.reshape(1, -1)
+
+        if len(mask) != X.shape[self._axis]:
+            raise ValueError(
+                "X has a different shape than during fitting. Reshape your data."
+            )
 
         if self._axis == 1:
             return X[:, safe_mask(X, mask)]
         else:
@@ -517,7 +520,7 @@ def _init_greedy_search(self, X, y, n_to_select):
         features and computes their initial importance score.
         """
-        self.X_current_ = X.copy()
+        self.X_current_ = as_float_array(X.copy())
         self.pi_ = self._compute_pi(self.X_current_)
 
         super()._init_greedy_search(X, y, n_to_select)
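
With these changes, `fit` routes its input through scikit-learn's standard validation (float dtype, at least two samples and two features) and `transform` is explicitly disabled for sample selection. A minimal sketch of the resulting behavior, using illustrative random data rather than anything from the test suite:

    import numpy as np

    from skmatter.feature_selection import FPS as fFPS
    from skmatter.sample_selection import FPS as sFPS

    X = np.random.RandomState(0).rand(10, 4)

    # feature selection: transform returns the selected columns
    print(fFPS(n_to_select=2).fit(X).transform(X).shape)  # (10, 2)

    # sample selection: transform now raises ValueError;
    # index with selected_idx_ instead
    selector = sFPS(n_to_select=3).fit(X)
    X_selected = X[selector.selected_idx_]  # shape (3, 4)
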
""" - self.X_current_ = X.copy() + self.X_current_ = as_float_array(X.copy()) self.pi_ = self._compute_pi(self.X_current_) super()._init_greedy_search(X, y, n_to_select) diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py index 4b3bb2b97..7e5afd42d 100644 --- a/src/skmatter/decomposition/_pcovr.py +++ b/src/skmatter/decomposition/_pcovr.py @@ -130,6 +130,8 @@ class PCovR(_BasePCA, LinearModel): Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls. + whiten : boolean, deprecated + Attributes ---------- @@ -202,12 +204,13 @@ def __init__( regressor=None, iterated_power="auto", random_state=None, + whiten=False, ): self.mixing = mixing self.n_components = n_components self.space = space - self.whiten = False + self.whiten = whiten self.svd_solver = svd_solver self.tol = tol self.iterated_power = iterated_power diff --git a/src/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py index dacde2fca..800cf67f4 100644 --- a/src/skmatter/linear_model/_base.py +++ b/src/skmatter/linear_model/_base.py @@ -2,6 +2,8 @@ from scipy.linalg import orthogonal_procrustes from sklearn.base import MultiOutputMixin, RegressorMixin from sklearn.linear_model import LinearRegression +from sklearn.utils import check_array, check_X_y +from sklearn.utils.validation import check_is_fitted class OrthogonalRegression(MultiOutputMixin, RegressorMixin): @@ -61,6 +63,15 @@ def fit(self, X, y): and n_targets is the number of target properties. """ + X, y = check_X_y( + X, + y, + y_numeric=True, + ensure_min_features=1, + ensure_min_samples=1, + multi_output=True, + ) + self.n_samples_in_, self.n_features_in_ = X.shape if self.use_orthogonal_projector: # check estimator @@ -71,12 +82,15 @@ def fit(self, X, y): ) # compute orthogonal projectors linear_estimator.fit(X, y) - U, _, Vt = np.linalg.svd(linear_estimator.coef_.T, full_matrices=False) - # project X and y to same dimension - X = X @ U - y = y @ Vt.T + coef = np.reshape(linear_estimator.coef_.T, (X.shape[1], -1)) + U, _, Vt = np.linalg.svd(coef, full_matrices=False) + # compute weights by solving the Procrustes problem - self.coef_ = (U @ orthogonal_procrustes(X, y)[0] @ Vt).T + self.coef_ = ( + U + @ orthogonal_procrustes(X @ U, y.reshape(X.shape[0], -1) @ Vt.T)[0] + @ Vt + ).T else: self.max_components_ = max(X.shape[1], y.shape[1]) X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])]) @@ -93,6 +107,9 @@ def predict(self, X): Training data, where n_samples is the number of samples and n_features is the number of features. 
""" + X = check_array(X, ensure_min_features=1, ensure_min_samples=1) + check_is_fitted(self, ["coef_"]) + if not (self.use_orthogonal_projector): X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])]) return X @ self.coef_.T diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py index 491ecbc4c..3969b2a33 100644 --- a/src/skmatter/linear_model/_ridge.py +++ b/src/skmatter/linear_model/_ridge.py @@ -1,11 +1,13 @@ import numpy as np from joblib import Parallel, delayed -from sklearn.base import MultiOutputMixin, RegressorMixin +from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin from sklearn.metrics import check_scoring from sklearn.model_selection import KFold +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted -class RidgeRegression2FoldCV(MultiOutputMixin, RegressorMixin): +class RidgeRegression2FoldCV(BaseEstimator, MultiOutputMixin, RegressorMixin): r"""Ridge regression with an efficient 2-fold cross-validation method using the SVD solver. @@ -110,6 +112,9 @@ def __init__( self.shuffle = shuffle self.n_jobs = n_jobs + def _more_tags(self): + return {"multioutput_only": True} + def fit(self, X, y): """ Parameters @@ -138,6 +143,7 @@ def fit(self, X, y): "[0,1)" ) + X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) self.n_samples_in_, self.n_features_in_ = X.shape # check_scoring uses estimators scoring function if the scorer is None, this is @@ -164,6 +170,11 @@ def predict(self, X): Training data, where n_samples is the number of samples and n_features is the number of features. """ + + X = check_array(X) + + check_is_fitted(self, ["coef_"]) + return X @ self.coef_.T def _2fold_cv(self, X, y, fold1_idx, fold2_idx, scorer): diff --git a/src/skmatter/metrics/_reconstruction_measures.py b/src/skmatter/metrics/_reconstruction_measures.py index 02d3d6557..86bab2fab 100644 --- a/src/skmatter/metrics/_reconstruction_measures.py +++ b/src/skmatter/metrics/_reconstruction_measures.py @@ -445,7 +445,7 @@ def pointwise_local_reconstruction_error( scaler.fit(X_train) X_train = scaler.transform(X_train) - X_test = scaler.transform(X_test) + X_test = scaler.transform(X_test).astype(X_train.dtype) scaler.fit(Y_train) Y_train = scaler.transform(Y_train) Y_test = scaler.transform(Y_test) diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py index 94dd0e02e..9e4651466 100644 --- a/src/skmatter/preprocessing/_data.py +++ b/src/skmatter/preprocessing/_data.py @@ -135,6 +135,13 @@ def fit(self, X, y=None, sample_weight=None): Fitted scaler. 
""" + X = self._validate_data( + X, + copy=self.copy, + estimator=self, + dtype=FLOAT_DTYPES, + ensure_min_samples=2, + ) self.n_samples_in_, self.n_features_in_ = X.shape if sample_weight is not None: @@ -157,7 +164,7 @@ def fit(self, X, y=None, sample_weight=None): self.scale_ = np.sqrt(var) else: var_sum = var.sum() - if var_sum < abs(np.mean(X_mean)) * self.rtol + self.atol: + if var_sum < abs(np.average(X_mean)) * self.rtol + self.atol: raise ValueError("Cannot normalize a matrix with zero variance") self.scale_ = np.sqrt(var_sum) @@ -187,11 +194,9 @@ def transform(self, X, y=None, copy=None): X = self._validate_data( X, reset=False, - accept_sparse="csr", copy=copy, estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan", ) check_is_fitted( self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"] @@ -288,7 +293,7 @@ def __init__(self, with_center=True, with_trace=True): self.with_trace = with_trace super().__init__() - def fit(self, K=None, y=None, sample_weight=None): + def fit(self, K, y=None, sample_weight=None): """Fit KernelFlexibleCenterer Parameters @@ -310,7 +315,7 @@ def fit(self, K=None, y=None, sample_weight=None): Fitted transformer. """ - Kc = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False) + K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False) if sample_weight is not None: self.sample_weight_ = _check_sample_weight(sample_weight, K, dtype=K.dtype) @@ -327,20 +332,20 @@ def fit(self, K=None, y=None, sample_weight=None): else: super().fit(K, y) - K_pred_cols = np.average(Kc, weights=self.sample_weight_, axis=1)[ + K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[ :, np.newaxis ] else: - self.K_fit_rows_ = np.zeros(Kc.shape[1]) + self.K_fit_rows_ = np.zeros(K.shape[1]) self.K_fit_all_ = 0.0 - K_pred_cols = np.zeros((Kc.shape[0], 1)) + K_pred_cols = np.zeros((K.shape[0], 1)) if self.with_trace: - Kc -= self.K_fit_rows_ - Kc -= K_pred_cols - Kc += self.K_fit_all_ + K -= self.K_fit_rows_ + K -= K_pred_cols + K += self.K_fit_all_ - self.scale_ = np.trace(Kc) / Kc.shape[0] + self.scale_ = np.trace(K) / K.shape[0] else: self.scale_ = 1.0 @@ -408,7 +413,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params): return self.transform(K, copy) -class SparseKernelCenterer(TransformerMixin, BaseEstimator): +class SparseKernelCenterer(TransformerMixin): r"""Kernel centering method for sparse kernels, similar to KernelFlexibleCenterer. 
diff --git a/src/skmatter/utils/_orthogonalizers.py b/src/skmatter/utils/_orthogonalizers.py
index 00a68949e..4bfba3739 100644
--- a/src/skmatter/utils/_orthogonalizers.py
+++ b/src/skmatter/utils/_orthogonalizers.py
@@ -56,9 +56,9 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
         if np.linalg.norm(col) < tol:
             warnings.warn("Column vector contains only zeros.", stacklevel=1)
         else:
-            col /= np.linalg.norm(col, axis=0)
+            col = np.divide(col, np.linalg.norm(col, axis=0))
 
-        xnew -= col @ (col.T @ xnew)
+        xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype)
 
     return xnew
diff --git a/src/skmatter/utils/_pcovr_utils.py b/src/skmatter/utils/_pcovr_utils.py
index 515d5edfc..69ae2e394 100644
--- a/src/skmatter/utils/_pcovr_utils.py
+++ b/src/skmatter/utils/_pcovr_utils.py
@@ -186,7 +186,7 @@
         C_Y = C_Y.reshape((C.shape[0], -1))
         C_Y = np.real(C_Y)
 
-        C += (1 - mixing) * C_Y @ C_Y.T
+        C += (1 - mixing) * np.array(C_Y @ C_Y.T, dtype=np.float64)
 
     if mixing > 0:
         C += (mixing) * (X.T @ X)
diff --git a/tests/test_check_estimators.py b/tests/test_check_estimators.py
new file mode 100644
index 000000000..f744a7a05
--- /dev/null
+++ b/tests/test_check_estimators.py
@@ -0,0 +1,26 @@
+from sklearn.utils.estimator_checks import parametrize_with_checks
+
+from skmatter.decomposition import KernelPCovR, PCovR
+from skmatter.feature_selection import CUR as fCUR
+from skmatter.feature_selection import FPS as fFPS
+from skmatter.feature_selection import PCovCUR as fPCovCUR
+from skmatter.feature_selection import PCovFPS as fPCovFPS
+from skmatter.linear_model import RidgeRegression2FoldCV  # OrthogonalRegression,
+from skmatter.preprocessing import KernelNormalizer, StandardFlexibleScaler
+
+
+@parametrize_with_checks(
+    [
+        KernelPCovR(mixing=0.5),
+        PCovR(mixing=0.5),
+        fCUR(),
+        fFPS(),
+        fPCovCUR(),
+        fPCovFPS(),
+        RidgeRegression2FoldCV(),
+        KernelNormalizer(),
+        StandardFlexibleScaler(),
+    ]
+)
+def test_sklearn_compatible_estimator(estimator, check):
+    check(estimator)
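
`parametrize_with_checks` expands each estimator above into the full battery of scikit-learn API checks at collection time. The same checks can also be run on a single estimator outside of pytest, which is handy when debugging one failure:

    from sklearn.utils.estimator_checks import check_estimator

    from skmatter.preprocessing import StandardFlexibleScaler

    # runs the same sklearn compatibility checks as the parametrized test
    check_estimator(StandardFlexibleScaler())
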
diff --git a/tests/test_feature_simple_cur.py b/tests/test_feature_simple_cur.py
index ba92facd5..72554471d 100644
--- a/tests/test_feature_simple_cur.py
+++ b/tests/test_feature_simple_cur.py
@@ -4,12 +4,13 @@
 from sklearn import exceptions
 
 from skmatter.datasets import load_csd_1000r as load
-from skmatter.feature_selection import CUR
+from skmatter.feature_selection import CUR, FPS
 
 
 class TestCUR(unittest.TestCase):
     def setUp(self):
         self.X, _ = load(return_X_y=True)
+        self.X = FPS(n_to_select=10).fit(self.X).transform(self.X)
 
     def test_bad_transform(self):
         selector = CUR(n_to_select=2)
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index fe83b71a8..0bfe6de99 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -61,9 +61,10 @@ def test_bad_warm_start(self):
 
     def test_bad_y(self):
         self.X, self.Y = get_dataset(return_X_y=True)
+        Y = self.Y[:2]
         selector = GreedyTester(n_to_select=2)
         with self.assertRaises(ValueError):
-            selector.fit(X=self.X, y=self.Y[:2])
+            selector.fit(X=self.X, y=Y)
 
     def test_bad_transform(self):
         selector = GreedyTester(n_to_select=2)
@@ -72,8 +73,7 @@
             _ = selector.transform(self.X[:, :3])
         self.assertEqual(
             str(cm.exception),
-            "X has 3 features, but GreedyTester is expecting {} features"
-            " as input.".format(self.X.shape[1]),
+            "X has a different shape than during fitting. Reshape your data.",
         )
 
     def test_no_nfeatures(self):
@@ -120,16 +120,16 @@ def test_size_input(self):
         X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
         selector_sample = GreedyTester(selection_type="sample")
         selector_feature = GreedyTester(selection_type="feature")
-
         with self.assertRaises(ValueError) as cm:
             selector_feature.fit(X)
         self.assertEqual(
             str(cm.exception),
-            f"Found array with 1 feature(s) (shape={X.shape}) while a minimum of 2 is "
-            "required.",
+            f"Found array with 1 feature(s) (shape={X.shape})"
+            " while a minimum of 2 is required.",
         )
 
         X = X.reshape(1, -1)
+
         with self.assertRaises(ValueError) as cm:
             selector_sample.fit(X)
         self.assertEqual(
diff --git a/tests/test_kernel_normalizer.py b/tests/test_kernel_normalizer.py
index 694b39d0a..d17ddf9f3 100644
--- a/tests/test_kernel_normalizer.py
+++ b/tests/test_kernel_normalizer.py
@@ -41,12 +41,6 @@ def test_invalid_sample_weights(self):
         with self.assertRaises(ValueError):
             model.fit_transform(K, sample_weight=wts_dim)
 
-    def test_NoInputs(self):
-        """Checks that fit cannot be called with zero inputs."""
-        model = KernelNormalizer()
-        with self.assertRaises(ValueError):
-            model.fit()
-
     def test_ValueError(self):
         """Checks that a non-square matrix cannot be normalized."""
         K = self.random_state.uniform(0, 100, size=(3, 4))
diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py
index 899cf73ce..0578141c8 100644
--- a/tests/test_orthogonalizers.py
+++ b/tests/test_orthogonalizers.py
@@ -20,8 +20,8 @@ def __init__(self, *args, **kwargs):
         self.random_state = np.random.RandomState(0)
 
     def setUp(self):
-        self.n_samples = 100
-        self.n_features = 100
+        self.n_samples = 2
+        self.n_features = 4
 
     def test_null_column(self):
         # checks that the column passed to the orthogonalizer
diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py
index 9e82c18c3..b3a9437e1 100644
--- a/tests/test_sample_simple_cur.py
+++ b/tests/test_sample_simple_cur.py
@@ -1,23 +1,33 @@
 import unittest
 
 import numpy as np
-from sklearn import exceptions
 from sklearn.datasets import fetch_california_housing as load
 
-from skmatter.sample_selection import CUR
+from skmatter.sample_selection import CUR, FPS
 
 
 class TestCUR(unittest.TestCase):
     def setUp(self):
         self.X, _ = load(return_X_y=True)
-        self.X = self.X[:1000]
+        self.X = self.X[FPS(n_to_select=100).fit(self.X).selected_idx_]
         self.n_select = min(20, min(self.X.shape) // 2)
 
-    def test_bad_transform(self):
-        selector = CUR(n_to_select=2)
-        with self.assertRaises(exceptions.NotFittedError):
+    def test_sample_transform(self):
+        """
+        This test checks that an error is raised when the transform function is
+        used, because sklearn does not handle transformers that change the
+        number of samples well with other classes like Pipeline.
+        """
+        selector = CUR(n_to_select=1)
+        selector.fit(self.X)
+        with self.assertRaises(ValueError) as error:
             _ = selector.transform(self.X)
+        self.assertTrue(
+            "Transform is not currently supported for sample selection."
+            == str(error.exception)
+        )
+
     def test_restart(self):
         """
         This test checks that the model can be restarted with a new instance
diff --git a/tests/test_standard_flexible_scaler.py b/tests/test_standard_flexible_scaler.py
index 5e5108a47..e1d6cc1f6 100644
--- a/tests/test_standard_flexible_scaler.py
+++ b/tests/test_standard_flexible_scaler.py
@@ -188,6 +188,14 @@ def test_ValueError_full(self):
         with self.assertRaises(ValueError):
             model.fit(X)
 
+    def test_not_w_mean(self):
+        """Checks that `mean_` is set to zero
+        when the scaler is fit with `with_mean=False`."""
+        X = np.array([2, 2, 3]).reshape(-1, 1)
+        model = StandardFlexibleScaler(with_mean=False)
+        model.fit(X)
+        self.assertTrue(np.allclose(model.mean_, 0))
+
 
 if __name__ == "__main__":
     unittest.main()
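
Since `RidgeRegression2FoldCV` is now tagged `multioutput_only`, targets should be passed as 2-D column arrays. A minimal sketch on synthetic data:

    import numpy as np

    from skmatter.linear_model import RidgeRegression2FoldCV

    rng = np.random.RandomState(0)
    X = rng.rand(50, 8)
    y = (X @ rng.rand(8)).reshape(-1, 1)  # multioutput_only: y as (n_samples, 1)

    model = RidgeRegression2FoldCV()
    model.fit(X, y)
    print(model.predict(X[:3]).shape)  # (3, 1)
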