From 5be83365043e043b98983459dd8bb3b09bdeda44 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 12:53:33 -0400
Subject: [PATCH 1/9] Initial commit

---
 src/skmatter/_selection.py                    | 86 +++++++++++--------
 src/skmatter/sample_selection/_voronoi_fps.py |  8 +-
 2 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index ebb25ecb2..c19ebfa9f 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -39,7 +39,7 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
 
     selection_type : str, {'feature', 'sample'}
         whether to choose a subset of columns ('feature') or rows ('sample').
-        Stored in :py:attr:`self._axis_name` (as text) and :py:attr:`self._axis`
+        Stored in :py:attr:`self.axis_name` (as text) and :py:attr:`self.axis`
         (as 0 or 1 for 'sample' or 'feature', respectively).
 
     n_to_select : int or float, default=None
@@ -71,6 +71,8 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
 
     random_state: int or RandomState instance, default=0
 
+    axis: [0,1] axis over which we are doing selection
+
     Attributes
     ----------
     n_selected_ : int
@@ -93,8 +95,24 @@ def __init__(
         progress_bar=False,
         full=False,
         random_state=0,
+        axis=None,
     ):
-        self.selection_type = selection_type
+        if selection_type is not None and axis is None:
+            self.selection_type = selection_type
+            if selection_type == "feature":
+                self.axis = 1
+            elif selection_type == "sample":
+                self.axis = 0
+            else:
+                raise ValueError("Only feature and sample selection supported.")
+        elif axis is not None:
+            if axis in [0, 1]:
+                self.axis = axis
+                self.selection_type = ["sample", "feature"][axis]
+            else:
+                raise ValueError(
+                    "Only feature (axis=1) and sample (axis=0) selection supported."
+                )
         self.n_to_select = n_to_select
         self.score_threshold = score_threshold
         self.score_threshold_type = score_threshold_type
@@ -127,13 +145,6 @@ def fit(self, X, y=None, warm_start=False):
         """
         tags = self._get_tags()
 
-        if self.selection_type == "feature":
-            self._axis = 1
-        elif self.selection_type == "sample":
-            self._axis = 0
-        else:
-            raise ValueError("Only feature and sample selection supported.")
-
         if self.full and self.score_threshold is not None:
             raise ValueError(
                 "You cannot specify both `score_threshold` and `full=True`."
@@ -148,7 +159,7 @@ def fit(self, X, y=None, warm_start=False):
             accept_sparse="csc",
             force_all_finite=not tags.get("allow_nan", True),
         )
-        if self._axis == 1:
+        if self.axis == 1:
             params["ensure_min_features"] = 2
         else:
             params["ensure_min_samples"] = 2
@@ -165,7 +176,8 @@ def fit(self, X, y=None, warm_start=False):
         else:
             X = check_array(X, **params)
 
-        n_to_select_from = X.shape[self._axis]
+        n_to_select_from = X.shape[self.axis]
+        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 
@@ -215,7 +227,7 @@ def fit(self, X, y=None, warm_start=False):
                 stacklevel=1,
             )
             self.X_selected_ = np.take(
-                self.X_selected_, np.arange(self.n_selected_), axis=self._axis
+                self.X_selected_, np.arange(self.n_selected_), axis=self.axis
             )
 
         if hasattr(self, "y_selected_"):
@@ -256,7 +268,7 @@ def transform(self, X, y=None):
             accept_sparse="csr",
             force_all_finite=not _safe_tags(self, key="allow_nan"),
             reset=False,
-            ensure_2d=self._axis,
+            ensure_2d=self.axis,
         )
 
         if self._axis == 1:
@@ -327,11 +339,11 @@ def _init_greedy_search(self, X, y, n_to_select):
         self.first_score_ = None
 
         sel_shape = list(X.shape)
-        sel_shape[self._axis] = n_to_select
+        sel_shape[self.axis] = n_to_select
 
         self.X_selected_ = np.zeros(sel_shape, float)
 
-        if y is not None and self._axis == 0:
+        if y is not None and self.axis == 0:
            self.y_selected_ = np.zeros(
                (n_to_select, y.reshape(y.shape[0], -1).shape[1]), float
            )
@@ -341,7 +353,7 @@ def _continue_greedy_search(self, X, y, n_to_select):
        """Continues the search. Prepares an array to store the selected features."""

        n_pad = [(0, 0), (0, 0)]
-        n_pad[self._axis] = (0, n_to_select - self.n_selected_)
+        n_pad[self.axis] = (0, n_to_select - self.n_selected_)

        self.X_selected_ = np.pad(
            self.X_selected_,
@@ -385,13 +397,13 @@ def _update_post_selection(self, X, y, last_selected):

        Saves the most recently selected feature and increments the feature counter
        """
-        if self._axis == 1:
+        if self.axis == 1:
            self.X_selected_[:, self.n_selected_] = np.take(
-                X, last_selected, axis=self._axis
+                X, last_selected, axis=self.axis
            )
        else:
            self.X_selected_[self.n_selected_] = np.take(
-                X, last_selected, axis=self._axis
+                X, last_selected, axis=self.axis
            )

        if hasattr(self, "y_selected_"):
@@ -420,7 +432,7 @@ def _get_support_mask(self):

    def _postprocess(self, X, y):
        """Post-process X and / or y when selection is finished"""
-        self.support_ = np.full(X.shape[self._axis], False)
+        self.support_ = np.full(X.shape[self.axis], False)
        self.support_[self.selected_idx_] = True

    def _more_tags(self):
@@ -531,7 +543,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

        for c in self.selected_idx_:
            if self.recompute_every != 0 and (
-                np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
+                np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
                > self.tolerance
            ):
                self._orthogonalize(last_selected=c)
@@ -575,7 +587,7 @@ def _compute_pi(self, X, y=None):
            :math:`\\pi` importance for the given samples or features
        """

-        if self._axis == 0:
+        if self.axis == 0:
            U, _, _ = scipy.sparse.linalg.svds(X, k=self.k, return_singular_vectors="u")
            U = np.real(U)
            new_pi = (U[:, : self.k] ** 2.0).sum(axis=1)
@@ -604,7 +616,7 @@ def _update_post_selection(self, X, y, last_selected):
        self.pi_[last_selected] = 0.0

    def _orthogonalize(self, last_selected):
-        if self._axis == 1:
+        if self.axis == 1:
            self.X_current_ = X_orthogonalizer(
                x1=self.X_current_, c=last_selected, tol=self.tolerance
            )
@@ -731,7 +743,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

        for c in self.selected_idx_:
            if self.recompute_every != 0 and (
-                np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
+                np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
                > self.tolerance
            ):
                self._orthogonalize(last_selected=c)
@@ -799,7 +811,7 @@ def _compute_pi(self, X, y=None):
            :math:`\pi` importance for the given samples or features
        """

-        if self._axis == 0:
+        if self.axis == 0:
            pcovr_distance = pcovr_kernel(
                self.mixing,
                X,
@@ -824,7 +836,7 @@ def _compute_pi(self, X, y=None):
        return pi

    def _orthogonalize(self, last_selected):
-        if self._axis == 1:
+        if self.axis == 1:
            self.X_current_ = X_orthogonalizer(
                x1=self.X_current_, c=last_selected, tol=self.tolerance
            )
@@ -833,7 +845,7 @@ def _orthogonalize(self, last_selected):
                x1=self.X_current_.T, c=last_selected, tol=self.tolerance
            ).T
        if self.y_current_ is not None:
-            if self._axis == 1:
+            if self.axis == 1:
                self.y_current_ = Y_feature_orthogonalizer(
                    self.y_current_, X=self.X_selected_, tol=self.tolerance
                )
@@ -959,13 +971,13 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        self.norms_ = (X**2).sum(axis=abs(self._axis - 1))
-        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
+        self.norms_ = (X**2).sum(axis=abs(self.axis - 1))
+        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self._axis])
+            initialize = random_state.randint(X.shape[self.axis])
            self.selected_idx_[0] = initialize
            self._update_post_selection(X, y, self.selected_idx_[0])
        elif isinstance(self.initialize, numbers.Integral):
@@ -985,7 +997,7 @@ def _update_haussdorf(self, X, y, last_selected):
        self.haussdorf_at_select_[last_selected] = self.haussdorf_[last_selected]

        # distances of all points to the new point
-        if self._axis == 1:
+        if self.axis == 1:
            new_dist = (
                self.norms_ + self.norms_[last_selected] - 2 * X[:, last_selected].T @ X
            )
@@ -1115,7 +1127,7 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        if self._axis == 1:
+        if self.axis == 1:
            self.pcovr_distance_ = pcovr_covariance(mixing=self.mixing, X=X, Y=y)
        else:
            self.pcovr_distance_ = pcovr_kernel(mixing=self.mixing, X=X, Y=y)
@@ -1124,15 +1136,15 @@ def _init_greedy_search(self, X, y, n_to_select):

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self._axis])
+            initialize = random_state.randint(X.shape[self.axis])
        elif isinstance(self.initialize, numbers.Integral):
            initialize = self.initialize
        else:
            raise ValueError("Invalid value of the initialize parameter")

        self.selected_idx_[0] = initialize
-        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
+        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
        self._update_post_selection(X, y, self.selected_idx_[0])

    def _update_haussdorf(self, X, y, last_selected):
@@ -1142,7 +1154,7 @@ def _update_haussdorf(self, X, y, last_selected):
        new_dist = (
            self.norms_
            + self.norms_[last_selected]
-            - 2 * np.take(self.pcovr_distance_, last_selected, axis=self._axis)
+            - 2 * np.take(self.pcovr_distance_, last_selected, axis=self.axis)
        )

        # update in-place the Haussdorf distance list
diff --git a/src/skmatter/sample_selection/_voronoi_fps.py b/src/skmatter/sample_selection/_voronoi_fps.py
index b69fa35ba..f767e89c1 100644
--- a/src/skmatter/sample_selection/_voronoi_fps.py
+++ b/src/skmatter/sample_selection/_voronoi_fps.py
@@ -195,19 +195,19 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        self.norms_ = (X**2).sum(axis=abs(self._axis - 1))
+        self.norms_ = (X**2).sum(axis=abs(self.axis - 1))

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self._axis])
+            initialize = random_state.randint(X.shape[self.axis])
        elif isinstance(self.initialize, numbers.Integral):
            initialize = self.initialize
        else:
            raise ValueError("Invalid value of the initialize parameter")

        self.selected_idx_[0] = initialize
-        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
+        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
        self._update_post_selection(X, y, self.selected_idx_[0])

    def _continue_greedy_search(self, X, y, n_to_select):
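For context, the `selection_type`/`axis` resolution that this first patch adds to `GreedySelector.__init__` behaves like the following standalone sketch (a hypothetical `resolve_axis` helper written purely for illustration; it is not part of the patch):

    # Sketch of the mapping added in PATCH 1: axis=1 <-> 'feature' (columns),
    # axis=0 <-> 'sample' (rows); exactly one of the two should be given.
    def resolve_axis(selection_type=None, axis=None):
        if selection_type is not None and axis is None:
            if selection_type not in ("sample", "feature"):
                raise ValueError("Only feature and sample selection supported.")
            return ["sample", "feature"].index(selection_type), selection_type
        if axis in (0, 1):
            return axis, ["sample", "feature"][axis]
        raise ValueError("Only feature (axis=1) and sample (axis=0) selection supported.")

    assert resolve_axis(selection_type="feature") == (1, "feature")
    assert resolve_axis(axis=0) == (0, "sample")

The next two patches revert this dual public API in favor of a private `self._axis` derived from `selection_type` inside `fit`.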
From d2ea889437fac0a08b55a42b27c6ab886d9a7b9d Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Wed, 31 May 2023 21:29:12 -0400
Subject: [PATCH 2/9] axis -> _axis

---
 src/skmatter/_selection.py                    | 86 ++++++++-----------
 src/skmatter/sample_selection/_voronoi_fps.py |  8 +-
 2 files changed, 42 insertions(+), 52 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index c19ebfa9f..66f118a73 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -39,7 +39,7 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
 
     selection_type : str, {'feature', 'sample'}
         whether to choose a subset of columns ('feature') or rows ('sample').
-        Stored in :py:attr:`self.axis_name` (as text) and :py:attr:`self.axis`
+        Stored in :py:attr:`self._axis_name` (as text) and :py:attr:`self._axis`
         (as 0 or 1 for 'sample' or 'feature', respectively).
 
     n_to_select : int or float, default=None
@@ -71,8 +71,6 @@ class GreedySelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
 
     random_state: int or RandomState instance, default=0
 
-    axis: [0,1] axis over which we are doing selection
-
     Attributes
     ----------
     n_selected_ : int
@@ -95,24 +93,8 @@ def __init__(
         progress_bar=False,
         full=False,
         random_state=0,
-        axis=None,
     ):
-        if selection_type is not None and axis is None:
-            self.selection_type = selection_type
-            if selection_type == "feature":
-                self.axis = 1
-            elif selection_type == "sample":
-                self.axis = 0
-            else:
-                raise ValueError("Only feature and sample selection supported.")
-        elif axis is not None:
-            if axis in [0, 1]:
-                self.axis = axis
-                self.selection_type = ["sample", "feature"][axis]
-            else:
-                raise ValueError(
-                    "Only feature (axis=1) and sample (axis=0) selection supported."
-                )
+        self.selection_type = selection_type
         self.n_to_select = n_to_select
         self.score_threshold = score_threshold
         self.score_threshold_type = score_threshold_type
@@ -145,6 +127,13 @@ def fit(self, X, y=None, warm_start=False):
         """
         tags = self._get_tags()
 
+        if self.selection_type == "feature":
+            self._axis = 1
+        elif self.selection_type == "sample":
+            self._axis = 0
+        else:
+            raise ValueError("Only feature and sample selection supported.")
+
         if self.full and self.score_threshold is not None:
             raise ValueError(
                 "You cannot specify both `score_threshold` and `full=True`."
@@ -159,7 +148,7 @@ def fit(self, X, y=None, warm_start=False):
             accept_sparse="csc",
             force_all_finite=not tags.get("allow_nan", True),
         )
-        if self.axis == 1:
+        if self._axis == 1:
             params["ensure_min_features"] = 2
         else:
             params["ensure_min_samples"] = 2
@@ -176,7 +165,8 @@ def fit(self, X, y=None, warm_start=False):
         else:
             X = check_array(X, **params)
 
-        n_to_select_from = X.shape[self.axis]
+        n_to_select_from = X.shape[self._axis]
+        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 
@@ -227,7 +217,7 @@ def fit(self, X, y=None, warm_start=False):
                 stacklevel=1,
             )
             self.X_selected_ = np.take(
-                self.X_selected_, np.arange(self.n_selected_), axis=self.axis
+                self.X_selected_, np.arange(self.n_selected_), axis=self._axis
             )
 
         if hasattr(self, "y_selected_"):
@@ -268,7 +258,7 @@ def transform(self, X, y=None):
             accept_sparse="csr",
             force_all_finite=not _safe_tags(self, key="allow_nan"),
             reset=False,
-            ensure_2d=self.axis,
+            ensure_2d=self._axis,
         )
 
         if self._axis == 1:
@@ -339,11 +329,11 @@ def _init_greedy_search(self, X, y, n_to_select):
         self.first_score_ = None
 
         sel_shape = list(X.shape)
-        sel_shape[self.axis] = n_to_select
+        sel_shape[self._axis] = n_to_select
 
         self.X_selected_ = np.zeros(sel_shape, float)
 
-        if y is not None and self.axis == 0:
+        if y is not None and self._axis == 0:
            self.y_selected_ = np.zeros(
                (n_to_select, y.reshape(y.shape[0], -1).shape[1]), float
            )
@@ -353,7 +343,7 @@ def _continue_greedy_search(self, X, y, n_to_select):
        """Continues the search. Prepares an array to store the selected features."""

        n_pad = [(0, 0), (0, 0)]
-        n_pad[self.axis] = (0, n_to_select - self.n_selected_)
+        n_pad[self._axis] = (0, n_to_select - self.n_selected_)

        self.X_selected_ = np.pad(
            self.X_selected_,
@@ -397,13 +387,13 @@ def _update_post_selection(self, X, y, last_selected):

        Saves the most recently selected feature and increments the feature counter
        """
-        if self.axis == 1:
+        if self._axis == 1:
            self.X_selected_[:, self.n_selected_] = np.take(
-                X, last_selected, axis=self.axis
+                X, last_selected, axis=self._axis
            )
        else:
            self.X_selected_[self.n_selected_] = np.take(
-                X, last_selected, axis=self.axis
+                X, last_selected, axis=self._axis
            )

        if hasattr(self, "y_selected_"):
@@ -432,7 +422,7 @@ def _get_support_mask(self):

    def _postprocess(self, X, y):
        """Post-process X and / or y when selection is finished"""
-        self.support_ = np.full(X.shape[self.axis], False)
+        self.support_ = np.full(X.shape[self._axis], False)
        self.support_[self.selected_idx_] = True

    def _more_tags(self):
@@ -543,7 +533,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

        for c in self.selected_idx_:
            if self.recompute_every != 0 and (
-                np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
+                np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
                > self.tolerance
            ):
                self._orthogonalize(last_selected=c)
@@ -587,7 +577,7 @@ def _compute_pi(self, X, y=None):
            :math:`\\pi` importance for the given samples or features
        """

-        if self.axis == 0:
+        if self._axis == 0:
            U, _, _ = scipy.sparse.linalg.svds(X, k=self.k, return_singular_vectors="u")
            U = np.real(U)
            new_pi = (U[:, : self.k] ** 2.0).sum(axis=1)
@@ -616,7 +606,7 @@ def _update_post_selection(self, X, y, last_selected):
        self.pi_[last_selected] = 0.0

    def _orthogonalize(self, last_selected):
-        if self.axis == 1:
+        if self._axis == 1:
            self.X_current_ = X_orthogonalizer(
                x1=self.X_current_, c=last_selected, tol=self.tolerance
            )
@@ -743,7 +733,7 @@ def _continue_greedy_search(self, X, y, n_to_select):

        for c in self.selected_idx_:
            if self.recompute_every != 0 and (
-                np.linalg.norm(np.take(self.X_current_, [c], axis=self.axis))
+                np.linalg.norm(np.take(self.X_current_, [c], axis=self._axis))
                > self.tolerance
            ):
                self._orthogonalize(last_selected=c)
@@ -811,7 +801,7 @@ def _compute_pi(self, X, y=None):
            :math:`\pi` importance for the given samples or features
        """

-        if self.axis == 0:
+        if self._axis == 0:
            pcovr_distance = pcovr_kernel(
                self.mixing,
                X,
@@ -836,7 +826,7 @@ def _compute_pi(self, X, y=None):
        return pi

    def _orthogonalize(self, last_selected):
-        if self.axis == 1:
+        if self._axis == 1:
            self.X_current_ = X_orthogonalizer(
                x1=self.X_current_, c=last_selected, tol=self.tolerance
            )
@@ -845,7 +835,7 @@ def _orthogonalize(self, last_selected):
                x1=self.X_current_.T, c=last_selected, tol=self.tolerance
            ).T
        if self.y_current_ is not None:
-            if self.axis == 1:
+            if self._axis == 1:
                self.y_current_ = Y_feature_orthogonalizer(
                    self.y_current_, X=self.X_selected_, tol=self.tolerance
                )
@@ -971,13 +961,13 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        self.norms_ = (X**2).sum(axis=abs(self.axis - 1))
-        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
+        self.norms_ = (X**2).sum(axis=abs(self._axis - 1))
+        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self.axis])
+            initialize = random_state.randint(X.shape[self._axis])
            self.selected_idx_[0] = initialize
            self._update_post_selection(X, y, self.selected_idx_[0])
        elif isinstance(self.initialize, numbers.Integral):
@@ -997,7 +987,7 @@ def _update_haussdorf(self, X, y, last_selected):
        self.haussdorf_at_select_[last_selected] = self.haussdorf_[last_selected]

        # distances of all points to the new point
-        if self.axis == 1:
+        if self._axis == 1:
            new_dist = (
                self.norms_ + self.norms_[last_selected] - 2 * X[:, last_selected].T @ X
            )
@@ -1127,7 +1117,7 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        if self.axis == 1:
+        if self._axis == 1:
            self.pcovr_distance_ = pcovr_covariance(mixing=self.mixing, X=X, Y=y)
        else:
            self.pcovr_distance_ = pcovr_kernel(mixing=self.mixing, X=X, Y=y)
@@ -1136,15 +1126,15 @@ def _init_greedy_search(self, X, y, n_to_select):

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self.axis])
+            initialize = random_state.randint(X.shape[self._axis])
        elif isinstance(self.initialize, numbers.Integral):
            initialize = self.initialize
        else:
            raise ValueError("Invalid value of the initialize parameter")

        self.selected_idx_[0] = initialize
-        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
+        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
        self._update_post_selection(X, y, self.selected_idx_[0])

    def _update_haussdorf(self, X, y, last_selected):
@@ -1154,7 +1144,7 @@ def _update_haussdorf(self, X, y, last_selected):
        new_dist = (
            self.norms_
            + self.norms_[last_selected]
-            - 2 * np.take(self.pcovr_distance_, last_selected, axis=self.axis)
+            - 2 * np.take(self.pcovr_distance_, last_selected, axis=self._axis)
        )

        # update in-place the Haussdorf distance list
diff --git a/src/skmatter/sample_selection/_voronoi_fps.py b/src/skmatter/sample_selection/_voronoi_fps.py
index f767e89c1..b69fa35ba 100644
--- a/src/skmatter/sample_selection/_voronoi_fps.py
+++ b/src/skmatter/sample_selection/_voronoi_fps.py
@@ -195,19 +195,19 @@ def _init_greedy_search(self, X, y, n_to_select):

        super()._init_greedy_search(X, y, n_to_select)

-        self.norms_ = (X**2).sum(axis=abs(self.axis - 1))
+        self.norms_ = (X**2).sum(axis=abs(self._axis - 1))

        if self.initialize == "random":
            random_state = check_random_state(self.random_state)
-            initialize = random_state.randint(X.shape[self.axis])
+            initialize = random_state.randint(X.shape[self._axis])
        elif isinstance(self.initialize, numbers.Integral):
            initialize = self.initialize
        else:
            raise ValueError("Invalid value of the initialize parameter")

        self.selected_idx_[0] = initialize
-        self.haussdorf_ = np.full(X.shape[self.axis], np.inf)
-        self.haussdorf_at_select_ = np.full(X.shape[self.axis], np.inf)
+        self.haussdorf_ = np.full(X.shape[self._axis], np.inf)
+        self.haussdorf_at_select_ = np.full(X.shape[self._axis], np.inf)
        self._update_post_selection(X, y, self.selected_idx_[0])

    def _continue_greedy_search(self, X, y, n_to_select):
From da7f20c5906fa3e12609cbac000853fe2bb49a03 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Wed, 31 May 2023 21:29:12 -0400
Subject: [PATCH 3/9] axis -> _axis

---
 src/skmatter/_selection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 66f118a73..ef409a02c 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -166,7 +166,6 @@ def fit(self, X, y=None, warm_start=False):
             X = check_array(X, **params)
 
         n_to_select_from = X.shape[self._axis]
-        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 

From 58fd90cd44814de2683dfccdae6e5fa1add15f91 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 13:20:44 -0400
Subject: [PATCH 4/9] Initial commit to pass an estimator check. WIP

---
 .gitignore                                    |  1 +
 src/skmatter/_selection.py                    | 27 ++++++++++------
 src/skmatter/decomposition/_pcovr.py          |  5 ++-
 src/skmatter/linear_model/_base.py            | 27 +++++++++++++---
 src/skmatter/linear_model/_ridge.py           | 16 ++++++++--
 .../metrics/_reconstruction_measures.py       |  2 +-
 src/skmatter/preprocessing/_data.py           | 32 +++++++++++--------
 src/skmatter/utils/_orthogonalizers.py        |  4 +--
 src/skmatter/utils/_pcovr_utils.py            |  2 +-
 tests/test_check_estimators.py                | 26 +++++++++++++++
 tests/test_feature_simple_cur.py              |  3 +-
 tests/test_greedy_selector.py                 |  6 +++-
 tests/test_kernel_normalizer.py               |  6 ----
 tests/test_orthogonalizers.py                 |  7 ++--
 tests/test_sample_simple_cur.py               | 22 +++++++++----
 tests/test_standard_flexible_scaler.py        |  8 +++++
 16 files changed, 143 insertions(+), 51 deletions(-)
 create mode 100644 tests/test_check_estimators.py

diff --git a/.gitignore b/.gitignore
index 4c2794f1a..15eba83e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ __pycache__
 *.egg-info
 *.swp
 *.swo
+*DS_Store
 
 .tox/
 build/
diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index ef409a02c..397e4043d 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -12,9 +12,8 @@
 from scipy.sparse.linalg import eigsh
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils import check_array, check_random_state, safe_mask
-from sklearn.utils._tags import _safe_tags
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils import check_array, check_random_state, check_X_y, safe_mask
+from sklearn.utils.validation import FLOAT_DTYPES, as_float_array, check_is_fitted
 
 from .utils import (
     X_orthogonalizer,
@@ -125,7 +124,6 @@ def fit(self, X, y=None, warm_start=False):
         -------
         self : object
         """
-        tags = self._get_tags()
 
         if self.selection_type == "feature":
             self._axis = 1
@@ -153,19 +151,26 @@ def fit(self, X, y=None, warm_start=False):
         else:
             params["ensure_min_samples"] = 2
 
-        if y is not None:
-            params["multi_output"] = True
-            X, y = self._validate_data(X, y, **params)
+        if hasattr(self, "mixing") or y is not None:
+            self._validate_data(X, y, **params)
+            X, y = check_X_y(X, y, multi_output=True)
 
             if len(y.shape) == 1:
                 # force y to have multi_output 2D format even when it's 1D, since
                 # many functions, most notably PCov routines, assume an array storage
                 # format, most notably to compute (y @ y.T)
                 y = y.reshape((len(y), 1))
+
         else:
             X = check_array(X, **params)
 
+        if self.full and self.score_threshold is not None:
+            raise ValueError(
+                "You cannot specify both `score_threshold` and `full=True`."
+            )
+
         n_to_select_from = X.shape[self._axis]
+        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 
@@ -244,8 +249,10 @@ def transform(self, X, y=None):
             The selected subset of the input.
         """
 
-        if len(X.shape) == 1:
-            X = X.reshape(-1, 1)
+        if self.axis == 0:
+            raise ValueError(
+                "Transform is not currently supported for sample selection."
+            )
 
         mask = self.get_support()
 
@@ -518,7 +525,7 @@ def _init_greedy_search(self, X, y, n_to_select):
         features and computes their initial importance score.
         """
 
-        self.X_current_ = X.copy()
+        self.X_current_ = as_float_array(X.copy())
         self.pi_ = self._compute_pi(self.X_current_)
 
         super()._init_greedy_search(X, y, n_to_select)
diff --git a/src/skmatter/decomposition/_pcovr.py b/src/skmatter/decomposition/_pcovr.py
index 4b3bb2b97..7e5afd42d 100644
--- a/src/skmatter/decomposition/_pcovr.py
+++ b/src/skmatter/decomposition/_pcovr.py
@@ -130,6 +130,8 @@ class PCovR(_BasePCA, LinearModel):
         Used when the 'arpack' or 'randomized' solvers are used. Pass an int
         for reproducible results across multiple function calls.
 
+    whiten : boolean, deprecated
+
     Attributes
     ----------
 
@@ -202,12 +204,13 @@ def __init__(
         regressor=None,
         iterated_power="auto",
         random_state=None,
+        whiten=False,
     ):
         self.mixing = mixing
         self.n_components = n_components
         self.space = space
 
-        self.whiten = False
+        self.whiten = whiten
         self.svd_solver = svd_solver
         self.tol = tol
         self.iterated_power = iterated_power
diff --git a/src/skmatter/linear_model/_base.py b/src/skmatter/linear_model/_base.py
index dacde2fca..800cf67f4 100644
--- a/src/skmatter/linear_model/_base.py
+++ b/src/skmatter/linear_model/_base.py
@@ -2,6 +2,8 @@
 from scipy.linalg import orthogonal_procrustes
 from sklearn.base import MultiOutputMixin, RegressorMixin
 from sklearn.linear_model import LinearRegression
+from sklearn.utils import check_array, check_X_y
+from sklearn.utils.validation import check_is_fitted
 
 
 class OrthogonalRegression(MultiOutputMixin, RegressorMixin):
@@ -61,6 +63,15 @@ def fit(self, X, y):
             and n_targets is the number of target properties.
         """
 
+        X, y = check_X_y(
+            X,
+            y,
+            y_numeric=True,
+            ensure_min_features=1,
+            ensure_min_samples=1,
+            multi_output=True,
+        )
+
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if self.use_orthogonal_projector:
@@ -71,12 +82,15 @@ def fit(self, X, y):
             )
             # compute orthogonal projectors
             linear_estimator.fit(X, y)
-            U, _, Vt = np.linalg.svd(linear_estimator.coef_.T, full_matrices=False)
-            # project X and y to same dimension
-            X = X @ U
-            y = y @ Vt.T
+            coef = np.reshape(linear_estimator.coef_.T, (X.shape[1], -1))
+            U, _, Vt = np.linalg.svd(coef, full_matrices=False)
+
             # compute weights by solving the Procrustes problem
-            self.coef_ = (U @ orthogonal_procrustes(X, y)[0] @ Vt).T
+            self.coef_ = (
+                U
+                @ orthogonal_procrustes(X @ U, y.reshape(X.shape[0], -1) @ Vt.T)[0]
+                @ Vt
+            ).T
         else:
             self.max_components_ = max(X.shape[1], y.shape[1])
             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
@@ -93,6 +107,9 @@ def predict(self, X):
             Training data, where n_samples is the number of samples
             and n_features is the number of features.
         """
+        X = check_array(X, ensure_min_features=1, ensure_min_samples=1)
+        check_is_fitted(self, ["coef_"])
+
         if not (self.use_orthogonal_projector):
             X = np.pad(X, [(0, 0), (0, self.max_components_ - X.shape[1])])
         return X @ self.coef_.T
diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py
index 491ecbc4c..726c68028 100644
--- a/src/skmatter/linear_model/_ridge.py
+++ b/src/skmatter/linear_model/_ridge.py
@@ -1,11 +1,13 @@
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.base import MultiOutputMixin, RegressorMixin
+from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.metrics import check_scoring
 from sklearn.model_selection import KFold
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted
 
 
-class RidgeRegression2FoldCV(MultiOutputMixin, RegressorMixin):
+class RidgeRegression2FoldCV(BaseEstimator, MultiOutputMixin, RegressorMixin):
     r"""Ridge regression with an efficient 2-fold cross-validation method using
     the SVD solver.
 
@@ -110,6 +112,9 @@ def __init__(
         self.shuffle = shuffle
         self.n_jobs = n_jobs
 
+    def _more_tags(self):
+        return {"multioutput_only": True}
+
     def fit(self, X, y):
         """
         Parameters
@@ -138,6 +143,8 @@ def fit(self, X, y):
                 "[0,1)"
             )
 
+        X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
+
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         # check_scoring uses estimators scoring function if the scorer is None, this is
@@ -164,6 +171,11 @@ def predict(self, X):
             Training data, where n_samples is the number of samples
             and n_features is the number of features.
         """
+
+        X = check_array(X)
+
+        check_is_fitted(self, ["coef_"])
+
         return X @ self.coef_.T
 
     def _2fold_cv(self, X, y, fold1_idx, fold2_idx, scorer):
diff --git a/src/skmatter/metrics/_reconstruction_measures.py b/src/skmatter/metrics/_reconstruction_measures.py
index 02d3d6557..86bab2fab 100644
--- a/src/skmatter/metrics/_reconstruction_measures.py
+++ b/src/skmatter/metrics/_reconstruction_measures.py
@@ -445,7 +445,7 @@ def pointwise_local_reconstruction_error(
 
     scaler.fit(X_train)
     X_train = scaler.transform(X_train)
-    X_test = scaler.transform(X_test)
+    X_test = scaler.transform(X_test).astype(X_train.dtype)
     scaler.fit(Y_train)
     Y_train = scaler.transform(Y_train)
     Y_test = scaler.transform(Y_test)
diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
index 94dd0e02e..97a1380a6 100644
--- a/src/skmatter/preprocessing/_data.py
+++ b/src/skmatter/preprocessing/_data.py
@@ -135,6 +135,14 @@ def fit(self, X, y=None, sample_weight=None):
             Fitted scaler.
         """
+        X = self._validate_data(
+            X,
+            copy=self.copy,
+            estimator=self,
+            dtype=FLOAT_DTYPES,
+            ensure_min_samples=2,
+        )
+
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if sample_weight is not None:
@@ -157,7 +165,7 @@ def fit(self, X, y=None, sample_weight=None):
             self.scale_ = np.sqrt(var)
         else:
             var_sum = var.sum()
-            if var_sum < abs(np.mean(X_mean)) * self.rtol + self.atol:
+            if var_sum < abs(np.average(X_mean)) * self.rtol + self.atol:
                 raise ValueError("Cannot normalize a matrix with zero variance")
             self.scale_ = np.sqrt(var_sum)
 
@@ -187,11 +195,9 @@ def transform(self, X, y=None, copy=None):
         X = self._validate_data(
             X,
             reset=False,
-            accept_sparse="csr",
             copy=copy,
             estimator=self,
             dtype=FLOAT_DTYPES,
-            force_all_finite="allow-nan",
         )
         check_is_fitted(
             self, attributes=["n_samples_in_", "n_features_in_", "scale_", "mean_"]
         )
@@ -288,7 +294,7 @@ def __init__(self, with_center=True, with_trace=True):
         self.with_trace = with_trace
         super().__init__()
 
-    def fit(self, K=None, y=None, sample_weight=None):
+    def fit(self, K, y=None, sample_weight=None):
         """Fit KernelFlexibleCenterer
 
         Parameters
@@ -310,7 +316,7 @@ def fit(self, K, y=None, sample_weight=None):
             Fitted transformer.
         """
 
-        Kc = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False)
+        K = self._validate_data(K, copy=True, dtype=FLOAT_DTYPES, reset=False)
 
         if sample_weight is not None:
             self.sample_weight_ = _check_sample_weight(sample_weight, K, dtype=K.dtype)
@@ -327,20 +333,20 @@ def fit(self, K, y=None, sample_weight=None):
             else:
                 super().fit(K, y)
 
-            K_pred_cols = np.average(Kc, weights=self.sample_weight_, axis=1)[
+            K_pred_cols = np.average(K, weights=self.sample_weight_, axis=1)[
                 :, np.newaxis
             ]
         else:
-            self.K_fit_rows_ = np.zeros(Kc.shape[1])
+            self.K_fit_rows_ = np.zeros(K.shape[1])
             self.K_fit_all_ = 0.0
-            K_pred_cols = np.zeros((Kc.shape[0], 1))
+            K_pred_cols = np.zeros((K.shape[0], 1))
 
         if self.with_trace:
-            Kc -= self.K_fit_rows_
-            Kc -= K_pred_cols
-            Kc += self.K_fit_all_
+            K -= self.K_fit_rows_
+            K -= K_pred_cols
+            K += self.K_fit_all_
 
-            self.scale_ = np.trace(Kc) / Kc.shape[0]
+            self.scale_ = np.trace(K) / K.shape[0]
         else:
             self.scale_ = 1.0
 
@@ -408,7 +414,7 @@ def fit_transform(self, K, y=None, sample_weight=None, copy=True, **fit_params):
 
         return self.transform(K, copy)
 
 
-class SparseKernelCenterer(TransformerMixin, BaseEstimator):
+class SparseKernelCenterer(TransformerMixin):
     r"""Kernel centering method for sparse kernels, similar to
     KernelFlexibleCenterer.
diff --git a/src/skmatter/utils/_orthogonalizers.py b/src/skmatter/utils/_orthogonalizers.py
index 00a68949e..4bfba3739 100644
--- a/src/skmatter/utils/_orthogonalizers.py
+++ b/src/skmatter/utils/_orthogonalizers.py
@@ -56,9 +56,9 @@ def X_orthogonalizer(x1, c=None, x2=None, tol=1e-12, copy=False):
         if np.linalg.norm(col) < tol:
             warnings.warn("Column vector contains only zeros.", stacklevel=1)
         else:
-            col /= np.linalg.norm(col, axis=0)
+            col = np.divide(col, np.linalg.norm(col, axis=0))
 
-        xnew -= col @ (col.T @ xnew)
+        xnew -= (col @ (col.T @ xnew)).astype(xnew.dtype)
 
     return xnew
diff --git a/src/skmatter/utils/_pcovr_utils.py b/src/skmatter/utils/_pcovr_utils.py
index 515d5edfc..69ae2e394 100644
--- a/src/skmatter/utils/_pcovr_utils.py
+++ b/src/skmatter/utils/_pcovr_utils.py
@@ -186,7 +186,7 @@ def pcovr_covariance(
         C_Y = C_Y.reshape((C.shape[0], -1))
         C_Y = np.real(C_Y)
 
-        C += (1 - mixing) * C_Y @ C_Y.T
+        C += (1 - mixing) * np.array(C_Y @ C_Y.T, dtype=np.float64)
 
     if mixing > 0:
         C += (mixing) * (X.T @ X)
diff --git a/tests/test_check_estimators.py b/tests/test_check_estimators.py
new file mode 100644
index 000000000..f744a7a05
--- /dev/null
+++ b/tests/test_check_estimators.py
@@ -0,0 +1,26 @@
+from sklearn.utils.estimator_checks import parametrize_with_checks
+
+from skmatter.decomposition import KernelPCovR, PCovR
+from skmatter.feature_selection import CUR as fCUR
+from skmatter.feature_selection import FPS as fFPS
+from skmatter.feature_selection import PCovCUR as fPCovCUR
+from skmatter.feature_selection import PCovFPS as fPCovFPS
+from skmatter.linear_model import RidgeRegression2FoldCV  # OrthogonalRegression,
+from skmatter.preprocessing import KernelNormalizer, StandardFlexibleScaler
+
+
+@parametrize_with_checks(
+    [
+        KernelPCovR(mixing=0.5),
+        PCovR(mixing=0.5),
+        fCUR(),
+        fFPS(),
+        fPCovCUR(),
+        fPCovFPS(),
+        RidgeRegression2FoldCV(),
+        KernelNormalizer(),
+        StandardFlexibleScaler(),
+    ]
+)
+def test_sklearn_compatible_estimator(estimator, check):
+    check(estimator)
diff --git a/tests/test_feature_simple_cur.py b/tests/test_feature_simple_cur.py
index ba92facd5..72554471d 100644
--- a/tests/test_feature_simple_cur.py
+++ b/tests/test_feature_simple_cur.py
@@ -4,12 +4,13 @@
 from sklearn import exceptions
 
 from skmatter.datasets import load_csd_1000r as load
-from skmatter.feature_selection import CUR
+from skmatter.feature_selection import CUR, FPS
 
 
 class TestCUR(unittest.TestCase):
     def setUp(self):
         self.X, _ = load(return_X_y=True)
+        self.X = FPS(n_to_select=10).fit(self.X).transform(self.X)
 
     def test_bad_transform(self):
         selector = CUR(n_to_select=2)
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index fe83b71a8..3d77a8123 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -61,9 +61,11 @@ def test_bad_warm_start(self):
 
     def test_bad_y(self):
         self.X, self.Y = get_dataset(return_X_y=True)
+        Y = self.Y[:2]
+        print(self.X.shape, Y.shape)
         selector = GreedyTester(n_to_select=2)
         with self.assertRaises(ValueError):
-            selector.fit(X=self.X, y=self.Y[:2])
+            selector.fit(X=self.X, y=Y)
 
     def test_bad_transform(self):
         selector = GreedyTester(n_to_select=2)
@@ -130,6 +132,7 @@ def test_size_input(self):
         )
 
         X = X.reshape(1, -1)
+
         with self.assertRaises(ValueError) as cm:
             selector_sample.fit(X)
         self.assertEqual(
@@ -138,6 +141,7 @@ def test_size_input(self):
             "required.",
         )
 
+>>>>>>> cdefe67 (Edits re: alex)
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/tests/test_kernel_normalizer.py b/tests/test_kernel_normalizer.py
index 694b39d0a..d17ddf9f3 100644
--- a/tests/test_kernel_normalizer.py
+++ b/tests/test_kernel_normalizer.py
@@ -41,12 +41,6 @@ def test_invalid_sample_weights(self):
         with self.assertRaises(ValueError):
             model.fit_transform(K, sample_weight=wts_dim)
 
-    def test_NoInputs(self):
-        """Checks that fit cannot be called with zero inputs."""
-        model = KernelNormalizer()
-        with self.assertRaises(ValueError):
-            model.fit()
-
     def test_ValueError(self):
         """Checks that a non-square matrix cannot be normalized."""
         K = self.random_state.uniform(0, 100, size=(3, 4))
diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py
index 899cf73ce..050874c03 100644
--- a/tests/test_orthogonalizers.py
+++ b/tests/test_orthogonalizers.py
@@ -20,8 +20,8 @@ def __init__(self, *args, **kwargs):
         self.random_state = np.random.RandomState(0)
 
     def setUp(self):
-        self.n_samples = 100
-        self.n_features = 100
+        self.n_samples = 2
+        self.n_features = 4
 
     def test_null_column(self):
         # checks that the column passed to the orthogonalizer
@@ -117,6 +117,7 @@ def test_multiple_orthogonalizations(self):
             X_correlated = X_orthogonalizer(
                 X_correlated, x2=X_correlated[:, :n_uncorrelated]
             )
+            print(X_correlated)
 
             self.assertLessEqual(np.linalg.norm(X_correlated), EPSILON)
 
@@ -151,10 +152,12 @@ def test_copy(self):
             -1, 1, size=(self.n_samples, self.n_features)
         )
 
+        print(X_random)
         idx = self.random_state.choice(X_random.shape[-1])
 
         new_X = X_orthogonalizer(X_random, idx, tol=EPSILON, copy=True)
         X_orthogonalizer(X_random, idx, tol=EPSILON, copy=False)
+        print(new_X, X_random)
 
         self.assertTrue(np.allclose(X_random, new_X))
diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py
index 9e82c18c3..b3a9437e1 100644
--- a/tests/test_sample_simple_cur.py
+++ b/tests/test_sample_simple_cur.py
@@ -1,23 +1,33 @@
 import unittest
 
 import numpy as np
-from sklearn import exceptions
 from sklearn.datasets import fetch_california_housing as load
 
-from skmatter.sample_selection import CUR
+from skmatter.sample_selection import CUR, FPS
 
 
 class TestCUR(unittest.TestCase):
     def setUp(self):
         self.X, _ = load(return_X_y=True)
-        self.X = self.X[:1000]
+        self.X = self.X[FPS(n_to_select=100).fit(self.X).selected_idx_]
         self.n_select = min(20, min(self.X.shape) // 2)
 
-    def test_bad_transform(self):
-        selector = CUR(n_to_select=2)
-        with self.assertRaises(exceptions.NotFittedError):
+    def test_sample_transform(self):
+        """
+        This test checks that an error is raised when the transform function is used,
+        because sklearn does not support well transformers that change the number
+        of samples with other classes like Pipeline
+        """
+        selector = CUR(n_to_select=1)
+        selector.fit(self.X)
+        with self.assertRaises(ValueError) as error:
             _ = selector.transform(self.X)
+        self.assertTrue(
+            "Transform is not currently supported for sample selection."
+            == str(error.exception)
+        )
+
     def test_restart(self):
         """
         This test checks that the model can be restarted with a new instance
diff --git a/tests/test_standard_flexible_scaler.py b/tests/test_standard_flexible_scaler.py
index 5e5108a47..e1d6cc1f6 100644
--- a/tests/test_standard_flexible_scaler.py
+++ b/tests/test_standard_flexible_scaler.py
@@ -188,6 +188,14 @@ def test_ValueError_full(self):
         with self.assertRaises(ValueError):
             model.fit(X)
 
+    def test_not_w_mean(self):
+        """Checks that the matrix normalized `with_mean=False`
+        does not have a mean."""
+        X = np.array([2, 2, 3]).reshape(-1, 1)
+        model = StandardFlexibleScaler(with_mean=False)
+        model.fit(X)
+        self.assertTrue(np.allclose(model.mean_, 0))
+
 
 if __name__ == "__main__":
     unittest.main()
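The new `tests/test_check_estimators.py` above hooks these estimators into scikit-learn's API-compliance battery via `parametrize_with_checks`. The same checks can also be run outside pytest; a minimal sketch, assuming a scikit-learn version (0.24 or later) whose `check_estimator` accepts estimator instances:

    # Minimal sketch: run the sklearn estimator checks on one skmatter estimator.
    from sklearn.utils.estimator_checks import check_estimator

    from skmatter.preprocessing import StandardFlexibleScaler

    # Raises on the first failing API-compliance check; silent if all pass.
    check_estimator(StandardFlexibleScaler())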
From 3037924a56d3c37880c52a6f7e12a59021aeaaec Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 13:26:41 -0400
Subject: [PATCH 5/9] rebasing onto pr#197

---
 src/skmatter/_selection.py          | 26 ++++++++++++++----------
 src/skmatter/linear_model/_ridge.py |  1 -
 src/skmatter/preprocessing/_data.py |  1 -
 tests/test_greedy_selector.py       |  9 +++------
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 397e4043d..95795b22a 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -169,6 +169,11 @@ def fit(self, X, y=None, warm_start=False):
                 "You cannot specify both `score_threshold` and `full=True`."
             )
 
+        if self.progress_bar is True:
+            self.report_progress_ = get_progress_bar()
+        elif self.progress_bar is False:
+            self.report_progress_ = no_progress_bar
+
         n_to_select_from = X.shape[self._axis]
         self.n_samples_in_, self.n_features_in_ = X.shape
 
@@ -256,17 +261,18 @@ def transform(self, X, y=None):
 
         mask = self.get_support()
 
-        # note: we use _safe_tags instead of _get_tags because this is a
-        # public Mixin.
-        X = self._validate_data(
-            X,
-            dtype=None,
-            accept_sparse="csr",
-            force_all_finite=not _safe_tags(self, key="allow_nan"),
-            reset=False,
-            ensure_2d=self._axis,
-        )
+        X = check_array(X)
+
+        if len(X.shape) == 1:
+            if self.axis == 0:
+                X = X.reshape(-1, 1)
+            else:
+                X = X.reshape(1, -1)
 
+        if len(mask) != X.shape[self.axis]:
+            raise ValueError(
+                "X has a different shape than during fitting. Reshape your data."
+            )
         if self._axis == 1:
             return X[:, safe_mask(X, mask)]
         else:
diff --git a/src/skmatter/linear_model/_ridge.py b/src/skmatter/linear_model/_ridge.py
index 726c68028..3969b2a33 100644
--- a/src/skmatter/linear_model/_ridge.py
+++ b/src/skmatter/linear_model/_ridge.py
@@ -144,7 +144,6 @@ def fit(self, X, y):
             )
 
         X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
-
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         # check_scoring uses estimators scoring function if the scorer is None, this is
diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
index 97a1380a6..9e4651466 100644
--- a/src/skmatter/preprocessing/_data.py
+++ b/src/skmatter/preprocessing/_data.py
@@ -142,7 +142,6 @@ def fit(self, X, y=None, sample_weight=None):
             dtype=FLOAT_DTYPES,
             ensure_min_samples=2,
         )
-
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if sample_weight is not None:
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index 3d77a8123..3866764b5 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -74,8 +74,7 @@ def test_bad_transform(self):
             _ = selector.transform(self.X[:, :3])
         self.assertEqual(
             str(cm.exception),
-            "X has 3 features, but GreedyTester is expecting {} features"
-            " as input.".format(self.X.shape[1]),
+            "X has a different shape than during fitting. Reshape your data.",
         )
 
     def test_no_nfeatures(self):
@@ -122,13 +121,12 @@ def test_size_input(self):
         X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)
         selector_sample = GreedyTester(selection_type="sample")
         selector_feature = GreedyTester(selection_type="feature")
-
         with self.assertRaises(ValueError) as cm:
             selector_feature.fit(X)
         self.assertEqual(
             str(cm.exception),
-            f"Found array with 1 feature(s) (shape={X.shape}) while a minimum of 2 is "
-            "required.",
+            f"Found array with 1 feature(s) (shape={X.shape})"
+            " while a minimum of 2 is required.",
         )
 
         X = X.reshape(1, -1)
@@ -141,7 +139,6 @@ def test_size_input(self):
             "required.",
         )
 
->>>>>>> cdefe67 (Edits re: alex)
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

From bef5aac41d4277139ea50a076ef5881033ba8081 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 13:20:44 -0400
Subject: [PATCH 6/9] Initial commit to pass an estimator check. WIP

---
 src/skmatter/_selection.py          | 6 ------
 src/skmatter/preprocessing/_data.py | 1 +
 tests/test_greedy_selector.py       | 1 +
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 95795b22a..f2f5880a4 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -169,13 +169,7 @@ def fit(self, X, y=None, warm_start=False):
                 "You cannot specify both `score_threshold` and `full=True`."
             )
 
-        if self.progress_bar is True:
-            self.report_progress_ = get_progress_bar()
-        elif self.progress_bar is False:
-            self.report_progress_ = no_progress_bar
-
         n_to_select_from = X.shape[self._axis]
-        self.n_samples_in_, self.n_features_in_ = X.shape
 
         self.n_samples_in_, self.n_features_in_ = X.shape
 
diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
index 9e4651466..97a1380a6 100644
--- a/src/skmatter/preprocessing/_data.py
+++ b/src/skmatter/preprocessing/_data.py
@@ -142,6 +142,7 @@ def fit(self, X, y=None, sample_weight=None):
             dtype=FLOAT_DTYPES,
             ensure_min_samples=2,
         )
+
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if sample_weight is not None:
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index 3866764b5..667bf3060 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -139,6 +139,7 @@ def test_size_input(self):
             "required.",
         )
 
+>>>>>>> cdefe67 (Edits re: alex)
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)

From 0aa26994a41132c058c7a3f103317e8e32766455 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 13:26:41 -0400
Subject: [PATCH 7/9] rebasing onto pr#197

---
 src/skmatter/_selection.py          | 11 ++---------
 src/skmatter/preprocessing/_data.py |  1 -
 tests/test_greedy_selector.py       |  1 -
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index f2f5880a4..8aa87e392 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -142,14 +142,7 @@ def fit(self, X, y=None, warm_start=False):
         elif self.progress_bar is False:
             self.report_progress_ = no_progress_bar
 
-        params = dict(
-            accept_sparse="csc",
-            force_all_finite=not tags.get("allow_nan", True),
-        )
-        if self._axis == 1:
-            params["ensure_min_features"] = 2
-        else:
-            params["ensure_min_samples"] = 2
+        params = dict(ensure_min_samples=2, ensure_min_features=2, dtype=FLOAT_DTYPES)
 
         if hasattr(self, "mixing") or y is not None:
             self._validate_data(X, y, **params)
             X, y = check_X_y(X, y, multi_output=True)
@@ -256,7 +249,7 @@ def transform(self, X, y=None):
             else:
                 X = X.reshape(1, -1)
 
-        if len(mask) != X.shape[self.axis]:
+        if len(mask) != X.shape[self._axis]:
             raise ValueError(
                 "X has a different shape than during fitting. Reshape your data."
             )
diff --git a/src/skmatter/preprocessing/_data.py b/src/skmatter/preprocessing/_data.py
index 97a1380a6..9e4651466 100644
--- a/src/skmatter/preprocessing/_data.py
+++ b/src/skmatter/preprocessing/_data.py
@@ -142,7 +142,6 @@ def fit(self, X, y=None, sample_weight=None):
             dtype=FLOAT_DTYPES,
             ensure_min_samples=2,
         )
-
         self.n_samples_in_, self.n_features_in_ = X.shape
 
         if sample_weight is not None:
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index 667bf3060..3866764b5 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -139,7 +139,6 @@ def test_size_input(self):
             "required.",
         )
 
->>>>>>> cdefe67 (Edits re: alex)
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
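With the reworked `transform` validation above (an explicit mask-length check instead of `_validate_data`), a feature-count mismatch now fails with the new message, as the updated `test_bad_transform` expects; a hypothetical illustration using the public `FPS` feature selector (array shapes chosen only for the example):

    # Illustration of the patched transform validation.
    import numpy as np
    from skmatter.feature_selection import FPS

    X = np.arange(20.0).reshape(4, 5)
    selector = FPS(n_to_select=2).fit(X)
    try:
        selector.transform(X[:, :3])  # 3 columns, but the selector was fit on 5
    except ValueError as err:
        print(err)  # X has a different shape than during fitting. Reshape your data.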
From cb82cebd9b0f897cffdfd941fd44594f84414b57 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Tue, 30 May 2023 12:28:09 -0500
Subject: [PATCH 8/9] Apply suggestions from code review

Co-authored-by: Alexander Goscinski
---
 src/skmatter/_selection.py    | 6 ++++--
 tests/test_greedy_selector.py | 1 -
 tests/test_orthogonalizers.py | 2 --
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 8aa87e392..28b60efce 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -241,7 +241,9 @@ def transform(self, X, y=None):
             The selected subset of the input.
         """
 
-        if self.axis == 0:
+        check_is_fitted(self, ["_axis", "selected_idx_", "n_selected_"])
+
+        if self._axis == 0:
             raise ValueError(
                 "Transform is not currently supported for sample selection."
             )
@@ -251,7 +253,7 @@ def transform(self, X, y=None):
         X = check_array(X)
 
         if len(X.shape) == 1:
-            if self.axis == 0:
+            if self._axis == 0:
                 X = X.reshape(-1, 1)
             else:
                 X = X.reshape(1, -1)
diff --git a/tests/test_greedy_selector.py b/tests/test_greedy_selector.py
index 3866764b5..0bfe6de99 100644
--- a/tests/test_greedy_selector.py
+++ b/tests/test_greedy_selector.py
@@ -62,7 +62,6 @@ def test_bad_warm_start(self):
     def test_bad_y(self):
         self.X, self.Y = get_dataset(return_X_y=True)
         Y = self.Y[:2]
-        print(self.X.shape, Y.shape)
         selector = GreedyTester(n_to_select=2)
         with self.assertRaises(ValueError):
             selector.fit(X=self.X, y=Y)
diff --git a/tests/test_orthogonalizers.py b/tests/test_orthogonalizers.py
index 050874c03..0578141c8 100644
--- a/tests/test_orthogonalizers.py
+++ b/tests/test_orthogonalizers.py
@@ -152,12 +152,10 @@ def test_copy(self):
             -1, 1, size=(self.n_samples, self.n_features)
         )
 
-        print(X_random)
         idx = self.random_state.choice(X_random.shape[-1])
 
         new_X = X_orthogonalizer(X_random, idx, tol=EPSILON, copy=True)
         X_orthogonalizer(X_random, idx, tol=EPSILON, copy=False)
-        print(new_X, X_random)
 
         self.assertTrue(np.allclose(X_random, new_X))

From 9eb3b32ae55428a8774fb6ff7b8aa737533d9113 Mon Sep 17 00:00:00 2001
From: "Rose K. Cersonsky" <47536110+rosecers@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:47:46 -0500
Subject: [PATCH 9/9] Update src/skmatter/_selection.py

Co-authored-by: Alexander Goscinski
---
 src/skmatter/_selection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
index 28b60efce..867fc7b03 100644
--- a/src/skmatter/_selection.py
+++ b/src/skmatter/_selection.py
@@ -145,7 +145,7 @@ def fit(self, X, y=None, warm_start=False):
         params = dict(ensure_min_samples=2, ensure_min_features=2, dtype=FLOAT_DTYPES)
 
         if hasattr(self, "mixing") or y is not None:
-            self._validate_data(X, y, **params)
+            X, y = self._validate_data(X, y, **params)
             X, y = check_X_y(X, y, multi_output=True)
 
             if len(y.shape) == 1:
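End state of the series for sample selection: selectors fit as before, but `transform` is explicitly unsupported, mirroring the new `test_sample_transform`; a minimal sketch (random data used only for illustration):

    # Minimal sketch of the final behavior for sample selectors.
    import numpy as np
    from skmatter.sample_selection import CUR

    X = np.random.default_rng(0).normal(size=(20, 4))
    selector = CUR(n_to_select=1).fit(X)
    try:
        selector.transform(X)
    except ValueError as err:
        print(err)  # Transform is not currently supported for sample selection.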