Initial commit to pass an estimator check. WIP

scikit-learn-contrib · May 17, 2023 · af60553 · af60553
1 parent dbab1f2
commit af60553
Show file tree

Hide file tree

Showing 19 changed files with 355 additions and 249 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ __pycache__
 *.egg-info
 *.swp
 *.swo
+*DS_Store
 
 .tox/
 build/

diff --git a/src/skmatter/_selection.py b/src/skmatter/_selection.py
diff --git a/src/skmatter/decomposition/_kernel_pcovr.py b/src/skmatter/decomposition/_kernel_pcovr.py
@@ -220,7 +220,6 @@ def __init__(
         self.kernel_params = kernel_params
 
         self.n_jobs = n_jobs
-        self.n_samples_ = None
 
         self.fit_inverse_transform = fit_inverse_transform
 
@@ -308,17 +307,19 @@ def fit(self, X, Y, W=None):
 
         if self.n_components is None:
             if self.svd_solver != "arpack":
-                self.n_components = X.shape[0]
+                self.n_components_ = X.shape[0]
             else:
-                self.n_components = X.shape[0] - 1
+                self.n_components_ = X.shape[0] - 1
+        else:
+            self.n_components_ = self.n_components
 
         K = self._get_kernel(X)
 
         if self.center:
             self.centerer_ = KernelNormalizer()
             K = self.centerer_.fit_transform(K)
 
-        self.n_samples_ = X.shape[0]
+        self.n_samples_in_, self.n_features_in_ = X.shape
 
         if self.regressor != "precomputed":
             if self.regressor is None:
@@ -362,7 +363,7 @@ def fit(self, X, Y, W=None):
             # to avoid needing to compute the kernel a second time
             self.regressor_ = check_krr_fit(regressor, K, X, Y)
 
-            W = self.regressor_.dual_coef_.reshape(X.shape[0], -1)
+            W = self.regressor_.dual_coef_.reshape(self.n_samples_in_, -1)
 
             # Use this instead of `self.regressor_.predict(K)`
             # so that we can handle the case of the pre-fitted regressor
@@ -387,12 +388,17 @@ def fit(self, X, Y, W=None):
         # Handle svd_solver
         self._fit_svd_solver = self.svd_solver
         if self._fit_svd_solver == "auto":
-            # Small problem or self.n_components == 'mle', just call full PCA
-            if max(X.shape) <= 500 or self.n_components == "mle":
+            # Small problem or self.n_components_ == 'mle', just call full PCA
+            if (
+                max(self.n_samples_in_, self.n_features_in_) <= 500
+                or self.n_components_ == "mle"
+            ):
                 self._fit_svd_solver = "full"
-            elif self.n_components >= 1 and self.n_components < 0.8 * min(X.shape):
+            elif self.n_components_ >= 1 and self.n_components_ < 0.8 * max(
+                self.n_samples_in_, self.n_features_in_
+            ):
                 self._fit_svd_solver = "randomized"
-            # This is also the case of self.n_components in (0,1)
+            # This is also the case of self.n_components_ in (0,1)
             else:
                 self._fit_svd_solver = "full"
 
@@ -536,31 +542,31 @@ def score(self, X, Y):
         return -sum([Lkpca, Lkrr])
 
     def _decompose_truncated(self, mat):
-        if not 1 <= self.n_components <= self.n_samples_:
+        if not 1 <= self.n_components_ <= self.n_samples_in_:
             raise ValueError(
                 "n_components=%r must be between 1 and "
                 "n_samples=%r with "
                 "svd_solver='%s'"
                 % (
-                    self.n_components,
-                    self.n_samples_,
+                    self.n_components_,
+                    self.n_samples_in_,
                     self.svd_solver,
                 )
             )
-        elif not isinstance(self.n_components, numbers.Integral):
+        elif not isinstance(self.n_components_, numbers.Integral):
             raise ValueError(
                 "n_components=%r must be of type int "
                 "when greater than or equal to 1, was of type=%r"
-                % (self.n_components, type(self.n_components))
+                % (self.n_components_, type(self.n_components_))
             )
-        elif self.svd_solver == "arpack" and self.n_components == self.n_samples_:
+        elif self.svd_solver == "arpack" and self.n_components_ == self.n_samples_in_:
             raise ValueError(
                 "n_components=%r must be strictly less than "
                 "n_samples=%r with "
                 "svd_solver='%s'"
                 % (
-                    self.n_components,
-                    self.n_samples_,
+                    self.n_components_,
+                    self.n_samples_in_,
                     self.svd_solver,
                 )
             )
@@ -569,7 +575,7 @@ def _decompose_truncated(self, mat):
 
         if self._fit_svd_solver == "arpack":
             v0 = _init_arpack_v0(min(mat.shape), random_state)
-            U, S, Vt = svds(mat, k=self.n_components, tol=self.tol, v0=v0)
+            U, S, Vt = svds(mat, k=self.n_components_, tol=self.tol, v0=v0)
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
             S = S[::-1]
@@ -581,7 +587,7 @@ def _decompose_truncated(self, mat):
             # sign flipping is done inside
             U, S, Vt = randomized_svd(
                 mat,
-                n_components=self.n_components,
+                n_components=self.n_components_,
                 n_iter=self.iterated_power,
                 flip_sign=True,
                 random_state=random_state,
@@ -594,24 +600,25 @@ def _decompose_truncated(self, mat):
         return U, S, Vt
 
     def _decompose_full(self, mat):
-        if self.n_components != "mle":
-            if not (0 <= self.n_components <= self.n_samples_):
+        if self.n_components_ != "mle":
+            if not (0 <= self.n_components_ <= self.n_samples_in_):
                 raise ValueError(
                     "n_components=%r must be between 1 and "
                     "n_samples=%r with "
                     "svd_solver='%s'"
                     % (
-                        self.n_components,
-                        self.n_samples_,
+                        self.n_components_,
+                        self.n_samples_in_,
                         self.svd_solver,
                     )
                 )
-            elif self.n_components >= 1:
-                if not isinstance(self.n_components, numbers.Integral):
+            elif self.n_components_ >= 1:
+                if not isinstance(self.n_components_, numbers.Integral):
                     raise ValueError(
                         "n_components=%r must be of type int "
                         "when greater than or equal to 1, "
-                        "was of type=%r" % (self.n_components, type(self.n_components))
+                        "was of type=%r"
+                        % (self.n_components_, type(self.n_components_))
                     )
 
         U, S, Vt = linalg.svd(mat, full_matrices=False)
@@ -623,26 +630,28 @@ def _decompose_full(self, mat):
         U, Vt = svd_flip(U, Vt)
 
         # Get variance explained by singular values
-        explained_variance_ = (S**2) / (self.n_samples_ - 1)
+        explained_variance_ = (S**2) / (self.n_samples_in_ - 1)
         total_var = explained_variance_.sum()
         explained_variance_ratio_ = explained_variance_ / total_var
 
         # Postprocess the number of components required
-        if self.n_components == "mle":
-            self.n_components = _infer_dimension(explained_variance_, self.n_samples_)
-        elif 0 < self.n_components < 1.0:
+        if self.n_components_ == "mle":
+            self.n_components_ = _infer_dimension(
+                explained_variance_, self.n_samples_in_
+            )
+        elif 0 < self.n_components_ < 1.0:
             # number of components for which the cumulated explained
             # variance percentage is superior to the desired threshold
             # side='right' ensures that number of features selected
-            # their variance is always greater than self.n_components float
+            # their variance is always greater than self.n_components_ float
             # passed. More discussion in issue: #15669
             ratio_cumsum = stable_cumsum(explained_variance_ratio_)
-            self.n_components = (
-                np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1
+            self.n_components_ = (
+                np.searchsorted(ratio_cumsum, self.n_components_, side="right") + 1
             )
-        self.n_components = self.n_components
+
         return (
-            U[:, : self.n_components],
-            S[: self.n_components],
-            Vt[: self.n_components],
+            U[:, : self.n_components_],
+            S[: self.n_components_],
+            Vt[: self.n_components_],
         )
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ __pycache__
 *.egg-info
 *.swp
 *.swo
+*DS_Store
 
 .tox/
 build/