Replacing csd in sample selection with california because the larger dataset means a few redundant samples, which is not good for stable testing
scikit-learn-contrib · May 16, 2022 · 0a86d48 · 0a86d48
1 parent f6278b6
commit 0a86d48
Show file tree

Hide file tree

Showing 11 changed files with 55 additions and 161 deletions.
diff --git a/skcosmo/_selection.py b/skcosmo/_selection.py
@@ -333,18 +333,12 @@ def _continue_greedy_search(self, X, y, n_to_select):
         n_pad[self._axis] = (0, n_to_select - self.n_selected_)
 
         self.X_selected_ = np.pad(
-            self.X_selected_,
-            n_pad,
-            "constant",
-            constant_values=0.0,
+            self.X_selected_, n_pad, "constant", constant_values=0.0,
         )
 
         if hasattr(self, "y_selected_"):
             self.y_selected_ = np.pad(
-                self.y_selected_,
-                n_pad,
-                "constant",
-                constant_values=0.0,
+                self.y_selected_, n_pad, "constant", constant_values=0.0,
             )
 
         old_idx = self.selected_idx_.copy()
@@ -775,18 +769,10 @@ def _compute_pi(self, X, y=None):
         """
 
         if self._axis == 0:
-            pcovr_distance = pcovr_kernel(
-                self.mixing,
-                X,
-                y,
-            )
+            pcovr_distance = pcovr_kernel(self.mixing, X, y,)
         else:
             pcovr_distance = pcovr_covariance(
-                self.mixing,
-                X,
-                y,
-                rcond=1e-12,
-                rank=None,
+                self.mixing, X, y, rcond=1e-12, rank=None,
             )
 
         if self.k < pcovr_distance.shape[0] - 1:

diff --git a/skcosmo/datasets/_base.py b/skcosmo/datasets/_base.py
@@ -56,10 +56,7 @@ def load_csd_1000r(return_X_y=False):
     target_filename = join(module_path, "data", "csd-1000r.npz")
     raw_data = np.load(target_filename)
     if not return_X_y:
-        data = Bunch(
-            X=raw_data["X"],
-            y=raw_data["Y"],
-        )
+        data = Bunch(X=raw_data["X"], y=raw_data["Y"],)
         with open(join(module_path, "descr", "csd-1000r.rst")) as rst_file:
             fdescr = rst_file.read()
 

diff --git a/skcosmo/datasets/make_csd_1000r.py b/skcosmo/datasets/make_csd_1000r.py
@@ -8,10 +8,7 @@
 
 # read all of the frames and book-keep the centers and species
 filename = "/path/to/CSD-1000R.xyz"
-frames = np.asarray(
-    read(filename, ":"),
-    dtype=object,
-)
+frames = np.asarray(read(filename, ":"), dtype=object,)
 
 n_centers = np.array([len(frame) for frame in frames])
 center_idx = np.array([i for i, f in enumerate(frames) for p in f])

diff --git a/skcosmo/decomposition/_kernel_pcovr.py b/skcosmo/decomposition/_kernel_pcovr.py
@@ -541,11 +541,7 @@ def _decompose_truncated(self, mat):
                 "n_components=%r must be between 1 and "
                 "n_samples=%r with "
                 "svd_solver='%s'"
-                % (
-                    self.n_components,
-                    self.n_samples_,
-                    self.svd_solver,
-                )
+                % (self.n_components, self.n_samples_, self.svd_solver,)
             )
         elif not isinstance(self.n_components, numbers.Integral):
             raise ValueError(
@@ -558,11 +554,7 @@ def _decompose_truncated(self, mat):
                 "n_components=%r must be strictly less than "
                 "n_samples=%r with "
                 "svd_solver='%s'"
-                % (
-                    self.n_components,
-                    self.n_samples_,
-                    self.svd_solver,
-                )
+                % (self.n_components, self.n_samples_, self.svd_solver,)
             )
 
         random_state = check_random_state(self.random_state)
@@ -601,11 +593,7 @@ def _decompose_full(self, mat):
                     "n_components=%r must be between 1 and "
                     "n_samples=%r with "
                     "svd_solver='%s'"
-                    % (
-                        self.n_components,
-                        self.n_samples_,
-                        self.svd_solver,
-                    )
+                    % (self.n_components, self.n_samples_, self.svd_solver,)
                 )
             elif self.n_components >= 1:
                 if not isinstance(self.n_components, numbers.Integral):

diff --git a/skcosmo/decomposition/_pcovr.py b/skcosmo/decomposition/_pcovr.py
@@ -333,12 +333,8 @@ def fit(self, X, Y):
 
         self.pxy_ = self.pxt_ @ self.pty_
         if len(Y.shape) == 1:
-            self.pxy_ = self.pxy_.reshape(
-                X.shape[1],
-            )
-            self.pty_ = self.pty_.reshape(
-                self.n_components,
-            )
+            self.pxy_ = self.pxy_.reshape(X.shape[1],)
+            self.pty_ = self.pty_.reshape(self.n_components,)
 
         self.components_ = self.pxt_.T  # for sklearn compatibility
         return self
@@ -379,11 +375,7 @@ def _fit_feature_space(self, X, Y, Yhat):
         """
 
         Ct, iCsqrt = pcovr_covariance(
-            mixing=self.mixing,
-            X=X,
-            Y=Yhat,
-            rcond=self.tol,
-            return_isqrt=True,
+            mixing=self.mixing, X=X, Y=Yhat, rcond=self.tol, return_isqrt=True,
         )
         try:
             Csqrt = np.linalg.lstsq(iCsqrt, np.eye(len(iCsqrt)), rcond=None)[0]

diff --git a/skcosmo/metrics/_reconstruction_measures.py b/skcosmo/metrics/_reconstruction_measures.py
@@ -13,12 +13,7 @@
 
 
 def pointwise_global_reconstruction_error(
-    X,
-    Y,
-    train_idx=None,
-    test_idx=None,
-    scaler=None,
-    estimator=None,
+    X, Y, train_idx=None, test_idx=None, scaler=None, estimator=None,
 ):
     """Computes the pointwise global reconstruction error using the source X
     to reconstruct the features or samples of target Y based on a minimization
@@ -102,12 +97,7 @@ def pointwise_global_reconstruction_error(
 
 
 def global_reconstruction_error(
-    X,
-    Y,
-    test_idx=None,
-    train_idx=None,
-    scaler=None,
-    estimator=None,
+    X, Y, test_idx=None, train_idx=None, scaler=None, estimator=None,
 ):
     """Computes the global reconstruction error using the source X
     to reconstruct the features or samples of target Y based on a minimization
@@ -163,28 +153,21 @@ def global_reconstruction_error(
         The global reconstruction error
 
     """
-    pointwise_global_reconstruction_error_values = (
-        pointwise_global_reconstruction_error(
-            X,
-            Y,
-            train_idx=train_idx,
-            test_idx=test_idx,
-            scaler=scaler,
-            estimator=estimator,
-        )
+    pointwise_global_reconstruction_error_values = pointwise_global_reconstruction_error(
+        X,
+        Y,
+        train_idx=train_idx,
+        test_idx=test_idx,
+        scaler=scaler,
+        estimator=estimator,
     )
     return np.linalg.norm(pointwise_global_reconstruction_error_values) / np.sqrt(
         len(pointwise_global_reconstruction_error_values)
     )
 
 
 def pointwise_global_reconstruction_distortion(
-    X,
-    Y,
-    test_idx=None,
-    train_idx=None,
-    scaler=None,
-    estimator=None,
+    X, Y, test_idx=None, train_idx=None, scaler=None, estimator=None,
 ):
     """Computes the pointwise global reconstruction distortion using the source X
     to reconstruct the features or samples of target Y based on a minimization
@@ -273,12 +256,7 @@ def pointwise_global_reconstruction_distortion(
 
 
 def global_reconstruction_distortion(
-    X,
-    Y,
-    test_idx=None,
-    train_idx=None,
-    scaler=None,
-    estimator=None,
+    X, Y, test_idx=None, train_idx=None, scaler=None, estimator=None,
 ):
     """Computes the global reconstruction distortion using the source X
     to reconstruct the features or samples of target Y based on a minimization
@@ -334,15 +312,13 @@ def global_reconstruction_distortion(
         The global reconstruction distortion
 
     """
-    pointwise_global_reconstruction_distortion_values = (
-        pointwise_global_reconstruction_distortion(
-            X,
-            Y,
-            train_idx=train_idx,
-            test_idx=test_idx,
-            scaler=scaler,
-            estimator=estimator,
-        )
+    pointwise_global_reconstruction_distortion_values = pointwise_global_reconstruction_distortion(
+        X,
+        Y,
+        train_idx=train_idx,
+        test_idx=test_idx,
+        scaler=scaler,
+        estimator=estimator,
     )
     return np.linalg.norm(pointwise_global_reconstruction_distortion_values) / np.sqrt(
         len(pointwise_global_reconstruction_distortion_values)
@@ -461,8 +437,7 @@ def local_reconstruction_error_i(i):
         local_Y_train_mean = np.mean(Y_train[local_env_idx], axis=0)
         # P_{FF'}
         estimator.fit(
-            local_X_train - local_X_train_mean,
-            local_Y_train - local_Y_train_mean,
+            local_X_train - local_X_train_mean, local_Y_train - local_Y_train_mean,
         )
         # \tilde{x}_i' = \bar{x}_{F'} + (x_i - \bar{x}_F)P_{FF'}
         tilde_x_i_dash_test = local_Y_train_mean + estimator.predict(

diff --git a/tests/test_kernel_pcovr.py b/tests/test_kernel_pcovr.py
@@ -248,8 +248,7 @@ def test_incompatible_regressor(self):
         with self.assertRaises(ValueError) as cm:
             kpcovr.fit(self.X, self.Y)
             self.assertTrue(
-                str(cm.message),
-                "Regressor must be an instance of `KernelRidge`",
+                str(cm.message), "Regressor must be an instance of `KernelRidge`",
             )
 
     def test_none_regressor(self):
@@ -328,10 +327,7 @@ def test_linear_matches_pcovr(self):
         ridge.fit(self.X, self.Y)
 
         # common instantiation parameters for the two models
-        hypers = dict(
-            mixing=0.5,
-            n_components=1,
-        )
+        hypers = dict(mixing=0.5, n_components=1,)
 
         # computing projection and predicton loss with linear KernelPCovR
         # and use the alpha from RidgeCV for level regression comparisons
@@ -368,13 +364,11 @@ def test_linear_matches_pcovr(self):
 
         rounding = 3
         self.assertEqual(
-            round(ly, rounding),
-            round(ly_ref, rounding),
+            round(ly, rounding), round(ly_ref, rounding),
         )
 
         self.assertEqual(
-            round(lk, rounding),
-            round(lk_ref, rounding),
+            round(lk, rounding), round(lk_ref, rounding),
         )
 
 
@@ -441,11 +435,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be between 0 and "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        kpcovr.n_components,
-                        self.X.shape[0],
-                        kpcovr.svd_solver,
-                    ),
+                    % (kpcovr.n_components, self.X.shape[0], kpcovr.svd_solver,),
                 )
         with self.subTest(type="0_ncomponents"):
             with self.assertRaises(ValueError) as cm:
@@ -457,11 +447,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be between 1 and "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        kpcovr.n_components,
-                        self.X.shape[0],
-                        kpcovr.svd_solver,
-                    ),
+                    % (kpcovr.n_components, self.X.shape[0], kpcovr.svd_solver,),
                 )
         with self.subTest(type="arpack_X_ncomponents"):
             with self.assertRaises(ValueError) as cm:
@@ -472,11 +458,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be strictly less than "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        kpcovr.n_components,
-                        self.X.shape[0],
-                        kpcovr.svd_solver,
-                    ),
+                    % (kpcovr.n_components, self.X.shape[0], kpcovr.svd_solver,),
                 )
 
         for svd_solver in ["auto", "full"]:

diff --git a/tests/test_linear_model.py b/tests/test_linear_model.py
@@ -111,16 +111,16 @@ def setUpClass(cls):
     def test_ridge_regression_2fold_regularization_method_raise_error(self):
         # tests if wrong regularization_method in RidgeRegression2FoldCV raises error
         with self.assertRaises(ValueError):
-            RidgeRegression2FoldCV(
-                regularization_method="dummy",
-            ).fit(self.features_small, self.features_small)
+            RidgeRegression2FoldCV(regularization_method="dummy",).fit(
+                self.features_small, self.features_small
+            )
 
     def test_ridge_regression_2fold_alpha_type_raise_error(self):
         # tests if wrong alpha type in RidgeRegression2FoldCV raises error
         with self.assertRaises(ValueError):
-            RidgeRegression2FoldCV(
-                alpha_type="dummy",
-            ).fit(self.features_small, self.features_small)
+            RidgeRegression2FoldCV(alpha_type="dummy",).fit(
+                self.features_small, self.features_small
+            )
 
     def test_ridge_regression_2fold_relative_alpha_type_raise_error(self):
         # tests if an error is raised if alpha not in [0,1)
@@ -179,8 +179,7 @@ def test_ridge_regression_2fold_cv_small_to_large(
             .predict(self.features_small)
         )
         self.assertTrue(
-            abs(err) < self.eps,
-            f"error {err} surpasses threshold for zero {self.eps}",
+            abs(err) < self.eps, f"error {err} surpasses threshold for zero {self.eps}",
         )
 
     @parameterized.expand(ridge_parameters)

diff --git a/tests/test_pcovr.py b/tests/test_pcovr.py
@@ -319,11 +319,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be between 0 and "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        pcovr.n_components,
-                        min(self.X.shape),
-                        pcovr.svd_solver,
-                    ),
+                    % (pcovr.n_components, min(self.X.shape), pcovr.svd_solver,),
                 )
         with self.subTest(type="0_ncomponents"):
             with self.assertRaises(ValueError) as cm:
@@ -335,11 +331,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be between 1 and "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        pcovr.n_components,
-                        min(self.X.shape),
-                        pcovr.svd_solver,
-                    ),
+                    % (pcovr.n_components, min(self.X.shape), pcovr.svd_solver,),
                 )
         with self.subTest(type="arpack_X_ncomponents"):
             with self.assertRaises(ValueError) as cm:
@@ -350,11 +342,7 @@ def test_bad_n_components(self):
                     "self.n_components=%r must be strictly less than "
                     "min(n_samples, n_features)=%r with "
                     "svd_solver='%s'"
-                    % (
-                        pcovr.n_components,
-                        min(self.X.shape),
-                        pcovr.svd_solver,
-                    ),
+                    % (pcovr.n_components, min(self.X.shape), pcovr.svd_solver,),
                 )
 
         for svd_solver in ["auto", "full"]: