scikit-learn-contrib · rosecers · May 20, 2022 · Nov 4, 2021 · May 16, 2022
diff --git a/skcosmo/datasets/data/csd-1000r.npz b/skcosmo/datasets/data/csd-1000r.npz
diff --git a/skcosmo/datasets/descr/csd-1000r.rst b/skcosmo/datasets/descr/csd-1000r.rst
@@ -46,98 +46,10 @@ Data Set Characteristics
 References
 ----------
 
-   .. [C1] https://github.com/lab-cosmo/librascal commit e78f1c69
-   .. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 64e789f
+   .. [C1] https://github.com/lab-cosmo/librascal commit ade202a6
+   .. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 4ed1d92
 
 Reference Code
 --------------
 
-.. code-block:: python
-
-  import numpy as np
-  from ase.io import read
-
-  from skcosmo.feature_selection import SimpleFPS
-  from rascal.representations import SphericalInvariants as SOAP
-
-
-  # read all of the frames and book-keep the centers and species
-  filename = "/path/to/CSD-1000r.xyz"
-  frames = np.asarray(
-      read(filename, ":"),
-      dtype=object,
-  )
-
-  n_centers = [len(frame) for frame in frames]
-  n_env_accum = [sum(n_centers[: i + 1]) for i in range(len(n_centers))]
-  n_env = sum(n_centers)
-
-  numbers = np.concatenate([frame.numbers for frame in frames])
-  number_loc = np.array([np.where(numbers == i)[0] for i in [1, 6, 7, 8]], dtype=object)
-
-
-  # compute radial soap vectors as first pass
-  hypers = dict(
-      soap_type="PowerSpectrum",
-      interaction_cutoff=3.5,
-      max_radial=6,
-      max_angular=0,
-      gaussian_sigma_type="Constant",
-      gaussian_sigma_constant=0.4,
-      cutoff_smooth_width=0.5,
-  )
-  soap = SOAP(**hypers)
-  X_raw = soap.transform(frames).get_features(soap)
-
-
-  # select 100 diverse samples
-  i_selected = SimpleFPS(n_features_to_select=100).fit(X_raw.T).selected_idx_
-
-  # book-keep which frames these samples belong in
-  frames_select = [np.where(n_env_accum > i)[0][0] for i in i_selected]
-  reduced_frames_select = list(sorted(set(frames_select)))
-
-  properties_select = [
-      frame.arrays["CS_local"] for frame in frames[reduced_frames_select]
-  ]
-
-  n_centers_select = [len(frame) for frame in frames[reduced_frames_select]]
-  n_env_accum_select = [
-      sum(n_centers_select[: i + 1]) for i in range(len(n_centers_select))
-  ]
-  n_env_select = sum(n_centers_select)
-
-
-  # compute a larger power spectrum for these frames
-  hypers["max_angular"] = 6
-  soap_select = SOAP(**hypers)
-  X_raw_select = soap_select.transform(frames[reduced_frames_select]).get_features(
-      soap_select
-  )
-
-
-  # pull the soap vectors only pertaining to the selected environments
-  i_select_reduced = []
-  properties_select_reduced = np.zeros(len(i_selected), dtype=float)
-  for i in range(len(i_selected)):
-      my_orig_frame = frames_select[i]
-      my_frame = reduced_frames_select.index(my_orig_frame)
-      if my_orig_frame != 0:
-          orig_loc = i_selected[i] - n_env_accum[my_orig_frame - 1]
-          new_loc = orig_loc + n_env_accum_select[my_frame - 1]
-      else:
-          orig_loc = i_selected[i]
-          new_loc = i_selected[i]
-      i_select_reduced.append(new_loc)
-      properties_select_reduced[i] = frames[my_orig_frame].arrays["CS_local"][orig_loc]
-
-  X_sample_select = X_raw_select[i_select_reduced]
-
-
-  # select 100 / 2520 soap features
-  n_select = 100
-  X_select = SimpleFPS(n_features_to_select=n_select).fit_transform(X_sample_select)
-  Y_select = properties_select_reduced.reshape(-1, 1)
-
-  data = dict(X=X_select, Y=Y_select)
-  np.savez("skcosmo/datasets/data/csd-1000r.npz", **data)
+.. literalinclude:: /../../skcosmo/datasets/make_csd_1000r.py
diff --git a/skcosmo/datasets/make_csd_1000r.py b/skcosmo/datasets/make_csd_1000r.py
@@ -0,0 +1,80 @@
+import numpy as np
+from ase.io import read
+from rascal.representations import SphericalInvariants as SOAP
+
+from skcosmo.feature_selection import CUR
+from skcosmo.preprocessing import StandardFlexibleScaler
+from skcosmo.sample_selection import FPS
+
+# read all of the frames and book-keep the centers and species
+filename = "/path/to/CSD-1000R.xyz"
+frames = np.asarray(
+    read(filename, ":"),
+    dtype=object,
+)
+
+n_centers = np.array([len(frame) for frame in frames])
+center_idx = np.array([i for i, f in enumerate(frames) for p in f])
+n_env_accum = np.zeros(len(frames) + 1, dtype=int)
+n_env_accum[1:] = np.cumsum(n_centers)
+
+numbers = np.concatenate([frame.numbers for frame in frames])
+
+# compute radial soap vectors as first pass
+hypers = dict(
+    soap_type="PowerSpectrum",
+    interaction_cutoff=2.5,
+    max_radial=6,
+    max_angular=0,
+    gaussian_sigma_type="Constant",
+    gaussian_sigma_constant=0.4,
+    cutoff_smooth_width=0.5,
+    normalize=False,
+    global_species=[1, 6, 7, 8],
+    expansion_by_species_method="user defined",
+)
+soap = SOAP(**hypers)
+
+X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(
+    soap.transform(frames).get_features(soap)
+)
+
+# rank the environments in terms of diversity
+n_samples = 500
+i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_
+
+# book-keep which frames these samples belong in
+f_selected = center_idx[i_selected]
+reduced_f_selected = list(sorted(set(f_selected)))
+frames_selected = frames[f_selected].copy()
+ci_selected = i_selected - n_env_accum[f_selected]
+
+properties_select = [
+    frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected)
+]
+
+# mask other environments in the frames so that SOAP vectors
+# will not be computed for other environments on next pass
+for frame, ci, fi in zip(frames_selected, ci_selected, f_selected):
+    frame.arrays["center_atoms_mask"] = np.zeros(len(frame), dtype=bool)
+    frame.arrays["center_atoms_mask"][ci] = True
+
+# compute a larger power spectrum for these frames
+hypers["max_angular"] = 6
+soap_select = SOAP(**hypers)
+X_sample_select = StandardFlexibleScaler(column_wise=False).fit_transform(
+    soap_select.transform(frames_selected).get_features(soap_select)
+)
+X_sample_select.shape
+
+# select 100 / 2520 soap features
+n_select = 100
+X_select = CUR(n_to_select=n_select).fit_transform(X_sample_select)
+Y_select = np.array(properties_select).reshape(-1, 1)
+
+data = dict(
+    X=X_select,
+    Y=Y_select,
+    original_mapping=[(fi, ci) for fi, ci in zip(f_selected, ci_selected)],
+)
+np.savez("./skcosmo/datasets/data/csd-1000r.npz", **data, size=(n_samples, n_select))
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -29,11 +29,8 @@ def setUpClass(cls):
         cls.csd = load_csd_1000r()
 
     def test_load_csd_1000r_shape(self):
-        # test if representations have correct shape
-        self.assertTrue(self.csd.data.X.shape == (100, 100))
-
-    def test_load_csd_1000r_prop_shape(self):
-        self.assertTrue(self.csd.data.y.shape == (100, 1))
+        # test if representations and properties have commensurate shape
+        self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0])
 
     def test_load_csd_1000r_access_descr(self):
         self.csd.DESCR

diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py
@@ -3,14 +3,15 @@
 import numpy as np
 from sklearn import exceptions
 
-from skcosmo.datasets import load_csd_1000r as load
+from sklearn.datasets import fetch_california_housing as load
 from skcosmo.sample_selection import CUR
 
 
 class TestCUR(unittest.TestCase):
     def setUp(self):
-        self.X, self.y = load(return_X_y=True)
-        self.n_select = min(self.X.shape) // 2
+        self.X, _ = load(return_X_y=True)
+        self.X = self.X[:1000]
+        self.n_select = min(20, min(self.X.shape) // 2)
 
     def test_bad_transform(self):
         selector = CUR(n_to_select=2)