diff --git a/examples/Selectors-Pipelines.ipynb b/examples/Selectors-Pipelines.ipynb index 9f2b0a01c..2fc0acc2d 100644 --- a/examples/Selectors-Pipelines.ipynb +++ b/examples/Selectors-Pipelines.ipynb @@ -154,40 +154,6 @@ "plt.ylabel('Predicted Values')\n", "plt.show()" ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2021-11-29T15:19:18.681434Z", - "iopub.status.busy": "2021-11-29T15:19:18.681006Z", - "iopub.status.idle": "2021-11-29T15:19:18.682770Z", - "shell.execute_reply": "2021-11-29T15:19:18.683119Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(442, 10)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/skcosmo/datasets/data/csd-1000r.npz b/skcosmo/datasets/data/csd-1000r.npz index 67be8d31e..0ca638738 100644 Binary files a/skcosmo/datasets/data/csd-1000r.npz and b/skcosmo/datasets/data/csd-1000r.npz differ diff --git a/skcosmo/datasets/descr/csd-1000r.rst b/skcosmo/datasets/descr/csd-1000r.rst index edbfb1f2c..7011e783b 100644 --- a/skcosmo/datasets/descr/csd-1000r.rst +++ b/skcosmo/datasets/descr/csd-1000r.rst @@ -46,98 +46,10 @@ Data Set Characteristics References ---------- - .. [C1] https://github.com/lab-cosmo/librascal commit e78f1c69 - .. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 64e789f + .. [C1] https://github.com/lab-cosmo/librascal commit ade202a6 + .. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 4ed1d92 Reference Code -------------- -.. code-block:: python - - import numpy as np - from ase.io import read - - from skcosmo.feature_selection import SimpleFPS - from rascal.representations import SphericalInvariants as SOAP - - - # read all of the frames and book-keep the centers and species - filename = "/path/to/CSD-1000r.xyz" - frames = np.asarray( - read(filename, ":"), - dtype=object, - ) - - n_centers = [len(frame) for frame in frames] - n_env_accum = [sum(n_centers[: i + 1]) for i in range(len(n_centers))] - n_env = sum(n_centers) - - numbers = np.concatenate([frame.numbers for frame in frames]) - number_loc = np.array([np.where(numbers == i)[0] for i in [1, 6, 7, 8]], dtype=object) - - - # compute radial soap vectors as first pass - hypers = dict( - soap_type="PowerSpectrum", - interaction_cutoff=3.5, - max_radial=6, - max_angular=0, - gaussian_sigma_type="Constant", - gaussian_sigma_constant=0.4, - cutoff_smooth_width=0.5, - ) - soap = SOAP(**hypers) - X_raw = soap.transform(frames).get_features(soap) - - - # select 100 diverse samples - i_selected = SimpleFPS(n_features_to_select=100).fit(X_raw.T).selected_idx_ - - # book-keep which frames these samples belong in - frames_select = [np.where(n_env_accum > i)[0][0] for i in i_selected] - reduced_frames_select = list(sorted(set(frames_select))) - - properties_select = [ - frame.arrays["CS_local"] for frame in frames[reduced_frames_select] - ] - - n_centers_select = [len(frame) for frame in frames[reduced_frames_select]] - n_env_accum_select = [ - sum(n_centers_select[: i + 1]) for i in range(len(n_centers_select)) - ] - n_env_select = sum(n_centers_select) - - - # compute a larger power spectrum for these frames - hypers["max_angular"] = 6 - soap_select = SOAP(**hypers) - X_raw_select = soap_select.transform(frames[reduced_frames_select]).get_features( - soap_select - ) - - - # pull the soap vectors only pertaining to the selected environments - i_select_reduced = [] - properties_select_reduced = np.zeros(len(i_selected), dtype=float) - for i in range(len(i_selected)): - my_orig_frame = frames_select[i] - my_frame = reduced_frames_select.index(my_orig_frame) - if my_orig_frame != 0: - orig_loc = i_selected[i] - n_env_accum[my_orig_frame - 1] - new_loc = orig_loc + n_env_accum_select[my_frame - 1] - else: - orig_loc = i_selected[i] - new_loc = i_selected[i] - i_select_reduced.append(new_loc) - properties_select_reduced[i] = frames[my_orig_frame].arrays["CS_local"][orig_loc] - - X_sample_select = X_raw_select[i_select_reduced] - - - # select 100 / 2520 soap features - n_select = 100 - X_select = SimpleFPS(n_features_to_select=n_select).fit_transform(X_sample_select) - Y_select = properties_select_reduced.reshape(-1, 1) - - data = dict(X=X_select, Y=Y_select) - np.savez("skcosmo/datasets/data/csd-1000r.npz", **data) +.. literalinclude:: /../../skcosmo/datasets/make_csd_1000r.py diff --git a/skcosmo/datasets/make_csd_1000r.py b/skcosmo/datasets/make_csd_1000r.py new file mode 100644 index 000000000..520974b8d --- /dev/null +++ b/skcosmo/datasets/make_csd_1000r.py @@ -0,0 +1,80 @@ +import numpy as np +from ase.io import read +from rascal.representations import SphericalInvariants as SOAP + +from skcosmo.feature_selection import CUR +from skcosmo.preprocessing import StandardFlexibleScaler +from skcosmo.sample_selection import FPS + +# read all of the frames and book-keep the centers and species +filename = "/path/to/CSD-1000R.xyz" +frames = np.asarray( + read(filename, ":"), + dtype=object, +) + +n_centers = np.array([len(frame) for frame in frames]) +center_idx = np.array([i for i, f in enumerate(frames) for p in f]) +n_env_accum = np.zeros(len(frames) + 1, dtype=int) +n_env_accum[1:] = np.cumsum(n_centers) + +numbers = np.concatenate([frame.numbers for frame in frames]) + +# compute radial soap vectors as first pass +hypers = dict( + soap_type="PowerSpectrum", + interaction_cutoff=2.5, + max_radial=6, + max_angular=0, + gaussian_sigma_type="Constant", + gaussian_sigma_constant=0.4, + cutoff_smooth_width=0.5, + normalize=False, + global_species=[1, 6, 7, 8], + expansion_by_species_method="user defined", +) +soap = SOAP(**hypers) + +X_raw = StandardFlexibleScaler(column_wise=False).fit_transform( + soap.transform(frames).get_features(soap) +) + +# rank the environments in terms of diversity +n_samples = 500 +i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_ + +# book-keep which frames these samples belong in +f_selected = center_idx[i_selected] +reduced_f_selected = list(sorted(set(f_selected))) +frames_selected = frames[f_selected].copy() +ci_selected = i_selected - n_env_accum[f_selected] + +properties_select = [ + frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected) +] + +# mask other environments in the frames so that SOAP vectors +# will not be computed for other environments on next pass +for frame, ci, fi in zip(frames_selected, ci_selected, f_selected): + frame.arrays["center_atoms_mask"] = np.zeros(len(frame), dtype=bool) + frame.arrays["center_atoms_mask"][ci] = True + +# compute a larger power spectrum for these frames +hypers["max_angular"] = 6 +soap_select = SOAP(**hypers) +X_sample_select = StandardFlexibleScaler(column_wise=False).fit_transform( + soap_select.transform(frames_selected).get_features(soap_select) +) +X_sample_select.shape + +# select 100 / 2520 soap features +n_select = 100 +X_select = CUR(n_to_select=n_select).fit_transform(X_sample_select) +Y_select = np.array(properties_select).reshape(-1, 1) + +data = dict( + X=X_select, + Y=Y_select, + original_mapping=[(fi, ci) for fi, ci in zip(f_selected, ci_selected)], +) +np.savez("./skcosmo/datasets/data/csd-1000r.npz", **data, size=(n_samples, n_select)) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ed2ddad65..188da164e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -29,11 +29,8 @@ def setUpClass(cls): cls.csd = load_csd_1000r() def test_load_csd_1000r_shape(self): - # test if representations have correct shape - self.assertTrue(self.csd.data.X.shape == (100, 100)) - - def test_load_csd_1000r_prop_shape(self): - self.assertTrue(self.csd.data.y.shape == (100, 1)) + # test if representations and properties have commensurate shape + self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0]) def test_load_csd_1000r_access_descr(self): self.csd.DESCR diff --git a/tests/test_sample_simple_cur.py b/tests/test_sample_simple_cur.py index ad4593a64..a79a71aa8 100644 --- a/tests/test_sample_simple_cur.py +++ b/tests/test_sample_simple_cur.py @@ -3,14 +3,15 @@ import numpy as np from sklearn import exceptions -from skcosmo.datasets import load_csd_1000r as load +from sklearn.datasets import fetch_california_housing as load from skcosmo.sample_selection import CUR class TestCUR(unittest.TestCase): def setUp(self): - self.X, self.y = load(return_X_y=True) - self.n_select = min(self.X.shape) // 2 + self.X, _ = load(return_X_y=True) + self.X = self.X[:1000] + self.n_select = min(20, min(self.X.shape) // 2) def test_bad_transform(self): selector = CUR(n_to_select=2)