Skip to content

Commit

Permalink
Updating csd-1000r to contain a sufficient number of sample, use the … (
Browse files Browse the repository at this point in the history
#123)

* Removed boston dataset from tests and examples`

* Updating csd-1000r to contain a sufficient number of sample, use the new syntax, and contain up-to-date docs

Replacing csd in sample selection with california because the larger dataset means a few redundant samples, which is not good for stable testing

Using literal include

Update skcosmo/datasets/descr/csd-1000r.rst

Co-authored-by: Guillaume Fraux <luthaf@luthaf.fr>

Co-authored-by: Guillaume Fraux <luthaf@luthaf.fr>
  • Loading branch information
rosecers and Luthaf authored May 20, 2022
1 parent 18bccdf commit f433e28
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 99 deletions.
Binary file modified skcosmo/datasets/data/csd-1000r.npz
Binary file not shown.
94 changes: 3 additions & 91 deletions skcosmo/datasets/descr/csd-1000r.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,98 +46,10 @@ Data Set Characteristics
References
----------

.. [C1] https://github.com/lab-cosmo/librascal commit e78f1c69
.. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 64e789f
.. [C1] https://github.com/lab-cosmo/librascal commit ade202a6
.. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 4ed1d92
Reference Code
--------------

.. code-block:: python
import numpy as np
from ase.io import read
from skcosmo.feature_selection import SimpleFPS
from rascal.representations import SphericalInvariants as SOAP
# read all of the frames and book-keep the centers and species
filename = "/path/to/CSD-1000r.xyz"
frames = np.asarray(
read(filename, ":"),
dtype=object,
)
n_centers = [len(frame) for frame in frames]
n_env_accum = [sum(n_centers[: i + 1]) for i in range(len(n_centers))]
n_env = sum(n_centers)
numbers = np.concatenate([frame.numbers for frame in frames])
number_loc = np.array([np.where(numbers == i)[0] for i in [1, 6, 7, 8]], dtype=object)
# compute radial soap vectors as first pass
hypers = dict(
soap_type="PowerSpectrum",
interaction_cutoff=3.5,
max_radial=6,
max_angular=0,
gaussian_sigma_type="Constant",
gaussian_sigma_constant=0.4,
cutoff_smooth_width=0.5,
)
soap = SOAP(**hypers)
X_raw = soap.transform(frames).get_features(soap)
# select 100 diverse samples
i_selected = SimpleFPS(n_features_to_select=100).fit(X_raw.T).selected_idx_
# book-keep which frames these samples belong in
frames_select = [np.where(n_env_accum > i)[0][0] for i in i_selected]
reduced_frames_select = list(sorted(set(frames_select)))
properties_select = [
frame.arrays["CS_local"] for frame in frames[reduced_frames_select]
]
n_centers_select = [len(frame) for frame in frames[reduced_frames_select]]
n_env_accum_select = [
sum(n_centers_select[: i + 1]) for i in range(len(n_centers_select))
]
n_env_select = sum(n_centers_select)
# compute a larger power spectrum for these frames
hypers["max_angular"] = 6
soap_select = SOAP(**hypers)
X_raw_select = soap_select.transform(frames[reduced_frames_select]).get_features(
soap_select
)
# pull the soap vectors only pertaining to the selected environments
i_select_reduced = []
properties_select_reduced = np.zeros(len(i_selected), dtype=float)
for i in range(len(i_selected)):
my_orig_frame = frames_select[i]
my_frame = reduced_frames_select.index(my_orig_frame)
if my_orig_frame != 0:
orig_loc = i_selected[i] - n_env_accum[my_orig_frame - 1]
new_loc = orig_loc + n_env_accum_select[my_frame - 1]
else:
orig_loc = i_selected[i]
new_loc = i_selected[i]
i_select_reduced.append(new_loc)
properties_select_reduced[i] = frames[my_orig_frame].arrays["CS_local"][orig_loc]
X_sample_select = X_raw_select[i_select_reduced]
# select 100 / 2520 soap features
n_select = 100
X_select = SimpleFPS(n_features_to_select=n_select).fit_transform(X_sample_select)
Y_select = properties_select_reduced.reshape(-1, 1)
data = dict(X=X_select, Y=Y_select)
np.savez("skcosmo/datasets/data/csd-1000r.npz", **data)
.. literalinclude:: /../../skcosmo/datasets/make_csd_1000r.py
80 changes: 80 additions & 0 deletions skcosmo/datasets/make_csd_1000r.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np
from ase.io import read
from rascal.representations import SphericalInvariants as SOAP

from skcosmo.feature_selection import CUR
from skcosmo.preprocessing import StandardFlexibleScaler
from skcosmo.sample_selection import FPS

# read all of the frames and book-keep the centers and species
filename = "/path/to/CSD-1000R.xyz"
frames = np.asarray(
read(filename, ":"),
dtype=object,
)

n_centers = np.array([len(frame) for frame in frames])
center_idx = np.array([i for i, f in enumerate(frames) for p in f])
n_env_accum = np.zeros(len(frames) + 1, dtype=int)
n_env_accum[1:] = np.cumsum(n_centers)

numbers = np.concatenate([frame.numbers for frame in frames])

# compute radial soap vectors as first pass
hypers = dict(
soap_type="PowerSpectrum",
interaction_cutoff=2.5,
max_radial=6,
max_angular=0,
gaussian_sigma_type="Constant",
gaussian_sigma_constant=0.4,
cutoff_smooth_width=0.5,
normalize=False,
global_species=[1, 6, 7, 8],
expansion_by_species_method="user defined",
)
soap = SOAP(**hypers)

X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(
soap.transform(frames).get_features(soap)
)

# rank the environments in terms of diversity
n_samples = 500
i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_

# book-keep which frames these samples belong in
f_selected = center_idx[i_selected]
reduced_f_selected = list(sorted(set(f_selected)))
frames_selected = frames[f_selected].copy()
ci_selected = i_selected - n_env_accum[f_selected]

properties_select = [
frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected)
]

# mask other environments in the frames so that SOAP vectors
# will not be computed for other environments on next pass
for frame, ci, fi in zip(frames_selected, ci_selected, f_selected):
frame.arrays["center_atoms_mask"] = np.zeros(len(frame), dtype=bool)
frame.arrays["center_atoms_mask"][ci] = True

# compute a larger power spectrum for these frames
hypers["max_angular"] = 6
soap_select = SOAP(**hypers)
X_sample_select = StandardFlexibleScaler(column_wise=False).fit_transform(
soap_select.transform(frames_selected).get_features(soap_select)
)
X_sample_select.shape

# select 100 / 2520 soap features
n_select = 100
X_select = CUR(n_to_select=n_select).fit_transform(X_sample_select)
Y_select = np.array(properties_select).reshape(-1, 1)

data = dict(
X=X_select,
Y=Y_select,
original_mapping=[(fi, ci) for fi, ci in zip(f_selected, ci_selected)],
)
np.savez("./skcosmo/datasets/data/csd-1000r.npz", **data, size=(n_samples, n_select))
7 changes: 2 additions & 5 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,8 @@ def setUpClass(cls):
cls.csd = load_csd_1000r()

def test_load_csd_1000r_shape(self):
# test if representations have correct shape
self.assertTrue(self.csd.data.X.shape == (100, 100))

def test_load_csd_1000r_prop_shape(self):
self.assertTrue(self.csd.data.y.shape == (100, 1))
# test if representations and properties have commensurate shape
self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0])

def test_load_csd_1000r_access_descr(self):
self.csd.DESCR
Expand Down
7 changes: 4 additions & 3 deletions tests/test_sample_simple_cur.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import numpy as np
from sklearn import exceptions

from skcosmo.datasets import load_csd_1000r as load
from sklearn.datasets import fetch_california_housing as load
from skcosmo.sample_selection import CUR


class TestCUR(unittest.TestCase):
def setUp(self):
self.X, self.y = load(return_X_y=True)
self.n_select = min(self.X.shape) // 2
self.X, _ = load(return_X_y=True)
self.X = self.X[:1000]
self.n_select = min(20, min(self.X.shape) // 2)

def test_bad_transform(self):
selector = CUR(n_to_select=2)
Expand Down

0 comments on commit f433e28

Please sign in to comment.