Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating csd-1000r to contain a sufficient number of sample, use the … #123

Merged
merged 2 commits into from
May 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified skcosmo/datasets/data/csd-1000r.npz
Binary file not shown.
94 changes: 3 additions & 91 deletions skcosmo/datasets/descr/csd-1000r.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,98 +46,10 @@ Data Set Characteristics
References
----------

.. [C1] https://github.com/lab-cosmo/librascal commit e78f1c69
.. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 64e789f
.. [C1] https://github.com/lab-cosmo/librascal commit ade202a6
.. [C2] https://github.com/lab-cosmo/scikit-cosmo commit 4ed1d92

Reference Code
--------------

.. code-block:: python

import numpy as np
from ase.io import read

from skcosmo.feature_selection import SimpleFPS
from rascal.representations import SphericalInvariants as SOAP


# read all of the frames and book-keep the centers and species
filename = "/path/to/CSD-1000r.xyz"
frames = np.asarray(
read(filename, ":"),
dtype=object,
)

n_centers = [len(frame) for frame in frames]
n_env_accum = [sum(n_centers[: i + 1]) for i in range(len(n_centers))]
n_env = sum(n_centers)

numbers = np.concatenate([frame.numbers for frame in frames])
number_loc = np.array([np.where(numbers == i)[0] for i in [1, 6, 7, 8]], dtype=object)


# compute radial soap vectors as first pass
hypers = dict(
soap_type="PowerSpectrum",
interaction_cutoff=3.5,
max_radial=6,
max_angular=0,
gaussian_sigma_type="Constant",
gaussian_sigma_constant=0.4,
cutoff_smooth_width=0.5,
)
soap = SOAP(**hypers)
X_raw = soap.transform(frames).get_features(soap)


# select 100 diverse samples
i_selected = SimpleFPS(n_features_to_select=100).fit(X_raw.T).selected_idx_

# book-keep which frames these samples belong in
frames_select = [np.where(n_env_accum > i)[0][0] for i in i_selected]
reduced_frames_select = list(sorted(set(frames_select)))

properties_select = [
frame.arrays["CS_local"] for frame in frames[reduced_frames_select]
]

n_centers_select = [len(frame) for frame in frames[reduced_frames_select]]
n_env_accum_select = [
sum(n_centers_select[: i + 1]) for i in range(len(n_centers_select))
]
n_env_select = sum(n_centers_select)


# compute a larger power spectrum for these frames
hypers["max_angular"] = 6
soap_select = SOAP(**hypers)
X_raw_select = soap_select.transform(frames[reduced_frames_select]).get_features(
soap_select
)


# pull the soap vectors only pertaining to the selected environments
i_select_reduced = []
properties_select_reduced = np.zeros(len(i_selected), dtype=float)
for i in range(len(i_selected)):
my_orig_frame = frames_select[i]
my_frame = reduced_frames_select.index(my_orig_frame)
if my_orig_frame != 0:
orig_loc = i_selected[i] - n_env_accum[my_orig_frame - 1]
new_loc = orig_loc + n_env_accum_select[my_frame - 1]
else:
orig_loc = i_selected[i]
new_loc = i_selected[i]
i_select_reduced.append(new_loc)
properties_select_reduced[i] = frames[my_orig_frame].arrays["CS_local"][orig_loc]

X_sample_select = X_raw_select[i_select_reduced]


# select 100 / 2520 soap features
n_select = 100
X_select = SimpleFPS(n_features_to_select=n_select).fit_transform(X_sample_select)
Y_select = properties_select_reduced.reshape(-1, 1)

data = dict(X=X_select, Y=Y_select)
np.savez("skcosmo/datasets/data/csd-1000r.npz", **data)
.. literalinclude:: /../../skcosmo/datasets/make_csd_1000r.py
80 changes: 80 additions & 0 deletions skcosmo/datasets/make_csd_1000r.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np
from ase.io import read
from rascal.representations import SphericalInvariants as SOAP

from skcosmo.feature_selection import CUR
from skcosmo.preprocessing import StandardFlexibleScaler
from skcosmo.sample_selection import FPS

# read all of the frames and book-keep the centers and species
filename = "/path/to/CSD-1000R.xyz"
frames = np.asarray(
read(filename, ":"),
dtype=object,
)

n_centers = np.array([len(frame) for frame in frames])
center_idx = np.array([i for i, f in enumerate(frames) for p in f])
n_env_accum = np.zeros(len(frames) + 1, dtype=int)
n_env_accum[1:] = np.cumsum(n_centers)

numbers = np.concatenate([frame.numbers for frame in frames])

# compute radial soap vectors as first pass
hypers = dict(
soap_type="PowerSpectrum",
interaction_cutoff=2.5,
max_radial=6,
max_angular=0,
gaussian_sigma_type="Constant",
gaussian_sigma_constant=0.4,
cutoff_smooth_width=0.5,
normalize=False,
global_species=[1, 6, 7, 8],
expansion_by_species_method="user defined",
)
soap = SOAP(**hypers)

X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(
soap.transform(frames).get_features(soap)
)

# rank the environments in terms of diversity
n_samples = 500
i_selected = FPS(n_to_select=n_samples, initialize=0).fit(X_raw).selected_idx_

# book-keep which frames these samples belong in
f_selected = center_idx[i_selected]
reduced_f_selected = list(sorted(set(f_selected)))
frames_selected = frames[f_selected].copy()
ci_selected = i_selected - n_env_accum[f_selected]

properties_select = [
frames[fi].arrays["CS_local"][ci] for fi, ci in zip(f_selected, ci_selected)
]

# mask other environments in the frames so that SOAP vectors
# will not be computed for other environments on next pass
for frame, ci, fi in zip(frames_selected, ci_selected, f_selected):
frame.arrays["center_atoms_mask"] = np.zeros(len(frame), dtype=bool)
frame.arrays["center_atoms_mask"][ci] = True

# compute a larger power spectrum for these frames
hypers["max_angular"] = 6
soap_select = SOAP(**hypers)
X_sample_select = StandardFlexibleScaler(column_wise=False).fit_transform(
soap_select.transform(frames_selected).get_features(soap_select)
)
X_sample_select.shape

# select 100 / 2520 soap features
n_select = 100
X_select = CUR(n_to_select=n_select).fit_transform(X_sample_select)
Y_select = np.array(properties_select).reshape(-1, 1)

data = dict(
X=X_select,
Y=Y_select,
original_mapping=[(fi, ci) for fi, ci in zip(f_selected, ci_selected)],
)
np.savez("./skcosmo/datasets/data/csd-1000r.npz", **data, size=(n_samples, n_select))
7 changes: 2 additions & 5 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,8 @@ def setUpClass(cls):
cls.csd = load_csd_1000r()

def test_load_csd_1000r_shape(self):
# test if representations have correct shape
self.assertTrue(self.csd.data.X.shape == (100, 100))

def test_load_csd_1000r_prop_shape(self):
self.assertTrue(self.csd.data.y.shape == (100, 1))
# test if representations and properties have commensurate shape
self.assertTrue(self.csd.data.X.shape[0] == self.csd.data.y.shape[0])

def test_load_csd_1000r_access_descr(self):
self.csd.DESCR
Expand Down
7 changes: 4 additions & 3 deletions tests/test_sample_simple_cur.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
import numpy as np
from sklearn import exceptions

from skcosmo.datasets import load_csd_1000r as load
from sklearn.datasets import fetch_california_housing as load
from skcosmo.sample_selection import CUR


class TestCUR(unittest.TestCase):
def setUp(self):
self.X, self.y = load(return_X_y=True)
self.n_select = min(self.X.shape) // 2
self.X, _ = load(return_X_y=True)
self.X = self.X[:1000]
self.n_select = min(20, min(self.X.shape) // 2)

def test_bad_transform(self):
selector = CUR(n_to_select=2)
Expand Down