From 24fb83e293b9a81e8d724df5aab08dda072afb3a Mon Sep 17 00:00:00 2001 From: Steven T Date: Fri, 25 Oct 2019 14:38:53 -0400 Subject: [PATCH 1/5] Re-factor std_in_bound_per_species --- flare/util.py | 91 +++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/flare/util.py b/flare/util.py index d1cad5f42..4ebcc187c 100644 --- a/flare/util.py +++ b/flare/util.py @@ -212,14 +212,31 @@ def is_std_in_bound(std_tolerance, noise, structure, max_atoms_added): else: return True, [-1] -def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, structure, max_atoms_added, std_per_species): +def is_std_in_bound_per_species(rel_std_tolerance: float, + abs_std_tolerance: float, noise: float, + structure, max_atoms_added: int = + np.inf, max_by_species: dict = {}): """ - If the predicted variance is too high, returns a list of atoms - with the highest uncertainty - :param frame: Structure + Checks the stds of GP prediction assigned to the structure, returns a + list of atoms which either meet an absolute threshold or a relative + threshold defined by rel_std_tolerance * noise. Can limit the + total number of target atoms via max_atoms_added, and limit per species + by max_by_species. + + The max_atoms_added argument will 'overrule' the + max by species; e.g. if max_atoms_added is 2 and max_by_species is {"H":3}, + then at most two atoms total will be added. + + :param rel_std_tolerance: + :param abs_std_tolerance: + :param noise: + :param structure: + :param max_atoms_added: + :param max_by_species: :return: """ + # This indicates test mode, as the GP is not being modified in any way if rel_std_tolerance == 0 and abs_std_tolerance == 0: return True, [-1] @@ -233,43 +250,33 @@ def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, str threshold = min(rel_std_tolerance * np.abs(noise), abs_std_tolerance) - if (std_per_species is False): - return is_std_in_bound(-threshold, noise, - structure, max_atoms_added) - - # sort max stds - max_stds = np.zeros(structure.nat) - for atom_idx, std in enumerate(structure.stds): - max_stds[atom_idx] = np.max(std) - - std_sorted = {} - id_sorted = {} - species_set = set(structure.coded_species) - for species_i in species_set: - std_sorted[species_i] = [] - id_sorted[species_i] = [] - for i, spec in enumerate(structure.coded_species): - if max_stds[i] > threshold: - std_sorted[spec] += [max_stds[i]] - id_sorted[spec] += [i] + # Determine if any std component will trigger the threshold + max_std_components = [np.max(std) for std in structure.stds] + if max(max_std_components) < threshold: + return True, [-1] target_atoms = [] - ntarget = 0 - for species_i in species_set: - natom_i = len(std_sorted[species_i]) - if (natom_i>0): - std_sorted[species_i] = np.argsort(np.array(std_sorted[species_i])) - if (max_atoms_added == np.inf or \ - max_atoms_added > natom_i): - target_atoms += id_sorted[species_i] - ntarget += natom_i - else: - for i in range(max_atoms_added): - tempid = std_sorted[species_i][i] - target_atoms += [id_sorted[species_i][tempid]] - ntarget += max_atoms_added - if (ntarget > 0): - target_atoms = list(np.hstack(target_atoms)) - return False, target_atoms - else: - return True, [-1] + + # Sort from greatest to smallest max. std component + std_arg_sorted = np.flip(np.argsort(max_std_components)) + + present_species = {spec: 0 for spec in set(structure.species_labels)} + + # Only add atoms up to the bound + for i in std_arg_sorted: + + # If max atoms added reached or stds are now below threshold, done + if len(target_atoms) == max_atoms_added or \ + max_std_components[i] < threshold: + break + + cur_spec = structure.species_labels[i] + + # Only add up to species allowance, if it exists + if present_species[cur_spec] < \ + max_by_species.get(cur_spec, np.inf): + + target_atoms.append(i) + present_species[cur_spec] += 1 + + return False, target_atoms From 130cbab7713e0d5285981bdedbb59ad5a58a2c6c Mon Sep 17 00:00:00 2001 From: Steven T Date: Fri, 25 Oct 2019 15:11:06 -0400 Subject: [PATCH 2/5] Refactored std_in_bound_per_species with 100% coverage --- flare/util.py | 4 +-- tests/test_util.py | 83 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/flare/util.py b/flare/util.py index 4ebcc187c..710b3e8bb 100644 --- a/flare/util.py +++ b/flare/util.py @@ -242,9 +242,9 @@ def is_std_in_bound_per_species(rel_std_tolerance: float, return True, [-1] # set uncertainty threshold - if rel_std_tolerance is None: + if rel_std_tolerance is None or rel_std_tolerance == 0 : threshold = abs_std_tolerance - elif abs_std_tolerance is None: + elif abs_std_tolerance is None or abs_std_tolerance == 0: threshold = rel_std_tolerance * np.abs(noise) else: threshold = min(rel_std_tolerance * np.abs(noise), diff --git a/tests/test_util.py b/tests/test_util.py index a1f8e3126..799be21a9 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,9 @@ -from flare.util import element_to_Z, Z_to_element +from flare.util import element_to_Z, Z_to_element, is_std_in_bound_per_species import pytest import numpy as np +from tests.test_gp import get_random_structure + from pytest import raises @@ -29,4 +31,81 @@ def test_Z_to_element(): assert Z_to_element(pair[0]) == pair[1] with raises(ValueError): - Z_to_element('a') \ No newline at end of file + Z_to_element('a') + + +def test_std_in_bound_per_species(): + + test_structure, _ = get_random_structure(np.eye(3),['H','O'],3) + test_structure.species_labels = ['H', 'H', 'O'] + test_structure.stds = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]]) + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=0, + noise=0, structure = test_structure) + assert result is True and target_atoms == [-1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=4, + noise=0, structure = test_structure) + assert result is True and target_atoms == [-1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=2.9, + noise=0, structure = test_structure) + assert result is False and target_atoms == [2] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0, + noise=4, structure = test_structure) + assert result is True and target_atoms == [-1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0, + noise=2.9, structure = test_structure) + assert result is False and target_atoms == [2] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure) + assert result is False and target_atoms == [2, 1, 0] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=2) + assert result is False and target_atoms == [2, 1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=1) + assert result is False and target_atoms == [2] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=1,max_by_species={'H':1, + 'O':0}) + assert result is False and target_atoms == [1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=1,max_by_species={'H':0, + 'O':1}) + assert result is False and target_atoms == [2] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=1,max_by_species={'H':2, + 'O':0}) + assert result is False and target_atoms == [1] + + result, target_atoms = \ + is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, + noise=2.9, structure = test_structure, + max_atoms_added=3,max_by_species={'H':2, + 'O':0}) + assert result is False and target_atoms == [1,0] \ No newline at end of file From ad8d1c10ad993398ac2e428e106fb34ccdec554e Mon Sep 17 00:00:00 2001 From: Steven T Date: Fri, 25 Oct 2019 15:28:17 -0400 Subject: [PATCH 3/5] Update unit tests for GPFA --- tests/test_gp_from_aimd.py | 73 -------------------------------------- tests/test_util.py | 16 ++++----- 2 files changed, 8 insertions(+), 81 deletions(-) diff --git a/tests/test_gp_from_aimd.py b/tests/test_gp_from_aimd.py index d4c9f0530..d0d052074 100644 --- a/tests/test_gp_from_aimd.py +++ b/tests/test_gp_from_aimd.py @@ -148,76 +148,3 @@ def test_seed_and_run(): os.system('rm ./gp_from_aimd.xyz') os.system('rm ./gp_from_aimd-f.xyz') os.system('rm ./meth_test.pickle') - -# TO DO: this need to be rewrite for the per_species -def test_uncertainty_threshold(fake_gp): - - noise=0.01 - tt = TrajectoryTrainer([], fake_gp, rel_std_tolerance=.5, - abs_std_tolerance=.01) - - fake_structure = Structure(cell=np.eye(3), species=["H"], - positions=np.array([[0, 0, 0]])) - - # Test a structure with no variance passes - fake_structure.stds = np.array([[0, 0, 0]]) - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is True - assert res2 == [-1] - - # Test that the absolute criteria trips the threshold - fake_structure.stds = np.array([[.02, 0, 0]]) - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is False - assert res2 == [0] - - tt.abs_std_tolerance = 100 - - # Test that the relative criteria trips the threshold - fake_structure.stds = np.array([[.6, 0, 0]]) - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is False - assert res2 == [0] - - # Test that 'test mode' works, where no GP modification occurs - tt.abs_std_tolerance = 0 - tt.rel_std_tolerance = 0 - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is True - assert res2 == [-1] - - # Test permutations of one / another being off - tt.abs_std_tolerance = 1 - tt.rel_std_tolerance = 0 - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is True - assert res2 == [-1] - - tt.abs_std_tolerance = 0 - tt.rel_std_tolerance = 1 - - res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance, - tt.abs_std_tolerance, noise, fake_structure, 2, - std_per_species = False) - assert res1 is True - assert res2 == [-1] - - os.system('rm gp_from_aimd.xyz*') - os.system('rm gp_from_aimd.out*') - os.system('rm gp_from_aimd-f.xyz*') - os.system('rm gp_from_aimd-hyps.dat*') diff --git a/tests/test_util.py b/tests/test_util.py index 799be21a9..7d635c875 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -39,49 +39,49 @@ def test_std_in_bound_per_species(): test_structure, _ = get_random_structure(np.eye(3),['H','O'],3) test_structure.species_labels = ['H', 'H', 'O'] test_structure.stds = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]]) - + # Test that 'test mode' works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=0, noise=0, structure = test_structure) assert result is True and target_atoms == [-1] - + # Test that high abs tolerance works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=4, noise=0, structure = test_structure) assert result is True and target_atoms == [-1] - + # Test that low abs tolerance works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=2.9, noise=0, structure = test_structure) assert result is False and target_atoms == [2] - + # Test that high rel tolerance works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0, noise=4, structure = test_structure) assert result is True and target_atoms == [-1] - + # Test that low rel tolerance works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0, noise=2.9, structure = test_structure) assert result is False and target_atoms == [2] - + # Test that both work result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, noise=2.9, structure = test_structure) assert result is False and target_atoms == [2, 1, 0] - + # Test that the max atoms added works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, noise=2.9, structure = test_structure, max_atoms_added=2) assert result is False and target_atoms == [2, 1] - result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, noise=2.9, structure = test_structure, max_atoms_added=1) assert result is False and target_atoms == [2] + # Test that max by species works result, target_atoms = \ is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1, noise=2.9, structure = test_structure, From 6ccaf6ba790a1792d4952d37f4387e0e1f879ed6 Mon Sep 17 00:00:00 2001 From: Steven T Date: Fri, 25 Oct 2019 15:43:21 -0400 Subject: [PATCH 4/5] Update GPFA to include fixes, add arguments --- flare/gp_from_aimd.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py index 396847a37..00e8e183f 100644 --- a/flare/gp_from_aimd.py +++ b/flare/gp_from_aimd.py @@ -32,15 +32,16 @@ def __init__(self, frames: List[Structure], validate_ratio: float = 0.1, calculate_energy: bool = False, output_name: str = 'gp_from_aimd', + pre_train_max_iter: int = 50, max_atoms_from_frame: int = np.inf, max_trains: int = np.inf, min_atoms_added: int = 1, shuffle_frames: bool = False, - std_per_species: bool = False, verbose: int = 0, model_write: str = '', pre_train_on_skips: int = -1, pre_train_seed_frames: List[Structure] = None, pre_train_seed_envs: List[Tuple[AtomicEnvironment, np.array]] = None, pre_train_atoms_per_element: dict = None, + train_atoms_per_element: dict = None, checkpoint_interval: int = None, model_format: str = 'json'): """ @@ -69,6 +70,8 @@ def __init__(self, frames: List[Structure], :param pre_train_seed_envs: Environments to train on before running :param pre_train_atoms_per_element: Max # of environments to add from each species in the seed pre-training steps + :param train_atoms_per_element: Max # of environments to add from + each species in the training steps :param checkpoint_interval: How often to write model after trainings :param model_format: Format to write GP model to """ @@ -86,7 +89,6 @@ def __init__(self, frames: List[Structure], self.curr_step = 0 self.max_atoms_from_frame = max_atoms_from_frame self.min_atoms_added = min_atoms_added - self.std_per_species = std_per_species self.verbose = verbose self.train_count = 0 @@ -110,13 +112,16 @@ def __init__(self, frames: List[Structure], # To later be filled in using the time library self.start_time = None + self.pre_train_max_iter = pre_train_max_iter self.pre_train_on_skips = pre_train_on_skips self.seed_envs = [] if pre_train_seed_envs is None else \ pre_train_seed_envs self.seed_frames = [] if pre_train_seed_frames is None \ else pre_train_seed_frames self.pre_train_env_per_species = {} if pre_train_atoms_per_element \ - is None else pre_train_atoms_per_element + is None else pre_train_atoms_per_element + self.train_env_per_species = {} if train_atoms_per_element \ + is None else train_atoms_per_element # Convert to Coded Species if self.pre_train_env_per_species: @@ -202,7 +207,7 @@ def pre_run(self): if self.verbose >= 3: print("Now commencing pre-run training of GP (which has " "non-empty training set)") - self.train_gp() + self.train_gp(max_iter = self.pre_train_max_iter) def run(self): """ @@ -243,9 +248,11 @@ def run(self): if i < train_frame: # Get max uncertainty atoms std_in_bound, train_atoms = is_std_in_bound_per_species( - self.rel_std_tolerance, self.abs_std_tolerance, - self.gp.hyps[-1], cur_frame, - self.max_atoms_from_frame, self.std_per_species) + rel_std_tolerance=self.rel_std_tolerance, + abs_std_tolerance=self.abs_std_tolerance, + noise=self.gp.hyps[-1], structure=cur_frame, + max_atoms_added=self.max_atoms_from_frame, + max_by_species=self.train_env_per_species) if not std_in_bound: @@ -298,14 +305,25 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int], if train: self.train_gp() - def train_gp(self): + def train_gp(self, max_iter: int = None): """ Train the Gaussian process and write the results to the output file. """ if self.verbose >= 1: self.output.write_to_log('Train GP\n') - self.gp.train(output=self.output if self.verbose >= 2 else None) + # TODO: Improve flexibility in GP training to make this next step + # unnecessary, so maxiter can be passed as an argument + + if max_iter is not None: + temp_maxiter = self.gp.maxiter + self.gp.maxiter = max_iter + self.gp.train(output=self.output if self.verbose >= 2 else None) + self.gp.maxiter = temp_maxiter + + else: + self.gp.train(output=self.output if self.verbose >= 2 else None) + self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps, self.start_time, self.gp.likelihood, self.gp.likelihood_gradient) From c010e0a3cce30c7f6dbf87b8f1525a6906ef31da Mon Sep 17 00:00:00 2001 From: Steven T Date: Fri, 25 Oct 2019 15:57:28 -0400 Subject: [PATCH 5/5] Added some comments --- flare/gp_from_aimd.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py index 00e8e183f..db3b198ab 100644 --- a/flare/gp_from_aimd.py +++ b/flare/gp_from_aimd.py @@ -54,6 +54,7 @@ def __init__(self, frames: List[Structure], noise variance hyperparameter :param abs_std_tolerance: Train if uncertainty is above this :param parallel: Use parallel functions or not + :param validate_ratio: Fraction of frames used for validation :param no_cpus: number of cpus to run with multithreading :param skip: Skip through frames :param calculate_energy: Use local energy kernel or not @@ -168,6 +169,7 @@ def pre_run(self): # so all atomic species are represented in the first step. # Otherwise use the seed frames passed in by user. + # Remove frames used as seed from later part of training if self.pre_train_on_skips > 0: self.seed_frames = [] newframes = [] @@ -177,6 +179,7 @@ def pre_run(self): else: newframes += [self.frames[i]] self.frames = newframes + elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0: self.seed_frames = [self.frames[0]] self.frames = self.frames[1:]