From 24fb83e293b9a81e8d724df5aab08dda072afb3a Mon Sep 17 00:00:00 2001
From: Steven T <torrisi@g.harvard.edu>
Date: Fri, 25 Oct 2019 14:38:53 -0400
Subject: [PATCH 1/5] Re-factor std_in_bound_per_species

---
 flare/util.py | 91 +++++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/flare/util.py b/flare/util.py
index d1cad5f42..4ebcc187c 100644
--- a/flare/util.py
+++ b/flare/util.py
@@ -212,14 +212,31 @@ def is_std_in_bound(std_tolerance, noise, structure, max_atoms_added):
     else:
         return True, [-1]
 
-def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, structure, max_atoms_added, std_per_species):
+def is_std_in_bound_per_species(rel_std_tolerance: float,
+                                abs_std_tolerance: float, noise: float,
+                                structure, max_atoms_added: int =
+                                np.inf, max_by_species: dict = {}):
     """
-    If the predicted variance is too high, returns a list of atoms
-    with the highest uncertainty
-    :param frame: Structure
+    Checks the stds of GP prediction assigned to the structure, returns a
+    list of atoms which either meet an absolute threshold or a relative
+    threshold defined by rel_std_tolerance * noise. Can limit the
+    total number of target atoms via max_atoms_added, and limit per species
+    by max_by_species.
+
+    The max_atoms_added argument will 'overrule' the
+    max by species; e.g. if max_atoms_added is 2 and max_by_species is {"H":3},
+    then at most two atoms total will be added.
+
+    :param rel_std_tolerance:
+    :param abs_std_tolerance:
+    :param noise:
+    :param structure:
+    :param max_atoms_added:
+    :param max_by_species:
     :return:
     """
 
+
     # This indicates test mode, as the GP is not being modified in any way
     if rel_std_tolerance == 0 and abs_std_tolerance == 0:
         return True, [-1]
@@ -233,43 +250,33 @@ def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, str
         threshold = min(rel_std_tolerance * np.abs(noise),
                         abs_std_tolerance)
 
-    if (std_per_species is False):
-        return is_std_in_bound(-threshold, noise,
-                structure, max_atoms_added)
-
-    # sort max stds
-    max_stds = np.zeros(structure.nat)
-    for atom_idx, std in enumerate(structure.stds):
-        max_stds[atom_idx] = np.max(std)
-
-    std_sorted = {}
-    id_sorted = {}
-    species_set = set(structure.coded_species)
-    for species_i in species_set:
-        std_sorted[species_i] = []
-        id_sorted[species_i] = []
-    for i, spec in enumerate(structure.coded_species):
-        if max_stds[i] > threshold:
-            std_sorted[spec] += [max_stds[i]]
-            id_sorted[spec] += [i]
+    # Determine if any std component will trigger the threshold
+    max_std_components = [np.max(std) for std in structure.stds]
+    if max(max_std_components) < threshold:
+        return True, [-1]
 
     target_atoms = []
-    ntarget = 0
-    for species_i in species_set:
-        natom_i = len(std_sorted[species_i])
-        if (natom_i>0):
-            std_sorted[species_i] = np.argsort(np.array(std_sorted[species_i]))
-            if (max_atoms_added == np.inf or \
-                    max_atoms_added > natom_i):
-                target_atoms += id_sorted[species_i]
-                ntarget += natom_i
-            else:
-                for i in range(max_atoms_added):
-                    tempid = std_sorted[species_i][i]
-                    target_atoms += [id_sorted[species_i][tempid]]
-                ntarget += max_atoms_added
-    if (ntarget > 0):
-        target_atoms = list(np.hstack(target_atoms))
-        return False, target_atoms
-    else:
-        return True, [-1]
+
+    # Sort from greatest to smallest max. std component
+    std_arg_sorted = np.flip(np.argsort(max_std_components))
+
+    present_species = {spec: 0 for spec in set(structure.species_labels)}
+
+    # Only add atoms up to the bound
+    for i in std_arg_sorted:
+
+        # If max atoms added reached or stds are now below threshold, done
+        if len(target_atoms) == max_atoms_added or \
+                max_std_components[i] < threshold:
+            break
+
+        cur_spec = structure.species_labels[i]
+
+        # Only add up to species allowance, if it exists
+        if present_species[cur_spec] < \
+            max_by_species.get(cur_spec, np.inf):
+
+            target_atoms.append(i)
+            present_species[cur_spec] += 1
+
+    return False, target_atoms

From 130cbab7713e0d5285981bdedbb59ad5a58a2c6c Mon Sep 17 00:00:00 2001
From: Steven T <torrisi@g.harvard.edu>
Date: Fri, 25 Oct 2019 15:11:06 -0400
Subject: [PATCH 2/5] Refactored std_in_bound_per_species with 100% coverage

---
 flare/util.py      |  4 +--
 tests/test_util.py | 83 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/flare/util.py b/flare/util.py
index 4ebcc187c..710b3e8bb 100644
--- a/flare/util.py
+++ b/flare/util.py
@@ -242,9 +242,9 @@ def is_std_in_bound_per_species(rel_std_tolerance: float,
         return True, [-1]
 
     # set uncertainty threshold
-    if rel_std_tolerance is None:
+    if rel_std_tolerance is None or rel_std_tolerance == 0 :
         threshold = abs_std_tolerance
-    elif abs_std_tolerance is None:
+    elif abs_std_tolerance is None or abs_std_tolerance == 0:
         threshold = rel_std_tolerance * np.abs(noise)
     else:
         threshold = min(rel_std_tolerance * np.abs(noise),
diff --git a/tests/test_util.py b/tests/test_util.py
index a1f8e3126..799be21a9 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,7 +1,9 @@
-from flare.util import element_to_Z, Z_to_element
+from flare.util import element_to_Z, Z_to_element, is_std_in_bound_per_species
 import pytest
 import numpy as np
 
+from tests.test_gp import get_random_structure
+
 from pytest import raises
 
 
@@ -29,4 +31,81 @@ def test_Z_to_element():
         assert Z_to_element(pair[0]) == pair[1]
 
     with raises(ValueError):
-        Z_to_element('a')
\ No newline at end of file
+        Z_to_element('a')
+
+
+def test_std_in_bound_per_species():
+
+    test_structure, _ = get_random_structure(np.eye(3),['H','O'],3)
+    test_structure.species_labels = ['H', 'H', 'O']
+    test_structure.stds = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]])
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=0,
+                             noise=0, structure = test_structure)
+    assert result is True and target_atoms == [-1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=4,
+                             noise=0, structure = test_structure)
+    assert result is True and target_atoms == [-1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=2.9,
+                             noise=0, structure = test_structure)
+    assert result is False and target_atoms == [2]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0,
+                             noise=4, structure = test_structure)
+    assert result is True and target_atoms == [-1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0,
+                             noise=2.9, structure = test_structure)
+    assert result is False and target_atoms == [2]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure)
+    assert result is False and target_atoms == [2, 1, 0]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=2)
+    assert result is False and target_atoms == [2, 1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=1)
+    assert result is False and target_atoms == [2]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=1,max_by_species={'H':1,
+                                                                      'O':0})
+    assert result is False and target_atoms == [1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=1,max_by_species={'H':0,
+                                                                      'O':1})
+    assert result is False and target_atoms == [2]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=1,max_by_species={'H':2,
+                                                                      'O':0})
+    assert result is False and target_atoms == [1]
+
+    result, target_atoms = \
+        is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
+                             noise=2.9, structure = test_structure,
+                                    max_atoms_added=3,max_by_species={'H':2,
+                                                                      'O':0})
+    assert result is False and target_atoms == [1,0]
\ No newline at end of file

From ad8d1c10ad993398ac2e428e106fb34ccdec554e Mon Sep 17 00:00:00 2001
From: Steven T <torrisi@g.harvard.edu>
Date: Fri, 25 Oct 2019 15:28:17 -0400
Subject: [PATCH 3/5] Update unit tests for GPFA

---
 tests/test_gp_from_aimd.py | 73 --------------------------------------
 tests/test_util.py         | 16 ++++-----
 2 files changed, 8 insertions(+), 81 deletions(-)

diff --git a/tests/test_gp_from_aimd.py b/tests/test_gp_from_aimd.py
index d4c9f0530..d0d052074 100644
--- a/tests/test_gp_from_aimd.py
+++ b/tests/test_gp_from_aimd.py
@@ -148,76 +148,3 @@ def test_seed_and_run():
     os.system('rm ./gp_from_aimd.xyz')
     os.system('rm ./gp_from_aimd-f.xyz')
     os.system('rm ./meth_test.pickle')
-
-# TO DO: this need to be rewrite for the per_species
-def test_uncertainty_threshold(fake_gp):
-
-    noise=0.01
-    tt = TrajectoryTrainer([], fake_gp, rel_std_tolerance=.5,
-                           abs_std_tolerance=.01)
-
-    fake_structure = Structure(cell=np.eye(3), species=["H"],
-                               positions=np.array([[0, 0, 0]]))
-
-    # Test a structure with no variance passes
-    fake_structure.stds = np.array([[0, 0, 0]])
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is True
-    assert res2 == [-1]
-
-    # Test that the absolute criteria trips the threshold
-    fake_structure.stds = np.array([[.02, 0, 0]])
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is False
-    assert res2 == [0]
-
-    tt.abs_std_tolerance = 100
-
-    # Test that the relative criteria trips the threshold
-    fake_structure.stds = np.array([[.6, 0, 0]])
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is False
-    assert res2 == [0]
-
-    # Test that 'test mode' works, where no GP modification occurs
-    tt.abs_std_tolerance = 0
-    tt.rel_std_tolerance = 0
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is True
-    assert res2 == [-1]
-
-    # Test permutations of one / another being off
-    tt.abs_std_tolerance = 1
-    tt.rel_std_tolerance = 0
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is True
-    assert res2 == [-1]
-
-    tt.abs_std_tolerance = 0
-    tt.rel_std_tolerance = 1
-
-    res1, res2 = is_std_in_bound_per_species(tt.rel_std_tolerance,
-            tt.abs_std_tolerance, noise, fake_structure, 2,
-            std_per_species = False)
-    assert res1 is True
-    assert res2 == [-1]
-
-    os.system('rm gp_from_aimd.xyz*')
-    os.system('rm gp_from_aimd.out*')
-    os.system('rm gp_from_aimd-f.xyz*')
-    os.system('rm gp_from_aimd-hyps.dat*')
diff --git a/tests/test_util.py b/tests/test_util.py
index 799be21a9..7d635c875 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -39,49 +39,49 @@ def test_std_in_bound_per_species():
     test_structure, _ = get_random_structure(np.eye(3),['H','O'],3)
     test_structure.species_labels = ['H', 'H', 'O']
     test_structure.stds = np.array([[1, 0, 0], [2, 0, 0], [3, 0, 0]])
-
+    # Test that 'test mode' works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=0,
                              noise=0, structure = test_structure)
     assert result is True and target_atoms == [-1]
-
+    # Test that high abs tolerance works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=4,
                              noise=0, structure = test_structure)
     assert result is True and target_atoms == [-1]
-
+    # Test that low abs tolerance works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=0, abs_std_tolerance=2.9,
                              noise=0, structure = test_structure)
     assert result is False and target_atoms == [2]
-
+    # Test that high rel tolerance works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0,
                              noise=4, structure = test_structure)
     assert result is True and target_atoms == [-1]
-
+    # Test that low rel tolerance works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=0,
                              noise=2.9, structure = test_structure)
     assert result is False and target_atoms == [2]
-
+    # Test that both work
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
                              noise=2.9, structure = test_structure)
     assert result is False and target_atoms == [2, 1, 0]
-
+    # Test that the max atoms added works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
                              noise=2.9, structure = test_structure,
                                     max_atoms_added=2)
     assert result is False and target_atoms == [2, 1]
-
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
                              noise=2.9, structure = test_structure,
                                     max_atoms_added=1)
     assert result is False and target_atoms == [2]
 
+    # Test that max by species works
     result, target_atoms = \
         is_std_in_bound_per_species(rel_std_tolerance=1, abs_std_tolerance=.1,
                              noise=2.9, structure = test_structure,

From 6ccaf6ba790a1792d4952d37f4387e0e1f879ed6 Mon Sep 17 00:00:00 2001
From: Steven T <torrisi@g.harvard.edu>
Date: Fri, 25 Oct 2019 15:43:21 -0400
Subject: [PATCH 4/5] Update GPFA to include fixes, add arguments

---
 flare/gp_from_aimd.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py
index 396847a37..00e8e183f 100644
--- a/flare/gp_from_aimd.py
+++ b/flare/gp_from_aimd.py
@@ -32,15 +32,16 @@ def __init__(self, frames: List[Structure],
                  validate_ratio: float = 0.1,
                  calculate_energy: bool = False,
                  output_name: str = 'gp_from_aimd',
+                 pre_train_max_iter: int = 50,
                  max_atoms_from_frame: int = np.inf, max_trains: int = np.inf,
                  min_atoms_added: int = 1, shuffle_frames: bool = False,
-                 std_per_species: bool = False,
                  verbose: int = 0, model_write: str = '',
                  pre_train_on_skips: int = -1,
                  pre_train_seed_frames: List[Structure] = None,
                  pre_train_seed_envs: List[Tuple[AtomicEnvironment,
                                                  np.array]] = None,
                  pre_train_atoms_per_element: dict = None,
+                 train_atoms_per_element: dict = None,
                  checkpoint_interval: int = None,
                  model_format: str = 'json'):
         """
@@ -69,6 +70,8 @@ def __init__(self, frames: List[Structure],
         :param pre_train_seed_envs: Environments to train on before running
         :param pre_train_atoms_per_element: Max # of environments to add from
         each species in the seed pre-training steps
+        :param train_atoms_per_element: Max # of environments to add from
+        each species in the training steps
         :param checkpoint_interval: How often to write model after trainings
         :param model_format: Format to write GP model to
         """
@@ -86,7 +89,6 @@ def __init__(self, frames: List[Structure],
         self.curr_step = 0
         self.max_atoms_from_frame = max_atoms_from_frame
         self.min_atoms_added = min_atoms_added
-        self.std_per_species = std_per_species
         self.verbose = verbose
         self.train_count = 0
 
@@ -110,13 +112,16 @@ def __init__(self, frames: List[Structure],
         # To later be filled in using the time library
         self.start_time = None
 
+        self.pre_train_max_iter = pre_train_max_iter
         self.pre_train_on_skips = pre_train_on_skips
         self.seed_envs = [] if pre_train_seed_envs is None else \
             pre_train_seed_envs
         self.seed_frames = [] if pre_train_seed_frames is None \
             else pre_train_seed_frames
         self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
-                                               is None else pre_train_atoms_per_element
+                                       is None else pre_train_atoms_per_element
+        self.train_env_per_species = {} if train_atoms_per_element \
+                                       is None else train_atoms_per_element
 
         # Convert to Coded Species
         if self.pre_train_env_per_species:
@@ -202,7 +207,7 @@ def pre_run(self):
             if self.verbose >= 3:
                 print("Now commencing pre-run training of GP (which has "
                       "non-empty training set)")
-            self.train_gp()
+            self.train_gp(max_iter = self.pre_train_max_iter)
 
     def run(self):
         """
@@ -243,9 +248,11 @@ def run(self):
             if i < train_frame:
                 # Get max uncertainty atoms
                 std_in_bound, train_atoms = is_std_in_bound_per_species(
-                    self.rel_std_tolerance, self.abs_std_tolerance,
-                    self.gp.hyps[-1], cur_frame,
-                    self.max_atoms_from_frame, self.std_per_species)
+                    rel_std_tolerance=self.rel_std_tolerance,
+                    abs_std_tolerance=self.abs_std_tolerance,
+                    noise=self.gp.hyps[-1], structure=cur_frame,
+                    max_atoms_added=self.max_atoms_from_frame,
+                    max_by_species=self.train_env_per_species)
 
                 if not std_in_bound:
 
@@ -298,14 +305,25 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
         if train:
             self.train_gp()
 
-    def train_gp(self):
+    def train_gp(self, max_iter: int = None):
         """
         Train the Gaussian process and write the results to the output file.
         """
         if self.verbose >= 1:
             self.output.write_to_log('Train GP\n')
 
-        self.gp.train(output=self.output if self.verbose >= 2 else None)
+        # TODO: Improve flexibility in GP training to make this next step
+        # unnecessary, so maxiter can be passed as an argument
+
+        if max_iter is not None:
+            temp_maxiter = self.gp.maxiter
+            self.gp.maxiter = max_iter
+            self.gp.train(output=self.output if self.verbose >= 2 else None)
+            self.gp.maxiter = temp_maxiter
+
+        else:
+            self.gp.train(output=self.output if self.verbose >= 2 else None)
+
         self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                                self.start_time,
                                self.gp.likelihood, self.gp.likelihood_gradient)

From c010e0a3cce30c7f6dbf87b8f1525a6906ef31da Mon Sep 17 00:00:00 2001
From: Steven T <torrisi@g.harvard.edu>
Date: Fri, 25 Oct 2019 15:57:28 -0400
Subject: [PATCH 5/5] Added some comments

---
 flare/gp_from_aimd.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py
index 00e8e183f..db3b198ab 100644
--- a/flare/gp_from_aimd.py
+++ b/flare/gp_from_aimd.py
@@ -54,6 +54,7 @@ def __init__(self, frames: List[Structure],
         noise variance hyperparameter
         :param abs_std_tolerance: Train if uncertainty is above this
         :param parallel: Use parallel functions or not
+        :param validate_ratio: Fraction of frames used for validation
         :param no_cpus: number of cpus to run with multithreading
         :param skip: Skip through frames
         :param calculate_energy: Use local energy kernel or not
@@ -168,6 +169,7 @@ def pre_run(self):
         # so all atomic species are represented in the first step.
         # Otherwise use the seed frames passed in by user.
 
+        # Remove frames used as seed from later part of training
         if self.pre_train_on_skips > 0:
             self.seed_frames = []
             newframes = []
@@ -177,6 +179,7 @@ def pre_run(self):
                 else:
                     newframes += [self.frames[i]]
             self.frames = newframes
+
         elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
             self.seed_frames = [self.frames[0]]
             self.frames = self.frames[1:]