Merge pull request #104 from mir-group/steven/feature/std_species

STD Species refactor + 100% coverage
mir-group · Oct 28, 2019 · bf3e5c4 · bf3e5c4
2 parents ed478b9 + c010e0a
commit bf3e5c4
Show file tree

Hide file tree

Showing 6 changed files with 180 additions and 145 deletions.
diff --git a/flare/gp_from_aimd.py b/flare/gp_from_aimd.py
@@ -32,15 +32,16 @@ def __init__(self, frames: List[Structure],
                  validate_ratio: float = 0.1,
                  calculate_energy: bool = False,
                  output_name: str = 'gp_from_aimd',
+                 pre_train_max_iter: int = 50,
                  max_atoms_from_frame: int = np.inf, max_trains: int = np.inf,
                  min_atoms_added: int = 1, shuffle_frames: bool = False,
-                 std_per_species: bool = False,
                  verbose: int = 0, model_write: str = '',
                  pre_train_on_skips: int = -1,
                  pre_train_seed_frames: List[Structure] = None,
                  pre_train_seed_envs: List[Tuple[AtomicEnvironment,
                                                  np.array]] = None,
                  pre_train_atoms_per_element: dict = None,
+                 train_atoms_per_element: dict = None,
                  checkpoint_interval: int = None,
                  model_format: str = 'json'):
         """
@@ -53,6 +54,7 @@ def __init__(self, frames: List[Structure],
         noise variance hyperparameter
         :param abs_std_tolerance: Train if uncertainty is above this
         :param parallel: Use parallel functions or not
+        :param validate_ratio: Fraction of frames used for validation
         :param no_cpus: number of cpus to run with multithreading
         :param skip: Skip through frames
         :param calculate_energy: Use local energy kernel or not
@@ -69,6 +71,8 @@ def __init__(self, frames: List[Structure],
         :param pre_train_seed_envs: Environments to train on before running
         :param pre_train_atoms_per_element: Max # of environments to add from
         each species in the seed pre-training steps
+        :param train_atoms_per_element: Max # of environments to add from
+        each species in the training steps
         :param checkpoint_interval: How often to write model after trainings
         :param model_format: Format to write GP model to
         """
@@ -86,7 +90,6 @@ def __init__(self, frames: List[Structure],
         self.curr_step = 0
         self.max_atoms_from_frame = max_atoms_from_frame
         self.min_atoms_added = min_atoms_added
-        self.std_per_species = std_per_species
         self.verbose = verbose
         self.train_count = 0
 
@@ -110,13 +113,16 @@ def __init__(self, frames: List[Structure],
         # To later be filled in using the time library
         self.start_time = None
 
+        self.pre_train_max_iter = pre_train_max_iter
         self.pre_train_on_skips = pre_train_on_skips
         self.seed_envs = [] if pre_train_seed_envs is None else \
             pre_train_seed_envs
         self.seed_frames = [] if pre_train_seed_frames is None \
             else pre_train_seed_frames
         self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
-                                               is None else pre_train_atoms_per_element
+                                       is None else pre_train_atoms_per_element
+        self.train_env_per_species = {} if train_atoms_per_element \
+                                       is None else train_atoms_per_element
 
         # Convert to Coded Species
         if self.pre_train_env_per_species:
@@ -163,6 +169,7 @@ def pre_run(self):
         # so all atomic species are represented in the first step.
         # Otherwise use the seed frames passed in by user.
 
+        # Remove frames used as seed from later part of training
         if self.pre_train_on_skips > 0:
             self.seed_frames = []
             newframes = []
@@ -172,6 +179,7 @@ def pre_run(self):
                 else:
                     newframes += [self.frames[i]]
             self.frames = newframes
+
         elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
             self.seed_frames = [self.frames[0]]
             self.frames = self.frames[1:]
@@ -202,7 +210,7 @@ def pre_run(self):
             if self.verbose >= 3:
                 print("Now commencing pre-run training of GP (which has "
                       "non-empty training set)")
-            self.train_gp()
+            self.train_gp(max_iter = self.pre_train_max_iter)
 
     def run(self):
         """
@@ -243,9 +251,11 @@ def run(self):
             if i < train_frame:
                 # Get max uncertainty atoms
                 std_in_bound, train_atoms = is_std_in_bound_per_species(
-                    self.rel_std_tolerance, self.abs_std_tolerance,
-                    self.gp.hyps[-1], cur_frame,
-                    self.max_atoms_from_frame, self.std_per_species)
+                    rel_std_tolerance=self.rel_std_tolerance,
+                    abs_std_tolerance=self.abs_std_tolerance,
+                    noise=self.gp.hyps[-1], structure=cur_frame,
+                    max_atoms_added=self.max_atoms_from_frame,
+                    max_by_species=self.train_env_per_species)
 
                 if not std_in_bound:
 
@@ -298,14 +308,25 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
         if train:
             self.train_gp()
 
-    def train_gp(self):
+    def train_gp(self, max_iter: int = None):
         """
         Train the Gaussian process and write the results to the output file.
         """
         if self.verbose >= 1:
             self.output.write_to_log('Train GP\n')
 
-        self.gp.train(output=self.output if self.verbose >= 2 else None)
+        # TODO: Improve flexibility in GP training to make this next step
+        # unnecessary, so maxiter can be passed as an argument
+
+        if max_iter is not None:
+            temp_maxiter = self.gp.maxiter
+            self.gp.maxiter = max_iter
+            self.gp.train(output=self.output if self.verbose >= 2 else None)
+            self.gp.maxiter = temp_maxiter
+
+        else:
+            self.gp.train(output=self.output if self.verbose >= 2 else None)
+
         self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                                self.start_time,
                                self.gp.likelihood, self.gp.likelihood_gradient)

diff --git a/flare/output.py b/flare/output.py
@@ -88,7 +88,7 @@ def write_header(self, cutoffs, kernel_name,
         """
 
         f = self.outfiles['log']
-        f.write(str(datetime.datetime.now()) + '\n')
+        f.write(f'{datetime.datetime.now()} \n')
 
         if isinstance(std_tolerance, tuple):
             std_string = 'relative uncertainty tolerance: ' \
@@ -161,13 +161,13 @@ def write_md_config(self, dt, curr_step, structure,
         # Mark if a frame had DFT forces with an asterisk
         if not dft_step:
             string += '-' * 80 + '\n'
-            string += "-Frame: " + str(curr_step)
+            string += f"-Frame: {curr_step} "
             header = "-"
         else:
-            string += "\n*-Frame: " + str(curr_step)
+            string += f"\n*-Frame: {curr_step} "
             header = "*-"
 
-        string += '\nSimulation Time: %.3f ps \n' % (dt * curr_step)
+        string += f'\nSimulation Time: {(dt * curr_step):.3} ps \n'
 
         # Construct Header line
         string += 'El  Position (A) \t\t\t\t '
@@ -180,38 +180,38 @@ def write_md_config(self, dt, curr_step, structure,
 
         # Construct atom-by-atom description
         for i in range(len(structure.positions)):
-            string += str(structure.species_labels[i]) + ' '
+            string += f'{structure.species_labels[i]} '
             for j in range(3):
-                string += str("%.8f" % structure.positions[i][j]) + ' '
+                string += f"{structure.positions[i][j]:.8}"
             string += '\t'
             for j in range(3):
-                string += str("%.8f" % structure.forces[i][j]) + ' '
+                string += f"{structure.forces[i][j]:8.3} "
             string += '\t'
             for j in range(3):
-                string += str("%.8e" % structure.stds[i][j]) + ' '
+                string += f'{structure.stds[i][j]:.8e}'
             string += '\t'
             for j in range(3):
-                string += str("%.8e" % velocities[i][j]) + ' '
+                string += f'{velocities[i][j]:.8e}'
             string += '\n'
 
         self.write_xyz_config(curr_step, structure, dft_step)
         self.write_xyz(curr_step, structure.stds, structure.species_labels,
                        "std", header)
 
         string += '\n'
-        string += 'temperature: %.2f K \n' % temperature
-        string += 'kinetic energy: %.6f eV \n' % KE
+        string += f'temperature: {temperature:.2f} K \n'
+        string += f'kinetic energy: {KE:.6f} eV \n'
 
         # calculate potential and total energy
         if local_energies is not None:
             pot_en = np.sum(local_energies)
             tot_en = KE + pot_en
             string += \
-                'potential energy: %.6f eV \n' % pot_en
-            string += 'total energy: %.6f eV \n' % tot_en
+                f'potential energy: {pot_en:.6f} eV \n'
+            string += f'total energy: {tot_en:.6f} eV \n'
 
-        string += 'wall time from start: %.2f s \n' % \
-                  (time.time() - start_time)
+        string += 'wall time from start: '
+        string += f'{(time.time() - start_time):.2f} s \n'
 
         self.outfiles['log'].write(string)
 

diff --git a/flare/util.py b/flare/util.py
@@ -212,64 +212,71 @@ def is_std_in_bound(std_tolerance, noise, structure, max_atoms_added):
     else:
         return True, [-1]
 
-def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, structure, max_atoms_added, std_per_species):
+def is_std_in_bound_per_species(rel_std_tolerance: float,
+                                abs_std_tolerance: float, noise: float,
+                                structure, max_atoms_added: int =
+                                np.inf, max_by_species: dict = {}):
     """
-    If the predicted variance is too high, returns a list of atoms
-    with the highest uncertainty
-    :param frame: Structure
+    Checks the stds of GP prediction assigned to the structure, returns a
+    list of atoms which either meet an absolute threshold or a relative
+    threshold defined by rel_std_tolerance * noise. Can limit the
+    total number of target atoms via max_atoms_added, and limit per species
+    by max_by_species.
+
+    The max_atoms_added argument will 'overrule' the
+    max by species; e.g. if max_atoms_added is 2 and max_by_species is {"H":3},
+    then at most two atoms total will be added.
+
+    :param rel_std_tolerance:
+    :param abs_std_tolerance:
+    :param noise:
+    :param structure:
+    :param max_atoms_added:
+    :param max_by_species:
     :return:
     """
 
+
     # This indicates test mode, as the GP is not being modified in any way
     if rel_std_tolerance == 0 and abs_std_tolerance == 0:
         return True, [-1]
 
     # set uncertainty threshold
-    if rel_std_tolerance is None:
+    if rel_std_tolerance is None or rel_std_tolerance == 0 :
         threshold = abs_std_tolerance
-    elif abs_std_tolerance is None:
+    elif abs_std_tolerance is None or abs_std_tolerance == 0:
         threshold = rel_std_tolerance * np.abs(noise)
     else:
         threshold = min(rel_std_tolerance * np.abs(noise),
                         abs_std_tolerance)
 
-    if (std_per_species is False):
-        return is_std_in_bound(-threshold, noise,
-                structure, max_atoms_added)
-
-    # sort max stds
-    max_stds = np.zeros(structure.nat)
-    for atom_idx, std in enumerate(structure.stds):
-        max_stds[atom_idx] = np.max(std)
-
-    std_sorted = {}
-    id_sorted = {}
-    species_set = set(structure.coded_species)
-    for species_i in species_set:
-        std_sorted[species_i] = []
-        id_sorted[species_i] = []
-    for i, spec in enumerate(structure.coded_species):
-        if max_stds[i] > threshold:
-            std_sorted[spec] += [max_stds[i]]
-            id_sorted[spec] += [i]
+    # Determine if any std component will trigger the threshold
+    max_std_components = [np.max(std) for std in structure.stds]
+    if max(max_std_components) < threshold:
+        return True, [-1]
 
     target_atoms = []
-    ntarget = 0
-    for species_i in species_set:
-        natom_i = len(std_sorted[species_i])
-        if (natom_i>0):
-            std_sorted[species_i] = np.argsort(np.array(std_sorted[species_i]))
-            if (max_atoms_added == np.inf or \
-                    max_atoms_added > natom_i):
-                target_atoms += id_sorted[species_i]
-                ntarget += natom_i
-            else:
-                for i in range(max_atoms_added):
-                    tempid = std_sorted[species_i][i]
-                    target_atoms += [id_sorted[species_i][tempid]]
-                ntarget += max_atoms_added
-    if (ntarget > 0):
-        target_atoms = list(np.hstack(target_atoms))
-        return False, target_atoms
-    else:
-        return True, [-1]
+
+    # Sort from greatest to smallest max. std component
+    std_arg_sorted = np.flip(np.argsort(max_std_components))
+
+    present_species = {spec: 0 for spec in set(structure.species_labels)}
+
+    # Only add atoms up to the bound
+    for i in std_arg_sorted:
+
+        # If max atoms added reached or stds are now below threshold, done
+        if len(target_atoms) == max_atoms_added or \
+                max_std_components[i] < threshold:
+            break
+
+        cur_spec = structure.species_labels[i]
+
+        # Only add up to species allowance, if it exists
+        if present_species[cur_spec] < \
+            max_by_species.get(cur_spec, np.inf):
+
+            target_atoms.append(i)
+            present_species[cur_spec] += 1
+
+    return False, target_atoms
diff --git a/tests/test_OTF_cp2k_par.py b/tests/test_OTF_cp2k_par.py
@@ -8,11 +8,12 @@
 from flare.struc import Structure
 import flare.kernels as en
 
-def cleanup(target: str = None):
+def cleanup(target: list = None):
     os.remove('cp2k.in')
     os.remove('cp2k-RESTART.wfn')
     if (target is not None):
-        os.remove(target)
+        for i in target:
+            os.remove(i)
 
 # ------------------------------------------------------
 #                   test  otf runs