Skip to content

Commit

Permalink
Merge pull request #104 from mir-group/steven/feature/std_species
Browse files Browse the repository at this point in the history
STD Species refactor + 100% coverage
  • Loading branch information
stevetorr authored and Lixin Sun committed Oct 28, 2019
2 parents ed478b9 + c010e0a commit bf3e5c4
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 145 deletions.
39 changes: 30 additions & 9 deletions flare/gp_from_aimd.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@ def __init__(self, frames: List[Structure],
validate_ratio: float = 0.1,
calculate_energy: bool = False,
output_name: str = 'gp_from_aimd',
pre_train_max_iter: int = 50,
max_atoms_from_frame: int = np.inf, max_trains: int = np.inf,
min_atoms_added: int = 1, shuffle_frames: bool = False,
std_per_species: bool = False,
verbose: int = 0, model_write: str = '',
pre_train_on_skips: int = -1,
pre_train_seed_frames: List[Structure] = None,
pre_train_seed_envs: List[Tuple[AtomicEnvironment,
np.array]] = None,
pre_train_atoms_per_element: dict = None,
train_atoms_per_element: dict = None,
checkpoint_interval: int = None,
model_format: str = 'json'):
"""
Expand All @@ -53,6 +54,7 @@ def __init__(self, frames: List[Structure],
noise variance hyperparameter
:param abs_std_tolerance: Train if uncertainty is above this
:param parallel: Use parallel functions or not
:param validate_ratio: Fraction of frames used for validation
:param no_cpus: number of cpus to run with multithreading
:param skip: Skip through frames
:param calculate_energy: Use local energy kernel or not
Expand All @@ -69,6 +71,8 @@ def __init__(self, frames: List[Structure],
:param pre_train_seed_envs: Environments to train on before running
:param pre_train_atoms_per_element: Max # of environments to add from
each species in the seed pre-training steps
:param train_atoms_per_element: Max # of environments to add from
each species in the training steps
:param checkpoint_interval: How often to write model after trainings
:param model_format: Format to write GP model to
"""
Expand All @@ -86,7 +90,6 @@ def __init__(self, frames: List[Structure],
self.curr_step = 0
self.max_atoms_from_frame = max_atoms_from_frame
self.min_atoms_added = min_atoms_added
self.std_per_species = std_per_species
self.verbose = verbose
self.train_count = 0

Expand All @@ -110,13 +113,16 @@ def __init__(self, frames: List[Structure],
# To later be filled in using the time library
self.start_time = None

self.pre_train_max_iter = pre_train_max_iter
self.pre_train_on_skips = pre_train_on_skips
self.seed_envs = [] if pre_train_seed_envs is None else \
pre_train_seed_envs
self.seed_frames = [] if pre_train_seed_frames is None \
else pre_train_seed_frames
self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
is None else pre_train_atoms_per_element
is None else pre_train_atoms_per_element
self.train_env_per_species = {} if train_atoms_per_element \
is None else train_atoms_per_element

# Convert to Coded Species
if self.pre_train_env_per_species:
Expand Down Expand Up @@ -163,6 +169,7 @@ def pre_run(self):
# so all atomic species are represented in the first step.
# Otherwise use the seed frames passed in by user.

# Remove frames used as seed from later part of training
if self.pre_train_on_skips > 0:
self.seed_frames = []
newframes = []
Expand All @@ -172,6 +179,7 @@ def pre_run(self):
else:
newframes += [self.frames[i]]
self.frames = newframes

elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
self.seed_frames = [self.frames[0]]
self.frames = self.frames[1:]
Expand Down Expand Up @@ -202,7 +210,7 @@ def pre_run(self):
if self.verbose >= 3:
print("Now commencing pre-run training of GP (which has "
"non-empty training set)")
self.train_gp()
self.train_gp(max_iter = self.pre_train_max_iter)

def run(self):
"""
Expand Down Expand Up @@ -243,9 +251,11 @@ def run(self):
if i < train_frame:
# Get max uncertainty atoms
std_in_bound, train_atoms = is_std_in_bound_per_species(
self.rel_std_tolerance, self.abs_std_tolerance,
self.gp.hyps[-1], cur_frame,
self.max_atoms_from_frame, self.std_per_species)
rel_std_tolerance=self.rel_std_tolerance,
abs_std_tolerance=self.abs_std_tolerance,
noise=self.gp.hyps[-1], structure=cur_frame,
max_atoms_added=self.max_atoms_from_frame,
max_by_species=self.train_env_per_species)

if not std_in_bound:

Expand Down Expand Up @@ -298,14 +308,25 @@ def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
if train:
self.train_gp()

def train_gp(self):
def train_gp(self, max_iter: int = None):
"""
Train the Gaussian process and write the results to the output file.
"""
if self.verbose >= 1:
self.output.write_to_log('Train GP\n')

self.gp.train(output=self.output if self.verbose >= 2 else None)
# TODO: Improve flexibility in GP training to make this next step
# unnecessary, so maxiter can be passed as an argument

if max_iter is not None:
temp_maxiter = self.gp.maxiter
self.gp.maxiter = max_iter
self.gp.train(output=self.output if self.verbose >= 2 else None)
self.gp.maxiter = temp_maxiter

else:
self.gp.train(output=self.output if self.verbose >= 2 else None)

self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
self.start_time,
self.gp.likelihood, self.gp.likelihood_gradient)
Expand Down
30 changes: 15 additions & 15 deletions flare/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def write_header(self, cutoffs, kernel_name,
"""

f = self.outfiles['log']
f.write(str(datetime.datetime.now()) + '\n')
f.write(f'{datetime.datetime.now()} \n')

if isinstance(std_tolerance, tuple):
std_string = 'relative uncertainty tolerance: ' \
Expand Down Expand Up @@ -161,13 +161,13 @@ def write_md_config(self, dt, curr_step, structure,
# Mark if a frame had DFT forces with an asterisk
if not dft_step:
string += '-' * 80 + '\n'
string += "-Frame: " + str(curr_step)
string += f"-Frame: {curr_step} "
header = "-"
else:
string += "\n*-Frame: " + str(curr_step)
string += f"\n*-Frame: {curr_step} "
header = "*-"

string += '\nSimulation Time: %.3f ps \n' % (dt * curr_step)
string += f'\nSimulation Time: {(dt * curr_step):.3} ps \n'

# Construct Header line
string += 'El Position (A) \t\t\t\t '
Expand All @@ -180,38 +180,38 @@ def write_md_config(self, dt, curr_step, structure,

# Construct atom-by-atom description
for i in range(len(structure.positions)):
string += str(structure.species_labels[i]) + ' '
string += f'{structure.species_labels[i]} '
for j in range(3):
string += str("%.8f" % structure.positions[i][j]) + ' '
string += f"{structure.positions[i][j]:.8}"
string += '\t'
for j in range(3):
string += str("%.8f" % structure.forces[i][j]) + ' '
string += f"{structure.forces[i][j]:8.3} "
string += '\t'
for j in range(3):
string += str("%.8e" % structure.stds[i][j]) + ' '
string += f'{structure.stds[i][j]:.8e}'
string += '\t'
for j in range(3):
string += str("%.8e" % velocities[i][j]) + ' '
string += f'{velocities[i][j]:.8e}'
string += '\n'

self.write_xyz_config(curr_step, structure, dft_step)
self.write_xyz(curr_step, structure.stds, structure.species_labels,
"std", header)

string += '\n'
string += 'temperature: %.2f K \n' % temperature
string += 'kinetic energy: %.6f eV \n' % KE
string += f'temperature: {temperature:.2f} K \n'
string += f'kinetic energy: {KE:.6f} eV \n'

# calculate potential and total energy
if local_energies is not None:
pot_en = np.sum(local_energies)
tot_en = KE + pot_en
string += \
'potential energy: %.6f eV \n' % pot_en
string += 'total energy: %.6f eV \n' % tot_en
f'potential energy: {pot_en:.6f} eV \n'
string += f'total energy: {tot_en:.6f} eV \n'

string += 'wall time from start: %.2f s \n' % \
(time.time() - start_time)
string += 'wall time from start: '
string += f'{(time.time() - start_time):.2f} s \n'

self.outfiles['log'].write(string)

Expand Down
95 changes: 51 additions & 44 deletions flare/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,64 +212,71 @@ def is_std_in_bound(std_tolerance, noise, structure, max_atoms_added):
else:
return True, [-1]

def is_std_in_bound_per_species(rel_std_tolerance, abs_std_tolerance, noise, structure, max_atoms_added, std_per_species):
def is_std_in_bound_per_species(rel_std_tolerance: float,
abs_std_tolerance: float, noise: float,
structure, max_atoms_added: int =
np.inf, max_by_species: dict = {}):
"""
If the predicted variance is too high, returns a list of atoms
with the highest uncertainty
:param frame: Structure
Checks the stds of GP prediction assigned to the structure, returns a
list of atoms which either meet an absolute threshold or a relative
threshold defined by rel_std_tolerance * noise. Can limit the
total number of target atoms via max_atoms_added, and limit per species
by max_by_species.
The max_atoms_added argument will 'overrule' the
max by species; e.g. if max_atoms_added is 2 and max_by_species is {"H":3},
then at most two atoms total will be added.
:param rel_std_tolerance:
:param abs_std_tolerance:
:param noise:
:param structure:
:param max_atoms_added:
:param max_by_species:
:return:
"""


# This indicates test mode, as the GP is not being modified in any way
if rel_std_tolerance == 0 and abs_std_tolerance == 0:
return True, [-1]

# set uncertainty threshold
if rel_std_tolerance is None:
if rel_std_tolerance is None or rel_std_tolerance == 0 :
threshold = abs_std_tolerance
elif abs_std_tolerance is None:
elif abs_std_tolerance is None or abs_std_tolerance == 0:
threshold = rel_std_tolerance * np.abs(noise)
else:
threshold = min(rel_std_tolerance * np.abs(noise),
abs_std_tolerance)

if (std_per_species is False):
return is_std_in_bound(-threshold, noise,
structure, max_atoms_added)

# sort max stds
max_stds = np.zeros(structure.nat)
for atom_idx, std in enumerate(structure.stds):
max_stds[atom_idx] = np.max(std)

std_sorted = {}
id_sorted = {}
species_set = set(structure.coded_species)
for species_i in species_set:
std_sorted[species_i] = []
id_sorted[species_i] = []
for i, spec in enumerate(structure.coded_species):
if max_stds[i] > threshold:
std_sorted[spec] += [max_stds[i]]
id_sorted[spec] += [i]
# Determine if any std component will trigger the threshold
max_std_components = [np.max(std) for std in structure.stds]
if max(max_std_components) < threshold:
return True, [-1]

target_atoms = []
ntarget = 0
for species_i in species_set:
natom_i = len(std_sorted[species_i])
if (natom_i>0):
std_sorted[species_i] = np.argsort(np.array(std_sorted[species_i]))
if (max_atoms_added == np.inf or \
max_atoms_added > natom_i):
target_atoms += id_sorted[species_i]
ntarget += natom_i
else:
for i in range(max_atoms_added):
tempid = std_sorted[species_i][i]
target_atoms += [id_sorted[species_i][tempid]]
ntarget += max_atoms_added
if (ntarget > 0):
target_atoms = list(np.hstack(target_atoms))
return False, target_atoms
else:
return True, [-1]

# Sort from greatest to smallest max. std component
std_arg_sorted = np.flip(np.argsort(max_std_components))

present_species = {spec: 0 for spec in set(structure.species_labels)}

# Only add atoms up to the bound
for i in std_arg_sorted:

# If max atoms added reached or stds are now below threshold, done
if len(target_atoms) == max_atoms_added or \
max_std_components[i] < threshold:
break

cur_spec = structure.species_labels[i]

# Only add up to species allowance, if it exists
if present_species[cur_spec] < \
max_by_species.get(cur_spec, np.inf):

target_atoms.append(i)
present_species[cur_spec] += 1

return False, target_atoms
5 changes: 3 additions & 2 deletions tests/test_OTF_cp2k_par.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from flare.struc import Structure
import flare.kernels as en

def cleanup(target: str = None):
def cleanup(target: list = None):
os.remove('cp2k.in')
os.remove('cp2k-RESTART.wfn')
if (target is not None):
os.remove(target)
for i in target:
os.remove(i)

# ------------------------------------------------------
# test otf runs
Expand Down
Loading

0 comments on commit bf3e5c4

Please sign in to comment.