From d2ccdda31461b8dcccd3f35c44ce9b59d054b293 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Mon, 22 Feb 2021 15:46:21 +0100 Subject: [PATCH 1/5] add PaceMakerJob job --- pyiron_gpl/__init__.py | 3 +- pyiron_gpl/pacemaker/__init__.py | 0 pyiron_gpl/pacemaker/pacemaker.py | 225 ++++++++++++++++++++++++++++++ setup.py | 3 +- 4 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 pyiron_gpl/pacemaker/__init__.py create mode 100644 pyiron_gpl/pacemaker/pacemaker.py diff --git a/pyiron_gpl/__init__.py b/pyiron_gpl/__init__.py index 7a78ce3..fa9058c 100644 --- a/pyiron_gpl/__init__.py +++ b/pyiron_gpl/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1" +__version__ = "0.1.1" __all__ = [] from pyiron_atomistics.project import Project @@ -6,6 +6,7 @@ # Make classes available for new pyiron version JOB_CLASS_DICT['ElasticMatrixJob'] = 'pyiron_gpl.elastic.elastic' +JOB_CLASS_DICT['PaceMakerJob'] = 'pyiron_gpl.pacemaker.pacemaker' from ._version import get_versions diff --git a/pyiron_gpl/pacemaker/__init__.py b/pyiron_gpl/pacemaker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyiron_gpl/pacemaker/pacemaker.py b/pyiron_gpl/pacemaker/pacemaker.py new file mode 100644 index 0000000..9369575 --- /dev/null +++ b/pyiron_gpl/pacemaker/pacemaker.py @@ -0,0 +1,225 @@ +# coding: utf-8 +# Copyright (c) ICAMS, Ruhr University Bochum, 2021 +# Distributed under the terms of "GPLv3", see the LICENSE file. + +import logging +import numpy as np +import os +import pandas as pd +import re +import ruamel.yaml as yaml +from pyiron_base import GenericJob, GenericParameters +from pyiron_base.settings.generic import Settings +from shutil import copyfile + + +s = Settings() + +try: + from pyace import BBasisConfiguration, ACEBBasisSet + + HAS_PYACE = True +except ImportError as e: + print("Could not import `pyace` package. The package should be installed for proper functionality") + HAS_PYACE = False + +# set loggers +loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] +for logger in loggers: + logger.setLevel(logging.WARNING) + +class PaceMakerJob(GenericJob): + def __init__(self, project, job_name): + super().__init__(project, job_name) + self.__name__ = "PaceMakerJob" + self.__version__ = "0.1" + + self.input = GenericParameters(table_name="input") + self.input['cutoff'] = 10. + self.input['metadata'] = {} + self.input['data'] = {} # data_config + self.input['potential'] = {} # potential_config + self.input['fit'] = {} # fit_config + self.input['backend'] = {'evaluator': 'tensorpot'} # backend_config + + self.structure_data = None + + # self.executable = "pacemaker input.yaml -l log.txt" + self._executable = None + self._executable_activate() + + def _save_structure_dataframe_pckl_gzip(self, df): + df.rename(columns={"number_of_atoms": "NUMBER_OF_ATOMS", + "energy": "energy_corrected", + "atoms": "ase_atoms"}, inplace=True) + df["NUMBER_OF_ATOMS"] = df["NUMBER_OF_ATOMS"].astype(int) + if "pbc" not in df.columns: + df["pbc"] = df["ase_atoms"].map(lambda atoms: np.all(atoms.pbc)) + + data_file_name = os.path.join(self.working_directory, "df_fit.pckl.gzip") + logging.info("Saving training structures dataframe into {} with pickle protocol = 4, compression = gzip".format( + data_file_name)) + df.to_pickle(data_file_name, compression="gzip", protocol=4) + return data_file_name + + def write_input(self): + # prepare datafile + if self.structure_data is None: + raise ValueError( + "`structure_data` is none, but should be pd.DataFrame, TrainingContainer or valid pickle.gzip filename") + if isinstance(self.structure_data, pd.DataFrame): + logging.info("structure_data is pandas.DataFrame") + data_file_name = self._save_structure_dataframe_pckl_gzip(self.structure_data) + self.input["data"] = {"filename": data_file_name} + elif isinstance(self.structure_data, str): # filename + if os.path.isfile(self.structure_data): + logging.info("structure_data is valid file path") + self.input["data"] = {"filename": self.structure_data} + else: + raise ValueError("Provided structure_data filename ({}) doesn't exists".format(self.structure_data)) + elif hasattr(self.structure_data, "get_pandas"): # duck-typing check for TrainingContainer + logging.info("structure_data is TrainingContainer") + df = self.structure_data.to_pandas() + data_file_name = self._save_structure_dataframe_pckl_gzip(df) + self.input["data"] = {"filename": data_file_name} + + metadata_dict = self.input["metadata"] + metadata_dict["pyiron_job_id"] = str(self.job_id) + + input_yaml_dict = { + "cutoff": self.input["cutoff"], + "metadata": metadata_dict, + 'potential': self.input['potential'], + 'data': self.input["data"], + 'fit': self.input["fit"], + 'backend': self.input["backend"], + } + + if isinstance(self.input["potential"], str): + pot_file_name = self.input["potential"] + if os.path.isfile(pot_file_name): + logging.info("Input potential is filename") + pot_basename = os.path.basename(pot_file_name) + copyfile(pot_file_name, os.path.join(self.working_directory, pot_basename)) + input_yaml_dict['potential'] = pot_basename + # TODO: check if initial potential is provided (for continuation of fit) + else: + raise ValueError("Provided potential filename ({}) doesn't exists".format(self.input["potential"])) + + with open(os.path.join(self.working_directory, "input.yaml"), "w") as f: + yaml.dump(input_yaml_dict, f) + + def _analyse_log(self, logfile="log.txt"): + log_filename = os.path.join(self.working_directory, logfile) + + with open(log_filename, "r") as f: + loglines = f.readlines() + + losses = [] + ef_rmses = [] + + for l in loglines: + if "INFO" in l and "Iteration:" in l: + loss = re.findall("Loss: ([\d.]*)", l)[0] + losses.append(loss) + + ef_rmse_list = re.findall( + "RMSE Energy\(low\): ([0-9.]+) \(([0-9.]+)\) meV/at | Forces\(low\): ([0-9.]+) \(([0-9.]+)\) meV/A", + l) + + ef_rmses.append([ef_rmse_list[0][0], ef_rmse_list[0][1], ef_rmse_list[1][-2], ef_rmse_list[1][-1]]) + + losses = np.array(losses).astype(float) + + ef_rmses = np.array(ef_rmses).astype(float) + res_dict = {} + res_dict["loss"] = losses + res_dict["rmse_energy"] = ef_rmses[:, 0] + res_dict["rmse_energy_low"] = ef_rmses[:, 1] + res_dict["rmse_forces"] = ef_rmses[:, 2] + res_dict["rmse_forces_low"] = ef_rmses[:, 3] + return res_dict + + def collect_output(self): + final_potential_filename = self.get_final_potential_filename() + with open(final_potential_filename, "r") as f: + yaml_lines = f.readlines() + final_potential_yaml_string = "".join(yaml_lines) + + bbasis = ACEBBasisSet(final_potential_filename) + cbasis = bbasis.to_ACECTildeBasisSet() + + cbasis.save(self.get_final_potential_filename_ace()) + with open(self.get_final_potential_filename_ace(), "r") as f: + ace_lines = f.readlines() + final_potential_ace_string = "".join(ace_lines) + + elements_name = bbasis.elements_name + + with self.project_hdf5.open("output/potential") as h5out: + h5out["yaml"] = final_potential_yaml_string + h5out["ace"] = final_potential_ace_string + h5out["elements_name"] = elements_name + + log_res_dict = self._analyse_log() + + with self.project_hdf5.open("output/log") as h5out: + for key, arr in log_res_dict.items(): + h5out[key] = arr + + def get_lammps_potential(self): + elements_name = self["output/potential/elements_name"] + elem = " ".join(elements_name) + pot_file_name = self.get_final_potential_filename_ace() + pot_dict = { + 'Config': [["pair_style pace\n", "pair_coeff * * {} {}\n".format(pot_file_name, elem)]], + 'Filename': [""], + 'Model': ["ACE"], + 'Name': [self.job_name], + 'Species': [elements_name] + } + + ace_potential = pd.DataFrame(pot_dict) + + return ace_potential + + def to_hdf(self, hdf=None, group_name=None): + super().to_hdf( + hdf=hdf, + group_name=group_name + ) + with self.project_hdf5.open("input") as h5in: + self.input.to_hdf(h5in) + + def from_hdf(self, hdf=None, group_name=None): + super().from_hdf( + hdf=hdf, + group_name=group_name + ) + with self.project_hdf5.open("input") as h5in: + self.input.from_hdf(h5in) + + def get_final_potential_filename(self): + return os.path.join(self.working_directory, "output_potential.yaml") + + def get_final_potential_filename_ace(self): + return os.path.join(self.working_directory, "output_potential.ace") + + def get_current_potential_filename(self): + return os.path.join(self.working_directory, "interim_potential_1.yaml") + + def get_current_potential(self): + if HAS_PYACE: + current_potential_filename = self.get_current_potential_filename() + bbasis = BBasisConfiguration(current_potential_filename) + return bbasis + else: + raise RuntimeError("`pyace` package is not installed") + + def get_final_potential(self): + if HAS_PYACE: + final_potential_filename = self.get_final_potential_filename() + bbasis = BBasisConfiguration(final_potential_filename) + return bbasis + else: + raise RuntimeError("`pyace` pacakge is not installed") diff --git a/setup.py b/setup.py index b633ccc..beba965 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ install_requires=[ 'pyiron_atomistics==0.2.3', 'spglib==1.16.1', - 'scipy==1.6.0' + 'scipy==1.6.0', + 'numpy==1.19.5', ], cmdclass=versioneer.get_cmdclass(), From fab25b4080467b6e44bb330c088e80215b728947 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Mon, 22 Feb 2021 22:32:58 +0100 Subject: [PATCH 2/5] rename output_potential.yaml to pyiron job's name.yaml in collect output --- pyiron_gpl/pacemaker/pacemaker.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pyiron_gpl/pacemaker/pacemaker.py b/pyiron_gpl/pacemaker/pacemaker.py index 9369575..9e643d5 100644 --- a/pyiron_gpl/pacemaker/pacemaker.py +++ b/pyiron_gpl/pacemaker/pacemaker.py @@ -141,12 +141,15 @@ def _analyse_log(self, logfile="log.txt"): return res_dict def collect_output(self): + output_potential_filename = self.get_output_potential_filename() final_potential_filename = self.get_final_potential_filename() - with open(final_potential_filename, "r") as f: + + copyfile(output_potential_filename, final_potential_filename) + with open(output_potential_filename, "r") as f: yaml_lines = f.readlines() final_potential_yaml_string = "".join(yaml_lines) - bbasis = ACEBBasisSet(final_potential_filename) + bbasis = ACEBBasisSet(output_potential_filename) cbasis = bbasis.to_ACECTildeBasisSet() cbasis.save(self.get_final_potential_filename_ace()) @@ -199,11 +202,14 @@ def from_hdf(self, hdf=None, group_name=None): with self.project_hdf5.open("input") as h5in: self.input.from_hdf(h5in) - def get_final_potential_filename(self): + def get_output_potential_filename(self): return os.path.join(self.working_directory, "output_potential.yaml") + def get_final_potential_filename(self): + return os.path.join(self.working_directory, self.job_name+".yaml") + def get_final_potential_filename_ace(self): - return os.path.join(self.working_directory, "output_potential.ace") + return os.path.join(self.working_directory, self.job_name+".ace") def get_current_potential_filename(self): return os.path.join(self.working_directory, "interim_potential_1.yaml") From e71cce98ba2e66659f2f6783445cefbd7f9843e5 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Sat, 27 Feb 2021 13:32:33 +0100 Subject: [PATCH 3/5] setup.py: remove the numpy restrictions, because no TF will be used in this demo pacemaker.py: use ImportAlarm(); add docstring for PaceMakerJob class; change GenericParameters to InputList; change default backend to 'pyace'; use f-strings instead of .format --- pyiron_gpl/pacemaker/pacemaker.py | 145 +++++++++++++++++++++--------- setup.py | 2 +- 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/pyiron_gpl/pacemaker/pacemaker.py b/pyiron_gpl/pacemaker/pacemaker.py index 9e643d5..07a00e5 100644 --- a/pyiron_gpl/pacemaker/pacemaker.py +++ b/pyiron_gpl/pacemaker/pacemaker.py @@ -8,43 +8,112 @@ import pandas as pd import re import ruamel.yaml as yaml -from pyiron_base import GenericJob, GenericParameters +from pyiron_base import GenericJob, InputList +from pyiron_base.generic.util import ImportAlarm from pyiron_base.settings.generic import Settings from shutil import copyfile - s = Settings() try: from pyace import BBasisConfiguration, ACEBBasisSet - HAS_PYACE = True + import_alarm = ImportAlarm() + + # set loggers level for WARNING to avoid extra print-outs + # because pyace do its own logging settings + loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] + for logger in loggers: + logger.setLevel(logging.WARNING) + except ImportError as e: - print("Could not import `pyace` package. The package should be installed for proper functionality") - HAS_PYACE = False + import_alarm = ImportAlarm("Could not import `pyace` package. The package should be installed for proper " + "functionality") -# set loggers -loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] -for logger in loggers: - logger.setLevel(logging.WARNING) class PaceMakerJob(GenericJob): + """ + Thin wrapper class of the `pacemaker` - Atomic Cluster Expansion fitting code. + + Current functionality is limited to single-species fit with limited number of basis functions only. + Please, contact developers if you would like to use fully functional `pacemaker` code. + + Usage example: + + job = fit_pr.create_job(job_type=PaceMakerJob, job_name="fit_job") + + # setup ACE potential form + job.input["potential"]= { + # spline mesh settings + "deltaSplineBins": 0.001, + + # specie + "element": "Cu", + + # embedding function settings + "ndensity": 2, + "fs_parameters": [1, 1, 1, 0.5], + "npot": "FinnisSinclairShiftedScaled", + + # cutoff function + "NameOfCutoffFunction": "cos", + + # potential specification + ## radial basis functions type and parameters + "radbase": "ChebExpCos", + "radparameters": [5.25], + "rcut": cutoff, + "dcut": 0.01, + + ## max correlation order + "rankmax": 3, + ## specification of max n,l for each of the correlation order + "nradmax": [5,2,1], + "lmax": [0,2,1], ##NOTE: for order=1 lmax always is 0 + } + + # setup fitting: loss function, optimization settings + job.input["fit"]= { + 'optimizer': 'BFGS', + 'maxiter': 150, + 'loss': { + 'kappa': 0.5, + 'L1_coeffs': 5e-7, # L1-regularization + 'L2_coeffs': 5e-7, # L2-regularization + 'w1_coeffs': 1, + 'w2_coeffs': 1, + #radial smoothness regularization + 'w0_rad': 1e-4, + 'w1_rad': 1e-4, + 'w2_rad': 1e-4 + } + } + + # setup global cutoff for atomic distances + job.input["cutoff"] = cutoff + + # setup training data, could be: + # - TrainingContainer + # - pandas Dataframe + # - filename of .pckl.gzip pandas Dataframe + job.structure_data = data_job # + + """ + def __init__(self, project, job_name): super().__init__(project, job_name) self.__name__ = "PaceMakerJob" self.__version__ = "0.1" - self.input = GenericParameters(table_name="input") + self.input = InputList(table_name="input") self.input['cutoff'] = 10. self.input['metadata'] = {} self.input['data'] = {} # data_config self.input['potential'] = {} # potential_config self.input['fit'] = {} # fit_config - self.input['backend'] = {'evaluator': 'tensorpot'} # backend_config + self.input['backend'] = {'evaluator': 'pyace'} # backend_config self.structure_data = None - - # self.executable = "pacemaker input.yaml -l log.txt" self._executable = None self._executable_activate() @@ -57,8 +126,8 @@ def _save_structure_dataframe_pckl_gzip(self, df): df["pbc"] = df["ase_atoms"].map(lambda atoms: np.all(atoms.pbc)) data_file_name = os.path.join(self.working_directory, "df_fit.pckl.gzip") - logging.info("Saving training structures dataframe into {} with pickle protocol = 4, compression = gzip".format( - data_file_name)) + logging.info( + f"Saving training structures dataframe into {data_file_name} with pickle protocol = 4, compression = gzip") df.to_pickle(data_file_name, compression="gzip", protocol=4) return data_file_name @@ -76,7 +145,7 @@ def write_input(self): logging.info("structure_data is valid file path") self.input["data"] = {"filename": self.structure_data} else: - raise ValueError("Provided structure_data filename ({}) doesn't exists".format(self.structure_data)) + raise ValueError(f"Provided structure_data filename ({self.structure_data}) doesn't exists") elif hasattr(self.structure_data, "get_pandas"): # duck-typing check for TrainingContainer logging.info("structure_data is TrainingContainer") df = self.structure_data.to_pandas() @@ -102,9 +171,8 @@ def write_input(self): pot_basename = os.path.basename(pot_file_name) copyfile(pot_file_name, os.path.join(self.working_directory, pot_basename)) input_yaml_dict['potential'] = pot_basename - # TODO: check if initial potential is provided (for continuation of fit) else: - raise ValueError("Provided potential filename ({}) doesn't exists".format(self.input["potential"])) + raise ValueError(f"Provided potential filename ({self.input['potential']}) doesn't exists") with open(os.path.join(self.working_directory, "input.yaml"), "w") as f: yaml.dump(input_yaml_dict, f) @@ -140,6 +208,7 @@ def _analyse_log(self, logfile="log.txt"): res_dict["rmse_forces_low"] = ef_rmses[:, 3] return res_dict + @import_alarm def collect_output(self): output_potential_filename = self.get_output_potential_filename() final_potential_filename = self.get_final_potential_filename() @@ -149,10 +218,11 @@ def collect_output(self): yaml_lines = f.readlines() final_potential_yaml_string = "".join(yaml_lines) + # convert resulting potential to CTilde form and save bbasis = ACEBBasisSet(output_potential_filename) cbasis = bbasis.to_ACECTildeBasisSet() - cbasis.save(self.get_final_potential_filename_ace()) + with open(self.get_final_potential_filename_ace(), "r") as f: ace_lines = f.readlines() final_potential_ace_string = "".join(ace_lines) @@ -175,30 +245,23 @@ def get_lammps_potential(self): elem = " ".join(elements_name) pot_file_name = self.get_final_potential_filename_ace() pot_dict = { - 'Config': [["pair_style pace\n", "pair_coeff * * {} {}\n".format(pot_file_name, elem)]], + 'Config': [["pair_style pace\n", f"pair_coeff * * {pot_file_name} {elem}\n"]], 'Filename': [""], 'Model': ["ACE"], 'Name': [self.job_name], 'Species': [elements_name] } - ace_potential = pd.DataFrame(pot_dict) return ace_potential def to_hdf(self, hdf=None, group_name=None): - super().to_hdf( - hdf=hdf, - group_name=group_name - ) + super().to_hdf(hdf=hdf, group_name=group_name) with self.project_hdf5.open("input") as h5in: self.input.to_hdf(h5in) def from_hdf(self, hdf=None, group_name=None): - super().from_hdf( - hdf=hdf, - group_name=group_name - ) + super().from_hdf(hdf=hdf, group_name=group_name) with self.project_hdf5.open("input") as h5in: self.input.from_hdf(h5in) @@ -206,26 +269,22 @@ def get_output_potential_filename(self): return os.path.join(self.working_directory, "output_potential.yaml") def get_final_potential_filename(self): - return os.path.join(self.working_directory, self.job_name+".yaml") + return os.path.join(self.working_directory, self.job_name + ".yaml") def get_final_potential_filename_ace(self): - return os.path.join(self.working_directory, self.job_name+".ace") + return os.path.join(self.working_directory, self.job_name + ".ace") def get_current_potential_filename(self): return os.path.join(self.working_directory, "interim_potential_1.yaml") + @import_alarm def get_current_potential(self): - if HAS_PYACE: - current_potential_filename = self.get_current_potential_filename() - bbasis = BBasisConfiguration(current_potential_filename) - return bbasis - else: - raise RuntimeError("`pyace` package is not installed") + current_potential_filename = self.get_current_potential_filename() + bbasis = BBasisConfiguration(current_potential_filename) + return bbasis + @import_alarm def get_final_potential(self): - if HAS_PYACE: - final_potential_filename = self.get_final_potential_filename() - bbasis = BBasisConfiguration(final_potential_filename) - return bbasis - else: - raise RuntimeError("`pyace` pacakge is not installed") + final_potential_filename = self.get_final_potential_filename() + bbasis = BBasisConfiguration(final_potential_filename) + return bbasis diff --git a/setup.py b/setup.py index beba965..d176ad5 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ 'pyiron_atomistics==0.2.3', 'spglib==1.16.1', 'scipy==1.6.0', - 'numpy==1.19.5', + 'numpy', ], cmdclass=versioneer.get_cmdclass(), From 7cdf1f19f1e33ab22373460a67eb9b8bb3de974c Mon Sep 17 00:00:00 2001 From: yury-lysogorskiy Date: Sat, 27 Feb 2021 13:37:17 +0100 Subject: [PATCH 4/5] Update pyiron_gpl/pacemaker/pacemaker.py import from root of pyiron_base modules Co-authored-by: Jan Janssen --- pyiron_gpl/pacemaker/pacemaker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyiron_gpl/pacemaker/pacemaker.py b/pyiron_gpl/pacemaker/pacemaker.py index 07a00e5..27319ec 100644 --- a/pyiron_gpl/pacemaker/pacemaker.py +++ b/pyiron_gpl/pacemaker/pacemaker.py @@ -8,9 +8,7 @@ import pandas as pd import re import ruamel.yaml as yaml -from pyiron_base import GenericJob, InputList -from pyiron_base.generic.util import ImportAlarm -from pyiron_base.settings.generic import Settings +from pyiron_base import GenericJob, InputList, ImportAlarm, Settings from shutil import copyfile s = Settings() From 1af5b9c167c966338b44fe631818dc75273aab01 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Sat, 27 Feb 2021 19:00:34 +0100 Subject: [PATCH 5/5] pacemaker.py: check isinstance(self.structure_data, TrainingContainer) instead of duck-typing --- pyiron_gpl/pacemaker/pacemaker.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyiron_gpl/pacemaker/pacemaker.py b/pyiron_gpl/pacemaker/pacemaker.py index 27319ec..864c90d 100644 --- a/pyiron_gpl/pacemaker/pacemaker.py +++ b/pyiron_gpl/pacemaker/pacemaker.py @@ -8,9 +8,13 @@ import pandas as pd import re import ruamel.yaml as yaml -from pyiron_base import GenericJob, InputList, ImportAlarm, Settings + from shutil import copyfile +from pyiron_base import GenericJob, InputList, ImportAlarm, Settings +from pyiron_contrib.atomistic.atomistics.job.trainingcontainer import TrainingContainer + + s = Settings() try: @@ -144,7 +148,7 @@ def write_input(self): self.input["data"] = {"filename": self.structure_data} else: raise ValueError(f"Provided structure_data filename ({self.structure_data}) doesn't exists") - elif hasattr(self.structure_data, "get_pandas"): # duck-typing check for TrainingContainer + elif isinstance(self.structure_data, TrainingContainer): logging.info("structure_data is TrainingContainer") df = self.structure_data.to_pandas() data_file_name = self._save_structure_dataframe_pckl_gzip(df)