diff --git a/.gitignore b/.gitignore index 89826ca0..aa68423b 100644 --- a/.gitignore +++ b/.gitignore @@ -102,7 +102,7 @@ venv.bak/ # mypy .mypy_cache/ -tests/timeit_test.py .DS_Store docs_out/ .vscode/ +.doctrees/ diff --git a/.travis.yml b/.travis.yml index 1b325385..b4179b5b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,6 @@ sudo: false -language: python +language: + python: 3.7 env: global: @@ -18,16 +19,16 @@ install: - conda info -a # Install virtual enviroment - - conda install -c conda-forge python==3.7 - - conda install -c conda-forge h5py - - conda install -c conda-forge rdkit + - conda create -n CAT python=3.7 + - conda install -n CAT -c conda-forge h5py rdkit + - source activate CAT # Install qmflows package - pip install .[test] script: # Run the unitary tests excluding the expensive computations - - pytest -m "not (slow or long)" --cov=CAT test + - pytest -m "not (slow or long)" --cov=CAT tests - coverage xml && coverage report -m branches: diff --git a/CAT/__init__.py b/CAT/__init__.py index 2548653d..15f86d00 100644 --- a/CAT/__init__.py +++ b/CAT/__init__.py @@ -1,14 +1,19 @@ -""" A collection of tools designed for the construction, -and subsequent analysis, of various chemical compounds. """ +""" +CAT +=== -__author__ = "Bas van Beek" -__email__ = 'b.f.van.beek@vu.nl' +A collection of tools designed for the construction of various chemical compounds. 
+ + """ from .__version__ import __version__ -from .analysis import ( - init_asa, CRSJob, CRSResults, job_single_point, job_geometry_opt, job_freq, - init_bde, get_thermo, get_entropy, init_solv +from .jobs import ( + job_single_point, job_geometry_opt, job_freq +) + +from .thermo_chem import ( + get_thermo, get_entropy ) from .attachment import ( @@ -16,26 +21,25 @@ ) from .data_handling import ( - Database, - mol_to_file, - read_mol, set_mol_prop, - sanitize_optional, sanitize_input_mol, sanitize_path + read_mol, set_mol_prop ) from .base import prep from .utils import get_template +__version__ = __version__ +__author__ = "Bas van Beek" +__email__ = 'b.f.van.beek@vu.nl' + __all__ = [ - 'init_asa', 'CRSJob', 'CRSResults', 'job_single_point', 'job_geometry_opt', 'job_freq', - 'init_bde', 'get_thermo', 'get_entropy', 'init_solv', + 'job_single_point', 'job_geometry_opt', 'job_freq', + + 'get_thermo', 'get_entropy', 'init_qd_opt', 'init_ligand_opt', 'init_qd_construction', 'init_ligand_anchoring', - 'Database', - 'mol_to_file', - 'read_mol', 'set_mol_prop', - 'sanitize_optional', 'sanitize_input_mol', 'sanitize_path', + 'read_mol', 'set_mol_prop', 'prep', diff --git a/CAT/__version__.py b/CAT/__version__.py index ab45471d..2b8877c5 100644 --- a/CAT/__version__.py +++ b/CAT/__version__.py @@ -1 +1 @@ -__version__ = '0.4.6' +__version__ = '0.5.0' diff --git a/CAT/analysis/README.rst b/CAT/analysis/README.rst deleted file mode 100644 index eebdd89b..00000000 --- a/CAT/analysis/README.rst +++ /dev/null @@ -1,47 +0,0 @@ -######## -Analysis -######## - -~~~~~~~ -asa.py_ -~~~~~~~ - -A module related to performing activation strain analyses. - -~~~~~~~ -crs.py_ -~~~~~~~ - -A module designed for running COSMO-RS Jobs; -holds the CRSJob & CRSResults classes. - -~~~~~~~~ -jobs.py_ -~~~~~~~~ - -A module designed for running generic Jobs. - -~~~~~~~~~~~~~~ -ligand_bde.py_ -~~~~~~~~~~~~~~ - -A module designed for running Bond Dissociation Energies (BDE) calculations. 
- -~~~~~~~~~~~~~~~~~~~~ -ligand_solvation.py_ -~~~~~~~~~~~~~~~~~~~~ - -A module designed for calculating solvation energies. - -~~~~~~~~~~~~~~~ -thermo_chem.py_ -~~~~~~~~~~~~~~~ - -A module related to calculating thermochemical properties. - -.. _asa.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/asa.py -.. _crs.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/crs.py -.. _jobs.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/jobs.py -.. _ligand_bde.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/ligand_bde.py -.. _ligand_solvation.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/ligand_solvation.py -.. _thermo_chem.py: https://github.com/BvB93/CAT/tree/master/CAT/analysis/thermo_chem.py diff --git a/CAT/analysis/__init__.py b/CAT/analysis/__init__.py deleted file mode 100644 index ab5fb249..00000000 --- a/CAT/analysis/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -""" Modules related to the analysis ligands. """ - -from .asa import init_asa -from .crs import (CRSJob, CRSResults) -from .jobs import (job_single_point, job_geometry_opt, job_freq) -from .ligand_bde import init_bde -from .thermo_chem import (get_thermo, get_entropy) -from .ligand_solvation import init_solv - - -__all__ = [ - 'init_asa', - 'CRSJob', 'CRSResults', - 'job_single_point', 'job_geometry_opt', 'job_freq', - 'init_bde', - 'get_thermo', 'get_entropy', - 'init_solv' -] diff --git a/CAT/analysis/asa.py b/CAT/analysis/asa.py deleted file mode 100644 index c403b214..00000000 --- a/CAT/analysis/asa.py +++ /dev/null @@ -1,83 +0,0 @@ -""" A module related to performing activation strain analyses. 
""" - -__all__ = ['init_asa'] - -import numpy as np - -from scm.plams.core.settings import Settings -import scm.plams.interfaces.molecule.rdkit as molkit - -import rdkit -from rdkit.Chem import AllChem - -from ..data_handling.database import Database - - -def init_asa(qd_df, arg): - """ Initialize the activation-strain analyses (RDKit UFF level) on the ligands in the - absence of the core. - - :parameter qd_df: A dataframe of quantum dots. - :type qd_df: |pd.DataFrame|_ (columns: |str|_, index=|int|_, values=|plams.Molecule|_) - """ - data = Database(arg.optional.database.dirname) - overwrite = 'qd' in arg.optional.database.overwrite - - # Prepare columns - columns = [('ASA', 'E_int'), ('ASA', 'E_strain'), ('ASA', 'E')] - for i in columns: - qd_df[i] = np.nan - - # Fill columns - qd_df['ASA'] = get_asa_energy(qd_df['mol']) - - # Calculate E_int, E_strain and E - if 'qd' in arg.optional.database.write: - recipe = Settings() - recipe['ASA 1'] = {'key': 'RDKit_' + rdkit.__version__, 'value': 'UFF'} - data.update_csv(qd_df, columns=[('settings', 'ASA 1')]+columns, - job_recipe=recipe, database='QD', overwrite=overwrite) - - -def get_asa_energy(mol_series): - """ Calculate the interaction, strain and total energy in the framework of the - activation-strain analysis (ASA). - The ASA is performed on all ligands in the absence of the core at the UFF level (RDKit). - - :parameter mol_series: A series of PLAMS molecules. - :type mol_series: |pd.Series|_ (index=|str|_, values: |plams.Molecule|_) - :return: An array containing E_int, E_strain and E for all *n* molecules in **mol_series**. 
- :rtype: *n*3* |np.ndarray|_ [|np.float64|_] - """ - ret = np.zeros((len(mol_series), 4)) - - for i, mol in enumerate(mol_series): - mol_cp = mol.copy() - rd_uff = AllChem.UFFGetMoleculeForceField - - # Calculate the total energy of all perturbed ligands in the absence of the core - for atom in reversed(mol_cp.atoms): - if atom.properties.pdb_info.ResidueName == 'COR': - mol_cp.delete_atom(atom) - rdmol = molkit.to_rdmol(mol_cp) - E_no_frag = rd_uff(rdmol, ignoreInterfragInteractions=False).CalcEnergy() - - # Calculate the total energy of the isolated perturbed ligands in the absence of the core - mol_frag = mol_cp.separate() - E_frag = 0.0 - for plams_mol in mol_frag: - rdmol = molkit.to_rdmol(plams_mol) - E_frag += rd_uff(rdmol, ignoreInterfragInteractions=False).CalcEnergy() - - # Calculate the total energy of the optimized ligand - rd_uff(rdmol, ignoreInterfragInteractions=False).Minimize() - E_opt = rd_uff(rdmol, ignoreInterfragInteractions=False).CalcEnergy() - - # Update ret with the new activation strain terms - ret[i] = E_no_frag, E_frag, E_opt, len(mol_frag) - - # Post-process and return - ret[:, 0] -= ret[:, 1] - ret[:, 1] -= ret[:, 2] * ret[:, 3] - ret[:, 2] = ret[:, 0] + ret[:, 1] - return ret[:, 0:3] diff --git a/CAT/analysis/crs.py b/CAT/analysis/crs.py deleted file mode 100644 index e3fffd27..00000000 --- a/CAT/analysis/crs.py +++ /dev/null @@ -1,84 +0,0 @@ -""" A module designed for running COSMO-RS Jobs. """ - -__all__ = ['CRSResults', 'CRSJob'] - -import numpy as np - -try: - import pandas as pd -except ImportError: - pass - -from scm.plams.core.basejob import SingleJob -from scm.plams.tools.units import Units -from scm.plams.interfaces.adfsuite.scmjob import (SCMJob, SCMResults) - - -class CRSResults(SCMResults): - """ - A class for accessing results of COSMO-RS jobs. 
- """ - _kfext = '.crskf' - _rename_map = {'CRSKF': '$JN.crskf'} - - def get_energy(self, unit='kcal/mol'): - """ Returns the solute solvation energy from an Activity Coefficients calculation. """ - E = self.readkf('ACTIVITYCOEF', 'deltag')[0] - return Units.convert(E, 'kcal/mol', unit) - - def get_activity_coefficient(self): - """ Returns the solute activity coefficient from an Activity Coefficients calculation. """ - return self.readkf('ACTIVITYCOEF', 'gamma')[0] - - def get_sigma_profile(self, unit='kcal/mol'): - """ Returns all sigma profiles, expressed in *unit*. - Returns a dictionary of numpy arrays or, if available, a pandas dataframe. """ - return self.get_sigma('SIGMAPOTENTIAL', unit) - - def get_sigma_potential(self): - """ Returns all sigma profiles, expressed in *unit*. - Returns a dictionary of numpy arrays or, if available, a pandas dataframe. """ - return self.get_sigma('SIGMAPROFILE') - - def get_sigma(self, section, unit='kcal/mol'): - """ Grab all values of sigma and the sigmapotential/profile; - combine them into a dictionary or pandas dataframe. """ - sigma = self._sigma_y(section, unit) - if self.readkf('PURE' + section) is not None: - sigma['mixture'] = self._sigma_y(section, unit) - sigma['sigma'] = self._sigma_x(section) - try: - return sigma.set_index('sigma') - except AttributeError: - return sigma - - def _sigma_x(self, section): - """ Get all values of sigma. """ - min_max = self.readkf(section, 'sigmax') - nitems = self.readkf(section, 'nitems') - step = int((1 + 2 * min_max) / nitems) - return np.arange(-min_max, min_max, step) - - def _sigma_y(self, section, unit='kcal/mol'): - """ Get all values of . 
""" - values = np.array(self.readkf(section, 'profil')) - values *= Units.conversion_ratio('kcal/mol', unit) - if 'PURE' in section: - ncomp = self.readkf(section, 'ncomp') - values.shape = len(values) // ncomp, ncomp - keys = self.readkf(section, 'filename') - ret = dict(zip(keys, values)) - try: - return pd.DataFrame(ret).set_index('sigma') - except NameError: - return ret - - -class CRSJob(SCMJob): - """ A class for running COSMO-RS jobs. """ - _command = 'crs' - _result_type = CRSResults - - def __init__(self, **kwargs): - SingleJob.__init__(self, **kwargs) - self.ignore_molecule = True diff --git a/CAT/analysis/ligand_bde.py b/CAT/analysis/ligand_bde.py deleted file mode 100644 index d48191eb..00000000 --- a/CAT/analysis/ligand_bde.py +++ /dev/null @@ -1,637 +0,0 @@ -""" A module designed for the calculation of Bond Dissociation Energies (BDE). """ - -__all__ = ['init_bde'] - -from itertools import chain, combinations, product - -import numpy as np -import pandas as pd -from scipy.spatial.distance import cdist - -from scm.plams import PeriodicTable, AMSJob -from scm.plams.mol.molecule import Molecule -from scm.plams.mol.atom import Atom -from scm.plams.core.functions import (init, finish, config) -from scm.plams.core.settings import Settings - -import qmflows - -from .jobs import (job_single_point, job_geometry_opt, job_freq) -from ..utils import (get_time, type_to_string) -from ..mol_utils import (to_atnum, merge_mol) -from ..attachment.ligand_attach import rot_mol_angle -from ..data_handling.database import Database - - -def init_bde(qd_df, arg): - """ Initialize the bond dissociation energy calculation; involves 4 distinct steps: - 1. Take *n* ligands (X) and another atom from the core (Y, *e.g.* Cd) and create YX*n*. - 2. Given a radius *r*, dissociate all possible YX*n* pairs. - 3. Calculate dE: the "electronic" component of the bond dissociation energy (BDE). - 4. (Optional) Calculate ddG: the thermal and entropic component of the BDE. 
- - :parameter qd_df: A dataframe of quantum dots. - :type qd_df: |pd.DataFrame|_ (columns: |str|_, index=|str|_, values=|plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). - """ - data = Database(arg.optional.database.dirname) - overwrite = 'qd' in arg.optional.database.overwrite - - # Check if the calculation has been done already - if not overwrite and 'qd' in arg.optional.database.read: - with data.open_csv_qd(data.csv_qd, write=False) as db: - key_ar = np.array(['BDE label', 'BDE dE', 'BDE dG', 'BDE ddG']) - bool_ar = np.isin(key_ar, db.columns.levels[0]) - for i in db[key_ar[bool_ar]]: - qd_df[i] = np.nan - data.from_csv(qd_df, database='QD', get_mol=False) - qd_df.dropna(axis='columns', how='all', inplace=True) - - # Calculate the BDEs with thermochemical corrections - if arg.optional.qd.dissociate.job2 and arg.optional.qd.dissociate.s2: - _bde_w_dg(qd_df, arg) - - # Calculate the BDEs without thermochemical corrections - else: - _bde_wo_dg(qd_df, arg) - - -def _bde_w_dg(qd_df, arg): - """ Calculate the BDEs with thermochemical corrections. - - :parameter qd_df: A dataframe of quantum dots. - :type qd_df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). 
- """ - j1, j2 = arg.optional.qd.dissociate.job1, arg.optional.qd.dissociate.job2 - s1, s2 = arg.optional.qd.dissociate.s1, arg.optional.qd.dissociate.s2 - ion = arg.optional.qd.dissociate.core_atom - lig_count = arg.optional.qd.dissociate.lig_count - - # Identify previously calculated results - try: - has_na = qd_df[['BDE dE', 'BDE dG']].isna().all(axis='columns') - if not has_na.any(): - return - except KeyError: - has_na = pd.Series(True, index=qd_df.index) - - for idx, mol in qd_df['mol'][has_na].iteritems(): - # Create XYn and all XYn-dissociated quantum dots - xyn = get_xy2(mol, ion, lig_count) - if not arg.optional.qd.dissociate.core_index: - mol_wo_xyn = dissociate_ligand(mol, arg) - else: - mol_wo_xyn = dissociate_ligand2(mol, arg) - - # Construct new columns for **qd_df** - labels = [m.properties.df_index for m in mol_wo_xyn] - sub_idx = np.arange(len(labels)).astype(str, copy=False) - try: - n = qd_df['BDE label'].shape[1] - except KeyError: - n = 0 - if len(labels) > n: - for i in sub_idx[n:]: - qd_df[('BDE label', i)] = qd_df[('BDE dE', i)] = qd_df[('BDE ddG', i)] = np.nan - - # Prepare slices - label_slice = idx, list(product(['BDE label'], sub_idx)) - dE_slice = idx, list(product(['BDE dE'], sub_idx)) - ddG_slice = idx, list(product(['BDE ddG'], sub_idx)) - - # Run the BDE calculations - init(path=mol.properties.path, folder='BDE') - config.default_jobmanager.settings.hashing = None - mol.properties.job_path = [] - qd_df.loc[label_slice] = labels - qd_df.loc[dE_slice] = get_bde_dE(mol, xyn, mol_wo_xyn, job=j1, s=s1) - qd_df.loc[ddG_slice] = get_bde_ddG(mol, xyn, mol_wo_xyn, job=j2, s=s2) - mol.properties.job_path += xyn.properties.pop('job_path') - for m in mol_wo_xyn: - mol.properties.job_path += m.properties.pop('job_path') - finish() - - qd_df['BDE dG'] = qd_df['BDE dE'] + qd_df['BDE ddG'] - - job_settings = [] - for mol in qd_df['mol']: - try: - job_settings.append(mol.properties.pop('job_path')) - except KeyError: - job_settings.append([]) - 
qd_df[('job_settings_BDE', '')] = job_settings - - # Update the database - if 'qd' in arg.optional.database.write: - with pd.option_context('mode.chained_assignment', None): - _qd_to_db(qd_df, arg, has_na, with_dg=True) - - -def _bde_wo_dg(qd_df, arg): - """ Calculate the BDEs without thermochemical corrections. - - :parameter qd_df: A dataframe of quantum dots. - :type qd_df: |pd.DataFrame|_ (columns: |str|_, index=|str|_, values=|plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). - """ - # Unpack arguments - j1 = arg.optional.qd.dissociate.job1 - s1 = arg.optional.qd.dissociate.s1 - ion = arg.optional.qd.dissociate.core_atom - lig_count = arg.optional.qd.dissociate.lig_count - - # Identify previously calculated results - try: - has_na = qd_df['BDE dE'].isna().all(axis='columns') - if not has_na.any(): - return - except KeyError: - has_na = pd.Series(True, index=qd_df.index) - - for idx, mol in qd_df['mol'][has_na].iteritems(): - # Create XYn and all XYn-dissociated quantum dots - xyn = get_xy2(mol, ion, lig_count) - if not arg.optional.qd.dissociate.core_index: - mol_wo_xyn = dissociate_ligand(mol, arg) - else: - mol_wo_xyn = dissociate_ligand2(mol, arg) - - # Construct new columns for **qd_df** - labels = [m.properties.df_index for m in mol_wo_xyn] - sub_idx = np.arange(len(labels)).astype(str, copy=False) - try: - n = qd_df['BDE label'].shape[1] - except KeyError: - n = 0 - if len(labels) > n: - for i in sub_idx[n:]: - qd_df[('BDE label', i)] = qd_df[('BDE dE', i)] = np.nan - - # Prepare slices - label_slice = idx, list(product(['BDE label'], sub_idx)) - dE_slice = idx, list(product(['BDE dE'], sub_idx)) - - # Run the BDE calculations - init(path=mol.properties.path, folder='BDE') - config.default_jobmanager.settings.hashing = None - mol.properties.job_path = [] - qd_df.loc[label_slice] = labels - qd_df.loc[dE_slice] = get_bde_dE(mol, xyn, mol_wo_xyn, job=j1, s=s1) - 
mol.properties.job_path += xyn.properties.pop('job_path') - for m in mol_wo_xyn: - mol.properties.job_path += m.properties.pop('job_path') - finish() - - job_settings = [] - for mol in qd_df['mol']: - try: - job_settings.append(mol.properties.pop('job_path')) - except KeyError: - job_settings.append([]) - qd_df[('job_settings_BDE', '')] = job_settings - - # Update the database - if 'qd' in arg.optional.database.write: - with pd.option_context('mode.chained_assignment', None): - _qd_to_db(qd_df, arg, has_na, with_dg=False) - - -def _qd_to_db(qd_df, arg, idx, with_dg=True): - data = Database(arg.optional.database.dirname) - overwrite = 'qd' in arg.optional.database.overwrite - j1 = arg.optional.qd.dissociate.job1 - s1 = arg.optional.qd.dissociate.s1 - - qd_df.sort_index(axis='columns', inplace=True) - kwarg = {'database': 'QD', 'overwrite': overwrite} - if with_dg: - j2 = arg.optional.qd.dissociate.job2 - s2 = arg.optional.qd.dissociate.s2 - kwarg['job_recipe'] = get_recipe(j1, s1, j2, s2) - kwarg['columns'] = [('job_settings_BDE', ''), ('settings', 'BDE 1'), ('settings', 'BDE 2')] - column_tup = ('BDE label', 'BDE dE', 'BDE ddG', 'BDE dG') - else: - kwarg['job_recipe'] = get_recipe(j1, s1) - kwarg['columns'] = [('job_settings_BDE', ''), ('settings', 'BDE 1')] - column_tup = ('BDE label', 'BDE dE') - kwarg['columns'] += [(i, j) for i, j in qd_df.columns if i in column_tup] - - data.update_csv(qd_df[idx], **kwarg) - - -def get_recipe(job1, s1, job2=None, s2=None): - """Return the a dictionary with job types and job settings.""" - ret = Settings() - value1 = qmflows.singlepoint['specific'][type_to_string(job1)].copy() - value1.update(s1) - ret['BDE 1'] = {'key': job1, 'value': value1} - - if job2 is not None and s2 is not None: - value2 = qmflows.freq['specific'][type_to_string(job2)].copy() - value2.update(s2) - ret['BDE 2'] = {'key': job2, 'value': value2} - - return ret - - -def get_bde_dE(tot, lig, core, job=None, s=None): - """Calculate the bond dissociation 
energy: dE = dE(mopac) + (dG(uff) - dE(uff)).""" - # Optimize XYn - if job == AMSJob: - s_cp = s.copy() - s_cp.input.ams.GeometryOptimization.coordinatetype = 'Cartesian' - lig.job_geometry_opt(job, s_cp, name='BDE_geometry_optimization') - else: - lig.job_geometry_opt(job, s, name='BDE_geometry_optimization') - - E_lig = lig.properties.energy.E - if E_lig is np.nan: - print(get_time() + 'WARNING: The BDE XYn geometry optimization failed, skipping further \ - jobs') - return np.full(len(core), np.nan) - - # Perform a single point on the full quantum dot - tot.job_single_point(job, s, name='BDE_single_point') - E_tot = tot.properties.energy.E - if E_tot is np.nan: - print(get_time() + 'WARNING: The BDE quantum dot single point failed, \ - skipping further jobs') - return np.full(len(core), np.nan) - - # Perform a single point on the quantum dot(s) - XYn - for mol in core: - mol.job_single_point(job, s, name='BDE_single_point') - E_core = np.array([mol.properties.energy.E for mol in core]) - - # Calculate and return dE - dE = (E_lig + E_core) - E_tot - return dE - - -def get_bde_ddG(tot, lig, core, job=None, s=None): - """ Calculate the bond dissociation energy: dE = dE(mopac) + (dG(uff) - dE(uff)) - """ - # Optimize XYn - s.input.ams.Constraints.Atom = lig.properties.indices - lig.job_freq(job, s, name='BDE_frequency_analysis') - G_lig = lig.properties.energy.G - E_lig = lig.properties.energy.E - if np.nan in (E_lig, G_lig): - print(get_time() + 'WARNING: The BDE XYn geometry optimization + freq analysis failed, \ - skipping further jobs') - return np.full(len(core), np.nan) - - # Optimize the full quantum dot - s.input.ams.Constraints.Atom = tot.properties.indices - tot.job_freq(job, s, name='BDE_frequency_analysis') - G_tot = tot.properties.energy.G - E_tot = tot.properties.energy.E - if np.nan in (E_tot, G_tot): - print(get_time() + 'WARNING: The BDE quantum dot geometry optimization + freq analysis \ - failed, skipping further jobs') - return np.full(len(core), 
np.nan) - - # Optimize the quantum dot(s) - XYn - for mol in core: - s.input.ams.Constraints.Atom = mol.properties.indices - mol.job_freq(job, s, name='BDE_frequency_analysis') - G_core = np.array([mol.properties.energy.G for mol in core]) - E_core = np.array([mol.properties.energy.E for mol in core]) - - # Calculate and return dG and ddG - dG = (G_lig + G_core) - G_tot - dE = (E_lig + E_core) - E_tot - ddG = dG - dE - return ddG - - -def get_xy2(mol, ion='Cd', lig_count=2): - """ Takes a quantum dot with ligands (Y) and an ion (X) and turns it into YXn. - Returns a XYn molecule. - - :parameter mol: A PLAMS molecule containing with ligands (Y). - :type mol: |plams.Molecule|_ - :parameter str ion: An atomic symbol (X). - :parameter int lig_count: The number of ligand (*n*) in XYn. - :return: A new XYn molecule. - :rtype: plams.Molecule. - """ - def get_anchor(mol): - """ Return an index and atom if marked with the properties.anchor attribute """ - for i, at in enumerate(mol.atoms): - if at.properties.anchor: - return i, at - - def get_ligand(mol): - """ Extract a single ligand from **mol**. 
""" - at_list = [] - res = mol.atoms[-1].properties.pdb_info.ResidueNumber - for at in reversed(mol.atoms): - if at.properties.pdb_info.ResidueNumber == res: - at_list.append(at) - else: - ret = Molecule() - ret.atoms = at_list - ret.bonds = list(set(chain.from_iterable(at.bonds for at in at_list))) - return ret.copy() - - # Translate the ligands to their final position - lig1 = get_ligand(mol) - lig2 = lig1.copy() - idx1, anchor1 = get_anchor(lig1) - idx2, anchor2 = get_anchor(lig2) - - # Return a the ligand without the ion - if ion is None: - lig1.properties.name = 'XYn' - lig1.properties.path = mol.properties.path - lig1.properties.indices = [idx1] - return lig1 - - radius = anchor1.radius + PeriodicTable.get_radius(ion) - target = np.array([radius, 0.0, 0.0]) - lig1.translate(anchor1.vector_to(target)) - lig2.translate(anchor2.vector_to(-target)) - - # Define vectors for the ligand rotation - vec1_1 = np.array(anchor1.vector_to(lig1.get_center_of_mass())) - vec2_1 = -1 * np.array(anchor1.vector_to(np.zeros(3))) - vec1_2 = np.array(anchor2.vector_to(lig2.get_center_of_mass())) - vec2_2 = -1 * np.array(anchor2.vector_to(np.zeros(3))) - - # Rotate the ligands - lig1_ar = rot_mol_angle(lig1, vec1_1, vec2_1, idx=idx1, atoms_other=anchor1, bond_length=False) - lig2_ar = rot_mol_angle(lig2, vec1_2, vec2_2, idx=idx2, atoms_other=anchor2, bond_length=False) - lig1.from_array(lig1_ar) - lig2.from_array(lig2_ar) - - # Construct the CdX2 molecule - CdX2 = Molecule() - CdX2.add_atom(Atom(atnum=to_atnum(ion))) - CdX2.merge_mol([lig1, lig2]) - CdX2.properties.name = 'XYn' - CdX2.properties.path = mol.properties.path - CdX2.properties.indices = [1, 1 + idx1, 2 + len(lig2) + idx2] - CdX2[1].properties.charge = 0 - sum([at.properties.charge for at in CdX2.atoms[1:]]) - CdX2.properties.job_path = [] - - return CdX2 - - -def dissociate_ligand(mol, arg): - """ Create all XYn dissociated quantum dots. - - :parameter mol: A PLAMS molecule. 
- :type mol: |plams.Molecule|_ - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). - """ - # Unpack arguments - atnum = arg.optional.qd.dissociate.core_atom - l_count = arg.optional.qd.dissociate.lig_count - cc_dist = arg.optional.qd.dissociate.core_core_dist - lc_dist = arg.optional.qd.dissociate.lig_core_dist - top_dict = arg.optional.qd.dissociate.topology - - # Convert **mol** to an XYZ array - mol.set_atoms_id() - xyz_array = mol.as_array() - - # Create a nested list of atoms, - # each nested element containing all atoms with a given residue number - res_list = [] - for at in mol: - try: - res_list[at.properties.pdb_info.ResidueNumber - 1].append(at) - except IndexError: - res_list.append([at]) - - # Create a list of all core indices and ligand anchor indices - idx_c_old = np.array([j for j, at in enumerate(res_list[0]) if at.atnum == atnum]) - idx_c, topology = filter_core(xyz_array, idx_c_old, top_dict, cc_dist) - idx_l = np.array([i for i in mol.properties.indices if - mol[i].properties.pdb_info.ResidueName == 'LIG']) - 1 - - # Mark the core atoms with their topologies - for i, top in zip(idx_c_old, topology): - mol[int(i+1)].properties.topology = top - - # Create a dictionary with core indices as keys and all combinations of 2 ligands as values - xy = filter_lig_core(xyz_array, idx_l, idx_c, lc_dist, l_count) - combinations_dict = get_lig_core_combinations(xy, res_list, l_count) - - # Create and return new molecules - indices = [at.id for at in res_list[0][:-l_count]] - indices += (idx_l[:-l_count] + 1).tolist() - return remove_ligands(mol, combinations_dict, indices, idx_l) - - -def dissociate_ligand2(mol, arg): - """ Create all XYn dissociated quantum dots. - - :parameter mol: A PLAMS molecule. - :type mol: |plams.Molecule|_ - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). 
- """ - # Unpack arguments - l_count = arg.optional.qd.dissociate.lig_count - cc_dist = arg.optional.qd.dissociate.core_core_dist - idx_c_old = np.array(arg.optional.qd.dissociate.core_index) - 1 - top_dict = arg.optional.qd.dissociate.topology - - # Convert **mol** to an XYZ array - mol.set_atoms_id() - xyz_array = mol.as_array() - - # Create a nested list of atoms, - # each nested element containing all atoms with a given residue number - res_list = [] - for at in mol: - try: - res_list[at.properties.pdb_info.ResidueNumber - 1].append(at) - except IndexError: - res_list.append([at]) - - # Create a list of all core indices and ligand anchor indices - _, topology = filter_core(xyz_array, idx_c_old, top_dict, cc_dist) - idx_l = np.array([i for i in mol.properties.indices if - mol[i].properties.pdb_info.ResidueName == 'LIG']) - 1 - - # Mark the core atoms with their topologies - for i, top in zip(idx_c_old, topology): - mol[int(i+1)].properties.topology = top - - # Create a dictionary with core indices as keys and all combinations of 2 ligands as values - xy = filter_lig_core2(xyz_array, idx_l, idx_c_old, l_count) - combinations_dict = get_lig_core_combinations(xy, res_list, l_count) - - # Create and return new molecules - indices = [at.id for at in res_list[0][:-l_count]] - indices += (idx_l[:-l_count] + 1).tolist() - return remove_ligands(mol, combinations_dict, indices, idx_l) - - -def filter_lig_core2(xyz_array, idx_lig, idx_core, lig_count=2): - """ Create and return the indices of all possible ligand/core pairs.. - - :parameter xyz_array: An array with the cartesian coordinates of a molecule with *n* atoms. - :type xyz_array: *n*3* |np.ndarray|_ [|np.float64|_] - :parameter idx: An array of all ligand anchor atoms (Y). - :type idx: |np.ndarray|_ [|np.int64|_] - :parameter idx: An array of all core atoms (X). - :type idx: |np.ndarray|_ [|np.int64|_] - :parameter int lig_count: The number of ligand (*n*) in XYn. 
- :return: An array with the indices of all *m* valid (as determined by **max_diist**) - ligand/core pairs. - :rtype: *m*2* |np.ndarray|_ [|np.int64|_]. - """ - dist = cdist(xyz_array[idx_lig], xyz_array[idx_core]) - xy = [] - for _ in range(lig_count): - xy.append(np.array(np.where(dist == np.nanmin(dist, axis=0)))) - dist[xy[-1][0], xy[-1][1]] = np.nan - xy = np.hstack(xy) - xy = xy[[1, 0]] - xy = xy[:, xy.argsort(axis=1)[0]] - - bincount = np.bincount(xy[0]) - xy = xy[:, [i for i, j in enumerate(xy[0]) if bincount[j] >= lig_count]] - xy[0] = idx_core[xy[0]] - xy[1] += 1 - return xy - - -def remove_ligands(mol, combinations_dict, indices, idx_lig): - """ """ - ret = [] - for core in combinations_dict: - for lig in combinations_dict[core]: - mol_tmp = mol.copy() - - mol_tmp.properties = Settings() - mol_tmp.properties.core_topology = str(mol[core].properties.topology) + '_' + str(core) - mol_tmp.properties.lig_residue = sorted([mol[i[0]].properties.pdb_info.ResidueNumber - for i in lig]) - mol_tmp.properties.df_index = mol_tmp.properties.core_topology - mol_tmp.properties.df_index += ' '.join(str(i) for i in mol_tmp.properties.lig_residue) - - delete_idx = sorted([core] + list(chain.from_iterable(lig)), reverse=True) - for i in delete_idx: - mol_tmp.delete_atom(mol_tmp[i]) - mol_tmp.properties.indices = indices - mol_tmp.properties.job_path = [] - ret.append(mol_tmp) - return ret - - -def filter_core(xyz_array, idx, topology_dict={6: 'vertice', 7: 'edge', 9: 'face'}, max_dist=5.0): - """ Find all atoms (**idx**) in **xyz_array** which are exposed to the surface - and assign a topology to aforementioned atoms based on the number of neighbouring atoms. - - :parameter xyz_array: An array with the cartesian coordinates of a molecule with *n* atoms. - :type xyz_array: *n*3* |np.ndarray|_ [|np.float64|_] - :parameter idx: An array of atomic indices in **xyz_array**. 
- :type idx: |np.ndarray|_ [|np.int64|_] - :parameter topology_dict: A dictionary which maps the number of neighbours (per atom) to a - user-specified topology. - :type topology_dict: |dict|_ (keys: |int|_) - :parameter float max_dist: The radius (Angstrom) for determining if an atom counts as a - neighbour or not. - :return: The indices of all atoms in **xyz_array[idx]** exposed to the surface and - the topology of atoms in **xyz_array[idx]**. - :rtype: |np.ndarray|_ [|np.int64|_] and |np.ndarray|_ [|np.int64|_] - """ - # Create a distance matrix and find all elements with a distance smaller than **max_dist** - dist = cdist(xyz_array[idx], xyz_array[idx]) - np.fill_diagonal(dist, max_dist) - xy = np.array(np.where(dist <= max_dist)) - bincount = np.bincount(xy[0], minlength=len(idx)) - - # Slice xyz_array, creating arrays of reference atoms and neighbouring atoms - x = xyz_array[idx] - y = xyz_array[idx[xy[1]]] - - # Calculate the length of a vector from a reference atom to the mean position of its neighbours - # A vector length close to 0.0 implies that a reference atom is surrounded by neighbours in - # a more or less spherical pattern (i.e. the reference atom is in the bulk, not on the surface) - vec_length = np.empty((bincount.shape[0], 3), dtype=float) - k = 0 - for i, j in enumerate(bincount): - vec_length[i] = x[i] - np.average(y[k:k+j], axis=0) - k += j - vec_length = np.linalg.norm(vec_length, axis=1) - return idx[np.where(vec_length > 0.5)[0]], get_topology(bincount, topology_dict) - - -def get_topology(bincount, topology_dict={6: 'vertice', 7: 'edge', 9: 'face'}): - """ Translate the number of neighbouring atoms (**bincount**) into a list of topologies. - If a specific number of neighbours (*i*) is absent from **topology_dict** then that particular - element is set to a generic str(*i*) + '_neighbours'. - - :parameter bincount: An array with the number of neighbours per atom for a total of *n* atoms. 
- :type bincount: *n* |np.ndarray|_ [|np.int64|_] - :parameter topology_dict: A dictionary which maps the number of neighbours (per atom) to a - user-specified topology. - :type topology_dict: |dict|_ (keys: |int|_) - :return: A list of topologies for all *n* atoms in **bincount**. - :rtype: *n* |list|_. - """ - if isinstance(topology_dict, Settings): - topology_dict = topology_dict.as_dict() - ret = [] - for i in bincount: - try: - ret.append(topology_dict[i]) - except KeyError: - ret.append(str(i) + '_neighbours') - return ret - - -def filter_lig_core(xyz_array, idx_lig, idx_core, max_dist=5.0, lig_count=2): - """ Create and return the indices of all possible ligand/pairs that can be constructed within a - given radius (**max_dist**). - - :parameter xyz_array: An array with the cartesian coordinates of a molecule with *n* atoms. - :type xyz_array: *n*3* |np.ndarray|_ [|np.float64|_] - :parameter idx: An array of all ligand anchor atoms (Y). - :type idx: |np.ndarray|_ [|np.int64|_] - :parameter idx: An array of all core atoms (X). - :type idx: |np.ndarray|_ [|np.int64|_] - :parameter float max_dist: The maximum distance for considering XYn pairs. - :parameter int lig_count: The number of ligand (*n*) in XYn. - :return: An array with the indices of all *m* valid (as determined by **max_diist**) - ligand/core pairs. - :rtype: *m*2* |np.ndarray|_ [|np.int64|_]. - """ - dist = cdist(xyz_array[idx_core], xyz_array[idx_lig]) - xy = np.array(np.where(dist <= max_dist)) - bincount = np.bincount(xy[0]) - xy = xy[:, [i for i, j in enumerate(xy[0]) if bincount[j] >= lig_count]] - xy[0] = idx_core[xy[0]] - xy[1] += 1 - return xy - - -def get_lig_core_combinations(xy, res_list, lig_count=2): - """ Given an array of indices (**xy**) and a nested list of atoms **res_list**. - - :parameter xy: An array with the indices of all *m* core/ligand pairs. 
- :type xy: *m*2* |np.ndarray|_ [|np.int64|_] - :parameter res_list: A list of PLAMS atoms, each nested tuple representing all atoms within - a given residue. - :type res_list: |list|_ [|tuple|_ [|plams.Atom|_]] - :parameter int lig_count: The number of ligand (*n*) in XYn. - :return: - """ - dict_ = {} - for core, lig in xy.T: - try: - dict_[res_list[0][core].id].append([at.id for at in res_list[lig]]) - except KeyError: - dict_[res_list[0][core].id] = [[at.id for at in res_list[lig]]] - return {k: combinations(v, lig_count) for k, v in dict_.items()} diff --git a/CAT/analysis/ligand_solvation.py b/CAT/analysis/ligand_solvation.py deleted file mode 100644 index 81b553c5..00000000 --- a/CAT/analysis/ligand_solvation.py +++ /dev/null @@ -1,201 +0,0 @@ -""" A module designed for calculating solvation energies. """ - -__all__ = ['init_solv'] - -import os -import shutil -from itertools import product -from os.path import (join, dirname) - -import numpy as np -import pandas as pd - -from scm.plams.core.settings import Settings -from scm.plams.core.jobrunner import JobRunner -from scm.plams.core.functions import (init, finish) -from scm.plams.interfaces.adfsuite.adf import ADFJob - -import qmflows - -from .crs import CRSJob -from .. import utils as CAT -from ..utils import (get_time, type_to_string) -from ..data_handling.database import Database - - -def init_solv(ligand_df, arg, solvent_list=None): - """ Initialize the ligand solvation energy calculation. - Performs an inplace update of **ligand_df**, creating 2 sets of columns (*E_solv* & *gamma*) - to hold all solvation energies and activity coefficients, respectively. - - :parameter ligand_df: A dataframe of ligands. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). - :parameter solvent_list: A list of paths to the .t21 or .coskf files of solvents. 
If *None*, - use the default .coskf files distributed with CAT (see CAT.data.coskf). - :type solvent_list: |None|_ or |list|_ [|str|_]. - """ - data = Database(path=arg.optional.database.dirname) - path = ligand_df['mol'][0].properties.path - j1 = arg.optional.ligand.crs.job1 - j2 = arg.optional.ligand.crs.job2 - s1 = arg.optional.ligand.crs.s1 - s2 = arg.optional.ligand.crs.s2 - - # Prepare the job settings and solvent list - if solvent_list is None: - coskf_path = join(join(dirname(dirname(__file__)), 'data'), 'coskf') - solvent_list = [join(coskf_path, solv) for solv in os.listdir(coskf_path) if - solv not in ('__init__.py', 'README.rst')] - solvent_list.sort() - - # Update the columns of **ligand_df** - columns = [i.rsplit('.', 1)[0].rsplit('/', 1)[-1] for i in solvent_list] - columns = list(product(('E_solv', 'gamma'), columns)) - for item in columns: - ligand_df[item] = np.nan - - # Check if the calculation has been done already - overwrite = 'ligand' in arg.optional.database.overwrite - if not overwrite and 'ligand' in arg.optional.database.read: - data.from_csv(ligand_df, database='ligand', get_mol=False) - - # Run COSMO-RS - idx = ligand_df[['E_solv', 'gamma']].isna().all(axis='columns') - if idx.any(): - init(path=path, folder='ligand_solvation') - for i, mol in ligand_df['mol'][idx].iteritems(): - mol.properties.job_path = [] - coskf = get_surface_charge(mol, job=j1, s=s1) - e_and_gamma = get_solv(mol, solvent_list, coskf, job=j2, s=s2) - ligand_df.loc[i, 'E_solv'], ligand_df.loc[i, 'gamma'] = e_and_gamma - finish() - - job_settings = [] - for mol in ligand_df['mol']: - try: - job_settings.append(mol.properties.pop('job_path')) - except KeyError: - job_settings.append([]) - ligand_df[('job_settings_crs', '')] = job_settings - else: - return None # No new molecules here; move along - - # Update the database - if 'ligand' in arg.optional.database.write: - with pd.option_context('mode.chained_assignment', None): - _ligand_to_db(ligand_df, arg, idx, columns) 
- return None - - -def _ligand_to_db(ligand_df, arg, idx, columns): - data = Database(path=arg.optional.database.dirname) - overwrite = 'ligand' in arg.optional.database.overwrite - j1 = arg.optional.ligand.crs.job1 - j2 = arg.optional.ligand.crs.job2 - s1 = arg.optional.ligand.crs.s1 - s2 = arg.optional.ligand.crs.s2 - - value1 = qmflows.singlepoint['specific'][type_to_string(j1)].copy() - value1.update(s1) - recipe = Settings() - recipe['solv 1'] = {'key': j1, 'value': value1} - recipe['solv 2'] = {'key': j2, 'value': s2} - - data.update_csv( - ligand_df.loc[idx], - database='ligand', - columns=[('settings', 'solv 1'), ('settings', 'solv 2'), - ('job_settings_crs', '')]+columns, - overwrite=overwrite, - job_recipe=recipe - ) - - -def get_surface_charge(mol, job=None, s=None): - """ Construct the COSMO surface of the *mol*. """ - # Special procedure for ADF jobs - # Use the gas-phase electronic structure as a fragment for the COSMO single point - if job is ADFJob: - s = get_surface_charge_adf(mol, job, s) - - s.runscript.post = '$ADFBIN/cosmo2kf "mopac.cos" "mopac.coskf"' - results = mol.job_single_point(job, s, ret_results=True) - results.wait() - return get_coskf(results) - - -def get_solv(mol, solvent_list, coskf, job=None, s=None, keep_files=True): - """ Calculate the solvation energy of *mol* in various *solvents*. """ - # Return 2x np.nan if no coskf is None (i.e. 
the COSMO-surface construction failed) - if coskf is None: - return np.nan, np.nan - - # Prepare a list of job settings - s.input.Compound._h = coskf - s.ignore_molecule = True - s_list = [] - for solv in solvent_list: - s_tmp = s.copy() - s_tmp.name = solv.rsplit('.', 1)[0].rsplit('/', 1)[-1] - s_tmp.input.compound._h = solv - s_list.append(s_tmp) - - # Run the job - jobs = [CRSJob(settings=s, name=s.name) for s in s_list] - results = [j.run(jobrunner=JobRunner(parallel=True)) for j in jobs] - - # Extract solvation energies and activity coefficients - E_solv = [] - Gamma = [] - for result in results: - result.wait() - try: - E_solv.append(result.get_energy()) - Gamma.append(result.get_activity_coefficient()) - except ValueError: - print(get_time() + 'WARNING: Failed to retrieve COSMO-RS results of ' + - results.job.name) - E_solv.append(np.nan) - Gamma.append(np.nan) - - # Delete all mopac and cosmo-rs files if keep_files=False - if not keep_files: - mopac = dirname(s.input.Compound._h) - shutil.rmtree(mopac) - for job in jobs: - shutil.rmtree(job.path) - - if 'job_path' not in mol.properties: - mol.properties.job_path = [] - mol.properties.job_path += [join(j.path, j.name + '.in') for j in jobs] - - # Return the solvation energies and activity coefficients as dict - return E_solv, Gamma - - -def get_surface_charge_adf(mol, job, s): - """ Perform a gas-phase ADF single point and return settings for a - COSMO-ADF single point, using the previous gas-phase calculation as moleculair fragment. """ - s.input.allpoints = '' - s.input.charge = sum([at.properties.charge for at in mol]) - results = mol.job_single_point(job, s, ret_results=True) - coskf = get_coskf(results) - - for at in mol: - at.properties.adf.fragment = 'gas' - s.update(CAT.get_template('qd.yaml')['COSMO-ADF']) - s.input.fragments.gas = coskf - - return s - - -def get_coskf(results, extensions=['.coskf', '.t21']): - """ Return the file in results containing the COSMO surface. 
""" - for file in results.files: - for ext in extensions: - if ext in file: - return results[file] - print(get_time() + 'WARNING: Failed to retrieve COSMO surface charges of ' + results.job.name) - return None diff --git a/CAT/analysis/thermo_chem.py b/CAT/analysis/thermo_chem.py deleted file mode 100644 index 87f7e35e..00000000 --- a/CAT/analysis/thermo_chem.py +++ /dev/null @@ -1,87 +0,0 @@ -""" A module related to calculating thermochemical properties. """ - -__all__ = ['get_thermo', 'get_entropy'] - -import numpy as np - -from scm.plams.tools.units import Units - - -def get_entropy(mol, freqs, T=298.15): - """Calculate the translational, vibrational and rotational entropy. - All units and constants are in SI units. - - mol : A PLAMS molecule. - freqs : An iterable consisting of vibrational frequencies in units of s**-1. - T : The temperature in Kelvin - Return : A numpy array containing the translational, rotational and - vibrational contributions to the entropy - """ - if not isinstance(freqs, np.ndarray): - freqs = np.array(freqs) - - # Define constants - kT = 1.380648 * 10**-23 * T # Boltzmann constant * temperature - h = 6.6260701 * 10**-34 # Planck constant - hv_kT = (h * freqs) / kT # (Planck constant * frequencies) / (Boltzmann * temperature) - R = 8.31445 # Gas constant - V_Na = ((R * T) / 10**5) / Units.constants['NA'] # Volume(1 mol ideal gas) / Avogadro's number - pi = np.pi - - # Extract atomic masses and coordinates - m = np.array([at.mass for at in mol]) * 1.6605390 * 10**-27 - x, y, z = mol.as_array().T * 10**-10 - - # Calculate the rotational partition function - inertia = np.array([sum(m*(y**2 + z**2)), -sum(m*x*y), -sum(m*x*z), - -sum(m*x*y), sum(m*(x**2 + z**2)), -sum(m*y*z), - -sum(m*x*z), -sum(m*y*z), sum(m*(x**2 + y**2))]).reshape(3, 3) - inertia = np.product(np.linalg.eig(inertia)[0]) - q_rot = pi**0.5 * ((8 * pi**2 * kT) / h**2)**1.5 * inertia**0.5 - - # Calculate the translational, rotational and vibrational entropy (divided by R) - 
S_trans = 1.5 + np.log(V_Na * ((2 * pi * sum(m) * kT) / h**2)**1.5) - S_rot = 1.5 + np.log(q_rot) - S_vib = sum(hv_kT / np.expm1(hv_kT) - np.log(1 - np.exp(-hv_kT))) - - return R * np.array([S_trans, S_rot, S_vib]) - - -def get_thermo(mol, freqs, E, T=298.15, export=['E', 'H', 'S', 'G'], unit='kcal/mol'): - """Extract and return Gibbs free energies, entropies and/or enthalpies from an AMS KF file. - All vibrational frequencies smaller than 100 cm**-1 are set to 100 cm**-1. - - mol : A PLAMS molecule. - freqs : An iterable consisting of vibrational frequencies in units of cm**-1. - E : The eletronic energy in kcal/mol. - T : The temperature in Kelvin - export []: An iterable containing strings of the to be exported energies: - 'E': Electronic energy - 'U': Interal energy (E + U_nuc) - 'H': Enthalpy (U + pV) - 'S': Entropy - 'G': Gibbs free energy (H - T*S) - unit : The unit of the to be returned energies. - Return or []: An energy or dictionary of energies - """ - # Get frequencies; set all frequencies smaller than 100 cm**-1 to 100 cm**-1 - freqs = np.array(freqs) - freqs[freqs < 100] = 100 - freqs *= 100 * Units.constants['c'] - - # hv_kT = (Planck constant * frequencies) / (Boltzmann constant * temperature) - hv_kT = (6.6260701 * 10**-34 * freqs) / (1.380648 * 10**-23 * T) - RT = 8.31445 * T # Gas constant * temperature - - # Extract and/or calculate the various energies - E = E * Units.conversion_ratio('kcal/mol', 'kj/mol') * 1000 - U = E + RT * (3.0 + sum(0.5 * hv_kT + hv_kT / np.expm1(hv_kT))) - H = U + RT - S = sum(get_entropy(mol, freqs, T=T)) - G = H - T * S - - ret = {'E': E, 'U': U, 'H': H, 'S': S, 'G': G} - - if len(export) == 1: - return Units.convert(ret[export[0]], 'kj/mol', unit) / 1000 - return {i: Units.convert(ret[i], 'kj/mol', unit) / 1000 for i in ret if i in export} diff --git a/CAT/assertion_functions.py b/CAT/assertion_functions.py new file mode 100644 index 00000000..eaa50262 --- /dev/null +++ b/CAT/assertion_functions.py @@ -0,0 +1,253 @@ 
+""" +CAT.assertion_functions +======================= + +Various generic assertion functions for the testing of CAT. + +Index +----- +.. currentmodule:: CAT.assertion_functions +.. autosummary:: + Invert + assert_eq + assert_id + assert_subclass + assert_instance + assert_exception + assert_isin + assert_lt + assert_le + assert_gt + assert_ge + _err_msg + _exc_msg + +API +--- +.. autoclass:: Invert +.. autofunction:: assert_eq +.. autofunction:: assert_id +.. autofunction:: assert_subclass +.. autofunction:: assert_instance +.. autofunction:: assert_exception +.. autofunction:: assert_isin +.. autofunction:: assert_lt +.. autofunction:: assert_le +.. autofunction:: assert_gt +.. autofunction:: assert_ge +.. autofunction:: _err_msg +.. autofunction:: _exc_msg + +""" + +from functools import wraps +from typing import (Any, Callable, Tuple, Sequence, Container) + + +class Invert(): + """Context manager for inverting assertion result. + + Instances of :exc:`AssertionError` raised by the passed callables are supressed and + *vice versa*. + + Examples + -------- + .. code:: python + + >>> def assert_true(value): + >>> assert value is True, repr("value is not 'True'") + + >>> assert_true(False) + AssertionError: "value is not 'True'" + + >>> with Invert(assert_true) as func: + >>> func(False) # Raises no exception + + Parameters + ---------- + func : |Callable|_ + A callable that can raise an :exc:`AssertionError`. + + Attributes + ---------- + func : |Callable|_ + A callable constructed from the **func** parameter. + Operations that previously did *not* raise an :exc:`AssertionError` now do and *vice versa*. 
+ + """ + + def __init__(self, func: Callable) -> None: + """Initialize the :class:`.Invert` context manager.""" + self.func = self.invert(func) + + def __enter__(self) -> Callable: + """Return the inverted assertion function.""" + return self.func + + def __exit__(self, *args) -> None: + """Close the :class:`.Invert` context manager.""" + return + + @staticmethod + def get_err_msg(self, func: Callable, + args: Tuple[str, Any, Any]) -> str: + """Create an error message for :meth:`Invert.invert`.""" + if not args: + return '' + elif func is assert_exception: + return _exc_msg(*args) + else: + return _err_msg(*args) + + def invert(self, func: Callable) -> Callable: + """Invert a function that may or may not raise an :exc:`AssertionError`.""" + @wraps(func) + def wrapper(*args, **kwargs): + try: + tup = func(*args, **kwargs) + except AssertionError: + pass + else: + raise AssertionError(self.get_err_msg(func, tup)) + return wrapper + + +def assert_le(value: Any, + ref: Any) -> Tuple[str, Any, Any]: + """Assert :code:`value <= ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value <= reference' + assert value <= ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value > reference' + return _assertion, value, ref + + +def assert_ge(value: Any, + ref: Any) -> Tuple[str, Any, Any]: + """Assert :code:`value >= ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value => reference' + assert value >= ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value < reference' + return _assertion, value, ref + + +def assert_lt(value: Any, + ref: Any) -> Tuple[str, Any, Any]: + """Assert :code:`value < ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value < reference' + assert value < ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value >= reference' + return _assertion, value, ref + + +def assert_gt(value: Any, + ref: Any) -> Tuple[str, Any, Any]: + """Assert :code:`value > ref`; 
returns arguments for :func:`._err_msg`.""" + assertion = 'assert value > reference' + assert value > ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value <= reference' + return _assertion, value, ref + + +def assert_isin(value: Any, + ref: Container) -> Tuple[str, Any, Container]: + """Assert :code:`value in ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value in reference' + assert value in ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value not in reference' + return _assertion, value, ref + + +def assert_instance(value: Any, + ref: type) -> Tuple[str, str, str]: + """Assert :code:`isinstance(value, ref)`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert isinstance(value, reference)' + ref_name = ref.__name__ + value_name = value.__class__.__name__ + assert isinstance(value, ref), _err_msg(assertion, ref_name, value_name) + + _assertion = 'assert not isinstance(value, reference)' + return _assertion, ref_name, value_name + + +def assert_subclass(value: type, + ref: type) -> Tuple[str, str, str]: + """Assert :code:`issubclass(value, ref)`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert issubclass(value, reference)' + ref_name = ref.__name__ + value_name = value.__name__ + assert issubclass(value, ref), _err_msg(assertion, ref_name, value_name) + + _assertion = 'assert not issubclass(value, reference)' + return _assertion, ref_name, value_name + + +def assert_eq(value: Any, + ref: Any) -> Tuple[str, Any, Any]: + """Assert :code:`value == ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value == reference' + assert value == ref, _err_msg(assertion, value, ref) + + _assertion = 'assert value != reference' + return _assertion, value, ref + + +def assert_id(value: Any, + ref: Any) -> Tuple[str, int, int]: + """Assert :code:`value is ref`; returns arguments for :func:`._err_msg`.""" + assertion = 'assert value is reference' + assert_eq(ref, value) + 
value_id = f'{id(value)} = id({value})' + ref_id = f'{id(ref)} = id({ref})' + assert ref is value, _err_msg(assertion, value_id, ref_id) + + _assertion = 'assert value is not reference' + return _assertion, value_id, ref_id + + +def assert_exception(exc: Exception, + func: Callable, + *args: Sequence, + **kwargs: dict) -> Tuple[str, str, str]: + """Assert **exc** is raised by :code:`func(*args, **kwargs)`.""" + err_ref = exc.__name__ + err = None.__class__.__name__ + arguments = '' + if args is not None: + arguments += ', '.join(repr(i) for i in args) + if kwargs is not None: + arguments += ', '.join(f'{k}={v}' for k, v in kwargs.items()) + assertion = f'assert {func.__qualname__}({arguments}) -> {exc.__name__}' + + try: + func(*args, **kwargs) + except exc: # The desired exception is raised + pass + except Exception as ex: # An undesired exception is raised + err = ex.__class__.__name__ + raise AssertionError(_exc_msg(assertion, err, err_ref)) + else: # No exception is raised; this is undesired + raise AssertionError(_exc_msg(assertion, err, err_ref)) + + _assertion = f'assert {func.__qualname__}({arguments}) -/> {exc.__name__}' + return _assertion, err, err_ref + + +def _err_msg(assertion: Any, + value: Any, + ref: Any) -> str: + """Return a formatted error message.""" + i, j, k = repr(assertion), repr(value), repr(ref) + return f'{i}\nSupplied value:\n\t{j}\n\nSupplied reference:\n\t{k}' + + +def _exc_msg(assertion: Any, + value: Any, + ref: Any) -> str: + """Return a formatted error message for :func:`.assert_exception`.""" + i, j, k = repr(assertion), repr(value), repr(ref) + return f'{i}\nSupplied exception:\n\t{j}\n\nReference exception:\n\t{k}' diff --git a/CAT/attachment/__init__.py b/CAT/attachment/__init__.py index 7b2c7d1d..581f1ef2 100644 --- a/CAT/attachment/__init__.py +++ b/CAT/attachment/__init__.py @@ -1,4 +1,10 @@ -""" Modules designed for attaching ligands to cores. 
""" +""" +CAT.attachment +============== + +Modules designed for attaching ligands to cores. + +""" from .qd_opt import init_qd_opt from .ligand_opt import init_ligand_opt diff --git a/CAT/attachment/ligand_anchoring.py b/CAT/attachment/ligand_anchoring.py index a82b2a30..e7580493 100644 --- a/CAT/attachment/ligand_anchoring.py +++ b/CAT/attachment/ligand_anchoring.py @@ -1,128 +1,247 @@ -""" A module designed for finding ligand functional groups. """ +""" +CAT.attachment.ligand_anchoring +=============================== -__all__ = ['init_ligand_anchoring'] +A module designed for finding ligand functional groups. + +Index +----- +.. currentmodule:: CAT.attachment.ligand_anchoring +.. autosummary:: + init_ligand_anchoring + get_functional_groups + _smiles_to_rdmol + find_substructure + substructure_split + _get_df + +API +--- +.. autofunction:: init_ligand_anchoring +.. autofunction:: get_functional_groups +.. autofunction:: find_substructure +.. autofunction:: _smiles_to_rdmol +.. autofunction:: substructure_split +.. autofunction:: _get_df + +""" from itertools import chain +from typing import (Sequence, List, Tuple, Optional, Iterable) import pandas as pd +from scm.plams import (Molecule, Settings) import scm.plams.interfaces.molecule.rdkit as molkit from rdkit import Chem + from ..utils import (get_time, get_template) from ..mol_utils import separate_mod -from ..data_handling.input_sanitizer import santize_smiles +from ..settings_dataframe import SettingsDataFrame +from ..data_handling.validate_mol import santize_smiles + +__all__ = ['init_ligand_anchoring'] + +# Aliases for pd.MultiIndex columns +MOL = ('mol', '') +FORMULA = ('formula', '') +HDF5_INDEX = ('hdf5 index', '') +OPT = ('opt', '') + + +def init_ligand_anchoring(ligand_df: SettingsDataFrame) -> SettingsDataFrame: + """Initialize the ligand functional group searcher. + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of valid ligands. 
-def init_ligand_anchoring(ligand_df, arg): - """ Initialize the ligand functional group searcher. + Returns + ------- + |CAT.SettingsDataFrame|_ + A dataframe of ligands with functional groups that can serve as valid anchor points. - :parameter ligand_df: A dataframe of valid ligands. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index=|int|_, values=|plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_) - :return: A dataframe of ligands with functional groups that can serve as valid anchor points. - :rtype: |pd.DataFrame|_ (columns: |str|_, index=|str|_, values=|plams.Molecule|_) """ + # Unpack arguments + settings = ligand_df.settings.optional + split = settings.ligand.split + _functional_groups = settings.ligand.functional_groups + + # Construct reference functional groups + functional_groups = get_functional_groups(_functional_groups, split) + # Find all functional groups; return a copy of each mol for each functional group mol_list = [] - for lig in ligand_df['mol']: - if not lig.properties.dummies: # Functional group search - mol_list += find_substructure(lig, split=arg.optional.ligand.split) - else: # Manual specification of a functional group - if len(lig.properties.dummies) == 1: # optional.ligand.split = False - lig.properties.dummies = lig.properties.dummies[0] - 1 - split = False - elif len(lig.properties.dummies) == 2: # optional.ligand.split = True - lig.properties.dummies = [i - 1 for i in lig.properties.dummies] - split = True - mol_list += [substructure_split(lig, lig.properties.dummies, split=split)] + for lig in ligand_df[MOL]: + # Functional group search + if not lig.properties.dummies: + mol_list += find_substructure(lig, functional_groups, split) + continue + + # Manual specification of a functional group + if len(lig.properties.dummies) == 1: # optional.ligand.split = False + lig.properties.dummies = lig.properties.dummies[0] - 1 + split_ = False 
+ elif len(lig.properties.dummies) == 2: # optional.ligand.split = True + lig.properties.dummies = tuple(i - 1 for i in lig.properties.dummies) + split_ = True + mol_list += [substructure_split(lig, lig.properties.dummies, split=split_)] # Convert the results into a dataframe - return _get_df(mol_list) + return _get_df(mol_list, ligand_df.settings) + + +def _get_df(mol_list: Sequence[Molecule], + settings: Settings) -> SettingsDataFrame: + """Create and return a new ligand dataframe. + Parameters + ---------- + mol_list : |list|_ [|plams.Molecule|_] + A list of PLAMS molecules. -def _get_df(mol_list): - """ Create and return a new ligand dataframe. + settings : |Settings|_ + A Settings instance containing all CAT parameters. + + Returns + ------- + |CAT.SettingsDataFrame|_ + A dataframe of ligands with functional groups that can serve as valid anchor points. - :parameter mol_list: A list of PLAMS molecules. - :type mol_list: |list|_ [|plams.Molecule|_] - :return: A dataframe of ligands with functional groups that can serve as valid anchor points. 
- :rtype: |pd.DataFrame|_ (columns: |str|_, index=|str|_, values=|plams.Molecule|_) """ # Create the dataframe index and columns idx_tuples = [(mol.properties.smiles, mol.properties.anchor) for mol in mol_list] idx = pd.MultiIndex.from_tuples(idx_tuples, names=['smiles', 'anchor']) - columns_tuples = [('mol', ''), ('formula', ''), ('hdf5 index', ''), ('opt', '')] + columns_tuples = [MOL, FORMULA, HDF5_INDEX, OPT] columns = pd.MultiIndex.from_tuples(columns_tuples, names=['index', 'sub index']) # Create, fill and return the dataframe - df = pd.DataFrame(-1, index=idx, columns=columns) - df['mol'] = mol_list - df['formula'] = [lig.get_formula() for lig in df['mol']] - df['opt'] = False + df = SettingsDataFrame(-1, index=idx, columns=columns, settings=settings) + df[MOL] = mol_list + df[FORMULA] = [lig.get_formula() for lig in df[MOL]] + df[OPT] = False return df -def find_substructure(ligand, split=True): - """ Identify interesting functional groups within the ligand. +def get_functional_groups(functional_groups: Optional[Iterable[str]] = None, + split: bool = True) -> Tuple[Chem.Mol]: + """Construct a list of RDKit molecules representing functional groups. + + Parameters + ---------- + functional_groups : |list|_ [|str|_] + Optional: A list of smiles strings representing functional groups. + Will default to templates provided by CAT if ``None``. + + split : bool + If templates should be pulled from the ``['split']`` or ``['no_split']`` block. + Only relevant if **functional_groups** is ``None``. + + Returns + ------- + |tuple|_ [|Chem.Mol|_] + A list of RDKit molecules constructed from either **functional_group** or + the default smiles templates in CAT. - :parameter ligand: The ligand molecule. - :type ligand: |plams.Molecule|_ - :parameter bool split: If a functional group should be split from **ligand** (*True*) - or not (*False*). - :return: A list of ligands. 
A single copy of **ligand** is created for each identified - functional group, removing parts of the functional group if required (see **split**). - :rtype: |list|_ [|plams.Molecule|_]. """ - rdmol = molkit.to_rdmol(ligand) + # The user has, explicitly, provided functional groups + if functional_groups: + return tuple(_smiles_to_rdmol(smiles) for smiles in functional_groups) - # Creates a list containing predefined functional groups, each saved as an rdkit molecule - # IMPORTANT: The first atom should ALWAYS be the atom that should attach to the core + # Read functional groups from the default CAT SMILES templates if split: func_groups = get_template('smiles.yaml').split else: func_groups = get_template('smiles.yaml').no_split - func_groups = chain.from_iterable(func_groups.values()) - func_groups = [Chem.MolFromSmarts(smarts) for smarts in func_groups] + + return tuple(_smiles_to_rdmol(smiles) for smiles in func_groups) + + +def _smiles_to_rdmol(smiles: str) -> Chem.Mol: + """Convert a SMILES string into an rdkit Mol; supports explicit hydrogens.""" + # RDKit tends to remove explicit hydrogens if SANITIZE_ADJUSTHS is enabled + sanitize = Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_ADJUSTHS + mol = Chem.MolFromSmiles(smiles, sanitize=False) + Chem.rdmolops.SanitizeMol(mol, sanitizeOps=sanitize) + return mol + + +def find_substructure(ligand: Molecule, + func_groups: Iterable[Chem.Mol], + split: bool = True) -> List[Molecule]: + """Identify interesting functional groups within the ligand. + + Parameters + ---------- + ligand : |plams.Molecule|_ + The ligand molecule. + + func_groups : |tuple|_ [|Chem.Mol|_] + A collection of RDKit molecules representing functional groups. + + split : bool + If a functional group should be split from **ligand** (``True``) or not (``False``). + + Returns + ------- + |list|_ [|plams.Molecule|_] + A list of ligands. 
+ A single copy of **ligand** is created for each identified functional group, + removing parts of the functional group if required (see **split**). + An empty list is returned if no valid functional groups are found. + + """ + rdmol = molkit.to_rdmol(ligand) # Searches for functional groups (defined by functional_group_list) within the ligand - # Duplicates are removed - rdmatch = rdmol.GetSubstructMatches - matches = chain(*[rdmatch(smarts) for smarts in func_groups]) + get_match = rdmol.GetSubstructMatches + matches = chain.from_iterable(get_match(mol, useChirality=True) for mol in func_groups) # Remove all duplicate matches, each heteroatom (match[0]) should have <= 1 entry ligand_indices = [] ref = [] - for match in matches: - if match[0] not in ref: - ligand_indices.append(match) - ref.append(match[0]) + for idx_tup in matches: + i, *_ = idx_tup + if i in ref: + continue # Skip duplicates + + ligand_indices.append(idx_tup) + ref.append(i) if ligand_indices: - ligand_list = [substructure_split(ligand, tup, split) for tup in ligand_indices] + return [substructure_split(ligand, tup, split) for tup in ligand_indices] else: - print(get_time() + 'No functional groups were found (optional.ligand.split = ' + str(split) - + ') for ligand: ' + ligand.properties.smiles) - ligand_list = [] + msg = 'No functional groups were found (optional.ligand.split = {}) for ligand: {}' + print(get_time() + msg.format(split, ligand.properties.smiles)) + return [] - return ligand_list +def substructure_split(ligand: Molecule, + idx: Tuple[int, int], + split: bool = True) -> Molecule: + """Delete the hydrogen or mono-/polyatomic counterion attached to the functional group. + + Sets the charge of the remaining heteroatom to -1 if ``split=True``. + + Parameters + ---------- + ligand: |plams.Molecule|_ + The ligand molecule. + + idx : |tuple|_ [|int|_] + A tuple with 2 atomic indices associated with a functional group. 
+ + split : bool + If a functional group should be split from **ligand** (``True``) or not (``False``). + + Returns + ------- + |plams.Molecule|_ + A copy of **ligand**, with part of its functional group removed (see **split**). -def substructure_split(ligand, idx, split=True): - """ - Delete the hydrogen or mono-/polyatomic counterion attached to the functional group. - Sets the charge of the remaining heteroatom to -1 if split=True. - - :parameter ligand: The ligand molecule. - :type ligand: |plams.Molecule|_ - :parameter idx: A list of 2 atomic indices associated with a functional group. - :type idx: 2 |list|_ [|int|_] - :parameter bool split: If a functional group should be split from **ligand** (*True*) - or not (*False*). - :return: A copy of **ligand**, with part of its functional group removed (see **split**). - :rtype: |plams.Molecule|_ """ lig = ligand.copy() at1 = lig[idx[0] + 1] @@ -132,9 +251,11 @@ def substructure_split(ligand, idx, split=True): lig.delete_atom(at2) mol_list = lig.separate_mod() for mol in mol_list: - if at1 in mol: - lig = mol - break + if at1 not in mol: + continue + + lig = mol + break # Check if the ligand heteroatom has a charge assigned, assigns a charge if not if not at1.properties.charge or at1.properties.charge == 0: diff --git a/CAT/attachment/ligand_attach.py b/CAT/attachment/ligand_attach.py index 528dde68..c21e21cb 100644 --- a/CAT/attachment/ligand_attach.py +++ b/CAT/attachment/ligand_attach.py @@ -1,6 +1,46 @@ -""" A module designed for attaching ligands to cores. """ - -__all__ = ['init_qd_construction'] +""" +CAT.attachment.ligand_attach +============================ + +A module designed for attaching ligands to cores. + +Index +----- +.. currentmodule:: CAT.attachment.ligand_attach +.. 
autosummary:: + init_qd_construction + construct_mol_series + _read_database + _get_indices + _get_df + ligand_to_qd + _get_rotmat1 + _get_rotmat2 + rot_mol + rot_mol_angle + array_to_qd + sanitize_dim_2 + sanitize_dim_3 + +API +--- +.. autofunction:: init_qd_construction +.. autofunction:: construct_mol_series +.. autofunction:: _read_database +.. autofunction:: _get_indices +.. autofunction:: _get_df +.. autofunction:: ligand_to_qd +.. autofunction:: _get_rotmat1 +.. autofunction:: _get_rotmat2 +.. autofunction:: rot_mol +.. autofunction:: rot_mol_angle +.. autofunction:: array_to_qd +.. autofunction:: sanitize_dim_2 +.. autofunction:: sanitize_dim_3 + +""" + +from typing import (List, Tuple) import numpy as np import pandas as pd @@ -9,75 +49,162 @@ from scm.plams.mol.molecule import Molecule from scm.plams.core.settings import Settings +from ..settings_dataframe import SettingsDataFrame from ..utils import get_time -from ..mol_utils import (merge_mol, get_atom_index) -from ..data_handling.database import Database -from ..data_handling.database_functions import mol_to_file - - -def init_qd_construction(ligand_df, core_df, arg): - """ Initialize the quantum dot construction. - :parameter ligand_df: A dataframe of ligands. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter core_df: A dataframe of cores. - :type core_df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). - :return: A dataframe of quantum dots. 
- :rtype: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) +from ..mol_utils import (merge_mol, get_index) + +try: + from dataCAT import (Database, mol_to_file) + DATA_CAT = True +except ImportError: + DATA_CAT = False + +__all__ = ['init_qd_construction'] + +# Aliases for pd.MultiIndex columns +HDF5_INDEX = ('hdf5 index', '') +MOL = ('mol', '') +OPT = ('opt', '') + + +def init_qd_construction(ligand_df: SettingsDataFrame, + core_df: SettingsDataFrame) -> SettingsDataFrame: + """Initialize the quantum dot construction. + + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of ligands. + + core_df : |CAT.SettingsDataFrame|_ + A dataframe of cores. + + Returns + ------- + |CAT.SettingsDataFrame|_ + A dataframe of quantum dots. + """ - overwrite = 'qd' in arg.optional.database.overwrite - data = Database(path=arg.optional.database.dirname) + # Extract arguments + settings = ligand_df.settings.optional + overwrite = DATA_CAT and 'qd' in settings.database.overwrite + write = DATA_CAT and 'qd' in settings.database.write + read = DATA_CAT and 'qd' in settings.database.read + qd_path = settings.qd.dirname + db_path = settings.database.dirname + mol_format = settings.database.mol_format # Attempt to pull structures from the database - qd_df = _get_df(core_df.index, ligand_df.index) + qd_df = _get_df(core_df.index, ligand_df.index, ligand_df.settings) qd_df.sort_index(inplace=True) - if 'qd' in arg.optional.database.read: - mol_series1 = data.from_csv(qd_df, database='QD', inplace=False) - for i, mol in mol_series1.iteritems(): - mol.properties = Settings() - mol.properties.indices = _get_indices(mol, i) - mol.properties.path = arg.optional.qd.dirname - mol.properties.job_path = [] - mol.properties.name = core_df.at[(i[0:2]), ('mol', '')].properties.name + '__' - mol.properties.name += str(mol[-1].properties.pdb_info.ResidueNumber - 1) - mol.properties.name += '_' + ligand_df.at[(i[2:4]), ('mol', '')].properties.name - 
print(get_time() + mol.properties.name + '\t has been pulled from the database') + if read: + mol_series1 = _read_database(qd_df, ligand_df, core_df) # Identify and create the to be constructed quantum dots - idx = qd_df['hdf5 index'] < 0 - mol_list = [ligand_to_qd(core_df.at[(i, j), ('mol', '')], - ligand_df.at[(k, l), ('mol', '')], - arg) for i, j, k, l in qd_df.index[idx]] - mol_series2 = pd.Series(mol_list, index=qd_df.index[idx], name=('mol', ''), dtype=object) - print() + mol_series2 = construct_mol_series(qd_df, core_df, ligand_df) # Update the *mol* column in qd_df with 1 or 2 series of quantum dots try: - qd_df['mol'] = mol_series1.append(mol_series2) + qd_df[MOL] = mol_series1.append(mol_series2) except NameError: - qd_df['mol'] = mol_series2 + qd_df[MOL] = mol_series2 # Export the resulting geometries back to the database - if 'qd' in arg.optional.database.write: - data.update_csv(qd_df, - columns=[('hdf5 index', '')], - database='QD_no_opt') - path = arg.optional.qd.dirname - mol_to_file(qd_df['mol'], path, overwrite, arg.optional.database.mol_format) + if write: + data = Database(db_path, **settings.database.mongodb) + data.update_csv(qd_df, columns=[HDF5_INDEX], database='QD_no_opt') + mol_to_file(qd_df[MOL], qd_path, overwrite, mol_format) return qd_df -def _get_indices(mol, index): - """ Return a list with the indices of all atoms in the core of **mol** plus the ligand anchor - atoms. Ligand anchor atoms are furthermore marked with the properties.anchor attribute. 
+def construct_mol_series(qd_df: SettingsDataFrame, + core_df: pd.DataFrame, + ligand_df: pd.DataFrame) -> pd.Series: + """Construct a Series of new quantum dots""" + def _get_mol(i, j, k, l): + ij = i, j + kl = k, l + return ligand_to_qd(core_df.at[ij, MOL], ligand_df.at[kl, MOL], settings) + + settings = qd_df.settings + idx = qd_df[HDF5_INDEX] < 0 + + mol_list = [_get_mol(i, j, k, l) for i, j, k, l in qd_df.index[idx]] + return pd.Series(mol_list, index=qd_df.index[idx], name=MOL, dtype=object) + + +def _read_database(qd_df: SettingsDataFrame, + ligand_df: SettingsDataFrame, + core_df: SettingsDataFrame) -> pd.Series: + """Read quantum dots from the database and set their properties. + + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of quantum dots. + + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of ligands. + + core_df : |CAT.SettingsDataFrame|_ + A dataframe of cores. + + Returns + ------- + |pd.Series|_ [|plams.Molecule|_] + A Series of quantum dots pulled from the database. 
+ + """ + def get_name(): + """Construct the name of a quantum dot.""" + core = core_df.at[(i[0:2]), MOL].properties.name + res = mol[-1].properties.pdb_info.ResidueNumber - 1 + lig = ligand_df.at[(i[2:4]), MOL].properties.name + return '{}__{:d}_{}'.format(core, res, lig) + + # Extract arguments + settings = qd_df.settings.optional + path = settings.database.dirname + data = Database(path, **settings.database.mongodb) + + # Extract molecules from the database and set their properties + # If possible extract optimized structures; supplement with unoptimized structures if required + mol_series_opt = data.from_csv(qd_df, database='QD', inplace=False) + mol_series_no_opt = data.from_csv(qd_df, database='QD_no_opt', inplace=False) + slice_ = mol_series_no_opt.index.isin(mol_series_opt.index) + mol_series = mol_series_opt.append(mol_series_no_opt[~slice_]) + + # Update Molecule.properties + for i, mol in mol_series.iteritems(): + mol.properties = Settings({ + 'indices': _get_indices(mol, i), + 'path': path, + 'job_path': [], + 'name': get_name() + }) + print(get_time() + '{}\t has been pulled from the database'.format(mol.properties.name)) + return mol_series + + +def _get_indices(mol: Molecule, + index: Tuple[str, str, str, str]) -> List[int]: + """Return a list with the indices of all atoms in the core plus ligand anchor atoms. + + Ligand anchor atoms are furthermore marked with the properties.anchor attribute. + + Parameters + ---------- + mol : |plams.Molecule|_ + A PLAMS molecule. + + index : |tuple|_ [|str|_] + A tuple of 4 strings. + + Returns + ------- + |list|_ [|int|_] + A list of atomic indices. - :parameter mol: A PLAMS molecule. - :type mol: |plams.Molecule|_ - :parameter index: A tuple of 4 strings. 
- :type index: *4* |tuple|_ [|str|_] - :return: A list of atomic indices - :rtype: |list|_ [|int|_] """ # Collect the indices of the atoms in the core ret = [] @@ -106,47 +233,78 @@ def _get_indices(mol, index): return ret -def _get_df(core_index, ligand_index): - """ Create and return a new quantum dot dataframe. +def _get_df(core_index: pd.MultiIndex, + ligand_index: pd.MultiIndex, + settings: Settings) -> SettingsDataFrame: + """Create and return a new quantum dot dataframe. + + Parameters + ---------- + core_index : |pd.MultiIndex|_ + A multiindex of the cores. + + ligand_index : |pd.MultiIndex|_ + A multiindex of the ligands. + + settings : |plams.Settings|_ + A Settings intance extracted from the ligand or core dataframe. + + Returns + ------- + |CAT.SettingsDataFrame|_ + An empty dataframe (*i.e.* filled with ``-1`` and ``False``) of quantum dots. - :parameter core_index: A multiindex of the cores. - :type core_index: |pd.MultiIndex|_ - :parameter ligand_index: A multiindex of the ligands. - :type ligand_index: |pd.MultiIndex|_ - :return: An empty (*i.e.* filled with -1) dataframe of quantum dots. 
- :rtype: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |np.int64|_) """ + # Create the index idx_tups = [(i, j, k, l) for i, j in core_index for k, l in ligand_index] index = pd.MultiIndex.from_tuples( - idx_tups, - names=['core', 'core anchor', 'ligand smiles', 'ligand anchor'] + idx_tups, names=['core', 'core anchor', 'ligand smiles', 'ligand anchor'] ) - column_tups = [('hdf5 index', ''), ('opt', '')] + # Create the collumns + column_tups = [HDF5_INDEX, OPT] columns = pd.MultiIndex.from_tuples(column_tups, names=['index', 'sub index']) - data = {('hdf5 index', ''): -1, ('opt', ''): False} - return pd.DataFrame(data, index=index, columns=columns) + # Create and return the quantum dot dataframe + data = {HDF5_INDEX: -1, OPT: False} + return SettingsDataFrame(data, index=index, columns=columns, settings=settings) -def ligand_to_qd(core, ligand, arg): - """ - Function that handles quantum dot (qd, i.e. core + all ligands) operations. +def ligand_to_qd(core: Molecule, + ligand: Molecule, + settings: Settings) -> Molecule: + """Function that handles quantum dot (qd, *i.e.* core + all ligands) operations. + Combine the core and ligands and assign properties to the quantom dot. - :parameter core: A core molecule. - :type core: |plams.Molecule|_ - :parameter ligand: A ligand molecule. - :type ligand: |plams.Molecule|_ - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_) - :return: A quantum dot consisting of a core molecule and *n* ligands - :rtype: |plams.Molecule|_ + Parameters + ---------- + core : |plams.Molecule|_ + A core molecule. + + ligand : |plams.Molecule|_ + A ligand molecule. + + settings : |plams.Settings|_ + A settings object containing all (optional) arguments. 
+ + Returns + ------- + |plams.Molecule|_ + A quantum dot consisting of a core molecule and *n* ligands + """ + def get_name(): + ret = core.properties.name + '__' + ret += str(qd[-1].properties.pdb_info.ResidueNumber - 1) + '_' + ligand.properties.name + return ret + + dirname = settings.optional.qd.dirname + # Define vectors and indices used for rotation and translation the ligands vec1 = sanitize_dim_2(ligand.properties.dummies) - np.array(ligand.get_center_of_mass()) vec2 = np.array(core.get_center_of_mass()) - sanitize_dim_2(core.properties.dummies) - idx = ligand.properties.dummies.get_atom_index() - 1 + idx = ligand.get_index(ligand.properties.dummies) - 1 ligand.properties.dummies.properties.anchor = True # Attach the rotated ligands to the core, returning the resulting strucutre (PLAMS Molecule). @@ -155,14 +313,13 @@ def ligand_to_qd(core, ligand, arg): array_to_qd(ligand, lig_array, mol_other=qd) # Set properties - qd.properties = Settings() - qd.properties.indices = [i for i, at in enumerate(qd, 1) if - at.properties.pdb_info.ResidueName == 'COR' or at.properties.anchor] - qd.properties.path = arg.optional.qd.dirname - qd.properties.name = core.properties.name + '__' - qd.properties.name += str(qd[-1].properties.pdb_info.ResidueNumber - 1) - qd.properties.name += '_' + ligand.properties.name - qd.properties.job_path = [] + qd.properties = Settings({ + 'indices': [i for i, at in enumerate(qd, 1) if + at.properties.pdb_info.ResidueName == 'COR' or at.properties.anchor], + 'path': dirname, + 'name': get_name(), + 'job_path': [] + }) # Print and return print(get_time() + qd.properties.name + '\t has been constructed') diff --git a/CAT/attachment/ligand_opt.py b/CAT/attachment/ligand_opt.py index e3be1b03..4afb6239 100644 --- a/CAT/attachment/ligand_opt.py +++ b/CAT/attachment/ligand_opt.py @@ -1,16 +1,52 @@ -""" A module designed for optimizing the geometry of ligands. 
""" - -__all__ = ['init_ligand_opt'] +""" +CAT.attachment.ligand_opt +========================= + +A module designed for optimizing the geometry of ligands. + +Index +----- +.. currentmodule:: CAT.attachment.ligand_opt +.. autosummary:: + init_ligand_opt + _parse_overwrite + read_data + start_ligand_jobs + _ligand_to_db + remove_duplicates + split_bond + neighbors_mod + split_mol + get_frag_size + recombine_mol + get_dihed + set_dihed + +API +--- +.. autofunction:: init_ligand_opt +.. autofunction:: _parse_overwrite +.. autofunction:: read_data +.. autofunction:: start_ligand_jobs +.. autofunction:: _ligand_to_db +.. autofunction:: remove_duplicates +.. autofunction:: split_bond +.. autofunction:: neighbors_mod +.. autofunction:: get_frag_size +.. autofunction:: recombine_mol +.. autofunction:: get_dihed +.. autofunction:: set_dihed + +""" import itertools +from typing import (Union, Sequence, List, Tuple, Dict, Any) import numpy as np import pandas as pd -from scm.plams.mol.molecule import Molecule -from scm.plams.mol.atom import Atom +from scm.plams import (Molecule, Atom, Bond, Settings) from scm.plams.core.errors import MoleculeError -from scm.plams.core.settings import Settings from scm.plams.core.functions import add_to_class from scm.plams.tools.units import Units from scm.plams.recipes.global_minimum import global_minimum_scan_rdkit @@ -20,125 +56,184 @@ from rdkit.Chem import AllChem from .ligand_attach import (rot_mol_angle, sanitize_dim_2) -from ..data_handling.database import Database -from ..data_handling.database_functions import mol_to_file from ..utils import get_time -from ..mol_utils import (to_symbol, fix_carboxyl, get_bond_index, +from ..settings_dataframe import SettingsDataFrame +from ..mol_utils import (to_symbol, fix_carboxyl, get_index, from_mol_other, from_rdmol, separate_mod) +try: + from dataCAT import (Database, mol_to_file) + DATA_CAT = True +except ImportError: + DATA_CAT = False + +__all__ = ['init_ligand_opt'] + +# Aliases for 
pd.MultiIndex columns +MOL = ('mol', '') +OPT = ('opt', '') +FORMULA = ('formula', '') +HDF5_INDEX = ('hdf5 index', '') +SETTINGS1 = ('settings', '1') + + +def init_ligand_opt(ligand_df: SettingsDataFrame) -> None: + """Initialize the ligand optimization procedure. -def init_ligand_opt(ligand_df, arg): - """ Initialize the ligand optimization procedure. Performs an inplace update of **ligand_df**. - :parameter ligand_df: A dataframe of valid ligands. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index=|int|_, values=|plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). + + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of valid ligands. + """ - database = Database(arg.optional.database.dirname) - overwrite = 'ligand' in arg.optional.database.overwrite + settings = ligand_df.settings.optional + database = Database(settings.database.dirname, **settings.database.mongodb) + overwrite = DATA_CAT and 'ligand' in settings.database.overwrite + read = DATA_CAT and 'ligand' in settings.database.read + write = DATA_CAT and 'ligand' in settings.database.write + optimize = settings.ligand.optimize # Searches for matches between the input ligand and the database; imports the structure - if 'ligand' in arg.optional.database.read: - database.from_csv(ligand_df, database='ligand') - for i, mol in zip(ligand_df['opt'], ligand_df['mol']): - if i == -1: - continue - print(get_time() + '{}\t has been pulled from the database'.format(mol.properties.name)) - ligand_df['opt'] = ligand_df['opt'].astype(bool, copy=False) + read_data(ligand_df, database, read) - if 'ligand' in arg.optional.database.write: - _ligand_to_db(ligand_df, arg, database, opt=False) + if write: + _ligand_to_db(ligand_df, database, opt=False) # Optimize all new ligands - if arg.optional.ligand.optimize: + if optimize: # Identify the to be optimized ligands - if overwrite: - idx = 
pd.Series(True, index=ligand_df.index, name='mol') - message = '{}\t has been (re-)optimized' - else: - idx = np.invert(ligand_df['opt']) - message = '{}\t has been optimized' + idx, message = _parse_overwrite(ligand_df, overwrite) # Optimize the ligands - lig_new = [] - for ligand in ligand_df['mol'][idx]: - mol_list = split_mol(ligand) - for mol in mol_list: - mol.set_dihed(180.0) - ligand_tmp = recombine_mol(mol_list) - fix_carboxyl(ligand_tmp) - lig_new.append(ligand_tmp) - - # Print messages - print(get_time() + message.format(ligand.properties.name)) + lig_new = start_ligand_jobs(ligand_df, idx, message) + + # Update the ligand dataframe if lig_new: if len(lig_new) == 1: # pd.DataFrame.loc has serious issues when assigning 1 molecue idx, _ = next(ligand_df[idx].iterrows()) - ligand_df.at[idx, ('mol', '')] = lig_new[0] + ligand_df.at[idx, MOL] = lig_new[0] else: - ligand_df.loc[idx, 'mol'] = lig_new + ligand_df.loc[idx, MOL] = lig_new print() remove_duplicates(ligand_df) # Write newly optimized structures to the database - if 'ligand' in arg.optional.database.write and arg.optional.ligand.optimize: - _ligand_to_db(ligand_df, arg, database) + if write and optimize: + _ligand_to_db(ligand_df, database) + +def _parse_overwrite(ligand_df: SettingsDataFrame, + overwrite: bool) -> Tuple[pd.Series, str]: + """Return a series for dataframe slicing and a to-be printer message.""" + if overwrite: + idx = pd.Series(True, index=ligand_df.index, name=MOL) + message = '{}\t has been (re-)optimized' + else: + idx = np.invert(ligand_df[OPT]) + message = '{}\t has been optimized' + return idx, message -def _ligand_to_db(ligand_df, arg, database, opt=True): - """Export ligand optimziation results to the database""" - overwrite = 'ligand' in arg.optional.database.overwrite - path = arg.optional.ligand.dirname - kwarg = {'overwrite': overwrite} +def read_data(ligand_df: SettingsDataFrame, + database: 'Database', + read: bool) -> None: + """Read ligands from the database if 
**read** = ``True``.""" + if read: + database.from_csv(ligand_df, database='ligand') + for i, mol in zip(ligand_df[OPT], ligand_df[MOL]): + if i == -1: + continue + print(get_time() + '{}\t has been pulled from the database'.format(mol.properties.name)) + ligand_df[OPT] = ligand_df[OPT].astype(bool, copy=False) + + +def start_ligand_jobs(ligand_df: SettingsDataFrame, + idx: pd.Series, + message: str) -> List[Molecule]: + """Loop over all molecules in ``ligand_df.loc[idx]`` and perform geometry optimizations.""" + lig_new = [] + for ligand in ligand_df[MOL][idx]: + mol_list = split_mol(ligand) + for mol in mol_list: + mol.set_dihed(180.0) + ligand_tmp = recombine_mol(mol_list) + fix_carboxyl(ligand_tmp) + lig_new.append(ligand_tmp) + + # Print messages + print(get_time() + message.format(ligand.properties.name)) + return lig_new + + +def _ligand_to_db(ligand_df: SettingsDataFrame, + database: 'Database', + opt: bool = True): + """Export ligand optimziation results to the database.""" + # Extract arguments + settings = ligand_df.settings.optional + overwrite = DATA_CAT and 'ligand' in settings.database.overwrite + lig_path = settings.ligand.dirname + mol_format = settings.database.mol_format + + kwargs: Dict[str, Any] = {'overwrite': overwrite} if opt: - kwarg['job_recipe'] = Settings({'1': {'key': 'RDKit_' + rdkit.__version__, 'value': 'UFF'}}) - kwarg['columns'] = [('formula', ''), ('hdf5 index', ''), ('settings', '1')] - kwarg['database'] = 'ligand' - kwarg['opt'] = True - mol_to_file(ligand_df['mol'], path, overwrite, arg.optional.database.mol_format) + kwargs['job_recipe'] = Settings({ + '1': {'key': 'RDKit_' + rdkit.__version__, 'value': 'UFF'} + }) + kwargs['columns'] = [FORMULA, HDF5_INDEX, SETTINGS1] + kwargs['database'] = 'ligand' + kwargs['opt'] = True + mol_to_file(ligand_df[MOL], lig_path, overwrite, mol_format) else: - kwarg['columns'] = [('formula', ''), ('hdf5 index', '')] - kwarg['database'] = 'ligand_no_opt' + kwargs['columns'] = [FORMULA, 
HDF5_INDEX] + kwargs['database'] = 'ligand_no_opt' - database.update_csv(ligand_df, **kwarg) + database.update_csv(ligand_df, **kwargs) -def remove_duplicates(ligand_df): +def remove_duplicates(df: pd.DataFrame) -> None: """Remove duplicate rows from a dataframe. Duplicates are identified based on their index. Performs an inplace update of **ligand_df**. """ # Remove duplicate ligands and sort - if ligand_df.index.duplicated().any(): - idx_name = ligand_df.index.names - ligand_df.reset_index(inplace=True) + if df.index.duplicated().any(): + idx_name = df.index.names + df.reset_index(inplace=True) i, j = idx_name - ligand_df.drop_duplicates(subset=((i, ''), (j, '')), inplace=True) - ligand_df.set_index(idx_name, inplace=True) - ligand_df.index.names = idx_name - ligand_df.sort_index(inplace=True) + df.drop_duplicates(subset=((i, ''), (j, '')), inplace=True) + df.set_index(idx_name, inplace=True) + df.index.names = idx_name + df.sort_index(inplace=True) @add_to_class(Molecule) -def split_bond(self, bond, atom_type='H', bond_length=1.1): - """ Delete a bond and cap the resulting fragments. +def split_bond(self, bond: Sequence[Atom], + atom_type: Union[str, int] = 'H') -> None: + """Delete a bond and cap the resulting fragments. + A link to the two atoms previously defining the bond & the two capping atoms is stored under - self.properties.mark in a list of 4-tuples. + self.properties.mark in a list of 4-tuples. Performs in inplace update of **self**. - :parameter bond: A PLAMS bond. - :type: |plams.Bond|_ - :parameter atom_type: The atomic symbol or number of the two to be created capping atoms. - :type atom_type: |str|_ or |int|_ - :parameter float bond_length: The length of the two new bonds in angstrom. + Parameters + ---------- + bond : |plams.Bond|_ + A PLAMS bond. + + atom_type : |str|_ or |int|_ + The atomic symbol or number of the two to be created capping atoms. 
+ """ atom_type = to_symbol(atom_type) at1, at2 = bond.atom1, bond.atom2 at3, at4 = Atom(symbol=atom_type, coords=at1.coords), Atom(symbol=atom_type, coords=at2.coords) + bond_length = at3.radius + at2.radius + self.add_atom(at3, adjacent=[at2]) self.add_atom(at4, adjacent=[at1]) self.bonds[-1].resize(at1, bond_length) @@ -151,31 +246,49 @@ def split_bond(self, bond, atom_type='H', bond_length=1.1): @add_to_class(Molecule) -def neighbors_mod(self, atom, exclude=1): - """ A modified PLAMS function: Allows the exlucison of specific elements from the return list. - Return a list of neighbors of **atom** within the molecule. Atoms with - **atom** has to belong to the molecule. Returned list follows the same order as the - **atom.bond** attribute. - - :parameter atom: The plams atom whose nieghbours will be returned. - :type atom: |plams.Atom|_ - :parameter int exclude: Exclude all neighbours with a specific atomic number. - :return: A list of all neighbours of **atom**. - :rtype: |list|_ [|plams.Atom|_]. +def neighbors_mod(self, atom: Atom, + exclude: Union[int, str] = 1) -> List[Atom]: + """A modified PLAMS function: Allows the exlucison of specific elements from the return list. + + Return a list of neighbors of **atom** within the molecule. + Atoms with **atom** has to belong to the molecule. + Returned list follows the same order as the **atom.bond** attribute. + + Parameters + ---------- + atom : |plams.Atom|_ + The plams atom whose neighbours will be returned. + + exclude : |str|_ or |int|_ + Exclude all neighbours with a specific atomic number or symbol. + + Returns + ------- + |list|_ [|plams.Atom|_] + A list of all neighbours of **atom**. 
+ """ + exclude = to_symbol(exclude) if atom.mol != self: raise MoleculeError('neighbors: passed atom should belong to the molecule') return [b.other_end(atom) for b in atom.bonds if b.other_end(atom).atnum != exclude] -def split_mol(plams_mol): - """ Split a molecule into multiple smaller fragments, - one fragment for every branch within **plams_mol**. +def split_mol(plams_mol: Molecule) -> List[Molecule]: + """Split a molecule into multiple smaller fragments. + + One fragment is created for every branch within **plams_mol**. + + Parameters + ---------- + plams_mol : |plams.Molecule|_ + The input molecule with the properties.dummies attribute. + + Returns + ------- + |list|_ [|plams.Molecule|_] + A list of one or more plams molecules. - :parameter plams_mol: The input molecule with the properties.dummies attribute. - :type plams_mol: |plams.Molecule|_ - :return: A list of one or more plams molecules. - :rtype: |list|_ [|plams.Molecule|_] """ # Temporary remove hydrogen atoms h_atoms = [] @@ -201,7 +314,7 @@ def split_mol(plams_mol): plams_mol.add_atom(atom) plams_mol.add_bond(bond) - atom_list = list(itertools.chain.from_iterable((bond.atom1, bond.atom2) for bond in bond_list)) + atom_list = itertools.chain.from_iterable((bond.atom1, bond.atom2) for bond in bond_list) atom_set = {atom for atom in atom_list if atom_list.count(atom) >= 3} atom_dict = {atom: [bond for bond in atom.bonds if bond in bond_list] for atom in atom_set} @@ -225,16 +338,24 @@ def split_mol(plams_mol): @add_to_class(Molecule) -def get_frag_size(self, bond, atom): - """ Return the size of a moleculair fragment containing **atom** if **self** was split into two +def get_frag_size(self, bond: Bond, + atom: Atom) -> int: + """Return the size of the fragment containing **atom** if **self** was split into two molecules by the breaking of **bond**. - :parameter bond: A PLAMS bond. - :type bond: |plams.Bond|_ - :parameter atom: A PLAMS atom. 
The size of the fragment containg this atom will be returned. - :type atom: |plams.Atom|_ - :return: The number of atoms in the fragment containing **atom**. - :rtype: |int|_. + Parameters + ---------- + bond : |plams.Bond|_ + A PLAMS bond. + + atom : |plams.Atom|_ + A PLAMS atom. The size of the fragment containg this atom will be returned. + + Returns + ------- + |int|_ + The number of atoms in the fragment containing **atom**. + """ if bond not in self.bonds: error = 'get_frag_size: The argument bond should be of type plams.Bond and be part' @@ -255,15 +376,16 @@ def dfs(at1, len_at=0, has_atom=False, atom=atom): has_atom = True for bond in at1.bonds: at2 = bond.other_end(at1) - if not at2._visited: - i, j = dfs(at2) - len_at += i - has_atom = has_atom or j + if at2._visited: + continue + i, j = dfs(at2) + len_at += i + has_atom = has_atom or j return len_at, has_atom bond.atom1._visited = bond.atom2._visited = True size1, has_atom1 = dfs(bond.atom1) - size2, has_atom2 = dfs(bond.atom2) + size2, _ = dfs(bond.atom2) for at in self.atoms: del at._visited @@ -272,15 +394,22 @@ def dfs(at1, len_at=0, has_atom=False, atom=atom): return size2 -def recombine_mol(mol_list): - """ Recombine a list of molecules into a single molecule. +def recombine_mol(mol_list: Sequence[Molecule]) -> Molecule: + """Recombine a list of molecules into a single molecule. + A list of 4-tuples of plams.Atoms will be read from mol_list[0].properties.mark. A bond will be created between tuple[0] & tuple[2]; tuple[1] and tuple[3] will be deleted. - :parameter mol_list: A list of on or more plams molecules with the properties.mark atribute. - :type: |list|_ [|plams.Molecule|_] - :return: The (re-)merged PLAMS molecule. - :rtype: |plams.Molecule|_. + Parameters + ---------- + mol_list : |list|_ [|plams.Molecule|_] + A list of on or more plams molecules with the properties.mark atribute. + + Returns + ------- + |plams.Molecule|_ + The (re-)merged PLAMS molecule. 
+ """ if len(mol_list) == 1: return mol_list[0] @@ -303,21 +432,30 @@ def recombine_mol(mol_list): mol1.delete_atom(tup[1]) mol1.delete_atom(tup[3]) mol1.add_bond(tup[0], tup[2]) - bond_tup = mol1.bonds[-1].get_bond_index() + bond_tup = mol1.get_bond_index(mol1.bonds[-1]) mol1.from_mol_other(global_minimum_scan_rdkit(mol1, bond_tup)) del mol1.properties.mark return mol1 -def get_dihed(atoms, unit='degree'): - """ Returns the dihedral angle defined by four atoms. +def get_dihed(atoms: Tuple[Atom, Atom, Atom, Atom], + unit: str = 'degree') -> float: + """Return the dihedral angle defined by four atoms. + + Parameters + ---------- + atoms : |tuple|_ [|plams.atoms|_] + An iterable consisting of 4 PLAMS atoms + + unit : str + The output unit. + + Returns + ------- + |float|_ + A dihedral angle expressed in **unit**. - :parameter atoms: An iterable consisting of 4 PLAMS atoms - :type atoms: 4 |tuple|_ [|plams.atoms|_] - :parameter str unit: The output unit. - :return: A dihedral angle in **unit**. - :rtype: |float|_. """ vec1 = -np.array(atoms[0].vector_to(atoms[1])) vec2 = np.array(atoms[1].vector_to(atoms[2])) @@ -332,14 +470,24 @@ def get_dihed(atoms, unit='degree'): @add_to_class(Molecule) -def set_dihed(self, angle, opt=True, unit='degree'): - """ Change a dihedral angle into a specific value. - Performs an inplace update of **self**. - - :parameter float angle: The desired dihedral angle. - :parameter bool opt: Whether or not the dihedral adjustment should be followed up by an - RDKit UFF optimization. - :parameter str unit: The input unit. +def set_dihed(self, angle: float, + opt: bool = True, + unit: str = 'degree') -> None: + """Change a dihedral angle into a specific value. + + Performs an inplace update of this instance. + + Parameters + ---------- + angle : float + The desired dihedral angle. + + opt : bool + Whether or not the dihedral adjustment should be followed up by an RDKit UFF optimization. + + unit : str + The input unit. 
+ """ angle = Units.convert(angle, unit, 'degree') bond_list = [bond for bond in self.bonds if bond.atom1.atnum != 1 and bond.atom2.atnum != 1 diff --git a/CAT/attachment/qd_opt.py b/CAT/attachment/qd_opt.py index 0b9953c2..036af124 100644 --- a/CAT/attachment/qd_opt.py +++ b/CAT/attachment/qd_opt.py @@ -1,73 +1,150 @@ -""" A module designed for optimizing the combined ligand & core. """ - -__all__ = ['init_qd_opt'] +""" +CAT.attachment.qd_opt +===================== + +A module designed for optimizing the combined ligand & core. + +Index +----- +.. currentmodule:: CAT.attachment.qd_opt +.. autosummary:: + init_qd_opt + start_qd_opt + get_job_settings + _qd_to_db + qd_opt + +API +--- +.. autofunction:: init_qd_opt +.. autofunction:: start_qd_opt +.. autofunction:: get_job_settings +.. autofunction:: _qd_to_db +.. autofunction:: qd_opt + +""" + +from typing import List import pandas as pd -from scm.plams.core.settings import Settings +from scm.plams import (Molecule, Settings) from scm.plams.core.functions import (init, finish) from scm.plams.interfaces.adfsuite.ams import AMSJob import qmflows +from ..jobs import job_geometry_opt from ..utils import (get_time, type_to_string) from ..mol_utils import (fix_carboxyl, fix_h) -from ..analysis.jobs import job_geometry_opt -from ..data_handling.database import Database -from ..data_handling.database_functions import mol_to_file +from ..settings_dataframe import SettingsDataFrame + +try: + from dataCAT import (Database, mol_to_file) + DATA_CAT = True +except ImportError: + DATA_CAT = False + +__all__ = ['init_qd_opt'] + +# Aliases for pd.MultiIndex columns +MOL = ('mol', '') +OPT = ('opt', '') +HDF5_INDEX = ('hdf5 index', '') +JOB_SETTINGS_QD_OPT = ('job_settings_QD_opt', '') +SETTINGS1 = ('settings', '1') +SETTINGS2 = ('settings', '2') -def init_qd_opt(qd_df, arg): - """ Initialized the quantum dot (constrained) geometry optimization. 
+def init_qd_opt(qd_df: SettingsDataFrame) -> None: + """Initialize the quantum dot (constrained) geometry optimization. + performs an inplace update of the *mol* column in **qd_df**. - :parameter qd_df: A dataframe of quantum dots. - :type qd_df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_). + Parameters + ---------- + qd_df : |CAT.SettingsDataFrame|_ + A dataframe of quantum dots. + """ + # Extract arguments + settings = qd_df.settings.optional + write = DATA_CAT and 'qd' in settings.database.write + overwrite = DATA_CAT and 'qd' in settings.database.overwrite + # Prepare slices - job_recipe = arg.optional.qd.optimize - overwrite = 'qd' in arg.optional.database.overwrite - if overwrite: + if overwrite and DATA_CAT: idx = pd.Series(True, index=qd_df.index, name='mol') message = '\t has been (re-)optimized' else: - idx = qd_df['opt'] == False # noqa + idx = qd_df[OPT] == False # noqa message = '\t has been optimized' # Optimize the geometries if idx.any(): - init(path=arg.optional.qd.dirname, folder='QD_optimize') - for mol in qd_df['mol'][idx]: - mol.properties.job_path = [] - qd_opt(mol, job_recipe) - print(get_time() + mol.properties.name + message) - finish() - - job_settings = [] - for mol in qd_df['mol']: - try: - job_settings.append(mol.properties.pop('job_path')) - except KeyError: - job_settings.append([]) - qd_df[('job_settings_QD_opt', '')] = job_settings + start_qd_opt(qd_df, idx, message) + qd_df[JOB_SETTINGS_QD_OPT] = get_job_settings(qd_df) print() else: # No new molecules, move along return None # Export the geometries to the database - if 'qd' in arg.optional.database.write: + if write and DATA_CAT: with pd.option_context('mode.chained_assignment', None): - _qd_to_db(qd_df, arg, idx) + _qd_to_db(qd_df, idx) return None -def _qd_to_db(qd_df, arg, idx): - """Export quantum dot optimziation 
results to the database.""" - job_recipe = arg.optional.qd.optimize - overwrite = 'qd' in arg.optional.database.overwrite +def start_qd_opt(qd_df: SettingsDataFrame, + idx: pd.Series, + message: str) -> None: + """Loop over all molecules in ``qd_df.loc[idx]`` and perform geometry optimizations.""" + # Extract arguments + path = qd_df.properties.optional.qd.dirname + job_recipe = qd_df.properties.optional.qd.optimize + + # Perform the main optimization loop + init(path=path, folder='QD_optimize') + for mol in qd_df[MOL][idx]: + mol.properties.job_path = [] + qd_opt(mol, job_recipe) + print(get_time() + mol.properties.name + message) + finish() + +def get_job_settings(qd_df: SettingsDataFrame) -> List[str]: + """Create a nested list of input files for each molecule in **qd_df**.""" + job_settings = [] + for mol in qd_df[MOL]: + try: + job_settings.append(mol.properties.pop('job_path')) + except KeyError: + job_settings.append([]) + return job_settings + + +def _qd_to_db(qd_df: SettingsDataFrame, + idx: pd.Series) -> None: + """Export quantum dot optimization results to the database. + + Parameters + ---------- + qd_df : |CAT.SettingsDataFrame|_ + A dataframe of quantum dots. + + idx : |pd.Series|_ + A Series for slicing **qd_df**.
+ + """ + # Extract arguments + settings = qd_df.settings.optional + job_recipe = settings.qd.optimize + overwrite = DATA_CAT and 'qd' in settings.database.overwrite + mol_format = settings.database.mol_format + qd_path = settings.qd.dirname + db_path = settings.database.dirname + + # Prepare the job recipe v1 = qmflows.geometry['specific'][type_to_string(job_recipe.job1)].copy() v1.update(job_recipe.s1) v2 = qmflows.geometry['specific'][type_to_string(job_recipe.job2)].copy() @@ -77,9 +154,9 @@ def _qd_to_db(qd_df, arg, idx): '2': {'key': job_recipe.job2, 'value': v2} }) - columns = [('hdf5 index', ''), ('job_settings_QD_opt', ''), - ('settings', '1'), ('settings', '2')] - database = Database(path=arg.optional.database.dirname) + # Update the database + columns = [HDF5_INDEX, JOB_SETTINGS_QD_OPT, SETTINGS1, SETTINGS2] + database = Database(path=db_path, **settings.database.mongodb) database.update_csv( qd_df[idx], columns=columns, @@ -88,21 +165,37 @@ def _qd_to_db(qd_df, arg, idx): opt=True ) - path = arg.optional.qd.dirname - mol_to_file(qd_df['mol'], path, overwrite, arg.optional.database.mol_format) + # Export xyz/pdb files + mol_to_file(qd_df[MOL], qd_path, overwrite, mol_format) -def qd_opt(mol, job_recipe): - """ """ +def qd_opt(mol: Molecule, + job_recipe: Settings) -> None: + """Perform an optimization of the quantum dot. + + Performs an inplace update of **mol**. + + Parameters + ---------- + mol : |plams.Molecule|_ + The to-be optimized molecule. + + job_recipe : |plams.Settings|_ + A Settings instance containing all job settings. + Expects 4 keys: ``"job1"``, ``"job2"``, ``"s1"``, ``"s2"``.
+ + """ if job_recipe.job1 is AMSJob: job_recipe.s1.input.ams.constraints.atom = mol.properties.indices if job_recipe.job2 is AMSJob: job_recipe.s2.input.ams.constraints.atom = mol.properties.indices # Prepare the job settings - mol.job_geometry_opt(job_recipe.job1, job_recipe.s1, name='QD_opt_part1') + job1, s1 = job_recipe.job1, job_recipe.s1 + mol.job_geometry_opt(job1, s1, name='QD_opt_part1') # Fix broken angles fix_carboxyl(mol) fix_h(mol) - mol.job_geometry_opt(job_recipe.job2, job_recipe.s2, name='QD_opt_part2') + job2, s2 = job_recipe.job2, job_recipe.s2 + mol.job_geometry_opt(job2, s2, name='QD_opt_part2') diff --git a/CAT/base.py b/CAT/base.py index f1762817..8a2c7ef1 100644 --- a/CAT/base.py +++ b/CAT/base.py @@ -1,75 +1,127 @@ -""" A module handling the interaction with all other modules, functioning as recipe. """ - -__all__ = ['prep'] - -import time +""" +CAT.base +======== + +A module handling the interaction with all other modules, functioning as recipe. + +Index +----- +.. currentmodule:: CAT.base +.. autosummary:: + prep + prep_input + prep_core + prep_ligand + prep_qd + val_nano_cat + +API +--- +.. autofunction:: prep +.. autofunction:: prep_input +.. autofunction:: prep_core +.. autofunction:: prep_ligand +.. autofunction:: prep_qd +.. 
autofunction:: val_nano_cat + +""" + +from time import time +from typing import (Optional, Tuple) import pandas as pd -from scm.plams.mol.atom import Atom +from scm.plams import (Atom, Settings) from scm.plams.core.errors import MoleculeError -from .utils import (check_sys_var, get_time) +from .settings_dataframe import SettingsDataFrame -from .analysis.asa import init_asa -from .analysis.ligand_bde import init_bde -from .analysis.ligand_solvation import init_solv +from .utils import (check_sys_var, get_time) from .data_handling.mol_import import read_mol -from .data_handling.input_sanitizer import (sanitize_path, sanitize_input_mol, sanitize_optional) +from .data_handling.validate_input import validate_input from .attachment.qd_opt import init_qd_opt from .attachment.ligand_opt import init_ligand_opt from .attachment.ligand_attach import init_qd_construction from .attachment.ligand_anchoring import init_ligand_anchoring +try: + from nanoCAT.asa import init_asa + from nanoCAT.ligand_bde import init_bde + from nanoCAT.ligand_solvation import init_solv + NANO_CAT = True +except ImportError: + NANO_CAT = False + +__all__ = ['prep'] + +# Aliases for pd.MultiIndex columns +MOL = ('mol', '') + + +def prep(arg: Settings, + return_mol: bool = True) -> Optional[Tuple[SettingsDataFrame]]: + """Function that handles all tasks related to the three prep functions. -def prep(arg, return_mol=True): - """ function that handles all tasks related to prep_core, prep_ligand and prep_qd. + * :func:`.prep_core` + * :func:`.prep_ligand` + * :func:`.prep_qd` + + Parameters + ---------- + arg : |plams.Settings|_ + A settings object containing all (optional) arguments. + + return_mol : bool + If qd_df, core_df & ligand_df should be returned or not. + + Returns + ------- + |CAT.SettingsDataFrame|_ + Optional: If ``return_mol=True`` return the three QD, core and ligand dataframes. - :parameter arg: A settings object containing all (optional) arguments. 
- :type arg: |plams.Settings|_ - :parameter bool return_mol: If qd_df, core_df & ligand_df should be returned or not. - :return: If ``return=True``, return a dataframe with quantum dots, cores and ligands. - Molecules are stored in the *mol* column. - :rtype: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) """ # The start - time_start = time.time() + time_start = time() print('\n') # Interpret and extract the input settings ligand_df, core_df = prep_input(arg) # Adds the indices of the core dummy atoms to core.properties.core - core_df = prep_core(core_df, arg) + core_df = prep_core(core_df) # Optimize the ligands, find functional groups, calculate properties and read/write the results - ligand_df = prep_ligand(ligand_df, arg) + ligand_df = prep_ligand(ligand_df) # Combine the cores and ligands; analyze the resulting quantum dots - qd_df = prep_qd(ligand_df, core_df, arg) + qd_df = prep_qd(ligand_df, core_df) # The End - message = get_time() + 'Total elapsed time:\t\t' + '%.4f' % (time.time() - time_start) + ' sec' - print(message) + print(get_time() + 'Total elapsed time:\t\t{:.4f} sec'.format(time() - time_start)) if return_mol: return qd_df, core_df, ligand_df + return None + + +def prep_input(arg: Settings) -> Tuple[SettingsDataFrame, SettingsDataFrame]: + """Interpret and extract the input settings. Returns a list of ligands and a list of cores. + Parameters + ---------- + arg : |plams.Settings|_ + A settings object containing all (optional) arguments. -def prep_input(arg): - """ Interpret and extract the input settings. Returns a list of ligands and a list of cores. + Returns + ------- + |tuple|_ [|CAT.SettingsDataFrame|_, |CAT.SettingsDataFrame|_] + A tuple containing the ligand and core dataframe. - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ - :return: A dataframe of ligands and a dataframe of cores.
- :rtype: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) """ # Interpret arguments - arg.update(sanitize_path(arg)) - arg.update(sanitize_optional(arg)) - arg.update(sanitize_input_mol(arg)) + validate_input(arg) # Read the input ligands and cores lig_list = read_mol(arg.input_ligands) @@ -84,35 +136,46 @@ def prep_input(arg): raise MoleculeError('No valid input cores were found, aborting run') # Store the molecules in dataframes - columns = pd.MultiIndex.from_tuples([('mol', '')], names=['index', 'sub index']) - ligand_df = pd.DataFrame(index=pd.RangeIndex(len(lig_list)), columns=columns.copy()) - ligand_df['mol'] = lig_list - core_df = pd.DataFrame(index=pd.RangeIndex(len(core_list)), columns=columns.copy()) - core_df['mol'] = core_list + columns = pd.MultiIndex.from_tuples([MOL], names=['index', 'sub index']) + + ligand_df = SettingsDataFrame(index=pd.RangeIndex(len(lig_list)), + columns=columns, + settings=arg) + core_df = SettingsDataFrame(index=pd.RangeIndex(len(core_list)), + columns=columns.copy(), + settings=arg) + + ligand_df[MOL] = lig_list + core_df[MOL] = core_list return ligand_df, core_df -def prep_core(core_df, arg): - """ Function that handles the identification and marking of all core dummy atoms. +def prep_core(core_df: SettingsDataFrame) -> SettingsDataFrame: + """Function that handles the identification and marking of all core dummy atoms. + + Parameters + ---------- + core_df : |CAT.SettingsDataFrame|_ + A dataframe of core molecules. Molecules are stored in the *mol* column. + + Returns + ------- + |CAT.SettingsDataFrame|_ + A dataframe of cores with all dummy/anchor atoms removed. - :parameter core_df: A dataframe of core molecules. Molecules are stored in the *mol* column. - :type core_df: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. 
- :type arg: |plams.Settings|_ (superclass: |dict|_) - :return: A dataframe of cores with all dummy/anchor atoms removed. - :rtype: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) """ + # Unpack arguments + dummy = core_df.settings.optional.core.dummy + formula_list = [] anchor_list = [] - - for i, core in enumerate(core_df['mol']): + for core in core_df[MOL]: # Checks the if the dummy is a string (atomic symbol) or integer (atomic number) - dummy = arg.optional.core.dummy formula_list.append(core.get_formula()) # Returns the indices and Atoms of all dummy atom ligand placeholders in the core - if core.properties.dummies is None: + if not core.properties.dummies: idx, dummies = zip(*[(j, atom) for j, atom in enumerate(core.atoms, 1) if atom.atnum == dummy]) else: @@ -133,75 +196,123 @@ def prep_core(core_df, arg): idx_tuples = list(zip(formula_list, anchor_list)) idx = pd.MultiIndex.from_tuples(idx_tuples, names=['formula', 'anchor']) ret = core_df.reindex(idx) - ret['mol'] = core_df['mol'].values + ret[MOL] = core_df[MOL].values return ret -def prep_ligand(ligand_df, arg): - """ Function that handles all ligand operations: - - Ligand function group identification - - Ligand geometry optimization - - Ligand COSMO-RS calculations +def prep_ligand(ligand_df: SettingsDataFrame) -> SettingsDataFrame: + """Function that handles all ligand operations. + + * Ligand function group identification + * Ligand geometry optimization + * Ligand COSMO-RS calculations + + .. _Nano-CAT: https://github.com/nlesc-nano/nano-CAT + + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of ligand molecules. Molecules are stored in the *mol* column. + + Returns + ------- + |CAT.SettingsDataFrame|_ + A new dataframe containing only valid ligands. + + Raises + ------ + ImportError + Raised if a COSMO-RS calculation is attempted without installing the Nano-CAT_ package. - :parameter ligand_df: A dataframe of ligand molecules. 
Molecules are stored in the *mol* column. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_) """ + # Unpack arguments + optimize = ligand_df.settings.optional.ligand.optimize + crs = ligand_df.settings.optional.ligand.crs + # Identify functional groups within the ligand. - ligand_df = init_ligand_anchoring(ligand_df, arg) + ligand_df = init_ligand_anchoring(ligand_df) # Check if any valid functional groups were found - if not ligand_df['mol'].any(): + if not ligand_df[MOL].any(): raise MoleculeError('No valid functional groups found in any of the ligands, aborting run') # Optimize the ligands - if arg.optional.ligand.optimize: - init_ligand_opt(ligand_df, arg) + if optimize: + init_ligand_opt(ligand_df) # Perform a COSMO-RS calculation on the ligands - if arg.optional.ligand.crs: + if crs: + val_nano_cat("Ligand COSMO-RS calculations require the nano-CAT package") check_sys_var() - init_solv(ligand_df, arg) + init_solv(ligand_df) return ligand_df -def prep_qd(ligand_df, core_df, arg): - """ Function that handles all quantum dot (qd, i.e. core + all ligands) operations: - - Constructing the quantum dots - - Optimizing the quantum dots - - Peforming activation strain analyses - - Dissociating ligands on the quantum dot surface - - :parameter ligand_df: A dataframe of ligand molecules. Molecules are stored in the *mol* column. - :type ligand_df: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) - :parameter core_df: A dataframe of core molecules. Molecules are stored in the *mol* column. - :type core_df: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) - :parameter arg: A settings object containing all (optional) arguments. - :type arg: |plams.Settings|_ (superclass: |dict|_) - :return: A dataframe of quantum dots molecules. 
Molecules are stored in the *mol* column. - :rtype: |pd.DataFrame|_ (columns: |str|_, index: |int|_, values: |plams.Molecule|_) +def prep_qd(ligand_df: SettingsDataFrame, + core_df: SettingsDataFrame) -> SettingsDataFrame: + """Function that handles all quantum dot (qd, i.e. core + all ligands) operations. + + * Constructing the quantum dots + * Optimizing the quantum dots + * Performing activation strain analyses + * Dissociating ligands on the quantum dot surface + + .. _Nano-CAT: https://github.com/nlesc-nano/nano-CAT + + Parameters + ---------- + ligand_df : |CAT.SettingsDataFrame|_ + A dataframe of ligand molecules. Molecules are stored in the *mol* column. + + core_df : |CAT.SettingsDataFrame|_ + A dataframe of core molecules. Molecules are stored in the *mol* column. + + Returns + ------- + |CAT.SettingsDataFrame|_ + A dataframe of quantum dot molecules. Molecules are stored in the *mol* column. + + Raises + ------ + ImportError + Raised if an activation-strain or ligand dissociation calculation is attempted without + installing the Nano-CAT_ package.
+ """ + # Unpack arguments + optimize = ligand_df.settings.arg.optional.qd.optimize + dissociate = ligand_df.settings.arg.optional.qd.dissociate + activation_strain = ligand_df.settings.optional.qd.activation_strain + # Construct the quantum dots - qd_df = init_qd_construction(ligand_df, core_df, arg) - if not qd_df['mol'].any(): + qd_df = init_qd_construction(ligand_df, core_df) + if not qd_df[MOL].any(): raise MoleculeError('No valid quantum dots found, aborting') # Optimize the qd with the core frozen - if arg.optional.qd.optimize: + if optimize: check_sys_var() - init_qd_opt(qd_df, arg) + init_qd_opt(qd_df) # Calculate the interaction between ligands on the quantum dot surface - if arg.optional.qd.activation_strain: + if activation_strain: + val_nano_cat("Quantum dot activation-strain calculations require the nano-CAT package") print(get_time() + 'calculating ligand distortion and inter-ligand interaction') - init_asa(qd_df, arg) + init_asa(qd_df) # Calculate the interaction between ligands on the quantum dot surface upon removal of CdX2 - if arg.optional.qd.dissociate: + if dissociate: + val_nano_cat("Quantum dot ligand dissociation calculations require the nano-CAT package") # Start the BDE calculation print(get_time() + 'calculating ligand dissociation energy') - init_bde(qd_df, arg) + init_bde(qd_df) return qd_df + + +def val_nano_cat(error_message: Optional[str] = None) -> None: + """Raise an :exc:`ImportError` if the module-level constant ``NANO_CAT`` is ``False``.""" + err_message = error_message or '' + if not NANO_CAT: + raise ImportError(err_message) diff --git a/CAT/data/__init__.py b/CAT/data/__init__.py index d0e90463..9c0755a8 100644 --- a/CAT/data/__init__.py +++ b/CAT/data/__init__.py @@ -1 +1,7 @@ -""" Various templates, dictionaries and .coskf files. """ +""" +CAT.data +======== + +Various templates, dictionaries and .coskf files.
+ +""" diff --git a/CAT/data/coskf/__init__.py b/CAT/data/coskf/__init__.py index 585d4d9e..b50decb4 100644 --- a/CAT/data/coskf/__init__.py +++ b/CAT/data/coskf/__init__.py @@ -1 +1,7 @@ -""" Various solvent .coskf files from the COSMO(crs)-MOPAC level with PM7 parameters. """ +""" +CAT.data.coskf +============== + +Various solvent .coskf files from the COSMO(crs)-MOPAC level with PM7 parameters. + +""" diff --git a/CAT/data/templates/__init__.py b/CAT/data/templates/__init__.py index e69de29b..9483ff0a 100644 --- a/CAT/data/templates/__init__.py +++ b/CAT/data/templates/__init__.py @@ -0,0 +1,7 @@ +""" +CAT.data.templates +================== + +Various .yaml templates. + +""" diff --git a/CAT/data/templates/smiles.yaml b/CAT/data/templates/smiles.yaml index 483f9901..7f7b5f0d 100644 --- a/CAT/data/templates/smiles.yaml +++ b/CAT/data/templates/smiles.yaml @@ -1,84 +1,25 @@ split: - ammonium_cation: - - '[N+]C.[-]' - - '[n+]C.[-]' - - '[N+]c.[-]' - - '[n+]c.[-]' - amide_anion: - - 'N(C)[H]' - - 'N(c)[H]' - phosphide_anion: - - 'P(C)[H]' - - 'P(c)[H]' - phosphorus_oxide_anion: - - 'O(P)[H]' - - 'O(p)[H]' - alkoxide_anion: - - 'O(C)[H]' - - 'O(c)[H]' - sulfide_anion: - - 'S(C)[H]' - - 'S(c)[H]' - sulfoxide_anion: - - 'O(S)[H]' - - 'O(s)[H]' + - '[N+]C.[F-]' # ammonium fluoride + - '[N+]C.[Cl-]' # ammonium chloride + - '[N+]C.[Br-]' # ammonium bromide + - '[N+]C.[I-]' # ammonium iodide + - 'N(C)[H]' # amide anion + - 'P(C)[H]' # phosphide anion + - 'O(P)[H]' # phosphorus oxide anion + - 'O(C)[H]' # alkoxide anion + - 'S(C)[H]' # sulfide anion + - 'O(S)[H]' # sulfoxide anion no_split: - ammonium_cation: - - '[N+]C' - - '[n+]C' - - '[N+]c' - - '[n+]c' - amine: - - 'NC' - - 'Nc' - - 'nC' - - 'nc' - amide_anion: - - '[N-]C' - - '[N-]c' - - '[n-]C' - - '[n-]c' - phosphine: - - 'PC' - - 'Pc' - - 'pC' - - 'pc' - phosphide_anion: - - '[P-]C' - - '[P-]c' - - '[p-]C' - - '[p-]c' - phosphorus_oxide: - - 'OP' - - 'Op' - - 'oP' - - 'op' - phosphorus_oxide_anion: - - '[O-]P' - - 
'[O-]p' - - '[o-]P' - - '[o-]p' - alkoxide: - - 'OC' - - 'Oc' - - 'oC' - - 'oc' - alkoxide_anion: - - '[O-]C' - - '[O-]c' - sulfide: - - 'SC' - - 'Sc' - - 'sC' - - 'sc' - sulfide_anion: - - '[S-]C' - - '[S-]c' - sulfoxide: - - 'OS' - - 'Os' - - 'oS' - - 'os' - sulfoxide_anion: - - '[O-]S' - - '[O-]s' + - '[N+]C' # ammonium cation + - 'NC' # amine + - '[N-]C' # amide anion + - 'PC' # phosphine + - '[P-]C' # phosphide anion + - 'OP' # phosphorus oxide + - '[O-]P' # phosphorus oxide anion + - 'OC' # alkoxide + - '[O-]C' # alkoxide anion + - 'SC' # sulfide + - '[S-]C' # sulfide anion + - 'OS' # sulfoxide + - '[O-]S' # sulfoxide anion diff --git a/CAT/data_handling/README.rst b/CAT/data_handling/README.rst index bd1b8c91..9cf4b13e 100644 --- a/CAT/data_handling/README.rst +++ b/CAT/data_handling/README.rst @@ -2,19 +2,6 @@ data_handling ############# -~~~~~~~~~~~~~~~~~~~~~~ -database_functions.py_ -~~~~~~~~~~~~~~~~~~~~~~ - -A module for holding functions related to the Database class. - -~~~~~~~~~~~~ -database.py_ -~~~~~~~~~~~~ - -A module which manages all interactions with the database; -holds the Database class. - ~~~~~~~~~~~~~~ mol_import.py_ ~~~~~~~~~~~~~~ @@ -33,8 +20,6 @@ input_parser.py_ A module designed for parsing the input .yaml file. -.. _database_functions.py: https://github.com/BvB93/CAT/tree/master/CAT/data_handling/database_functions.py -.. _database.py: https://github.com/BvB93/CAT/tree/master/CAT/data_handling/database.py .. _mol_import.py: https://github.com/BvB93/CAT/tree/master/CAT/data_handling/mol_import.py .. _input_sanitizer.py: https://github.com/BvB93/CAT/tree/master/CAT/data_handling/input_sanitizer.py .. 
_input_parser.py: https://github.com/BvB93/CAT/tree/master/CAT/data_handling/input_parser.py diff --git a/CAT/data_handling/__init__.py b/CAT/data_handling/__init__.py index 40fbfb26..8251391d 100644 --- a/CAT/data_handling/__init__.py +++ b/CAT/data_handling/__init__.py @@ -1,14 +1,14 @@ -""" Modules related to the importing, exporting and general handling of data. """ +""" +CAT.data_handling +================= + +Modules related to the importing, exporting and general handling of data. + +""" -from .database import Database -from .database_functions import mol_to_file from .mol_import import (read_mol, set_mol_prop) -from .input_sanitizer import (sanitize_optional, sanitize_input_mol, sanitize_path) __all__ = [ - 'Database', - 'mol_to_file', 'read_mol', 'set_mol_prop', - 'sanitize_optional', 'sanitize_input_mol', 'sanitize_path' ] diff --git a/CAT/data_handling/database.py b/CAT/data_handling/database.py deleted file mode 100644 index 38aef659..00000000 --- a/CAT/data_handling/database.py +++ /dev/null @@ -1,607 +0,0 @@ -""" A module which holds the Database class. """ - -__all__ = ['Database'] - -from os import getcwd -from time import sleep -from typing import Optional -from itertools import count - -import yaml -import h5py -import numpy as np -import pandas as pd -from pymongo import MongoClient -from pymongo.errors import (ServerSelectionTimeoutError, BulkWriteError) - -from scm.plams import Settings - -from .database_functions import ( - _create_csv, _create_yaml, _create_hdf5, _create_mongodb, even_index, - from_pdb_array, sanitize_yaml_settings, as_pdb_array, df_to_mongo_dict -) -from ..mol_utils import from_rdmol - - -class Database(): - """ The Database class. - - :Atributes: * **csv_lig** (|str|_) – Path and filename of the .csv file containing all \ - ligand related results. - - * **csv_qd** (|str|_) – Path and filename of the .csv file containing all \ - quantum dot related results. 
- - * **yaml** (|str|_) – Path and filename of the .yaml file containing all \ - job settings. - - * **hdf5** (|str|_) – Path and filename of the .hdf5 file containing all \ - structures (as partiallize de-serialized .pdb files). - - * **mongodb** (|None|_ or |dict|_) – Optional: A dictionary with keyword - arguments for `pymongo.MongoClient `_. # noqa - """ - - def __init__(self, path=None, - host: str = 'localhost', - port: int = 27017, - **kwargs: dict) -> None: - path = path or getcwd() - - # Attributes which hold the absolute paths to various components of the database - self.csv_lig = _create_csv(path, database='ligand') - self.csv_qd = _create_csv(path, database='QD') - self.yaml = _create_yaml(path) - self.hdf5 = _create_hdf5(path) - try: - self.mongodb = _create_mongodb(host, port, **kwargs) - except ServerSelectionTimeoutError: - self.mongodb = None - - def __str__(self): - return self._str(str) - - def __repr__(self): - return self._str(type) - - def _str(self, operation=type): - ret = 'Database(\n' - k_len = max(len(k) for k in vars(self)) + 5 - for k, v in vars(self).items(): - ret += '\t{:{width}} {}\n'.format(k + ':', operation(v), width=k_len) - return ret + ')' - - """ ########################### Opening and closing the database ######################### """ - - class open_yaml(): - """ Context manager for opening and closing the job settings database. - - :param str path: The path+filename to the database component. - :param bool write: Whether or not the database file should be updated after - closing **self**. 
- """ - - def __init__(self, path=None, write=True): - self.path = path or getcwd() - self.write = write - self.settings = None - - def __enter__(self): - with open(self.path, 'r') as f: - self.settings = Settings(yaml.load(f, Loader=yaml.FullLoader)) - return self.settings - - def __exit__(self, type, value, traceback): - if self.write: - yml_dict = self.settings.as_dict() - - # A fix for Settings.as_dict() not functioning when containg a lists of Settings - for key in yml_dict: - for i, value in enumerate(yml_dict[key]): - if isinstance(value, Settings): - yml_dict[key][i] = value.as_dict() - - # Write to the .yaml file - with open(self.path, 'w') as f: - f.write(yaml.dump(yml_dict, default_flow_style=False, indent=4)) - self.settings = False - - class open_csv_lig(): - """ Context manager for opening and closing the ligand database. - - :param str path: The path+filename to the database component. - :param bool write: Whether or not the database file should be updated after - closing **self**. - """ - - def __init__(self, path=None, write=True): - self.path = path or getcwd() - self.write = write - self.df = None - - def __enter__(self): - # Open the .csv file - dtype = {'hdf5 index': int, 'formula': str, 'settings': str, 'opt': bool} - self.df = Database.DF( - pd.read_csv(self.path, index_col=[0, 1], header=[0, 1], dtype=dtype) - ) - - # Fix the columns - idx_tups = [(i, '') if 'Unnamed' in j else (i, j) for i, j in self.df.columns] - columns = pd.MultiIndex.from_tuples(idx_tups, names=self.df.columns.names) - self.df.columns = columns - return self.df - - def __exit__(self, type, value, traceback): - if self.write: - self.df.to_csv(self.path) - self.df = None - - class open_csv_qd(): - """Context manager for opening and closing the quantum dot database. - - :param str path: The path+filename to the database component. - :param bool write: Whether or not the database file should be updated after - closing **self**. 
- - """ - - def __init__(self, path=None, write=True): - self.path = path or getcwd() - self.write = write - self.df = None - - def __enter__(self): - # Open the .csv file - dtype = {'hdf5 index': int, 'settings': str, 'opt': bool} - self.df = Database.DF( - pd.read_csv(self.path, index_col=[0, 1, 2, 3], header=[0, 1], dtype=dtype) - ) - - # Fix the columns - idx_tups = [(i, '') if 'Unnamed' in j else (i, j) for i, j in self.df.columns] - columns = pd.MultiIndex.from_tuples(idx_tups, names=self.df.columns.names) - self.df.columns = columns - return self.df - - def __exit__(self, type, value, traceback): - if self.write: - self.df.to_csv(self.path) - self.df = None - - class DF(dict): - """A mutable container for holding dataframes. - - A subclass of :class:`dict` containing a single key (``"df"``) and value - (a Pandas DataFrame). - Calling an item or attribute of :class:`.DF` will call said method on the - underlaying DataFrame (``self["df"]``). - An exception to this is the ``"df"`` key, which will get/set the DataFrame - instead. 
- - """ - - def __init__(self, df: pd.DataFrame) -> None: - super().__init__() - super().__setitem__('df', df) - - def __getattribute__(self, key): - if key == 'update_df' or (key.startswith('__') and key.endswith('__')): - return super().__getattribute__(key) - return self['df'].__getattribute__(key) - - def __setattr__(self, key, value): - self['df'].__setattr__(key, value) - - def __setitem__(self, key, value): - if key == 'df' and not isinstance(value, pd.DataFrame): - try: - value = value['df'] - if not isinstance(value, pd.DataFrame): - raise KeyError - super().__setitem__('df', value) - except KeyError: - err = ("Instance of 'pandas.DataFrame' or 'CAT.Database.DF' expected;" - " observed type: '{}'") - raise TypeError(err.format(value.__class__.__name__)) - elif key == 'df': - super().__setitem__('df', value) - else: - self['df'].__setitem__(key, value) - - def __getitem__(self, key): - df = super().__getitem__('df') - if isinstance(key, str) and key == 'df': - return df - return df.__getitem__(key) - - """ ################################# Updating the database ############################## """ - - def update_mongodb(self, database: str = 'ligand', - overwrite: bool = False) -> None: - """Export ligand or qd results to the MongoDB database. - - Parameters - ---------- - database : str - The type of database; accepted values are ``"ligand"`` and ``"QD"``. - - overwrite : bool - Whether or not previous entries can be overwritten or not. 
- - """ - if self.mongodb is None: - raise ValueError - - # Open the MongoDB database - client = MongoClient(**self.mongodb) - db = client.cat_database - - # Operate on either the ligand or quantum dot database - if database == 'ligand': - idx_keys = ('smiles', 'anchor') - path = self.csv_lig - open_csv = self.open_csv_lig - collection = db.ligand_database - elif database == 'QD': - idx_keys = ('core', 'core anchor', 'ligand smiles', 'ligand anchor') - collection = db.qd_database - path = self.csv_qd - open_csv = self.open_csv_qd - else: - err = "database={}; accepted values for database are 'ligand' and 'QD'" - raise ValueError(err.format(database)) - - # Parse the ligand or qd dataframe - with open_csv(path, write=False) as db: - df_dict = df_to_mongo_dict(db) - - # Update the collection - for item in df_dict: - try: - collection.insert_one(item) - except BulkWriteError: # An item is already present in the collection - if overwrite: - filter_ = {i: item[i] for i in idx_keys} - collection.replace_one(filter_, item) - - def update_csv(self, df, database='ligand', columns=None, overwrite=False, job_recipe=None, - opt=False): - """ Update **self.csv_lig** or **self.csv_qd** with - (potentially) new user provided settings. - - :parameter df: A dataframe of new (potential) database entries. - :type df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter str database: The type of database; accepted values are *ligand* and *QD*. - :parameter columns: A list of column keys in **df** which - (potentially) are to be added to **self**. If *None*: Add all columns. - :type columns: |None|_ or |list|_ [|tuple|_ [|str|_]] - :parameter bool overwrite: Whether or not previous entries can be overwritten or not. - :parameter job_recipe: A Settings object with settings specific to a job. 
- :type job_recipe: |None|_ or |plams.Settings|_ (superclass: |dict|_) - """ - # Operate on either the ligand or quantum dot database - if database in ('ligand', 'ligand_no_opt'): - path = self.csv_lig - open_csv = self.open_csv_lig - elif database in ('QD', 'QD_no_opt'): - path = self.csv_qd - open_csv = self.open_csv_qd - - # Update **self.yaml** - if job_recipe is not None: - job_settings = self.update_yaml(job_recipe) - for key, value in job_settings.items(): - df[('settings', key)] = value - - with open_csv(path, write=True) as db: - # Update **db.index** - db['df'] = even_index(db['df'], df) - - # Filter columns - if not columns: - df_columns = df.columns - else: - df_columns = pd.Index(columns) - - # Update **db.columns** - bool_ar = df_columns.isin(db.columns) - for i in df_columns[~bool_ar]: - if 'job_settings' in i[0]: - self._update_hdf5_settings(df, i[0]) - del df[i] - idx = columns.index(i) - columns.pop(idx) - continue - try: - db[i] = np.array((None), dtype=df[i].dtype) - except TypeError: # e.g. if csv[i] consists of the datatype np.int64 - db[i] = -1 - - # Update **self.hdf5**; returns a new series of indices - hdf5_series = self.update_hdf5(df, database=database, overwrite=overwrite, opt=opt) - - # Update **db.values** - db.update(df[columns], overwrite=overwrite) - db.update(hdf5_series, overwrite=True) - if opt: - db.update(df[('opt', '')], overwrite=True) - - def update_yaml(self, job_recipe): - """ Update **self.yaml** with (potentially) new user provided settings. - - :parameter job_recipe: A settings object with one or more settings specific to a job. - :type job_recipe: |plams.Settings|_ (superclass: |dict|_) - :return: A dictionary with the column names as keys and the key for **self.yaml** as - matching values. 
- :rtype: |dict|_ (keys: |str|_, values: |str|_) - """ - ret = {} - with self.open_yaml(self.yaml) as db: - for item in job_recipe: - # Unpack and sanitize keys - key = job_recipe[item].key - if isinstance(key, type): - key = str(key).rsplit("'", 1)[0].rsplit('.', 1)[-1] - - # Unpack and sanitize values - value = job_recipe[item].value - if isinstance(value, dict): - value = sanitize_yaml_settings(value, key) - - # Check if the appropiate key is available in **self.yaml** - if key not in db: - db[key] = [] - - # Check if the appropiate value is available in **self.yaml** - if value in db[key]: - ret[item] = key + ' ' + str(db[key].index(value)) - else: - db[key].append(value) - ret[item] = key + ' ' + str(len(db[key]) - 1) - return ret - - def update_hdf5(self, df, database='ligand', overwrite=False, opt=False): - """ Export molecules (see the *mol* column in **df**) to the structure database. - Returns a series with the **self.hdf5** indices of all new entries. - - :parameter df: A dataframe of new (potential) database entries. - :type df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter str database: The type of database; accepted values are *ligand* and *QD*. - :parameter bool overwrite: Whether or not previous entries can be overwritten or not. 
- :return: A series with the index of all new molecules in **self.hdf5** - :rtype: |pd.Series|_ (index: |str|_, values: |np.int64|_) - """ - # Identify new and preexisting entries - if opt: - new = df['hdf5 index'][df['opt'] == False] # noqa - old = df['hdf5 index'][df['opt'] == True] # noqa - else: - new = df['hdf5 index'][df['hdf5 index'] == -1] - old = df['hdf5 index'][df['hdf5 index'] >= 0] - - # Add new entries to the database - self.hdf5_availability() - with h5py.File(self.hdf5, 'r+') as f: - i, j = f[database].shape - - if new.any(): - pdb_array = as_pdb_array(df['mol'][new.index], min_size=j) - - # Reshape and update **self.hdf5** - k = i + pdb_array.shape[0] - f[database].shape = k, pdb_array.shape[1] - f[database][i:k] = pdb_array - - ret = pd.Series(np.arange(i, k), index=new.index, name=('hdf5 index', '')) - df.update(ret, overwrite=True) - if opt: - df.loc[new.index, ('opt', '')] = True - else: - ret = pd.Series(name=('hdf5 index', ''), dtype=int) - - # If **overwrite** is *True* - if overwrite and old.any(): - ar = as_pdb_array(df['mol'][old.index], min_size=j) - - # Ensure that the hdf5 indices are sorted - # import pdb; pdb.set_trace() - idx = np.argsort(old) - old = old[idx] - f[database][old] = ar[idx] - if opt: - df.loc[idx.index, ('opt', '')] = True - - return ret - - def _update_hdf5_settings(self, df, column): - # Add new entries to the database - self.hdf5_availability() - with h5py.File(self.hdf5, 'r+') as f: - i, j, k = f[column].shape - - # Create a 3D array of input files - try: - job_ar = self._read_inp(df[column], j, k) - except ValueError: # df[column] consists of empty lists - return None - - # Reshape **self.hdf5** - k = max(i, 1 + int(df['hdf5 index'].max())) - f[column].shape = k, job_ar.shape[1], job_ar.shape[2] - - # Update the hdf5 dataset - idx = df['hdf5 index'].astype(int, copy=False) - idx_argsort = np.argsort(idx) - f[column][idx[idx_argsort]] = job_ar[idx_argsort] - return None - - @staticmethod - def _read_inp(job_paths, 
ax2=0, ax3=0): # TODO return a generator instead of an array - """Convert all files in **job_paths** (nested sequence of filenames) into a 3D array.""" - # Determine the minimum size of the to-be returned 3D array - line_count = [[Database._get_line_count(j) for j in i] for i in job_paths] - ax1 = len(line_count) - ax2 = max(ax2, max(len(i) for i in line_count)) - ax3 = max(ax3, max(j for i in line_count for j in i)) - - # Create and return a padded 3D array of strings - ret = np.zeros((ax1, ax2, ax3), dtype='S120') - for i, list1, list2 in zip(count(), line_count, job_paths): - for j, k, filename in zip(count(), list1, list2): - ret[i, j, :k] = np.loadtxt(filename, dtype='S120', comments=None, delimiter='\n') - return ret - - @staticmethod - def _get_line_count(filename): - """Return the total number of lines in **filename**.""" - substract = 0 - with open(filename, 'r') as f: - for i, j in enumerate(f, 1): - if j == '\n': - substract += 1 - return i - substract - - """ ######################## Pulling results from the database ########################### """ - - def from_csv(self, df, database='ligand', get_mol=True, inplace=True): - """ Pull results from **self.csv_lig** or **self.csv_qd**. - Performs in inplace update of **df** if **inplace** = *True*, returing *None*. - - :parameter df: A dataframe of new (potential) database entries. - :type df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter str database: The type of database; accepted values are *ligand* and *QD*. - :parameter columns: A list of to be updated columns in **df**. - :parameter bool get_mol: Attempt to pull preexisting molecules from the database. - See **inplace** for more details. - :parameter bool inplace: If *True* perform an inplace update of the *mol* column in **df**. - Otherwise Return a new series of PLAMS molecules. 
- :return: If **inplace** = *False*: return a new series of PLAMS molecules pulled - from **self**, else return |None|_ - :rtype: |None|_ or |pd.Series|_ (index: |str|_, values: |plams.Molecule|_) - """ - # Operate on either the ligand or quantum dot database - if database == 'ligand': - path = self.csv_lig - open_csv = self.open_csv_lig - elif database == 'QD': - path = self.csv_qd - open_csv = self.open_csv_qd - - # Update the *hdf5 index* column in **df** - with open_csv(path, write=False) as db: - df.update(db['df'], overwrite=True) - df['hdf5 index'] = df['hdf5 index'].astype(int, copy=False) - - # **df** has been updated and **get_mol** = *False* - if get_mol: - ret = self._get_csv_mol(df, database, inplace) - else: - ret = None - - # Return a new series if **inplace** = *False*; return *None* otherwise - return ret - - def _get_csv_mol(self, df, database='ligand', inplace=True): - """ A method which handles the retrieval and subsequent formatting of molecules. - Called internally by :meth:`Database.from_csv`. - - :parameter df: A dataframe of new (potential) database entries. - :type df: |pd.DataFrame|_ (columns: |str|_, index: |str|_, values: |plams.Molecule|_) - :parameter str database: The type of database; accepted values are *ligand* and *QD*. - :parameter bool inplace: If *True* perform an inplace update of the *mol* column in **df**. - Otherwise Return a new series of PLAMS molecules. - :parameter bool close: If the database component should be closed afterwards. 
- :return: If **inplace** = *False*: return a new series of PLAMS molecules pulled - from **self**, else return |None|_ - :rtype: |None|_ or |pd.Series|_ (index: |str|_, values: |plams.Molecule|_) - """ - # Sort and find all valid HDF5 indices - df.sort_values(by=['hdf5 index'], inplace=True) - df_slice = df['opt'] == True # noqa - idx = df['hdf5 index'][df_slice].values - - # If no HDF5 indices are availble in **df** then abort the function - if not df_slice.any(): - if inplace: - return None - return pd.Series(None, name=('mol', ''), dtype=object) - - # Update **df** with preexisting molecules from **self**, returning *None* - if inplace: - mol_list = self.from_hdf5(idx, database=database) - for i, rdmol in zip(df_slice.index, mol_list): - df.loc[i, ('mol', '')].from_rdmol(rdmol) - ret = None - - # Create and return a new series of PLAMS molecules - else: - mol_list = self.from_hdf5(idx, database=database, rdmol=False) - ret = pd.Series(mol_list, index=df[df_slice].index, name=('mol', '')) - - return ret - - def from_hdf5(self, index, database='ligand', rdmol=True, close=True): - """ Import structures from the hdf5 database as RDKit or PLAMS molecules. - - :parameter index: The indices of the to be retrieved structures. - :type index: |list|_ [|int|_] - :parameter str database: The type of database; accepted values are *ligand* and *QD*. - :parameter bool rdmol: If *True*, return an RDKit molecule instead of a PLAMS molecule. - :parameter bool close: If the database component should be closed afterwards. - :return: A list of PLAMS or RDKit molecules. 
- :rtype: |list|_ [|plams.Molecule|_ or |rdkit.Chem.Mol|_] - """ - # Convert **index** to an array if it is a series or dataframe - if isinstance(index, (pd.Series, pd.DataFrame)): - index = index.values.tolist() - elif isinstance(index, np.ndarray): - index = index.tolist() - - # Open the database and pull entries - self.hdf5_availability() - with h5py.File(self.hdf5, 'r') as f: - pdb_array = f[database][index] - - # Return a list of RDKit or PLAMS molecules - return [from_pdb_array(mol, rdmol=rdmol) for mol in pdb_array] - - def hdf5_availability(self, timeout: float = 5.0, - max_attempts: Optional[int] = None) -> None: - """Check if a .hdf5 file is opened by another process; return once it is not. - - If two processes attempt to simultaneously open a single hdf5 file then - h5py will raise an :class:`OSError`. - The purpose of this function is ensure that a .hdf5 is actually closed, - thus allowing :func:`to_hdf5` to safely access **filename** without the risk of raising - an :class:`OSError`. - - Parameters - ---------- - filename : str - The path+filename of the hdf5 file. - timeout : float - Time timeout, in seconds, between subsequent attempts of opening **filename**. - max_attempts : int - Optional: The maximum number attempts for opening **filename**. - If the maximum number of attempts is exceeded, raise an ``OSError``. - - Raises - ------ - OSError - Raised if **max_attempts** is exceded. 
- - """ - warning = "OSWarning: '{}' is currently unavailable; repeating attempt in {:.0f} seconds" - i = max_attempts or np.inf - - while i: - try: - with h5py.File(self.hdf5, 'r+', libver='latest') as _: - return None # the .hdf5 file can safely be opened - except OSError as ex: # the .hdf5 file cannot be safely opened yet - print((warning).format(self.hdf5, timeout)) - error = ex - sleep(timeout) - i -= 1 - raise error diff --git a/CAT/data_handling/database_functions.py b/CAT/data_handling/database_functions.py deleted file mode 100644 index c0bd893d..00000000 --- a/CAT/data_handling/database_functions.py +++ /dev/null @@ -1,407 +0,0 @@ -"""A module for holding functions related to the Database class.""" - -__all__ = ['mol_to_file', 'df_to_mongo_dict'] - -from os import getcwd -from os.path import (join, isfile, isdir) -from typing import (Dict, Any, List) - -import yaml -import h5py -import numpy as np -import pandas as pd -from pymongo import MongoClient, ASCENDING - -from scm.plams import Settings -import scm.plams.interfaces.molecule.rdkit as molkit - -from rdkit import Chem - -from ..mol_utils import from_rdmol -from ..utils import (get_time, get_template) - - -def even_index(df1: pd.DataFrame, - df2: pd.DataFrame) -> pd.DataFrame: - """Ensure that ``df2.index`` is a subset of ``df1.index``. - - Parameters - ---------- - df1 : |pd.DataFrame|_ - A DataFrame whose index is to-be a superset of ``df2.index``. - - df2 : |pd.DataFrame|_ - A DataFrame whose index is to-be a subset of ``df1.index``. 
- - Returns - ------- - |pd.DataFrame|_ - A new - - """ - # Figure out if ``df1.index`` is a subset of ``df2.index`` - bool_ar = df2.index.isin(df1.index) - if bool_ar.all(): - return df1 - - # Make ``df1.index`` a subset of ``df2.index`` - nan_row = get_nan_row(df1) - idx = df2.index[~bool_ar] - df_tmp = pd.DataFrame(len(idx) * [nan_row], index=idx, columns=df1.columns) - return df1.append(df_tmp, sort=True) - - -def get_unflattend(input_dict: dict) -> zip: - """Flatten a dictionary and return a :class:`zip` instance consisting of keys and values.""" - def _unflatten(input_dict_: dict) -> dict: - """ """ - ret = Settings() - for key, value in input_dict_.items(): - s = ret - for k1, k2 in zip(key[:-1], key[1:]): - s = s[k1] - s[key[-1]] = value - - return ret.as_dict() - - return zip(*[(k, _unflatten(v)) for k, v in input_dict.items()]) - - -def df_to_mongo_dict(df: pd.DataFrame) -> List[dict]: - """Convert a dataframe into a dictionary suitable for MongoDB.""" - keys, ret = get_unflattend(df.T.to_dict()) - idx_names = df.index.names - - for item, idx in zip(ret, keys): - idx_dict = dict(zip(idx_names, idx)) - item.update(idx_dict) - - return ret - - -def mol_to_file(mol_list, path=None, overwrite=False, mol_format=['xyz', 'pdb']): - """ Export all molecules in **mol_list** to .pdb and/or .xyz files. - - :parameter mol_list: A list of PLAMS molecules. - :type mol_list: |list|_ [|plams.Molecule|_] - :parameter path: The path to the directory where the molecules will be stored. Defaults - to the current working directory if *None*. - :type path: |None|_ or |str|_ - :parameter bool overwrite: If previously generated structures can be overwritten or not. - :parameter mol_format: A list of strings with the to-be exported file types. Accepted values - are *xyz* and/or *pdb*. 
- :type mol_format: |list|_ [|str|_] - """ - # Set the export path - path = path or getcwd() - assert isdir(path) - - if not mol_format: - return None - - if overwrite: # Export molecules while allowing for file overriding - for mol in mol_list: - mol_path = join(path, mol.properties.name) - if 'pdb' in mol_format: - molkit.writepdb(mol, mol_path + '.pdb') - if 'xyz' in mol_format: - mol.write(mol_path + '.xyz') - - else: # Export molecules without allowing for file overriding - for mol in mol_list: - mol_path = join(path, mol.properties.name) - if 'pdb' in mol_format and not isfile(mol_path + '.pdb'): - molkit.writepdb(mol, mol_path + '.pdb') - if 'xyz' in mol_format and not isfile(mol_path + '.xyz'): - mol.write(mol_path + '.xyz') - - -def get_nan_row(df): - """ Return a list of None-esque objects for each column in **df**. - The object in question depends on the data type of the column. - Will default to *None* if a specific data type is not recognized - - * |np.int64|_: *-1* - - * |np.float64|_: *np.nan* - - * |object|_: *None* - - :parameter df: A dataframe - :type df: |pd.DataFrame|_ - :return: A list of non-esque objects, one for each column in **df**. - :rtype: |list|_ [|int|_, |float|_ and/or |None|_] - """ - dtype_dict = { - np.dtype('int64'): -1, - np.dtype('float64'): np.nan, - np.dtype('O'): None, - np.dtype('bool'): False - } - - if not isinstance(df.index, pd.MultiIndex): - return [dtype_dict[df[i].dtype] for i in df] - else: - ret = [] - for _, value in df.items(): - try: - j = dtype_dict[value.dtype] - except KeyError: # dtype is neither int, float nor object - j = None - ret.append(j) - return ret - - -def as_pdb_array(mol_list, min_size=0): # TODO return a generator instead of an array - """ Converts a list of PLAMS molecule into an array of strings representing (partially) - de-serialized .pdb files. - - :parameter mol_list: A list of PLAMS molecules. 
- :type mol_list: |list|_ [|plams.Molecule|_] - :parameter int min_size: The minimumum length of the pdb_array. The array is padded with empty - strings if required. - :return: An array with *m* partially deserialized .pdb files with up to *n* lines each. - :rtype: *m*n* |np.ndarray|_ [|np.bytes|_ *|S80*] - """ - pdb_list = [] - shape = min_size - for mol in mol_list: - pdb_block = Chem.MolToPDBBlock(molkit.to_rdmol(mol)).splitlines() - pdb_list.append(pdb_block) - shape = max(shape, len(pdb_block)) - - # Construct, fill and return the pdb array - shape = len(mol_list), shape - ret = np.zeros(shape, dtype='S80') - for i, item in enumerate(pdb_list): - ret[i][:len(item)] = item - - return ret - - -def from_pdb_array(array, rdmol=True): - """ Converts an array with a (partially) de-serialized .pdb file into an - RDKit or PLAMS molecule. - - :parameter array: A (partially) de-serialized .pdb file with *n* lines. - :type array: *n* |np.ndarray|_ [|np.bytes|_ / S80] - :parameter bool rdmol: If *True*, return an RDKit molecule instead of a PLAMS molecule. - :return: A PLAMS or RDKit molecule build from **array**. - :rtype: |plams.Molecule|_ or |rdkit.Chem.Mol|_ - """ - pdb_str = ''.join([item.decode() + '\n' for item in array if item]) - ret = Chem.MolFromPDBBlock(pdb_str, removeHs=False, proximityBonding=False) - if not rdmol: - return molkit.from_rdmol(ret) - return ret - - -def sanitize_yaml_settings(settings, job_type): - """ Remove a predetermined set of unwanted keys and values from a settings object. - - :param settings: A settings object with, potentially, undesired keys and values. - :type settings: |plams.Settings|_ (superclass: |dict|_) - :return: A (nested) dictionary with unwanted keys and values removed. 
- :rtype: |dict|_ - """ - def recursive_del(s, s_del): - for key in s: - if key in s_del: - if isinstance(s_del[key], dict): - recursive_del(s[key], s_del[key]) - else: - del s[key] - if not s[key]: - del s[key] - - # Prepare a blacklist of specific keys - blacklist = get_template('settings_blacklist.yaml') - settings_del = blacklist['generic'] - settings_del.update(blacklist[job_type]) - - # Recursivelly delete all keys from **s** if aforementioned keys are present in the s_del - recursive_del(settings, settings_del) - return settings - - -def _create_csv(path, database='ligand'): - """ Create a ligand or QD database (csv format) and, if it does not exist, and return - its absolute path. - - :param str path: The path to the database. - :param str database: The type of database, accepted values are *ligand* and *qd*. - :return: The absolute path to the ligand or QD database. - :rtype: |str|_ - """ - path = join(path, database + '_database.csv') - - # Check if the database exists and has the proper keys; create it if it does not - if not isfile(path): - print(get_time() + database + '_database.csv not found in ' + - path + ', creating ' + database + ' database') - if database == 'ligand': - _create_csv_lig(path) - elif database == 'QD': - _create_csv_qd(path) - else: - raise TypeError(str(database) + " is not an accepated value for the 'database' \ - argument") - return path - - -def _create_csv_lig(path): - """ Create a ligand database and and return its absolute path. - - :param str path: The path to the database. 
- """ - idx = pd.MultiIndex.from_tuples([('-', '-')], names=['smiles', 'anchor']) - - columns = pd.MultiIndex.from_tuples( - [('hdf5 index', ''), ('formula', ''), ('opt', ''), ('settings', 1)], - names=['index', 'sub index'] - ) - - df = pd.DataFrame(None, index=idx, columns=columns) - df['hdf5 index'] = -1 - df['formula'] = 'str' - df['settings'] = 'str' - df['opt'] = False - df.to_csv(path) - - -def _create_csv_qd(path): - """ Create a QD database and and return its absolute path. - - :param str path: The path to the database. - """ - idx = pd.MultiIndex.from_tuples( - [('-', '-', '-', '-')], - names=['core', 'core anchor', 'ligand smiles', 'ligand anchor'] - ) - - columns = pd.MultiIndex.from_tuples( - [('hdf5 index', ''), ('ligand count', ''), ('opt', ''), ('settings', 1), ('settings', 2)], - names=['index', 'sub index'] - ) - - df = pd.DataFrame(None, index=idx, columns=columns) - df['hdf5 index'] = -1 - df['ligand count'] = -1 - df['settings'] = 'str' - df['opt'] = False - df.to_csv(path) - - -def _create_hdf5(path, name='structures.hdf5'): - """ Create a pdb structure database (hdf5 format), populate it with the *core*, *ligand* - and *QD* datasets and finally return its absolute path. - - :param str path: The path to the database. - :param str name: The filename of the database (excluding its path) - :return: The absolute path to the pdb structure database. 
- :rtype: |str|_ - """ - # Define arguments for 2D datasets - path = join(path, name) - dataset_names = ('core', 'core_no_opt', 'ligand', 'ligand_no_opt', 'QD', 'QD_no_opt', ) - kwarg = {'chunks': True, 'maxshape': (None, None), 'compression': 'gzip'} - - # Create new 2D datasets - with h5py.File(path, 'a') as f: - for name in dataset_names: - if name not in f: - f.create_dataset(name=name, data=np.empty((0, 1), dtype='S80'), **kwarg) - - # Define arguments for 3D datasets - dataset_names_3d = ('job_settings_crs', 'job_settings_QD_opt', 'job_settings_BDE') - kwarg_3d = {'chunks': True, 'maxshape': (None, None, None), 'compression': 'gzip'} - - # Create new 3D datasets - with h5py.File(path, 'a') as f: - for name in dataset_names_3d: - if name not in f: - f.create_dataset(name=name, data=np.empty((0, 1, 1), dtype='S120'), **kwarg_3d) - - return path - - -def _create_yaml(path, name='job_settings.yaml'): - """ Create a job settings database (.yaml - - :param str path: The path to the database. - :param str name: The filename of the database (excluding its path) - :return: The absolute path to the pdb structure database. - :rtype: |str|_ - """ - # Define arguments - path = join(path, name) - - # Create a new .yaml file if it does not yet exist - if not isfile(path): - with open(path, 'w') as f: - f.write(yaml.dump({None: [None]}, default_flow_style=False, indent=4)) - return path - - -def _create_mongodb(host: str = 'localhost', - port: int = 27017, - **kwargs: Dict[str, Any]) -> dict: - """Create the the MongoDB collections and set their index. - - Paramaters - ---------- - host : |str|_ - Hostname or IP address or Unix domain socket path of a single mongod or - mongos instance to connect to, or a mongodb URI, or a list of hostnames mongodb URIs. - If **host** is an IPv6 literal it must be enclosed in ``"["`` and ``"["`` characters - following the RFC2732 URL syntax (e.g. ``"[::1]"`` for localhost). - Multihomed and round robin DNS addresses are not supported. 
- - port : |str|_ - port number on which to connect. - - kwargs : |dict|_ - Optional keyword argument for `pymongo.MongoClient `_. # noqa - - Returns - ------- - |dict|_ - A dictionary with all supplied keyword arguments. - - Raises - ------ - ServerSelectionTimeoutError - Raised if no connection can be established with the host. - - """ - # Open the client - client = MongoClient(host, port, serverSelectionTimeoutMS=5000, **kwargs) - client.server_info() # Raises an ServerSelectionTimeoutError error if the server is inaccesible - - # Open the database - db = client.cat_database - - # Open and set the index of the ligand collection - lig_collection = db.ligand_database - lig_key = 'smiles_1_anchor_1' - if lig_key not in lig_collection.index_information(): - lig_collection.create_index([ - ('smiles', ASCENDING), - ('anchor', ASCENDING) - ], unique=True) - - # Open and set the index of the QD collection - qd_collection = db.QD_database - qd_key = 'core_1_core anchor_1_ligand smiles_1_ligand anchor_1' - if qd_key not in qd_collection.index_information(): - qd_collection.create_index([ - ('core', ASCENDING), - ('core anchor', ASCENDING), - ('ligand smiles', ASCENDING), - ('ligand anchor', ASCENDING) - ], unique=True) - - # Return all provided keyword argument - ret = {'host': host, 'port': port} - ret.update(kwargs) - return ret diff --git a/CAT/data_handling/entry_points.py b/CAT/data_handling/entry_points.py new file mode 100644 index 00000000..a56aea83 --- /dev/null +++ b/CAT/data_handling/entry_points.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +""" +CAT.data_handling.input_parser +============================== + +Entry point for CAT. + +Index +----- +.. currentmodule:: CAT.data_handling.input_parser +.. autosummary:: + extract_args + main + +API +--- +.. autofunction:: extract_args +.. 
autofunction:: main + +""" + +import argparse +from os import getcwd +from os.path import (join, exists) +from typing import (Optional, List) + +import yaml + +from scm.plams.core.settings import Settings +import CAT + + +def extract_args(args: Optional[List[str]] = None) -> Settings: + """Extract and return all arguments.""" + input_file = args.YAML[0] + if exists(input_file): + pass + elif exists(join(getcwd(), input_file)): + input_file = join(getcwd(), input_file) + else: + input_file2 = join(getcwd(), input_file) + raise FileNotFoundError(f'No file found at {input_file} or {input_file2}') + + with open(input_file, 'r') as file: + return Settings(yaml.load(file, Loader=yaml.FullLoader)) + + +def main(args: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser( + prog='CAT', + usage='init_cat my_settings_file.yaml', + description=('Description: This script initalizes ' + 'the Compound Attachment Tool (CAT).') + ) + + parser.add_argument( + 'YAML', nargs=1, type=str, metavar='input.yaml', + help='Required: A .yaml file with the settings for CAT' + ) + + args = parser.parse_args(args) + CAT.base.prep(extract_args(args), return_mol=False) diff --git a/CAT/data_handling/input_parser.py b/CAT/data_handling/input_parser.py deleted file mode 100755 index e28c1caa..00000000 --- a/CAT/data_handling/input_parser.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -from os import getcwd -from os.path import (join, exists) - -import argparse -import yaml - -from scm.plams.core.settings import Settings -import CAT - - -def extract_args(args): - """ Extract and return all arguments. 
""" - input_file = args.YAML[0] - if exists(input_file): - pass - elif exists(join(getcwd(), input_file)): - input_file = join(getcwd(), input_file) - else: - error = 'No file found at ' + input_file + ' or ' + join(getcwd(), input_file) - raise FileNotFoundError(error) - - with open(input_file, 'r') as file: - return Settings(yaml.load(file, Loader=yaml.FullLoader)) - - -def main(args=None): - parser = argparse.ArgumentParser( - prog='CAT', - usage='init_cat my_settings_file.yaml', - description='Description: This script initalizes \ - the Compound Attachment/Analysis Tool.' - ) - - parser.add_argument( - 'YAML', - nargs='+', - type=str, - help='A .yaml file with the settings for CAT' - ) - - args = parser.parse_args(args) - args = extract_args(args) - - CAT.base.prep(args) diff --git a/CAT/data_handling/input_sanitizer.py b/CAT/data_handling/input_sanitizer.py deleted file mode 100644 index 87d97ed8..00000000 --- a/CAT/data_handling/input_sanitizer.py +++ /dev/null @@ -1,568 +0,0 @@ -""" A module designed for sanitizing and interpreting the input file. 
""" - -__all__ = ['sanitize_optional', 'sanitize_input_mol', 'sanitize_path'] - -import os -from os.path import (join, isdir, isfile, exists) -from itertools import chain - -import yaml -import numpy as np -from schema import (Schema, Or, And, Use) - -from scm.plams.interfaces.adfsuite.adf import ADFJob -from scm.plams.interfaces.adfsuite.ams import AMSJob -from scm.plams.interfaces.adfsuite.uff import UFFJob -from scm.plams.interfaces.adfsuite.band import BANDJob -from scm.plams.interfaces.adfsuite.dftb import DFTBJob -from scm.plams.interfaces.adfsuite.mopac import MOPACJob -from scm.plams.interfaces.adfsuite.reaxff import ReaxFFJob - -from scm.plams.interfaces.thirdparty.cp2k import Cp2kJob -from scm.plams.interfaces.thirdparty.orca import ORCAJob -from scm.plams.interfaces.thirdparty.dirac import DiracJob -from scm.plams.interfaces.thirdparty.gamess import GamessJob -from scm.plams.interfaces.thirdparty.dftbplus import DFTBPlusJob - -from scm.plams.mol.molecule import Molecule -from scm.plams.core.basejob import Job -from scm.plams.core.settings import Settings -from scm.plams.tools.periodic_table import PeriodicTable -import scm.plams.interfaces.molecule.rdkit as molkit - -from rdkit import Chem - -from .. import utils as CAT - -from ..utils import get_time -from ..mol_utils import to_atnum -from ..analysis.crs import CRSJob - - -""" ################################### Sanitize path ####################################### """ - - -def sanitize_path(arg): - """ Sanitize and return the settings of arg.path. 
""" - if arg.path is None: - arg.path = os.getcwd() - return arg - elif isinstance(arg.path, str): - if arg.path.lower() in ('none', '.', 'pwd', '$pwd', 'cwd'): - arg.path = os.getcwd() - elif not os.path.exists(arg.path): - raise FileNotFoundError(get_time() + "path '{}' not found".format(arg.path)) - elif os.path.isfile(arg.path): - raise OSError(get_time() + "path '{}' is a file, not a directory".format(arg.path)) - return arg - - else: - error = "arg.path should be None or a string, '{}' is not a valid type" - raise TypeError(error.format(arg.path.__class__.__name__)) - - -""" ########################## Sanitize input_ligands & input_cores ######################## """ - - -def sanitize_input_mol(arg): - """ Sanitize and return the settings of arg.input_cores & arg.input_ligands. """ - core_path = arg.optional.core.dirname - arg.input_cores = get_mol_defaults(arg.input_cores, path=core_path, core=True) - arg.input_cores = sanitize_mol_type(arg.input_cores) - - ligand_path = arg.optional.ligand.dirname - arg.input_ligands = get_mol_defaults(arg.input_ligands, path=ligand_path, core=False) - arg.input_ligands = sanitize_mol_type(arg.input_ligands) - - return arg - - -def get_mol_defaults(mol_list, path=None, core=False): - """ Prepare the default input settings for a molecule. 
""" - key_dict = { - 'guess_bonds': val_bool, - 'is_core': val_bool, - 'column': val_int, - 'row': val_int, - 'indices': val_indices, - 'type': val_type, - 'name': val_string, - 'path': val_string, - } - - ret = [] - for mol in mol_list: - tmp = get_default_input_mol() - tmp.mol = mol - tmp.path = path - tmp.is_core = core - - if not isinstance(mol, dict): - ret.append(tmp) - continue - - for k1, v1 in mol.items(): - tmp.mol = k1 - for k2, v2 in v1.items(): - try: - tmp[k2] = key_dict[k2](v2) - except KeyError: - raise KeyError("'{}' is not a valid argument for '{}'".format(str(k2), str(k1))) - if k2 == 'guess_bonds': - tmp.tmp_guess = True - - ret.append(tmp) - return ret - - -def sanitize_mol_type(input_mol): - """ Sanitize and return the (file) type of the input molecule (SMILES, .xyz, dir, etc...). """ - for mol in input_mol: - # Figure out the (file) type and mol name - try: - if isfile(join(mol.path, mol.mol)): - mol.type = mol.mol.rsplit('.', 1)[-1] - mol.name = mol.mol.rsplit('.', 1)[0] - mol.mol = join(mol.path, mol.mol) - if mol.type == 'xyz' and not mol.get('tmp_guess'): - mol.guess_bonds = True - elif isdir(join(mol.path, mol.mol)): - mol.type = 'folder' - mol.name = mol.mol - mol.mol = join(mol.path, mol.mol) - elif isfile(mol.mol): - mol.type = mol.mol.rsplit('.', 1)[-1] - mol.name = mol.mol.rsplit('.', 1)[0].rsplit('/', 1)[-1].rsplit('\\', 1)[-1] - elif isdir(mol.mol): - mol.type = 'folder' - mol.name = mol.mol - else: - mol.type = 'smiles' - mol.name = santize_smiles(mol.mol) - except TypeError: - if isinstance(mol.mol, Molecule): - mol.type = 'plams_mol' - if not mol.properties.name: - mol.name = Chem.MolToSmiles(Chem.RemoveHs(molkit.to_rdmol(mol.mol))) - mol.name = Chem.CanonSmiles(mol.name) - else: - mol.name = mol.properties.name - elif isinstance(mol.mol, Chem.rdchem.Mol): - mol.type = 'rdmol' - mol.name = Chem.CanonSmiles(Chem.MolToSmiles(Chem.RemoveHs(mol.mol))) - - return input_mol - - -def get_default_input_mol(): - """ Return the default 
settings of arg.input_cores & arg.input_ligands. """ - ret = yaml.load(""" - mol: None - name: None - path: None - guess_bonds: False - is_core: False - column: 0 - row: 0 - indices: None - type: None - """, Loader=yaml.FullLoader) - - for key in ret: - if ret[key] == 'None': - ret[key] = None - - return Settings(ret) - - -def santize_smiles(string): - """ Sanitize a SMILES string: turn it into a valid filename. """ - name = string.replace('(', '[').replace(')', ']') - cis_trans = [item for item in string if item == '/' or item == '\\'] - if cis_trans: - cis_trans = [item + cis_trans[i*2+1] for i, item in enumerate(cis_trans[::2])] - cis_trans_dict = {'//': 'trans-', '/\\': 'cis-'} - for item in cis_trans[::-1]: - name = cis_trans_dict[item] + name - name = name.replace('/', '').replace('\\', '') - - return name - - -""" #################################### Sanitize optional ################################## """ - - -def sanitize_optional(arg_dict): - """ Sanitize and return the settings of arg.optional. 
""" - arg = get_default_optional() - arg.update(arg_dict) - - mol_format = ('xyz', 'pdb') - - # Validate arguments consisting of booleans, integers, strings and/or iterables - arg.optional.core.dirname = val_dir_names(arg.optional.core.dirname, arg.path) - arg.optional.core.dummy = val_atnum(arg.optional.core.dummy) - arg.optional.database.dirname = val_dir_names(arg.optional.database.dirname, arg.path) - arg.optional.database.read = val_data(arg.optional.database.read) - arg.optional.database.write = val_data(arg.optional.database.write) - arg.optional.database.overwrite = val_data(arg.optional.database.overwrite) - arg.optional.database.mol_format = val_format(arg.optional.database.mol_format, mol_format) - arg.optional.database.mongodb = val_mongo(arg.optional.database.mongodb) - arg.optional.ligand.dirname = val_dir_names(arg.optional.ligand.dirname, arg.path) - arg.optional.ligand.optimize = val_bool(arg.optional.ligand.optimize) - arg.optional.ligand.split = val_bool(arg.optional.ligand.split) - arg.optional.qd.dirname = val_dir_names(arg.optional.qd.dirname, arg.path) - arg.optional.qd.activation_strain = val_bool(arg.optional.qd.activation_strain) - - # Prepares COSMO-RS default settings - s2 = CAT.get_template('qd.yaml')['COSMO-RS activity coefficient'] - try: - j1 = arg.optional.ligand['cosmo-rs'].job1 - if 'adf' in j1 or 'ADF' in j1: - s1 = Settings() - s2.update(CAT.get_template('crs.yaml')['ADF combi2005']) - else: - s1 = CAT.get_template('qd.yaml')['COSMO-MOPAC'] - s2.update(CAT.get_template('crs.yaml')['MOPAC PM6']) - except AttributeError: - s1 = CAT.get_template('qd.yaml')['COSMO-MOPAC'] - s2.update(CAT.get_template('crs.yaml')['MOPAC PM6']) - - # Validate arguments containing job recipes - arg.optional.ligand.crs = val_job(arg.optional.ligand['cosmo-rs'], - job1=AMSJob, - job2=CRSJob, - s1=s1, - s2=s2) - del arg.optional.ligand['cosmo-rs'] - - arg.optional.qd.optimize = val_job(arg.optional.qd.optimize, - job1=AMSJob, - job2=AMSJob, - 
s1=CAT.get_template('qd.yaml')['UFF'], - s2=CAT.get_template('qd.yaml')['UFF']) - - arg.optional.qd.dissociate = val_dissociate(arg.optional.qd.dissociate) - - del arg.path - return arg - - -def get_default_optional(): - """ Return the default settings of arg.optional. """ - ret = yaml.load(""" - optional: - database: - dirname: database - read: True - write: True - overwrite: False - mol_format: [pdb, xyz] - mongodb: False - - core: - dirname: core - dummy: Cl - - ligand: - dirname: ligand - optimize: True - cosmo-rs: False - split: True - - qd: - dirname: QD - optimize: False - activation_strain: False - dissociate: False - """, Loader=yaml.FullLoader) - - return Settings(ret) - - -def get_default_dissociate(): - """ Return the default settings of arg.optional. """ - ret = yaml.load(""" - core_atom: Cd - lig_count: 2 - core_core_dist: 5.0 - lig_core_dist: 5.0 - topology: - 7: vertice - 8: edge - 10: face - - job1: AMSJob - s1: True - job2: AMSJob - s2: True - """, Loader=yaml.FullLoader) - - return Settings(ret) - - -str_to_class = { - 'adf': ADFJob, 'adfjob': ADFJob, - 'ams': AMSJob, 'amsjob': AMSJob, - 'uff': UFFJob, 'uffjob': UFFJob, - 'band': BANDJob, 'bandjob': BANDJob, - 'dftb': DFTBJob, 'dftbjob': DFTBJob, - 'mopac': MOPACJob, 'mopacjob': MOPACJob, - 'reaxff': ReaxFFJob, 'reaxffjob': ReaxFFJob, - 'cp2k': Cp2kJob, 'cp2kjob': Cp2kJob, - 'orca': ORCAJob, 'orcajob': ORCAJob, - 'dirac': DiracJob, 'diracjob': DiracJob, - 'gamess': GamessJob, 'gamessjob': GamessJob, - 'dftbplus': DFTBPlusJob, 'dftbplusjob': DFTBPlusJob, - 'crs': CRSJob, 'cosmo-rs': CRSJob, 'crsjob': CRSJob -} - - -def val_mongo(arg: Settings) -> Settings: - """Validate database.mongodb.""" - if arg is None: - return - - arg.soft_update({ - 'host': 'localhost', - 'port': 27017, - 'username': None, - 'password': None - }) - - schema = Schema({ - 'host': str, - 'port': Or(int, None), - 'username': Or(str, None), - 'password': Or(str, None) - }) - ret = schema.validate(arg) - - user = 
ret.pop('username') - passwd = ret.pop('password') - if ret.username and ret.password: - hostname = ret.host - ret.host = f"mongodb://{user}:{passwd}@{hostname}/" - return ret - - -def val_format(arg, ref): - """ Validate database.mol_format & database_format. """ - schema = Schema(Or( - And(None, Use(bool)), - And(bool, lambda n: n is False), - And(str, lambda n: not n, Use(bool)), - And(str, lambda n: n.lower().rsplit('.', 1)[-1] in ref), - And([str], lambda n: [i.lower().rsplit('.', 1)[-1] in ref for i in n], Use(list)) - )) - - # Decapitalize and remove any periods. - ret = schema.validate(arg) - if isinstance(ret, list): - for i, item in enumerate(ret): - ret[i] = item.lower().rsplit('.', 1)[-1] - ret = tuple(ret) - elif isinstance(ret, str): - ret = (ret.lower().rsplit('.', 1)[-1]) - elif not ret: - ret = () - - return ret - - -def val_data(arg): - """ Validate the input arguments for database.read, write and overwrite. - Returns *False* or tuple with *ligand*, *core* and/or *qd*. - """ - ref = ('ligand', 'core', 'qd') - - def get_arg(n): - if n: - return ref - else: - return False - - def get_false(n): - return False - - schema = Schema(Or( - And(bool, Use(get_arg)), - And(str, lambda n: not n, Use(bool)), - And(str, lambda n: n.lower() in ref, Use(list)), - And([str], lambda n: not any([bool(i) for i in n]), Use(get_false)), - And([str], lambda n: [i.lower() in ref for i in n], Use(list)) - )) - - # Decapitalize - ret = schema.validate(arg) - if isinstance(ret, list): - for i, item in enumerate(ret): - ret[i] = item.lower() - ret = tuple(ret) - elif not ret: - ret = () - - return ret - - -def val_type(file_type): - """ Validate a the fle type, returns a or . """ - return Schema(Or(str, None, Molecule, Chem.rdchem.Mol)).validate(file_type) - - -def val_int(integer): - """ Validate a positive integer; returns an . 
""" - schema = Schema(And([int], lambda n: n >= 0)) - return schema.validate(integer) - - -def val_string(string): - """ Validate a string; returns a . """ - return Schema(str).validate(string) - - -def val_indices(indices): - """ Validate an iterable consisting if integers; returns a consisting of 3 . """ - if indices is None: - return tuple() - schema = Schema(And([int], Use(tuple), lambda n: [i >= 0 for i in n])) - return schema.validate(list(indices)) - - -def val_dir_names(dirname, path): - """ Validate a str; returns a str. - Creates a directory at path/dirname if it does not yet exist. """ - ret = join(path, Schema(str).validate(dirname)) - if not exists(ret): - os.makedirs(ret) - else: - assert isdir(ret) - return ret - - -def val_atnum(atnum): - """ Validate an atomic number or symbol; returns an atomic number . """ - at_gen = chain.from_iterable([[i, j[0]] for i, j in enumerate(PeriodicTable.data)]) - schema = Schema(And(Or(int, str), lambda n: n in at_gen, Use(to_atnum))) - return schema.validate(atnum) - - -def val_bool(my_bool): - """ Validate a boolean; returns a . """ - return Schema(bool).validate(my_bool) - - -def val_job(job, job1=None, job2=None, s1=None, s2=None): - """ Validate a job recipe. - Returns a dictionary: {'job1': , 'job2': , 's1': , 's2': }. 
""" - # Validate the object type - Schema(Or(bool, dict)).is_valid(job) - if isinstance(job, bool): - if job is False: - return job - job = {'job1': True, 's1': True, - 'job2': True, 's2': True} - - # Validate the object types of the various elements - schema = Schema({'job1': Or(None, Job, str), - 's1': Or(None, dict), - 'job2': Or(None, Job, str), - 's2': Or(None, dict)}) - schema.is_valid(job) - - # Assign proper default settings - str_to_def = {'job1': job1, 'job2': job2, 's1': s1, 's2': s2} - for k, v in job.items(): - if v is True: - job[k] = str_to_def[k] - elif not v: - job[k] = False - elif isinstance(v, str): - try: - job[k] = str_to_class[v.lower()] - except KeyError: - raise KeyError(get_time() + 'No Job-derived object exists for the string:', v - + ', please provide the actual object instead of ') - elif isinstance(v, (type, dict)): - pass - else: - raise TypeError(get_time() + str(type(v)), 'is an unspported object type') - return job - - -def val_core_idx(idx): - if not idx: - return False - elif isinstance(idx, (int, np.integer)): - return [idx] - else: - ret = list(idx) - assert isinstance(ret[0], (int, np.integer)) - return sorted(ret) - - -def val_dissociate(dissociate): - """ Validate the optional.qd.dissociate block in the input file. 
""" - ret = get_default_dissociate() - if dissociate is True: - dissociate = Settings() - elif dissociate is False: - return False - - ret.update(dissociate) - if dissociate.topology: - ret.topology = dissociate.topology - - if ret.job1 is False or ret.s1 is False: - return False - - # Interpret optional arguments - ret.core_index = val_core_idx(ret.core_index) - ret.core_atom = to_atnum(ret.core_atom) - ret.lig_count = int(ret.lig_count) - ret.core_core_dist = float(ret.core_core_dist) - ret.lig_core_dist = float(ret.lig_core_dist) - assert isinstance(ret.topology, dict) - for key in ret.topology: - assert isinstance(key, (int, np.integer)) - assert isinstance(ret.topology[key], str) - - # Interpret job1 - assert isinstance(ret.job1, (bool, type, str)) - if ret.job1 is True: - ret.job1 = AMSJob - elif isinstance(ret.job1, str): - ret.job1 = str_to_class[ret.job1.lower()] - - # Interpret job2 - assert isinstance(ret.job2, (bool, type, str)) - if ret.job2 is True: - ret.job2 = AMSJob - elif ret.job2 is False: - ret.s2 = False - elif isinstance(ret.job2, str): - ret.job2 = str_to_class[ret.job2.lower()] - - # Interpret s1 - assert isinstance(ret.s1, (bool, dict, str)) - if ret.s1 is True: - ret.s1 = CAT.get_template('qd.yaml')['MOPAC'] - elif isinstance(ret.s1, str): - if isfile(ret.s1): - ret.s1 = CAT.get_template(ret.s1, from_cat_data=False) - else: - raise FileNotFoundError(get_time() + str(ret.s1) + ' was not found') - - # Interpret s2 - assert isinstance(ret.s2, (bool, dict, str)) - if ret.s2 is True: - ret.s2 = CAT.get_template('qd.yaml')['UFF'] - elif ret.s2 is False: - ret.job2 = False - elif isinstance(ret.s2, str): - if isfile(ret.s2): - ret.s2 = CAT.get_template(ret.s2, from_cat_data=False) - else: - raise FileNotFoundError(get_time() + str(ret.s1) + ' was not found') - - return ret diff --git a/CAT/data_handling/mol_import.py b/CAT/data_handling/mol_import.py index dbe1859e..5b45eff3 100644 --- a/CAT/data_handling/mol_import.py +++ 
b/CAT/data_handling/mol_import.py @@ -1,26 +1,76 @@ -""" A module related to the importing of molecules. """ - -__all__ = ['read_mol', 'set_mol_prop'] +""" +CAT.data_handling.mol_import +============================ + +A module related to the importing of molecules. + +Index +----- +.. currentmodule:: CAT.data_handling.mol_import +.. autosummary:: + read_mol + read_mol_xyz + read_mol_pdb + read_mol_mol + read_mol_smiles + read_mol_plams + read_mol_rdkit + read_mol_folder + read_mol_txt + get_charge_dict + set_mol_prop + set_atom_prop + print_exception + +API +--- +.. autofunction:: read_mol +.. autofunction:: read_mol_xyz +.. autofunction:: read_mol_pdb +.. autofunction:: read_mol_mol +.. autofunction:: read_mol_smiles +.. autofunction:: read_mol_plams +.. autofunction:: read_mol_rdkit +.. autofunction:: read_mol_folder +.. autofunction:: read_mol_txt +.. autofunction:: get_charge_dict +.. autofunction:: set_mol_prop +.. autofunction:: set_atom_prop +.. autofunction:: print_exception + +""" import os import itertools -from typing import Dict from string import ascii_letters +from typing import (Dict, Iterable, List, Callable, Sequence, Optional) -from scm.plams.mol.molecule import Molecule -from scm.plams.core.errors import PlamsError +from scm.plams import (Molecule, Atom, Settings) import scm.plams.interfaces.molecule.rdkit as molkit from rdkit import Chem from ..utils import get_time -from ..data_handling.input_sanitizer import (sanitize_mol_type, get_mol_defaults) +from ..data_handling.validate_mol import validate_mol +__all__ = ['read_mol', 'set_mol_prop'] + + +def read_mol(input_mol: Iterable[Settings]) -> List[Molecule]: + """Checks the filetypes of the input molecules. + + Sets the molecules' properties and returns a list of plams molecules. + + Parameters + ---------- + input_mol : |list|_ [|Settings|_] + An iterable consisting of dictionaries with input settings per mol. + + Returns + ------- + |plams.Molecule|_ + A list of plams Molecules. 
-def read_mol(input_mol): - """ - Checks the filetypes of the input molecules, sets their properties and - returns a list of plams molecules. """ # Creates a dictionary of file extensions extension_dict = { @@ -40,100 +90,191 @@ def read_mol(input_mol): try: read_mol = extension_dict[mol_dict.type] except KeyError as ex: - print(get_time() + ex.__class__.__name__ + ':\t' + str(ex) + '\n') - read_mol = False - - if not read_mol: # Unrecognized input type + print(get_time() + f'{ex.__class__.__name__}:\t {ex}\n') continue mol = read_mol(mol_dict) if not mol: # Failed to import any molecules continue - if isinstance(mol, list): # if mol is a list of molecules - mol_list += mol - else: # if mol is a PLAMS molecule + if isinstance(mol, Molecule): # if mol is a PLAMS molecule if mol_dict.guess_bonds: mol.guess_bonds() set_mol_prop(mol, mol_dict) mol_list.append(mol) + else: # if mol is a list of molecules + mol_list += mol return mol_list -def read_mol_xyz(mol): - """ Read an .xyz file """ +def read_mol_xyz(mol_dict: Settings) -> Optional[Molecule]: + """Read an .xyz file.""" try: - return Molecule(mol.mol, inputformat='xyz') - except (Exception, PlamsError) as ex: - print_exception(read_mol_xyz.__code__, ex, mol.mol) + mol = Molecule(mol_dict.mol, inputformat='xyz') + if mol_dict.guess_bonds: + mol.guess_bonds() + canonicalize_mol(mol) + return mol + except Exception as ex: + print_exception(read_mol_xyz.__code__, ex, mol_dict.mol) -def read_mol_pdb(mol): - """ Read a .pdb file """ +def read_mol_pdb(mol_dict: Settings) -> Optional[Molecule]: + """Read a .pdb file.""" try: - return molkit.readpdb(mol.mol) - except (Exception, PlamsError) as ex: - print_exception(read_mol_pdb.__code__, ex, mol.mol) + mol = molkit.readpdb(mol_dict.mol) + if mol_dict.guess_bonds: + mol.guess_bonds() + canonicalize_mol(mol) + return mol + except Exception as ex: + print_exception(read_mol_pdb.__code__, ex, mol_dict.mol) -def read_mol_mol(mol, mol_dict): - """ Read a .mol file """ +def 
read_mol_mol(mol_dict: Settings) -> Optional[Molecule]: + """Read a .mol file.""" try: - return molkit.from_rdmol(Chem.MolFromMolFile(mol.mol, removeHs=False)) - except (Exception, PlamsError) as ex: - print_exception(read_mol_mol.__code__, ex, mol.mol) + mol = molkit.from_rdmol(Chem.MolFromMolFile(mol_dict.mol, removeHs=False)) + if mol_dict.guess_bonds: + mol.guess_bonds() + canonicalize_mol(mol) + return mol + except Exception as ex: + print_exception(read_mol_mol.__code__, ex, mol_dict.mol) -def read_mol_smiles(mol): - """ Read a SMILES string """ +def read_mol_smiles(mol_dict: Settings) -> Optional[Molecule]: + """Read a SMILES string.""" try: - return molkit.from_smiles(mol.mol) - except (Exception, PlamsError) as ex: - print_exception(read_mol_smiles.__code__, ex, mol.mol) + mol = molkit.from_smiles(mol_dict.mol) + if mol_dict.guess_bonds: + mol.guess_bonds() + return mol + except Exception as ex: + print_exception(read_mol_smiles.__code__, ex, mol_dict.mol) -def read_mol_plams(mol): - """ Read a PLAMS molecule """ +def read_mol_plams(mol_dict: Settings) -> Optional[Molecule]: + """Read a PLAMS molecule.""" try: - return mol.mol - except (Exception, PlamsError) as ex: - print_exception(read_mol_plams.__code__, ex, mol.mol) + mol = mol_dict.mol + if mol_dict.guess_bonds: + mol.guess_bonds() + canonicalize_mol(mol) + return mol + except Exception as ex: + print_exception(read_mol_plams.__code__, ex, mol_dict.mol) -def read_mol_rdkit(mol): - """ Read a RDKit molecule """ +def read_mol_rdkit(mol_dict: Settings) -> Optional[Molecule]: + """Read a RDKit molecule.""" try: - return molkit.from_rdmol(mol.mol) - except (Exception, PlamsError) as ex: - print_exception(read_mol_rdkit.__code__, ex, mol.mol) + mol = molkit.from_rdmol(mol_dict.mol) + if mol_dict.guess_bonds: + mol.guess_bonds() + canonicalize_mol(mol) + return mol + except Exception as ex: + print_exception(read_mol_rdkit.__code__, ex, mol_dict.mol) -def read_mol_folder(mol): - """ Read all files (.xyz, 
.pdb, .mol, .txt or further subfolders) within a folder """ +def read_mol_folder(mol_dict: Settings) -> Optional[Molecule]: + """Read all files (.xyz, .pdb, .mol, .txt or further subfolders) within a folder.""" try: - file_list = [file for file in os.listdir(mol.mol)] - input_mol = get_mol_defaults(file_list, path=mol.path, core=mol.is_core) - input_mol = sanitize_mol_type(input_mol) - return read_mol(input_mol) - except (Exception, PlamsError) as ex: - print_exception(read_mol_folder.__code__, ex, mol.mol) + mol_type = 'input_cores' if mol_dict.is_core else 'input_ligands' + _file_list = os.listdir(mol_dict.mol) + optional_dict = Settings({k: v for k, v in mol_dict.items() if k not in ('mol', 'path')}) + file_list = [{i: optional_dict} for i in _file_list] -def read_mol_txt(mol): + validate_mol(file_list, mol_type, mol_dict.path) + return read_mol(file_list) + except Exception as ex: + print_exception(read_mol_folder.__code__, ex, mol_dict.mol) + + +def read_mol_txt(mol_dict: Settings) -> Optional[Molecule]: """Read a plain text file containing one or more SMILES strings.""" try: - with open(mol.mol, 'r') as file: - file_list = file.read().splitlines() - file_list = [file.split()[mol.column] for file in file_list[mol.row:] if file] - input_mol = get_mol_defaults(file_list, path=mol.path, core=mol.is_core) - input_mol = sanitize_mol_type(input_mol) - return read_mol(input_mol) - except (Exception, PlamsError) as ex: - print_exception(read_mol_txt.__code__, ex, mol.mol) + row = 0 if 'row' not in mol_dict else mol_dict.row + column = 0 if 'column' not in mol_dict else mol_dict.column + mol_type = 'input_cores' if mol_dict.is_core else 'input_ligands' + + with open(mol_dict.mol, 'r') as f: + iterator = itertools.islice(f, row, None) + _file_list = [i.rstrip('\n').split()[column] for i in iterator if i] + optional_dict = Settings({k: v for k, v in mol_dict.items() if k not in ('mol', 'path')}) + file_list = [{i: optional_dict} for i in _file_list] + + 
validate_mol(file_list, mol_type, mol_dict.path) + return read_mol(file_list) + except Exception as ex: + print_exception(read_mol_txt.__code__, ex, mol_dict.mol) + + +def canonicalize_mol(mol: Molecule, + inplace: bool = True) -> Optional[Molecule]: + """Take a PLAMS molecule and sort its atoms based on their canonical rank. + + .. _rdkit.Chem.CanonicalRankAtoms: https://www.rdkit.org/docs/source/rdkit.Chem.rdmolfiles.html#rdkit.Chem.rdmolfiles.CanonicalRankAtoms + + Examples + -------- + .. code:: python + + >>> print(mol) # Methane + Atoms: + 1 H 0.640510 0.640510 -0.640510 + 2 H 0.640510 -0.640510 0.640510 + 3 C 0.000000 0.000000 0.000000 + 4 H -0.640510 0.640510 0.640510 + 5 H -0.640510 -0.640510 -0.640510 + + >>> canonicalize_mol(mol) + >>> print(mol) + Atoms: + 1 C 0.000000 0.000000 0.000000 + 2 H -0.640510 -0.640510 -0.640510 + 3 H -0.640510 0.640510 0.640510 + 4 H 0.640510 -0.640510 0.640510 + 5 H 0.640510 0.640510 -0.640510 + + Parameters + ---------- + mol : |plams.Molecule|_ + A PLAMS molecule. + + inplace : bool + If ``True``, perform an inplace update of **mol** rather than returning + a new :class:`Molecule` instance. + + Returns + ------- + |plams.Molecule|_ + Optional: if ``inplace=False``, return a copy of **mol** with its atoms sorted by their + canonical rank. + + See also + -------- + * rdkit.Chem.CanonicalRankAtoms_: Returns the canonical atom ranking for each atom of a + molecule fragment. 
+ + """ # noqa + rdmol = molkit.to_rdmol(mol) + idx_collection = Chem.CanonicalRankAtoms(rdmol) + + # Reverse sort Molecule.atoms by the atomic indices in idx_collection + if inplace: + mol.atoms = [at for _, at in sorted(zip(idx_collection, mol.atoms), reverse=True)] + return + else: + ret = mol.copy() + ret.atoms = [at for _, at in sorted(zip(idx_collection, ret.atoms), reverse=True)] + return ret -def get_charge_dict(): +def get_charge_dict() -> Dict[str, int]: """Create a dictionary of elements and their formal atomic charge.""" # Create a list of atomic charges and elements charges = (1, 2, -3, -2, -1, 2) @@ -157,7 +298,8 @@ def get_charge_dict(): charge_dict: Dict[str, int] = get_charge_dict() -def set_mol_prop(mol, mol_dict): +def set_mol_prop(mol: Molecule, + mol_dict: Settings) -> None: """Set molecular and atomic properties.""" if mol_dict.is_core: residue_name = 'COR' @@ -178,13 +320,15 @@ def set_mol_prop(mol, mol_dict): set_atom_prop(atom, i, residue_name) if not mol.properties.smiles: - tmp = Chem.MolToSmiles(Chem.RemoveHs(molkit.to_rdmol(mol))) - mol.properties.smiles = Chem.CanonSmiles(tmp) + mol.properties.smiles = Chem.MolToSmiles(Chem.RemoveHs(molkit.to_rdmol(mol)), + canonical=True) -def set_atom_prop(atom, i, residue_name): +def set_atom_prop(atom: Atom, + at_id: Sequence[str], + residue_name: str) -> None: """Set atomic properties.""" - symbol = '{:4}'.format(atom.symbol + ''.join(i)) + symbol = '{:4}'.format(atom.symbol + ''.join(at_id)) # Add a number of properties to atom atom.properties.pdb_info.ResidueName = residue_name @@ -201,33 +345,44 @@ def set_atom_prop(atom, i, residue_name): atom.properties.pdb_info.IsHeteroAtom = True # Sets the formal atomic charge - if not atom.properties.charge: - if atom.symbol in charge_dict: - total_bonds = int(sum([bond.order for bond in atom.bonds])) - default_charge = charge_dict[atom.symbol] - sign = int(-1 * default_charge / abs(default_charge)) - atom.properties.charge = default_charge + 
sign*total_bonds - - # Update formal atomic charges for hypervalent atoms - if total_bonds > abs(default_charge): - if total_bonds is abs(default_charge) + 2: - atom.properties.charge += sign*2 - elif total_bonds is abs(default_charge) + 4: - atom.properties.charge += sign*4 - elif total_bonds >= abs(default_charge) + 6: - atom.properties.charge += sign*6 - else: - atom.properties.charge = 0 - - -def print_exception(func, ex, name): + if atom.properties.charge: + return + + # Default to a charge of 0 if no charge is available for that specific element + if atom.symbol not in charge_dict: + atom.properties.charge = 0 + return + + # Update the charge of non-hypervalent atoms + total_bonds = int(sum([bond.order for bond in atom.bonds])) + default_charge = charge_dict[atom.symbol] + abs_charge = abs(default_charge) + sign = -1 * int(default_charge / abs_charge) + + # Take the default charge and correct for the number (and order) of bonds + atom.properties.charge = default_charge + sign * total_bonds + if total_bonds <= abs_charge: + return + + # Update formal atomic charges for hypervalent atoms + if total_bonds is abs_charge + 2: + atom.properties.charge += 2 * sign + elif total_bonds is abs_charge + 4: + atom.properties.charge += 4 * sign + elif total_bonds >= abs_charge + 6: + atom.properties.charge += 6 * sign + return + + +def print_exception(func: Callable, + ex: Exception, + name: str) -> None: """Manages the printing of exceptions upon failing to import a molecule.""" extension_dict = {'read_mol_xyz': '.xyz file', 'read_mol_pdb': '.pdb file', 'read_mol_mol': '.mol file', 'read_mol_smiles': 'SMILES string', 'read_mol_folder': 'folder', 'read_mol_txt': '.txt file', 'read_mol_excel': '.xlsx file', 'read_mol_plams': 'PLAMS molecule', 'read_mol_rdkit': 'RDKit molecule'} - print(get_time() + str(type(ex).__name__), str(ex)) - print(get_time() + 'Warning:', name, 'not recognized as a valid', - extension_dict[func.co_name], '\n') - return [] + print(get_time() + 
f'{ex.__class__.__name__}:\t {ex}') + filename = extension_dict[func.co_name] + print(get_time() + f'Warning: {name} not recognized as a valid {filename}\n') diff --git a/CAT/data_handling/validate_input.py b/CAT/data_handling/validate_input.py new file mode 100644 index 00000000..baf9cc3d --- /dev/null +++ b/CAT/data_handling/validate_input.py @@ -0,0 +1,75 @@ +""" +CAT.data_handling.validate_input +================================ + +A module designed for sanitizing and interpreting the input file. + +Index +----- +.. currentmodule:: CAT.data_handling.validate_input +.. autosummary:: + validate_input + +API +--- +.. autofunction:: validate_input + +""" + +from os import mkdir +from os.path import (join, isdir) + +from scm.plams import Settings + +from CAT.data_handling.validation_schemas import ( + core_schema, ligand_schema, qd_schema, database_schema, + mongodb_schema, bde_schema, qd_opt_schema, crs_schema +) +from .validate_mol import validate_mol +from ..utils import validate_path + +__all__ = ['validate_input'] + + +def validate_input(s: Settings) -> None: + """Initialize the input-validation procedure. + + performs an inplace update of **s**. + + Parameters + ---------- + s : |plams.Settings|_ + A Settings instance with to-be validated CAT input settings. 
+ + """ + # Validate the path + s.path = path = validate_path(s.path) + + # Set the various working directories + dirnames = ('database', 'ligand', 'core', 'qd') + for key in dirnames: + value = join(path, key) + s.optional[key].dirname = value + if not isdir(value): + mkdir(value) + + # Validate optional argument + s.optional.database = database_schema.validate(s.optional.database) + s.optional.ligand = ligand_schema.validate(s.optional.ligand) + s.optional.core = core_schema.validate(s.optional.core) + s.optional.qd = qd_schema.validate(s.optional.qd) + + # Validate some of the more complex optionala rguments + if s.optional.database.mongodb: + s.optional.database.mongodb = mongodb_schema.validate(s.optional.database.mongodb) + if s.optional.qd.optimize: + s.optional.qd.optimize = qd_opt_schema.validate(s.optional.qd.optimize) + if s.optional.qd.dissociate: + s.optional.qd.dissociate = bde_schema.validate(s.optional.qd.dissociate) + if s.optional.ligand['cosmo-rs']: + crs = s.optional.ligand.pop('cosmo-rs') + s.optional.ligand.crs = crs_schema.validate(crs) + + # Validate the input cores and ligands + validate_mol(s.input_cores, 'input_cores', join(path, 'core')) + validate_mol(s.input_ligands, 'input_ligands', join(path, 'ligand')) diff --git a/CAT/data_handling/validate_mol.py b/CAT/data_handling/validate_mol.py new file mode 100644 index 00000000..eb589025 --- /dev/null +++ b/CAT/data_handling/validate_mol.py @@ -0,0 +1,223 @@ +""" +CAT.data_handling.validate_mol +============================== + +A module designed for sanitizing and interpreting all molecule-related settings in the input file. + +Index +----- +.. currentmodule:: CAT.data_handling.validate_mol +.. autosummary:: + santize_smiles + validate_mol + _parse_name_type + _parse_mol_type + +API +--- +.. autofunction:: santize_smiles +.. autofunction:: validate_mol +.. autofunction:: _parse_name_type +.. 
autofunction:: _parse_mol_type + +""" + +from os.path import (join, isfile, isdir, basename) +from typing import (Sequence, Any, Union, Optional) + +from rdkit import Chem +from scm.plams import Settings, Molecule +import scm.plams.interfaces.molecule.rdkit as molkit + +from .validation_schemas import mol_schema +from ..utils import validate_path + +__all__ = ['validate_mol', 'santize_smiles'] + + +def santize_smiles(smiles: str) -> str: + """Sanitize a SMILES string: turn it into a valid filename.""" + name = smiles.replace('(', '[').replace(')', ']') + cis_trans = [item for item in smiles if item in ('/', '\\')] + if cis_trans: + cis_trans = [item + cis_trans[i*2+1] for i, item in enumerate(cis_trans[::2])] + cis_trans_dict = {'//': 'trans-', '/\\': 'cis-'} + for item in cis_trans[::-1]: + name = cis_trans_dict[item] + name + name = name.replace('/', '').replace('\\', '') + + return name + + +def validate_mol(args: Sequence[Union[Any, Settings]], + mol_type: str, + path: Optional[str] = None) -> None: + r"""Validate the ``"input_ligands"`` or ``"input_cores"`` blocks in the CAT input. + + Performs an inpalce update of **args**. + + Examples + -------- + + An example using a list of .xyz files as input + + .. code:: python + + >>> print(args1) # A list of .xyz files + ['mol1.xyz', 'mol2.xyz'] + + >>> mol_type = 'input_ligands' + >>> validate_mol(args1, mol_type) + + >>> print(args1[0], '\n', args1[1]) + is_core: False + mol: /path/to/my/current/working/dir/mol1.xyz + name: mol1 + path: /path/to/my/current/working/dir + type: smiles + + is_core: False + mol: /path/to/my/current/working/dir/mol2.xyz + name: mol2 + path: /path/to/my/current/working/dir + type: smiles + + + Another example using a list of .pdb-containing dictionaries as input + + .. 
code :: python + + >>> print(args2) # A list of Settings instances with .xyz files + [mol3.pdb: + guess_bonds: True + mol4.pdb: + guess_bonds: True + ] + + >>> mol_type = 'input_ligands' + >>> path = '/path/to/custom/working/dir' + >>> validate_mol(args2, mol_type, path) + + >>> print(args2[0], '\n', args2[1]) + guess_bonds: True + is_core: True + mol: /path/to/custom/working/dir/mol3.pdb + name: mol3 + path: /path/to/custom/working/dir + type: smiles + + guess_bonds: True + is_core: True + mol: /path/to/custom/working/dir/mol4.pdb + name: mol4 + path: /path/to/custom/working/dir + type: smiles + + + Parameters + ---------- + args : |list|_ [|plams.Settings|_] + A list of input molecules. + Accepts strings, PLAMS molecules and RDKit molecules. + Additional arguments can be provided by putting above-mentioned molecules in a dictionary. + + mol_type : str + The type of molecule. + Accepted values are ``"input_ligands"`` and ``"input_cores"``. + + path : str + Optional: The path to the molecule-containing directory. + + Raises + ------ + FileNotFoundError + Raised if **path** cannot be found. + + NotADirectoryError + Raised if **path** is not a directory. + + ValueError + Raised if the **mol_type** parameter is neither ``"input_cores"`` nor ``"input_ligands"``. + + SchemaError + Raised if invalid input settings are found in while validating **args**. 
def _parse_name_type(mol_dict: Settings) -> None:
    """Set the ``"name"`` and ``"type"`` keys in **mol_dict**.

    The new values of ``"name"`` and ``"type"`` depend on the value of ``mol_dict["mol"]``:

    * A path to an existing file: ``"type"`` is the file extension (without the dot)
      and ``"name"`` is the file name without its extension.
    * A path to an existing directory: ``"type"`` is ``"folder"`` and ``"name"`` is the
      directory name.
    * Any other string: treated as a SMILES string (``"type"`` is ``"smiles"``).
    * A PLAMS molecule: ``"type"`` is ``"plams_mol"``.
    * An RDKit molecule: ``"type"`` is ``"rdmol"``.

    Parameters
    ----------
    mol_dict : |plams.Settings|_
        A Settings instance containing the ``"mol"`` key.
        ``mol_dict["mol"]`` is expected to be an instance of :class:`str`,
        :class:`Molecule` or :class:`Mol`.

    Raises
    ------
    TypeError
        Raised if ``mol_dict["mol"]`` is an instance of neither :class:`str`,
        :class:`Molecule` nor :class:`Mol`.

    """
    # Local import: the rest of this block only relies on isfile/isdir/basename
    from os.path import splitext

    mol = mol_dict.mol
    if isinstance(mol, str):
        if isfile(mol):  # mol is a file
            # Split the extension off the file name only; splitting the full path would
            # pick up a dot in a parent directory when the file itself has no extension
            stem, ext = splitext(basename(mol))
            mol_dict.type = ext[1:]
            mol_dict.name = stem
        elif isdir(mol):  # mol is a directory
            mol_dict.type = 'folder'
            mol_dict.name = basename(mol)
        else:  # mol is (probably; hopefully?) a SMILES string
            # NOTE(review): basename() presumably strips a directory that the caller
            # joined in front of the SMILES string; it would also truncate a SMILES
            # string that itself contains "/" -- confirm against the caller
            mol_dict.type = 'smiles'
            mol_dict.mol = basename(mol)
            mol_dict.name = santize_smiles(basename(mol))

    elif isinstance(mol, Molecule):  # mol is an instance of plams.Molecule
        mol_dict.type = 'plams_mol'
        if not mol.properties.name:
            # No user-specified name: fall back to the canonical SMILES string
            mol_dict.name = Chem.MolToSmiles(Chem.RemoveHs(molkit.to_rdmol(mol)), canonical=True)
        else:
            mol_dict.name = mol.properties.name

    elif isinstance(mol, Chem.rdchem.Mol):  # mol is an instance of rdkit.Chem.Mol
        mol_dict.type = 'rdmol'
        mol_dict.name = Chem.MolToSmiles(Chem.RemoveHs(mol), canonical=True)

    else:
        raise TypeError(f"mol_dict['mol'] expects an instance of 'str', 'Molecule' or 'Mol'; "
                        f"observed type: '{mol.__class__.__name__}'")


def _parse_mol_type(mol_type: str) -> bool:
    """Parse the **mol_type** parameter of :func:`.validate_mol`.

    Parameters
    ----------
    mol_type : str
        The type of molecule; matched case-insensitively.

    Returns
    -------
    bool
        ``True`` for ``"input_cores"`` and ``False`` for ``"input_ligands"``.

    Raises
    ------
    ValueError
        Raised if **mol_type** is neither ``"input_cores"`` nor ``"input_ligands"``.

    """
    mol_type_lower = mol_type.lower()
    if mol_type_lower == 'input_cores':
        return True
    if mol_type_lower == 'input_ligands':
        return False
    raise ValueError(f"accepted values for mol_type are 'input_cores' and 'input_ligands'; "
                     f"observed value: {repr(mol_type)}")
def to_tuple(collection: Collection) -> tuple:
    """Convert a collection into a sorted tuple.

    The elements are sorted by their natural order; if they are not mutually
    comparable, they are sorted by their string representation instead.

    Parameters
    ----------
    collection : |Collection|_
        The to-be sorted collection.

    Returns
    -------
    tuple
        A tuple with the sorted elements of **collection**.

    """
    try:
        ret = sorted(collection)
    except TypeError:  # The collection contains a mix of sorting-incompatible objects
        ret = sorted(collection, key=str)
    # Deliberately not inside a ``finally`` clause: returning from ``finally`` would
    # swallow any exception raised by the fallback sort
    return tuple(ret)
:class:`.AMSJob`.""" + return AMSJob + + +def _get_crsjob() -> type: + """Return a type object of :class:`.CRSJob`.""" + return CRSJob + + +# Default settings templates +_bde_s1_default = get_template('qd.yaml')['MOPAC'] +_bde_s2_default = get_template('qd.yaml')['UFF'] + +_qd_opt_s1_default = get_template('qd.yaml')['UFF'] +_qd_opt_s2_default = _qd_opt_s1_default + +_crs_s1_default = get_template('qd.yaml')['COSMO-MOPAC'] +_crs_s2_default = get_template('qd.yaml')['COSMO-RS activity coefficient'] +_crs_s2_default.update(get_template('crs.yaml')['MOPAC PM6']) + + +# A dictionary for translating strings into :class:`plams.Job` types +_class_dict: Dict[str, type] = { + 'adf': ADFJob, 'adfjob': ADFJob, + 'ams': AMSJob, 'amsjob': AMSJob, + 'uff': UFFJob, 'uffjob': UFFJob, + 'band': BANDJob, 'bandjob': BANDJob, + 'dftb': DFTBJob, 'dftbjob': DFTBJob, + 'mopac': MOPACJob, 'mopacjob': MOPACJob, + 'reaxff': ReaxFFJob, 'reaxffjob': ReaxFFJob, + 'cp2k': Cp2kJob, 'cp2kjob': Cp2kJob, + 'orca': ORCAJob, 'orcajob': ORCAJob, + 'dirac': DiracJob, 'diracjob': DiracJob, + 'gamess': GamessJob, 'gamessjob': GamessJob, + 'dftbplus': DFTBPlusJob, 'dftbplusjob': DFTBPlusJob, + 'crs': CRSJob, 'cosmo-rs': CRSJob, 'crsjob': CRSJob +} + + +#: Schema for validating the ``['input_ligands']`` and ``['input_cores']`` blocks. +mol_schema: Schema = Schema({ + Optional_('guess_bonds', default=False): + bool, + + Optional_('is_core'): + bool, + + Optional_('column'): + And(int, lambda n: n >= 0), + + Optional_('row'): + And(int, lambda n: n >= 0), + + Optional_('indices'): + Or( + And(int, lambda n: n >= 0, Use(lambda n: (n,))), + And( + abc.Collection, + lambda n: all(isinstance(i, int) and i >= 0 for i in n), + lambda n: len(n) == len(set(n)), + Use(tuple) + ), + ), + + Optional_('type'): + str, + + Optional_('name'): + str, + + Optional_('path'): + Use(validate_path) +}) + +#: Schema for validating the ``['optional']['core']`` block. 
core_schema: Schema = Schema({
    'dirname':
        str,

    Optional_('dummy', default=17):  # Atomic symbol or number; converted into an atomic number
        Or(
            And(int, Use(to_atnum)),
            And(str, Use(to_atnum))
        )
})

_db_names = ('core', 'ligand', 'qd')
_format_names = ('pdb', 'xyz')


def _name_subset(names: Collection) -> Or:
    """Return a validator which converts its input into a tuple with a subset of **names**.

    The returned validator accepts:

    * A boolean: ``True`` yields all **names**, ``False`` an empty tuple.
    * A string that is in **names**: yields a 1-tuple.
    * A collection of unique items, all of them in **names**: yields a sorted tuple.

    """
    return Or(
        And(bool, Use(lambda n: names if n is True else ())),
        And(str, lambda n: n in names, Use(lambda n: (n,))),
        And(abc.Collection,
            lambda n: all(i in names for i in n),
            lambda n: len(n) == len(set(n)),
            Use(to_tuple))
    )


#: Schema for validating the ``['optional']['database']`` block.
database_schema: Schema = Schema({
    # path+directory name of the database
    'dirname':
        str,

    Optional_('read', default=_db_names):  # Attempt to pull structures from the database
        _name_subset(_db_names),

    Optional_('write', default=_db_names):  # Attempt to write structures to the database
        _name_subset(_db_names),

    Optional_('overwrite', default=tuple):  # Allow previous entries to be overwritten
        _name_subset(_db_names),

    Optional_('mongodb', default=dict):  # Settings specific to MongoDB
        Or(
            dict,
            And(bool, lambda n: n is False, Use(lambda n: {}))
        ),

    # Return a tuple of file formats; a single format name is wrapped in a 1-tuple,
    # consistent with the 'read', 'write' and 'overwrite' keys
    Optional_('mol_format', default=_format_names):
        _name_subset(_format_names)
})


#: Schema for validating the ``['optional']['ligand']`` block.
+ligand_schema: Schema = Schema({ + # path+directory name of the ligand directory + 'dirname': + str, + + Optional_('functional_groups', default=None): + Or( + And(str, Use(lambda n: (n,))), + And(abc.Collection, + lambda n: all(isinstance(i, str) for i in n), + lambda n: len(n) == len(set(n)), + Use(to_tuple)) + ), + + Optional_('optimize', default=True): # Optimize the ligands + bool, + + Optional_('split', default=True): # Remove a counterion from the function group + bool, + + Optional_('cosmo-rs', default=False): # Settings specific to ligand COSMO-RS calculations + Or( + dict, + And(bool, Use(lambda n: {'job1': AMSJob} if n else False)) + ), +}) + + +#: Schema for validating the ``['optional']['qd']`` block. +qd_schema: Schema = Schema({ + # path+directory name of the quantum dot directory + 'dirname': + str, + + # Settings specific to a quantum dot activation strain analyses + Optional_('activation_strain', default=False): + bool, + + Optional_('optimize', default=False): # Settings for quantum dot geometry optimizations + Or( + dict, + And(bool, Use(lambda n: ({'job1': AMSJob} if n else False))) + ), + + # Settings for quantum dot ligand dissociation calculations + Optional_('dissociate', default=False): + Or( + dict, + And(bool, lambda n: n is False) + ) +}) + + +#: Schema for validating the ``['optional']['database']['mongodb']`` block. +mongodb_schema: Schema = Schema({ + # Optional username for the MongoDB host + Optional_('username'): + Or(str, int), + + Optional_('password'): # Optional password for the MongoDB host + Or(str, int), + + Optional_('host', default='localhost'): # Name of the MongoDB host + Or(str, int), + + Optional_('port', default=27017): # Port of the MongoDB host + int, + + Optional_(str): # Other keyword arguments for :class:`pymongo.MongoClient` + object +}) + + +#: Schema for validating the ``['optional']['qd']['dissociate']`` block. 
+bde_schema: Schema = Schema({ + # Atom type of the to-be dissociated core atom + 'core_atom': + And(Or(int, str), Use(to_atnum)), + + 'lig_count': # THe number of ligands per core_atom + And(int, lambda n: n >= 0), + + Optional_('core_core_dist', default=5.0): + And(Or(int, float), lambda n: n >= 0.0, Use(float)), + + Optional_('lig_core_dist', default=5.0): + And(Or(int, float), lambda n: n >= 0.0, Use(float)), + + Optional_('core_index'): + Or( + And(int, lambda n: n >= 0, Use(lambda n: (n,))), + And( + abc.Collection, + lambda n: all(isinstance(i, int) and i >= 0 for i in n), + lambda n: len(n) == len(set(n)), + Use(to_tuple) + ) + ), + + Optional_('topology', default=dict): + And(dict, lambda n: all(isinstance(k, int) for k in n)), + + Optional_('job1', default=_get_amsjob): + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + Optional_('s1', default=_bde_s1_default): + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ), + + Optional_('job2'): + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + Optional_('s2'): + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ) +}) + +#: Schema for validating the ``['optional']['qd']['optimize']`` block. 
+qd_opt_schema: Schema = Schema({ + # The job type for the first half of the optimization + Optional_('job1', default=_get_amsjob): + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + # The job settings for the first half of the optimization + Optional_('s1', default=_qd_opt_s1_default): + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ), + + # The job type for the second half of the optimization + Optional_('job2', default=_get_amsjob): + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + # The job settings for the second half of the optimization + Optional_('s2', default=_qd_opt_s2_default): + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ) +}) + +#: Schema for validating the ``['optional']['ligand']['cosmo-rs']`` block. +crs_schema: Schema = Schema({ + # The job type for constructing the COSMO surface + Optional_('job1', default=_get_amsjob): + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + # The settings for constructing the COSMO surface + Optional_('s1', default=_crs_s1_default): + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ), + + Optional_('job2', default=_get_crsjob): # The job type for the actual COSMO-RS calculation + Or( + And(type, lambda n: issubclass(n, Job)), + And(str, lambda n: n.lower() in _class_dict, Use(lambda n: _class_dict[n.lower()])) + ), + + Optional_('s2', default=_crs_s2_default): # The settings for the actual COSMO-RS calculation + Or( + dict, + And(str, Use(lambda n: get_template(n, from_cat_data=False))) + ) +}) diff --git a/CAT/frozen_settings.py b/CAT/frozen_settings.py new file mode 100644 index 00000000..374bd765 --- /dev/null +++ b/CAT/frozen_settings.py @@ 
-0,0 +1,78 @@ +""" +CAT.frozen_settings +=================== + +A module which adds the :class:`.FrozenSettings` class, an immutable counterpart to plams.Settings_. + +.. _plams.Settings: https://www.scm.com/doc/plams/components/settings.html + +Index +----- +.. currentmodule:: CAT.frozen_settings +.. autosummary:: + FrozenSettings + +API +--- +.. autoclass:: FrozenSettings + :members: + :private-members: + :special-members: + +""" + +from __future__ import annotations + +from typing import (Any, Union, Iterable) + +from scm.plams import Settings + +__all__ = ['FrozenSettings'] + +# Various immutable objects suited as dictionary keys +Immutable = Union[tuple, int, float, str, frozenset] + + +class FrozenSettings(Settings): + """An inmutable subclass of plams.Settings_. + + .. _plams.Settings: https://www.scm.com/doc/plams/components/settings.html + """ + + def __init__(self, *args: Iterable, **kwargs: dict) -> None: + """Initialize the construction of a :class:`FrozenSettings` instance.""" + dict.__init__(self, *args, **kwargs) + + # Fill the FrozenSettings instance by means of the dict.__setitem__ method + for key, value in self.items(): + if isinstance(value, dict): + Settings.__setitem__(self, key, FrozenSettings(value)) + elif isinstance(value, list): + value = [FrozenSettings(i) if isinstance(i, dict) else i for i in value] + Settings.__setitem__(self, key, value) + + def __missing__(self, name: Immutable) -> FrozenSettings: + """Return a new (empty) :class:`FrozenSettings` instance.""" + return FrozenSettings() + + def __delitem__(self, name: Immutable) -> None: + """Raise a :exc:`TypeError`; :class:`FrozenSettings` instances are immutable.""" + raise TypeError("'FrozenSettings' object does not support item deletion") + + def __setitem__(self, name: Immutable, value: Any) -> None: + """Raise a :exc:`TypeError`; :class:`FrozenSettings` instances are immutable.""" + raise TypeError("'FrozenSettings' object does not support item assignment") + + def copy(self) 
-> FrozenSettings: + """Create a copy of this instance.""" + ret = FrozenSettings() + for key, value in self.items(): + try: + Settings.__setitem__(ret, key, value.copy()) + except AttributeError: + Settings.__setitem__(ret, key, value) + return ret + + def __copy__(self) -> FrozenSettings: + """Create a copy of this instance by calling :meth:`FrozenSettings.copy`.""" + return self.copy() diff --git a/CAT/analysis/jobs.py b/CAT/jobs.py similarity index 61% rename from CAT/analysis/jobs.py rename to CAT/jobs.py index 8f2a1927..fb350126 100644 --- a/CAT/analysis/jobs.py +++ b/CAT/jobs.py @@ -1,13 +1,35 @@ -""" A module designed for running Jobs. """ - -__all__ = ['job_single_point', 'job_geometry_opt', 'job_freq'] +""" +CAT.jobs +======== + +A module designed for running Jobs. + +Index +----- +.. currentmodule:: CAT.jobs +.. autosummary:: + get_main_molecule + get_energy + job_single_point + job_geometry_opt + job_freq + +API +--- +.. automethod:: get_main_molecule +.. automethod:: get_energy +.. autofunction:: job_single_point +.. autofunction:: job_geometry_opt +.. 
autofunction:: job_freq + +""" from os.path import join +from typing import (Optional, Callable) import numpy as np -from scm.plams.mol.molecule import Molecule -from scm.plams.core.settings import Settings +from scm.plams import (Molecule, Settings, Results) from scm.plams.core.functions import add_to_class from scm.plams.tools.units import Units @@ -17,12 +39,14 @@ import qmflows from .thermo_chem import get_thermo -from ..utils import (get_time, type_to_string) -from ..mol_utils import (adf_connectivity, from_mol_other) +from .utils import (get_time, type_to_string) +from .mol_utils import (adf_connectivity, from_mol_other) + +__all__ = ['job_single_point', 'job_geometry_opt', 'job_freq'] @add_to_class(Cp2kResults) -def get_main_molecule(self): +def get_main_molecule(self) -> Optional[Molecule]: for file in self.files: if '.xyz' in file: return Molecule(join(self.job.path, file)) @@ -30,20 +54,40 @@ def get_main_molecule(self): @add_to_class(Cp2kResults) -def get_energy(self, index=0, unit='Hartree'): +def get_energy(self, index: int = 0, + unit: str = 'Hartree') -> float: """Returns last occurence of 'Total energy:' in the output.""" energy = self._get_energy_type('Total', index=index) return Units.convert(energy, 'Hartree', unit) @add_to_class(Molecule) -def job_single_point(self, job, settings, name='Single_point', ret_results=False): - """ Function for running an arbritrary , extracting total energies. +def job_single_point(self, job: Callable, + settings: Settings, + name: str = 'Single_point', + ret_results: bool = False) -> Optional[Results]: + """Function for running an arbritrary jobs, extracting total energies. + + Paramaters + ---------- + job : |Callable|_ + A type Callable of a class derived from :class:`Job`, e.g. :class:`AMSJob` + or :class:`Cp2kJob`. + + settings : |plams.Settings|_ + The settings for **job**. + + name : str + The name of **job**. + + ret_results : bool + Whether or not the :class:`Results` instance should be returned or not. 
+ + Returns + ------- + |plams.Results|_ + Optional: If ``ret_results=True` return the :class:`Results` instance produced by this job. - mol : A PLAMS molecule. - job : A type object of a class derived from , e.g. AMSJob or Cp2kJob. - settings : The settings for *job*. - name : The name of *job*. """ # Grab the default settings for a specific job and update them with user provided settings s = Settings() @@ -73,16 +117,36 @@ def job_single_point(self, job, settings, name='Single_point', ret_results=False # Return results if ret_results: return results + return None @add_to_class(Molecule) -def job_geometry_opt(self, job, settings, name='Geometry_optimization', ret_results=False): - """ Function for running an arbritrary , extracting total energies and final geometries. +def job_geometry_opt(self, job: Callable, + settings: Settings, + name: str = 'Geometry_optimization', + ret_results: bool = False) -> Optional[Results]: + """Function for running an arbritrary jobs, extracting total energies and final geometries. + + Paramaters + ---------- + job : |Callable|_ + A type Callable of a class derived from :class:`Job`, e.g. :class:`AMSJob` + or :class:`Cp2kJob`. + + settings : |plams.Settings|_ + The settings for **job**. + + name : str + The name of **job**. + + ret_results : bool + Whether or not the :class:`Results` instance should be returned or not. + + Returns + ------- + |plams.Results|_ + Optional: If ``ret_results=True` return the :class:`Results` instance produced by this job. - mol : A PLAMS molecule. - job : A type object of a class derived from , e.g. AMSJob or Cp2kJob. - settings : The settings for *job*. - name : The name of *job*. 
""" # Grab the default settings for a specific job and update them with user provided settings s = Settings() @@ -114,18 +178,40 @@ def job_geometry_opt(self, job, settings, name='Geometry_optimization', ret_resu # Return results if ret_results: return results + return None @add_to_class(Molecule) def job_freq(self, job, settings, name='Frequency_analysis', opt=True, ret_results=False): - """ Function for running an arbritrary , extracting total energies, final geometries and + """ Function for running an arbritrary Jobs + + Extracts total energies, final geometries and thermochemical quantities derived from vibrational frequencies. - mol : A PLAMS molecule. - job : A type object of a class derived from , e.g. AMSJob or Cp2kJob. - settings : The settings for *job*. - name : The name of *job*. - opt : Preceed the frequency analysis with a geometry optimization. + Paramaters + ---------- + job : |Callable|_ + A type Callable of a class derived from :class:`Job`, e.g. :class:`AMSJob` + or :class:`Cp2kJob`. + + settings : |plams.Settings|_ + The settings for **job**. + + name : str + The name of **job**. + + opt : bool + Perform a geometry optimization (see :func:`.job_geometry_opt`) before calculating + frequencies. + + ret_results : bool + Whether or not the :class:`Results` instance should be returned or not. + + Returns + ------- + |plams.Results|_ + Optional: If ``ret_results=True` return the :class:`Results` instance produced by this job. + """ # Preceed the frequency analysis with a geometry optimization if opt: @@ -167,3 +253,4 @@ def job_freq(self, job, settings, name='Frequency_analysis', opt=True, ret_resul # Return results if ret_results: return results + return None diff --git a/CAT/mol_utils.py b/CAT/mol_utils.py index 90e621fe..51236ba1 100644 --- a/CAT/mol_utils.py +++ b/CAT/mol_utils.py @@ -1,109 +1,155 @@ -""" A module with misc functions related to manipulating molecules and their geometry. 
""" - -__all__ = [ - 'merge_mol', 'adf_connectivity', 'fix_h', 'fix_carboxyl', - 'from_mol_other', 'from_rdmol', 'separate_mod' -] - -from scm.plams.mol.atom import Atom -from scm.plams.mol.bond import Bond -from scm.plams.mol.molecule import Molecule -from scm.plams.core.functions import add_to_class +""" +CAT.mol_utils +============= + +A module with misc functions related to manipulating molecules and their geometry. + +Index +----- +.. currentmodule:: CAT.mol_utils +.. autosummary:: + from_mol_other + from_rdmol + get_index + merge_mol + separate_mod + to_atnum + to_symbol + adf_connectivity + fix_carboxyl + fix_h + +API +--- +.. automethod:: from_mol_other +.. automethod:: from_rdmol +.. automethod:: get_index +.. automethod:: merge_mol +.. automethod:: separate_mod +.. autofunction:: to_atnum +.. autofunction:: to_symbol +.. autofunction:: adf_connectivity +.. autofunction:: fix_carboxyl +.. autofunction:: fix_h + +""" + +from __future__ import annotations + +from typing import (Optional, Iterable, Union, Tuple, List) + +from scm.plams import (Molecule, Atom, Bond, MoleculeError, add_to_class) from scm.plams.tools.periodic_table import PeriodicTable import scm.plams.interfaces.molecule.rdkit as molkit from rdkit import Chem from rdkit.Chem import rdMolTransforms +__all__ = ['adf_connectivity', 'fix_h', 'fix_carboxyl'] + @add_to_class(Molecule) -def from_mol_other(self, mol, atom_subset=None): - """ Update the atomic coordinates of *self* with coordinates from another PLAMS molecule. +def from_mol_other(self, mol: Molecule, + atom_subset: Optional[Iterable[Atom]] = None) -> None: + """Update the Cartesian coordinates of this instance with those from another PLAMS molecule. + Alternatively, update only a subset of atoms. - Performs an inplace update of **self**. - :parameter mol: A PLAMS molecule. - :type mol: |plams.Molecule|_ - :parameter atom_subset: A subset of atoms in **self**. 
@add_to_class(Molecule)
def get_index(self, value: Union[Atom, Bond]) -> Union[int, Tuple[int, int]]:
    """Return the first index of **value** within this instance.

    **value** expects an instance of either :class:`Atom` or :class:`Bond`.

    Note
    ----
    Following the convention adopted by PLAMS, the returned index/indices are 1-based rather
    than 0-based.

    Parameters
    ----------
    value : |plams.Atom|_ or |plams.Bond|_
        A PLAMS atom or bond.

    Returns
    -------
    |int|_ or |tuple|_ [|int|_]
        An atomic index (**value**: |plams.Atom|_) or
        a tuple of two atomic indices (**value**: |plams.Bond|_).

    Raises
    ------
    TypeError
        Raised if **value** is an instance of neither :class:`Atom` nor :class:`Bond`.

    MoleculeError
        Raised if the passed :class:`Atom` or :class:`Bond` is not in this instance.

    """
    if isinstance(value, Atom):
        if value not in self.atoms:
            raise MoleculeError(f"Passed atom, {repr(value)}, is not in this instance")
        return 1 + self.atoms.index(value)

    if isinstance(value, Bond):
        if value not in self.bonds:
            raise MoleculeError(f"Passed bond, {repr(value)}, is not in this instance")
        at1, at2 = value  # Unpack the two atoms defining the bond
        return 1 + self.atoms.index(at1), 1 + self.atoms.index(at2)

    err = "value expects an instance of 'Atom' or 'Bond'; observed type: '{}'"
    raise TypeError(err.format(value.__class__.__name__))
- """ - return self.atom1.get_atom_index(), self.atom2.get_atom_index() +@add_to_class(Molecule) +def merge_mol(self, mol_list: Union[Molecule, Iterable[Molecule]]) -> None: + """Merge two or more molecules into a single molecule. + No new copies of atoms/bonds are created, all atoms/bonds are moved from + mol_list to plams_mol. + Performs an inplace update of this instance. -@add_to_class(Molecule) -def merge_mol(self, mol_list): - """ Merge two or more molecules into a single molecule. - No new copies of atoms/bonds are created, all atoms/bonds are moved from mol_list to plams_mol. - Performs an inplace update of **self**. + Parameters + ---------- + mol_list : |plams.Molecule|_ or |list|_ [|plams.Molecule|_] + A molecule or list of molecules. - :parameter mol_list: A molecule or list of molecules. - :type mol_list: |plams.Molecule|_ or |list|_ [|plams.Molecule|_]. """ if isinstance(mol_list, Molecule): mol_list = [mol_list] @@ -119,17 +165,21 @@ def merge_mol(self, mol_list): @add_to_class(Molecule) -def separate_mod(self): - """ Modified PLAMS function: seperates a molecule instead of a copy of a molecule. +def separate_mod(self) -> List[Molecule]: + """Modified PLAMS function: seperates a molecule instead of a copy of a molecule. + Separate the molecule into connected components. - returns is a list of new |Molecule| objects (all atoms and bonds are disjoint with - the original molecule). + Returns is a list of new Molecule instrances (all atoms and bonds are disjoint with + the original molecule). Each element of this list is identical to one connected component of the base molecule. A connected component is a subset of atoms such that there exists a path - (along one or more bonds) between any two atoms. + (along one or more bonds) between any two atoms. + + Returns + ------- + |list|_ [|plams.Molecule|_] + A list of molecules with atoms and bonds from **self**. - :return: A list of molecules with atoms and bonds from **self**. 
- :rtype: |list|_ [|plams.Molecule|_] """ frags = [] for at in self: @@ -159,13 +209,75 @@ def dfs(v, mol): return frags -def adf_connectivity(plams_mol): - """ Create an AMS-compatible connectivity list. +def to_atnum(item: Union[str, int]) -> int: + """Turn an atomic symbol into an atomic number. + + Parameters + ---------- + item : |int|_ or |str|_ + An atomic symbol or number. + + Returns + ------- + |int|_ + An atomic number. + + Raises + ------ + TypeError + Raised if **item** is an instance of neither :class:`str` nor :class:`int`. + + """ + if isinstance(item, str): + return PeriodicTable.get_atomic_number(item) + elif isinstance(item, int): + return item + + err = "item expects an instance of 'str' or 'int'; observed type: '{}'" + raise TypeError(err.format(item.__class__.__name__)) + + +def to_symbol(item: Union[str, int]) -> str: + """Turn an atomic number into an atomic symbol. + + Parameters + ---------- + item : |int|_ or |str|_ + An atomic symbol or number. + + Returns + ------- + |int|_ + An atomic symbol. + + Raises + ------ + TypeError + Raised if **item** is an instance of neither :class:`str` nor :class:`int`. + + """ + if isinstance(item, int): + return PeriodicTable.get_symbol(item) + elif isinstance(item, str): + return item + + err = "item expects an instance of 'str' or 'int'; observed type: '{}'" + raise TypeError(err.format(item.__class__.__name__)) + + +def adf_connectivity(plams_mol: Molecule) -> List[str]: + """Create an AMS-compatible connectivity list. + + Parameters + ---------- + plams_mol : |plams.Molecule|_ + A PLAMS molecule with :math:`n` bonds. + + Returns + ------- + :math:`n` |list|_ [|str|_] + An ADF-compatible connectivity list of :math:`n` bonds. - :parameter plams_mol: A PLAMS molecule. - :type plams_mol: |plams.Molecule|_ - :return: An ADF-compatible connectivity list of *n* bonds. - :rtype: *n* |list|_ [|str|_]. 
def fix_h(mol: Molecule) -> None:
    """If a C=C-H angle is smaller than :math:`20` degrees, set it back to :math:`120` degrees.

    Only the first offending angle of each hydrogen is reset.
    Performs an inplace update of **mol**.

    Parameters
    ----------
    mol : |plams.Molecule|_
        A PLAMS molecule.

    """
    # Hydrogens whose (first) neighbour takes part in a double bond;
    # unbonded hydrogens are skipped as they have no neighbour to inspect
    H_list = [atom for atom in mol if atom.atnum == 1 and atom.bonds and 2.0 in
              [bond.order for bond in mol.neighbors(atom)[0].bonds]]
    if not H_list:
        return  # Nothing to fix; skip the conversion to RDKit

    rdmol = molkit.to_rdmol(mol)
    conf = rdmol.GetConformer()
    # NOTE(review): assumes that to_rdmol() preserves the atom order of **mol**,
    # so 0-based positions in mol.atoms double as RDKit atom indices -- confirm
    get_idx = mol.atoms.index
    set_angle = rdMolTransforms.SetAngleDeg
    get_angle = rdMolTransforms.GetAngleDeg

    update = False
    for at1 in H_list:
        at2 = mol.neighbors(at1)[0]  # The atom carrying the hydrogen
        for at3 in mol.neighbors(at2):  # The other neighbours of at2
            if at3 != at1:
                # Three atomic indices defining the angle at3-at2-at1
                idx_tup = get_idx(at3), get_idx(at2), get_idx(at1)
                if get_angle(conf, *idx_tup) <= 20.0:
                    set_angle(conf, *idx_tup, 120.0)
                    update = True
                    break  # At most one angle is reset per hydrogen

    # Only copy the coordinates back if at least one angle has been changed
    if update:
        mol.from_rdmol(rdmol)
b/CAT/settings_dataframe.py new file mode 100644 index 00000000..d4e85e87 --- /dev/null +++ b/CAT/settings_dataframe.py @@ -0,0 +1,136 @@ +""" +CAT.settings_dataframe +====================== + +A module for holding the :class:`.SettingsDataFrame` and :class:`.SettingsSeries` classes. + +Index +----- +.. currentmodule:: CAT.settings_dataframe +.. autosummary:: + SettingsSeries + SettingsDataFrame + +API +--- +.. autoclass:: CAT.settings_dataframe.SettingsSeries + :members: + :private-members: + :special-members: + +.. autoclass:: CAT.settings_dataframe.SettingsDataFrame + :members: + :private-members: + :special-members: + +""" + +from __future__ import annotations + +from typing import Optional + +import pandas as pd + +from .frozen_settings import FrozenSettings + +__all__ = ['SettingsSeries', 'SettingsDataFrame'] + + +class SettingsSeries(pd.Series): + """A subclass of the Pandas Series with an additional :attr:`.settings` attribute. + + Parameters + ---------- + settings : dict + Optional: A dictionary with additional user-defined (meta-)data. + See :attr:`.SettingsSeries.settings`. + + Attributes + ---------- + settings : |CAT.FrozenSettings|_ + An immutable :class:`.FrozenSettings` instance with additional user-defined (meta-)data. 
+ + """ + + _metadata = ['settings'] + + def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, + settings: Optional[dict] = None) -> None: + """Initialize the :class:`.SettingsSeries` construction.""" + self.settings = self._sanitize_settings(settings) + super().__init__(data, index, dtype, name, copy, fastpath) + + @property + def _constructor(self) -> SettingsSeries: + """Construct a :class:`.SettingsSeries` instance.""" + def _series(*args, **kwargs) -> SettingsSeries: + return SettingsSeries(*args, **kwargs).__finalize__(self) + return _series + + @property + def _constructor_expanddim(self) -> SettingsDataFrame: + """Construct a :class:`.SettingsDataFrame` instance.""" + def _df(*args, **kwargs) -> SettingsDataFrame: + return SettingsDataFrame(*args, **kwargs).__finalize__(self) + return _df + + @staticmethod + def _sanitize_settings(settings: Optional[dict]) -> FrozenSettings: + """Sanitize the **settings** parameter for :attr:`SettingsSeries.settings`.""" + if settings is None: + return FrozenSettings() + elif isinstance(settings, dict): + return FrozenSettings(settings) + else: + err = "The settings argument expects an instance of 'dict'; observed type '{}'" + raise TypeError(err.format(settings.__class__.__name__)) + + +class SettingsDataFrame(pd.DataFrame): + """A subclass of the Pandas DataFrame with an additional :attr:`.settings` attribute. + + Parameters + ---------- + settings : dict + Optional: A dictionary with additional user-defined (meta-)data. + See :attr:`.SettingsDataFrame.settings`. + + Attributes + ---------- + settings : |CAT.FrozenSettings|_ + An immutable :class:`.FrozenSettings` instance with additional user-defined (meta-)data. 
+ + """ + + _metadata = ['settings'] + + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, + settings: Optional[dict] = None) -> None: + """Initialize the :class:`.SettingsDataFrame` construction.""" + self.settings = self._sanitize_settings(settings) + super().__init__(data, index, columns, dtype, copy) + + @property + def _constructor(self) -> SettingsDataFrame: + """Construct a :class:`.SettingsDataFrame` instance.""" + def _df(*args, **kwargs) -> SettingsDataFrame: + return SettingsDataFrame(*args, **kwargs).__finalize__(self) + return _df + + @property + def _constructor_sliced(self) -> SettingsSeries: + """Construct a :class:`.SettingsSeries` instance.""" + def _series(*args, **kwargs) -> SettingsSeries: + return SettingsSeries(*args, **kwargs).__finalize__(self) + return _series + + @staticmethod + def _sanitize_settings(settings: Optional[dict]) -> FrozenSettings: + """Sanitize the **settings** parameter for :attr:`SettingsDataFrame.settings`.""" + if settings is None: + return FrozenSettings() + elif isinstance(settings, dict): + return FrozenSettings(settings) + else: + err = "The settings argument expects an instance of 'dict'; observed type '{}'" + raise TypeError(err.format(settings.__class__.__name__)) diff --git a/CAT/thermo_chem.py b/CAT/thermo_chem.py new file mode 100644 index 00000000..4cc363ab --- /dev/null +++ b/CAT/thermo_chem.py @@ -0,0 +1,158 @@ +""" +CAT.thermo_chem +=============== + +A module related to calculating thermochemical properties. + +Index +----- +.. currentmodule:: CAT.thermo_chem +.. autosummary:: + get_entropy + get_thermo + +API +--- +.. autofunction:: get_entropy +.. 
autofunction:: get_thermo + +""" + +from typing import (Sequence, Union, Dict) + +import numpy as np + +from scm.plams import Molecule +from scm.plams.tools.units import Units + +__all__ = ['get_thermo', 'get_entropy'] + + +def get_entropy(mol: Molecule, + freqs: np.ndarray, + T: float = 298.15) -> np.ndarray: + """Calculate the translational, vibrational and rotational entropy. + + All units and constants are in SI units. + + Parameters + ---------- + mol : |plams.Molecule|_ + A PLAMS molecule. + + freqs : |np.ndarray|_ [|np.float64|_] + An iterable consisting of vibrational frequencies in units of cm**-1. + + T : float + The temperature in Kelvin. + + Returns + ------- + |np.ndarray|_ [|np.float64|_]: + An array with translational, rotational and vibrational contributions to the entropy, + ordered in that specific manner. + Units are in J/mol. + + """ + # Define constants + kT = 1.380648 * 10**-23 * T # Boltzmann constant * temperature + h = 6.6260701 * 10**-34 # Planck constant + hv_kT = (h * np.asarray(freqs)) / kT # (Planck * frequencies) / (Boltzmann * temperature) + R = 8.31445 # Gas constant + V_Na = ((R * T) / 10**5) / Units.constants['NA'] # Volume(1 mol ideal gas) / Avogadro's number + pi = np.pi + + # Extract atomic masses and Cartesian coordinates + m = np.array([at.mass for at in mol]) * 1.6605390 * 10**-27 + x, y, z = mol.as_array().T * 10**-10 + + # Calculate the rotational partition function: q_rot + inertia = np.array([ + [sum(m*(y**2 + z**2)), -sum(m*x*y), -sum(m*x*z)], + [-sum(m*x*y), sum(m*(x**2 + z**2)), -sum(m*y*z)], + [-sum(m*x*z), -sum(m*y*z), sum(m*(x**2 + y**2))] + ]) + inertia_product = np.product(np.linalg.eig(inertia)[0]) + q_rot = pi**0.5 * ((8 * pi**2 * kT) / h**2)**1.5 * inertia_product**0.5 + + # Calculate the translational, rotational and vibrational entropy (divided by R) + S_trans = 1.5 + np.log(V_Na * ((2 * pi * sum(m) * kT) / h**2)**1.5) + S_rot = 1.5 + np.log(q_rot) + with np.errstate(divide='ignore', invalid='ignore'): + 
S_vib_left = hv_kT / np.expm1(hv_kT) + S_vib_left[np.isnan(S_vib_left)] = 0.0 + S_vib_right = np.log(1 - np.exp(-hv_kT)) + S_vib_right[S_vib_right == -np.inf] = 0.0 + S_vib = sum(S_vib_left - S_vib_right) + + return R * np.array([S_trans, S_rot, S_vib]) + + +def get_thermo(mol: Molecule, + freqs: Sequence[float], + E: float = 0.0, + T: float = 298.15, + export: Sequence[str] = ('E', 'U', 'H', 'S', 'G'), + unit: str = 'kcal/mol') -> Union[float, Dict[str, float]]: + """Extract and return Gibbs free energies, entropies and/or enthalpies from an AMS KF file. + + All vibrational frequencies smaller than 100 cm**-1 are set to 100 cm**-1. + + .. _plams.Units: https://www.scm.com/doc/plams/components/utils.html#scm.plams.tools.units.Units + + Parameters + ---------- + mol : |plams.Molecule|_ + A PLAMS molecule. + + freqs : |np.ndarray|_ [|np.float64|_] + An iterable consisting of vibrational frequencies in units of cm**-1. + + E : float + The electronic energy in kcal/mol. + Defaults to 0.0 kcal/mol. + + T : float + The temperature in Kelvin. + + export : |tuple|_ [|str|_] + An iterable containing strings of the to be exported energies: + + * ``'E'``: Electronic energy (see the **E** parameter) + * ``'U'``: Internal energy (:math:`E + U_{nuc}`) + * ``'H'``: Enthalpy (:math:`U + pV`) + * ``'S'``: Entropy + * ``'G'``: Gibbs free energy (:math:`H - T*S`) + + unit : str + The unit of the to be returned energies. + See plams.Units_ for more details and an overview of available energy units. + + Returns + ------- + |float|_ or |dict|_ [|str|_, |float|_]: + An energy or dictionary of energies.
+ Keys + + """ + # Get frequencies; set all frequencies smaller than 100 cm**-1 to 100 cm**-1 + freqs = np.array(freqs) + freqs[freqs < 100] = 100 + freqs *= 100 * Units.constants['c'] + + # hv_kT = (Planck constant * frequencies) / (Boltzmann constant * temperature) + hv_kT = (6.6260701 * 10**-34 * freqs) / (1.380648 * 10**-23 * T) + RT = 8.31445 * T # Gas constant * temperature + + # Extract and/or calculate the various energies + E = E * Units.conversion_ratio('kcal/mol', 'kj/mol') * 1000 + U = E + RT * (3.0 + sum(0.5 * hv_kT + hv_kT / np.expm1(hv_kT))) + H = U + RT + S = sum(get_entropy(mol, freqs, T=T)) + G = H - T * S + + ret = {'E': E, 'U': U, 'H': H, 'S': S, 'G': G} + + if len(export) == 1: + return Units.convert(ret[export[0]], 'kj/mol', unit) / 1000 + return {i: Units.convert(ret[i], 'kj/mol', unit) / 1000 for i in ret if i in export} diff --git a/CAT/utils.py b/CAT/utils.py index 56e05dd0..7658edaa 100644 --- a/CAT/utils.py +++ b/CAT/utils.py @@ -1,12 +1,35 @@ -"""A module with miscellaneous functions.""" - -__all__ = ['check_sys_var', 'dict_concatenate', 'get_time', 'get_template'] +""" +CAT.utils +========= + +A module with miscellaneous functions. + +Index +----- +.. currentmodule:: CAT.utils +.. autosummary:: + type_to_string + get_time + check_sys_var + dict_concatenate + get_template + +API +--- +.. autofunction:: type_to_string +.. autofunction:: get_time +.. autofunction:: check_sys_var +.. autofunction:: dict_concatenate +.. 
autofunction:: get_template + +""" import os import time import yaml import pkg_resources as pkg -from os.path import join +from os.path import (join, isdir, isfile, exists) +from typing import (Callable, Iterable, Optional) from scm.plams.core.settings import Settings @@ -17,24 +40,34 @@ from scm.plams.interfaces.thirdparty.dirac import DiracJob from scm.plams.interfaces.thirdparty.gamess import GamessJob +__all__ = ['check_sys_var', 'dict_concatenate', 'get_time', 'get_template'] + +_job_dict = { + ADFJob: 'adf', + AMSJob: 'ams', + DiracJob: 'dirac', + Cp2kJob: 'cp2k', + GamessJob: 'gamess', + ORCAJob: 'orca' +} + -def type_to_string(job): - """Turn a :class:`type` instance into a string.""" - job_dict = {ADFJob: 'adf', AMSJob: 'ams', DiracJob: 'dirac', - Cp2kJob: 'cp2k', GamessJob: 'gamess', ORCAJob: 'orca'} +def type_to_string(job: Callable) -> str: + """Turn a :class:`type` instance into a :class:`str`.""" try: - return job_dict[job] + return _job_dict[job] except KeyError: - print(get_time() + 'WARNING: No default settings available for ' + str(job)) - return False + err = 'WARNING: No default settings available for {}' + print(get_time() + err.format(repr(job.__class__.__name__))) + return '' -def get_time(): +def get_time() -> str: """Return the current time as string.""" return '[{}] '.format(time.strftime('%H:%M:%S')) -def check_sys_var(): +def check_sys_var() -> None: """Validate all ADF environment variables. Raises @@ -50,30 +83,33 @@ def check_sys_var(): Raised if an ADF version prior to 2019 is found. 
""" - sys_var = ['ADFBIN', 'ADFHOME', 'ADFRESOURCES', 'SCMLICENSE'] - sys_var_exists = [item in os.environ for item in sys_var] + sys_var = ('ADFBIN', 'ADFHOME', 'ADFRESOURCES', 'SCMLICENSE') + sys_var_exists = [item in os.environ and os.environ[item] for item in sys_var] for i, item in enumerate(sys_var_exists): if not item: - print(get_time() + - 'WARNING: The environment variable ' + sys_var[i] + ' has not been set') - if False in sys_var_exists: + err = 'WARNING: The environment variable {} has not been set' + print(get_time() + err.format(sys_var[i])) + + if not all(sys_var_exists): raise EnvironmentError(get_time() + 'One or more ADF environment variables have ' 'not been set, aborting ADF job.') + if '2019' not in os.environ['ADFHOME']: error = get_time() + 'No ADF/2019 detected in ' + os.environ['ADFHOME'] error += ', aborting ADF job.' raise ImportError(error) -def dict_concatenate(dic): +def dict_concatenate(dict_list: Iterable[dict]) -> dict: """Concatenates a list of dictionaries.""" ret = {} - for item in dic: + for item in dict_list: ret.update(item) return ret -def get_template(template_name, from_cat_data=True): +def get_template(template_name: str, + from_cat_data: bool = True) -> Settings: """Grab a yaml template and return it as Settings object.""" if from_cat_data: path = join('data/templates', template_name) @@ -82,3 +118,36 @@ def get_template(template_name, from_cat_data=True): else: with open(template_name, 'r') as file: return Settings(yaml.load(file, Loader=yaml.FullLoader)) + + +def validate_path(path: Optional[str]) -> str: + """Validate a provided directory path. + + Parameters + ---------- + path : str + Optional: A path to a directory. + Will default to the current working directory if ``None``. + + Results + ------- + |str|_ + Returns either **path** or the current working directory. + + Raises + ------ + FileNotFoundError + Raised if **path** cannot be found. + + NotADirectoryError + Raised if **path** is not a directory. 
+ + """ + if path in (None, '.', ''): + return os.getcwd() + elif isdir(path): + return path + elif not exists(path): + raise FileNotFoundError(get_time() + f"'{path}' not found") + elif isfile(path): + raise NotADirectoryError(get_time() + f"'{path}' is not a directory") diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 563e3da2..aeda3d62 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,10 +1,33 @@ -########### +########## Change Log -########### +########## All notable changes to this project will be documented in this file. This project adheres to `Semantic Versioning `_. +0.5.0 +***** + +* CAT_ has been split into 3 separate packages (see https://github.com/nlesc-nano/CAT/issues/39): + + * CAT_: A collection of tools designed for the automatic construction of composite chemical compounds. + * nano-CAT_: A collection of tools for the analysis of nanocrystals. + * data-CAT_: A databasing framework for the Compound Attachment Tools package (CAT_). + +* Docstrings have been changed into NumPy style. +* Added typehints. +* Added the CAT.SettingsDataFrame and CAT.SettingsSeries classes. +* Added more tests. +* Cleaned up all input-parsing related modules. +* Custom functional groups (*i.e.* SMILES_ strings) can now be specified in the input + under the optional.ligand.functional_groups key (see https://github.com/nlesc-nano/CAT/issues/13). + +.. _CAT: https://github.com/nlesc-nano/CAT/ +.. _nano-CAT: https://github.com/nlesc-nano/nano-CAT/ +.. _data-CAT: https://github.com/nlesc-nano/data-CAT/ +.. _SMILES: https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system + + 0.4.6 ***** diff --git a/CITATION.cff b/CITATION.cff index 9949d3ef..a1cabd1c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,7 @@ # Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/) cff-version: 1.0.3 message: If you use this software, please cite it as below.
-title: Compound Attachment/Analysis Tool (CAT) +title: Compound Attachment Tool (CAT) authors: - given-names: Bas family-names: van Beek @@ -13,7 +13,7 @@ keywords: - materials-science - python - Workflows -version: '0.1.0' +version: '0.5.0' date-released: 2019-02-08 -repository-code: https://github.com/BvB93/CAT +repository-code: https://github.com/nlesc-nano/CAT license: "LGPL-3.0" diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 04d6cfc9..a548dced 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,14 +15,14 @@ The sections below outline the steps in each case. You have a question ******************* -1. use the search functionality `here `__ to see if someone already filed the same issue; +1. use the search functionality `here `__ to see if someone already filed the same issue; 1. if your issue search did not yield any relevant results, make a new issue; 1. apply the "Question" label; apply other labels when relevant. You think you may have found a bug ********************************** -1. use the search functionality `here `__ to see if someone already filed the same issue; +1. use the search functionality `here `__ to see if someone already filed the same issue; 1. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information to the rest of the community to understand the cause and context of the problem. Depending on the issue, you may want to include: - the `SHA hashcode `_ of the commit that is causing your problem; - some identifying information (name and version number) for dependencies you're using; diff --git a/NOTICE b/NOTICE index b289b6c0..4eab607d 100644 --- a/NOTICE +++ b/NOTICE @@ -1,2 +1,2 @@ -This product includes Compound Attachment/Analysis Tool, software developed by +This product includes Compound Attachment Tool, software developed by Bas van Beek . 
diff --git a/README.rst b/README.rst index 82ee6195..a06ff8d2 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,3 @@ - .. image:: https://travis-ci.org/nlesc-nano/CAT.svg?branch=master :target: https://travis-ci.org/nlesc-nano/CAT .. image:: https://readthedocs.org/projects/cat/badge/?version=latest @@ -6,12 +5,11 @@ .. image:: https://img.shields.io/badge/python-3.7-blue.svg :target: https://www.python.org -####################################### -Compound Attachment/Analysis Tool 0.4.6 -####################################### +############################## +Compound Attachment Tool 0.5.0 +############################## -**CAT** is a collection of tools designed for the construction, -and subsequent analysis, of various chemical compounds. +**CAT** is a collection of tools designed for the construction of various chemical compounds. Further information is provided in the documentation_. Installation @@ -39,7 +37,7 @@ Dependencies installation Using the conda environment the following packages should be installed: -- rdkit_ & HDF5_: ``conda install -y --name CAT --channel conda-forge rdkit h5py`` +- rdkit_ : ``conda install -y --name CAT --channel conda-forge rdkit`` .. _installation: diff --git a/docs/7_database.rst b/docs/7_database.rst index beca4ba7..a6acf9c8 100755 --- a/docs/7_database.rst +++ b/docs/7_database.rst @@ -17,21 +17,26 @@ accoring to their functionality: for loading and unloading parts of the database from the harddrive. These methods should be used in conjunction with |with|_ statements: - :: + .. 
code:: python - import CAT + >>> import CAT - database = CAT.Database() - with database.open_csv_lig(db.csv_lig) as db: - print('my ligand database') - with database.open_yaml(db.yaml) as db: - print('my job settings database') - with h5py.File(db.hdf5) as db: - print('my structure database') + >>> database = CAT.Database() + >>> with database.OpenCsvLig(database.csv_lig) as db: + >>> print(type(db)) + - ====================== ===================== =================== ================== - :class:`.open_csv_lig` :class:`.open_csv_qd` :class:`.open_yaml` :class:`h5py.File` - ====================== ===================== =================== ================== + >>> with database.OpenYaml(database.yaml) as db: + >>> print('my job settings database') + + + >>> with h5py.File(database.hdf5) as db: + >>> print(type(db)) + + + ==================== =================== ================== ================== + :class:`.OpenCsvLig` :class:`.OpenCsvQd` :class:`.OpenYaml` :class:`h5py.File` + ==================== =================== ================== ================== - Importing to the database - these methods handle the importing of new data from python objects to the Database class: @@ -47,15 +52,16 @@ accoring to their functionality: :meth:`.from_csv` :meth:`.from_hdf5` ================= ================== + Index ~~~~~ -.. currentmodule:: CAT.data_handling.database.Database +.. currentmodule:: dataCAT.database.Database .. autosummary:: - open_yaml - open_csv_lig - open_csv_qd + OpenYaml + OpenCsvLig + OpenCsvQd DF update_mongodb update_csv @@ -65,7 +71,7 @@ Index from_hdf5 -.. currentmodule:: CAT.data_handling.database_functions +.. currentmodule:: dataCAT.database_functions .. autosummary:: mol_to_file @@ -77,19 +83,21 @@ Index Class API ~~~~~~~~~ -.. autoclass:: CAT.data_handling.database.Database +.. autoclass:: dataCAT.database.Database :members: + Function API ~~~~~~~~~~~~ -.. autofunction:: CAT.data_handling.database_functions.mol_to_file +.. 
autofunction:: dataCAT.database_functions.mol_to_file + +.. autofunction:: dataCAT.database_functions.as_pdb_array -.. autofunction:: CAT.data_handling.database_functions.as_pdb_array +.. autofunction:: dataCAT.database_functions.from_pdb_array -.. autofunction:: CAT.data_handling.database_functions.from_pdb_array +.. autofunction:: dataCAT.database_functions.sanitize_yaml_settings -.. autofunction:: CAT.data_handling.database_functions.sanitize_yaml_settings .. _rdkit.Chem.Mol: http://rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol .. _h5py.File: http://docs.h5py.org/en/stable/high/file.html @@ -108,7 +116,9 @@ Function API .. _str: https://docs.python.org/3/library/stdtypes.html#str .. _int: https://docs.python.org/3/library/functions.html#int .. _None: https://docs.python.org/3.7/library/constants.html#None +.. _bool: https://docs.python.org/3/library/functions.html?highlight=bool#bool .. _with: https://docs.python.org/3/reference/compound_stmts.html#with +.. _Sequence: https://docs.python.org/3/library/collections.abc.html#collections.abc.Sequence .. |rdkit.Chem.Mol| replace:: *rdkit.Chem.Mol* .. |h5py.File| replace:: *h5py.File* @@ -127,4 +137,6 @@ Function API .. |str| replace:: *str* .. |int| replace:: *int* .. |None| replace:: *None* +.. |bool| replace:: *bool* .. |with| replace:: ``with`` +.. |Sequence| replace:: *Sequence* diff --git a/docs/conf.py b/docs/conf.py index dcc75c1c..29ee04e0 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,49 +25,51 @@ # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' +needs_sphinx = '2.0' + + +# Output is processed with HTML4 writer. +# Default is False. +html4_writer = True + -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
+# Add any Sphinx extension module names here, as strings. +# They can be extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'nbsphinx', 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon' + 'sphinx.ext.napoleon', + 'sphinx_autodoc_typehints' ] + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] + # The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] +# You can specify multiple suffix as a list of string: source_suffix = ['.rst', '.md'] source_suffix = '.rst' + # The master toctree document. master_doc = 'index' + # General information about the project. project = 'CAT' copyright = '2019, B. F. van Beek' author = 'B. F. van Beek' + # The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. +# |version| and |release|, also used in various other places throughout the built documents. +version = '0.5' # The short X.Y version. +release = '0.5.0' # The full version, including alpha/beta/rc tags. -# The short X.Y version. -version = '0.4' -# The full version, including alpha/beta/rc tags. -release = '0.4.6' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -75,6 +77,7 @@ # Usually you set "language" from the command line for these cases. language = 'en' + # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path @@ -87,9 +90,6 @@ # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = 'sphinx' -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True - # -- Options for HTML output ---------------------------------------------- @@ -98,6 +98,7 @@ # html_theme = 'sphinx_rtd_theme' + # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. @@ -105,13 +106,13 @@ # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, +# relative to this directory. +# They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. -# # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { @@ -151,6 +152,7 @@ # 'figure_align': 'htbp', } + # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). @@ -192,16 +194,51 @@ # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = { + 'python': ('https://docs.python.org/', None), + 'h5py': ('http://docs.h5py.org/en/latest/', None) +} # File formats to generate. List of tuples or strings: plot_formats = [('png', 300)] -# This value selects if automatically documented members are sorted alphabetical (value 'alphabetical'), by member type (value 'groupwise') or by source order (value 'bysource'). +# This value selects if automatically documented members are sorted alphabetical (value 'alphabetical'), +# by member type (value 'groupwise') or by source order (value 'bysource'). 
autodoc_member_order = 'bysource' + +# True to parse NumPy style docstrings. +# False to disable support for NumPy style docstrings. +# Defaults to True. +napoleon_numpy_docstring = True + + +# True to parse Google style docstrings. +# False to disable support for Google style docstrings. +# Defaults to True. +napoleon_google_docstring = False + + +# True to use the .. admonition:: directive for the Example and Examples sections. +# False to use the .. rubric:: directive instead. One may look better than the other depending on what HTML theme is used. +# Defaults to False. +napoleon_use_admonition_for_examples = True + + +# True to use the .. admonition:: directive for Notes sections. +# False to use the .. rubric:: directive instead. +# Defaults to False. +napoleon_use_admonition_for_notes = True + + +# True to use the .. admonition:: directive for References sections. +# False to use the .. rubric:: directive instead. +# Defaults to False. +napoleon_use_admonition_for_references = True + + # This value contains a list of modules to be mocked up. # This is useful when some external dependencies are not met at build time and break the building process. # You may only specify the root package of the dependencies themselves and omit the sub-modules: diff --git a/examples/input.py b/examples/input.py old mode 100755 new mode 100644 index bc20d90b..a1a0cf4a --- a/examples/input.py +++ b/examples/input.py @@ -1,13 +1,15 @@ -""" An example input file.
""" +"""An example input file.""" from os.path import (dirname, join) + import yaml -import CAT -from scm.plams.core.settings import Settings +from scm.plams import Settings + +from CAT import base yaml_path = join(dirname(__file__), 'input_settings.yaml') with open(yaml_path, 'r') as file: arg = Settings(yaml.load(file, Loader=yaml.FullLoader)) -qd_df, core_df, ligand_df = CAT.base.prep(arg) +qd_df, core_df, ligand_df = base.prep(arg) diff --git a/examples/input_settings.yaml b/examples/input_settings.yaml index 82fb20a6..d8408171 100644 --- a/examples/input_settings.yaml +++ b/examples/input_settings.yaml @@ -14,7 +14,7 @@ optional: dirname: database read: True write: True - overwrite: True + overwrite: False mol_format: [pdb] mongodb: username: bob # An optional username @@ -30,12 +30,9 @@ optional: dirname: ligand optimize: True split: True - cosmo-rs: True + cosmo-rs: False qd: dirname: QD - optimize: True - dissociate: - core_index: [60, 62, 64, 71, 74, 78] - job1: True - job2: False + optimize: False + dissociate: False diff --git a/requirements.txt b/requirements.txt index bdbea8dc..6f65213d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -sphinx>=2.0 -sphinx_rtd_theme -nbsphinx -numpy -scipy -pandas -pyyaml>=5.1 -schema -pymongo -git+https://github.com/SCM-NV/PLAMS@master -qmflows@git+https://github.com/SCM-NV/qmflows@master +sphinx>=2.0 +sphinx_rtd_theme +nbsphinx +numpy +scipy +pandas +pyyaml>=5.1 +schema +plams@git+https://github.com/SCM-NV/PLAMS@master +qmflows@git+https://github.com/SCM-NV/qmflows@master +nano-cat@git+https://github.com/nlesc-nano/nano-CAT@master +data-cat@git+https://github.com/nlesc-nano/data-CAT@master diff --git a/setup.cfg b/setup.cfg index d89bbd77..fcf281e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ description-file = README.rst [aliases] # Define `python setup.py test` -test=pytest +test = pytest [coverage:run] branch = True diff --git a/setup.py b/setup.py index a7299dc2..e24a8d05 100644 --- 
a/setup.py +++ b/setup.py @@ -17,14 +17,15 @@ setup( name='CAT', version=version['__version__'], - description='A collection of tools designed for the automatic construction, and subsequent analysis, of chemical compounds.', + description=('A collection of tools designed for the automatic ' + 'construction of chemical compounds.'), long_description=readme + '\n\n', + long_description_content_type='text/x-rst', author=['Bas van Beek'], author_email='b.f.van.beek@vu.nl', - url='https://github.com/BvB93/CAT', + url='https://github.com/nlesc-nano/CAT', packages=[ 'CAT', - 'CAT.analysis', 'CAT.attachment', 'CAT.data', 'CAT.data.coskf', @@ -39,7 +40,7 @@ ] }, entry_points={ - 'console_scripts': ['init_cat=CAT.data_handling.input_parser:main'] + 'console_scripts': ['init_cat=CAT.data_handling.entry_points:main'] }, include_package_data=True, license='GNU Lesser General Public License v3 or later', @@ -58,34 +59,39 @@ 'Development Status :: 4 - Beta', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Chemistry' - 'License :: OSI Approved :: GNU Lesser General Public License', + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 'Natural Language :: English', 'Programming Language :: Python :: 3.7', ], test_suite='tests', + python_requires='>=3.7', install_requires=[ 'numpy', 'scipy', 'pandas', 'pyyaml>=5.1', 'schema', - 'pymongo', 'plams@git+https://github.com/SCM-NV/PLAMS@release', 'qmflows@git+https://github.com/SCM-NV/qmflows@master' ], setup_requires=[ 'pytest-runner', - 'sphinx', - 'sphinx_rtd_theme', - 'recommonmark' ], tests_require=[ 'pytest', 'pytest-cov', + 'pytest-mock', 'pycodestyle', + 'data-CAT@git+https://github.com/nlesc-nano/data-CAT@master', + 'nano-CAT@git+https://github.com/nlesc-nano/nano-CAT@master' ], extras_require={ - 'test': ['pytest', 'pytest-cov', 'pytest-mock', 'nbsphinx', 'pycodestyle'], - 'doc': ['sphinx', 'sphinx_rtd_theme', 'nbsphinx'] + 'test': ['pytest', + 'pytest-cov', + 
'pytest-mock', + 'pycodestyle', + 'data-CAT@git+https://github.com/nlesc-nano/data-CAT@master', + 'nano-CAT@git+https://github.com/nlesc-nano/nano-CAT@master'], + 'doc': ['sphinx>=2.0', 'sphinx_rtd_theme', 'sphinx-autodoc-typehints'] } ) diff --git a/test/__init__.py b/test/__init__.py deleted file mode 100644 index 533b66af..00000000 --- a/test/__init__.py +++ /dev/null @@ -1 +0,0 @@ -""" Various tests. """ diff --git a/test/test_cat.py b/test/test_cat.py deleted file mode 100644 index 4d979ea5..00000000 --- a/test/test_cat.py +++ /dev/null @@ -1,168 +0,0 @@ -"""A module for CAT-related tests.""" - -import contextlib -import io -import os -import shutil - -import yaml -import pandas as pd - -from scm.plams.core.settings import Settings -from scm.plams.mol.molecule import Molecule - -from CAT import Database -from CAT.base import (prep_input, prep_core, prep_ligand) - - -# prepare input -ARG = Settings(yaml.load( - """ - path: test - - input_cores: - - Cd68Se55.xyz: - guess_bonds: False - - input_ligands: - - OC - - OCC - - OCCC - - OCCCC - - optional: - database: - dirname: database - read: True - write: True - overwrite: False - mol_format: [xyz, pdb] - mongodb: null - - core: - dirname: core - dummy: Cl - - ligand: - dirname: ligand - functional_groups: None - optimize: True - split: True - cosmo-rs: False - - qd: - dirname: QD - optimize: False - activation_strain: False - dissociate: False - """, - Loader=yaml.FullLoader)) -LIGAND_DF, CORE_DF = prep_input(ARG) -shutil.rmtree(ARG.optional.database.dirname) - - -def test_prep_core(): - """ Test the :func:`CAT.base.prep_core` function. 
""" - arg = ARG.copy() - core_df = CORE_DF.copy() - - # Check the dataframe - ret = prep_core(core_df, arg) - assert isinstance(ret, pd.DataFrame) - assert ret.shape == core_df.shape - assert 'mol' in ret.columns - - # Check the molecule in the dataframe - core = ret['mol'][0] - assert isinstance(core, Molecule) - assert len(core) == 123 - assert 'Cl' not in [at.symbol for at in core] - assert len(core.properties.dummies) == 26 - assert len(set([at.symbol for at in core.properties.dummies])) == 1 - - -def test_prep_ligand(): - """ Test the :func:`CAT.base.prep_ligand` function with **split** = *True*. """ - arg = ARG.copy() - lig_df = LIGAND_DF.copy() - if os.path.isdir(arg.optional.database.dirname): - shutil.rmtree(arg.optional.database.dirname) - os.mkdir(arg.optional.database.dirname) - data = Database(path=arg.optional.database.dirname) - - # Check while splite=False - arg.optional.ligand.split = False - ret = prep_ligand(lig_df, arg) - assert isinstance(ret, pd.DataFrame) - assert 'mol' in ret.columns - assert ret['mol'].shape[0] == lig_df.shape[0] - - # Check the molecules in the dataframe - lig_list = ret['mol'].values.tolist() - with data.open_csv_lig(data.csv_lig) as db: - assert [6, 9, 12, 15] == sorted([len(lig) for lig in lig_list]) - for lig in lig_list: - assert isinstance(lig, Molecule) - assert 'O' in lig.properties.anchor - assert lig.properties.charge == 0 - assert lig.properties.dummies.properties.charge == 0 - assert lig.properties.name + '.pdb' in os.listdir(arg.optional.ligand.dirname) - assert lig.properties.name + '.xyz' in os.listdir(arg.optional.ligand.dirname) - assert (lig.properties.smiles, lig.properties.anchor) in db.index - - # Check if previous structures can be pulled from the database - f = io.StringIO() - with contextlib.redirect_stdout(f): - prep_ligand(lig_df, arg) - print_list = f.getvalue().splitlines() - for item in print_list: - if item: - assert 'has been pulled from the database' in item - - # Reset the directories - 
shutil.rmtree(arg.optional.database.dirname) - shutil.rmtree(arg.optional.ligand.dirname) - os.mkdir(arg.optional.ligand.dirname) - - -def test_prep_ligand_split(): - """ Test the :func:`CAT.base.prep_ligand` function with **split** = *False*. """ - arg = ARG.copy() - lig_df = LIGAND_DF.copy() - if os.path.isdir(arg.optional.database.dirname): - shutil.rmtree(arg.optional.database.dirname) - os.mkdir(arg.optional.database.dirname) - data = Database(path=arg.optional.database.dirname) - - # Check the dataframe - ret = prep_ligand(lig_df, arg) - assert isinstance(ret, pd.DataFrame) - assert 'mol' in ret.columns - assert ret['mol'].shape[0] == lig_df.shape[0] - - # Check the molecules in the dataframe - lig_list = ret['mol'].values.tolist() - with data.open_csv_lig(data.csv_lig) as db: - assert [5, 8, 11, 14] == sorted([len(lig) for lig in lig_list]) - for lig in lig_list: - assert isinstance(lig, Molecule) - assert 'O' in lig.properties.anchor - assert lig.properties.charge == -1 - assert lig.properties.dummies.properties.charge == -1 - assert lig.properties.name + '.pdb' in os.listdir(arg.optional.ligand.dirname) - assert lig.properties.name + '.xyz' in os.listdir(arg.optional.ligand.dirname) - assert (lig.properties.smiles, lig.properties.anchor) in db.index - - # Check if previous structures can be pulled from the database - f = io.StringIO() - with contextlib.redirect_stdout(f): - prep_ligand(lig_df, arg) - print_list = f.getvalue().splitlines() - for item in print_list: - if item: - assert 'has been pulled from the database' in item - - # Reset the directories - shutil.rmtree(arg.optional.database.dirname) - shutil.rmtree(arg.optional.ligand.dirname) - os.mkdir(arg.optional.ligand.dirname) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..bf7bc5d5 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Various tests.""" diff --git a/tests/test_entry_points.py b/tests/test_entry_points.py new file mode 100644 index 
00000000..d3f9da5d --- /dev/null +++ b/tests/test_entry_points.py @@ -0,0 +1,22 @@ +"""Tests for :mod:`CAT.data_handling.entry_points`.""" + +from os.path import join +from shutil import rmtree + +from CAT.assertion_functions import assert_exception +from CAT.data_handling.entry_points import main + +PATH = 'tests/test_files' + + +def test_main() -> None: + """Test :func:`CAT.data_handling.entry_points.main`.""" + filename = join(PATH, 'input2.yaml') + try: + main([filename]) + finally: + rmtree(join(PATH, 'ligand')) + rmtree(join(PATH, 'qd')) + rmtree(join(PATH, 'database')) + + assert_exception(FileNotFoundError, main, [filename + 'bob']) diff --git a/tests/test_files/Acetate.xyz b/tests/test_files/Acetate.xyz new file mode 100644 index 00000000..2fc8c963 --- /dev/null +++ b/tests/test_files/Acetate.xyz @@ -0,0 +1,9 @@ +7 +ADF 2019.103; BP86/QZ4P; numerical quality: Very Good; scf convergence: 10**-8 au; gradient convergence: 10**-5 au; Cs symmetry +C -0.00000000 -0.01318940 0.01448054 +C 0.00000007 -0.06030598 -1.55487623 +O 0.00000000 -1.12966629 0.60574456 +O -0.00000006 1.15000368 0.50879818 +H 0.88433680 0.47067984 -1.94016799 +H -0.88433666 0.47067978 -1.94016807 +H 0.00000012 -1.09171426 -1.93473861 diff --git a/tests/test_files/Ethylene.xyz b/tests/test_files/Ethylene.xyz new file mode 100644 index 00000000..766c922c --- /dev/null +++ b/tests/test_files/Ethylene.xyz @@ -0,0 +1,8 @@ +6 +ADF 2019.103; BP86/QZ4P; numerical quality: Very Good; scf convergence: 10**-8 au; gradient convergence: 10**-5 au; D2h symmetry +C -0.00000000 0.00000000 0.66626000 +C 0.00000000 -0.00000000 -0.66626000 +H 0.00000000 0.92713000 1.23984000 +H -0.00000000 -0.92713000 1.23984000 +H 0.00000000 0.92713000 -1.23984000 +H -0.00000000 -0.92713000 -1.23984000 diff --git a/tests/test_files/Methanol.mol b/tests/test_files/Methanol.mol new file mode 100644 index 00000000..e69ab991 --- /dev/null +++ b/tests/test_files/Methanol.mol @@ -0,0 +1,16 @@ +ADF 2019.103; BP86/QZ4P; numerical 
quality: Very Good; scf convergence: 10**-8 au; gradient convergence: 10**-5 au; Cs symmetry + + + 6 5 0 0 0 0 0 0 0 0999 V2000 + 0.3452 -0.1163 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + -1.0818 -0.2186 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7326 -1.1411 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7259 0.3989 0.8971 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.7259 0.3989 -0.8971 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.4477 0.6783 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 6 2 1 0 0 0 0 + 1 3 1 0 0 0 0 + 1 5 1 0 0 0 0 + 1 4 1 0 0 0 0 +M END diff --git a/tests/test_files/Methanol.pdb b/tests/test_files/Methanol.pdb new file mode 100644 index 00000000..fb004ed5 --- /dev/null +++ b/tests/test_files/Methanol.pdb @@ -0,0 +1,9 @@ +HETATM 1 C1 UNL 1 0.345 -0.116 0.000 1.00 0.00 C +HETATM 2 O1 UNL 1 -1.082 -0.219 0.000 1.00 0.00 O +HETATM 3 H1 UNL 1 0.733 -1.141 0.000 1.00 0.00 H +HETATM 4 H2 UNL 1 0.726 0.399 0.897 1.00 0.00 H +HETATM 5 H3 UNL 1 0.726 0.399 -0.897 1.00 0.00 H +HETATM 6 H4 UNL 1 -1.448 0.678 0.000 1.00 0.00 H +CONECT 1 2 3 4 5 +CONECT 2 6 +END diff --git a/tests/test_files/Methanol.txt b/tests/test_files/Methanol.txt new file mode 100644 index 00000000..edeba4d3 --- /dev/null +++ b/tests/test_files/Methanol.txt @@ -0,0 +1,3 @@ +Methanol.pdb +Methanol.xyz +Methanol.mol \ No newline at end of file diff --git a/tests/test_files/Methanol.xyz b/tests/test_files/Methanol.xyz new file mode 100644 index 00000000..e9447f99 --- /dev/null +++ b/tests/test_files/Methanol.xyz @@ -0,0 +1,8 @@ +6 +ADF 2019.103; BP86/QZ4P; numerical quality: Very Good; scf convergence: 10**-8 au; gradient convergence: 10**-5 au; Cs symmetry +C 0.34517500 -0.11630200 0.00000000 +H -1.44772100 0.67832000 0.00000000 +O -1.08180100 -0.21862000 0.00000000 +H 0.72586900 0.39885500 -0.89710000 +H 0.72586900 0.39885500 0.89710000 +H 0.73261000 -1.14110600 0.00000000 diff --git a/tests/test_files/Methanol_rotate.xyz b/tests/test_files/Methanol_rotate.xyz new file mode 100644 index 00000000..00e7389c --- /dev/null +++ 
b/tests/test_files/Methanol_rotate.xyz @@ -0,0 +1,8 @@ +6 +ADF 2019.103; BP86/QZ4P; numerical quality: Very Good; scf convergence: 10**-8 au; gradient convergence: 10**-5 au; Cs symmetry +C -0.24747977 0.24406019 0.10890529 +H 0.95129102 -1.19158162 -0.48082461 +O 1.04091153 -0.25086940 -0.26768735 +H -0.98256976 0.15602753 -0.70779295 +H -0.64374132 -0.26394362 1.00333918 +H -0.11841069 1.30630891 0.34406045 diff --git a/test/core/Cd68Se55.xyz b/tests/test_files/core/Cd68Se55.xyz similarity index 100% rename from test/core/Cd68Se55.xyz rename to tests/test_files/core/Cd68Se55.xyz diff --git a/tests/test_files/freq.npy b/tests/test_files/freq.npy new file mode 100644 index 00000000..7e622cf8 Binary files /dev/null and b/tests/test_files/freq.npy differ diff --git a/tests/test_files/input1.yaml b/tests/test_files/input1.yaml new file mode 100644 index 00000000..76ea251e --- /dev/null +++ b/tests/test_files/input1.yaml @@ -0,0 +1,33 @@ +path: tests/test_files + +input_cores: + - Cd68Se55.xyz: + guess_bonds: False + +input_ligands: + - CO + - CCO + +optional: + database: + dirname: database + read: True + write: True + overwrite: False + mol_format: [pdb] + mongodb: False + + core: + dirname: core + dummy: Cl + + ligand: + dirname: ligand + optimize: True + split: True + cosmo-rs: False + + qd: + dirname: QD + optimize: True + dissociate: False diff --git a/tests/test_files/input2.yaml b/tests/test_files/input2.yaml new file mode 100644 index 00000000..be601990 --- /dev/null +++ b/tests/test_files/input2.yaml @@ -0,0 +1,32 @@ +path: tests/test_files + +input_cores: + - Cd68Se55.xyz + +input_ligands: + - CO + - CCO + +optional: + database: + dirname: database + read: True + write: True + overwrite: False + mol_format: False + mongodb: False + + core: + dirname: core + dummy: Cl + + ligand: + dirname: ligand + optimize: True + split: True + cosmo-rs: False + + qd: + dirname: QD + optimize: False + dissociate: False diff --git a/tests/test_files/settings.yaml 
b/tests/test_files/settings.yaml new file mode 100644 index 00000000..98aae45b --- /dev/null +++ b/tests/test_files/settings.yaml @@ -0,0 +1,3 @@ +key1: + key2: + key3: True diff --git a/tests/test_frozen_settings.py b/tests/test_frozen_settings.py new file mode 100644 index 00000000..eb69bd82 --- /dev/null +++ b/tests/test_frozen_settings.py @@ -0,0 +1,32 @@ +"""Tests for :mod:`CAT.frozen_settings`.""" + +from CAT.frozen_settings import FrozenSettings +from CAT.assertion_functions import (assert_eq, assert_exception, assert_id, Invert) + +SETTINGS = FrozenSettings({'a': True, 'b': False, 'c': [1, 2, 3, 4]}) + + +def test_missing() -> None: + """Tests for :meth:`CAT.frozen_settings.FrozenSettings.__missing__`.""" + item = SETTINGS.d + assert_eq(item, FrozenSettings()) + + +def test_delitem() -> None: + """Tests for :meth:`CAT.frozen_settings.FrozenSettings.__delitem__`.""" + args = ('a') + assert_exception(TypeError, SETTINGS.__delitem__, args) + + +def test_setitem() -> None: + """Tests for :meth:`CAT.frozen_settings.FrozenSettings.__setitem__`.""" + args = ('d', True) + assert_exception(TypeError, SETTINGS.__setitem__, args) + + +def test_copy() -> None: + """Tests for :meth:`CAT.frozen_settings.FrozenSettings.copy`.""" + settings = SETTINGS.copy() + assert_eq(settings, SETTINGS) + with Invert(assert_id) as func: + func(settings, SETTINGS) diff --git a/test/test_lint.py b/tests/test_lint.py similarity index 84% rename from test/test_lint.py rename to tests/test_lint.py index af3677d3..b4e46af8 100644 --- a/test/test_lint.py +++ b/tests/test_lint.py @@ -1,13 +1,13 @@ -""" Lint tests """ +"""Test CAT for pep8 compliance.""" import os import textwrap import pycodestyle # formerly known as pep8 -def test_pep8_conformance(): - """Test that we conform to PEP-8.""" - check_paths = ['CAT', 'test'] +def test_pep8_conformance() -> None: + """Test that CAT conforms to PEP-8.""" + check_paths = ['CAT', 'tests'] exclude_paths = [] print("PEP8 check of directories: 
{}\n".format(', '.join(check_paths))) @@ -18,6 +18,7 @@ def test_pep8_conformance(): for i, path in enumerate(paths): paths[i] = os.path.join(package_root, path) + # Increase the maximum amount of characters per line from 79 to 100 style = pycodestyle.StyleGuide(max_line_length=100) style.options.exclude.extend(exclude_paths) diff --git a/tests/test_mol_import.py b/tests/test_mol_import.py new file mode 100644 index 00000000..24712b2d --- /dev/null +++ b/tests/test_mol_import.py @@ -0,0 +1,158 @@ +"""Tests for :mod:`CAT.data_handling.mol_import`.""" + +import random +from os.path import join + +import numpy as np + +from scm.plams import (Settings, Molecule) +import scm.plams.interfaces.molecule.rdkit as molkit + +from CAT.assertion_functions import (assert_eq, assert_lt, assert_instance) +from CAT.data_handling.mol_import import ( + read_mol_xyz, read_mol_pdb, read_mol_mol, read_mol_smiles, read_mol_plams, read_mol_rdkit, + read_mol_folder, read_mol_txt, get_charge_dict, set_mol_prop, canonicalize_mol +) + +PATH = 'tests/test_files' +REF_MOL = Molecule(join(PATH, 'Methanol.xyz')) +REF_MOL.guess_bonds() +canonicalize_mol(REF_MOL) + + +def test_read_mol_xyz() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_xyz`.""" + xyz = join(PATH, 'Methanol.xyz') + mol_dict = Settings({'mol': xyz, 'guess_bonds': True}) + mol = read_mol_xyz(mol_dict) + + assert_instance(mol, Molecule) + np.testing.assert_allclose(mol.as_array(), REF_MOL.as_array()) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_pdb() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_pdb`.""" + pdb = join(PATH, 'Methanol.pdb') + mol_dict = Settings({'mol': pdb, 'guess_bonds': False}) + mol = read_mol_pdb(mol_dict) + + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_mol() -> None: + """Test 
:func:`CAT.data_handling.validate_input.read_mol_mol`.""" + mol_file = join(PATH, 'Methanol.mol') + mol_dict = Settings({'mol': mol_file, 'guess_bonds': False}) + mol = read_mol_mol(mol_dict) + + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_smiles() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_smiles`.""" + smiles = 'CO' + mol_dict = Settings({'mol': smiles, 'guess_bonds': False}) + mol = read_mol_smiles(mol_dict) + + assert_instance(mol, Molecule) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_plams() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_smiles`.""" + mol = REF_MOL.copy() + random.shuffle(mol.atoms) + mol_dict = Settings({'mol': mol, 'guess_bonds': False}) + mol = read_mol_plams(mol_dict) + + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_rdkit() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_rdkit`.""" + mol = REF_MOL.copy() + random.shuffle(mol.atoms) + rdmol = molkit.to_rdmol(mol) + mol_dict = Settings({'mol': rdmol, 'guess_bonds': False}) + mol = read_mol_rdkit(mol_dict) + + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_folder() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_folder`.""" + mol_dict = Settings({'mol': PATH, 'path': PATH, 'guess_bonds': True, 'is_core': False}) + _mol_list = read_mol_folder(mol_dict) + mol_list = [mol for mol in _mol_list if mol.get_formula() == 'C1H4O1'] + + for mol in mol_list: + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - 
REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + +def test_read_mol_txt() -> None: + """Test :func:`CAT.data_handling.validate_input.read_mol_txt`.""" + txt = join(PATH, 'Methanol.txt') + mol_dict = Settings({'mol': txt, 'path': PATH, 'guess_bonds': True, 'is_core': False}) + mol_list = read_mol_txt(mol_dict) + + for mol in mol_list[:-1]: + assert_instance(mol, Molecule) + assert_lt(mol.as_array().sum() - REF_MOL.as_array().sum(), 0.01) + assert_eq([at.symbol for at in mol], [at.symbol for at in REF_MOL]) + + assert_instance(mol_list[-1], Molecule) + assert_eq([at.symbol for at in mol_list[-1]], [at.symbol for at in REF_MOL]) + + +def test_get_charge_dict() -> None: + """Test :func:`CAT.data_handling.validate_input.get_charge_dict`.""" + charge_dict = get_charge_dict() + ref = { + 'Li': 1, 'Na': 1, 'K': 1, 'Rb': 1, 'Cs': 1, + 'Be': 2, 'Mg': 2, 'Ca': 2, 'Sr': 2, 'Ba': 2, + 'N': -3, 'P': -3, 'As': -3, 'Sb': -3, 'Bi': -3, + 'O': -2, 'S': -2, 'Se': -2, 'Te': -2, 'Po': -2, + 'H': -1, 'F': -1, 'Cl': -1, 'Br': -1, 'I': -1, 'At': -1, + 'Cd': 2, 'Pb': 2 + } + + assert_eq(charge_dict, ref) + + +def test_set_mol_prop() -> None: + """Test :func:`CAT.data_handling.validate_input.set_mol_prop`.""" + mol = REF_MOL.copy() + mol.properties = Settings() + mol_dict = Settings({'is_core': False, 'path': PATH, 'name': 'CO'}) + + set_mol_prop(mol, mol_dict) + ref = {'name': 'CO', 'dummies': {}, 'path': PATH, 'job_path': [], 'smiles': 'CO'} + assert_eq(mol.properties, ref) + + ref1 = Settings({ + 'stereo': {}, 'charge': 0, + 'pdb_info': {'ResidueName': 'LIG', 'Occupancy': 1.0, 'TempFactor': 0.0, + 'ResidueNumber': 1, 'ChainId': 'A', 'IsHeteroAtom': False}, + }) + ref2 = Settings({ + 'stereo': {}, 'charge': 0, + 'pdb_info': {'ResidueName': 'LIG', 'Occupancy': 1.0, 'TempFactor': 0.0, + 'ResidueNumber': 1, 'ChainId': 'A', 'IsHeteroAtom': True}, + }) + for at in mol: + del at.properties.pdb_info.Name + if at.symbol == 'O': + 
assert_eq(at.properties, ref2) + else: + assert_eq(at.properties, ref1) diff --git a/tests/test_mol_utils.py b/tests/test_mol_utils.py new file mode 100644 index 00000000..d0f855db --- /dev/null +++ b/tests/test_mol_utils.py @@ -0,0 +1,144 @@ +"""Tests for :mod:`CAT.mol_utils`.""" + +from os.path import join +from itertools import chain + +from scm.plams import (Molecule, PeriodicTable, PTError) +import scm.plams.interfaces.molecule.rdkit as molkit + +from CAT.assertion_functions import (assert_eq, assert_isin, assert_exception) +from CAT.mol_utils import ( + from_mol_other, from_rdmol, get_index, merge_mol, separate_mod, + to_atnum, to_symbol, adf_connectivity, fix_carboxyl, fix_h +) + +PATH = 'tests/test_files' +MOL = Molecule(join(PATH, 'Methanol.xyz')) # Methanol; BP86/QZ4P +MOL.guess_bonds() + + +def test_from_mol_other() -> None: + """Test :meth:`Molecule.from_mol_other`.""" + mol = MOL.copy() + mol_rot = Molecule(join(PATH, 'Methanol_rotate.xyz')) + mol.from_mol_other(mol_rot) + + for at, at_ref in zip(mol, mol_rot): + assert_eq(at.coords, at_ref.coords) + assert_eq(at.symbol, at_ref.symbol) + + +def test_from_rdmol() -> None: + """Test :meth:`Molecule.from_rdmol`.""" + mol = MOL.copy() + mol_rot = molkit.to_rdmol(Molecule(join(PATH, 'Methanol_rotate.xyz'))) + mol.from_rdmol(mol_rot) + + conf = mol_rot.GetConformer() + for at, at_ref in zip(mol, mol_rot.GetAtoms()): + pos = conf.GetAtomPosition(at_ref.GetIdx()) + coords = pos.x, pos.y, pos.z + symbol = at_ref.GetSymbol() + assert_eq(at.coords, coords) + assert_eq(at.symbol, symbol) + + +def test_get_index() -> None: + """Test :meth:`Molecule.get_index`.""" + for j, at in enumerate(MOL, 1): + i = MOL.get_index(at) + assert_eq(i, j) + + ref = [(1, 3), (2, 3), (1, 6), (1, 4), (1, 5)] + for bond, j in zip(MOL.bonds, ref): + i = MOL.get_index(bond) + assert_eq(i, j) + + +def test_merge_mol() -> None: + """Test :meth:`Molecule.merge_mol`.""" + mol = MOL.copy() + mol_list = [mol.copy() for _ in range(10)] + 
atom_list = list(chain.from_iterable(mol_list)) + mol.atoms + bond_list = list(chain.from_iterable(m.bonds for m in mol_list)) + mol.bonds + mol.merge_mol(mol_list) + + assert_eq(len(mol.atoms), len(atom_list)) + assert_eq(len(mol.bonds), len(bond_list)) + for at in mol.atoms: + assert_isin(at, atom_list) + for bond in mol.bonds: + assert_isin(bond, bond_list) + + +def test_separate_mod() -> None: + """Test :meth:`Molecule.separate_mod`.""" + mol = MOL.copy() + mol_list = mol.separate_mod() + + for m in mol_list: + for at in m.atoms: + assert_isin(at, mol) + assert_isin(at, mol.atoms) + for bond in m.bonds: + assert_isin(bond, mol.bonds) + + +def test_to_atnum() -> None: + """Test :func:`CAT.mol_utils.to_atnum`.""" + for j, (symbol, *_) in enumerate(PeriodicTable.data): + i = to_atnum(symbol) + assert_eq(i, j) + assert_eq(to_atnum(j), j) + assert_exception(TypeError, to_atnum, {}) + assert_exception(TypeError, to_atnum, []) + assert_exception(TypeError, to_atnum, ()) + assert_exception(PTError, to_atnum, 'bob') + assert_exception(PTError, to_atnum, 'bill') + + +def test_to_symbol() -> None: + """Test :func:`CAT.mol_utils.to_symbol`.""" + for j, (symbol, *_) in enumerate(PeriodicTable.data): + i = to_symbol(j) + assert_eq(i, symbol) + assert_eq(to_symbol(symbol), symbol) + assert_exception(TypeError, to_symbol, {}) + assert_exception(TypeError, to_symbol, []) + assert_exception(TypeError, to_symbol, ()) + assert_exception(PTError, to_symbol, 999) + assert_exception(PTError, to_symbol, -999) + + +def test_adf_connectivity() -> None: + """Test :func:`CAT.mol_utils.adf_connectivity`.""" + ref = ['1 3 1.0', '2 3 1.0', '1 6 1.0', '1 4 1.0', '1 5 1.0'] + connectivity_list = adf_connectivity(MOL) + assert_eq(connectivity_list, ref) + + +def test_fix_carboxyl() -> None: + """Test :func:`CAT.mol_utils.fix_carboxyl`.""" + mol = Molecule(join(PATH, 'Acetate.xyz')) # Acetate; BP86/QZ4P + mol.guess_bonds() + mol[1, 3].order = 2 + mol[1, 4].order = 1 + mol[4].move_to([0.200000, 
-1.129666, 0.605745]) + mol[4].properties.charge = -1 + + fix_carboxyl(mol) + C, O1, O2 = mol[1], mol[3], mol[4] + angle = C.angle(O1, O2, result_unit='degree') + assert_eq(round(angle), 120) + + +def test_fix_h() -> None: + """Test :func:`CAT.mol_utils.fix_h`.""" + mol = Molecule(join(PATH, 'Ethylene.xyz')) # Ethylene; BP86/QZ4P + mol.guess_bonds() + mol[3].move_to([0.0, 0.3, -0.4]) + + fix_h(mol) + H, C1, C2 = mol[3], mol[1], mol[2] + angle = C1.angle(H, C2, result_unit='degree') + assert_eq(round(angle), 120) diff --git a/tests/test_schemas.py b/tests/test_schemas.py new file mode 100644 index 00000000..f37511ed --- /dev/null +++ b/tests/test_schemas.py @@ -0,0 +1,422 @@ +"""Tests for :mod:`CAT.data_handling.validation_schemas`.""" + +from os.path import join + +from schema import SchemaError + +from scm.plams import AMSJob, ADFJob, Settings + +from CAT.utils import get_template +from CAT.assertion_functions import (assert_eq, assert_id, assert_exception) +from CAT.data_handling.validation_schemas import ( + mol_schema, core_schema, ligand_schema, qd_schema, database_schema, + mongodb_schema, bde_schema, qd_opt_schema, crs_schema +) + +try: + from nanoCAT import CRSJob +except ModuleNotFoundError: + from scm.plams.core.basejob import Job + CRSJob = Job + +PATH = 'tests/test_files' + + +def test_mol_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.mol_schema`.""" + mol_dict = {} + args = SchemaError, mol_schema.validate, mol_dict + + assert_eq(mol_schema.validate(mol_dict), {'guess_bonds': False}) + + mol_dict['guess_bonds'] = 1 # Exception: incorrect type + assert_exception(*args) + mol_dict['guess_bonds'] = True # Correct + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + mol_dict['is_core'] = 1 # Exception: incorrect type + assert_exception(*args) + mol_dict['is_core'] = True + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + mol_dict['column'] = -1 # Exception: value < 0 + assert_exception(*args) + mol_dict['column'] = 1.0 # 
Exception: incorrect type + assert_exception(*args) + mol_dict['column'] = 1 + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + mol_dict['row'] = -1 # Exception: value < 0 + assert_exception(*args) + mol_dict['row'] = 1.0 # Exception: incorrect type + assert_exception(*args) + mol_dict['row'] = 1 + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + mol_dict['indices'] = 1.0 # Exception: incorrect type + assert_exception(*args) + mol_dict['indices'] = [1, 5, 6, 7.0] # Exception: an element has an incorrect type + assert_exception(*args) + mol_dict['indices'] = (i for i in range(10)) # Exception: incorrect type + assert_exception(*args) + mol_dict['indices'] = -1 # Exception: value < 0 + assert_exception(*args) + mol_dict['indices'] = [-1, -2, -3, -4, -5] # Exception: an element is < 0 + assert_exception(*args) + mol_dict['indices'] = [1, 1, 2] # Exception: duplicate elements + assert_exception(*args) + + mol_dict['indices'] = 1 + assert_eq(mol_schema.validate(mol_dict)['indices'], (1,)) + mol_dict['indices'] = [1, 2, 3, 4, 5] + assert_eq(mol_schema.validate(mol_dict)['indices'], (1, 2, 3, 4, 5)) + mol_dict['indices'] = {1, 2, 3, 4, 5} + assert_eq(mol_schema.validate(mol_dict)['indices'], (1, 2, 3, 4, 5)) + mol_dict['indices'] = (1, 2, 3, 4, 5) + + mol_dict['type'] = 1 # Exception: incorrect type + assert_exception(*args) + mol_dict['type'] = 'bob' + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + mol_dict['name'] = 1 # Exception: incorrect type + assert_exception(*args) + mol_dict['name'] = 'bob' + assert_eq(mol_schema.validate(mol_dict), mol_dict) + + +def test_database_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.database_schema`.""" + db_dict = {'dirname': '.'} + ref = { + 'dirname': '.', + 'read': ('core', 'ligand', 'qd'), + 'write': ('core', 'ligand', 'qd'), + 'overwrite': (), + 'mongodb': {}, + 'mol_format': ('pdb', 'xyz') + + } + + assert_eq(database_schema.validate(db_dict), ref) + + for key in ('read', 'write', 
'overwrite', 'mol_format', 'mongodb'): + _db_dict = db_dict.copy() + args = SchemaError, database_schema.validate, _db_dict + + _db_dict[key] = 1 # Exception: incorrect type + assert_exception(*args) + _db_dict[key] = 'bob' # Exception: incorrect value + assert_exception(*args) + _db_dict[key] = [1] # Exception: element has incorrect type + assert_exception(*args) + _db_dict[key] = ['bob'] # Exception: element has incorrect value + assert_exception(*args) + + args = SchemaError, database_schema.validate, db_dict + db_dict['mongodb'] = True # Exception: incorrect value + assert_exception(*args) + db_dict['mongodb'] = False + assert_eq(database_schema.validate(db_dict), ref) + db_dict['mongodb'] = {} + assert_eq(database_schema.validate(db_dict), ref) + + +def test_ligand_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.ligand_schema`.""" + lig_dict = {'dirname': '.'} + ref = { + 'dirname': '.', + 'functional_groups': None, + 'optimize': True, + 'split': True, + 'cosmo-rs': False + } + args = SchemaError, ligand_schema.validate, lig_dict + + assert_eq(ligand_schema.validate(lig_dict), ref) + + lig_dict['optimize'] = 1 # Exception: incorrect type + assert_exception(*args) + lig_dict['optimize'] = True + + lig_dict['split'] = 1 # Exception: incorrect type + assert_exception(*args) + lig_dict['split'] = True + + lig_dict['cosmo-rs'] = 1 # Exception: incorrect type + assert_exception(*args) + lig_dict['cosmo-rs'] = {} + assert_eq(ligand_schema.validate(lig_dict)['cosmo-rs'], {}) + lig_dict['cosmo-rs'] = False + assert_id(ligand_schema.validate(lig_dict)['cosmo-rs'], False) + lig_dict['cosmo-rs'] = True + assert_eq(ligand_schema.validate(lig_dict)['cosmo-rs'], {'job1': AMSJob}) + + lig_dict['functional_groups'] = 1 # Exception: incorrect type + assert_exception(*args) + lig_dict['functional_groups'] = 'CO' + assert_eq(ligand_schema.validate(lig_dict)['functional_groups'], ('CO',)) + lig_dict['functional_groups'] = ['CO'] + 
assert_eq(ligand_schema.validate(lig_dict)['functional_groups'], ('CO',)) + lig_dict['functional_groups'] = ['CO', 'CO'] # Exception: duplicate elements + assert_exception(*args) + + +def test_core_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.core_schema`.""" + core_dict = {'dirname': '.'} + ref = { + 'dirname': '.', + 'dummy': 17 + } + args = SchemaError, core_schema.validate, core_dict + + assert_eq(core_schema.validate(core_dict), ref) + + core_dict['dummy'] = 1.0 # Exception: incorrect type + assert_exception(*args) + core_dict['dummy'] = 'H' + assert_eq(core_schema.validate(core_dict)['dummy'], 1) + core_dict['dummy'] = 1 + assert_eq(core_schema.validate(core_dict)['dummy'], 1) + + +def test_qd_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.qd_schema`.""" + qd_dict = {'dirname': '.'} + ref = { + 'dirname': '.', + 'activation_strain': False, + 'optimize': False, + 'dissociate': False + } + args = SchemaError, qd_schema.validate, qd_dict + + assert_eq(qd_schema.validate(qd_dict), ref) + + qd_dict['activation_strain'] = 1 # Exception: incorrect type + assert_exception(*args) + qd_dict['activation_strain'] = True + + qd_dict['optimize'] = 1 # Exception: incorrect type + assert_exception(*args) + qd_dict['optimize'] = True + assert_eq(qd_schema.validate(qd_dict)['optimize'], {'job1': AMSJob}) + qd_dict['optimize'] = False + assert_id(qd_schema.validate(qd_dict)['optimize'], False) + + qd_dict['dissociate'] = 1 # Exception: incorrect type + assert_exception(*args) + qd_dict['dissociate'] = True # Exception: incorrect value + assert_exception(*args) + qd_dict['dissociate'] = False + assert_id(qd_schema.validate(qd_dict)['dissociate'], False) + + +def test_mongodb_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.mongodb_schema`.""" + mongodb_dict = {} + ref = { + 'host': 'localhost', + 'port': 27017 + } + args = SchemaError, mongodb_schema.validate, mongodb_dict + + 
assert_eq(mongodb_schema.validate(mongodb_dict), ref) + + mongodb_dict['port'] = 5.0 # Exception: incorrect type + assert_exception(*args) + mongodb_dict['port'] = 27017 + + mongodb_dict['host'] = 5.0 # Exception: incorrect type + assert_exception(*args) + mongodb_dict['host'] = 'localhost' + assert_eq(mongodb_schema.validate(mongodb_dict)['host'], 'localhost') + mongodb_dict['host'] = 51 + assert_eq(mongodb_schema.validate(mongodb_dict)['host'], 51) + + mongodb_dict['username'] = 5.0 # Exception: incorrect type + assert_exception(*args) + mongodb_dict['username'] = 'bob' + assert_eq(mongodb_schema.validate(mongodb_dict)['username'], 'bob') + mongodb_dict['username'] = 52 + assert_eq(mongodb_schema.validate(mongodb_dict)['username'], 52) + + mongodb_dict['password'] = 5.0 # Exception: incorrect type + assert_exception(*args) + mongodb_dict['password'] = 'secret' + assert_eq(mongodb_schema.validate(mongodb_dict)['password'], 'secret') + mongodb_dict['password'] = 53 + assert_eq(mongodb_schema.validate(mongodb_dict)['password'], 53) + + +def test_qd_opt_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.qd_opt_schema`.""" + _qd_opt_s1_default = get_template('qd.yaml')['UFF'] + _qd_opt_s2_default = _qd_opt_s1_default + + qd_opt_dict = Settings() + ref = Settings({ + 'job1': AMSJob, + 's1': _qd_opt_s1_default, + 'job2': AMSJob, + 's2': _qd_opt_s2_default + }) + args = SchemaError, qd_opt_schema.validate, qd_opt_dict + + assert_eq(qd_opt_schema.validate(qd_opt_dict), ref) + + for job in ('job1', 'job2'): + qd_opt_dict[job] = 1 # Exception: incorrect type + assert_exception(*args) + qd_opt_dict[job] = int # Exception: incorrect value + assert_exception(*args) + qd_opt_dict[job] = 'bob' # Exception: incorrect value + assert_exception(*args) + qd_opt_dict[job] = 'ADFJob' + assert_id(qd_opt_schema.validate(qd_opt_dict)[job], ADFJob) + qd_opt_dict[job] = 'ADFJOB' + assert_id(qd_opt_schema.validate(qd_opt_dict)[job], ADFJob) + qd_opt_dict[job] = ADFJob + 
assert_id(qd_opt_schema.validate(qd_opt_dict)[job], ADFJob) + + ref = {'key1': {'key2': {'key3': True}}} + + for s in ('s1', 's2'): + qd_opt_dict[s] = 1 # Exception: incorrect type + assert_exception(*args) + qd_opt_dict[s] = {'key1': {'key2': {'key3': True}}} + assert_eq(qd_opt_schema.validate(qd_opt_dict)[s], ref) + qd_opt_dict[s] = join(PATH, 'settings.yaml') + assert_eq(qd_opt_schema.validate(qd_opt_dict)[s], ref) + + +def test_crs_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.crs_schema`.""" + _crs_s1_default = get_template('qd.yaml')['COSMO-MOPAC'] + _crs_s2_default = get_template('qd.yaml')['COSMO-RS activity coefficient'] + _crs_s2_default.update(get_template('crs.yaml')['MOPAC PM6']) + + crs_dict = Settings() + ref = Settings({ + 'job1': AMSJob, + 's1': _crs_s1_default, + 'job2': CRSJob, + 's2': _crs_s2_default + }) + args = SchemaError, crs_schema.validate, crs_dict + + assert_eq(crs_schema.validate(crs_dict), ref) + + for job in ('job1', 'job2'): + crs_dict[job] = 1 # Exception: incorrect type + assert_exception(*args) + crs_dict[job] = int # Exception: incorrect value + assert_exception(*args) + crs_dict[job] = 'bob' # Exception: incorrect value + assert_exception(*args) + crs_dict[job] = 'ADFJob' + assert_id(crs_schema.validate(crs_dict)[job], ADFJob) + crs_dict[job] = 'ADFJOB' + assert_id(crs_schema.validate(crs_dict)[job], ADFJob) + crs_dict[job] = ADFJob + assert_id(crs_schema.validate(crs_dict)[job], ADFJob) + + ref = {'key1': {'key2': {'key3': True}}} + + for s in ('s1', 's2'): + crs_dict[s] = 1 # Exception: incorrect type + assert_exception(*args) + crs_dict[s] = {'key1': {'key2': {'key3': True}}} + assert_eq(crs_schema.validate(crs_dict)[s], ref) + crs_dict[s] = join(PATH, 'settings.yaml') + assert_eq(crs_schema.validate(crs_dict)[s], ref) + + +def test_bde_schema() -> None: + """Test :data:`CAT.data_handling.validation_schemas.bde_schema`.""" + _bde_s1_default = get_template('qd.yaml')['MOPAC'] + _bde_s2_default = 
get_template('qd.yaml')['UFF'] + + bde_dict = Settings({'core_atom': 'Cd', 'lig_count': 2}) + ref = Settings({ + 'core_atom': 48, + 'lig_count': 2, + 'core_core_dist': 5.0, + 'lig_core_dist': 5.0, + 'topology': {}, + 'job1': AMSJob, + 's1': _bde_s1_default + }) + args = SchemaError, bde_schema.validate, bde_dict + + assert_eq(bde_schema.validate(bde_dict), ref) + + bde_dict['core_atom'] = 5.0 # Exception: incorrect type + assert_exception(*args) + bde_dict['core_atom'] = 'H' + assert_eq(bde_schema.validate(bde_dict)['core_atom'], 1) + bde_dict['core_atom'] = 1 + assert_eq(bde_schema.validate(bde_dict)['core_atom'], 1) + + bde_dict['lig_count'] = 5.0 # Exception: incorrect type + assert_exception(*args) + bde_dict['lig_count'] = -1 # Exception: incorrect value + assert_exception(*args) + bde_dict['lig_count'] = 3 + assert_eq(bde_schema.validate(bde_dict)['lig_count'], 3) + + bde_dict['core_index'] = 5.0 # Exception: incorrect type + assert_exception(*args) + bde_dict['core_index'] = [1, 2, 3, 4, 5.0] # Exception: incorrect element type + assert_exception(*args) + bde_dict['core_index'] = [1, 2, 3, 4, 4] # Exception: duplicate elements + assert_exception(*args) + bde_dict['core_index'] = 1 + assert_eq(bde_schema.validate(bde_dict)['core_index'], (1,)) + bde_dict['core_index'] = [1, 2, 3] + assert_eq(bde_schema.validate(bde_dict)['core_index'], (1, 2, 3)) + bde_dict['core_index'] = {1, 2, 3} + assert_eq(bde_schema.validate(bde_dict)['core_index'], (1, 2, 3)) + + bde_dict['topology'] = 5.0 # Exception: incorrect type + assert_exception(*args) + bde_dict['topology'] = {'key': 'value'} # Exception: incorrect value + assert_exception(*args) + bde_dict['topology'] = {1: 'value'} + assert_eq(bde_schema.validate(bde_dict)['topology'], {1: 'value'}) + + for dist in ('core_core_dist', 'lig_core_dist'): + bde_dict[dist] = 'bob' # Exception: incorrect type + assert_exception(*args) + bde_dict[dist] = -1 # Exception: incorrect value + assert_exception(*args) + bde_dict[dist] = 4 
+ assert_eq(bde_schema.validate(bde_dict)[dist], 4.0) + bde_dict[dist] = 4.0 + assert_eq(bde_schema.validate(bde_dict)[dist], 4.0) + + for job in ('job1', 'job2'): + bde_dict[job] = 1 # Exception: incorrect type + assert_exception(*args) + bde_dict[job] = int # Exception: incorrect value + assert_exception(*args) + bde_dict[job] = 'bob' # Exception: incorrect value + assert_exception(*args) + bde_dict[job] = 'ADFJob' + assert_id(bde_schema.validate(bde_dict)[job], ADFJob) + bde_dict[job] = 'ADFJOB' + assert_id(bde_schema.validate(bde_dict)[job], ADFJob) + bde_dict[job] = ADFJob + assert_id(bde_schema.validate(bde_dict)[job], ADFJob) + + ref = {'key1': {'key2': {'key3': True}}} + + for s in ('s1', 's2'): + bde_dict[s] = 1 # Exception: incorrect type + assert_exception(*args) + bde_dict[s] = {'key1': {'key2': {'key3': True}}} + assert_eq(bde_schema.validate(bde_dict)[s], ref) + bde_dict[s] = join(PATH, 'settings.yaml') + assert_eq(bde_schema.validate(bde_dict)[s], ref) diff --git a/tests/test_settings_dataframe.py b/tests/test_settings_dataframe.py new file mode 100644 index 00000000..797c43e4 --- /dev/null +++ b/tests/test_settings_dataframe.py @@ -0,0 +1,23 @@ +"""Tests for :mod:`CAT.settings_dataframe`.""" + +import numpy as np + +from CAT.assertion_functions import assert_eq +from CAT.frozen_settings import FrozenSettings +from CAT.settings_dataframe import (SettingsDataFrame, SettingsSeries) + +_DICT = {'a': True, 'b': False, 'c': [1, 2, 3, 4]} +DF = SettingsDataFrame(np.random.rand(10, 3), settings=_DICT) +SERIES = SettingsSeries(np.random.rand(10), settings=_DICT) + + +def test_df_and_series() -> None: + """Tests for :class:`.SettingsDataFrame` and :class:`.SettingsSeries`.""" + settings = FrozenSettings(_DICT) + + assert_eq(DF.settings, settings) + assert_eq(SERIES.settings, settings) + assert_eq(DF[0].settings, settings) + assert_eq(SERIES.to_frame().settings, settings) + assert_eq(DF.copy().settings, settings) + assert_eq(SERIES.copy().settings, settings) 
diff --git a/tests/test_thermo_chem.py b/tests/test_thermo_chem.py new file mode 100644 index 00000000..bf93aeea --- /dev/null +++ b/tests/test_thermo_chem.py @@ -0,0 +1,73 @@ +"""Tests for :mod:`CAT.thermo_chem`.""" + +from os.path import join + +import numpy as np + +from scm.plams import (Molecule, Units) + +from CAT.assertion_functions import assert_eq +from CAT.thermo_chem import (get_entropy, get_thermo) + +PATH = 'tests/test_files' +MOL = Molecule(join(PATH, 'Methanol.xyz')) # Methanol; BP86/QZ4P +FREQ = np.load(join(PATH, 'freq.npy')) # cm**-1 + + +def test_get_entropy() -> None: + """Tests for :func:`CAT.thermo_chem.get_entropy`.""" + ref1 = np.array([143.78052972, 81.90558458, 2308.88449109]) + S1 = get_entropy(MOL, FREQ) + np.testing.assert_allclose(ref1, S1) + + # Test with a different temperature + ref2 = np.array([149.8889032, 85.57060867, 2338.20467688]) + S2 = get_entropy(MOL, FREQ, T=400) + np.testing.assert_allclose(ref2, S2) + + +def test_get_thermo() -> None: + """Tests for :func:`CAT.thermo_chem.get_thermo`.""" + ref1 = {'E': 0.0, + 'U': 36.876336257248056, + 'H': 37.46882030779777, + 'S': 0.07643865635175046, + 'G': 14.678634916523373} + thermo1 = get_thermo(MOL, FREQ) + for k, v in thermo1.items(): + i, j = round(v, 8), round(ref1[k], 8) + assert_eq(i, j) + + # Test with E != 0.0 + ref2 = {'E': -692.08, + 'U': -655.203663742752, + 'H': -654.6111796922023, + 'S': 0.07643865635175046, + 'G': -677.4013650834768} + thermo2 = get_thermo(MOL, FREQ, -692.08) + for k, v in thermo2.items(): + i, j = round(v, 8), round(ref2[k], 8) + assert_eq(i, j) + + # Test with a different unit (au) + thermo3 = get_thermo(MOL, FREQ, unit='au') + for k, v in thermo3.items(): + i, j = round(v, 8), round(Units.convert(ref1[k], 'kcal/mol', 'au'), 8) + assert_eq(i, j) + + # Test with a different temperature + ref4 = {'E': 0.0, + 'U': 39.09512888121028, + 'H': 39.89000937834221, + 'S': 0.08340885418719352, + 'G': 6.526467703464801} + thermo4 = get_thermo(MOL, FREQ, T=400) 
+ for k, v in thermo4.items(): + i, j = round(v, 8), round(ref4[k], 8) + assert_eq(i, j) + + # Test when exporting a single quantity + ref5 = 14.678634916523373 + G = get_thermo(MOL, FREQ, export='G') + i, j = round(G, 8), round(ref5, 8) + assert_eq(i, j) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..f5c6ed70 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,95 @@ +"""Tests for :mod:`CAT.utils`.""" + +import os +from os.path import join + +from unittest import mock + +from scm.plams.interfaces.adfsuite.ams import AMSJob +from scm.plams.interfaces.adfsuite.adf import ADFJob +from scm.plams.interfaces.thirdparty.orca import ORCAJob +from scm.plams.interfaces.thirdparty.cp2k import Cp2kJob +from scm.plams.interfaces.thirdparty.dirac import DiracJob +from scm.plams.interfaces.thirdparty.gamess import GamessJob + +from CAT.assertion_functions import (assert_eq, assert_lt, assert_id, assert_exception) +from CAT.utils import ( + type_to_string, get_time, dict_concatenate, get_template, validate_path, check_sys_var +) + +PATH = 'tests/test_files' + + +def test_type_to_string() -> None: + """Test :func:`CAT.utils.type_to_string`.""" + ref = { + ADFJob: 'adf', + AMSJob: 'ams', + DiracJob: 'dirac', + Cp2kJob: 'cp2k', + GamessJob: 'gamess', + ORCAJob: 'orca' + } + + for k, j in ref.items(): + i = type_to_string(k) + assert_eq(i, j) + assert_eq(type_to_string('bob'), '') + + +def test_get_time() -> None: + """Test :func:`CAT.utils.get_time`.""" + time = get_time() + assert_eq(time[0], '[') + assert_eq(time[9:], '] ') + assert_eq(time[3], ':') + assert_eq(time[6], ':') + assert_lt(int(time[1:3]), 25) + assert_lt(int(time[4:6]), 61) + assert_lt(int(time[7:9]), 61) + + +def test_dict_concatenate() -> None: + """Test :func:`CAT.utils.dict_concatenate`.""" + ref = {'a': 1, 'b': 2, 'c': 3, 'd': 4} + dict_list = [{'a': 1}, {'b': 2}, {'c': 3}, {'d': 4}] + out = dict_concatenate(dict_list) + assert_eq(out, ref) + + +def test_get_template() 
-> None: + """Test :func:`CAT.utils.get_template`.""" + ref = {'CRSParameters': {'_1': 'HB_HNOF', '_2': 'HB_TEMP', '_3': 'FAST', '_4': 'COMBI2005', 'rav': 0.4, 'aprime': 1550.0, 'fcorr': 2.802, 'chb': 0.0, 'sigmahbond': 0.00978, 'aeff': 5.96, 'Lambda': 0.135, 'omega': -0.212, 'Eta': -9.65, 'chortf': 0.816}, 'Dispersion': {'H': -0.034, 'C': -0.0356, 'N': -0.0224, 'O': -0.0333, 'F': -0.026, 'Si': -0.04, 'P': -0.045, 'S': -0.052, 'Cl': -0.0485, 'Br': -0.055, 'I': -0.062}, 'Technical': {'rsconv': '1e-7', 'maxiter': 10000, 'bpconv': '1e-6', 'bpmaxiter': 40, 'solconv': '1e-5', 'solmaxiter': 40, 'solxilarge': 0.99, 'ehdeltaT': 1.0}} # noqa + out = get_template('crs.yaml')['MOPAC PM6']['input'].as_dict() + assert_eq(out, ref) + + +def test_validate_path() -> None: + """Test :func:`CAT.utils.validate_path`.""" + assert_eq(validate_path(None), os.getcwd()) + assert_eq(validate_path(''), os.getcwd()) + assert_eq(validate_path('.'), os.getcwd()) + assert_eq(validate_path(PATH), PATH) + assert_exception(FileNotFoundError, validate_path, join(PATH, 'bob')) + assert_exception(NotADirectoryError, validate_path, join(PATH, 'Methanol.xyz')) + + +def test_check_sys_var() -> None: + """Test :func:`CAT.utils.check_sys_var`.""" + @mock.patch.dict(os.environ, + {'ADFBIN': 'a', 'ADFHOME': '2019', 'ADFRESOURCES': 'b', 'SCMLICENSE': 'c'}) + def test1() -> None: + assert_id(check_sys_var(), None) + + @mock.patch.dict(os.environ, + {'ADFBIN': '', 'ADFHOME': '2019', 'ADFRESOURCES': '', 'SCMLICENSE': ''}) + def test2() -> None: + assert_exception(EnvironmentError, check_sys_var) + + @mock.patch.dict(os.environ, {'ADFHOME': '2018'}) + def test3() -> None: + assert_exception(OSError, check_sys_var) + + test1() + test2() + test3() diff --git a/tests/test_validate_input.py b/tests/test_validate_input.py new file mode 100644 index 00000000..dd313e80 --- /dev/null +++ b/tests/test_validate_input.py @@ -0,0 +1,50 @@ +"""Tests for :mod:`CAT.data_handling.validate_input`.""" + +from os.path import join 
+from shutil import rmtree + +import yaml + +from scm.plams import (Settings, AMSJob) + +from CAT.assertion_functions import assert_eq +from CAT.data_handling.validate_input import validate_input + +PATH = 'tests/test_files' + + +def test_validate_input() -> None: + """Test :func:`CAT.data_handling.validate_input.validate_input`.""" + with open(join(PATH, 'input1.yaml'), 'r') as f: + s = Settings(yaml.load(f, Loader=yaml.FullLoader)) + s.path = PATH + validate_input(s) + + ref = Settings() + ref.core.dirname = join(PATH, 'core') + ref.core.dummy = 17 + + ref.database.dirname = join(PATH, 'database') + ref.database.mol_format = ('pdb',) + ref.database.mongodb = {} + ref.database.overwrite = () + ref.database.read = ('core', 'ligand', 'qd') + ref.database.write = ('core', 'ligand', 'qd') + + ref.ligand['cosmo-rs'] = False + ref.ligand.dirname = join(PATH, 'ligand') + ref.ligand.functional_groups = None + ref.ligand.optimize = True + ref.ligand.split = True + + ref.qd.activation_strain = False + ref.qd.dirname = join(PATH, 'qd') + ref.qd.dissociate = False + ref.qd.optimize = {'job1': AMSJob, 's2': {'description': 'UFF with the default forcefield', 'input': {'uff': {'library': 'uff'}, 'ams': {'system': {'bondorders': {'_1': None}, 'charge': 0}}}}, 's1': {'description': 'UFF with the default forcefield', 'input': {'uff': {'library': 'uff'}, 'ams': {'system': {'bondorders': {'_1': None}, 'charge': 0}}}}, 'job2': AMSJob} # noqa + + try: + assert_eq(s.optional, ref) + finally: + rmtree(join(PATH, 'ligand')) + rmtree(join(PATH, 'qd')) + rmtree(join(PATH, 'database')) diff --git a/tests/test_validate_mol.py b/tests/test_validate_mol.py new file mode 100644 index 00000000..50f26c77 --- /dev/null +++ b/tests/test_validate_mol.py @@ -0,0 +1,111 @@ +"""Tests for :mod:`CAT.data_handling.validate_mol`.""" + +from os.path import join + +from scm.plams import (Settings, Molecule) +import scm.plams.interfaces.molecule.rdkit as molkit + +from CAT.assertion_functions import 
(assert_eq, assert_exception) +from CAT.data_handling.validate_mol import ( + validate_mol, santize_smiles, _parse_name_type, _parse_mol_type +) + +PATH = 'tests/test_files' +MOL_PATH = join(PATH, 'Methanol.xyz') +MOL = Molecule(MOL_PATH) +MOL.guess_bonds() + + +def test_santize_smiles() -> None: + """Test :func:`CAT.data_handling.validate_mol.santize_smiles`.""" + assert_eq(santize_smiles('CO'), 'CO') + assert_eq(santize_smiles('C[H]O'), 'C[H]O') + assert_eq(santize_smiles('C(C)O'), 'C[C]O') + assert_eq(santize_smiles('CC=CC'), 'CC=CC') + assert_eq(santize_smiles('C/C=C/C'), 'trans-CC=CC') + assert_eq(santize_smiles(r'C/C=C\C'), 'cis-CC=CC') + assert_eq(santize_smiles('C/C=C/C/C=C/C'), 'trans-trans-CC=CCC=CC') + assert_eq(santize_smiles(r'C/C=C\C/C=C/C'), 'cis-trans-CC=CCC=CC') + + +def test_parse_mol_type() -> None: + """Test :func:`CAT.data_handling.validate_mol._parse_mol_type`.""" + assert_eq(_parse_mol_type('input_cores'), True) + assert_eq(_parse_mol_type('input_ligands'), False) + assert_exception(ValueError, _parse_mol_type, 'bob') + assert_exception(AttributeError, _parse_mol_type, 1) + + +def test_parse_name_type() -> None: + """Test :func:`CAT.data_handling.validate_mol._parse_name_type`.""" + mol_dict = Settings({'mol': 'CCCO'}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': 'CCCO', 'type': 'smiles', 'name': 'CCCO'}) + + mol_dict = Settings({'mol': MOL}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': MOL, 'type': 'plams_mol', 'name': 'Methanol'}) + + mol = MOL.copy() + mol.properties = Settings() + mol_dict = Settings({'mol': mol}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': mol, 'type': 'plams_mol', 'name': 'CO'}) + + rdmol = molkit.to_rdmol(MOL) + mol_dict = Settings({'mol': rdmol}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': rdmol, 'type': 'rdmol', 'name': 'CO'}) + + mol_dict = Settings({'mol': PATH}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': PATH, 'type': 'folder', 'name': 
'test_files'}) + + mol_dict = Settings({'mol': MOL_PATH}) + _parse_name_type(mol_dict) + assert_eq(mol_dict, {'mol': MOL_PATH, 'type': 'xyz', 'name': 'Methanol'}) + + mol_dict = Settings({'mol': 1}) # Exception: Invalid type + assert_exception(TypeError, _parse_name_type, mol_dict) + + +def test_validate_mol() -> None: + """Test :func:`CAT.data_handling.validate_mol.validate_mol`.""" + args1 = ['Methanol.xyz', 'Ethylene.xyz'] + args2 = [ + Settings({'Acetate.xyz': {'guess_bonds': False}}), + Settings({'Methanol_rotate.xyz': {'guess_bonds': False}}) + ] + + ref1 = [ + {'path': PATH, + 'is_core': True, + 'mol': join(PATH, 'Methanol.xyz'), + 'type': 'xyz', + 'name': 'Methanol'}, + {'path': PATH, + 'is_core': True, + 'mol': join(PATH, 'Ethylene.xyz'), + 'type': 'xyz', + 'name': 'Ethylene'} + ] + + ref2 = [ + {'guess_bonds': False, + 'is_core': False, + 'path': PATH, + 'mol': join(PATH, 'Acetate.xyz'), + 'type': 'xyz', + 'name': 'Acetate'}, + {'guess_bonds': False, + 'is_core': False, + 'path': PATH, + 'mol': join(PATH, 'Methanol_rotate.xyz'), + 'type': 'xyz', + 'name': 'Methanol_rotate'} + ] + + validate_mol(args1, 'input_cores', PATH) + validate_mol(args2, 'input_ligands', PATH) + assert_eq(args1, ref1) + assert_eq(args2, ref2)