diff --git a/autoqchem/helper_classes.py b/autoqchem/helper_classes.py
index 0573963..c8b9613 100644
--- a/autoqchem/helper_classes.py
+++ b/autoqchem/helper_classes.py
@@ -36,6 +36,7 @@ class slurm_status(enum.IntEnum):
     failed = 4 #: job failed
     incomplete = 5 #: job is incomplete, it should be resubmitted
     uploaded = 6 #: job has been uploaded to the DB succesfully
+    inspect = 7 #: job needs to be inspected due to problematic labeling
 
 
 @enum.unique
diff --git a/autoqchem/rdkit_utils.py b/autoqchem/rdkit_utils.py
index 3b00980..479af8a 100644
--- a/autoqchem/rdkit_utils.py
+++ b/autoqchem/rdkit_utils.py
@@ -185,6 +185,7 @@ def rdmol_from_slurm_jobs(jobs, postDFT=True) -> Chem.Mol:
     elements, connectivity_matrix, charges = jobs[0].elements, jobs[0].connectivity_matrix, jobs[0].charges
     conformer_coordinates = []
     energies = []
+    labels_ok = True
 
     for j in jobs:
         if postDFT:
@@ -193,7 +194,8 @@ def rdmol_from_slurm_jobs(jobs, postDFT=True) -> Chem.Mol:
             le.get_atom_labels()
 
             # verify that the labels are in the same order in gaussian after running it
-            assert tuple(le.labels) == tuple(elements)
+            if tuple(le.labels) != tuple(elements):
+                labels_ok = False
 
             le.get_geometry()
             conformer_coordinates.append(le.geom[list('XYZ')].values)
@@ -216,7 +218,7 @@ def rdmol_from_slurm_jobs(jobs, postDFT=True) -> Chem.Mol:
         energies = [AllChem.MMFFGetMoleculeForceField(rdmol, props, confId=i).CalcEnergy() for i in
                     range(rdmol.GetNumConformers())]
 
-    return rdmol, energies
+    return rdmol, energies, labels_ok
 
 
 def rdmol_from_sge_jobs(jobs, postDFT=True) -> Chem.Mol:
diff --git a/autoqchem/slurm_manager.py b/autoqchem/slurm_manager.py
index f67c0f0..1013ecb 100644
--- a/autoqchem/slurm_manager.py
+++ b/autoqchem/slurm_manager.py
@@ -234,33 +234,33 @@ def _retrieve_single_job(self, job) -> slurm_status:
             # initialize the log extractor, it will try to read basic info from the file
             le = gaussian_log_extractor(log_file.local)
 
-            if len(job.tasks) == le.n_tasks:
-                job.status = slurm_status.done
-            else:
-                try: # look for more specific exception
-                    le.check_for_exceptions()
+
+            try: # look for more specific exception
+                le.check_for_exceptions()
 
-                except NoGeometryException:
-                    job.status = slurm_status.failed
-                    logger.warning(
-                        f"Job {job.base_name} failed - the log file does not contain geometry. Cannot resubmit.")
+            except NoGeometryException:
+                job.status = slurm_status.failed
+                logger.warning(
+                    f"Job {job.base_name} failed - the log file does not contain geometry. Cannot resubmit.")
 
-                except NegativeFrequencyException:
-                    job.status = slurm_status.incomplete
-                    logger.warning(
-                        f"Job {job.base_name} incomplete - log file contains negative frequencies. Resubmit job.")
+            except NegativeFrequencyException:
+                job.status = slurm_status.incomplete
+                logger.warning(
+                    f"Job {job.base_name} incomplete - log file contains negative frequencies. Resubmit job.")
 
-                except OptimizationIncompleteException:
-                    job.status = slurm_status.incomplete
-                    logger.warning(f"Job {job.base_name} incomplete - geometry optimization did not complete.")
+            except OptimizationIncompleteException:
+                job.status = slurm_status.incomplete
+                logger.warning(f"Job {job.base_name} incomplete - geometry optimization did not complete.")
 
-                except Exception as e:
-                    job.status = slurm_status.failed
-                    logger.warning(f"Job {job.base_name} failed with unhandled exception: {e}")
+            except Exception as e:
+                job.status = slurm_status.failed
+                logger.warning(f"Job {job.base_name} failed with unhandled exception: {e}")
 
-                else: # no exceptions were thrown, but still the job is incomplete
-                    job.status = slurm_status.incomplete
-                    logger.warning(f"Job {job.base_name} incomplete.")
+            if len(job.tasks) == le.n_tasks:
+                job.status = slurm_status.done
+            else: # no exceptions were thrown, but still the job is incomplete
+                job.status = slurm_status.incomplete
+                logger.warning(f"Job {job.base_name} incomplete.")
 
         except FileNotFoundError:
             job.status = slurm_status.failed
@@ -369,20 +369,25 @@ def upload_done_molecules_to_db(self, tags, RMSD_threshold=0.35) -> None:
 
         for done_can in done_cans:
             (keys, jobs) = zip(*self.get_jobs(can=done_can).items())
-            rdmol, energies = rdmol_from_slurm_jobs(jobs, postDFT=True)
-            keep = prune_rmsds(rdmol, RMSD_threshold)
-            logger.info(f"Molecule {done_can} has {len(keys) - len(keep)} / {len(keys)} duplicate conformers.")
-
-            # remove duplicate jobs
-            can_keys_to_remove = [key for i, key in enumerate(keys) if i not in keep]
-            to_remove_jobs = {name: job for name, job in self.jobs.items() if name in can_keys_to_remove}
-            logger.info(
-                f"Removing {len(keys) - len(keep)} / {len(keys)} jobs and log files that contain duplicate conformers.")
-            self.remove_jobs(to_remove_jobs)
-
-            # upload non-duplicate jobs
-            can_keys_to_keep = [key for i, key in enumerate(keys) if i in keep]
-            self._upload_can_to_db(can_keys_to_keep, tags)
+            rdmol, energies, labels_ok = rdmol_from_slurm_jobs(jobs, postDFT=True)
+            if labels_ok:
+                keep = prune_rmsds(rdmol, RMSD_threshold)
+                logger.info(f"Molecule {done_can} has {len(keys) - len(keep)} / {len(keys)} duplicate conformers.")
+
+                # remove duplicate jobs
+                can_keys_to_remove = [key for i, key in enumerate(keys) if i not in keep]
+                to_remove_jobs = {name: job for name, job in self.jobs.items() if name in can_keys_to_remove}
+                logger.info(
+                    f"Removing {len(keys) - len(keep)} / {len(keys)} jobs and log files that contain duplicate conformers.")
+                self.remove_jobs(to_remove_jobs)
+
+                # upload non-duplicate jobs
+                can_keys_to_keep = [key for i, key in enumerate(keys) if i in keep]
+                self._upload_can_to_db(can_keys_to_keep, tags)
+            else:
+                for key in keys:
+                    self.jobs[key].status = slurm_status.inspect
+                self._cache()
 
     def _upload_can_to_db(self, keys, tags) -> None:
         """Uploading single molecule conformers to database.