diff --git a/src/ert/scheduler/driver.py b/src/ert/scheduler/driver.py index 4d2f8f5b4e1..a5a9f124d14 100644 --- a/src/ert/scheduler/driver.py +++ b/src/ert/scheduler/driver.py @@ -90,12 +90,16 @@ async def _execute_with_retry( error_message: Optional[str] = None for _ in range(total_attempts): - process = await asyncio.create_subprocess_exec( - *cmd_with_args, - stdin=asyncio.subprocess.PIPE if stdin else None, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) + try: + process = await asyncio.create_subprocess_exec( + *cmd_with_args, + stdin=asyncio.subprocess.PIPE if stdin else None, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except FileNotFoundError as e: + return (False, str(e)) + stdout, stderr = await process.communicate(stdin) assert process.returncode is not None diff --git a/src/ert/scheduler/lsf_driver.py b/src/ert/scheduler/lsf_driver.py index ca0c0cb344f..98ddd4ea17f 100644 --- a/src/ert/scheduler/lsf_driver.py +++ b/src/ert/scheduler/lsf_driver.py @@ -409,15 +409,20 @@ async def poll(self) -> None: await asyncio.sleep(self._poll_period) continue current_jobids = list(self._jobs.keys()) - process = await asyncio.create_subprocess_exec( - str(self._bjobs_cmd), - "-noheader", - "-o", - "jobid stat delimiter='^'", - *current_jobids, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) + + try: + process = await asyncio.create_subprocess_exec( + str(self._bjobs_cmd), + "-noheader", + "-o", + "jobid stat delimiter='^'", + *current_jobids, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except FileNotFoundError as e: + logger.error(str(e)) + return stdout, stderr = await process.communicate() if process.returncode: diff --git a/src/ert/scheduler/openpbs_driver.py b/src/ert/scheduler/openpbs_driver.py index 8aa1ee7daf0..84785ba57ec 100644 --- a/src/ert/scheduler/openpbs_driver.py +++ b/src/ert/scheduler/openpbs_driver.py @@ -312,14 +312,18 @@ async def poll(self) -> None: continue if self._non_finished_job_ids: - process = await asyncio.create_subprocess_exec( - str(self._qstat_cmd), - "-Ex", - "-w", # wide format - *self._non_finished_job_ids, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) + try: + process = await asyncio.create_subprocess_exec( + str(self._qstat_cmd), + "-Ex", + "-w", # wide format + *self._non_finished_job_ids, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except FileNotFoundError as e: + logger.error(str(e)) + return stdout, stderr = await process.communicate() if process.returncode not in {0, QSTAT_UNKNOWN_JOB_ID}: # Any unknown job ids will yield QSTAT_UNKNOWN_JOB_ID, but diff --git a/src/ert/scheduler/slurm_driver.py b/src/ert/scheduler/slurm_driver.py index bb9d9fae31a..be22679d6e6 100644 --- a/src/ert/scheduler/slurm_driver.py +++ b/src/ert/scheduler/slurm_driver.py @@ -256,14 +256,16 @@ async def poll(self) -> None: arguments = ["-h", "--format=%i %T"] if self._user: arguments.append(f"--user={self._user}") - - process = await asyncio.create_subprocess_exec( - str(self._squeue), - *arguments, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - + try: + process = await asyncio.create_subprocess_exec( + str(self._squeue), + *arguments, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except FileNotFoundError as e: + logger.error(str(e)) + return stdout, stderr = await process.communicate() if process.returncode: logger.warning( diff --git a/tests/integration_tests/scheduler/test_generic_driver.py b/tests/integration_tests/scheduler/test_generic_driver.py index e25e1ad0673..b22526846e0 100644 --- a/tests/integration_tests/scheduler/test_generic_driver.py +++ b/tests/integration_tests/scheduler/test_generic_driver.py @@ -1,4 +1,5 @@ import asyncio +import logging import os import signal import sys @@ -212,3 +213,37 @@ async def test_num_cpu_sets_env_variables(driver: Driver, tmp_path, job_name): elif isinstance(driver, OpenPBSDriver): assert "OMP_NUM_THREADS=2" in env_lines assert "NCPUS=2" in env_lines + + +@pytest.mark.integration_test +async def test_execute_with_retry_exits_on_filenotfounderror(driver: Driver, caplog): + caplog.set_level(logging.DEBUG) + invalid_cmd = ["/usr/bin/foo", "bar"] + (succeeded, message) = await driver._execute_with_retry( + invalid_cmd, total_attempts=3 + ) + + # We log a retry message every time we retry + assert "retry" not in str(caplog.text) + assert not succeeded + assert "No such file or directory" in message + assert "/usr/bin/foo" in message + + +@pytest.mark.integration_test +async def test_poll_exits_on_filenotfounderror(driver: Driver, caplog): + if isinstance(driver, LocalDriver): + pytest.skip("LocalDriver does not poll") + caplog.set_level(logging.DEBUG) + invalid_cmd = ["/usr/bin/foo", "bar"] + driver._bjobs_cmd = invalid_cmd + driver._qstat_cmd = invalid_cmd + driver._squeue = invalid_cmd + driver._jobs = {"foo": "bar"} + driver._non_finished_job_ids = ["foo"] + await driver.poll() + + # We log a retry message every time we retry + assert "retry" not in str(caplog.text) + assert "No such file or directory" in str(caplog.text) + assert "/usr/bin/foo" in str(caplog.text)