From fd9560561d608b0c63f89a8a6c347562494b02ca Mon Sep 17 00:00:00 2001 From: Thomas Vuillaume Date: Mon, 9 Sep 2024 15:22:49 +0200 Subject: [PATCH] modify rerun_cmd to make sure the job crashes if the cmd fails and raises and Error (#485) * modify rerun_cmd to make sure the job crashes if the cmd fails and raises and Error * fix test rerun_cmd --- lstmcpipe/tests/test_utils.py | 30 ++++++++++------ lstmcpipe/utils.py | 67 +++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/lstmcpipe/tests/test_utils.py b/lstmcpipe/tests/test_utils.py index 9e804ea3..3b593af8 100644 --- a/lstmcpipe/tests/test_utils.py +++ b/lstmcpipe/tests/test_utils.py @@ -31,22 +31,30 @@ def test_save_log_to_file(tmp_path): def test_rerun_cmd(): - with tempfile.TemporaryDirectory() as tmp_dir: file, filename = tempfile.mkstemp(dir=tmp_dir) cmd = f'echo "1" >> {filename}; rm nonexistingfile' + # first test: the cmd fails 3 times but the outfile stays in place - subdir_failures = '' - rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True) - filename = Path(filename) - filename = Path(tmp_dir).joinpath(subdir_failures, filename.name) - assert open(filename).read() == "1\n1\n1\n" + subdir_failures = "" + try: + n_tries = rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True) + filename = Path(filename) + filename = Path(tmp_dir).joinpath(subdir_failures, filename.name) + assert open(filename).read() == "1\n1\n1\n" + assert n_tries == 3 + except Exception as e: + assert isinstance(e, RuntimeError) + # 2nd test: the cmd fails and the outfile is moved in subdir - subdir_failures = 'fail' - rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True) - filename = filename.parent.joinpath(subdir_failures).joinpath(filename.name) - assert open(filename).read() == "1\n" - assert filename.exists() + subdir_failures = "fail" + try: + rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True) + filename = filename.parent.joinpath(subdir_failures).joinpath(filename.name) + assert open(filename).read() == "1\n" + assert filename.exists() + except Exception as e: + assert isinstance(e, RuntimeError) def test_rerun_cmd_lstchain_mc_r0_to_dl1(mc_gamma_testfile): diff --git a/lstmcpipe/utils.py b/lstmcpipe/utils.py index 59fd59a0..1db5873b 100644 --- a/lstmcpipe/utils.py +++ b/lstmcpipe/utils.py @@ -109,40 +109,55 @@ def batch_mc_production_check( return jobid -def rerun_cmd(cmd, outfile, max_ntry=2, subdir_failures='failed_outputs', **run_kwargs): +def rerun_cmd(cmd, outfile, max_ntry=2, subdir_failures="failed_outputs", **run_kwargs): """ - rerun r0_to_dl1 process given by `cmd` as long as the exit code is 0 and number of try < max_ntry - move the failed output file to subdir failed_outputs + Rerun the command up to max_ntry times. + If all attempts fail, raise an exception. Parameters ---------- - cmd: str - subdir_failures: str + cmd: list + Command to run as a list of strings outfile: Path - path to the cmd output file + Path to the cmd output file max_ntry: int - run_kwargs: kwargs for subprocess.run - - Returns - ------- - ntry: int - number of tries actually run + Maximum number of attempts to run the command + subdir_failures: str + Subdirectory to move failed output files to + run_kwargs: kwargs + Additional keyword arguments for subprocess.run + + Raises + ------ + RuntimeError + If the command fails after all retry attempts """ outfile = Path(outfile) - ret = -1 - ntry = 1 - while ret != 0 and ntry <= max_ntry: - ret = sp.run(cmd, **run_kwargs).returncode - if ret != 0: - failed_jobs_subdir = outfile.parent.joinpath(subdir_failures) - if outfile.exists(): - failed_jobs_subdir.mkdir(exist_ok=True) - outfile_target = failed_jobs_subdir.joinpath(outfile.name) - print(f"Move failed output file from {outfile} to {outfile_target}. try #{ntry}") - shutil.move(outfile, outfile_target) - - ntry += 1 - return ntry - 1 + for ntry in range(1, max_ntry + 1): + result = sp.run(cmd, **run_kwargs, capture_output=True, text=True, check=False) + + if result.returncode == 0: + return ntry # Success, return the number of tries it took + + # Command failed, handle the error + failed_jobs_subdir = outfile.parent.joinpath(subdir_failures) + if outfile.exists(): + failed_jobs_subdir.mkdir(exist_ok=True) + outfile_target = failed_jobs_subdir.joinpath(outfile.name) + print(f"Move failed output file from {outfile} to {outfile_target}. try #{ntry}") + shutil.move(outfile, outfile_target) + + # If this was the last try, raise an exception + if ntry == max_ntry: + error_message = f"Command failed after {max_ntry} attempts. Last failure details:\n" + error_message += f"Command: {' '.join(cmd)}\n" + error_message += f"Return code: {result.returncode}\n" + error_message += f"STDOUT: {result.stdout}\n" + error_message += f"STDERR: {result.stderr}\n" + raise RuntimeError(error_message) + + # This line should never be reached due to the raise in the loop + raise RuntimeError("Unexpected error in rerun_cmd") def dump_lstchain_std_config(filename='lstchain_config.json', allsky=True, overwrite=False):