Skip to content

Commit

Permalink
modify rerun_cmd to make sure the job crashes if the cmd fails and ra…
Browse files Browse the repository at this point in the history
…ises and Error (#485)

* modify rerun_cmd to make sure the job crashes if the cmd fails and raises and Error

* fix test rerun_cmd
  • Loading branch information
vuillaut committed Sep 9, 2024
1 parent 26aae10 commit fd95605
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 37 deletions.
30 changes: 19 additions & 11 deletions lstmcpipe/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,30 @@ def test_save_log_to_file(tmp_path):


def test_rerun_cmd():

with tempfile.TemporaryDirectory() as tmp_dir:
file, filename = tempfile.mkstemp(dir=tmp_dir)
cmd = f'echo "1" >> {filename}; rm nonexistingfile'

# first test: the cmd fails 3 times but the outfile stays in place
subdir_failures = ''
rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True)
filename = Path(filename)
filename = Path(tmp_dir).joinpath(subdir_failures, filename.name)
assert open(filename).read() == "1\n1\n1\n"
subdir_failures = ""
try:
n_tries = rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True)
filename = Path(filename)
filename = Path(tmp_dir).joinpath(subdir_failures, filename.name)
assert open(filename).read() == "1\n1\n1\n"
assert n_tries == 3
except Exception as e:
assert isinstance(e, RuntimeError)

# 2nd test: the cmd fails and the outfile is moved in subdir
subdir_failures = 'fail'
rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True)
filename = filename.parent.joinpath(subdir_failures).joinpath(filename.name)
assert open(filename).read() == "1\n"
assert filename.exists()
subdir_failures = "fail"
try:
rerun_cmd(cmd, filename, max_ntry=3, subdir_failures=subdir_failures, shell=True)
filename = filename.parent.joinpath(subdir_failures).joinpath(filename.name)
assert open(filename).read() == "1\n"
assert filename.exists()
except Exception as e:
assert isinstance(e, RuntimeError)


def test_rerun_cmd_lstchain_mc_r0_to_dl1(mc_gamma_testfile):
Expand Down
67 changes: 41 additions & 26 deletions lstmcpipe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,40 +109,55 @@ def batch_mc_production_check(
return jobid


def rerun_cmd(cmd, outfile, max_ntry=2, subdir_failures='failed_outputs', **run_kwargs):
def rerun_cmd(cmd, outfile, max_ntry=2, subdir_failures="failed_outputs", **run_kwargs):
"""
rerun r0_to_dl1 process given by `cmd` as long as the exit code is 0 and number of try < max_ntry
move the failed output file to subdir failed_outputs
Rerun the command up to max_ntry times.
If all attempts fail, raise an exception.
Parameters
----------
cmd: str
subdir_failures: str
cmd: list
Command to run as a list of strings
outfile: Path
path to the cmd output file
Path to the cmd output file
max_ntry: int
run_kwargs: kwargs for subprocess.run
Returns
-------
ntry: int
number of tries actually run
Maximum number of attempts to run the command
subdir_failures: str
Subdirectory to move failed output files to
run_kwargs: kwargs
Additional keyword arguments for subprocess.run
Raises
------
RuntimeError
If the command fails after all retry attempts
"""
outfile = Path(outfile)
ret = -1
ntry = 1
while ret != 0 and ntry <= max_ntry:
ret = sp.run(cmd, **run_kwargs).returncode
if ret != 0:
failed_jobs_subdir = outfile.parent.joinpath(subdir_failures)
if outfile.exists():
failed_jobs_subdir.mkdir(exist_ok=True)
outfile_target = failed_jobs_subdir.joinpath(outfile.name)
print(f"Move failed output file from {outfile} to {outfile_target}. try #{ntry}")
shutil.move(outfile, outfile_target)

ntry += 1
return ntry - 1
for ntry in range(1, max_ntry + 1):
result = sp.run(cmd, **run_kwargs, capture_output=True, text=True, check=False)

if result.returncode == 0:
return ntry # Success, return the number of tries it took

# Command failed, handle the error
failed_jobs_subdir = outfile.parent.joinpath(subdir_failures)
if outfile.exists():
failed_jobs_subdir.mkdir(exist_ok=True)
outfile_target = failed_jobs_subdir.joinpath(outfile.name)
print(f"Move failed output file from {outfile} to {outfile_target}. try #{ntry}")
shutil.move(outfile, outfile_target)

# If this was the last try, raise an exception
if ntry == max_ntry:
error_message = f"Command failed after {max_ntry} attempts. Last failure details:\n"
error_message += f"Command: {' '.join(cmd)}\n"
error_message += f"Return code: {result.returncode}\n"
error_message += f"STDOUT: {result.stdout}\n"
error_message += f"STDERR: {result.stderr}\n"
raise RuntimeError(error_message)

# This line should never be reached due to the raise in the loop
raise RuntimeError("Unexpected error in rerun_cmd")


def dump_lstchain_std_config(filename='lstchain_config.json', allsky=True, overwrite=False):
Expand Down

0 comments on commit fd95605

Please sign in to comment.