Skip to content

Commit

Permalink
better tp exit log (InternLM#2677)
Browse files Browse the repository at this point in the history
  • Loading branch information
grimoire authored and AllentDan committed Nov 13, 2024
1 parent 1fd99df commit 7335876
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions lmdeploy/pytorch/engine/model_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,15 +553,22 @@ def _start_tp_process(proc_id: int,
def _check_context_alive(mp_context: mp.ProcessContext):
"""check context alive."""
procs: List[mp.Process] = mp_context.processes
failed_ranks = list(idx for idx, p in enumerate(procs) if not p.is_alive())
if len(failed_ranks) == 0:
failed_procs = list(idx for idx, p in enumerate(procs) if not p.is_alive())
if len(failed_procs) == 0:
return
for p in procs:

log_procs = []
for idx, p in enumerate(procs):
if p.is_alive():
p.terminate()
else:
exitcode = p.exitcode
if exitcode > 0:
# terminated exitcode < 0
log_procs.append((idx, exitcode))
p.close()
logger.error(f'TP process {failed_ranks} failed.')
for idx, exitcode in log_procs:
logger.error(f'TP process {idx} failed with exitcode {exitcode}.')
# TODO: not safe exit.
os._exit(1)

Expand Down

0 comments on commit 7335876

Please sign in to comment.