Skip to content

Commit

Permalink
[Distributed] kill(::LocalManager, ...) should actually call kill()
Browse files Browse the repository at this point in the history
When dealing with a local process, if we want to remove a process, we
can try a little harder than simply calling `remote_do(exit, id)`.  We
can actually `kill()` the process by sending `SIGTERM`, then `SIGKILL`.
Because we use `Distributed` to run our Base test workers, this can
provide a more certain method of closing our workers at the end of test
sets, as well as a better way of killing processes such that they dump
core in the event that they do get stuck.
  • Loading branch information
staticfloat authored and pcjentsch committed Aug 18, 2022
1 parent 95bc5dc commit 815565f
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
23 changes: 23 additions & 0 deletions stdlib/Distributed/src/managers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -725,3 +725,26 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
cancel_ssh_tunnel(config)
nothing
end

function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
# First, try sending `exit()` to the remote over the usual control channels
remote_do(exit, pid)

timer_task = @async begin
sleep(exit_timeout)

# Check to see if our child exited, and if not, send an actual kill signal
if !process_exited(config.process)
@warn("Failed to gracefully kill worker $(pid), sending SIGTERM")
kill(config.process, Base.SIGTERM)

sleep(term_timeout)
if !process_exited(config.process)
@warn("Worker $(pid) ignored SIGTERM, sending SIGKILL")
kill(config.process, Base.SIGKILL)
end
end
end
errormonitor(timer_task)
return nothing
end
20 changes: 20 additions & 0 deletions stdlib/Distributed/test/distributed_exec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1856,6 +1856,26 @@ end end

include("splitrange.jl")

# Clear all workers for timeout tests (issue #45785)
rmprocs(workers())
begin
# First, assert that we get no messages when we close a cooperative worker
w = only(addprocs(1))
@test_nowarn begin
wait(rmprocs([w]))
end

# Next, ensure we get a log message when a worker does not cleanly exit
w = only(addprocs(1))
@test_logs (:warn, r"sending SIGTERM") begin
remote_do(w) do
# Cause the 'exit()' message that `rmprocs()` sends to do nothing
Core.eval(Base, :(exit() = nothing))
end
wait(rmprocs([w]))
end
end

# Run topology tests last after removing all workers, since a given
# cluster at any time only supports a single topology.
rmprocs(workers())
Expand Down

0 comments on commit 815565f

Please sign in to comment.