From 815565f9be21f8ac435cb25f18b357934d7b11f2 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Fri, 24 Jun 2022 00:29:46 +0000 Subject: [PATCH] [Distributed] `kill(::LocalManager, ...)` should actually call `kill()` When dealing with a local process, if we want to remove a process, we can try a little harder than simply calling `remote_do(exit, id)`. We can actually `kill()` the process by sending `SIGTERM`, then `SIGKILL`. Because we use `Distributed` to run our Base test workers, this can provide a more certain method of closing our workers at the end of test sets, as well as a better way of killing processes such that they dump core in the event that they do get stuck. --- stdlib/Distributed/src/managers.jl | 23 +++++++++++++++++++++ stdlib/Distributed/test/distributed_exec.jl | 20 ++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/stdlib/Distributed/src/managers.jl b/stdlib/Distributed/src/managers.jl index 1eb15aea2c9513..7b048807eddaee 100644 --- a/stdlib/Distributed/src/managers.jl +++ b/stdlib/Distributed/src/managers.jl @@ -725,3 +725,26 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig) cancel_ssh_tunnel(config) nothing end + +function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15) + # First, try sending `exit()` to the remote over the usual control channels + remote_do(exit, pid) + + timer_task = @async begin + sleep(exit_timeout) + + # Check to see if our child exited, and if not, send an actual kill signal + if !process_exited(config.process) + @warn("Failed to gracefully kill worker $(pid), sending SIGTERM") + kill(config.process, Base.SIGTERM) + + sleep(term_timeout) + if !process_exited(config.process) + @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL") + kill(config.process, Base.SIGKILL) + end + end + end + errormonitor(timer_task) + return nothing +end diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl index dee10220d99977..0be94b28e5da5a 100644 --- a/stdlib/Distributed/test/distributed_exec.jl +++ b/stdlib/Distributed/test/distributed_exec.jl @@ -1856,6 +1856,26 @@ end end include("splitrange.jl") +# Clear all workers for timeout tests (issue #45785) +rmprocs(workers()) +begin + # First, assert that we get no messages when we close a cooperative worker + w = only(addprocs(1)) + @test_nowarn begin + wait(rmprocs([w])) + end + + # Next, ensure we get a log message when a worker does not cleanly exit + w = only(addprocs(1)) + @test_logs (:warn, r"sending SIGTERM") begin + remote_do(w) do + # Cause the 'exit()' message that `rmprocs()` sends to do nothing + Core.eval(Base, :(exit() = nothing)) + end + wait(rmprocs([w])) + end +end + # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. rmprocs(workers())