[Distributed] kill(::LocalManager, ...) should actually call kill()

When dealing with a local process, if we want to remove a process, we can try a little harder than simply calling `remote_do(exit, id)`. We can actually `kill()` the process by sending `SIGTERM`, then `SIGKILL`. Because we use `Distributed` to run our Base test workers, this can provide a more certain method of closing our workers at the end of test sets, as well as a better way of killing processes such that they dump core in the event that they do get stuck.
pcjentsch · Aug 18, 2022 · 815565f · 815565f
1 parent 95bc5dc
commit 815565f
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 0 deletions.
diff --git a/stdlib/Distributed/src/managers.jl b/stdlib/Distributed/src/managers.jl
@@ -725,3 +725,26 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
     cancel_ssh_tunnel(config)
     nothing
 end
+
+function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
+    # First, try sending `exit()` to the remote over the usual control channels
+    remote_do(exit, pid)
+
+    timer_task = @async begin
+        sleep(exit_timeout)
+
+        # Check to see if our child exited, and if not, send an actual kill signal
+        if !process_exited(config.process)
+            @warn("Failed to gracefully kill worker $(pid), sending SIGTERM")
+            kill(config.process, Base.SIGTERM)
+
+            sleep(term_timeout)
+            if !process_exited(config.process)
+                @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL")
+                kill(config.process, Base.SIGKILL)
+            end
+        end
+    end
+    errormonitor(timer_task)
+    return nothing
+end
diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl
@@ -1856,6 +1856,26 @@ end end
 
 include("splitrange.jl")
 
+# Clear all workers for timeout tests (issue #45785)
+rmprocs(workers())
+begin
+    # First, assert that we get no messages when we close a cooperative worker
+    w = only(addprocs(1))
+    @test_nowarn begin
+        wait(rmprocs([w]))
+    end
+
+    # Next, ensure we get a log message when a worker does not cleanly exit
+    w = only(addprocs(1))
+    @test_logs (:warn, r"sending SIGTERM") begin
+        remote_do(w) do
+            # Cause the 'exit()' message that `rmprocs()` sends to do nothing
+            Core.eval(Base, :(exit() = nothing))
+        end
+        wait(rmprocs([w]))
+    end
+end
+
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
 rmprocs(workers())