From 7927c67b1a167dbe6e9b8d845d485672e2121776 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jan 2024 17:55:38 +0100 Subject: [PATCH] fix(test suite): some tests leak child processes Epic: #6485 Before this PR, some tests would leak child processes. We found them using the approach in https://github.com/neondatabase/neon/pull/6470. This PR fixes the findings because PR#6470 is being delayed due to security concerns. --- control_plane/src/endpoint.rs | 19 +++++++++++++++++-- test_runner/regress/test_import.py | 2 ++ test_runner/regress/test_neon_local_cli.py | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index d3b0366d313e..dcad22b9925f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -438,7 +438,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -583,9 +583,21 @@ impl Endpoint { } let child = cmd.spawn()?; + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let child = scopeguard::guard(child, |mut child| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned + child.kill().context("SIGKILL child")?; + child.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {child:?}")) + .unwrap(); + }); // Write down the pid so we can wait for it when we want to stop - // TODO use background_process::start_process instead + // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; @@ -634,6 +646,9 @@ impl Endpoint { std::thread::sleep(ATTEMPT_INTERVAL); } + // disarm the scopeguard, let the child outlive this function (and neon_local invoction) + drop(scopeguard::ScopeGuard::into_inner(child)); + Ok(()) } diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d9440b..3519cbbaabe8 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,6 +163,8 @@ def import_tar(base, wal): endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca500..8edba49b8a6f 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2")