From e9a3499e87c2f87632849b501282387e2a9dcbeb Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 17 Aug 2022 08:17:35 -0400 Subject: [PATCH] Fix flaky pageserver restarts in tests (#2261) --- control_plane/src/safekeeper.rs | 45 +++++++++----------------- control_plane/src/storage.rs | 44 ++++++++----------------- test_runner/batch_others/test_setup.py | 17 ++++++++++ 3 files changed, 46 insertions(+), 60 deletions(-) create mode 100644 test_runner/batch_others/test_setup.py diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 0cae479d7147..3fda856e135b 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,5 +1,4 @@ use std::io::Write; -use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; use std::sync::Arc; @@ -241,37 +240,23 @@ impl SafekeeperNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if safekeeper flushes a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nSafekeeper connection failed with error: {err}"); - } - } - } - if tcp_stopped { - // Also check status on the HTTP port - match self.check_status() { - Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nSafekeeper status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; + if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2742e84bbda..31858278d3f3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; -use std::net::TcpStream; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; @@ -312,38 +311,23 @@ impl PageServerNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if pageserver checkpoints a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nPageserver connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; - match self.check_status() { - Err(PageserverHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nPageserver status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/test_runner/batch_others/test_setup.py b/test_runner/batch_others/test_setup.py new file mode 100644 index 000000000000..3d1471621b22 --- /dev/null +++ b/test_runner/batch_others/test_setup.py @@ -0,0 +1,17 @@ +"""Tests for the code in test fixtures""" + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that pageserver and safekeeper can restart quickly. +# This is a regression test, see https://github.com/neondatabase/neon/issues/2247 +def test_fixture_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + for i in range(3): + env.pageserver.stop() + env.pageserver.start() + + for i in range(3): + env.safekeepers[0].stop() + env.safekeepers[0].start()