Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix flaky pageserver restarts in tests #2261

Merged
merged 19 commits into from
Aug 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 15 additions & 30 deletions control_plane/src/safekeeper.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
Expand Down Expand Up @@ -241,37 +240,23 @@ impl SafekeeperNode {
),
}

let address = connection_address(&self.pg_connection_config);

// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
// Wait until process is gone
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nSafekeeper connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
match self.check_status() {
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nSafekeeper status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};

if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
Expand Down
44 changes: 14 additions & 30 deletions control_plane/src/storage.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Write};
use std::net::TcpStream;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
Expand Down Expand Up @@ -312,38 +311,23 @@ impl PageServerNode {
),
}

let address = connection_address(&self.pg_connection_config);

// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
// Wait until process is gone
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nPageserver connection failed with error: {err}");
}
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
}
if tcp_stopped {
// Also check status on the HTTP port
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};

match self.check_status() {
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nPageserver status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
Expand Down
17 changes: 17 additions & 0 deletions test_runner/batch_others/test_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Tests for the code in test fixtures"""

from fixtures.neon_fixtures import NeonEnvBuilder


# Test that pageserver and safekeeper can restart quickly.
# This is a regression test, see https://github.com/neondatabase/neon/issues/2247
def test_fixture_restart(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()

for i in range(3):
env.pageserver.stop()
env.pageserver.start()

for i in range(3):
env.safekeepers[0].stop()
env.safekeepers[0].start()