Skip to content

Commit

Permalink
Fix flaky pageserver restarts in tests (#2261)
Browse files Browse the repository at this point in the history
  • Loading branch information
bojanserafimov committed Aug 17, 2022
1 parent 3414fea commit e9a3499
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 60 deletions.
45 changes: 15 additions & 30 deletions control_plane/src/safekeeper.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
Expand Down Expand Up @@ -241,37 +240,23 @@ impl SafekeeperNode {
),
}

let address = connection_address(&self.pg_connection_config);

// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
// Wait until process is gone
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nSafekeeper connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
match self.check_status() {
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nSafekeeper status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};

if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
Expand Down
44 changes: 14 additions & 30 deletions control_plane/src/storage.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Write};
use std::net::TcpStream;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
Expand Down Expand Up @@ -312,38 +311,23 @@ impl PageServerNode {
),
}

let address = connection_address(&self.pg_connection_config);

// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
// Wait until process is gone
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nPageserver connection failed with error: {err}");
}
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
}
if tcp_stopped {
// Also check status on the HTTP port
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};

match self.check_status() {
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nPageserver status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
Expand Down
17 changes: 17 additions & 0 deletions test_runner/batch_others/test_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Tests for the code in test fixtures"""

from fixtures.neon_fixtures import NeonEnvBuilder


# Regression test for https://github.com/neondatabase/neon/issues/2247:
# the pageserver and a safekeeper must be able to restart quickly.
def test_fixture_restart(neon_env_builder: NeonEnvBuilder):
    """Stop and start the pageserver, then the first safekeeper, three times each.

    The test has no explicit assertions; it passes when every stop/start
    call returns without raising.
    """
    env = neon_env_builder.init_start()

    restart_rounds = 3

    # Pageserver first: each round is a full stop followed by a start.
    for _ in range(restart_rounds):
        env.pageserver.stop()
        env.pageserver.start()

    # Then the same cycle for the first safekeeper.
    for _ in range(restart_rounds):
        env.safekeepers[0].stop()
        env.safekeepers[0].start()

0 comments on commit e9a3499

Please sign in to comment.