Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow longer timeout for starting pageserver, safe keeper and storage controller in test cases to make test cases less flaky #8079

Merged
merged 10 commits into from
Jun 21, 2024
33 changes: 20 additions & 13 deletions control_plane/src/background_process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;
const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
const RETRY_INTERVAL: Duration = Duration::from_millis(100);
const DOT_EVERY_RETRIES: u128 = 10;
const NOTICE_AFTER_RETRIES: u128 = 50;

/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
Expand All @@ -52,13 +52,15 @@ pub enum InitialPidFile {
}

/// Start a background child process using the parameters given.
#[allow(clippy::too_many_arguments)]
pub async fn start_process<F, Fut, AI, A, EI>(
process_name: &str,
datadir: &Path,
command: &Path,
args: AI,
envs: EI,
initial_pid_file: InitialPidFile,
retry_timeout: &Duration,
process_status_check: F,
) -> anyhow::Result<()>
where
Expand All @@ -69,6 +71,7 @@ where
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
EI: IntoIterator<Item = (String, String)>,
{
let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
if !datadir.metadata().context("stat datadir")?.is_dir() {
anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
}
Expand Down Expand Up @@ -130,7 +133,7 @@ where
.unwrap();
});

for retries in 0..RETRIES {
for retries in 0..retries {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started and passed status check, pid: {pid}");
Expand All @@ -148,7 +151,7 @@ where
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
thread::sleep(RETRY_INTERVAL);
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");
Expand All @@ -157,9 +160,10 @@ where
}
}
println!();
anyhow::bail!(
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
);
anyhow::bail!(format!(
"{} did not start+pass status checks within {:?} seconds",
process_name, retry_timeout
));
}

/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
Expand Down Expand Up @@ -215,7 +219,7 @@ pub fn stop_process(
}

pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
for retries in 0..RETRIES {
for retries in 0..STOP_RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
Expand All @@ -231,7 +235,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
thread::sleep(RETRY_INTERVAL);
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
Expand All @@ -240,7 +244,10 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
}
}
println!();
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
anyhow::bail!(format!(
"{} with pid {} did not stop in {:?} seconds",
process_name, pid, STOP_RETRY_TIMEOUT
));
}

fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
Expand Down
62 changes: 49 additions & 13 deletions control_plane/src/bin/neon_local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
Expand Down Expand Up @@ -99,7 +100,7 @@ fn main() -> Result<()> {
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env)),
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
Expand Down Expand Up @@ -1048,10 +1049,20 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
))
}

fn get_start_timeout(args: &ArgMatches) -> &Duration {
let humantime_duration = args
.get_one::<humantime::Duration>("start-timeout")
.expect("invalid value for start-timeout");
humantime_duration.as_ref()
}

async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(get_start_timeout(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
Expand All @@ -1077,7 +1088,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}

if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
eprintln!("pageserver start failed: {e}");
exit(1);
}
Expand Down Expand Up @@ -1105,8 +1116,8 @@ async fn handle_storage_controller(
) -> Result<()> {
let svc = StorageController::from_env(env);
match sub_match.subcommand() {
Some(("start", _start_match)) => {
if let Err(e) = svc.start().await {
Some(("start", start_match)) => {
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
eprintln!("start failed: {e}");
exit(1);
}
Expand Down Expand Up @@ -1165,7 +1176,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);

if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper
.start(extra_opts, get_start_timeout(sub_args))
.await
{
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
Expand All @@ -1191,7 +1205,10 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}

let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper
.start(extra_opts, get_start_timeout(sub_args))
.await
{
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
Expand All @@ -1204,15 +1221,18 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}

async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
async fn handle_start_all(
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
// Endpoints are not started automatically

broker::start_broker_process(env).await?;
broker::start_broker_process(env, retry_timeout).await?;

// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller.start().await {
if let Err(e) = storage_controller.start(retry_timeout).await {
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
Expand All @@ -1221,7 +1241,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver.start(retry_timeout).await {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
Expand All @@ -1230,7 +1250,7 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
exit(1);
Expand Down Expand Up @@ -1290,6 +1310,15 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}

fn cli() -> Command {
let timeout_arg = Arg::new("start-timeout")
.long("start-timeout")
.short('t')
.global(true)
.help("timeout until we fail the command, e.g. 30s")
.value_parser(value_parser!(humantime::Duration))
.default_value("10s")
.required(false);

let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.help("Name of the branch to be created or used as an alias for other services")
Expand Down Expand Up @@ -1509,20 +1538,23 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
.arg(stop_mode_arg.clone())
)
.subcommand(Command::new("restart")
.about("Restart local pageserver")
.arg(timeout_arg.clone())
)
)
.subcommand(
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller"))
.subcommand(Command::new("start").about("Start storage controller")
.arg(timeout_arg.clone()))
.subcommand(Command::new("stop").about("Stop storage controller")
.arg(stop_mode_arg.clone()))
)
Expand All @@ -1534,6 +1566,7 @@ fn cli() -> Command {
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(safekeeper_extra_opt_arg.clone())
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local safekeeper")
Expand All @@ -1545,6 +1578,7 @@ fn cli() -> Command {
.arg(safekeeper_id_arg)
.arg(stop_mode_arg.clone())
.arg(safekeeper_extra_opt_arg)
.arg(timeout_arg.clone())
)
)
.subcommand(
Expand Down Expand Up @@ -1579,6 +1613,7 @@ fn cli() -> Command {
.arg(remote_ext_config_args)
.arg(create_test_user)
.arg(allow_multiple.clone())
.arg(timeout_arg.clone())
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
Expand Down Expand Up @@ -1630,6 +1665,7 @@ fn cli() -> Command {
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(timeout_arg.clone())
)
.subcommand(
Command::new("stop")
Expand Down
8 changes: 7 additions & 1 deletion control_plane/src/broker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use std::time::Duration;

use anyhow::Context;

use camino::Utf8PathBuf;

use crate::{background_process, local_env};

pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
pub async fn start_broker_process(
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;

Expand All @@ -27,6 +32,7 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
args,
[],
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
retry_timeout,
|| async {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
Expand Down
12 changes: 7 additions & 5 deletions control_plane/src/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}

pub async fn start(&self) -> anyhow::Result<()> {
self.start_node().await
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
self.start_node(retry_timeout).await
}

fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
Expand Down Expand Up @@ -214,14 +214,15 @@ impl PageServerNode {
Ok(())
}

async fn start_node(&self) -> anyhow::Result<()> {
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
self.conf.id,
self.pg_connection_config.raw_address(),
datadir
datadir,
retry_timeout
);
io::stdout().flush().context("flush stdout")?;

Expand All @@ -239,6 +240,7 @@ impl PageServerNode {
args,
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
retry_timeout,
|| async {
let st = self.check_status().await;
match st {
Expand Down
Loading
Loading