Skip to content

Commit

Permalink
pageserver: respond to multiple shutdown signals (#9982)
Browse files Browse the repository at this point in the history
## Problem

The Pageserver signal handler would only respond to a single signal and
initiate shutdown. Subsequent signals were ignored. This meant that a
`SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a
slow or stalled shutdown). The `test_runner` uses this to force shutdown
if graceful shutdown is slow.

Touches #9740.

## Summary of changes

Keep responding to signals after the initial shutdown signal has been
received.

Arguably, the `test_runner` should also use `SIGKILL` rather than
`SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT`
regardless.
  • Loading branch information
erikgrinaker authored and awarus committed Dec 5, 2024
1 parent 6024112 commit 4157eaf
Showing 1 changed file with 48 additions and 34 deletions.
82 changes: 48 additions & 34 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -636,45 +636,59 @@ fn start_pageserver(
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
});

let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

// All started up! Now just sit and wait for shutdown signal.

{
BACKGROUND_RUNTIME.block_on(async move {
BACKGROUND_RUNTIME.block_on(async move {
let signal_token = CancellationToken::new();
let signal_cancel = signal_token.child_token();

// Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
// https://github.com/neondatabase/neon/issues/9740.
tokio::spawn(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);

loop {
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
std::process::exit(111);
}
_ = sigint.recv() => "SIGINT",
_ = sigterm.recv() => "SIGTERM",
};

if !signal_token.is_cancelled() {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
signal_token.cancel();
} else {
info!("Got signal {signal}. Already shutting down.");
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};

info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);

// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
pageserver::shutdown_pageserver(
http_endpoint_listener,
page_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,
background_purges,
deletion_queue.clone(),
secondary_controller_tasks,
0,
)
.await;
unreachable!()
})
}
}
});

// Wait for cancellation signal and shut down the pageserver.
//
// This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't
// reach very far, and `task_mgr` is used instead. The plan is to change that over time.
signal_cancel.cancelled().await;

shutdown_pageserver.cancel();
pageserver::shutdown_pageserver(
http_endpoint_listener,
page_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,
background_purges,
deletion_queue.clone(),
secondary_controller_tasks,
0,
)
.await;
unreachable!();
})
}

async fn create_remote_storage_client(
Expand Down

0 comments on commit 4157eaf

Please sign in to comment.