Skip to content

Commit

Permalink
pageserver: expose prometheus metrics for startup time
Browse files Browse the repository at this point in the history
These are broken down by phase, where the phases correspond
to the existing wait points in the code:
- Start of doing I/O
- When tenant load is done
- When initial size calculation is done
- When background jobs start
- Then "complete" when everything is done.
  • Loading branch information
jcsp committed Aug 3, 2023
1 parent cd33089 commit 6707d58
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 1 deletion.
26 changes: 26 additions & 0 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::STARTUP_DURATION;
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;

use metrics::set_build_info_metric;
Expand Down Expand Up @@ -226,6 +228,19 @@ fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
) -> anyhow::Result<()> {
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();

let startup_checkpoint = move |phase: &str| {
STARTUP_DURATION.with_label_values(&[phase]).set(
started_startup_at
.elapsed()
.as_millis()
.try_into()
.unwrap_or(u64::MAX),
);
};

// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
Expand Down Expand Up @@ -335,6 +350,10 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;

// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial");

// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
Expand Down Expand Up @@ -378,6 +397,8 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load");

// initial logical sizes can now start, as they were waiting on init_done_rx.

scopeguard::ScopeGuard::into_inner(guard);
Expand All @@ -404,6 +425,7 @@ fn start_pageserver(
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
startup_checkpoint("initial_logical_sizes");
None
}
Err(_) => {
Expand All @@ -419,6 +441,7 @@ fn start_pageserver(

// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint("background_jobs_can_start");

if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
Expand All @@ -434,8 +457,11 @@ fn start_pageserver(
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
startup_checkpoint("initial_logical_sizes");

}

startup_checkpoint("complete");
};

async move {
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub(crate) mod metrics;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
Expand Down
11 changes: 11 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,17 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
.expect("failed to define a metric")
});

/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_startup_duration_ms",
"Time taken by phases of pageserver startup, in milliseconds",
&["phase"]
)
.expect("Failed to register pageserver_startup_duration_ms metric")
});

/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
Expand Down

0 comments on commit 6707d58

Please sign in to comment.