Skip to content

Commit

Permalink
Improve metrics push
Browse files (browse the repository at this point in the history)
  • Loading branch information
mwtian committed Sep 20, 2024
1 parent 06b1e4d commit 08c0a79
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 10 deletions.
4 changes: 2 additions & 2 deletions crates/mysten-common/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use prometheus::Encoder;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{debug, error, info};

const DEFAULT_METRICS_PUSH_TIMEOUT: Duration = Duration::from_secs(30);
const METRICS_PUSH_TIMEOUT: Duration = Duration::from_secs(45);

pub struct MetricsPushClient {
certificate: std::sync::Arc<sui_tls::SelfSignedCertificate>,
Expand Down Expand Up @@ -77,7 +77,7 @@ pub async fn push_metrics(
.header(reqwest::header::CONTENT_ENCODING, "snappy")
.header(reqwest::header::CONTENT_TYPE, prometheus::PROTOBUF_FORMAT)
.body(compressed)
.timeout(DEFAULT_METRICS_PUSH_TIMEOUT)
.timeout(METRICS_PUSH_TIMEOUT)
.send()
.await?;

Expand Down
15 changes: 11 additions & 4 deletions crates/sui-bridge/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,22 @@ pub fn start_metrics_push_task(
let mut interval = tokio::time::interval(interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

let mut errors = 0;
loop {
interval.tick().await;

// Retry pushing metrics if there is an error.
while let Err(error) = push_metrics(&client, &url, &registry).await {
tracing::warn!("unable to push metrics: {error}; new client will be created");
sleep(Duration::from_secs(1)).await;
if let Err(error) = push_metrics(&client, &url, &registry).await {
errors += 1;
if errors >= 10 {
// If we hit 10 failures in a row, start logging errors.
tracing::error!("unable to push metrics: {error}; new client will be created");
} else {
tracing::warn!("unable to push metrics: {error}; new client will be created");
}
// aggressively recreate our client connection if we hit an error
client = MetricsPushClient::new(metrics_key_pair.copy());
} else {
errors = 0;
}
}
});
Expand Down
15 changes: 11 additions & 4 deletions crates/sui-node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,22 @@ pub fn start_metrics_push_task(config: &sui_config::NodeConfig, registry: Regist
let mut interval = tokio::time::interval(interval);
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

let mut errors = 0;
loop {
interval.tick().await;

// Retry pushing metrics if there is an error.
while let Err(error) = push_metrics(&client, &url, &registry).await {
tracing::warn!("unable to push metrics: {error}; new client will be created");
sleep(Duration::from_secs(1)).await;
if let Err(error) = push_metrics(&client, &url, &registry).await {
errors += 1;
if errors >= 10 {
// If we hit 10 failures in a row, start logging errors.
tracing::error!("unable to push metrics: {error}; new client will be created");
} else {
tracing::warn!("unable to push metrics: {error}; new client will be created");
}
// aggressively recreate our client connection if we hit an error
client = MetricsPushClient::new(config_copy.network_key_pair().copy());
} else {
errors = 0;
}
}
});
Expand Down

0 comments on commit 08c0a79

Please sign in to comment.