Skip to content

Commit

Permalink
feat(phoenix-service): enable running only node checks
Browse files Browse the repository at this point in the history
  • Loading branch information
blombern committed Sep 3, 2024
1 parent 95c93ed commit ecb398a
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 25 deletions.
3 changes: 3 additions & 0 deletions src/phoenix/env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ pub struct AppConfig {
pub consensus_nodes: Vec<Url>,
pub database_url: String,
pub env: Env,
/// Skip global checks in `run_ops_monitors` and only check for beacon/sim node status.
#[serde(default)]
pub ff_node_check_only: bool,
pub loki_url: String,
/// Minimum number of missed slots per check interval to trigger an alert
#[serde(default = "default_missed_slots_alert_threshold")]
Expand Down
67 changes: 42 additions & 25 deletions src/phoenix/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,32 +311,49 @@ pub async fn monitor_critical_services() -> Result<()> {

let last_checked = Arc::new(Mutex::new(Utc::now()));

let result = tokio::try_join!(
mount_health_route(),
run_alarm_loop(last_checked),
run_ops_monitors()
);

match result {
Ok(_) => {
let message = TelegramSafeAlert::new("phoenix processes exited unexpectedly");
telegram_alerts.send_alert(message.clone()).await;
Err(anyhow!(message))
// Skip global checks and only check nodes
if APP_CONFIG.ff_node_check_only {
let result = tokio::try_join!(mount_health_route(), run_alarm_loop(last_checked));
match result {
Ok(_) => handle_unexpected_exit(telegram_alerts).await,
Err(err) => handle_unexpected_error(telegram_alerts, err).await,
}
Err(err) => {
let shortned_err = err.to_string().split_off(3072);
let escaped_err = telegram::escape_str(&shortned_err);
let formatted_message = formatdoc!(
"
phoenix process exited with error
```error
{escaped_err}
```
"
);
let message = TelegramSafeAlert::from_escaped_string(formatted_message);
telegram_alerts.send_alert(message).await;
Err(anyhow!(err))
}
// Run all checks
else {
let result = tokio::try_join!(
mount_health_route(),
run_alarm_loop(last_checked),
run_ops_monitors()
);
match result {
Ok(_) => handle_unexpected_exit(telegram_alerts).await,
Err(err) => handle_unexpected_error(telegram_alerts, err).await,
}
}
}

/// Reports that the supervised phoenix tasks returned without an error.
///
/// Sends a Telegram alert carrying a fixed message, then returns that same
/// message as an error. This function never returns `Ok`.
async fn handle_unexpected_exit(telegram_alerts: TelegramAlerts) -> Result<()> {
    let exit_alert = TelegramSafeAlert::new("phoenix processes exited unexpectedly");

    // A clone goes to Telegram; the original is kept to build the returned error.
    telegram_alerts.send_alert(exit_alert.clone()).await;

    Err(anyhow!(exit_alert))
}

/// Reports an error returned by the supervised phoenix tasks and propagates it.
///
/// The error text is cut to at most 3072 bytes, escaped with
/// `telegram::escape_str`, wrapped in a fenced `error` block and sent as a
/// Telegram alert. The original error is then returned to the caller, so this
/// function never returns `Ok`.
///
/// # Errors
///
/// Always returns `Err` wrapping `err`.
async fn handle_unexpected_error(
    telegram_alerts: TelegramAlerts,
    err: anyhow::Error,
) -> Result<()> {
    // Upper bound, in bytes, on the error text included in the alert.
    // NOTE(review): presumably chosen to stay under Telegram's message size
    // limit once escaping and the surrounding template are added; confirm.
    const MAX_ERR_BYTES: usize = 3072;

    // Keep the *head* of the message. `String::split_off(n)` returns the tail
    // and panics when `n` exceeds the length, which would abort the alert for
    // every error shorter than the limit.
    let mut shortened_err = err.to_string();
    if shortened_err.len() > MAX_ERR_BYTES {
        // `String::truncate` panics off a char boundary, so back up to the
        // nearest boundary at or below the limit. Index 0 is always a boundary.
        let cut = (0..=MAX_ERR_BYTES)
            .rev()
            .find(|&idx| shortened_err.is_char_boundary(idx))
            .unwrap_or(0);
        shortened_err.truncate(cut);
    }

    let escaped_err = telegram::escape_str(&shortened_err);
    let formatted_message = formatdoc!(
        "
        phoenix process exited with error
        ```error
        {escaped_err}
        ```
        "
    );
    let message = TelegramSafeAlert::from_escaped_string(formatted_message);
    telegram_alerts.send_alert(message).await;
    Err(anyhow!(err))
}

0 comments on commit ecb398a

Please sign in to comment.