diff --git a/crates/sui-proxy/MYPKG b/crates/sui-proxy/MYPKG index 34069932888f9..066035b8f8ab6 100644 --- a/crates/sui-proxy/MYPKG +++ b/crates/sui-proxy/MYPKG @@ -5,11 +5,12 @@ rust_binary( ) mypkg( name="sui-proxy", - version="bf9f49ff50ff1c4b3378319dfb16dd0a", + version="77007d47482d76f49ed5326807b6f2a0", ) podman_build( name="sui-proxy-image", registry="docker//us-central1-docker.pkg.dev/cryptic-bolt-398315/sui-proxy", dockerfile="sui_proxy_dockerfile", resources=[":sui-proxy"], -) + tag="testnet" +) \ No newline at end of file diff --git a/crates/sui-proxy/src/metrics.rs b/crates/sui-proxy/src/metrics.rs index 063d7fefd967c..c93d19a1d024c 100644 --- a/crates/sui-proxy/src/metrics.rs +++ b/crates/sui-proxy/src/metrics.rs @@ -4,12 +4,33 @@ use axum::{extract::Extension, http::StatusCode, routing::get, Router}; use mysten_metrics::RegistryService; use prometheus::{Registry, TextEncoder}; use std::net::TcpListener; +use std::sync::{Arc, RwLock}; use tower::ServiceBuilder; use tower_http::trace::{DefaultOnResponse, TraceLayer}; use tower_http::LatencyUnit; use tracing::Level; const METRICS_ROUTE: &str = "/metrics"; +const POD_HEALTH_ROUTE: &str = "/pod_health"; + +type HealthCheckMetrics = Arc>; + +/// Do not access struct members without using HealthCheckMetrics to arc+mutex +#[derive(Debug)] +struct HealthCheck { + // eg; consumer_operations_submitted{...} + consumer_operations_submitted: f64, +} + +/// HealthCheck contains fields we believe are interesting that say whether this pod should be +/// considered health. do not use w/o using an arc+mutex +impl HealthCheck { + fn new() -> Self { + Self { + consumer_operations_submitted: 0.0, + } + } +} // Creates a new http server that has as a sole purpose to expose // and endpoint that prometheus agent can use to poll for the metrics. @@ -19,9 +40,13 @@ pub fn start_prometheus_server(listener: TcpListener) -> RegistryService { let registry_service = RegistryService::new(registry); + let pod_health_data = Arc::new(RwLock::new(HealthCheck::new())); + let app = Router::new() .route(METRICS_ROUTE, get(metrics)) + .route(POD_HEALTH_ROUTE, get(pod_health)) .layer(Extension(registry_service.clone())) + .layer(Extension(pod_health_data.clone())) .layer( ServiceBuilder::new().layer( TraceLayer::new_for_http().on_response( @@ -42,9 +67,30 @@ pub fn start_prometheus_server(listener: TcpListener) -> RegistryService { } // DO NOT remove this handler, it is not compatible with the mysten_metrics::metric equivalent -async fn metrics(Extension(registry_service): Extension) -> (StatusCode, String) { +async fn metrics( + Extension(registry_service): Extension, + Extension(pod_health): Extension, +) -> (StatusCode, String) { let mut metric_families = registry_service.gather_all(); metric_families.extend(prometheus::gather()); + + if let Some(consumer_operations_submitted) = metric_families + .iter() + .filter_map(|v| { + if v.get_name() == "consumer_operations_submitted" { + // Expecting one metric, so return the first one, as it is the only one + v.get_metric().first().map(|m| m.get_counter().get_value()) + } else { + None + } + }) + .next() + { + pod_health + .write() + .expect("unable to write to pod health metrics") + .consumer_operations_submitted = consumer_operations_submitted; + }; match TextEncoder.encode_to_string(&metric_families) { Ok(metrics) => (StatusCode::OK, metrics), Err(error) => ( @@ -53,3 +99,20 @@ async fn metrics(Extension(registry_service): Extension) -> (St ), } } + +/// pod_health is called by k8s to know if this service is correctly processing data +async fn pod_health(Extension(pod_health): Extension) -> (StatusCode, String) { + let consumer_operations_submitted = pod_health + .read() + .expect("unable to read pod health metrics") + .consumer_operations_submitted; + + if consumer_operations_submitted > 0.0 { + (StatusCode::OK, consumer_operations_submitted.to_string()) + } else { + ( + StatusCode::SERVICE_UNAVAILABLE, + consumer_operations_submitted.to_string(), + ) + } +}