From 9e70b089e43813ff5a8337cd8495ab4350585a00 Mon Sep 17 00:00:00 2001 From: Maria Kuklina <101095419+kmd-fl@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:21:40 +0200 Subject: [PATCH] feat(metrics): add info nox config metrics (#2387) --- crates/chain-listener/src/listener.rs | 12 +++ crates/peer-metrics/src/chain_listener.rs | 27 +++++ crates/peer-metrics/src/info.rs | 53 +++++++++- crates/peer-metrics/src/lib.rs | 4 +- crates/server-config/src/node_config.rs | 18 ++++ nox/src/node.rs | 115 ++++++++++++++++------ 6 files changed, 198 insertions(+), 31 deletions(-) diff --git a/crates/chain-listener/src/listener.rs b/crates/chain-listener/src/listener.rs index 9304b94157..b8c0617412 100644 --- a/crates/chain-listener/src/listener.rs +++ b/crates/chain-listener/src/listener.rs @@ -312,6 +312,18 @@ impl ChainListener { async fn refresh_current_commitment_id(&mut self) -> eyre::Result<()> { match self.chain_connector.get_current_commitment_id().await { Ok(id) => { + // This is the only place where `current_commitment` is updated, so it should be fine + // to observe the metrics here + if id != self.current_commitment { + if let Some(current_commitment) = &self.current_commitment { + self.observe(|m| { + m.observe_removed_commitment(current_commitment.to_string()) + }); + } + if let Some(new_commitment) = &id { + self.observe(|m| m.observe_new_commitment(new_commitment.to_string())); + } + } self.current_commitment = id; Ok(()) } diff --git a/crates/peer-metrics/src/chain_listener.rs b/crates/peer-metrics/src/chain_listener.rs index c8fa90fcba..c742e56c1a 100644 --- a/crates/peer-metrics/src/chain_listener.rs +++ b/crates/peer-metrics/src/chain_listener.rs @@ -21,6 +21,7 @@ use crate::{execution_time_buckets, register}; use prometheus_client::encoding::EncodeLabelSet; use prometheus_client::metrics::counter::Counter; use prometheus_client::metrics::exemplar::CounterWithExemplar; +use prometheus_client::metrics::family::Family; use 
prometheus_client::metrics::gauge::Gauge; use prometheus_client::metrics::histogram::Histogram; use prometheus_client::registry::Registry; @@ -30,6 +31,11 @@ struct TxLabel { tx_hash: String, } +#[derive(EncodeLabelSet, Hash, Clone, Eq, PartialEq, Debug)] +struct CommitmentLabel { + commitment_id: String, +} + #[derive(Clone)] pub struct ChainListenerMetrics { // how many request Nox sends to ccp @@ -54,6 +60,7 @@ pub struct ChainListenerMetrics { blocks_processed: Counter, last_process_block: Gauge, current_commitment_status: Gauge, + current_commitment: Family<CommitmentLabel, Gauge>, } impl ChainListenerMetrics { @@ -143,6 +150,13 @@ impl ChainListenerMetrics { "Current commitment status", ); + let current_commitment = register( + sub_registry, + Family::default(), + "current_commitment", + "Current commitment", + ); + Self { ccp_requests_total, ccp_replies_total, @@ -156,6 +170,7 @@ impl ChainListenerMetrics { blocks_processed, last_process_block, current_commitment_status, + current_commitment, } } @@ -198,4 +213,16 @@ impl ChainListenerMetrics { pub fn observe_commiment_status(&self, status: u64) { self.current_commitment_status.set(status as i64); } + + pub fn observe_new_commitment(&self, commitment_id: String) { + self.current_commitment + .get_or_create(&CommitmentLabel { commitment_id }) + .set(1); + } + + pub fn observe_removed_commitment(&self, commitment_id: String) { + self.current_commitment + .get_or_create(&CommitmentLabel { commitment_id }) + .set(0); + } } diff --git a/crates/peer-metrics/src/info.rs b/crates/peer-metrics/src/info.rs index bc6ada040d..9921eaa007 100644 --- a/crates/peer-metrics/src/info.rs +++ b/crates/peer-metrics/src/info.rs @@ -17,13 +17,18 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. 
*/ -use prometheus_client::encoding::EncodeLabelSet; +use std::fmt::{Error, Write}; + +use prometheus_client::encoding::{EncodeLabelSet, EncodeLabelValue, LabelValueEncoder}; use prometheus_client::metrics::info::Info; use prometheus_client::registry::Registry; pub struct NoxInfo { pub version: NoxVersion, pub chain_info: ChainInfo, + pub vm_info: VmInfo, + pub network_info: NetworkInfo, + pub system_info: SystemInfo, } #[derive(Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)] @@ -71,6 +76,43 @@ impl ChainInfo { } } +#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)] +pub struct SystemInfo { + pub cpus_range: String, + pub system_cpu_count: usize, + pub particle_execution_timeout_sec: u64, + pub max_spell_particle_ttl_sec: u64, +} + +#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)] +pub struct VmInfo { + pub allow_gpu: u8, + pub public_ip: String, + pub host_ssh_port: u16, + pub vm_ssh_port: u16, + pub port_range_start: u16, + pub port_range_end: u16, +} + +#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)] +pub struct NetworkInfo { + pub tcp_port: u16, + pub websocket_port: u16, + pub listen_ip: String, + pub network_type: String, + pub bootstrap_nodes: Addresses, + pub external_address: Option<String>, + pub external_multiaddresses: Addresses, +} + +#[derive(Debug, Clone, Hash, Eq, PartialEq, Default)] +pub struct Addresses(pub Vec<String>); +impl EncodeLabelValue for Addresses { + fn encode(&self, encoder: &mut LabelValueEncoder) -> Result<(), Error> { + encoder.write_str(&self.0.join(", ")) + } +} + pub fn add_info_metrics(registry: &mut Registry, nox_info: NoxInfo) { let sub_registry = registry.sub_registry_with_prefix("nox"); @@ -79,4 +121,13 @@ pub fn add_info_metrics(registry: &mut Registry, nox_info: NoxInfo) { let chain_info = Info::new(nox_info.chain_info); sub_registry.register("chain", "Chain Nox Info", chain_info); + + let network_info = Info::new(nox_info.network_info); + sub_registry.register("network", 
"Network Nox Info", network_info); + + let vm_info = Info::new(nox_info.vm_info); + sub_registry.register("vm", "VM Nox Info", vm_info); + + let system_info = Info::new(nox_info.system_info); + sub_registry.register("system", "System Nox Info", system_info); } diff --git a/crates/peer-metrics/src/lib.rs b/crates/peer-metrics/src/lib.rs index c4d105d14b..f7116f59f5 100644 --- a/crates/peer-metrics/src/lib.rs +++ b/crates/peer-metrics/src/lib.rs @@ -27,7 +27,9 @@ pub use connection_pool::ConnectionPoolMetrics; pub use connectivity::ConnectivityMetrics; pub use connectivity::Resolution; pub use dispatcher::DispatcherMetrics; -pub use info::{add_info_metrics, ChainInfo, NoxInfo, NoxVersion}; +pub use info::{ + add_info_metrics, Addresses, ChainInfo, NetworkInfo, NoxInfo, NoxVersion, SystemInfo, VmInfo, +}; use particle_execution::ParticleParams; pub use particle_executor::{FunctionKind, ParticleExecutorMetrics, WorkerLabel, WorkerType}; pub use services_metrics::{ diff --git a/crates/server-config/src/node_config.rs b/crates/server-config/src/node_config.rs index 8c44a209cc..e0eb773229 100644 --- a/crates/server-config/src/node_config.rs +++ b/crates/server-config/src/node_config.rs @@ -18,6 +18,7 @@ */ use std::collections::{BTreeMap, HashMap}; +use std::fmt::{Display, Formatter}; use std::net::{IpAddr, Ipv4Addr}; use std::ops::Deref; use std::path::{Path, PathBuf}; @@ -212,6 +213,17 @@ impl TryFrom<&Network> for StreamProtocol { } } +impl Display for Network { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Network::Dar => write!(f, "dar"), + Network::Stage => write!(f, "stage"), + Network::Kras => write!(f, "kras"), + Network::Custom(bytes) => write!(f, "custom:{}", hex::encode(bytes)), + } + } +} + impl UnresolvedNodeConfig { pub fn resolve(self, persistent_base_dir: &Path) -> eyre::Result { let bootstrap_nodes = match self.local { @@ -682,3 +694,9 @@ fn default_port_range_config() -> PortRangeConfig { end: 65535, } } + +impl Display 
for PortRangeConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}-{}", self.start, self.end) + } +} diff --git a/nox/src/node.rs b/nox/src/node.rs index 7c0830cea1..220adf9b0b 100644 --- a/nox/src/node.rs +++ b/nox/src/node.rs @@ -63,7 +63,7 @@ use peer_metrics::{ ServicesMetrics, ServicesMetricsBackend, SpellMetrics, VmPoolMetrics, }; use server_config::system_services_config::ServiceKey; -use server_config::{NetworkConfig, ResolvedConfig}; +use server_config::{NetworkConfig, NodeConfig, ResolvedConfig}; use sorcerer::Sorcerer; use spell_event_bus::api::{PeerEvent, SpellEventBusApi, TriggerEvent}; use spell_event_bus::bus::SpellEventBus; @@ -420,34 +420,7 @@ impl Node { }), }; if let Some(m) = metrics_registry.as_mut() { - let mut chain_info = peer_metrics::ChainInfo::default(peer_id.to_string()); - if let Some(connector_cfg) = &config.chain_config { - chain_info.http_endpoint = connector_cfg.http_endpoint.clone(); - chain_info.diamond_contract_address = - connector_cfg.diamond_contract_address.clone(); - chain_info.network_id = connector_cfg.network_id; - chain_info.default_base_fee = connector_cfg.default_base_fee.clone(); - chain_info.default_priority_fee = connector_cfg.default_priority_fee.clone(); - } - - if let Some(chain_listener_cfg) = &config.chain_listener_config { - chain_info.ws_endpoint = chain_listener_cfg.ws_endpoint.clone(); - chain_info.proof_poll_period_secs = chain_listener_cfg.proof_poll_period.as_secs(); - chain_info.min_batch_count = chain_listener_cfg.min_batch_count; - chain_info.max_batch_count = chain_listener_cfg.max_batch_count; - chain_info.max_proof_batch_size = chain_listener_cfg.max_proof_batch_size; - chain_info.epoch_end_window_secs = chain_listener_cfg.epoch_end_window.as_secs(); - } - - let nox_info = peer_metrics::NoxInfo { - version: peer_metrics::NoxVersion { - node_version: node_info.node_version.to_string(), - air_version: node_info.air_version.to_string(), - spell_version: 
node_info.spell_version.to_string(), - }, - chain_info, - }; - + let nox_info = to_nox_info_metrics(&config, &node_info, peer_id.to_string()); peer_metrics::add_info_metrics(m, nox_info); } custom_service_functions.extend_one(make_peer_builtin(node_info)); @@ -866,6 +839,90 @@ fn services_wasm_backend_config(config: &ResolvedConfig) -> WasmBackendConfig { } } +fn to_nox_info_metrics( + config: &NodeConfig, + node_info: &NodeInfo, + peer_id: String, +) -> peer_metrics::NoxInfo { + use peer_metrics::*; + + let mut chain_info = ChainInfo::default(peer_id); + if let Some(connector_cfg) = &config.chain_config { + chain_info.http_endpoint = connector_cfg.http_endpoint.clone(); + chain_info.diamond_contract_address = connector_cfg.diamond_contract_address.clone(); + chain_info.network_id = connector_cfg.network_id; + chain_info.default_base_fee = connector_cfg.default_base_fee.clone(); + chain_info.default_priority_fee = connector_cfg.default_priority_fee.clone(); + } + + if let Some(chain_listener_cfg) = &config.chain_listener_config { + chain_info.ws_endpoint = chain_listener_cfg.ws_endpoint.clone(); + chain_info.proof_poll_period_secs = chain_listener_cfg.proof_poll_period.as_secs(); + chain_info.min_batch_count = chain_listener_cfg.min_batch_count; + chain_info.max_batch_count = chain_listener_cfg.max_batch_count; + chain_info.max_proof_batch_size = chain_listener_cfg.max_proof_batch_size; + chain_info.epoch_end_window_secs = chain_listener_cfg.epoch_end_window.as_secs(); + } + + let version = NoxVersion { + node_version: node_info.node_version.to_string(), + air_version: node_info.air_version.to_string(), + spell_version: node_info.spell_version.to_string(), + }; + + let vm_info = config + .vm + .as_ref() + .map(|vm| VmInfo { + allow_gpu: if vm.allow_gpu { 1 } else { 0 }, + public_ip: vm.network.public_ip.to_string(), + host_ssh_port: vm.network.host_ssh_port, + vm_ssh_port: vm.network.vm_ssh_port, + port_range_start: vm.network.port_range.start, + port_range_end: 
vm.network.port_range.end, + }) + .unwrap_or_default(); + + let network_info = NetworkInfo { + tcp_port: config.listen_config.tcp_port, + websocket_port: config.listen_config.websocket_port, + listen_ip: config.listen_config.listen_ip.to_string(), + network_type: format!("{}", config.network), + bootstrap_nodes: Addresses( + config + .bootstrap_nodes + .clone() + .iter() + .map(|a| a.to_string()) + .collect::<_>(), + ), + external_address: config.external_address.map(|a| a.to_string()), + external_multiaddresses: Addresses( + config + .external_multiaddresses + .clone() + .iter() + .map(|a| a.to_string()) + .collect::<_>(), + ), + }; + + let system_info = SystemInfo { + cpus_range: format!("{}", config.cpus_range), + system_cpu_count: config.system_cpu_count, + particle_execution_timeout_sec: config.particle_execution_timeout.as_secs(), + max_spell_particle_ttl_sec: config.max_spell_particle_ttl.as_secs(), + }; + + NoxInfo { + version, + chain_info, + vm_info, + network_info, + system_info, + } +} + #[cfg(test)] mod tests { use std::path::PathBuf;