Skip to content

Commit

Permalink
feat(metrics): add info nox config metrics (#2387)
Browse files Browse the repository at this point in the history
  • Loading branch information
kmd-fl committed Sep 26, 2024
1 parent 8557a3a commit 9e70b08
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 31 deletions.
12 changes: 12 additions & 0 deletions crates/chain-listener/src/listener.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,18 @@ impl ChainListener {
async fn refresh_current_commitment_id(&mut self) -> eyre::Result<()> {
match self.chain_connector.get_current_commitment_id().await {
Ok(id) => {
// This is the only place where `current_commitment` is updated, so it should be fine
// to observe the metrics it here
if id != self.current_commitment {
if let Some(current_commitment) = &self.current_commitment {
self.observe(|m| {
m.observe_removed_commitment(current_commitment.to_string())
});
}
if let Some(new_commitment) = &id {
self.observe(|m| m.observe_new_commitment(new_commitment.to_string()));
}
}
self.current_commitment = id;
Ok(())
}
Expand Down
27 changes: 27 additions & 0 deletions crates/peer-metrics/src/chain_listener.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::{execution_time_buckets, register};
use prometheus_client::encoding::EncodeLabelSet;
use prometheus_client::metrics::counter::Counter;
use prometheus_client::metrics::exemplar::CounterWithExemplar;
use prometheus_client::metrics::family::Family;
use prometheus_client::metrics::gauge::Gauge;
use prometheus_client::metrics::histogram::Histogram;
use prometheus_client::registry::Registry;
Expand All @@ -30,6 +31,11 @@ struct TxLabel {
tx_hash: String,
}

#[derive(EncodeLabelSet, Hash, Clone, Eq, PartialEq, Debug)]
struct CommitmentLabel {
commitment_id: String,
}

#[derive(Clone)]
pub struct ChainListenerMetrics {
// how many request Nox sends to ccp
Expand All @@ -54,6 +60,7 @@ pub struct ChainListenerMetrics {
blocks_processed: Counter,
last_process_block: Gauge,
current_commitment_status: Gauge,
current_commitment: Family<CommitmentLabel, Gauge>,
}

impl ChainListenerMetrics {
Expand Down Expand Up @@ -143,6 +150,13 @@ impl ChainListenerMetrics {
"Current commitment status",
);

let current_commitment = register(
sub_registry,
Family::default(),
"current_commitment",
"Current commitment",
);

Self {
ccp_requests_total,
ccp_replies_total,
Expand All @@ -156,6 +170,7 @@ impl ChainListenerMetrics {
blocks_processed,
last_process_block,
current_commitment_status,
current_commitment,
}
}

Expand Down Expand Up @@ -198,4 +213,16 @@ impl ChainListenerMetrics {
pub fn observe_commiment_status(&self, status: u64) {
self.current_commitment_status.set(status as i64);
}

pub fn observe_new_commitment(&self, commitment_id: String) {
self.current_commitment
.get_or_create(&CommitmentLabel { commitment_id })
.set(1);
}

pub fn observe_removed_commitment(&self, commitment_id: String) {
self.current_commitment
.get_or_create(&CommitmentLabel { commitment_id })
.set(0);
}
}
53 changes: 52 additions & 1 deletion crates/peer-metrics/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

use prometheus_client::encoding::EncodeLabelSet;
use std::fmt::{Error, Write};

use prometheus_client::encoding::{EncodeLabelSet, EncodeLabelValue, LabelValueEncoder};
use prometheus_client::metrics::info::Info;
use prometheus_client::registry::Registry;

pub struct NoxInfo {
pub version: NoxVersion,
pub chain_info: ChainInfo,
pub vm_info: VmInfo,
pub network_info: NetworkInfo,
pub system_info: SystemInfo,
}

#[derive(Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)]
Expand Down Expand Up @@ -71,6 +76,43 @@ impl ChainInfo {
}
}

#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)]
pub struct SystemInfo {
pub cpus_range: String,
pub system_cpu_count: usize,
pub particle_execution_timeout_sec: u64,
pub max_spell_particle_ttl_sec: u64,
}

#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)]
pub struct VmInfo {
pub allow_gpu: u8,
pub public_ip: String,
pub host_ssh_port: u16,
pub vm_ssh_port: u16,
pub port_range_start: u16,
pub port_range_end: u16,
}

#[derive(Default, Debug, Clone, Hash, Eq, PartialEq, EncodeLabelSet)]
pub struct NetworkInfo {
pub tcp_port: u16,
pub websocket_port: u16,
pub listen_ip: String,
pub network_type: String,
pub bootstrap_nodes: Addresses,
pub external_address: Option<String>,
pub external_multiaddresses: Addresses,
}

#[derive(Debug, Clone, Hash, Eq, PartialEq, Default)]
pub struct Addresses(pub Vec<String>);
impl EncodeLabelValue for Addresses {
fn encode(&self, encoder: &mut LabelValueEncoder) -> Result<(), Error> {
encoder.write_str(&self.0.join(", "))
}
}

pub fn add_info_metrics(registry: &mut Registry, nox_info: NoxInfo) {
let sub_registry = registry.sub_registry_with_prefix("nox");

Expand All @@ -79,4 +121,13 @@ pub fn add_info_metrics(registry: &mut Registry, nox_info: NoxInfo) {

let chain_info = Info::new(nox_info.chain_info);
sub_registry.register("chain", "Chain Nox Info", chain_info);

let network_info = Info::new(nox_info.network_info);
sub_registry.register("network", "Network Nox Info", network_info);

let vm_info = Info::new(nox_info.vm_info);
sub_registry.register("vm", "VM Nox Info", vm_info);

let system_info = Info::new(nox_info.system_info);
sub_registry.register("system", "System Nox Info", system_info);
}
4 changes: 3 additions & 1 deletion crates/peer-metrics/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ pub use connection_pool::ConnectionPoolMetrics;
pub use connectivity::ConnectivityMetrics;
pub use connectivity::Resolution;
pub use dispatcher::DispatcherMetrics;
pub use info::{add_info_metrics, ChainInfo, NoxInfo, NoxVersion};
pub use info::{
add_info_metrics, Addresses, ChainInfo, NetworkInfo, NoxInfo, NoxVersion, SystemInfo, VmInfo,
};
use particle_execution::ParticleParams;
pub use particle_executor::{FunctionKind, ParticleExecutorMetrics, WorkerLabel, WorkerType};
pub use services_metrics::{
Expand Down
18 changes: 18 additions & 0 deletions crates/server-config/src/node_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/

use std::collections::{BTreeMap, HashMap};
use std::fmt::{Display, Formatter};
use std::net::{IpAddr, Ipv4Addr};
use std::ops::Deref;
use std::path::{Path, PathBuf};
Expand Down Expand Up @@ -212,6 +213,17 @@ impl TryFrom<&Network> for StreamProtocol {
}
}

impl Display for Network {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Network::Dar => write!(f, "dar"),
Network::Stage => write!(f, "stage"),
Network::Kras => write!(f, "kras"),
Network::Custom(bytes) => write!(f, "custom:{}", hex::encode(bytes)),
}
}
}

impl UnresolvedNodeConfig {
pub fn resolve(self, persistent_base_dir: &Path) -> eyre::Result<NodeConfig> {
let bootstrap_nodes = match self.local {
Expand Down Expand Up @@ -682,3 +694,9 @@ fn default_port_range_config() -> PortRangeConfig {
end: 65535,
}
}

impl Display for PortRangeConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}-{}", self.start, self.end)
}
}
115 changes: 86 additions & 29 deletions nox/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ use peer_metrics::{
ServicesMetrics, ServicesMetricsBackend, SpellMetrics, VmPoolMetrics,
};
use server_config::system_services_config::ServiceKey;
use server_config::{NetworkConfig, ResolvedConfig};
use server_config::{NetworkConfig, NodeConfig, ResolvedConfig};
use sorcerer::Sorcerer;
use spell_event_bus::api::{PeerEvent, SpellEventBusApi, TriggerEvent};
use spell_event_bus::bus::SpellEventBus;
Expand Down Expand Up @@ -420,34 +420,7 @@ impl<RT: AquaRuntime> Node<RT> {
}),
};
if let Some(m) = metrics_registry.as_mut() {
let mut chain_info = peer_metrics::ChainInfo::default(peer_id.to_string());
if let Some(connector_cfg) = &config.chain_config {
chain_info.http_endpoint = connector_cfg.http_endpoint.clone();
chain_info.diamond_contract_address =
connector_cfg.diamond_contract_address.clone();
chain_info.network_id = connector_cfg.network_id;
chain_info.default_base_fee = connector_cfg.default_base_fee.clone();
chain_info.default_priority_fee = connector_cfg.default_priority_fee.clone();
}

if let Some(chain_listener_cfg) = &config.chain_listener_config {
chain_info.ws_endpoint = chain_listener_cfg.ws_endpoint.clone();
chain_info.proof_poll_period_secs = chain_listener_cfg.proof_poll_period.as_secs();
chain_info.min_batch_count = chain_listener_cfg.min_batch_count;
chain_info.max_batch_count = chain_listener_cfg.max_batch_count;
chain_info.max_proof_batch_size = chain_listener_cfg.max_proof_batch_size;
chain_info.epoch_end_window_secs = chain_listener_cfg.epoch_end_window.as_secs();
}

let nox_info = peer_metrics::NoxInfo {
version: peer_metrics::NoxVersion {
node_version: node_info.node_version.to_string(),
air_version: node_info.air_version.to_string(),
spell_version: node_info.spell_version.to_string(),
},
chain_info,
};

let nox_info = to_nox_info_metrics(&config, &node_info, peer_id.to_string());
peer_metrics::add_info_metrics(m, nox_info);
}
custom_service_functions.extend_one(make_peer_builtin(node_info));
Expand Down Expand Up @@ -866,6 +839,90 @@ fn services_wasm_backend_config(config: &ResolvedConfig) -> WasmBackendConfig {
}
}

fn to_nox_info_metrics(
config: &NodeConfig,
node_info: &NodeInfo,
peer_id: String,
) -> peer_metrics::NoxInfo {
use peer_metrics::*;

let mut chain_info = ChainInfo::default(peer_id);
if let Some(connector_cfg) = &config.chain_config {
chain_info.http_endpoint = connector_cfg.http_endpoint.clone();
chain_info.diamond_contract_address = connector_cfg.diamond_contract_address.clone();
chain_info.network_id = connector_cfg.network_id;
chain_info.default_base_fee = connector_cfg.default_base_fee.clone();
chain_info.default_priority_fee = connector_cfg.default_priority_fee.clone();
}

if let Some(chain_listener_cfg) = &config.chain_listener_config {
chain_info.ws_endpoint = chain_listener_cfg.ws_endpoint.clone();
chain_info.proof_poll_period_secs = chain_listener_cfg.proof_poll_period.as_secs();
chain_info.min_batch_count = chain_listener_cfg.min_batch_count;
chain_info.max_batch_count = chain_listener_cfg.max_batch_count;
chain_info.max_proof_batch_size = chain_listener_cfg.max_proof_batch_size;
chain_info.epoch_end_window_secs = chain_listener_cfg.epoch_end_window.as_secs();
}

let version = NoxVersion {
node_version: node_info.node_version.to_string(),
air_version: node_info.air_version.to_string(),
spell_version: node_info.spell_version.to_string(),
};

let vm_info = config
.vm
.as_ref()
.map(|vm| VmInfo {
allow_gpu: if vm.allow_gpu { 1 } else { 0 },
public_ip: vm.network.public_ip.to_string(),
host_ssh_port: vm.network.host_ssh_port,
vm_ssh_port: vm.network.vm_ssh_port,
port_range_start: vm.network.port_range.start,
port_range_end: vm.network.port_range.end,
})
.unwrap_or_default();

let network_info = NetworkInfo {
tcp_port: config.listen_config.tcp_port,
websocket_port: config.listen_config.websocket_port,
listen_ip: config.listen_config.listen_ip.to_string(),
network_type: format!("{}", config.network),
bootstrap_nodes: Addresses(
config
.bootstrap_nodes
.clone()
.iter()
.map(|a| a.to_string())
.collect::<_>(),
),
external_address: config.external_address.map(|a| a.to_string()),
external_multiaddresses: Addresses(
config
.external_multiaddresses
.clone()
.iter()
.map(|a| a.to_string())
.collect::<_>(),
),
};

let system_info = SystemInfo {
cpus_range: format!("{}", config.cpus_range),
system_cpu_count: config.system_cpu_count,
particle_execution_timeout_sec: config.particle_execution_timeout.as_secs(),
max_spell_particle_ttl_sec: config.max_spell_particle_ttl.as_secs(),
};

NoxInfo {
version,
chain_info,
vm_info,
network_info,
system_info,
}
}

#[cfg(test)]
mod tests {
use std::path::PathBuf;
Expand Down

0 comments on commit 9e70b08

Please sign in to comment.